In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset, concatenate_datasets, load_dataset

Reading the raw dataset.

In [None]:
# Configuration for the dataset download.
# Forwarded to `load_dataset(..., trust_remote_code=...)` when the reviews are loaded.
TRUST_REMOTE_CODE = False

# Number of reviews taken from each category (used as the argument of `.take(...)`).
category_split_size = 2_000
# Category names; each one is appended to "raw_review_" to form the dataset config name.
# The "unknown" category has been removed from this list.
categories = [
    "All_Beauty",
    "Amazon_Fashion",
    "Appliances",
    "Arts_Crafts_and_Sewing",
    "Automotive",
    "Baby_Products",
    "Beauty_and_Personal_Care",
    "Books",
    "CDs_and_Vinyl",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Digital_Music",
    "Electronics",
    "Gift_Cards",
    "Grocery_and_Gourmet_Food",
    "Handmade_Products",
    "Health_and_Household",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
    "Kindle_Store",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Musical_Instruments",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Pet_Supplies",
    "Software",
    "Sports_and_Outdoors",
    "Subscription_Boxes",
    "Tools_and_Home_Improvement",
    "Toys_and_Games",
    "Video_Games",
]

def _stream_category_head(category):
    """Open one category's raw reviews as a stream and keep only its leading rows.

    Only the "title" and "text" columns are selected, and at most
    `category_split_size` examples are taken from the pinned dataset revision.
    """
    category_stream = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        f"raw_review_{category}",
        revision="2b6d039ed471f2ba5fd2acb718bf33b0a7e5598e",
        split="full",
        trust_remote_code=TRUST_REMOTE_CODE,
        streaming=True,
    )
    return category_stream.select_columns(["title", "text"]).take(category_split_size)


# One truncated stream per category, chained in the order of `categories`.
iterable_dataset = concatenate_datasets([_stream_category_head(category) for category in categories])

# Materialise the concatenated stream as a regular Dataset, then convert it to a DataFrame.
df = Dataset.from_generator(
    lambda: (yield from iterable_dataset),
    features=iterable_dataset.features,
).to_pandas()
df

Renaming columns.

In [None]:
# The review text column is called "body" from here on.
column_renames = {"text": "body"}
df = df.rename(columns=column_renames)

Stripping the titles and the bodies.

In [None]:
# Trim surrounding whitespace; values that end up as empty strings become NaN.
for text_column in ("title", "body"):
    stripped = df[text_column].str.strip()
    df[text_column] = stripped.replace("", np.nan)

Dropping rows missing both the title and the body.

In [None]:
# how="all": a row is removed only when every listed column is NaN.
text_columns = ["title", "body"]
df = df.dropna(how="all", subset=text_columns)

Displaying the rows whose body is duplicated.

In [None]:
# keep=False flags every occurrence of a repeated body, not only the later ones.
is_repeated_body = df["body"].duplicated(keep=False)
df[is_repeated_body]

Dropping the body duplicates.

In [None]:
# Of each group of rows sharing the same body, only the first one is kept.
df = df.drop_duplicates(subset=["body"], keep="first")

Adding the politicalness label column.

In [None]:
# Every row gets the same label, stored as a categorical column.
label_values = ["non-political"] * len(df)
df["politicalness"] = pd.Categorical(label_values)

Inspecting the title and body lengths.

In [None]:
# Missing values are treated as empty strings, so they get a length and word count of 0.
title_text = df["title"].fillna("")
body_text = df["body"].fillna("")

df["title_length"] = title_text.str.len()
df["title_word_count"] = title_text.str.split().str.len()
df["body_length"] = body_text.str.len()
df["body_word_count"] = body_text.str.split().str.len()

# Shortest bodies first, so the head shows the least informative rows.
df = df.sort_values(by="body_length")
df.head()

After the inspection, rows whose bodies are shorter than 3 words appear to carry no useful information, so they are removed.

In [None]:
# Keep only rows whose body has at least this many whitespace-separated words.
body_word_count_lower_bound = 3
df = df[df["body_word_count"] >= body_word_count_lower_bound]

# Plot every 1000th body length in the current row order, plus the last row
# (the longest body when the frame is sorted by body_length).
body_lengths = df["body_length"]
downsampled = pd.concat([body_lengths.iloc[::1000], body_lengths.tail(1)])
# De-duplicate by row label, not by value: the last row may already be part of the
# ::1000 sample, but distinct rows that share the same length must all stay in the plot.
# (Series.drop_duplicates() compares values and would silently drop such rows.)
downsampled = downsampled[~downsampled.index.duplicated(keep="first")]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Mean title length in characters; rows without a title count as length 0.
mean_title_length = df["title_length"].mean()
mean_title_length

In [None]:
# Mean body length in characters over the rows that remain after filtering.
mean_body_length = df["body_length"].mean()
mean_body_length

In [None]:
# Persist the preprocessed frame (including the helper length columns) as Parquet.
output_path = "../preprocessed/amazon_reviews_2023.parquet"
df.to_parquet(output_path)