In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset, concatenate_datasets, load_dataset

Reading the raw dataset.

In [None]:
# Number of comments taken from each subreddit.
subreddit_sample_size = 5_000

# Subreddits to sample from. "changemyview" and "UpliftingNews" were removed because they may
# contain a significant amount of political content.
subreddits = [
    "tifu", "explainlikeimfive", "WritingPrompts", "LifeProTips",
    "todayilearned", "science", "askscience", "ifyoulikeblank",
    "Foodforthought", "IWantToLearn", "bestof", "IAmA",
    "socialskills", "relationship_advice", "philosophy", "YouShouldKnow",
    "history", "books", "Showerthoughts", "personalfinance",
    "buildapc", "EatCheapAndHealthy", "boardgames", "malefashionadvice",
    "femalefashionadvice", "scifi", "Fantasy", "Games",
    "bodyweightfitness", "SkincareAddiction", "podcasts", "suggestmeabook",
    "AskHistorians", "gaming", "DIY", "mildlyinteresting",
    "sports", "space", "gadgets", "Documentaries",
    "GetMotivated", "technology", "Fitness", "travel",
    "lifehacks", "Damnthatsinteresting", "gardening", "programming",
]

# Open one streaming split per subreddit, keeping only the first `subreddit_sample_size`
# comments and only the "body" column of each.
subreddit_streams = []
for subreddit in subreddits:
    subreddit_stream = load_dataset(
        "HuggingFaceGECLM/REDDIT_comments",
        revision="54779d3d1f1c1b12e5989f695e13d38b394a558f",
        split=subreddit,
        streaming=True,
    )
    subreddit_streams.append(subreddit_stream.take(subreddit_sample_size).select_columns(["body"]))

iterable_dataset = concatenate_datasets(subreddit_streams)


def _iter_comment_rows():
    """Yield every row of the concatenated streaming dataset."""
    yield from iterable_dataset


# Materialise the streamed rows into a regular Dataset, then convert it to a DataFrame.
df = Dataset.from_generator(_iter_comment_rows, features=iterable_dataset.features).to_pandas()
df

Removing `[deleted]` from the bodies.

In [None]:
# Literal (non-regex) removal of the "[deleted]" marker wherever it occurs in a body.
bodies_without_marker = df["body"].str.replace("[deleted]", "", regex=False)
df["body"] = bodies_without_marker

Stripping the bodies.

In [None]:
# Trim surrounding whitespace; bodies that end up empty become NaN.
stripped_bodies = df["body"].str.strip()
df["body"] = stripped_bodies.replace("", np.nan)

Dropping rows with missing data.

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

Printing body duplicates.

In [None]:
# keep=False flags every occurrence of a repeated body, not only the later copies.
is_repeated_body = df["body"].duplicated(keep=False)
df.loc[is_repeated_body]

Dropping the body duplicates.

In [None]:
# Keep only the first row for each distinct body.
df = df.drop_duplicates(subset=["body"], keep="first")

Adding the politicalness label column.

In [None]:
# Every row gets the same categorical label, "non-political".
df = df.assign(politicalness=pd.Categorical(["non-political"] * len(df)))

Inspecting the body length.

In [None]:
# Missing bodies are treated as empty strings for the length computations.
bodies = df["body"].fillna("")
df = df.assign(
    body_length=bodies.str.len(),  # number of characters
    body_word_count=bodies.str.split().str.len(),  # number of whitespace-separated tokens
).sort_values(by="body_length")
df.head()

After the inspection, rows with bodies shorter than 3 words seem to carry no useful information. Removing them and plotting a downsampled view of the remaining body lengths.

In [None]:
# Bodies with fewer whitespace-separated words than this are discarded.
body_word_count_lower_bound = 3
df = df[df["body_word_count"] >= body_word_count_lower_bound]

# Plot every `plot_sample_step`-th body length (df is sorted by body_length), always including
# the last (longest) item. Rows are selected by position: deduplicating the sampled values
# instead would also drop distinct rows that merely share a length and distort the plot.
plot_sample_step = 1000
body_lengths = df["body_length"]
sample_positions = np.arange(0, len(body_lengths), plot_sample_step)
last_position = len(body_lengths) - 1
# Append the last row only when the stride did not already land on it.
if len(body_lengths) > 0 and sample_positions[-1] != last_position:
    sample_positions = np.append(sample_positions, last_position)
downsampled = body_lengths.iloc[sample_positions]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Average body length (in characters) of the remaining rows.
df.loc[:, "body_length"].mean()

In [None]:
# Persist the preprocessed comments as a Parquet file.
output_path = "../preprocessed/reddit_comments.parquet"
df.to_parquet(output_path)