In [None]:
import numpy as np
import pandas as pd
from datasets import concatenate_datasets, load_dataset
from matplotlib import pyplot as plt

from utils.dataset_utils import PoliticalLeaningDataset

Reading the raw dataset.

In [None]:
# Load all splits of the dataset at a pinned revision so re-runs see the same data.
ds = load_dataset(
    "Jacobvs/PoliticalTweets",
    revision="1ddaa14beed79edda621fdd72ad22fd654d760b3",
)
# concatenate_datasets is documented to take a list of datasets, so the
# dict view returned by .values() is materialized before being passed in.
df = concatenate_datasets(list(ds.values())).to_pandas()
# Promote the dataset's "index" column to the row index and clear the axis name.
df = df.set_index("index").rename_axis(None)
df

Dropping columns that are not needed for this preprocessing (`id`, `username`, `labels`).

In [None]:
# Columns removed here are not referenced by any later cell in this notebook.
unused_columns = ["id", "username", "labels"]
df = df.drop(columns=unused_columns)

Renaming columns.

In [None]:
# Map the source column names onto the names used by the rest of this notebook.
column_renames = {"text": "body", "party": "leaning"}
df = df.rename(columns=column_renames)

Categorizing the leaning label column.

In [None]:
# Show the distinct raw labels before they are mapped in the next cell.
pd.unique(df["leaning"])

In [None]:
# Translate party names into leaning labels, then store them as a categorical.
# Values missing from the mapping pass through replace() unchanged.
party_to_leaning = {"Democrat": "left", "Republican": "right"}
df["leaning"] = df["leaning"].replace(party_to_leaning).astype("category")

Adding the politicalness label column.

In [None]:
# Every row receives the same constant label, stored as a categorical column.
political_labels = ["political"] * len(df)
df["politicalness"] = pd.Categorical(political_labels)

Stripping the bodies.

In [None]:
# Trim surrounding whitespace; bodies that end up empty become NaN so the
# following dropna() cell removes them.
stripped_bodies = df["body"].str.strip()
df["body"] = stripped_bodies.replace("", np.nan)

Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in any column (the defaults, spelled out).
df = df.dropna(axis="index", how="any")

Printing body duplicates.

In [None]:
# keep=False flags every member of a duplicate group, not only the repeats.
df.loc[df.duplicated(subset=["body"], keep=False)]

Dropping the body duplicates.

In [None]:
# Keep only the first occurrence of each distinct body.
df = df.drop_duplicates(subset=["body"], keep="first")

Inspecting the body length.

In [None]:
# Add character and whitespace-separated word counts per body, then order the
# frame by character length so the shortest bodies are shown first.
df = df.assign(
    body_length=lambda frame: frame["body"].fillna("").str.len(),
    body_word_count=lambda frame: frame["body"].fillna("").str.split().str.len(),
).sort_values(by="body_length")
df.head()

After inspection, bodies with fewer than 2 words appear to carry no political content, so those rows are removed.

In [None]:
# Minimum number of words a body must have to be kept (inclusive).
body_word_count_lower_bound = 2
has_enough_words = df["body_word_count"].ge(body_word_count_lower_bound)
df = df.loc[has_enough_words]

Downsampling the dataset.

In [None]:
# Size passed to the sampling helper below.
SAMPLE_SIZE = 100_000

# Wrap the frame in the project dataset class and replace df with the sampled frame.
# NOTE(review): presumably the helper balances the "leaning" classes and treats
# SAMPLE_SIZE as the total row count - confirm in utils.dataset_utils.
dataset = PoliticalLeaningDataset("political_tweets", df)
balanced_sample = dataset.take_even_class_sample_by_size(SAMPLE_SIZE)
df = balanced_sample.dataframe

In [None]:
# Persist the preprocessed sample; the path is relative to the notebook's working directory.
# NOTE(review): body_length / body_word_count are written too unless the
# sampling helper drops them - confirm this is intended.
output_path = "../preprocessed/political_tweets.parquet"
df.to_parquet(output_path)