In [None]:
import numpy as np
from datasets import concatenate_datasets, load_dataset
from matplotlib import pyplot as plt

Reading the raw dataset.

In [None]:
# Load the dataset at a pinned revision so re-runs see the same data.
ds = load_dataset(
    "cajcodes/political-bias",
    revision="f24cd353b9c4a69c274fab4e43610ad90b1ae0d2",
)
# concatenate_datasets is documented to take a list of datasets, so the
# dict view returned by ds.values() is materialized into a list first.
df = concatenate_datasets(list(ds.values())).to_pandas()
df

Renaming columns.

In [None]:
# Old column name -> new column name.
column_names = {"text": "body", "label": "leaning"}
df = df.rename(columns=column_names)

Inspecting the raw leaning labels, then mapping them to left/center/right and converting the column to a categorical dtype.

In [None]:
# List the distinct values present in the leaning column.
raw_leanings = df["leaning"]
raw_leanings.unique()

In [None]:
# Collapse the numeric labels 0-4 into three leaning names, then store the
# result as a categorical column.
leaning_names = {0: "right", 1: "right", 2: "center", 3: "left", 4: "left"}
df["leaning"] = df["leaning"].replace(leaning_names).astype("category")

Stripping leading and trailing whitespace from the bodies and marking empty bodies as missing.

In [None]:
# Trim surrounding whitespace; bodies that end up empty become NaN so that
# they count as missing data.
stripped_bodies = df["body"].str.strip()
df["body"] = stripped_bodies.replace("", np.nan)

Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

Displaying the rows whose body occurs more than once.

In [None]:
# keep=False flags every occurrence of a repeated body, not only the later ones.
is_repeated_body = df["body"].duplicated(keep=False)
df.loc[is_repeated_body]

Dropping the body duplicates.

In [None]:
# Keep only the first row for each distinct body.
df = df.drop_duplicates(subset=["body"], keep="first")

Inspecting the body length.

In [None]:
# Character and whitespace-separated word counts per body, with rows ordered
# from shortest to longest body.
body_text = df["body"].fillna("")
df = df.assign(
    body_length=body_text.str.len(),
    body_word_count=body_text.str.split().str.len(),
).sort_values(by="body_length")
df.head()

In [None]:
# One bar per row, in the frame's current row order; the per-row x tick
# labels are hidden.
ax = df["body_length"].plot.bar()
ax.xaxis.set_ticks([])
# Trailing semicolon suppresses the text repr, as in the other plotting cells.
ax.set_ylabel("body length");

In [None]:
# Average body length in characters.
body_lengths = df["body_length"]
body_lengths.mean()

The leaning distribution.

In [None]:
# Number of rows per leaning, shown as percentage shares.
leaning_counts = df.groupby("leaning", observed=True).size()
leaning_counts.plot.pie(autopct="%1.1f%%");

The distribution of body length sums per leaning.

In [None]:
# Total body length per leaning, shown as percentage shares.
length_per_leaning = df.groupby("leaning", observed=True)["body_length"].sum()
length_per_leaning.plot.pie(autopct="%1.1f%%", ylabel="");

The body length distribution by leaning.

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# One curve per leaning: the body lengths of that leaning's rows, plotted
# against each row's position within the group (the index is reset to 0..n-1).
for leaning in df["leaning"].unique():
    lengths = df.loc[df["leaning"] == leaning, "body_length"].reset_index(drop=True)
    ax.plot(lengths, label=leaning)

# Nothing is downsampled in this notebook, so the x axis is labelled as the
# plain per-leaning row position.
ax.set_xlabel("index within leaning")
ax.set_ylabel("body length")
ax.set_title("body length by political leaning")
ax.legend(title="leaning")
plt.show()

In [None]:
# Write the preprocessed frame to disk as Parquet.
output_path = "../preprocessed/gpt4_political_bias.parquet"
df.to_parquet(output_path)