In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset, concatenate_datasets
from matplotlib import pyplot as plt

Reading the raw dataset.

In [None]:
# Load every split of the dataset, pinned to a fixed revision so that re-runs
# read exactly the same data.
ds = load_dataset(
    "mlburnham/political_or_not",
    revision="6fb9b9e4d6d40f7e46d109acb5dbb7e39b2da749"
)
# `concatenate_datasets` is documented to take a list of datasets, so the
# dict view of splits is materialised into a list before being passed in.
all_splits = list(ds.values())
df = concatenate_datasets(all_splits).to_pandas()
df

Dropping useless columns.

In [None]:
# Columns that are not used in the rest of this notebook.
unused_columns = ["entailment", "dataset", "hypothesis", "validation_source"]
df = df.drop(columns=unused_columns)

Renaming columns.

In [None]:
# Old column name -> name used from here on.
column_renames = {
    "premise": "body",
    "validated_label": "politicalness",
}
df = df.rename(columns=column_renames)

Categorizing the politicalness column.

In [None]:
# Show the distinct raw label values before they are mapped to names.
raw_labels = df["politicalness"]
raw_labels.unique()

In [None]:
# Replace the numeric labels with readable names (0 -> "political",
# 1 -> "non-political") and store the result as a categorical column.
label_names = {0: "political", 1: "non-political"}
df["politicalness"] = df["politicalness"].replace(label_names).astype("category")

Stripping the bodies.

In [None]:
# Trim surrounding whitespace; bodies that end up empty are turned into NaN
# so that they count as missing data.
stripped_bodies = df["body"].str.strip()
df["body"] = stripped_bodies.replace("", np.nan)

Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

Printing body duplicates.

In [None]:
# keep=False flags every occurrence of a repeated body, not only the repeats.
has_duplicated_body = df["body"].duplicated(keep=False)
df[has_duplicated_body]

Dropping the body duplicates.

In [None]:
# Keep only the first row for each distinct body.
df = df.drop_duplicates(subset=["body"], keep="first")

Inspecting the body length.

In [None]:
# Add the body length in characters and in whitespace-separated words, then
# order the rows from the shortest body to the longest.
df = df.assign(
    body_length=lambda frame: frame["body"].str.len(),
    body_word_count=lambda frame: frame["body"].str.split().str.len(),
).sort_values(by="body_length")
df.head()

After the inspection, bodies shorter than 2 words seem to carry no useful information about politicalness, so they are removed.

In [None]:
# Keep only bodies with at least this many whitespace-separated words.
body_word_count_lower_bound = 2
df = df[df["body_word_count"] >= body_word_count_lower_bound]

# Downsample the (already sorted) lengths for plotting: every 10th row by
# position, plus the last (longest) row when the stride does not land on it.
# The selection is positional on purpose: de-duplicating with
# Series.drop_duplicates() would drop every row whose *length value* repeats,
# not just a doubled last row, and would distort the plotted curve.
body_lengths = df["body_length"]
row_count = len(body_lengths)
sample_positions = list(range(0, row_count, 10))
if row_count and sample_positions[-1] != row_count - 1:
    sample_positions.append(row_count - 1)
downsampled = body_lengths.iloc[sample_positions]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Average body length in characters over the remaining rows.
mean_body_length = df["body_length"].mean()
mean_body_length

The politicalness distribution.

In [None]:
# Number of rows per politicalness class, drawn as percentage shares.
rows_per_class = df.groupby("politicalness", observed=True).size()
rows_per_class.plot.pie(autopct="%1.1f%%");

The distribution of body length sums per politicalness.

In [None]:
# Total body length (characters) per politicalness class, drawn as percentage
# shares; the empty ylabel hides the column name next to the pie.
length_per_class = df.groupby("politicalness", observed=True)["body_length"].sum()
length_per_class.plot.pie(autopct="%1.1f%%", ylabel="");

The body length distribution by politicalness.

In [None]:
# One line per politicalness class showing its sorted body lengths, downsampled
# to every 10th row plus the last (longest) row of that class.
fig, ax = plt.subplots(figsize=(10, 6))

for politicalness in df["politicalness"].unique():
    class_lengths = df.loc[df["politicalness"] == politicalness, "body_length"]
    # Select by position rather than de-duplicating afterwards:
    # Series.drop_duplicates() compares *values*, so it would discard every
    # row with a repeated body length and distort the curve.
    row_count = len(class_lengths)
    sample_positions = list(range(0, row_count, 10))
    if row_count and sample_positions[-1] != row_count - 1:
        sample_positions.append(row_count - 1)
    downsampled = class_lengths.iloc[sample_positions].reset_index(drop=True)
    ax.plot(downsampled, label=politicalness)

ax.set_xlabel("downsampled index")
ax.set_ylabel("body length")
ax.set_title("body length by politicalness")
ax.legend(title="politicalness")
plt.show()

In [None]:
# Persist the preprocessed frame for later use.
output_path = "../preprocessed/political_or_not.parquet"
df.to_parquet(output_path)