In [None]:
import numpy as np
from datasets import concatenate_datasets, load_dataset


Reading the raw dataset.

In [None]:
# Pin the dataset to a fixed revision so that re-running the notebook reads
# exactly the same data.
ds = load_dataset(
    "mlburnham/political_or_not",
    revision="6fb9b9e4d6d40f7e46d109acb5dbb7e39b2da749",
)
# `concatenate_datasets` is documented to take a list of datasets, so the
# dict view returned by `.values()` is materialized into a list first.
df = concatenate_datasets(list(ds.values())).to_pandas()
df


Dropping the columns that are not needed for further processing.

In [None]:
# Keep only the columns used later in this notebook.
df = df.drop(
    labels=[
        "entailment",
        "dataset",
        "hypothesis",
        "validation_source",
    ],
    axis="columns",
)


Renaming columns.

In [None]:
# Map the dataset's original column names to the names used from here on.
column_names = {
    "premise": "body",
    "validated_label": "politicalness",
}
df = df.rename(column_names, axis="columns")


Categorizing the politicalness column.

In [None]:
# List the distinct raw label values before mapping them to names.
df.loc[:, "politicalness"].unique()


In [None]:
# Replace the numeric labels with readable names, then store the column as a
# categorical.
politicalness_names = {0: "political", 1: "non-political"}
df["politicalness"] = (
    df["politicalness"].replace(politicalness_names).astype("category")
)


Stripping the bodies.

In [None]:
# Trim surrounding whitespace; bodies that end up empty are turned into NaN so
# that they count as missing values.
df["body"] = (
    df["body"]
    .str.strip()
    .replace({"": np.nan})
)


Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")


Displaying all rows whose body occurs more than once.

In [None]:
# keep=False marks every occurrence of a repeated body, not only the later ones.
df.loc[df.duplicated(subset="body", keep=False)]


Dropping the body duplicates.

In [None]:
# For each repeated body, keep the first row and discard the rest.
df = df.drop_duplicates(subset=["body"], keep="first")


Inspecting the body length.

In [None]:
# Add the character count and the whitespace-separated word count of each body,
# then order the rows from shortest to longest body for inspection.
df = df.assign(
    body_length=lambda frame: frame["body"].fillna("").str.len(),
    body_word_count=lambda frame: (
        frame["body"].fillna("").str.split().str.len()
    ),
).sort_values(by="body_length")
df.head()


The inspection shows that bodies shorter than 2 words (i.e., single-word bodies) seem to carry no political value, so they are removed.

In [None]:
# Minimum number of words a body must have to be kept.
body_word_count_lower_bound = 2
df = df.loc[df["body_word_count"].ge(body_word_count_lower_bound)]


In [None]:
# Write the preprocessed frame to disk.
output_path = "../preprocessed/political_or_not.parquet"
df.to_parquet(output_path)
