In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from utils.dataset_utils import PoliticalLeaningDataset

Reading the raw dataset.

In [None]:
# Load the raw tab-separated dump; the file has no header row, so the column
# names are supplied here.
# The path is handed to pandas directly so that pandas opens the file with the
# requested UTF-8 encoding and closes it when parsing finishes. Passing an
# already-open text handle (`open(path)`) would leave the handle unclosed and
# decode with the platform default encoding, because `encoding` is not applied
# to text-mode handles.
df = pd.read_csv(
    "../raw/media_political_stance.tsv",
    sep="\t",
    header=None,
    names=["topic10", "topic15", "stance", "oscarID", "url", "text"],
    encoding="utf-8",
)
df

Dropping the columns that are not needed (topic labels, OSCAR ID and URL).

In [None]:
# Remove the topic, OSCAR id and URL columns.
unused_columns = ["topic10", "topic15", "oscarID", "url"]
df = df.drop(columns=unused_columns)

Renaming and reordering columns.

In [None]:
# Rename the columns, then select them in the desired order; the trailing
# copy() gives an independent frame rather than a selection of the old one.
column_names = {"text": "body", "stance": "leaning"}
df = df.rename(columns=column_names)[["body", "leaning"]].copy()

Categorizing the leaning label column.

In [None]:
# Show the distinct leaning labels present in the data.
pd.unique(df["leaning"])

In [None]:
# Store the leaning labels as an unordered categorical whose categories are
# inferred from the values present.
df["leaning"] = df["leaning"].astype(pd.CategoricalDtype())

Adding the politicalness label column.

In [None]:
# Every row of this dataset gets the same "political" label, stored as a
# categorical column.
df = df.assign(politicalness=pd.Categorical(["political"] * len(df)))

Stripping leading and trailing whitespace from the bodies and marking empty bodies as missing.

In [None]:
# Trim surrounding whitespace; a body that is empty afterwards becomes NaN.
stripped_bodies = df["body"].str.strip()
df["body"] = stripped_bodies.replace("", np.nan)

Dropping rows with missing data.

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

Displaying all rows whose body occurs more than once.

In [None]:
# keep=False flags every occurrence of a repeated body, not just the repeats.
df.loc[df.duplicated(subset="body", keep=False)]

Dropping the body duplicates.

In [None]:
# Keep only the first row for each distinct body.
df = df.drop_duplicates(subset=["body"], keep="first")

Removing the new paragraph token from the bodies.

In [None]:
# Delete every " <NS>" marker (including its leading space) from the bodies.
# regex=False makes this a literal substring replacement on every pandas
# version; the default of `regex` changed between pandas 1.x and 2.x.
df["body"] = df["body"].str.replace(" <NS>", "", regex=False)

Computing the body length (characters) and word count, sorting by body length, and inspecting the shortest bodies.

In [None]:
# Missing bodies are treated as empty strings for the length statistics.
body_text = df["body"].fillna("")

# Length in characters and in whitespace-separated words.
df["body_length"] = body_text.str.len()
df["body_word_count"] = body_text.str.split().str.len()

# Order the rows from shortest to longest body and preview the shortest ones.
df = df.sort_values(by="body_length")
df.head()

Downsampling the dataset.

In [None]:
# Size passed to the sampling helper below.
SAMPLE_SIZE = 100_000

# Wrap the frame in the project dataset class and replace `df` with the
# dataframe of the sample it returns.
# NOTE(review): the exact sampling semantics (per-class vs. total size,
# ordering, randomness) live in PoliticalLeaningDataset — confirm there.
dataset = PoliticalLeaningDataset("media_political_stance", df)
balanced_sample = dataset.take_even_class_distribution_sample(SAMPLE_SIZE)
df = balanced_sample.dataframe

Inspecting the body word count median.

In [None]:
# Median number of words per body.
word_counts = df["body_word_count"]
word_counts.median()

The leaning distribution.

In [None]:
# Number of rows per leaning (only categories that actually occur), drawn as
# a pie chart with percentage labels.
leaning_counts = df.groupby("leaning", observed=True).size()
leaning_counts.plot.pie(autopct="%1.1f%%");

The distribution of body length sums per leaning.

In [None]:
# Total body length per leaning (only categories that actually occur), drawn
# as a pie chart with percentage labels and no y-axis label.
length_sums = df.groupby("leaning", observed=True)["body_length"].sum()
length_sums.plot.pie(autopct="%1.1f%%", ylabel="");

The body length distribution by leaning.

In [None]:
# One curve per leaning: body lengths in ascending order, thinned to every
# 100th item so the plot stays light.
fig, ax = plt.subplots(figsize=(10, 6))

for leaning in df["leaning"].unique():
    # Sort explicitly so the curve is monotone and the last item really is the
    # longest one, whatever row order the preceding cells left `df` in.
    lengths = df.loc[df["leaning"] == leaning, "body_length"].sort_values()
    if lengths.empty:
        # A missing (NaN) label never compares equal, so its selection is empty.
        continue
    # Every 100th position plus the final one. np.union1d de-duplicates the
    # *positions*, so the last item is not plotted twice when the stride
    # already lands on it. De-duplicating the length *values* instead would
    # drop distinct articles that share a length and compress the x-axis.
    positions = np.union1d(np.arange(0, len(lengths), 100), [len(lengths) - 1])
    ax.plot(lengths.to_numpy()[positions], label=leaning)

ax.set_xlabel("downsampled index")
ax.set_ylabel("body length")
ax.set_title("body length by political leaning")
ax.legend(title="leaning")
plt.show()

In [None]:
# Persist the preprocessed frame as a Parquet file.
output_path = "../preprocessed/media_political_stance.parquet"
df.to_parquet(output_path)