In [None]:
import json
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

Reading the raw dataset.

In [None]:
raw_directory = "../raw/article_bias_prediction"

# Load every file in the raw directory as one JSON record.
# The listing is sorted because os.listdir returns entries in arbitrary order;
# the resulting row order decides which copy is kept when body duplicates are
# dropped later, so it has to be reproducible across machines and runs.
data = []
for filename in sorted(os.listdir(raw_directory)):
    filepath = os.path.join(raw_directory, filename)
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(filepath, "r", encoding="utf-8") as file:
        data.append(json.load(file))

# One row per loaded file, one column per JSON key.
df = pd.DataFrame(data)
df

Dropping useless columns. The content column is
> the processed and tokenized content, which is used as input to the different models

and so the original content (the `content_original` column) can be dropped.

In [None]:
# Columns that are not used anywhere in the rest of this notebook.
unused_columns = [
    "topic",
    "bias",
    "source",
    "url",
    "date",
    "authors",
    "content_original",
    "source_url",
    "ID",
]
df = df.drop(columns=unused_columns)

Renaming and reordering columns.

In [None]:
# New names for the text and label columns, and the column order to keep.
column_names = {"content": "body", "bias_text": "leaning"}
column_order = ["title", "body", "leaning"]

# Rename, select the three columns in order, and detach from the original frame.
df = df.rename(columns=column_names).loc[:, column_order].copy()

Categorizing the leaning column.

In [None]:
# Store the leaning labels as a pandas categorical instead of plain objects.
df = df.astype({"leaning": "category"})

Stripping the titles and the bodies.

In [None]:
# Trim surrounding whitespace; texts that end up empty are marked as missing
# so that the following dropna step can see them.
for text_column in ("title", "body"):
    stripped = df[text_column].str.strip()
    df[text_column] = stripped.replace("", np.nan)

Dropping rows missing either the leaning or both the title and the body.

In [None]:
df = (
    df
    # A row without a label is unusable.
    .dropna(subset=["leaning"])
    # A row is kept as long as it has at least one of title / body.
    .dropna(subset=["title", "body"], how="all")
)

Displaying the rows whose body is duplicated.

In [None]:
# keep=False flags every member of a duplicate group, not only the repeats.
has_repeated_body = df["body"].duplicated(keep=False)
df[has_repeated_body]

Dropping the body duplicates.

In [None]:
# pandas treats missing values as equal to each other when looking for duplicates,
# so deduplicating on "body" alone would also collapse every row that has a title
# but no body into a single row. Only rows whose actual body text repeats an
# earlier row are dropped; the first occurrence of each body is kept.
is_repeated_body = df["body"].notna() & df["body"].duplicated()
# .copy() so that later column assignments act on an independent frame.
df = df[~is_repeated_body].copy()

Inspecting the title and the body length.

In [None]:
# Missing texts count as empty strings, i.e. length 0 and 0 words.
title_text = df["title"].fillna("")
body_text = df["body"].fillna("")

# Character and whitespace-separated word counts; shortest bodies first.
df = df.assign(
    title_length=title_text.str.len(),
    title_word_count=title_text.str.split().str.len(),
    body_length=body_text.str.len(),
    body_word_count=body_text.str.split().str.len(),
).sort_values(by="body_length")
df.head()

After the inspection, rows with bodies shorter than 40 words seem to contain no political value. Removing them.

In [None]:
# Keep only rows whose body has at least this many whitespace-separated words.
body_word_count_lower_bound = 40
df = df[df["body_word_count"] >= body_word_count_lower_bound]

# Every 100th body length, plus the last item. df was sorted by body_length
# above, so the last item is the longest one.
downsampled = pd.concat([df["body_length"].iloc[::100], df["body_length"].tail(1)])
# The last row appears twice when the stride already picked it. Deduplicate on the
# row label, not on the value: value-based drop_duplicates() would also discard
# distinct articles that merely share the same body length.
downsampled = downsampled[~downsampled.index.duplicated()]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Mean title length in characters (rows without a title count as length 0).
mean_title_length = df["title_length"].mean()
mean_title_length

In [None]:
# Mean body length in characters over the remaining rows.
mean_body_length = df["body_length"].mean()
mean_body_length

The leaning distribution.

In [None]:
# Number of rows per leaning; observed=True skips categories with no rows.
rows_per_leaning = df.groupby("leaning", observed=True).size()
rows_per_leaning.plot.pie(autopct="%1.1f%%");

The distribution of body length sums per leaning.

In [None]:
# Total number of body characters per leaning; observed=True skips empty categories.
characters_per_leaning = df.groupby("leaning", observed=True)["body_length"].sum()
characters_per_leaning.plot.pie(autopct="%1.1f%%", ylabel="");

The body length distribution by leaning.

In [None]:
plt.figure(figsize=(10, 6))

# One line per leaning: its body lengths in the current row order of df,
# downsampled to every 100th row.
for leaning in df["leaning"].unique():
    df_leaning = df[df["leaning"] == leaning]
    # Ensuring to include the last (longest) item.
    downsampled = pd.concat(
        [df_leaning["body_length"].iloc[::100], df_leaning["body_length"].tail(1)]
    )
    # The last row appears twice when the stride already picked it. Deduplicate on
    # the row label, not on the value: value-based drop_duplicates() would also
    # discard distinct articles that merely share the same body length.
    downsampled = downsampled[~downsampled.index.duplicated()].reset_index(drop=True)
    plt.plot(downsampled, label=leaning)

plt.xlabel("downsampled index")
plt.ylabel("body length")
plt.title("body length by political leaning")
plt.legend(title="leaning")
plt.show()

In [None]:
output_path = "../preprocessed/article_bias_prediction.parquet"
# Create the target directory on a fresh checkout; to_parquet does not create it.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Persist the preprocessed frame, including the helper length / word-count columns.
df.to_parquet(output_path)