In [None]:
import csv
from io import StringIO

import pandas as pd
from matplotlib import pyplot as plt
from pandas import Series

Reading the raw dataset. Some rows (e.g. line 1010) contain nested quotes and commas inside a field, so each line has to be modified before it is passed to `pandas.read_csv()`. The double quotes that delimit the fields (those in the `","` separators, plus the first and the last remaining quote of the line) are replaced by an arbitrary character (`\u2603`, the snowman), which is then specified as the quote character used for parsing.

In [None]:
# Allow fields four times larger than the csv module's default limit.
csv.field_size_limit(131072 * 4)

with open("../raw/webis_bias_flipper_18.csv", mode="r", encoding="utf-8") as file:
    requoted_lines = []
    for raw_line in file:
        # Every `","` field separator becomes snowman-comma-snowman.
        requoted = raw_line.replace("\",\"", "\u2603,\u2603")
        # The first double quote that is still present becomes a snowman ...
        requoted = requoted.replace("\"", "\u2603", 1)
        # ... and so does the last one. A line without any double quote left
        # is split into a single piece and therefore passes through unchanged.
        head_and_tail = requoted.rsplit("\"", 1)
        requoted_lines.append("\u2603".join(head_and_tail))
    csv_text = "".join(requoted_lines)

# Double quotes remaining inside the fields are now plain characters, because
# the snowman is declared as the quote character.
df = pd.read_csv(StringIO(csv_text), quotechar="\u2603", encoding="utf-8", engine="python")
df

Comparison of the body column (from AllSides) with the original body column (from the news portals). The total text volume of the original column is much higher, probably because AllSides truncates the text. The AllSides title and body columns can therefore be dropped, and the original ones will be used instead.

In [None]:
# Total number of characters per column; the dict's insertion order fixes the
# order of the pie slices.
total_characters = {
    "original body": df["original_body"].str.len().sum(),
    "body": df["body"].str.len().sum(),
}
pd.Series(total_characters).plot.pie(autopct="%1.1f%%");

Dropping the columns that are not needed any further (`story_id`, `source`, and the AllSides `title` and `body`).

In [None]:
# The AllSides "title"/"body" must be removed before the original_* columns
# are renamed to those names in the following cell.
unused_columns = ["story_id", "title", "body", "source"]
df = df.drop(columns=unused_columns)

Renaming and reordering columns.

In [None]:
# Old column name -> new column name.
column_names = {"original_title": "title", "original_body": "body", "bias": "leaning"}
# Rename, keep exactly these three columns in this order, and detach from the
# previous frame with an explicit copy.
df = df.rename(columns=column_names).loc[:, ["title", "body", "leaning"]].copy()

Categorizing the leaning column.

In [None]:
# Raw label -> short category name. Categories that are not listed here keep
# their current name.
leaning_labels = {"From the Left": "left", "From the Center": "center", "From the Right": "right"}
df["leaning"] = df["leaning"].astype("category").cat.rename_categories(leaning_labels)

Dropping rows with missing data.

In [None]:
# A row is removed as soon as either its body or its leaning is missing;
# a missing title alone does not remove the row.
df = df.dropna(axis=0, how="any", subset=["body", "leaning"])

Displaying all rows whose body occurs more than once.

In [None]:
# keep=False flags every occurrence of a repeated body, not only the repeats.
is_duplicate_body = df["body"].duplicated(keep=False)
df[is_duplicate_body]

Dropping the duplicates.

In [None]:
# Of several rows sharing the same body only the first one is kept.
# NOTE(review): the leaning is not part of the comparison, so rows with equal
# bodies but different leanings collapse into one — confirm this is intended.
df = df.drop_duplicates(subset=["body"], keep="first")

Inspecting body length.

In [None]:
df = (
    df.assign(
        # Number of characters in the body.
        body_length=df["body"].str.len(),
        # Number of whitespace-separated tokens in the body.
        body_word_count=df["body"].str.split().str.len(),
    )
    # Shortest bodies first, so head() shows the most suspicious rows.
    .sort_values(by="body_length")
)
df.head()

After inspection, articles with bodies shorter than 15 words seem to have no political value, so they are removed. The bar chart then shows the body lengths of a downsampled selection of the remaining articles.

In [None]:
# Articles with fewer words than this are discarded.
body_word_count_lower_bound = 15
df = df[df["body_word_count"] >= body_word_count_lower_bound]

# Downsample for plotting: every tenth row plus the last row. The frame was
# sorted by body length earlier in the notebook, so the last row is the longest.
# Rows are chosen by position. Calling drop_duplicates() on the concatenated
# values would also discard sampled articles that merely share a body length,
# which distorts the plotted distribution.
body_lengths = df["body_length"]
positions = list(range(0, len(body_lengths), 10))
if len(body_lengths) > 0 and positions[-1] != len(body_lengths) - 1:
    positions.append(len(body_lengths) - 1)
downsampled = body_lengths.iloc[positions]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Average body length in characters over the remaining articles.
mean_body_length = df["body_length"].mean()
mean_body_length

The leaning distribution.

In [None]:
# Number of articles per leaning; observed=True leaves out categories that
# have no rows.
articles_per_leaning = df.groupby("leaning", observed=True).size()
articles_per_leaning.plot.pie(autopct="%1.1f%%");

The distribution of body length sums per leaning.

In [None]:
# Total number of body characters per leaning; observed=True leaves out
# categories that have no rows.
characters_per_leaning = df.groupby("leaning", observed=True)["body_length"].sum()
characters_per_leaning.plot.pie(autopct="%1.1f%%", ylabel="");

Body length distribution by leaning.

In [None]:
plt.figure(figsize=(10, 6))

# One line per leaning: the body lengths of every tenth article of that
# leaning, in the frame's current row order, plus its last article.
for leaning in df["leaning"].unique():
    leaning_lengths = df.loc[df["leaning"] == leaning, "body_length"]
    # Rows are chosen by position. Calling drop_duplicates() on the sampled
    # values would also discard articles that merely share a body length,
    # which shortens and distorts the plotted curve.
    positions = list(range(0, len(leaning_lengths), 10))
    if len(leaning_lengths) > 0 and positions[-1] != len(leaning_lengths) - 1:
        # Ensuring to include the last (longest) item.
        positions.append(len(leaning_lengths) - 1)
    # A fresh 0..n-1 index puts all leanings on the same x axis.
    downsampled = leaning_lengths.iloc[positions].reset_index(drop=True)
    plt.plot(downsampled, label=leaning)

plt.xlabel("downsampled index")
plt.ylabel("body length")
plt.title("body length by political leaning")
plt.legend(title="leaning")
plt.show()

In [None]:
# Persist the preprocessed frame, including the body_length and
# body_word_count helper columns added above.
output_path = "../preprocessed/webis_bias_flipper_18.parquet"
df.to_parquet(path=output_path)