In [None]:
import csv
from io import StringIO

import numpy as np
import pandas as pd
from sympy.physics.control.control_plots import matplotlib

Reading the raw dataset. Some rows (e.g. line 1010) contain complex nested quotes and commas, so each line has to be modified before it is passed to `pandas.read_csv()`. The outer quotes of every field are replaced by an arbitrary placeholder character (`\u2603`, the snowman), which is then specified as the quote character used for parsing.

In [None]:
# Raise the csv field size limit to four times the default.
csv.field_size_limit(4 * 131072)

snowman = "\u2603"
rewritten = []
with open("../raw/webis_bias_flipper_18.csv", mode="r", encoding="utf-8") as file:
    for line in file:
        # 1) Every `","` field boundary becomes snowman-comma-snowman.
        line = line.replace("\",\"", f"{snowman},{snowman}")
        # 2) The first remaining double quote (the opening one) becomes a snowman.
        line = line.replace("\"", snowman, 1)
        # 3) The last remaining double quote (the closing one) becomes a snowman:
        #    rsplit yields at most two pieces, which are glued back with the snowman.
        rewritten.append(snowman.join(line.rsplit("\"", 1)))
    # Any double quote still left is now ordinary field content for the parser.
    df = pd.read_csv(StringIO("".join(rewritten)), quotechar=snowman, encoding="utf-8", engine="python")
df

Comparison of the body column (from AllSides) with the original body column (from the news portals). The text volume of the original column is much higher, probably because AllSides truncates the articles. The AllSides title and body columns can therefore be dropped, and the original ones are used instead.

In [None]:
# Total number of characters per body variant, drawn as shares of one pie.
char_totals = {
    "original body": df["original_body"].str.len().sum(),
    "body": df["body"].str.len().sum(),
}
pd.Series(char_totals).plot.pie(autopct="%1.1f%%");

Dropping the columns that are no longer needed.

In [None]:
# Columns that are not used in the rest of the notebook.
unused_columns = ["story_id", "title", "body", "source"]
df = df.drop(columns=unused_columns)

Renaming and reordering columns.

In [None]:
# Give the "original_*" columns their short names, call the bias column "leaning",
# and keep exactly these three columns in this order.
column_names = {"original_title": "title", "original_body": "body", "bias": "leaning"}
df = df.rename(columns=column_names).loc[:, ["title", "body", "leaning"]].copy()

Categorizing the leaning column.

In [None]:
# Convert the leaning column to a categorical and shorten its labels.
leaning_labels = {
    "From the Left": "left",
    "From the Center": "center",
    "From the Right": "right",
}
df["leaning"] = df["leaning"].astype("category").cat.rename_categories(leaning_labels)

Printing duplicates.

In [None]:
# keep=False marks every member of a duplicate group, not only the repeats.
is_repeated_body = df["body"].duplicated(keep=False)
df[is_repeated_body]

Dropping the duplicates.

In [None]:
# Keep the first row of every group of identical bodies.
df = df.drop_duplicates(subset=["body"], keep="first")

Inspecting body length.

In [None]:
# Add the body length in characters and in whitespace-separated words,
# then show the five longest bodies.
bodies = df["body"]
df = df.assign(
    body_length=bodies.str.len(),
    body_word_count=bodies.str.split().str.len(),
)
df.sort_values("body_length", ascending=False).head()

In [None]:
# The five shortest bodies.
df.sort_values("body_length", ascending=True).head()

Calculating the bounds for body length.

In [None]:
# Multipliers applied to the interquartile range below Q1 and above Q3.
lower_bound_multiplier = 0.5
upper_bound_multiplier = 9

# First and third quartile of the body length, computed in a single call.
q1, q3 = np.percentile(df["body_length"], [25, 75])
iqr = q3 - q1

lower_bound = q1 - lower_bound_multiplier * iqr
upper_bound = q3 + upper_bound_multiplier * iqr
(lower_bound, upper_bound)

Removing the outliers based on the body length.

In [None]:
# Keep only the rows whose body length lies within the inclusive bounds.
df = df[(df["body_length"] >= lower_bound) & (df["body_length"] <= upper_bound)]
body_length = df["body_length"].sort_values()
# Plot every 100th length plus the last (longest) one. The positions are merged
# with a set union, so the last row is never listed twice while distinct rows that
# happen to share the same length are all kept (a value-based drop_duplicates()
# would silently discard them).
positions = np.union1d(np.arange(0, len(body_length), 100), [len(body_length) - 1])
downsampled = body_length.iloc[positions]
downsampled.plot.bar();

The leaning distribution.

In [None]:
# Number of rows per leaning category that actually occurs, as a pie chart.
leaning_counts = df.groupby("leaning", observed=True).size()
leaning_counts.plot.pie(autopct="%1.1f%%");

This dataset is basically a subset of the Webis-News-Bias-20 dataset.

In [None]:
# Load the preprocessed Webis-News-Bias-20 data and count how many distinct
# titles appear in both datasets (printed next to the size of this dataset).
df2 = pd.read_csv("../preprocessed/webis_news_bias_20.csv")
print(len(df))
shared_titles = set(df["title"]) & set(df2["title"])
pd.Series(list(shared_titles)).count()

In [None]:
# Write the result to this dataset's own file. `webis_news_bias_20.csv` holds the
# preprocessed Webis-News-Bias-20 dataset that this notebook reads for comparison,
# so it must not be overwritten with the Bias-Flipper-18 rows.
df.to_csv("../preprocessed/webis_bias_flipper_18.csv")