In [None]:
from io import StringIO

import numpy as np
import pandas as pd

Reading the raw dataset (a JSON Lines file with one JSON object per line).

In [None]:
# The raw file holds one JSON object per line (JSON Lines), so pandas can
# parse it directly with lines=True; there is no need to stitch the lines
# into a JSON array by hand and keep a second copy of the file in memory.
# The encoding is pinned so the result does not depend on the platform's
# locale default.
df = pd.read_json("../raw/webis_news_bias_20.json", lines=True, encoding="utf-8")
df

Dropping the columns that are not needed for the rest of the preprocessing.

In [None]:
# Discard the metadata columns that the later cells never read.
unused_columns = ["source", "event_id", "adfontes_fair", "adfontes_political", "misc"]
df = df.drop(unused_columns, axis="columns")

Renaming and reordering columns.

In [None]:
# Give the columns shorter names, then keep exactly title/body/leaning in that
# order. The explicit copy makes the result an independent frame.
column_renames = {"content": "body", "allsides_bias": "leaning"}
ordered_columns = ["title", "body", "leaning"]
df = df.rename(columns=column_renames).loc[:, ordered_columns].copy()

Categorizing the leaning column.

In [None]:
# Store the leaning as a categorical column and shorten its labels.
short_labels = {
    "From the Left": "left",
    "From the Center": "center",
    "From the Right": "right",
}
df["leaning"] = df["leaning"].astype("category").cat.rename_categories(short_labels)

Displaying every row whose body text appears more than once.

In [None]:
# keep=False flags every member of a duplicate group, not just the repeats,
# so all copies of a repeated body are shown.
has_repeated_body = df["body"].duplicated(keep=False)
df.loc[has_repeated_body]

Dropping the duplicates, keeping the first occurrence of each body.

In [None]:
# Keep only the first row for each distinct body text.
is_first_occurrence = ~df["body"].duplicated(keep="first")
df = df[is_first_occurrence]

Inspecting body length.

In [None]:
# Derive two size measures from the body text: the number of characters and
# the number of whitespace-separated tokens.
body_text = df["body"]
df["body_length"] = body_text.str.len()
df["body_word_count"] = body_text.str.split().str.len()
# Show the five longest bodies first.
df.sort_values("body_length", ascending=False).head(5)

In [None]:
# Show the five shortest bodies.
df.sort_values("body_length", ascending=True).head(5)

Calculating the bounds for body length with asymmetric IQR fences (0.5 × IQR below Q1 and 9 × IQR above Q3).

In [None]:
# First and third quartile of the body length, computed in a single call.
q1, q3 = np.percentile(df["body_length"], [25, 75])
iqr = q3 - q1

# The fences are deliberately asymmetric: a narrow one below Q1 and a wide one
# above Q3.
lower_bound_multiplier, upper_bound_multiplier = 0.5, 9
lower_bound = q1 - lower_bound_multiplier * iqr
upper_bound = q3 + upper_bound_multiplier * iqr
(lower_bound, upper_bound)

Removing the outliers based on the body length, then plotting the remaining lengths in ascending order (every 100th article plus the longest one).

In [None]:
# Keep only the articles whose body length lies within the inclusive bounds.
df = df[(df["body_length"] >= lower_bound) & (df["body_length"] <= upper_bound)]
body_length = df["body_length"].sort_values()
# Plot every 100th length and make sure the last (longest) item is included.
# Positions are collected in a set so the last item is not listed twice when
# it is already part of the sample. De-duplicating by position rather than by
# value keeps distinct articles that happen to share the same length.
sample_positions = sorted({*range(0, len(body_length), 100), len(body_length) - 1})
downsampled = body_length.iloc[sample_positions]
downsampled.plot.bar()

Plotting the leaning distribution of the remaining articles.

In [None]:
# Count the articles per leaning (only categories that actually occur) and
# show each share as a percentage with one decimal.
leaning_counts = df.groupby("leaning", observed=True).size()
leaning_counts.plot.pie(autopct="%1.1f%%");

In [None]:
# Persist the preprocessed frame. The row index is written as the first CSV
# column (index=True is the pandas default, spelled out here), and the helper
# columns body_length and body_word_count are part of the output as well.
df.to_csv("../preprocessed/webis_news_bias_20.csv", index=True)