In [None]:
import numpy as np

import pandas as pd


Reading the raw dataset.

In [None]:
# Load the raw JSON Lines dump (one JSON record per line) into a DataFrame.
# The path is handed to pandas directly so that pandas opens and closes the file
# itself; wrapping it in a bare open() left the file handle unclosed.
# NOTE(review): with a path, pandas decodes using its own default encoding
# (UTF-8 per the read_json docs) rather than the platform locale used by open().
df = pd.read_json("../raw/webis_news_bias_20.json", lines=True)
df


Dropping useless columns.

In [None]:
# Raw columns that no later cell of this notebook reads.
unused_columns = ["source", "event_id", "adfontes_fair", "adfontes_political", "misc"]
df = df.drop(columns=unused_columns)


Renaming and reordering columns.

In [None]:
# Raw column name -> name used by the remaining cells.
column_aliases = {"content": "body", "allsides_bias": "leaning"}
# Rename, then keep exactly these three columns in this order; .copy() detaches
# the result from the frame it was selected from.
df = df.rename(columns=column_aliases).loc[:, ["title", "body", "leaning"]].copy()


Categorizing the leaning label column.

In [None]:
# Raw category label -> short label written to the "leaning" column.
leaning_labels = {
    "From the Left": "left",
    "From the Center": "center",
    "From the Right": "right",
}
# Convert to a categorical column and relabel its categories in one step.
df["leaning"] = df["leaning"].astype("category").cat.rename_categories(leaning_labels)


Adding the politicalness label column.

In [None]:
# Every row of this frame gets the same constant label, stored as a categorical.
politicalness_labels = ["political"] * len(df)
df["politicalness"] = pd.Categorical(politicalness_labels)


Stripping the titles and the bodies.

In [None]:
# Trim surrounding whitespace in both text columns; a value that is empty after
# trimming is turned into NaN so it counts as missing in the dropna step.
for text_column in ("title", "body"):
    stripped = df[text_column].str.strip()
    df[text_column] = stripped.replace("", np.nan)


Dropping rows missing either the leaning or both the title and the body.

In [None]:
# First discard rows without a leaning label, then rows where the title and the
# body are both missing (how="all" keeps rows that still have one of the two).
df = (
    df.dropna(subset=["leaning"])
    .dropna(subset=["title", "body"], how="all")
)


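Displaying the rows whose body occurs more than once (rows with a missing body are flagged as well, because missing values are treated as equal to each other).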

In [None]:
# keep=False flags every member of a duplicate group, not only the repeats.
body_occurs_more_than_once = df["body"].duplicated(keep=False)
df.loc[body_occurs_more_than_once]


Dropping the body duplicates.

In [None]:
# Drop rows whose body repeats an earlier row's body, keeping the first occurrence.
# Rows with a missing body are exempt: duplicated() treats NaN values as equal to
# each other, so drop_duplicates(subset="body") would collapse every title-only
# row (deliberately kept by the how="all" dropna) into a single row.
is_later_repeat = df["body"].duplicated(keep="first")
has_no_body = df["body"].isna()
# .copy() yields an independent frame so that later column assignments are safe.
df = df[~is_later_repeat | has_no_body].copy()


Inspecting the title and the body length.

In [None]:
# Missing texts are treated as empty strings so their lengths come out as 0.
titles = df["title"].fillna("")
bodies = df["body"].fillna("")

# Character and whitespace-separated word counts, then shortest bodies first.
df = df.assign(
    title_length=titles.str.len(),
    title_word_count=titles.str.split().str.len(),
    body_length=bodies.str.len(),
    body_word_count=bodies.str.split().str.len(),
).sort_values(by="body_length")
df.head()


After the inspection, rows with bodies shorter than 30 words seem to contain no political value. Removing them.

In [None]:
# Minimum number of words a body must have to be kept (inclusive).
body_word_count_lower_bound = 30
has_enough_words = df["body_word_count"].ge(body_word_count_lower_bound)
df = df.loc[has_enough_words]


In [None]:
# Persist the cleaned frame (including the helper length columns) as Parquet.
output_path = "../preprocessed/webis_news_bias_20.parquet"
df.to_parquet(output_path)
