In [None]:
import numpy as np
import pandas as pd

Reading the raw dataset.

In [None]:
# Hand pandas the path rather than an already-open handle: pandas then opens
# the file as UTF-8 itself and closes it when parsing finishes. With a handle
# from a bare open(), the platform default encoding is used and the handle is
# never closed.
df = pd.read_csv("../raw/qbias.csv", encoding="utf-8")
df

Dropping useless columns.

In [None]:
# Columns removed from the frame before any further processing.
unused_columns = ["Unnamed: 0", "tags", "heading", "source"]
df = df.drop(columns=unused_columns)

Renaming columns.

In [None]:
# Mapping of original column names to the names used from here on.
column_renames = {"text": "body", "bias_rating": "leaning"}
df = df.rename(columns=column_renames)

Converting the leaning column to a categorical dtype.

In [None]:
# Store the leaning labels with pandas' categorical dtype.
df = df.astype({"leaning": "category"})

Dropping rows with missing data.

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis="index", how="any")

Printing duplicates.

In [None]:
# keep=False flags every occurrence of a repeated body, not only the later ones.
is_repeated_body = df["body"].duplicated(keep=False)
df.loc[is_repeated_body]

Dropping the duplicates.

In [None]:
# Keep the first row for each distinct body text and drop the repeats.
df = df.drop_duplicates(subset=["body"], keep="first")

Inspecting body length.

In [None]:
# Add the character count and the whitespace-separated word count of each
# body, then order the rows from shortest to longest body.
df = df.assign(
    body_length=df["body"].str.len(),
    body_word_count=df["body"].str.split().str.len(),
).sort_values(by="body_length")
df.head()

In [None]:
# The frame is sorted ascending by body_length, so the tail holds the longest bodies.
df.tail(n=5)

In [None]:
# Plot every 100th body length and always include the last (longest) item.
# De-duplicate by row position rather than by value: Series.drop_duplicates()
# would also discard distinct articles that happen to share a body length.
lengths = df["body_length"]
positions = np.unique(np.append(np.arange(0, len(lengths), 100), len(lengths) - 1))
downsampled = lengths.iloc[positions]
downsampled.plot.bar().xaxis.set_ticks([]);

Calculating the lower bound for body length (Q1 − 0.5 × IQR).

In [None]:
# First and third quartile of the body length, and the range between them.
q1, q3 = np.percentile(df["body_length"], [25, 75])
iqr = q3 - q1
# The lower bound sits lower_bound_multiplier IQRs below the first quartile.
lower_bound_multiplier = 0.5
lower_bound = q1 - iqr * lower_bound_multiplier

Removing the articles whose body length falls below the lower bound.

In [None]:
# Keep only the rows whose body length is at or above the lower bound.
df = df[df["body_length"] >= lower_bound]
# Plot every 100th body length and always include the last (longest) item.
# De-duplicate by row position rather than by value: Series.drop_duplicates()
# would also discard distinct articles that happen to share a body length.
lengths = df["body_length"]
positions = np.unique(np.append(np.arange(0, len(lengths), 100), len(lengths) - 1))
downsampled = lengths.iloc[positions]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Average body length (in characters) over the remaining rows.
mean_body_length = df["body_length"].mean()
mean_body_length

The leaning distribution.

In [None]:
# Number of rows per leaning; observed=True skips categories with no rows.
leaning_counts = df.groupby("leaning", observed=True).size()
leaning_counts.plot.pie(autopct="%1.1f%%");

The distribution of body length sums per leaning.

In [None]:
# Total body length per leaning; observed=True skips categories with no rows.
length_per_leaning = df.groupby("leaning", observed=True)["body_length"].sum()
# ylabel="" hides the column name pandas would otherwise print beside the pie.
length_per_leaning.plot.pie(autopct="%1.1f%%", ylabel="");

Body length distribution of left leaning articles.

In [None]:
df_leaning = df[df["leaning"] == "left"]
# Plot every 100th body length and always include the last (longest) item.
# De-duplicate by row position rather than by value: Series.drop_duplicates()
# would also discard distinct articles that happen to share a body length.
lengths = df_leaning["body_length"]
positions = np.unique(np.append(np.arange(0, len(lengths), 100), len(lengths) - 1))
downsampled = lengths.iloc[positions]
downsampled.plot.bar().xaxis.set_ticks([]);

Body length distribution of center leaning articles.

In [None]:
df_leaning = df[df["leaning"] == "center"]
# Plot every 100th body length and always include the last (longest) item.
# De-duplicate by row position rather than by value: Series.drop_duplicates()
# would also discard distinct articles that happen to share a body length.
lengths = df_leaning["body_length"]
positions = np.unique(np.append(np.arange(0, len(lengths), 100), len(lengths) - 1))
downsampled = lengths.iloc[positions]
downsampled.plot.bar().xaxis.set_ticks([]);

Body length distribution of right leaning articles.

In [None]:
df_leaning = df[df["leaning"] == "right"]
# Plot every 100th body length and always include the last (longest) item.
# De-duplicate by row position rather than by value: Series.drop_duplicates()
# would also discard distinct articles that happen to share a body length.
lengths = df_leaning["body_length"]
positions = np.unique(np.append(np.arange(0, len(lengths), 100), len(lengths) - 1))
downsampled = lengths.iloc[positions]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Write the preprocessed frame to disk in Parquet format.
output_path = "../preprocessed/qbias.parquet"
df.to_parquet(output_path)