In [None]:
import gc
from pathlib import Path

import numpy as np
import pandas as pd

from utils.dataset_utils import PoliticalLeaningDataset


Reading the raw dataset.

In [None]:
# Load the "left" file of the raw dataset. The path is handed to pandas
# directly so that pandas opens the file itself: this way the requested UTF-8
# encoding is actually applied (a handle from a bare open() is decoded with the
# platform's default encoding instead) and the file is closed after parsing.
df_left = pd.read_json(
    Path("../raw/bignewsbln/BIGNEWSBLN_left.json"), encoding="utf-8"
)
df_left


In [None]:
# Load the "center" file of the raw dataset. The path is handed to pandas
# directly so that pandas opens the file itself: this way the requested UTF-8
# encoding is actually applied (a handle from a bare open() is decoded with the
# platform's default encoding instead) and the file is closed after parsing.
df_center = pd.read_json(
    Path("../raw/bignewsbln/BIGNEWSBLN_center.json"), encoding="utf-8"
)
df_center


In [None]:
# Load the "right" file of the raw dataset. The path is handed to pandas
# directly so that pandas opens the file itself: this way the requested UTF-8
# encoding is actually applied (a handle from a bare open() is decoded with the
# platform's default encoding instead) and the file is closed after parsing.
df_right = pd.read_json(
    Path("../raw/bignewsbln/BIGNEWSBLN_right.json"), encoding="utf-8"
)
df_right


Adding the leaning label column.

In [None]:
# Tag every article with the leaning of the file it was loaded from. A scalar
# is broadcast to all rows, so no per-row Python list has to be built.
df_left["leaning"] = "left"
df_center["leaning"] = "center"
df_right["leaning"] = "right"

# Stack the three frames. ignore_index=True gives the result a fresh, unique
# RangeIndex; otherwise each frame's own row labels would be carried over and
# repeat across the combined frame.
df = pd.concat([df_left, df_center, df_right], ignore_index=True)

# The per-leaning frames are no longer needed; release them before continuing.
del df_left
del df_center
del df_right
gc.collect()

# Only three distinct labels exist, so store them as a categorical column.
df["leaning"] = df["leaning"].astype("category")


Adding the politicalness label column.

In [None]:
# Every row receives the same "political" label, stored as a categorical.
n_rows = len(df)
political_labels = pd.Categorical(["political"] * n_rows)
df["politicalness"] = political_labels


Dropping useless columns.

In [None]:
# Discard the columns that are not carried into the preprocessed dataset.
df = df.drop(["date", "url", "source", "html"], axis="columns")


Renaming and reordering columns.

In [None]:
# Rename "text" to "body", then keep only the four output columns in their
# final order; .copy() yields an independent frame.
df = (
    df.rename(columns={"text": "body"})
    .loc[:, ["title", "body", "leaning", "politicalness"]]
    .copy()
)


Joining the body parts.

In [None]:
# Each body is an iterable of text parts; glue the parts together with single
# spaces so that the column holds one string per row.
df["body"] = df["body"].map(" ".join)


Stripping the titles and the bodies.

In [None]:
# Trim surrounding whitespace in both text columns and turn strings that end up
# empty into NaN, so that they count as missing values.
for text_column in ("title", "body"):
    stripped = df[text_column].str.strip()
    df[text_column] = stripped.replace("", np.nan)


Dropping rows missing either the leaning or both the title and the body.

In [None]:
# First remove rows without a leaning label, then rows in which the title and
# the body are both missing (how="all"); a row with only one of them is kept.
df = (
    df.dropna(subset=["leaning"])
    .dropna(subset=["title", "body"], how="all")
)


Printing body duplicates.

In [None]:
# keep=False flags every member of a group of identical bodies, so all the
# copies are displayed, not only the repeated ones.
df.loc[df.duplicated(subset="body", keep=False)]


Dropping the body duplicates.

In [None]:
# Drop rows whose body repeats an earlier row's body, keeping the first
# occurrence. Rows with a missing body are always kept: pandas treats missing
# values as equal to each other, so a plain drop_duplicates(subset="body")
# would collapse all title-only rows into a single one.
is_repeated_body = df["body"].duplicated(keep="first") & df["body"].notna()
# Index with a plain boolean array so that no label alignment is involved.
df = df[~is_repeated_body.to_numpy()]


Computing the title and body lengths (in characters and in words) and sorting by body length.

In [None]:
# For each text column, add a character count and a whitespace-separated word
# count; missing texts are treated as empty strings and therefore count as 0.
for column in ("title", "body"):
    text = df[column].fillna("")
    df[f"{column}_length"] = text.str.len()
    df[f"{column}_word_count"] = text.str.split().str.len()

# Shortest bodies first, so that the preview below shows the extreme cases.
df = df.sort_values(by="body_length")
df.head()


Downsampling the dataset.

In [None]:
# Size passed to the sampling helper below.
SAMPLE_SIZE = 100_000

# Wrap the frame in the project's dataset helper and replace df with the
# dataframe of the sample it returns.
# NOTE(review): judging by its name, the helper draws a class-balanced sample
# of SAMPLE_SIZE rows in total -- confirm against utils.dataset_utils.
dataset = PoliticalLeaningDataset("bignewsbln", df)
balanced_sample = dataset.take_even_class_sample_by_size(SAMPLE_SIZE)
df = balanced_sample.dataframe


In [None]:
# Write the resulting frame to the preprocessed data directory as Parquet.
df.to_parquet(path="../preprocessed/bignewsbln.parquet")
