In [None]:
import json
import os
import zipfile

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

Reading the raw dataset.

In [None]:
raw_directory = "../raw/free-news-datasets/News_Datasets"

# Unpack every archive into the raw directory itself. Entries are visited in
# sorted order because os.listdir() returns them in arbitrary order.
for filename in sorted(os.listdir(raw_directory)):
    if filename.endswith(".zip"):
        path = os.path.join(raw_directory, filename)
        with zipfile.ZipFile(path, "r") as zip_ref:
            zip_ref.extractall(raw_directory)

# Each subdirectory of the raw directory is read as a folder of JSON files,
# one parsed document per file.
data = []
for subdirectory_name in sorted(os.listdir(raw_directory)):
    subdirectory_path = os.path.join(raw_directory, subdirectory_name)

    # Skip the archives (and any other plain files) sitting next to the
    # extracted folders.
    if not os.path.isdir(subdirectory_path):
        continue

    # Sorted so that the resulting row order is reproducible across machines.
    for filename in sorted(os.listdir(subdirectory_path)):
        path = os.path.join(subdirectory_path, filename)
        # Decode explicitly as UTF-8: without it open() falls back to the
        # platform's locale encoding, which can corrupt or reject non-ASCII text.
        with open(path, "r", encoding="utf-8") as file:
            data.append(json.load(file))

df = pd.DataFrame(data)
df

Dropping useless columns.

In [None]:
# Keep only the columns the rest of the notebook works with.
kept_columns = ["title", "text", "categories", "language"]
df = df[kept_columns]

Renaming columns.

In [None]:
# The article text is referred to as "body" from here on.
df = df.rename({"text": "body"}, axis="columns")

Dropping non-English rows.

In [None]:
# List the distinct language values before filtering on them.
languages = df["language"]
languages.unique()

In [None]:
# Keep only the rows whose language is exactly "english".
is_english = df["language"] == "english"
df = df[is_english]

Stripping whitespace from the titles and the bodies, and treating the ones left empty as missing.

In [None]:
# Strip surrounding whitespace from both text columns; values that end up as
# the empty string are replaced by NaN so they count as missing.
df = df.assign(
    title=df["title"].str.strip().replace("", np.nan),
    body=df["body"].str.strip().replace("", np.nan),
)

Dropping rows missing either the categories or both the title and the body.

In [None]:
# First drop rows without categories, then rows where the title and the body
# are both missing (a row with only one of the two is kept).
df = (
    df
    .dropna(subset=["categories"])
    .dropna(subset=["title", "body"], how="all")
)

Adding the politicalness label column.

In [None]:
# Every row starts out labelled "non-political"; the categorical dtype also
# declares the "political" label up front so it can be assigned afterwards.
politicalness_categories = ["political", "non-political"]
default_politicalness = pd.Categorical(
    ["non-political"] * len(df),
    categories=politicalness_categories,
)
df["politicalness"] = default_politicalness

Marking rows with political categories as political.

In [None]:
# Flatten the per-row category collections into one Series and list the
# distinct category names.
all_categories = pd.Series(
    [category for categories in df["categories"] for category in categories]
)
all_categories.unique()

In [None]:
political_categories = {"Politics"}

# A row is political as soon as its categories share at least one entry with
# the set above, i.e. the two are not disjoint.
shares_political_category = ~df["categories"].map(political_categories.isdisjoint)
df.loc[shares_political_category, "politicalness"] = "political"

Dropping rows with potentially politically ambiguous categories.

In [None]:
ambiguous_categories = {
    "Social Issue",
    "War, Conflict and Unrest",
    "Crime, Law and Justice",
    "Environment",
    "Financial and Economic News",
    "Business and Market Analysis",
    "Corporate Governance and CSR",
    "Religion and Belief",
}

# Keep a row only when none of its categories appears in the set above.
is_unambiguous = df["categories"].map(ambiguous_categories.isdisjoint)
df = df[is_unambiguous]

Dropping useless columns.

In [None]:
# Both columns have served their purpose in the filtering steps above.
df = df.drop(["categories", "language"], axis="columns")

Printing body duplicates.

In [None]:
# keep=False flags every member of a group of equal bodies, not just the
# repeats, so each group is shown in full.
has_shared_body = df.duplicated(subset="body", keep=False)
df[has_shared_body]

Dropping the body duplicates.

In [None]:
# Keep the first row of every group of identical bodies. Rows without a body
# are excluded from the check: pandas treats missing values as equal to one
# another when looking for duplicates, so a plain drop_duplicates(subset="body")
# would also collapse all title-only rows into a single one.
is_repeated_body = df.duplicated(subset="body") & df["body"].notna()
df = df[~is_repeated_body]

Inspecting the title and the body length.

In [None]:
# Missing titles/bodies are measured as empty strings, i.e. length 0.
titles = df["title"].fillna("")
bodies = df["body"].fillna("")

# Lengths are character counts; word counts split on whitespace.
df["title_length"] = titles.str.len()
df["title_word_count"] = titles.str.split().str.len()
df["body_length"] = bodies.str.len()
df["body_word_count"] = bodies.str.split().str.len()

# Shortest bodies first.
df = df.sort_values("body_length")
df.head()

In [None]:
# Plot every 100th body length, plus the last row so the final bar is always
# present (the longest body, given `df` is sorted by body length).
body_lengths = df["body_length"]
downsampled = pd.concat([body_lengths.iloc[::100], body_lengths.tail(1)])
# The last row may already be part of the regular sample; remove that second
# copy by index label. De-duplicating by value instead would also discard
# distinct articles that merely share the same body length.
downsampled = downsampled[~downsampled.index.duplicated()]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Median number of whitespace-separated words per title.
title_word_counts = df["title_word_count"]
title_word_counts.median()

In [None]:
# Median number of whitespace-separated words per body.
body_word_counts = df["body_word_count"]
body_word_counts.median()

The politicalness distribution.

In [None]:
# Number of rows per label; observed=True leaves out labels with no rows.
politicalness_counts = df.groupby("politicalness", observed=True).size()
politicalness_counts.plot.pie(autopct="%1.1f%%");

The distribution of body length sums per politicalness.

In [None]:
# Total body length (in characters) per label; the empty ylabel suppresses the
# column name matplotlib would otherwise print next to the pie.
body_length_sums = df.groupby("politicalness", observed=True)["body_length"].sum()
body_length_sums.plot.pie(autopct="%1.1f%%", ylabel="");

The body length distribution by politicalness.

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# One curve per label: every 10th body length of that label, in the frame's
# current row order (sorted by body length), plotted against its position in
# the downsampled sequence.
for politicalness in df["politicalness"].unique():
    df_politicalness = df[df["politicalness"] == politicalness]
    body_lengths = df_politicalness["body_length"]
    # Ensuring to include the last (longest) item.
    downsampled = pd.concat([body_lengths.iloc[::10], body_lengths.tail(1)])
    # The last row may already be part of the regular sample; remove that
    # second copy by index label. De-duplicating by value instead would also
    # discard distinct articles that share a body length and shorten the curve.
    downsampled = downsampled[~downsampled.index.duplicated()].reset_index(drop=True)
    ax.plot(downsampled, label=politicalness)

ax.set_xlabel("downsampled index")
ax.set_ylabel("body length")
ax.set_title("body length by politicalness")
ax.legend(title="politicalness")
plt.show()

In [None]:
# Persist the preprocessed frame, including the length/word-count columns
# added above.
output_path = "../preprocessed/free_news.parquet"
df.to_parquet(output_path)