In [None]:
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt


Reading the raw dataset.

In [None]:
# Fetch version 3 of the News Category dataset; `path` is whatever location
# kagglehub reports for the downloaded files.
path = kagglehub.dataset_download("rmisra/news-category-dataset/versions/3")
# Pass the path itself instead of an `open(...)` handle: pandas then opens and
# closes the file on its own (no leaked file descriptor) and decodes it with
# its own default encoding rather than the platform locale.
# lines=True: the file holds one JSON record per line.
df = pd.read_json(Path(path, "News_Category_Dataset_v3.json"), lines=True)
df


Dropping useless columns.

In [None]:
# These three fields are not used by any later step of this notebook.
df = df.drop(["link", "authors", "date"], axis="columns")


Renaming columns.

In [None]:
# headline -> title, short_description -> body.
df = df.rename(
    {"headline": "title", "short_description": "body"},
    axis="columns",
)


Stripping surrounding whitespace from the titles and the bodies, and marking those left empty as missing.

In [None]:
# Same clean-up for both text columns: trim surrounding whitespace, then turn
# strings that ended up empty into NaN so they count as missing.
for text_column in ("title", "body"):
    stripped = df[text_column].str.strip()
    df[text_column] = stripped.replace("", np.nan)


Dropping rows missing either the category or both the title and the body.

In [None]:
# 1) a row must have a category;
# 2) a row must have at least one of title/body (how="all" only drops rows
#    where both are missing).
df = (
    df
    .dropna(subset=["category"])
    .dropna(subset=["title", "body"], how="all")
)


Dropping rows with potentially politically ambiguous categories.

In [None]:
# Distinct category values still present in the frame, shown for inspection.
df.loc[:, "category"].unique()


In [None]:
# Categories whose rows are removed from the frame below.
ambiguous_categories = [
    "COMEDY",
    "U.S. NEWS",
    "WORLD NEWS",
    "SPORTS",
    "ENTERTAINMENT",
    "WEIRD NEWS",
    "EDUCATION",
    "CRIME",
    "BUSINESS",
    "MEDIA",
    "QUEER VOICES",
    "WOMEN",
    "BLACK VOICES",
    "LATINO VOICES",
    "IMPACT",
    "ARTS & CULTURE",
    "GREEN",
    "THE WORLDPOST",
    "WORLDPOST",
]

# Boolean mask of the rows to discard; keep everything else.
is_ambiguous = df["category"].isin(ambiguous_categories)
df = df.loc[~is_ambiguous]


Printing body duplicates.

In [None]:
# keep=False flags every member of a duplicate group, not just the repeats,
# so all rows sharing a body are displayed together.
df.loc[df.duplicated(subset="body", keep=False)]


Dropping the body duplicates.

In [None]:
# Remove repeated bodies, keeping the first occurrence of each.
# Missing bodies are exempt: pandas treats all NaN values as duplicates of one
# another, so a plain drop_duplicates(subset="body") would also throw away
# every title-only row but the first, even though those rows are not copies
# of each other.
is_repeated_body = df["body"].notna() & df["body"].duplicated()
# .copy() makes the result an independent frame rather than a selection of
# the previous one.
df = df[~is_repeated_body].copy()


Adding the politicalness label column.

In [None]:
# Categories labelled "political"; every other category is "non-political".
political_categories = [
    "POLITICS",
]

# Vectorised labelling: one membership test over the whole column instead of
# a Python-level loop over every row. The resulting labels are the same.
df["politicalness"] = pd.Categorical(
    np.where(
        df["category"].isin(political_categories),
        "political",
        "non-political",
    )
)


Dropping useless columns.

In [None]:
# The category has been folded into the politicalness label, so drop it.
df = df.drop("category", axis="columns")


Inspecting the title and the body length.

In [None]:
# Treat missing text as the empty string so lengths and word counts are 0
# rather than NaN.
title_text = df["title"].fillna("")
body_text = df["body"].fillna("")

# Character lengths and whitespace-separated word counts for both columns,
# then order the rows from the shortest body to the longest for inspection.
df = df.assign(
    title_length=title_text.str.len(),
    title_word_count=title_text.str.split().str.len(),
    body_length=body_text.str.len(),
    body_word_count=body_text.str.split().str.len(),
).sort_values(by="body_length")
df.head()


After the inspection, rows with bodies shorter than 3 words seem to contain no useful value. Removing them.

In [None]:
# Minimum number of words a body must have for its row to be kept.
body_word_count_lower_bound = 3
has_enough_words = df["body_word_count"].ge(body_word_count_lower_bound)
df = df.loc[has_enough_words]


In [None]:
output_path = Path("../preprocessed/news_category.parquet")
# to_parquet does not create missing directories, so make sure the target
# folder exists; otherwise a run on a fresh checkout fails at the last cell.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)
