In [None]:
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd


Reading the raw dataset.

In [None]:
# Fetch version 7 of the PENS dataset through kagglehub; `path` is the
# location returned by the download call.
path = kagglehub.dataset_download(
    "divyapatel4/microsoft-pens-personalized-news-headlines/versions/7"
)
# Hand pandas the file path (not an already-open handle) so that pandas
# itself opens the file, decodes it as UTF-8 and closes it after reading.
df = pd.read_csv(Path(path, "PENS", "news.tsv"), sep="\t", encoding="utf-8")
df


Dropping columns that are not needed for further processing.

In [None]:
# Columns that the rest of the preprocessing does not read.
unused_columns = ["News ID", "Category", "Title entity", "Entity content"]
df = df.drop(columns=unused_columns)


Renaming columns.

In [None]:
# Map the dataset's original column names to the short lowercase names
# used by the following cells.
column_names = {"Topic": "topic", "Headline": "title", "News body": "body"}
df = df.rename(columns=column_names)


Stripping surrounding whitespace from the titles and the bodies, and treating the ones left empty as missing.

In [None]:
# Trim surrounding whitespace; values that end up as empty strings are
# replaced with NaN so that they are treated as missing values.
for text_column in ("title", "body"):
    stripped = df[text_column].str.strip()
    df[text_column] = stripped.replace("", np.nan)


Dropping rows missing either the topic or both the title and the body.

In [None]:
# A row survives only if it has a topic and at least one of title/body.
df = (
    df.dropna(subset=["topic"])
    .dropna(how="all", subset=["title", "body"])
)


Dropping rows with potentially politically ambiguous topics.

In [None]:
# Show the distinct topic values currently present in the data.
topics = df["topic"]
topics.unique()


In [None]:
# Topics whose rows are removed from the dataset altogether.
ambiguous_topics = [
    "ads-tax-proof",
    "binge",
    "causes",
    "causes-food-insecurity",
    "causes-homelessness",
    "causes-human-rights",
    "causes-poverty",
    "columnists",
    "comedy",
    "downtime",
    "finance-companies",
    "finance-education",
    "finance-healthcare",
    "finance-insurance",
    "finance-retirement",
    "finance-saving-investing",
    "finance-top-stocks",
    "finance-top-stories",
    "finance-video",
    "financenews",
    "healthtrending",
    "indepth",
    "lifestyleroyals",
    "localnews",
    "markets",
    "movievideo",
    "news",
    "news-causes-lgbt",
    "newslocal",
    "newsopinion",
    "newsphotos",
    "newsscienceandtechnology",
    "newstrends",
    "newsuk",
    "newsus",
    "newsvideo",
    "newsvideos",
    "newsworld",
    "northamerica-video",
    "people-places",
    "peopleandplaces",
    "seasonalvideos",
    "spotlight",
    "topnews",
    "tunedin",
    "tv",
    "tv-gallery",
    "tv-recaps",
    "tvnews",
    "viral",
    "watch",
]

# Keep only the rows whose topic is not in the list above.
is_ambiguous = df["topic"].isin(ambiguous_topics)
df = df[~is_ambiguous]


Printing body duplicates.

In [None]:
# keep=False flags every occurrence of a repeated body, not only the
# later ones, so all copies are displayed together.
has_repeated_body = df["body"].duplicated(keep=False)
df[has_repeated_body]


Dropping the body duplicates.

In [None]:
# Keep only the first row for each distinct body text.
# NOTE(review): pandas treats missing values as equal to each other here,
# so rows without a body also collapse into a single row — confirm that
# this is acceptable.
df = df.drop_duplicates(subset=["body"], keep="first")


Adding the politicalness label column.

In [None]:
# Topics whose rows are labeled "political"; every other remaining topic
# is labeled "non-political".
political_topics = [
    "brexit",
    "narendramodi_opinion",
    "newsfactcheck",
    "newspolitics",
    "politicsvideo",
    "royals",
]

# Vectorized membership test instead of a Python-level loop over the rows.
is_political = df["topic"].isin(political_topics)

# Declare both categories explicitly so that the column has the same
# categorical schema even when one of the labels does not occur in the data.
df["politicalness"] = pd.Categorical(
    np.where(is_political, "political", "non-political"),
    categories=["non-political", "political"],
)


Dropping the topic column, which is no longer needed once the politicalness label has been added.

In [None]:
# The politicalness label has already been derived from the topic, so the
# topic column itself is removed.
df = df.drop(columns="topic")


Inspecting the title and the body length.

In [None]:
# For both text columns, record the character count and the number of
# whitespace-separated words; missing text counts as zero.
for text_column in ("title", "body"):
    text = df[text_column].fillna("")
    df[f"{text_column}_length"] = text.str.len()
    df[f"{text_column}_word_count"] = text.str.split().str.len()

# Order the rows by body length so that the shortest bodies come first.
df = df.sort_values(by="body_length")
df.head()


After the inspection, rows whose bodies are shorter than 4 words appear to carry no useful content. Removing them.

In [None]:
# Minimum number of whitespace-separated words a body needs to be kept.
body_word_count_lower_bound = 4
is_long_enough = df["body_word_count"].ge(body_word_count_lower_bound)
df = df[is_long_enough]


In [None]:
# Write the preprocessed frame to disk. The output directory is created
# first so that the cell also works when the directory does not exist yet.
output_path = Path("../preprocessed/pens.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)
