In [None]:
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt


Reading the raw dataset.

In [None]:
# Fetch version 1 of the "medium-post-titles" dataset; the returned location is
# used below as the directory that contains the CSV file.
path = kagglehub.dataset_download("nulldata/medium-post-titles/versions/1")

# Hand pandas the path rather than a pre-opened text handle: pandas then opens
# and closes the file itself, and the requested UTF-8 encoding is actually used
# (a handle from a bare `open()` is decoded with the platform default encoding
# and is never closed).
df = pd.read_csv(Path(path, "medium_post_titles.csv"), encoding="utf-8")
df


Renaming the `subtitle` column to `body`.

In [None]:
# The post subtitle is referred to as the "body" from here on.
df = df.rename({"subtitle": "body"}, axis="columns")


Dropping rows with truncated subtitles.

In [None]:
# Keep only the rows whose truncation flag is not set.
subtitle_is_complete = ~df["subtitle_truncated_flag"]
df = df[subtitle_is_complete]


Stripping surrounding whitespace from the titles and the bodies, and treating the resulting empty strings as missing values.

In [None]:
# Trim surrounding whitespace in both text columns; a value that ends up as an
# empty string is replaced by NaN so that it counts as missing.
for text_column in ("title", "body"):
    stripped = df[text_column].str.strip()
    df[text_column] = stripped.replace("", np.nan)


Dropping rows missing either the category or both the title and the body.

In [None]:
# First discard rows without a category, then rows in which the title and the
# body are both missing (a row with only one of them is kept).
df = (
    df.dropna(subset=["category"])
    .dropna(subset=["title", "body"], how="all")
)


Dropping rows with potentially politically ambiguous categories.

In [None]:
# Distinct category values that are still present in the data.
pd.unique(df["category"])


In [None]:
# Categories that cannot be confidently labelled as political or non-political;
# rows belonging to any of them are removed below.
ambiguous_categories = [
    "lgbtqia",
    "culture",
    "environment",
    "social-media",
    "media",
    "world",
    "books",
    "economy",
    "equality",
    "art",
    "psychology",
    "technology",
    "space",
    "business",
    "justice",
    "future",
    "podcasts",
    "women",
    "humor",
    "history",
    "philosophy",
    "basic-income",
    "comics",
    "education",
    "language",
    "san-francisco",
    "gun-control",
    "medium-magazine",
    "race",
    "immigration",
    "privacy",
]

has_ambiguous_category = df["category"].isin(ambiguous_categories)
df = df[~has_ambiguous_category]


Adding the politicalness label column.

In [None]:
# Categories whose posts are labelled "political"; every other category that is
# still present is labelled "non-political".
political_categories = [
    "politics",
    "election-2020",
]

# Label all rows at once instead of looping over them in Python. The category
# set is spelled out explicitly (in the same order that inference yields when
# both labels occur) so the resulting dtype does not depend on which labels
# happen to be present in the data.
df["politicalness"] = pd.Categorical(
    np.where(
        df["category"].isin(political_categories),
        "political",
        "non-political",
    ),
    categories=["non-political", "political"],
)


Dropping useless columns.

In [None]:
# Neither column is used after this point: the category has been turned into
# the politicalness label and the truncation flag was only needed for filtering.
df = df.drop(["category", "subtitle_truncated_flag"], axis="columns")


Inspecting the title and the body length.

In [None]:
# Missing titles/bodies are treated as empty text, so their counts are 0.
title_text = df["title"].fillna("")
body_text = df["body"].fillna("")

# Add character and whitespace-separated word counts for both text columns,
# then order the rows by body length so the shortest bodies are shown first.
df = df.assign(
    title_length=title_text.str.len(),
    title_word_count=title_text.str.split().str.len(),
    body_length=body_text.str.len(),
    body_word_count=body_text.str.split().str.len(),
).sort_values(by="body_length")
df.head()


After the inspection, non-empty bodies shorter than 4 words seem to contain no useful value. Removing those rows, while keeping the rows that have no body at all.

In [None]:
# Minimum number of words a non-empty body must have for its row to be kept.
body_word_count_lower_bound = 4

# Rows without any body text (word count 0) are kept as they are.
has_no_body = df["body_word_count"].eq(0)
has_long_enough_body = df["body_word_count"].ge(body_word_count_lower_bound)
df = df[has_no_body | has_long_enough_body]


In [None]:
# Persist the preprocessed frame. The target directory is created first so the
# export does not fail when the notebook runs in a checkout where it is missing.
output_path = Path("../preprocessed/medium_post_titles.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)
