In [None]:
from pathlib import Path

import kagglehub
import pandas as pd
from matplotlib import pyplot as plt

Reading the raw dataset.

In [None]:
# Fetch version 1 of the dataset; `path` is the local directory returned by kagglehub.
path = kagglehub.dataset_download("nulldata/medium-post-titles/versions/1")
# Hand pandas the path rather than an already-open handle: pandas then opens
# the file itself using the requested encoding and closes it after parsing.
# (A handle from a bare `open()` uses the platform default encoding, makes the
# `encoding` argument ineffective, and is never closed.)
df = pd.read_csv(Path(path, "medium_post_titles.csv"), encoding="utf-8")
df

Dropping rows with truncated subtitles.

In [None]:
# Keep only the rows whose subtitle is not flagged as truncated.
df = df.loc[~df["subtitle_truncated_flag"]]

Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

Merging the titles and subtitles into a body column.

In [None]:
# Build the body as "<title> <subtitle>" from whitespace-trimmed parts.
df = df.assign(
    title=lambda frame: frame["title"].str.strip(),
    subtitle=lambda frame: frame["subtitle"].str.strip(),
).assign(
    # Adding a dot at the end of the title if not already present.
    title=lambda frame: frame["title"].where(frame["title"].str.endswith("."), frame["title"] + "."),
    # Evaluated after the title update above, so the body uses the dotted title.
    body=lambda frame: frame["title"] + " " + frame["subtitle"],
)

Dropping the columns that are no longer needed now that the body column exists.

In [None]:
# The title and subtitle now live in `body`, and the flag was only needed for filtering.
df = df.drop(["title", "subtitle", "subtitle_truncated_flag"], axis="columns")

Dropping rows with potentially politically ambiguous categories.

In [None]:
# List the distinct category values to decide which ones to keep.
pd.unique(df["category"])

In [None]:
# Categories whose posts cannot be reliably labelled as political or
# non-political; rows belonging to any of them are removed below.
ambiguous_categories = [
    "lgbtqia",
    "culture",
    "environment",
    "social-media",
    "media",
    "world",
    "books",
    "economy",
    "equality",
    "art",
    "psychology",
    "technology",
    "space",
    "business",
    "justice",
    "future",
    "podcasts",
    "women",
    "humor",
    "history",
    "philosophy",
    "basic-income",
    "comics",
    "education",
    "language",
    "san-francisco",
    "gun-control",
    "medium-magazine",
    "race",
    "immigration",
    "privacy",
]

# Keep the rows whose category is not in the ambiguous list.
df = df.loc[~df["category"].isin(ambiguous_categories)]

Adding the politicalness label column.

In [None]:
# Categories that mark a post as political; every other category still present
# in the frame is labelled non-political.
political_categories = [
    "politics",
    "election-2020",
]

# `assign` returns a new frame instead of writing a column into `df`, which at
# this point is a boolean-filtered selection of an earlier frame; that avoids
# pandas' SettingWithCopyWarning. The membership test is vectorised with `isin`
# and the resulting labels are stored with a categorical dtype.
df = df.assign(
    politicalness=df["category"]
    .isin(political_categories)
    .map({True: "political", False: "non-political"})
    .astype("category")
)

Dropping the category column, which is superseded by the politicalness label.

In [None]:
# The raw category is no longer needed once the politicalness label exists.
df = df.drop("category", axis="columns")

Inspecting body length.

In [None]:
# Character count and whitespace-separated word count of each body, with the
# frame ordered from the shortest to the longest body.
df = df.assign(
    body_length=lambda frame: frame["body"].str.len(),
    body_word_count=lambda frame: frame["body"].str.split().str.len(),
).sort_values("body_length")
# Show the 100 shortest bodies.
df.iloc[:100]

In [None]:
# Take every 100th body length (the frame was sorted by body_length in an
# earlier cell) and make sure the last (longest) item is included.
downsampled = pd.concat([df["body_length"].iloc[::100], df["body_length"].tail(1)])
# The last row appears twice when it already falls on the stride, so remove the
# repeat by index label. De-duplicating by value (`drop_duplicates`) would
# discard every sample whose length repeats and distort the plotted curve.
# NOTE(review): relies on unique index labels, which holds for the default
# index from read_csv after the filtering and sorting done so far.
downsampled = downsampled[~downsampled.index.duplicated(keep="first")]
downsampled.plot.bar().xaxis.set_ticks([]);

In [None]:
# Average body length in characters.
df.loc[:, "body_length"].mean()

The politicalness distribution.

In [None]:
# Pie chart of the number of rows per politicalness class.
(
    df.groupby("politicalness", observed=True)
    .size()
    .plot(kind="pie", autopct="%1.1f%%")
);

The share of the total body length contributed by each politicalness class.

In [None]:
# Pie chart of the summed body length per politicalness class.
(
    df.groupby("politicalness", observed=True)["body_length"]
    .sum()
    .plot(kind="pie", autopct="%1.1f%%", ylabel="")
);

Body length distribution by politicalness.

In [None]:
# One line per politicalness class: the class's body lengths (already sorted
# ascending by an earlier cell), downsampled to every 10th row.
plt.figure(figsize=(10, 6))

for politicalness in df["politicalness"].unique():
    df_politicalness = df[df["politicalness"] == politicalness]
    downsampled = pd.concat(
        # Ensuring to include the last (longest) item.
        [df_politicalness["body_length"].iloc[::10], df_politicalness["body_length"].tail(1)]
    )
    # Remove the last row if the stride already sampled it, matching on index
    # label. De-duplicating by value (`drop_duplicates`) would discard every
    # sample whose length repeats and distort the curve.
    # NOTE(review): relies on unique index labels, which holds for the default
    # index from read_csv after the filtering and sorting done so far.
    downsampled = downsampled[~downsampled.index.duplicated(keep="first")]
    # Re-number from 0 so that every class is plotted against its own sample position.
    downsampled = downsampled.reset_index(drop=True)
    plt.plot(downsampled, label=politicalness)

plt.xlabel("downsampled index")
plt.ylabel("body length")
plt.title("body length by politicalness")
plt.legend(title="politicalness")
plt.show()

In [None]:
# Write the preprocessed frame to a Parquet file.
df.to_parquet(path="../preprocessed/medium_post_titles.parquet")