In [None]:
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd


Reading the raw dataset.

In [None]:
# Fetch version 1 of the dataset through kagglehub; the returned value is
# used below as the directory that contains the CSV file.
path = kagglehub.dataset_download(
    "nbandhi/political-podcasts-listing-with-audio-links/versions/1"
)
# Hand pandas the path rather than an already-open text handle: that way
# `encoding="utf-8"` is actually applied when decoding the file (a handle from
# a bare `open()` decodes with the platform default instead) and pandas closes
# the file itself when it is done reading.
df = pd.read_csv(Path(path, "politicalpodcasts.csv"), encoding="utf-8")
# The CSV's first column has no header, so pandas names it "Unnamed: 0";
# use it as the index and clear the leftover index name.
df = df.set_index("Unnamed: 0").rename_axis(None)
df


Dropping the columns that are not used later (`podcaster`, `pub_date`, `pod_link`).

In [None]:
# Discard the `podcaster`, `pub_date` and `pod_link` columns.
df = df.drop(["podcaster", "pub_date", "pod_link"], axis="columns")


Renaming and reordering columns.

In [None]:
# Rename `abstract` -> `body` and `type` -> `leaning`, then keep exactly the
# three columns below, in this order, as an independent copy.
df = (
    df.rename(columns={"abstract": "body", "type": "leaning"})
    .loc[:, ["title", "body", "leaning"]]
    .copy()
)


Categorizing the leaning label column.

In [None]:
# Show the distinct values present in the `leaning` column before relabelling.
pd.unique(df["leaning"])


In [None]:
# Relabel "liberal"/"conservative" as "left"/"right" (any other value is left
# untouched by `replace`) and store the column with a categorical dtype.
df["leaning"] = (
    df["leaning"]
    .replace({"liberal": "left", "conservative": "right"})
    .astype("category")
)


Adding the politicalness label column.

In [None]:
# Every row receives the same categorical label, "political".
df["politicalness"] = pd.Categorical(["political" for _ in range(len(df))])


Stripping the bodies.

In [None]:
# Trim leading/trailing whitespace from each body; a body that ends up as the
# empty string is turned into NaN so that it counts as missing data.
df["body"] = (
    df["body"]
    .str.strip()
    .replace({"": np.nan})
)


Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in at least one column.
df = df.dropna(axis="index", how="any")


Printing body duplicates.

In [None]:
# Display all rows whose body also occurs in another row (`keep=False` marks
# every member of a duplicate group, not just the repeats).
df.loc[df.duplicated(subset="body", keep=False)]


Dropping the body duplicates.

In [None]:
# For each repeated body keep only its first occurrence.
df = df.drop_duplicates(subset=["body"], keep="first")


Inspecting the title and the body length.

In [None]:
# Add character counts and whitespace-separated word counts for the title and
# the body (missing text is treated as an empty string), then order the rows
# by body character count so the shortest bodies come first.
df = df.assign(
    title_length=lambda frame: frame["title"].fillna("").str.len(),
    title_word_count=lambda frame: frame["title"].fillna("").str.split().str.len(),
    body_length=lambda frame: frame["body"].fillna("").str.len(),
    body_word_count=lambda frame: frame["body"].fillna("").str.split().str.len(),
).sort_values(by="body_length")
df.head()


On inspection, rows whose bodies have fewer than 6 words seem to carry no political content, so they are removed.

In [None]:
# Minimum number of words a body must have for its row to be kept.
body_word_count_lower_bound = 6
# Keep rows at or above the bound; shorter bodies are discarded.
df = df.loc[df["body_word_count"].ge(body_word_count_lower_bound)]


In [None]:
# Persist the preprocessed frame as parquet. The target directory is created
# first (no-op when it already exists) so the write does not fail on a
# checkout where `../preprocessed` has not been made yet.
output_path = Path("../preprocessed/political_podcasts.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)
