In [None]:
import numpy as np
import pandas as pd
from datasets import concatenate_datasets, load_dataset


Reading the raw dataset.

In [None]:
# Load the dataset at a pinned revision so re-runs always see the same data.
ds = load_dataset(
    "JyotiNayak/political_ideologies",
    revision="f748ec8a7cbbf916453ba489fdc9766b9e4f19c8",
)
# `concatenate_datasets` is documented to take a list of datasets, so the
# dict view of splits is materialized before merging them into one frame.
df = concatenate_datasets(list(ds.values())).to_pandas()
# Use the `__index_level_0__` column as the index and clear the axis name.
df = df.set_index("__index_level_0__").rename_axis(None)
df


Dropping the `issue_type` column, which is not used in this notebook.

In [None]:
# Discard the `issue_type` column.
df = df.drop("issue_type", axis="columns")


Renaming columns.

In [None]:
# `statement` becomes `body` and `label` becomes `leaning`.
df = df.rename({"statement": "body", "label": "leaning"}, axis="columns")


Categorizing the leaning label column.

In [None]:
# List the distinct raw label values before recoding them.
df.loc[:, "leaning"].unique()


In [None]:
# Recode 0 as "right" and 1 as "left", then store the result as a categorical.
df["leaning"] = (
    df["leaning"]
    .replace({0: "right", 1: "left"})
    .astype("category")
)


Adding the politicalness label column.

In [None]:
# Every row gets the same categorical label, "political".
df = df.assign(politicalness=pd.Categorical(["political"] * len(df)))


Stripping the bodies.

In [None]:
# Trim surrounding whitespace; bodies that end up empty are marked as missing
# so that a later `dropna` can remove them.
df = df.assign(
    body=df["body"].str.strip().replace(to_replace="", value=np.nan)
)


Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")


Displaying the rows whose body is duplicated (they are only inspected here, not removed).

In [None]:
# keep=False flags every occurrence of a repeated body, not just the later ones.
df.loc[df["body"].duplicated(keep=False), :]


Inspecting the body length.

In [None]:
# Add the character count and the whitespace-separated word count of each
# body, then sort ascending so the shortest bodies appear first.
df = df.assign(
    body_length=lambda frame: frame["body"].fillna("").str.len(),
    body_word_count=lambda frame: frame["body"].fillna("").str.split().str.len(),
).sort_values(by="body_length")
df.head()


In [None]:
# Write the preprocessed frame to disk as Parquet.
df.to_parquet(path="../preprocessed/gpt4_political_ideologies.parquet")
