In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker


Reading the raw datasets (the election and non-election tweet samples).

In [None]:
# Load the election tweet sample.
# The path is handed to pandas directly (instead of a pre-opened handle) so
# that pandas opens the file itself: the explicit UTF-8 encoding is then the
# one actually used for decoding, and the file is closed once parsing is done.
df_political = pd.read_csv(
    Path(
        "../raw/polibertweet/published_data_polibertweet-LREC-2022_election_sampled_10000.csv"
    ),
    encoding="utf-8",
)
df_political


In [None]:
# Load the non-election tweet sample.
# The path is handed to pandas directly (instead of a pre-opened handle) so
# that pandas opens the file itself: the explicit UTF-8 encoding is then the
# one actually used for decoding, and the file is closed once parsing is done.
df_non_political = pd.read_csv(
    Path(
        "../raw/polibertweet/published_data_polibertweet-LREC-2022_non_election_sampled_10000.csv"
    ),
    encoding="utf-8",
)
df_non_political


Adding the politicalness label column.

In [None]:
# Tag every row of each source frame with its class label; a scalar on the
# right-hand side is broadcast to the whole column.
df_political["politicalness"] = "political"
df_non_political["politicalness"] = "non-political"

# Stack both frames (original row labels are kept) and store the label as a
# categorical column.
df = pd.concat([df_political, df_non_political]).astype(
    {"politicalness": "category"}
)


Renaming columns.

In [None]:
# The tweet text column is called "body" from here on.
df = df.rename({"text": "body"}, axis="columns")


Stripping the bodies.

In [None]:
# Trim surrounding whitespace; bodies that end up empty are turned into NaN so
# they count as missing data.
df["body"] = df["body"].str.strip().replace({"": np.nan})


Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in at least one column.
df = df.dropna(axis="index", how="any")


Printing body duplicates.

In [None]:
# keep=False flags every member of a duplicate group, not only the repeats.
df[df.duplicated(subset="body", keep=False)]


Dropping the body duplicates.

In [None]:
# Of each group of identical bodies only the first occurrence is kept.
df = df.drop_duplicates(subset=["body"], keep="first")


Filling the `@USER` and `HTTPURL` placeholders with generated fake names and URLs.

In [None]:
# A freshly created, seeded generator makes the substituted values repeatable
# each time this cell is run.
fake = Faker()
fake.seed_instance(37)

# The callbacks are invoked once per match, so every placeholder occurrence
# gets its own generated value. The first replacement runs over all rows
# before the second one starts.
df["body"] = (
    df["body"]
    .str.replace("@USER", lambda _: fake.name(), regex=True)
    .str.replace("HTTPURL", lambda _: f"{fake.url()}{fake.uri_path()}", regex=True)
)
df


Adding body length and word count columns, then sorting by body length to inspect the shortest bodies.

In [None]:
# body_length is the number of characters, body_word_count the number of
# whitespace-separated tokens; a missing body would count as empty.
df = df.assign(
    body_length=lambda frame: frame["body"].fillna("").str.len(),
    body_word_count=lambda frame: frame["body"].fillna("").str.split().str.len(),
).sort_values(by="body_length")
df.head()


In [None]:
# Persist the preprocessed frame (including its index) as Parquet.
df.to_parquet(path=Path("../preprocessed/polibertweet.parquet"))
