In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset, concatenate_datasets, load_dataset
from faker import Faker

Reading the raw dataset.

In [None]:
# Whether `load_dataset` may execute dataset-repository code; kept off here.
TRUST_REMOTE_CODE = False

# Pin the dataset to an exact revision so re-runs read the same data.
ds = load_dataset(
    "cardiffnlp/tweet_topic_multi",
    revision="4bce21b1f9211f24ff5ec321db8ea10894e3f425",
    trust_remote_code=TRUST_REMOTE_CODE,
)

# Merge every split into a single frame. `concatenate_datasets` is documented
# to take a list of datasets, so materialise the `dict_values` view explicitly
# instead of relying on it being accepted as-is.
all_splits = list(ds.values())
df = concatenate_datasets(all_splits).to_pandas()
df

Dropping the columns that are not used in the rest of this notebook (`date`, `label`, `id`).

In [None]:
# These columns are not referenced by any later cell.
unused_columns = ["date", "label", "id"]
df = df.drop(columns=unused_columns)

Renaming columns.

In [None]:
# Map of original column name -> name used by the rest of the notebook.
column_renames = {"text": "body"}
df = df.rename(columns=column_renames)

Stripping the bodies.

In [None]:
# Trim surrounding whitespace, then turn bodies that end up empty into NaN
# so the later `dropna()` cell removes them.
stripped_body = df["body"].str.strip()
df["body"] = stripped_body.replace("", np.nan)

Filling username and URL placeholders.

In [None]:
# Seed the generator inside this cell so re-running it yields the same fake values.
fake = Faker()
fake.seed_instance(37)

# Remove the literal "{@" / "@}" delimiters, keeping the text between them.
# `regex=False` is spelled out because the default of `str.replace` differs
# between pandas 1.x (regex) and pandas 2.x (literal).
df["body"] = (
    df["body"]
    .str.replace("{@", "", regex=False)
    .str.replace("@}", "", regex=False)
)

# A callable replacement requires `regex=True`, so the braces are escaped to
# match the placeholders literally rather than relying on how `re` treats
# unbalanced "{" / "}".
# The two passes stay separate (all usernames, then all URLs) so that the
# seeded generator is consumed in the same order as before.
df["body"] = df["body"].str.replace(r"\{\{USERNAME\}\}", lambda _: fake.name(), regex=True)
df["body"] = df["body"].str.replace(r"\{\{URL\}\}", lambda _: f"{fake.url()}{fake.uri_path()}", regex=True)
df

Dropping rows with potentially political bodies.

In [None]:
# Keep a row only when none of its labels is in the excluded set.
political_labels = {"news_&_social_concern"}
is_non_political = df["label_name"].map(political_labels.isdisjoint)
df = df[is_non_political]

Dropping the `label_name` column, which is no longer needed after the filtering above.

In [None]:
# `label_name` was only needed for the row filter; remove it from the frame.
df = df.drop(columns="label_name")

Dropping rows with missing data.

In [None]:
# Remove every row that has a missing value in any remaining column
# (pandas defaults, spelled out).
df = df.dropna(axis="index", how="any")

Printing body duplicates.

In [None]:
# `keep=False` flags every member of a duplicate group, not just the repeats.
has_duplicate_body = df["body"].duplicated(keep=False)
df[has_duplicate_body]

Dropping the body duplicates.

In [None]:
# Keep the first row of each group of identical bodies (the pandas default).
df = df.drop_duplicates(subset=["body"], keep="first")

Adding the politicalness label column.

In [None]:
# Every remaining row gets the same categorical label.
row_count = len(df)
df["politicalness"] = pd.Categorical(["non-political"] * row_count)

Inspecting the body length.

In [None]:
# Treat a missing body as an empty string so both metrics come out as 0 for it.
body_text = df["body"].fillna("")

# Length in characters and in whitespace-separated tokens.
df["body_length"] = body_text.str.len()
df["body_word_count"] = body_text.str.split().str.len()

# Shortest bodies first, then preview the top of the frame.
df = df.sort_values(by="body_length")
df.head()

In [None]:
# Persist the preprocessed frame; the path is relative to the notebook's working directory.
output_path = "../preprocessed/tweet_topic_multi.parquet"
df.to_parquet(output_path)