# OpenCampus NLP Project
## Tweet Generator for famous Twitter personalities
-----------
This notebook preprocesses the Tweets.

## Imports

In [None]:
import functools
import os
import random
import re

import pandas as pd

import datasets
import matplotlib.pyplot as plt

from datasets import load_dataset, load_from_disk
from pathlib import Path

In [None]:
# Use a monospaced font family for all plot text, with DejaVu Sans Mono as the
# preferred monospace face.
plt.rcParams.update(
    {
        "font.monospace": ["DejaVu Sans Mono"],
        "font.family": "monospace",
    }
)

## Prepare the dataset for Training
### Download the dataset

First, we download our custom HuggingFace (HF) dataset. The dataset can be found on our [HuggingFace site](https://huggingface.co/datasets/ML-Projects-Kiel/tweetyface). It contains Tweets from English and German Twitter users.

In [None]:
# Load the "english" configuration of the tweetyface dataset from the Hub.
# download_mode="force_redownload" requests a fresh download instead of reusing
# a previously cached copy.
dataset = load_dataset(
    path="ML-Projects-Kiel/tweetyface",
    name="english",
    download_mode="force_redownload",
)

The dataset is already split into a training and a validation subset. It contains no test data, because the text generation task does not require test data.

In [None]:
# Show the loaded dataset object to inspect what was downloaded.
dataset

In [None]:
# Inspect the column schema of the training split (the "label" feature holds
# the user names used further below).
dataset["train"].features

### Preprocess the text

#### Remove retweets

In [None]:
# Label index -> user name lookup, taken from the "label" feature of the
# training split.
userlist = dataset["train"].features["label"].names

# Stack both splits into one pandas DataFrame so statistics cover all tweets.
all_data_pd = datasets.concatenate_datasets(
    [dataset[split] for split in ("train", "validation")]
).to_pandas()

# Add a readable "user" column by translating each numeric label.
all_data_pd["user"] = all_data_pd["label"].map(lambda label: userlist[label])

In [None]:
# Per-user tweet counts: all tweets next to those whose text starts with "RT ".
# Users without any retweet get NaN in the "retweets" column because the two
# series are aligned on the user index.
stats = pd.concat(
    {
        "all_tweets": all_data_pd.groupby("user").size(),
        "retweets": all_data_pd[all_data_pd["text"].str.contains("^RT .*")]
        .groupby("user")
        .size(),
    },
    axis=1,
)

stats

In [None]:
# Drop retweets: keep only rows whose text does not match the "RT " prefix
# pattern (re.search returns None when there is no match).
dataset_without_rt = dataset.filter(
    lambda row: re.search("^RT .*", row["text"]) is None
)

#### Filter characters

In [None]:
# Draw 10 distinct random row indices from the retweet-free training split.
ids = random.sample(range(dataset_without_rt["train"].num_rows), k=10)
ids

In [None]:
# Overwrite the random sample with a fixed set of row indices so the same
# example tweets are shown on every run.
# NOTE(review): these indices are hard-coded; they must stay below the number
# of rows in the retweet-free training split - confirm if the dataset changes.
ids = [12784, 15988, 14475, 10926, 23442, 6853, 22511, 18022, 13039, 22725]
dataset_without_rt["train"][ids]["text"]

In [None]:
# Compiled once at definition time so batched calls reuse the same patterns.
_URL_PATTERN = re.compile(r"http\S+")
_HTML_ENTITY_PATTERN = re.compile(r"&(amp|lt|gt);")
_HTML_ENTITIES = {"amp": "&", "lt": "<", "gt": ">"}


def _clean_tweet(txt: str) -> str:
    """Return *txt* with URLs masked, HTML entities decoded and whitespace collapsed."""
    txt = _URL_PATTERN.sub("URL", txt)
    # Decode the three entities in a single pass so an escaped entity such as
    # "&amp;lt;" becomes "&lt;" instead of being unescaped twice into "<".
    txt = _HTML_ENTITY_PATTERN.sub(lambda match: _HTML_ENTITIES[match.group(1)], txt)
    # str.split() without arguments splits on any run of whitespace (newlines
    # included), so re-joining with single spaces normalises and trims the text.
    return " ".join(txt.split())


def preprocess_text(element: dict[str, object]) -> dict[str, object]:
    """Clean the ``"text"`` entry of a single row or of a batch of rows.

    Each tweet has every ``http...`` token replaced by the literal ``URL``,
    the HTML entities ``&amp;``, ``&lt;`` and ``&gt;`` decoded, and all
    whitespace runs (including newlines) collapsed to single spaces.

    Args:
        element: Mapping with a ``"text"`` entry that is either one string
            (single row) or a sequence of strings (batch).

    Returns:
        The same ``element`` object, with ``"text"`` replaced by the cleaned
        string or by a list of cleaned strings.
    """
    text = element["text"]
    # Dispatch on the value itself rather than on a library-internal batch
    # class, so plain dicts work and single rows are cleaned as well.
    if isinstance(text, str):
        element["text"] = _clean_tweet(text)
    else:
        element["text"] = [_clean_tweet(txt) for txt in text]
    return element

In [None]:
# Clean the tweet texts of the retweet-free dataset; batched=True lets map
# hand preprocess_text whole batches instead of single rows.
dataset_processed = dataset_without_rt.map(preprocess_text, batched=True)

In [None]:
# Show the same example rows again, now after preprocessing, for comparison.
dataset_processed["train"][ids]["text"]

#### Filter Text length

In [None]:
# Keep only tweets whose cleaned text is strictly longer than this many
# characters; texts of exactly min_text_length characters are dropped.
min_text_length = 50
dataset_processed = dataset_processed.filter(
    lambda row: len(row["text"]) > min_text_length
)

In [None]:
# Append a per-user count of short tweets (length <= min_text_length) to the
# statistics table.
# NOTE(review): the count is taken from all_data_pd, i.e. the raw texts
# including retweets and before preprocessing, so it need not equal the number
# of rows the length filter removed from dataset_processed - confirm intent.
# NOTE(review): re-running this cell adds another column to stats, after which
# the three-name column assignment below no longer fits.
stats = pd.concat(
    [
        stats,
        all_data_pd[all_data_pd["text"].apply(len) <= min_text_length]
        .groupby("user")
        .size(),
    ],
    axis=1,
)
stats.columns = ["all_tweets", "retweets", "short_tweets"]

stats

In [None]:
# Look at the first remaining training row after cleaning and length filtering.
dataset_processed["train"][0]

### Create Prompts

In [None]:
def create_prompt(
    element: dict[str, object], userlist: list[str]
) -> dict[str, object]:
    """Add a ``"text_prompt"`` entry of the form ``User: <name>\\nTweet: <text>``.

    Args:
        element: Mapping with ``"text"`` and ``"label"`` entries. For a single
            row these are one string and one integer label; for a batch they
            are equally ordered sequences of strings and labels.
        userlist: User names indexed by label, used to turn a label into the
            name written into the prompt.

    Returns:
        The same ``element`` object with ``"text_prompt"`` set to one prompt
        string (single row) or to a list of prompt strings (batch).
    """
    text, label = element["text"], element["label"]
    # Dispatch on the value itself rather than on a library-internal batch
    # class, so a plain dict of lists is treated as a batch too.
    if isinstance(text, str):
        element["text_prompt"] = f"User: {userlist[label]}\nTweet: {text}"
    else:
        element["text_prompt"] = [
            f"User: {userlist[lbl]}\nTweet: {txt}" for txt, lbl in zip(text, label)
        ]
    return element

In [None]:
# List of all user names, indexed by label value.
full_features = dataset["train"].features["label"].names
# Bind the user list so the resulting callable takes only the row/batch
# argument that map passes in.
create_prompt_partial = functools.partial(create_prompt, userlist=full_features)

In [None]:
# Add the "text_prompt" column to the processed dataset, batch by batch.
dataset_proc_prompt = dataset_processed.map(create_prompt_partial, batched=True)

In [None]:
# Check the first training row, which now also carries the generated prompt.
dataset_proc_prompt["train"][0]

### Filter the Users
The full dataset contains more users than we want to use for the first trials. Therefore we will reduce the number of users.

In [None]:
# Subset of user names to keep for the first trials; compared against the
# names in full_features when filtering.
short_features = [
    "MKBHD",
    "elonmusk",
    "katyperry",
    "neiltyson",
    "BillGates",
    "BillNye",
    "BarackObama",
]

In [None]:
# Keep only rows whose label resolves to one of the selected users.
# NOTE(review): the variable name is spelled "datset_" (missing "a"); it is
# left unchanged here because the cells below refer to it under this name.
datset_proc_prompt_filter = dataset_proc_prompt.filter(
    lambda row: full_features[row["label"]] in short_features
)

### Final Dataset

In [None]:
# Location of the final dataset, relative to the directory resolved below.
feature_data_path = os.path.join("data", "feature", "final_dataset")
# How many directory levels to go up from the current working directory to
# reach the base directory for feature_data_path.
levels = 2

In [None]:
# os.path.abspath("") is the absolute current working directory; parents[0] is
# its direct parent, so parents[levels - 1] climbs exactly `levels` levels.
# NOTE(review): this depends on the directory the notebook is started from -
# presumably the base reached this way is the repository root; confirm.
parent_path = Path(os.path.abspath("")).parents[levels - 1]
feature_dir = os.path.join(parent_path, feature_data_path)

In [None]:
# Persist the filtered dataset (all splits) under feature_dir.
datset_proc_prompt_filter.save_to_disk(feature_dir)

In [None]:
# Read the dataset back from feature_dir, replacing the in-memory object -
# presumably to check that the saved copy loads correctly.
datset_proc_prompt_filter = load_from_disk(feature_dir)

In [None]:
# Convenience handles for the two splits of the reloaded dataset.
dataset_train = datset_proc_prompt_filter["train"]
dataset_val = datset_proc_prompt_filter["validation"]

In [None]:
# Final sanity check: show the first row of the training split.
dataset_train[0]