# OpenCampus NLP Project
## Tweet Generator for famous Twitter personalities
-----------
This notebook preprocesses the Tweets.

## Imports

In [None]:
import functools
import os

import datasets
from datasets import load_dataset, load_from_disk
from pathlib import Path

## Prepare the dataset for Training
### Download the dataset

First we download our custom HuggingFace (HF) dataset. The dataset can be found on our [HuggingFace site](https://huggingface.co/datasets/ML-Projects-Kiel/tweetyface). It contains Tweets from English and German Twitter users.

In [None]:
# Load the "english" configuration of the tweetyface dataset.
dataset = load_dataset("ML-Projects-Kiel/tweetyface", "english")

The dataset is already split into a training and a validation subset. It contains no test subset, because the text generation task does not require test data.

In [None]:
# Bare expression: the notebook renders the loaded dataset object as cell output.
dataset

In [None]:
# Inspect the features (column schema) of the training split.
dataset["train"].features

### Preprocess the text
#### Add the `Tweet: ` prefix

In [None]:
# Length threshold in characters for the text-length filter further down;
# only texts strictly longer than this value are kept.
min_text_length = 50

In [None]:
def preprocess_text(element: dict[str, object]) -> dict[str, object]:
    """Prefix the ``text`` entry with ``"Tweet: "``.

    Handles a single example (``text`` is one string) as well as a batch
    (``text`` is a sequence of strings), so the function can be passed to
    ``map`` with or without ``batched=True``.

    The layout is detected from the value stored under ``"text"`` instead of
    from the container class. This keeps plain dicts working and removes the
    dependency on the ``datasets.arrow_dataset.Batch`` class.

    Args:
        element: Mapping with a ``"text"`` entry holding either one string or
            a sequence of strings.

    Returns:
        The same ``element`` object, with ``"text"`` replaced in place.
    """
    text = element["text"]
    if isinstance(text, str):
        # Single example.
        element["text"] = f"Tweet: {text}"
    else:
        # Batch: one entry per row.
        element["text"] = [f"Tweet: {txt}" for txt in text]
    return element

In [None]:
# Run preprocess_text over the dataset; batched=True hands it batches of rows
# instead of single rows.
dataset_processed = dataset.map(preprocess_text, batched=True)

#### Filter Text length

In [None]:
# Keep only rows whose text is strictly longer than min_text_length characters.
# NOTE(review): "text" already carries the 7-character "Tweet: " prefix here, so
# the prefix counts toward the threshold — confirm this is intended.
dataset_processed = dataset_processed.filter(lambda row: len(row["text"]) > min_text_length)

In [None]:
# Bare expression: the notebook renders the filtered dataset object as cell output.
dataset_processed

In [None]:
# Look at the first training row after preprocessing.
dataset_processed["train"][0]

### Create Prompts

In [None]:
def create_prompt(element: dict[str, object], userlist: list[str]) -> dict[str, object]:
    """Add a ``text_prompt`` entry: a ``User: <name>`` line followed by a ``Tweet: <text>`` line.

    Handles a single example (``text`` is one string, ``label`` one index) as
    well as a batch (``text`` and ``label`` are parallel sequences). The layout
    is detected from the value stored under ``"text"``, so plain dicts work and
    the function does not depend on the ``datasets.arrow_dataset.Batch`` class.

    If the text already starts with ``"Tweet: "`` (as it does after
    ``preprocess_text``), that one leading prefix is dropped before the prompt
    is built, so the prompt does not contain ``"Tweet: Tweet: "``.

    Args:
        element: Mapping with ``"text"`` and ``"label"`` entries.
        userlist: User names; a label is used as an index into this list.

    Returns:
        The same ``element`` object, with ``"text_prompt"`` added in place.
    """
    prefix = "Tweet: "
    text = element["text"]
    labels = element["label"]
    if isinstance(text, str):
        # Single example.
        element["text_prompt"] = f"User: {userlist[labels]}\n{prefix}{text.removeprefix(prefix)}"
    else:
        # Batch: texts and labels are aligned row by row.
        element["text_prompt"] = [
            f"User: {userlist[label]}\n{prefix}{txt.removeprefix(prefix)}"
            for txt, label in zip(text, labels)
        ]
    return element

In [None]:
# The names of the "label" feature are the users; a label value indexes this list.
label_feature = dataset["train"].features["label"]
full_features = label_feature.names
# Pre-bind the user list so the resulting callable takes only the element.
create_prompt_partial = functools.partial(create_prompt, userlist=full_features)

In [None]:
# Add the "text_prompt" column by running create_prompt over batches of rows.
dataset_proc_prompt = dataset_processed.map(create_prompt_partial, batched=True)

In [None]:
# Look at the first training row, now including the generated prompt.
dataset_proc_prompt["train"][0]

### Filter the Users
The full dataset contains more users than we want to use for the first trials. Therefore, we restrict the dataset to a small selection of users.

In [None]:
# User names to keep; rows of all other users are filtered out.
short_features = [
    "KBHD",
    "elonmusk",
    "alyankovic",
    "GretaThunberg",
    "BarackObama",
    "Trevornoah",
]

In [None]:
def _is_selected_user(row):
    """Return True when the row's label maps to a name listed in ``short_features``."""
    return full_features[row["label"]] in short_features


# NOTE: the variable name is spelled "datset" on purpose here — the cells that
# save, reload and split the dataset refer to it under exactly this name.
datset_proc_prompt_filter = dataset_proc_prompt.filter(_is_selected_user)

### Final Dataset

In [None]:
# Relative location of the final dataset below the base directory.
feature_data_path = str(Path("data") / "feature" / "final_dataset")
# How many directory levels above the working directory the base directory lies.
levels = 2

In [None]:
# Go up `levels` directories from the current working directory, then descend
# into the relative dataset location. feature_dir is kept as a plain string.
parent_path = Path.cwd().parents[levels - 1]
feature_dir = str(parent_path / feature_data_path)

In [None]:
# Write the filtered dataset to feature_dir.
datset_proc_prompt_filter.save_to_disk(feature_dir)

In [None]:
# Read the dataset back from feature_dir, so the following cells use the
# on-disk copy rather than the in-memory result.
datset_proc_prompt_filter = load_from_disk(feature_dir)

In [None]:
# Pull the training and validation splits out of the reloaded dataset.
dataset_train, dataset_val = (
    datset_proc_prompt_filter[split] for split in ("train", "validation")
)

In [None]:
# Look at the first row of the final training split.
dataset_train[0]