# OpenCampus NLP Project
## Tweet Generator for famous Twitter personalities
-----------
This notebook fine-tunes a pretrained GPT-2 model on the prepared tweet dataset.

## Imports

In [None]:
import os

import matplotlib.pyplot as plt

from datasets import load_from_disk
from pathlib import Path

In [None]:
# Render all figure text in a monospace face, preferring DejaVu Sans Mono.
plt.rcParams.update(
    {
        "font.monospace": ["DejaVu Sans Mono"],
        "font.family": "monospace",
    }
)

## Model finetuning

### Load the prepared data

In [None]:
# How many directory levels to climb from the working directory before
# resolving the dataset path.
levels = 2

# Relative location of the prepared dataset, kept as a plain string.
feature_data_path = str(Path("data", "feature", "final_dataset"))

In [None]:
# Go `levels` directories up from the current working directory and resolve
# the dataset's relative path against that directory.
parent_path = Path.cwd().parents[levels - 1]
feature_dir = str(parent_path / feature_data_path)

In [None]:
# Read the prepared dataset back from disk; later cells use its "train" and
# "validation" splits and its "text_prompt" column.
dataset_proc_prompt_filter = load_from_disk(feature_dir)

## Load model

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [None]:
# Name of the pretrained checkpoint; both the tokenizer and the model are loaded from it.
checkpoint = "gpt2"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Show the end-of-sequence token of the loaded tokenizer.
tokenizer.eos_token

In [None]:
# Show the maximum sequence length configured on the tokenizer.
tokenizer.model_max_length

In [None]:
# Display the loaded dataset object.
dataset_proc_prompt_filter

In [None]:
# Report the longest prompt, measured in characters, for each split.
for split_name in ("train", "validation"):
    prompts = dataset_proc_prompt_filter[split_name]["text_prompt"]
    print(max(len(prompt) for prompt in prompts))

In [None]:
def tokenize_function(example):
    """Tokenize the ``text_prompt`` field of a dataset example or batch.

    Args:
        example: A mapping with a ``"text_prompt"`` entry holding a single
            string or, when called through a batched ``map``, a list of
            strings.

    Returns:
        The tokenizer output for the given prompt(s). Prompts longer than the
        tokenizer's maximum length are truncated so they always fit the model.
    """
    return tokenizer(example["text_prompt"], truncation=True)


# batched=True hands the tokenizer many prompts per call instead of one at a time.
tokenized_datasets = dataset_proc_prompt_filter.map(tokenize_function, batched=True)

# Drop the raw text and metadata columns listed below.
tokenized_datasets = tokenized_datasets.remove_columns(
    ["text", "label", "idx", "ref_tweet", "reply_tweet", "text_prompt"]
)

In [None]:
# Collator that assembles tokenized examples into training batches;
# mlm=False turns off masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Load the pretrained causal language model for the chosen checkpoint.
model = AutoModelForCausalLM.from_pretrained(checkpoint)

Even the longest prompt in the train and validation sets has fewer characters than GPT-2's default maximum sequence length of 1024. That limit applies to tokens, not characters, and a prompt normally produces fewer tokens than it has characters, so no prompt should exceed the limit.

In [None]:
# Learning rate for the fine-tuning run, kept as a named constant for easy tuning.
LEARNING_RATE = 1.372e-4

training_args = TrainingArguments(
    # --- output location (relative to the current working directory) ---
    output_dir="../../model/",
    overwrite_output_dir=True,
    # NOTE(review): save_steps=0 presumably turns off periodic checkpoints —
    # confirm against the installed transformers version.
    save_steps=0,
    # --- training schedule ---
    do_train=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    learning_rate=LEARNING_RATE,
    seed=20,
    # --- reporting ---
    logging_steps=5,
    prediction_loss_only=True,
)

In [None]:
# Use the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because the GPT-2 tokenizer defines no pad
# token of its own — confirm. With pad == eos the language-modeling collator
# may also exclude EOS positions from the loss; verify that this is intended.
tokenizer.pad_token = tokenizer.eos_token

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Start the fine-tuning run with the trainer built in the previous cell.
trainer.train()