# GPT-2 Language Model training from scratch
An attempt at training a GPT-2 model from scratch (randomly initialised, with a custom tokenizer) to generate livestream chat messages, using data compiled from 10 livestreams of 6-8 hours each.

In [None]:
# INSTALL DEPENDENCIES
!pip uninstall -y tensorflow
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'
# Expect: transformers --3.3.0 / tokenizers --0.8.1rc2

We need to train a byte-level BPE tokenizer for GPT-2.

In [None]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("compiled.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=100_000, min_frequency=2, special_tokens=[
    "<BOS>",
    "<EOS>",
    "<PAD>",
])

Save the trained tokenizer files (`vocab.json` and `merges.txt`) to the model directory

In [None]:
!mkdir streamchat_model
tokenizer.save_model("streamchat_model")

Test the tokenizer out

In [None]:
# Smoke-test the saved tokenizer: reload it from disk, attach a post-processor
# for the <BOS>/<EOS> markers, and tokenize a sample sentence.
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocab/merges files written to streamchat_model.
tokenizer = ByteLevelBPETokenizer(
    "./streamchat_model/vocab.json",
    "./streamchat_model/merges.txt",
)

# NOTE(review): the argument order is positional and significant. Presumably
# the first pair is the closing/separator token and the second the opening
# token, giving "<BOS> ... <EOS>" -- confirm against the BertProcessing
# signature of the installed tokenizers version.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("<EOS>", tokenizer.token_to_id("<EOS>")),
    ("<BOS>", tokenizer.token_to_id("<BOS>")),
)
# Cap encodings at 512 tokens.
tokenizer.enable_truncation(max_length=512)

# Display the token strings produced for a sample chat message.
tokenizer.encode("Those are some Pog flowers.").tokens

# Training the language model

In [None]:
# Check that we have a GPU
# (runs the nvidia-smi command-line tool in a shell and shows its output)
!nvidia-smi

In [None]:
# Check that PyTorch sees it
import torch
# Displayed as the cell output; expected to be True when a CUDA device is usable.
torch.cuda.is_available()

Define the config for the model

In [None]:
import json

from transformers import GPT2Config

# Look up the ids of the custom special tokens in the vocabulary saved by the
# tokenizer-training step. Reading the file (rather than using the `tokenizer`
# variable) keeps this cell independent of which tokenizer object is
# currently bound to that name.
with open("./streamchat_model/vocab.json", encoding="utf-8") as vocab_file:
    special_token_ids = {
        token: token_id
        for token, token_id in json.load(vocab_file).items()
        if token in ("<BOS>", "<EOS>", "<PAD>")
    }

# Without explicit ids the config keeps the library's default BOS/EOS ids,
# which do not correspond to this notebook's <BOS>/<EOS>/<PAD> tokens; text
# generation would then stop/pad on an unrelated token.
config = GPT2Config(
    vocab_size=100_000,
    bos_token_id=special_token_ids["<BOS>"],
    eos_token_id=special_token_ids["<EOS>"],
    pad_token_id=special_token_ids["<PAD>"],
)

Re-create the tokenizer but with transformers

In [None]:
from transformers import GPT2TokenizerFast

# Load the saved vocab/merges through the transformers wrapper and declare
# which strings act as the BOS / EOS / PAD special tokens.
tokenizer = GPT2TokenizerFast.from_pretrained(
    "./streamchat_model",
    bos_token="<BOS>",
    eos_token="<EOS>",
    pad_token="<PAD>",
)

Initialise the model with the config

In [None]:
from transformers import GPT2LMHeadModel

# Build the model from the config alone (no pretrained checkpoint is loaded here).
model = GPT2LMHeadModel(config)
# Cell output: the model's parameter count.
model.num_parameters()

# Build the dataset

Build the datasets by applying the custom tokenizer to the text files.
Because each data source is a single text file, we can use `LineByLineTextDataset` out of the box.

In [None]:
%%time
from transformers import LineByLineTextDataset

#LineByLine requires a pad_token in the tokenizer
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./train.txt", #PUT THE DATASET IN HERE
    block_size=180,
)
eval_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./test.txt", #PUT THE DATASET IN HERE
    block_size=180,
)

Need to define a data_collator so that we can batch different samples of the input data into a PyTorch compatible object

In [None]:
from transformers import DataCollatorForLanguageModeling

# mlm=False turns off masked-language-modelling in the collator.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)
# MLM is masked language modeling

Initialise the trainer

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and bookkeeping options for the training run.
training_args = TrainingArguments(
    # Checkpoints go into the same directory that already holds the tokenizer files.
    output_dir="./streamchat_model",
    overwrite_output_dir=True,
    # Which phases are flagged on.
    # NOTE(review): none of the cells shown here calls trainer.evaluate();
    # confirm whether do_eval alone triggers evaluation in the installed version.
    do_train=True,
    do_eval=True,
    prediction_loss_only=True,
    # Schedule.
    num_train_epochs=1,
    per_device_train_batch_size=20,
    # Checkpointing.
    # NOTE(review): presumably save_total_limit=0 means "no limit" -- confirm.
    save_steps=10_000,
    save_total_limit=0,
)

# Wire the model, both datasets and the collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

Begin training

In [None]:
%%time
# Run training with the Trainer configured above; %%time reports how long the
# cell took, and the value returned by train() is shown as the cell output.
trainer.train()

Save the final model (+ tokenizer + config)

In [None]:
# Save the trained model via the Trainer.
trainer.save_model("./streamchat_model")
# The Trainer was constructed without a tokenizer, so save the tokenizer
# explicitly. This also stores its special-token settings (<BOS>/<EOS>/<PAD>)
# next to the model, so anything that later loads "./streamchat_model" picks
# them up. The trailing semicolon suppresses the returned file list so the
# cell stays output-free, as before.
tokenizer.save_pretrained("./streamchat_model");

Apparently it's over...

In [None]:
!zip -r /content/streamchat_model.zip /content/streamchat_model

# Text generation

Try using the model to run some text generation

In [None]:
# Sample 40 sequences from the saved model, starting from the "<BOS>" prompt
# and using "<EOS>" as the stop token (length 200, k=30).
# NOTE(review): none of the cells shown here downloads run_generation.py, so it
# must already be present in the working directory -- confirm where it comes from.
!python run_generation.py --model_type="gpt2" --model_name_or_path="./streamchat_model" --prompt="<BOS>" --stop_token="<EOS>" --length=200 --k=30 --num_return_sequences=40