<a href="https://colab.research.google.com/github/pjheslin/colab-notebooks/blob/main/gpt2_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning GPT-2 to write Shakespeare

Code adapted from [this blog](http://education.abcom.com/using-gpt-2-to-write-like-shakespeare/) and updated to work with more recent versions of the Hugging Face libraries.

In [None]:
!pip install git+https://github.com/huggingface/transformers

In [None]:
!pip install --upgrade pyarrow

In [None]:
!pip install git+https://github.com/huggingface/datasets

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [None]:
!mkdir output

In [None]:
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/pytorch/language-modeling/run_clm.py

In [None]:
import datasets

In [None]:
!python run_clm.py \
    --model_name_or_path gpt2 \
    --train_file '/content/input.txt' \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --do_train \
    --do_eval \
    --output_dir /output \
    --overwrite_output_dir

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Directory passed as --output_dir to the fine-tuning run.
FINETUNED_DIR = '/output'

# Load the tokenizer and the language-model head from the fine-tuned checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(FINETUNED_DIR)
model = GPT2LMHeadModel.from_pretrained(FINETUNED_DIR)

In [None]:
# Prompt that every generation cell below continues from.
prompt = 'To be or not to be, that is the'

# Encode the prompt as PyTorch tensors ('pt') so it can be fed to model.generate().
ids = tokenizer.encode(prompt, return_tensors='pt')

In [None]:
# No sampling or beam-search options are passed, so generate() falls back to
# its default decoding strategy (the variable name suggests greedy search).
greedy_kwargs = {'max_length': 100}
greedy_output = model.generate(ids, **greedy_kwargs)

In [None]:
# Decode the first (only) generated sequence back to text, dropping special tokens.
greedy_text = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
print(greedy_text)

In [None]:
# Beam search with 4 beams; early_stopping is enabled for the beam search.
beam_search_kwargs = dict(
    max_length=100,
    num_beams=4,
    early_stopping=True,
)
beam_output = model.generate(ids, **beam_search_kwargs)

In [None]:
# Decode the first returned beam-search sequence, dropping special tokens.
beam_text = tokenizer.decode(beam_output[0], skip_special_tokens=True)
print(beam_text)

In [None]:
import tensorflow as tf  # other cells in this notebook may call tf.random.set_seed
import torch

# The model is a PyTorch GPT2LMHeadModel and the prompt is encoded with
# return_tensors='pt', so sampling in generate() draws from torch's random
# number generator. Seeding TensorFlow alone does not make the samples
# reproducible; torch has to be seeded as well.
tf.random.set_seed(0)
torch.manual_seed(0)

In [None]:
# Sample the continuation (do_sample=True) instead of decoding deterministically.
sampling_kwargs = {'do_sample': True, 'max_length': 100}
sample_output = model.generate(ids, **sampling_kwargs)

In [None]:
# Decode the sampled sequence back to text, dropping special tokens.
sample_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(sample_text)

In [None]:
import torch

# The model is a PyTorch model, so sampling in generate() uses torch's random
# number generator; seed torch (a TensorFlow seed has no effect on it).
torch.manual_seed(0)

# Top-k sampling: set top_k to 50
top_k_output = model.generate(
    ids,
    do_sample=True,
    max_length=100,
    top_k=50
)

In [None]:
# Decode the top-k sampled sequence back to text, dropping special tokens.
top_k_text = tokenizer.decode(top_k_output[0], skip_special_tokens=True)
print(top_k_text)

In [None]:
import torch

# The model is a PyTorch model, so sampling in generate() uses torch's random
# number generator; seed torch (a TensorFlow seed has no effect on it).
torch.manual_seed(1)

# Combine top-k and top-p sampling: top_k = 40 and top_p = 0.95.
# NOTE(review): the earlier comment said top_k = 50 while the code passed 40;
# the code's value is kept here -- confirm which was intended.
final_output = model.generate(
    ids,
    do_sample=True,
    max_length=100,
    top_k=40,
    top_p=0.95,
)

In [None]:
# Decode the top-k/top-p sampled sequence back to text, dropping special tokens.
final_text = tokenizer.decode(final_output[0], skip_special_tokens=True)
print(final_text)