## Fairy Tale Generator with GPT-2

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader, Dataset

import re, random, math, time
from tqdm.notebook import tqdm

# ignore warnings
# NOTE: this silences every warning for the rest of the notebook, including
# deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# define the computing device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# set random seed
# Seed every RNG imported above (Python, NumPy, PyTorch) so results do not
# depend on which library ends up drawing the random numbers.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels and switch off the autotuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

cpu


### 1. Loading data

In [2]:
# load the fairy tales from a CSV file in the working directory
df = pd.read_csv('grimms_fairytales.csv')

# preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Text
0,0,THE GOLDEN BIRD,"A certain king had a beautiful garden, and in ..."
1,1,HANS IN LUCK,Some men are born to good luck: all they do or...
2,2,JORINDA AND JORINDEL,"There was once an old castle, that stood in th..."
3,3,THE TRAVELLING MUSICIANS,An honest farmer had once an ass that had been...
4,4,OLD SULTAN,"A shepherd had a faithful dog, called Sultan, ..."


In [3]:
# show column names, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  63 non-null     int64 
 1   Title       63 non-null     object
 2   Text        63 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.6+ KB


In [4]:
# drop the Unnamed: 0 column
# errors='ignore' keeps the cell idempotent: running it again after the column
# has already been removed is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

df.head()

Unnamed: 0,Title,Text
0,THE GOLDEN BIRD,"A certain king had a beautiful garden, and in ..."
1,HANS IN LUCK,Some men are born to good luck: all they do or...
2,JORINDA AND JORINDEL,"There was once an old castle, that stood in th..."
3,THE TRAVELLING MUSICIANS,An honest farmer had once an ass that had been...
4,OLD SULTAN,"A shepherd had a faithful dog, called Sultan, ..."


In [5]:
# full text of the first tale (row label 0), still containing '\n' line breaks
df.loc[0, 'Text']

'A certain king had a beautiful garden, and in the garden stood a tree\nwhich bore golden apples. These apples were always counted, and about\nthe time when they began to grow ripe it was found that every night one\nof them was gone. The king became very angry at this, and ordered the\ngardener to keep watch all night under the tree. The gardener set his\neldest son to watch; but about twelve o’clock he fell asleep, and in\nthe morning another of the apples was missing. Then the second son was\nordered to watch; and at midnight he too fell asleep, and in the morning\nanother apple was gone. Then the third son offered to keep watch; but\nthe gardener at first would not let him, for fear some harm should come\nto him: however, at last he consented, and the young man laid himself\nunder the tree to watch. As the clock struck twelve he heard a rustling\nnoise in the air, and a bird came flying that was of pure gold; and as\nit was snapping at one of the apples with its beak, the gardener’s

In [6]:
# replace new line characters with space
# (literal substring replacement on every entry of the Text column)
df['Text'] = df['Text'].str.replace('\n', ' ', regex=False)

In [7]:
# same tale as before, now with the line breaks replaced by spaces
df.loc[0, 'Text']

'A certain king had a beautiful garden, and in the garden stood a tree which bore golden apples. These apples were always counted, and about the time when they began to grow ripe it was found that every night one of them was gone. The king became very angry at this, and ordered the gardener to keep watch all night under the tree. The gardener set his eldest son to watch; but about twelve o’clock he fell asleep, and in the morning another of the apples was missing. Then the second son was ordered to watch; and at midnight he too fell asleep, and in the morning another apple was gone. Then the third son offered to keep watch; but the gardener at first would not let him, for fear some harm should come to him: however, at last he consented, and the young man laid himself under the tree to watch. As the clock struck twelve he heard a rustling noise in the air, and a bird came flying that was of pure gold; and as it was snapping at one of the apples with its beak, the gardener’s son jumped u

In [8]:
# split the data into train and validation sets (80% / 20%);
# no separate test set is created here
train_df, val_df = train_test_split(df, test_size=0.2, random_state=SEED)

# (rows, columns) of each split
train_df.shape, val_df.shape

((50, 2), (13, 2))

In [9]:
# create raw datasets
from datasets import Dataset, DatasetDict

# wrap each pandas split in a Hugging Face Dataset
ds_train, ds_valid = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df)
)

# bundle both splits under the keys used in the rest of the notebook
raw_datasets = DatasetDict({"train": ds_train, "valid": ds_valid})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Title', 'Text', '__index_level_0__'],
        num_rows: 50
    })
    valid: Dataset({
        features: ['Title', 'Text', '__index_level_0__'],
        num_rows: 13
    })
})

In [10]:
# remove '__index_level_0__' column
for split_name in ("train", "valid"):
    split = raw_datasets[split_name]
    # Only drop the column when it is still there, so that re-running this
    # cell does not fail once the column has been removed.
    if "__index_level_0__" in split.column_names:
        raw_datasets[split_name] = split.remove_columns("__index_level_0__")

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Title', 'Text'],
        num_rows: 50
    })
    valid: Dataset({
        features: ['Title', 'Text'],
        num_rows: 13
    })
})

In [11]:
# print the first 200 characters of every field of the first training example
first_example = raw_datasets["train"][0]
for key in first_example:
    print(f"{key}: {first_example[key][:200]}")

Title: FREDERICK AND CATHERINE
Text: There was once a man called Frederick: he had a wife whose name was Catherine, and they had not long been married. One day Frederick said. ‘Kate! I am going to work in the fields; when I come back I s


### 2. Preprocessing

In [12]:
# create the tokenizer
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)

# Try the chunking on the first two tales: every text is cut into windows of
# at most `context_length` tokens, and the overflowing windows come back as
# additional rows together with their lengths and source-sample indices.
chunking_options = dict(
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)
sample_texts = raw_datasets["train"][:2]["Text"]
outputs = tokenizer(sample_texts, **chunking_options)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 48
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 94, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [13]:
# tokenize the data
def tokenize(element):
    """Tokenize a batch of texts into fixed-length training chunks.

    Args:
        element: mapping with a "Text" entry holding the texts to tokenize.

    Returns:
        {"input_ids": chunks}, where `chunks` contains only the token-id
        windows whose length equals `context_length`; shorter windows (the
        tail of each text) are discarded.
    """
    encoded = tokenizer(
        element["Text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}

# The original columns are dropped because `tokenize` returns a different
# number of rows (chunks) than it receives (texts).
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=original_columns,
)

tokenized_datasets

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 805
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 206
    })
})

### 3. Preparing data loaders

In [14]:
from torch.utils.data.dataloader import DataLoader

batch_size = 8

# switch the output format of both splits to "torch"
tokenized_datasets.set_format("torch")

# only the training split is shuffled
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size
)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=batch_size)

# number of batches per split
(len(train_dataloader), len(eval_dataloader))

(101, 26)

In [None]:
# sanity check: print the shape of the first training batch, then stop iterating
for i in train_dataloader:
    print(i['input_ids'].shape)
    break

### 4. Modeling

In [15]:
# define the model
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock "gpt2" configuration and override a few fields.
# NOTE(review): recent transformers releases size the position embeddings via
# `n_positions`; confirm that `n_ctx` is still honoured by the installed version.
config_overrides = {
    "vocab_size": len(tokenizer),
    "n_ctx": context_length,
    "bos_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
config = AutoConfig.from_pretrained("gpt2", **config_overrides)

# the model is instantiated from the configuration object (not via from_pretrained)
model = GPT2LMHeadModel(config)
model_size = sum(param.numel() for param in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [16]:
# prepare key token ids
# Only keywords that the tokenizer encodes as exactly one token are kept;
# the others are reported and skipped.
keytoken_ids = []
keywords = ["Once upon a time", "Long long ago", "In a faraway land"]
for keyword in keywords:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) != 1:
        print(f"Keyword has not single token: {keyword}")
        continue
    keytoken_ids.append(ids[0])

Keyword has not single token: Once upon a time
Keyword has not single token: Long long ago
Keyword has not single token: In a faraway land


In [30]:
# define the loss function and the optimizer
from torch.nn import CrossEntropyLoss

def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    """Next-token cross-entropy loss, up-weighted for samples containing key tokens.

    Args:
        inputs: token ids of shape (batch, seq_len); they also serve as labels.
        logits: model outputs of shape (batch, seq_len, vocab_size).
        keytoken_ids: iterable of token ids whose presence increases a
            sample's weight. May be empty, in which case every sample has the
            same weight.
        alpha: global multiplier applied to every sample weight.

    Returns:
        Scalar tensor: the mean over the batch of
        ``alpha * (1 + number of key tokens in the sample) * per-sample loss``.
        With an empty ``keytoken_ids`` and ``alpha=1.0`` this is the plain
        mean loss.
    """
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss (`reduce=False` is deprecated; the supported
    # spelling is reduction="none")
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
    # Weight of a sample = alpha * (1 + occurrences of any key token in it).
    # Accumulating in a loop (rather than stacking) also works when
    # keytoken_ids is empty.
    weights = torch.ones_like(loss_per_sample)
    for token_id in keytoken_ids:
        weights = weights + (inputs == token_id).sum(dim=-1).to(weights.dtype)
    weights = alpha * weights
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

weight_decay = 0.1

def get_grouped_params(
    model,
    no_decay=("bias", "LayerNorm.weight", "ln_1.weight", "ln_2.weight", "ln_f.weight"),
):
    """Split the model's parameters into weight-decayed and non-decayed groups.

    Args:
        model: module exposing ``named_parameters()``.
        no_decay: substrings; a parameter whose name contains any of them is
            placed in the group without weight decay. The default covers
            biases and layer-norm weights, including the ``ln_1`` / ``ln_2`` /
            ``ln_f`` names used by the GPT-2 implementation in transformers,
            which "LayerNorm.weight" alone does not match.

    Returns:
        Two optimizer parameter groups: the first uses the notebook-level
        ``weight_decay``, the second uses 0.0.
    """
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        if any(pattern in name for pattern in no_decay):
            params_without_wd.append(param)
        else:
            params_with_wd.append(param)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

# build a fresh model from the config and move it to the selected device
model = GPT2LMHeadModel(config)
model = model.to(device)

from torch.optim import AdamW

# AdamW over the two parameter groups (with / without weight decay)
param_groups = get_grouped_params(model)
optimizer = AdamW(param_groups, lr=5e-4)

In [31]:
# use Accelerator to speed up training
from accelerate import Accelerator

# fp16 mixed precision needs a GPU; fall back to full precision on CPU so the
# cell also works on machines without CUDA.
mixed_precision = "fp16" if torch.cuda.is_available() else "no"
accelerator = Accelerator(mixed_precision=mixed_precision)

# let accelerate wrap the training objects (device placement etc.)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [32]:
# use get_scheduler library to schedule the learning rate
from transformers import get_scheduler

num_train_epochs = 1
# NOTE(review): this counts batches, whereas the training loop further down
# steps the scheduler only once every `gradient_accumulation_steps` batches —
# confirm that the schedule length and warm-up are what is intended.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_update_steps_per_epoch * num_train_epochs

# linear schedule with a 10-step warm-up
lr_scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=10,
    num_training_steps=num_training_steps,
)

In [20]:
# log in to huggingface hub
from huggingface_hub import notebook_login

# interactive login; requires a Hugging Face access token
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to C:\Users\MARC\.cache\huggingface\token
Login successful


In [21]:
# create a repository to save the model
from huggingface_hub import Repository, get_full_repo_name

# name of the model repository on the Hub (without the user namespace)
model_name = "fairy-tale-generator-accelerate"
# resolve to the full "<namespace>/<model_name>" form for the logged-in account
repo_name = get_full_repo_name(model_name)
repo_name

'myomyintmaung/fairy-tale-generator-accelerate'

In [22]:
# clone the repository in a local folder
import os
# NOTE(review): sets the tokenizers parallelism flag to "true" — confirm this is
# the intended value for use alongside DataLoader workers / forking.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# local folder that mirrors the Hub repository; checkpoints are saved here
output_dir = "fairy-tale-generator-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

d:\GitHub\nlp-assignments\09-HuggingFace-Language-Model\fairy-tale-generator-accelerate is already a clone of https://huggingface.co/myomyintmaung/fairy-tale-generator-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


### 5. Training

In [23]:
# function to evaluate the model during training
def evaluate():
    """Compute the mean loss and perplexity of `model` over `eval_dataloader`.

    Uses the notebook-level `model`, `eval_dataloader` and `accelerator`.
    The model is switched to eval mode and left there; callers that continue
    training must call `model.train()` afterwards.

    Returns:
        (loss, perplexity) as Python floats, where loss is the mean of the
        per-batch losses and perplexity is exp(loss) (`inf` if it overflows).
    """
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
        # give the loss one dimension so the per-batch losses can be gathered
        # and concatenated
        losses.append(accelerator.gather(outputs.loss.reshape(1)))
    loss = torch.mean(torch.cat(losses))
    # torch.exp saturates to inf on overflow instead of raising, so no
    # exception handling is needed here
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()

In [24]:
# test the evaluate function: returns (loss, perplexity) on the validation
# loader for the model as it exists at this point in the notebook
evaluate()

(11.021819114685059, 61194.8984375)

In [33]:
# train the model

# number of batches whose gradients are accumulated before one optimizer update
gradient_accumulation_steps = 10
# evaluate / checkpoint every `eval_steps` optimizer updates
eval_steps = 2

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    # `step` counts batches within the current epoch (1-based), so the
    # progress-bar total is the number of batches in one epoch.
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=len(train_dataloader)
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 10 == 0:
            # `loss` has not been divided by gradient_accumulation_steps yet,
            # so it is already the per-batch loss and is logged unscaled.
            accelerator.print(
                {
                    "steps": completed_steps,
                    "loss/train": loss.item(),
                }
            )
        # scale so that the accumulated gradient is the average over the window
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            # evaluate() leaves the model in eval mode
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )

  0%|          | 0/101 [00:00<?, ?it/s]

RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 204244448 bytes.

### 6. Inference

#### Greedy search

#### Beam search