# GPT 2 model training for XMAS generator

In [None]:
# imports
import numpy as np
import matplotlib.pyplot as plt

import time
import math

import neptune.new as neptune

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Model, pipeline

from sklearn.model_selection import train_test_split
import re


In [None]:
## setup device
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(device)   

cuda:1


## Init model and tokenizer

In [None]:
# Init model from pretrained GPT2 model. We use with LM(language model) head
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# set tokenizer padding token
tokenizer.pad_token = tokenizer.eos_token
# load pretrained model from huggingface
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

## Data setup
The data is prepared using a Dataset class and split up into training and test sets

In [None]:
# setup datasets
class lyricsdataset(Dataset):
    def __init__(self, data_full, tokenizer, transform=None):
        
        self.data_split = data_full
        self.token = tokenizer

        # collect all lyrics into one string
        self.string = ' '.join(self.data_split)
        
        # split string into list of lyrics
        self.data = self.string.split('#CLAES#')
        
        # find max length of lyrics
        self.length = np.zeros((len(self.data),1))
        for i in range(len(self.data)):
            self.length[i] = len(re.findall(r' ', self.data[i]))

        self.max_length = int(np.max(self.length))  
        
        
    def __len__(self):
        # last lyric is empty so we subtract 1
        return len(self.data) - 1
    
    
    def __getitem__(self, idx):            
        
        self.data_input = self.data[idx]
        
        # return tokenize indexed lyrics
        return self.token(self.data_input, padding="max_length", max_length=self.max_length,truncation = True,return_tensors="pt" )

# load data from file:
with open("xmas_lyrics_dlpl.txt", "r", encoding = "utf-8") as f:
    raw_data = f.readlines()


# create dataset with Dataset subclass
dataset = lyricsdataset(raw_data,tokenizer)


# implement dataset split using sklearns train test split function
train_data, test_data = train_test_split(dataset, test_size=0.2)

# get dataloaders for training and testing
train_data_loader = DataLoader(train_data)
test_data_loader = DataLoader(test_data)


## Neptune setup
Used for logging data about our training

In [None]:
run = neptune.init(
    project="XMAS/DLXMAS",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJjMzI3MjcxNy02ZmMzLTRmNDEtOWI5MS1mOGJkNTI2ZTgzMDkifQ==",
)  # Lasse Bjørnskov

params = {
    "learning_rate": 0.001, 
    "optimizer": "AdamW",
    "batch_size": 64
}

run["parameters"] = params


https://app.neptune.ai/XMAS/DLXMAS/e/DLXMAS-47
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


## Training loop
Where the magic happens. The data is set into Dataloaders and the hyperparameters are set. Then the TRAINING begins

In [None]:
# training loop
# hyperparameters
nEpochs = 50
batchSize = params["batch_size"]
learningRate = params["learning_rate"]

# For timing training
start = time.time()

model.to(device)


# optimizer to be used for training
optimizer = torch.optim.AdamW(model.parameters(), lr = learningRate)

# loss for plotting
epoch_loss = 0
epoch_eval_loss = 0
for iEpoch in range(nEpochs):
    model.train()
    for Xbatch in train_data_loader:

        Xbatch = Xbatch.to(device)
        optimizer.zero_grad()
        
        outputs = model(**Xbatch, labels=Xbatch["input_ids"])

        loss = outputs.loss

        epoch_loss += loss.detach().cpu().numpy()

        loss.backward()
        optimizer.step()
    model.eval()
    for Xbatch in test_data_loader:
        Xbatch = Xbatch.to(device)
        optimizer.zero_grad()
        
        outputs = model(**Xbatch, labels=Xbatch["input_ids"])

        loss = outputs.loss

        epoch_eval_loss += loss.detach().cpu().numpy()

    print(f'Epoch: {iEpoch} loss: {np.round(epoch_loss, 2)} Eval loss: {np.round(epoch_eval_loss, 2)}')
    
    
    # log loss to neptune
    run["train/loss"].log(epoch_loss)
    run["Eval/loss"].log(epoch_eval_loss)
    
    # rest epoch loss
    epoch_loss = 0
    epoch_eval_loss = 0
    
    if iEpoch % 5 == 0:
        path = "model" + str(iEpoch) + "epoch.pt"
        model_name = "model" + str(iEpoch) + "epoch"
        torch.save(model, path)
        run[model_name].upload(path)
    
run.stop()


Epoch: 0 loss: 451.48 Eval loss: 74.59
Epoch: 1 loss: 247.4 Eval loss: 66.86
Epoch: 2 loss: 169.36 Eval loss: 68.37
Epoch: 3 loss: 125.85 Eval loss: 68.85
Epoch: 4 loss: 97.4 Eval loss: 70.02
Epoch: 5 loss: 81.16 Eval loss: 73.18
Epoch: 6 loss: 65.95 Eval loss: 78.47
Epoch: 7 loss: 52.27 Eval loss: 75.13
Epoch: 8 loss: 45.43 Eval loss: 80.02
Epoch: 9 loss: 42.25 Eval loss: 83.62
Epoch: 10 loss: 38.51 Eval loss: 87.37
Epoch: 11 loss: 37.26 Eval loss: 85.99
Epoch: 12 loss: 33.02 Eval loss: 87.1
Epoch: 13 loss: 31.29 Eval loss: 87.0
Epoch: 14 loss: 29.55 Eval loss: 86.6
Epoch: 15 loss: 27.92 Eval loss: 86.2
Epoch: 16 loss: 25.89 Eval loss: 98.37
Epoch: 17 loss: 26.55 Eval loss: 92.0
Epoch: 18 loss: 26.95 Eval loss: 88.74
Epoch: 19 loss: 24.84 Eval loss: 94.17
Epoch: 20 loss: 23.3 Eval loss: 91.38
Epoch: 21 loss: 22.98 Eval loss: 95.93
Epoch: 22 loss: 23.87 Eval loss: 92.91
Epoch: 23 loss: 22.43 Eval loss: 100.49
Epoch: 24 loss: 28.82 Eval loss: 104.29
Epoch: 25 loss: 26.42 Eval loss: 91.2

In [None]:
# save the model 
#model.save_pretrained()
model=model.cpu()
torch.save(model, "model_5epoch.pt")

In [29]:
#model=model.cpu()
pipe = pipeline("text-generation", model = model, tokenizer=tokenizer)
# test the model
sent = 2
gen_song = pipe("joyfull", max_length=200, num_return_sequences=sent, temperature=1)
for i in range(sent):
    print("song nr:" + str(i+1) + "\n")
    print(gen_song[i]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


song nr:1

joyfull Mr. Kringle is soon gonna jingle
 The bells that'll tingle all your troubles away
 Everyone's waiting for the man with the bag
 Cause Christmas is coming again
 He's got a sleighful, it's not gonna stay full
 He's got stuff to drop at every stop on the way
 Everyone's waiting for the man with the bag
 Cause Christmas is coming again
 He'll be here with the answers to the prayers
 That you made through the year
 You'll get yours if you've done
 Everything you should, extra special good
 He'll make this December the one you'll remember
 The best and the merriest you ever did have
 Everybody's waiting for the man with the bag
 Christmas is here again
 He's got a sleighful and it's not gonna stay full
 He's got stuff to drop at every stop on the way
 Everybody's waiting for the man with the bag
 Christmas is here again

song nr:2

joyfull night, holy night
 All is calm, All is calm
 All is sleeping, All is bright
 'Round yon virgin Mother and Child
 Holy infant so tender