# GPT2 - Language Models are Unsupervised Multitask Learners

# Fine-tuning GPT-2

In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

# import logging
# logging.getLogger().setLevel(logging.CRITICAL)

# import warnings
# warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# The model and the input tensors created in later cells are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Model and tokenizer

In [2]:
# Pretrained GPT-2 medium: the tokenizer plus the language-modelling-head
# model, with the model placed on `device` straight after loading.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

In [18]:
# Tokenize a sample prompt into PyTorch tensors and move them to `device`.
input_tokens = tokenizer("Hello, my dog is cute", return_tensors="pt").to(device)
# output_tokens = model(**input_tokens)
# NOTE(review): `work_jokes_tens` is only assigned inside the training cell
# further down, so on a fresh top-to-bottom run this line raises NameError,
# and `input_tokens` above is never actually fed to the model.
# Passing the same ids as `labels` makes the model return a loss next to the logits.
output_tokens = model(work_jokes_tens, labels=work_jokes_tens)
output_tokens

CausalLMOutputWithCrossAttentions(loss=tensor(4.8562, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[[-131.8282, -131.5743, -133.0684,  ..., -140.5847, -139.5674,
          -133.0660],
         [ -44.0537,  -44.0142,  -46.1587,  ...,  -56.9722,  -55.2199,
           -46.5648],
         [ -48.4926,  -48.5329,  -50.1996,  ...,  -56.8987,  -56.0599,
           -45.7592],
         ...,
         [ -44.8523,  -42.6457,  -52.1564,  ...,  -60.0582,  -56.5454,
           -49.8920],
         [ -62.4253,  -62.5598,  -64.8840,  ...,  -73.1778,  -70.1621,
           -60.0100],
         [ -81.3447,  -74.9731,  -77.0077,  ...,  -94.9401,  -93.7663,
           -82.2147]]], device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.0069,  0.1264,  0.0405,  ..., -0.5266, -0.1800,  0.0680],
          [-0.3817,  0.3969, -0.2691,  ...,  0.8831, -0.4337, -0.2111],
          [-0.7537,  0.5908,  0.4224,  ...,  0.2131,  0.1256, -0.7317],
          ...,
          [ 0.1877,  

In [19]:
output_tokens.keys()

odict_keys(['loss', 'logits', 'past_key_values'])

In [20]:
output_tokens.logits

tensor([[[-131.8282, -131.5743, -133.0684,  ..., -140.5847, -139.5674,
          -133.0660],
         [ -44.0537,  -44.0142,  -46.1587,  ...,  -56.9722,  -55.2199,
           -46.5648],
         [ -48.4926,  -48.5329,  -50.1996,  ...,  -56.8987,  -56.0599,
           -45.7592],
         ...,
         [ -44.8523,  -42.6457,  -52.1564,  ...,  -60.0582,  -56.5454,
           -49.8920],
         [ -62.4253,  -62.5598,  -64.8840,  ...,  -73.1778,  -70.1621,
           -60.0100],
         [ -81.3447,  -74.9731,  -77.0077,  ...,  -94.9401,  -93.7663,
           -82.2147]]], device='cuda:0', grad_fn=<UnsafeViewBackward0>)

In [21]:
output_tokens.logits.shape

torch.Size([1, 24, 50257])

In [22]:
# Greedy read-out: pick the highest-scoring token id at every position of the
# first sequence in the batch and turn those ids back into text.
tokenizer.decode(output_tokens.logits.argmax(dim=-1)[0])

'ING_\n-, on a mouse.. mouse the is to "I\'m I film is written than andThe'

### Dataset

In [7]:
from torch.utils.data import Dataset
import os
import json
import csv

class JokesDataset(Dataset):
    """Short jokes read from ``shortjokes.csv``, each wrapped as a training string.

    Every item is the string ``"JOKE:<joke text><|endoftext|>"`` where the joke
    text is taken from the second column of the CSV file. The first row of the
    file is treated as a header and skipped.

    Args:
        jokes_dataset_path: Directory that contains ``shortjokes.csv``.

    Raises:
        OSError: If the CSV file cannot be opened.
    """

    def __init__(self, jokes_dataset_path = 'jokes_data/'):
        super().__init__()

        short_jokes_path = os.path.join(jokes_dataset_path, 'shortjokes.csv')

        self.joke_list = []
        self.end_of_text_token = "<|endoftext|>"

        # newline='' is what the csv module requires so that quoted fields with
        # embedded line breaks are parsed correctly; the explicit encoding keeps
        # loading independent of the platform's default locale.
        with open(short_jokes_path, newline='', encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')

            # Skip the header row; the default keeps an empty file from raising.
            next(csv_reader, None)

            # Blank lines come back as [] and malformed rows may lack the joke
            # column; skip them instead of failing with IndexError.
            self.joke_list.extend(
                f"JOKE:{row[1]}{self.end_of_text_token}"
                for row in csv_reader
                if len(row) > 1
            )

    def __len__(self):
        """Return the number of jokes that were loaded."""
        return len(self.joke_list)

    def __getitem__(self, item):
        """Return the formatted joke string stored at position ``item``."""
        return self.joke_list[item]

In [8]:
dataset = JokesDataset()

In [9]:
dataset[0]

'JOKE:[me narrating a documentary about narrators] "I can\'t hear what they\'re saying cuz I\'m talking"<|endoftext|>'

### Dataloader

In [10]:
from torch.utils.data import DataLoader

# One joke string per batch: the training loop below tokenizes only joke[0]
# and does its own gradient accumulation. shuffle=True randomizes the order.
joke_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [11]:
next(iter(joke_loader))

['JOKE:Its disgusting and derogatory to call a gay man a fruit How could something so pure and sweet be compared to a homosexual!!!<|endoftext|>']

### Training

In [12]:
BATCH_SIZE = 16  # number of sequences whose gradients are accumulated before each optimizer step
EPOCHS = 5  # passes over the joke loader
LEARNING_RATE = 3e-5  # learning rate handed to AdamW
WARMUP_STEPS = 5000  # only referenced by the commented-out scheduler in the training cell
MAX_SEQ_LEN = 400  # tokenized jokes longer than this many tokens are skipped during training
# NOTE(review): AdamW in transformers is believed to be deprecated in favour of
# torch.optim.AdamW, whose default weight decay differs — confirm against the
# installed version before switching.
from transformers import AdamW#, WarmupLinearSchedule

In [17]:
# Fine-tune the model on the jokes, one joke per forward/backward pass, with
# gradients accumulated over BATCH_SIZE jokes before each optimizer step.
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total = -1)
proc_seq_count = 0  # sequences processed since the last optimizer step
sum_loss = 0.0      # running sum of per-sequence losses, printed and reset every 100 optimizer steps
batch_count = 0     # optimizer steps taken since the last loss print-out

tmp_jokes_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):
    
    print(f"EPOCH {epoch} started" + '=' * 30)
    
    for idx,joke in enumerate(joke_loader):
        print(f"\njoke {idx}: {joke}")
        
        #################### "Fit as many joke sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        # The loader yields a one-element list of strings; encode that joke and
        # add a leading batch dimension so the tensor is (1, number of tokens).
        joke_tens = torch.tensor(tokenizer.encode(joke[0])).unsqueeze(0).to(device)
        print(f"joke_tens shape {joke_tens.shape}")
        
        #Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if joke_tens.size()[1] > MAX_SEQ_LEN:
            continue
        
        # NOTE(review): the packing logic below is disabled, so despite the
        # banner above every joke is currently processed on its own.
        #The first joke sequence in the sequence
        # if not torch.is_tensor(tmp_jokes_tens):
        #     tmp_jokes_tens = joke_tens
        #     print(f"tmp_jokes_tens shape {tmp_jokes_tens.shape}")
        #     continue
        # else:
        #     #The next joke does not fit in so we process the sequence and leave the last joke 
        #     #as the start for next sequence 
        #     if tmp_jokes_tens.size()[1] + joke_tens.size()[1] > MAX_SEQ_LEN:
        #         work_jokes_tens = tmp_jokes_tens
        #         tmp_jokes_tens = joke_tens
        #     else:
        #         #Add the joke to sequence, continue and try to add more
        #         tmp_jokes_tens = torch.cat([tmp_jokes_tens, joke_tens[:,1:]], dim=1)
        #         print(f"tmp_jokes_tens shape {tmp_jokes_tens.shape}")
        #         continue
        tmp_jokes_tens = joke_tens
        work_jokes_tens = tmp_jokes_tens
        print(f"tmp_jokes_tens shape {tmp_jokes_tens.shape}")
        print(f"work_jokes_tens shape {work_jokes_tens.shape}")

        print(f"joke_tens shape {joke_tens.shape}")
        print(f"joke tensor decoded {tokenizer.decode(joke_tens[0])}")
        print(f"joke: {joke}")
        ################## Sequence ready, process it through the model ##################
            
        # Using the input ids as labels makes the model return the language-modelling loss.
        outputs = model(work_jokes_tens, labels=work_jokes_tens)
        print(f"outputs logits shape {outputs.logits.shape}")
        print(f"outputs logits decoded {tokenizer.decode(torch.argmax(outputs.logits, axis=2)[0])}")
        loss, logits = outputs[:2]                        
        # Gradients add up across calls until optimizer.zero_grad() below.
        # NOTE(review): the loss is not divided by BATCH_SIZE, so the applied
        # gradient is the sum over the accumulated sequences, not their mean.
        loss.backward()
        sum_loss = sum_loss + loss.detach().data
                       
        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0    
            batch_count += 1
            optimizer.step()
            # scheduler.step() 
            # NOTE(review): both calls clear the same parameter gradients
            # (the optimizer was built from model.parameters()); one would do.
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0
        
        # NOTE(review): looks like a debugging leftover — stops after the first joke.
        break
    
    # NOTE(review): this leaves the epoch loop before the save below, so no
    # checkpoint is written while these breaks are in place, yet the inference
    # cell expects gpt2_medium_joker_<epoch>.pt files in `models_folder`.
    break
    
    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_joker_{epoch}.pt"))


joke 0: ['JOKE:Two mice chewing on a film roll One of them goes, "I think the book was better"<|endoftext|>']
joke_tens shape torch.Size([1, 24])
tmp_jokes_tens shape torch.Size([1, 24])
work_jokes_tens shape torch.Size([1, 24])
joke_tens shape torch.Size([1, 24])
joke tensor decoded JOKE:Two mice chewing on a film roll One of them goes, "I think the book was better"<|endoftext|>
joke: ['JOKE:Two mice chewing on a film roll One of them goes, "I think the book was better"<|endoftext|>']
outputs logits shape torch.Size([1, 24, 50257])
outputs logits decoded -: I-, on a piece of of mouse them is to "I'm I film is a than andThe




### Inference

In [3]:
def choose_from_top(probs, n=5):
    """Sample an index from the ``n`` most probable entries of ``probs``.

    The ``n`` largest values are renormalised so they sum to one, and one of
    their indices is drawn at random (through ``np.random``) with those weights.

    Args:
        probs: 1-D array-like of non-negative scores, one per token id.
        n: How many top candidates to sample from. Values larger than the
            number of entries are clamped, so a short ``probs`` no longer
            makes ``np.argpartition`` fail.

    Returns:
        The chosen index into ``probs`` as a plain ``int``.

    Raises:
        ValueError: If ``probs`` is empty, ``n`` is smaller than 1, or the
            selected entries sum to zero.
    """
    # float64 keeps the renormalised weights within np.random.choice's tolerance.
    probs = np.asarray(probs, dtype=np.float64)
    if probs.size == 0:
        raise ValueError("probs must not be empty")
    if n < 1:
        raise ValueError("n must be at least 1")
    n = min(n, probs.size)

    # argpartition places the n largest entries in the last n slots without a full sort.
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    total = top_prob.sum()
    if total <= 0:
        raise ValueError("top probabilities must have a positive sum")
    top_prob = top_prob / total  # Normalize

    return int(np.random.choice(ind, p=top_prob))

In [None]:
# Generate jokes with a fine-tuned checkpoint and append every joke that
# terminates with the end-of-text token to `generated_<MODEL_EPOCH>.jokes`.
MODEL_EPOCH = 4

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_medium_joker_{MODEL_EPOCH}.pt")
# map_location lets a checkpoint saved on a CUDA machine load when `device` is the CPU.
model.load_state_dict(torch.load(model_path, map_location=device))

jokes_output_file_path = f'generated_{MODEL_EPOCH}.jokes'

model.eval()
# Start from a clean file; jokes are appended one by one below.
if os.path.exists(jokes_output_file_path):
    os.remove(jokes_output_file_path)

# Loop-invariant tokenizer work, done once instead of on every step.
end_of_text_ids = set(tokenizer.encode('<|endoftext|>'))
prompt_ids = tokenizer.encode("JOKE:")

joke_num = 0
with torch.no_grad():

    for joke_idx in range(1000):

        joke_finished = False

        # Every joke starts from the same prompt used in the training data, shaped (1, prompt length).
        cur_ids = torch.tensor(prompt_ids).unsqueeze(0).to(device)

        for i in range(100):
            # No labels here: only the logits are needed, so the loss is not computed.
            logits = model(cur_ids).logits
            # Take the first (and only) batch entry and the prediction for the last position.
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            # Sample from a wider candidate pool for the first three tokens, then narrow it.
            n = 20 if i < 3 else 3
            # Randomly (from the top-n probability distribution) select the next token.
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            # Add the chosen token to the running sequence.
            next_token = torch.tensor([[next_token_id]], device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

            if next_token_id in end_of_text_ids:
                joke_finished = True
                break

        # Sequences that did not reach the end-of-text token within 100 steps are dropped.
        if joke_finished:

            joke_num = joke_num + 1

            output_text = tokenizer.decode(cur_ids[0].tolist())

            # Appending per joke keeps earlier results on disk if the run is interrupted;
            # the explicit encoding avoids platform-dependent failures on non-ASCII text.
            with open(jokes_output_file_path, 'a', encoding='utf-8') as f:
                f.write(f"{output_text} \n\n")