### General imports

In [1]:
!pip install matplotlib
!pip install tqdm
!pip install sklearn



In [12]:
import torch 
import torchvision
import torch.nn as nn 
import torch.nn.init as init
from IPython.display import Image 
from torchvision import transforms
import matplotlib.pyplot as plt
import random
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import torch.optim as optim
import time
import torch.nn.functional as F

import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit

import pickle

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator this notebook imports so reruns are repeatable.
seed = 12345
random.seed(seed)
np.random.seed(seed)  # NumPy keeps its own global RNG state, separate from `random` and torch
torch.manual_seed(seed)

%matplotlib inline

### Model

In [None]:
!pip install transformers

In [13]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch


### Loading the data

In [14]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    NOTE(review): `pickle.load` can execute arbitrary code embedded in the file.
    Only load pickle files that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The train / validation / test splits are read from pickle files in the working directory.
train = _load_pickle('okcupid_train.pkl')
val = _load_pickle('okcupid_val.pkl')
test = _load_pickle('okcupid_test.pkl')

### Dataset to handle the data

In [15]:
# Number of examples in the training split loaded above.
len(train)

7240

In [17]:
class GPTDataset(torch.utils.data.Dataset):
    """Pre-tokenized, pre-batched view over a sliceable collection of texts.

    All batches are built eagerly in the constructor: consecutive slices of
    `batch_size` texts are passed to the 'distilgpt2' tokenizer with padding and
    truncation enabled, and the resulting PyTorch-tensor encodings are stored in
    `self.batches`. Indexing the dataset returns one whole batch, not one example.

    Args:
        dataset: Sliceable collection of texts (e.g. a list of strings).
        batch_size: Number of texts per batch; must be positive.
        drop_last: When True (the default, matching the original behaviour) a
            trailing group of fewer than `batch_size` texts is discarded. Pass
            False to keep it as a final, shorter batch.

    Raises:
        ValueError: If `batch_size` is not positive.
    """

    def __init__(self, dataset, batch_size, drop_last=True):
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")

        # Tokenizers usually do tokenization and numericalization in one step:
        # use encode to get word->index and decode to get index->word.
        self.tokenizer = GPT2TokenizerFast.from_pretrained('distilgpt2')
        # Register '[PAD]' as the padding token so that `padding=True` below can
        # pad every text in a batch to a common length.
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last

        # Integer arithmetic avoids the float round-trip of int(len / batch_size).
        full_batches, remainder = divmod(len(self.dataset), batch_size)
        keep_partial = remainder > 0 and not drop_last
        num_batches = full_batches + (1 if keep_partial else 0)

        self.batches = [
            self.tokenizer(
                self.dataset[start : start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            for start in range(0, num_batches * batch_size, batch_size)
        ]

    def __getitem__(self, index):
        """Return the pre-tokenized batch stored at position `index`."""
        return self.batches[index]

    def __len__(self):
        """Return the number of batches (not the number of individual texts)."""
        return len(self.batches)

### Preparing the data for training. BATCH SIZE is specified here

In [18]:
# Number of texts that are tokenized together into each pre-built batch.
batch_size = 16

In [19]:
# Wrap each split (in train, val, test order) in its own pre-batched dataset;
# all three share the same batch size.
train_dataset, val_dataset, test_dataset = (
    GPTDataset(split, batch_size) for split in (train, val, test)
)

### Training loop

In [20]:
def train_loop(model, optimizer, scheduler, train_dataset, device, epoch=5,
               val_dataset=None, save_model_at_end = False, output_dir='/model/'):
    """Fine-tune `model` as a language model and report the mean loss per epoch.

    Each item yielded by the datasets must be a mapping of model inputs that
    supports `.to(device)` and contains an 'input_ids' entry; the same ids are
    passed as `labels`, so the model is expected to return its loss as the
    first element of its output.

    Args:
        model: Module to train. It is moved to `device` in place.
        optimizer: Optimizer over `model`'s parameters.
        scheduler: Metric-driven scheduler; `scheduler.step(metric)` is called
            once per epoch with the validation loss (or the training loss when
            no validation set is given).
        train_dataset: Iterable of training batches.
        device: Device the model and every batch are moved to.
        epoch: Number of passes over `train_dataset`.
        val_dataset: Optional iterable of validation batches.
        save_model_at_end: When True, write the final `state_dict` to
            `<output_dir>/model.pt` after the last epoch.
        output_dir: Directory for the saved weights; created if missing.

    Returns:
        Tuple `(model, train_losses, val_losses)` where the loss lists hold one
        mean loss per epoch (`val_losses` stays empty without a validation set).

    Raises:
        ValueError: If `train_dataset` (or a provided `val_dataset`) yields no
            batches, since a mean loss cannot be computed.
    """
    # Local import: this notebook's import cell does not bring in `os`.
    import os

    # The batches are moved to `device` below, so the model has to live there too.
    model.to(device)

    train_losses = []
    val_losses = []

    for t in tqdm(range(epoch)):
        model.train()
        total_loss = 0.0
        batches = 0
        for x in tqdm(train_dataset):
            x = x.to(device)

            # The inputs double as labels; the model computes the LM loss itself.
            output = model(**x, labels=x['input_ids'])
            loss = output[0]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.sum().detach().item()
            batches += 1

        if batches == 0:
            raise ValueError("train_dataset produced no batches")
        train_losses += [total_loss / batches]

        if val_dataset is not None:
            model.eval()
            with torch.no_grad():
                total_loss = 0.0
                batches = 0
                for x in val_dataset:
                    x = x.to(device)

                    output = model(**x, labels=x['input_ids'])
                    total_loss += output[0].sum().detach().item()
                    batches += 1

            if batches == 0:
                raise ValueError("val_dataset produced no batches")
            val_losses += [total_loss / batches]

        # Step the scheduler once per epoch on an epoch-level metric. Feeding it
        # every noisy batch loss makes a plateau scheduler count its patience in
        # batches and cut the learning rate far too early.
        scheduler.step(val_losses[-1] if val_dataset is not None else train_losses[-1])

        print("[EPOCH]: %i, [TRAIN LOSS]: %.6f" % (t, train_losses[-1]))
        if val_dataset is not None:
            print("[EPOCH]: %i, [VAL LOSS]: %.6f" % (t, val_losses[-1]))

    # Save only once, after the last epoch.
    if save_model_at_end:
        os.makedirs(output_dir, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
    return model, train_losses, val_losses

### Training the model

In [None]:
# Start from the pretrained 'distilgpt2' weights and resize the token embeddings to
# the dataset tokenizer's vocabulary, which includes the added '[PAD]' token.
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(train_dataset.tokenizer))

optimizer = optim.AdamW(model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

# Training is pinned to the CPU here, overriding the `device` chosen earlier in the notebook.
device = torch.device('cpu')

trained_model, train_losses, val_losses = train_loop(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    train_dataset=train_dataset,
    device=device,
    epoch=1,
    val_dataset=val_dataset,
    save_model_at_end=True,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/452 [00:00<?, ?it/s]

In [None]:
def generate(model, tokenizer, prompt, entry_count=1, entry_length=30, top_p =0.8, temperature = 1.,):
    """Sample continuations of `prompt` from `model` with nucleus (top-p) sampling.

    For each entry the prompt is extended one token at a time, for at most
    `entry_length` tokens. Generation of an entry stops early as soon as the
    sampled token is one of the ids produced by `tokenizer.encode("[PAD]")`.

    Args:
        model: Language model whose first output (when called without labels)
            is the logits tensor indexed as [batch, position, vocabulary].
        tokenizer: Object providing `encode(text) -> list of ids` and
            `decode(ids) -> str`.
        prompt: Text every entry starts from.
        entry_count: Number of independent continuations to sample.
        entry_length: Maximum number of new tokens per continuation.
        top_p: Cumulative-probability threshold of the nucleus; tokens outside
            the smallest set of most likely tokens reaching it are discarded.
        temperature: Divisor applied to the logits before sampling; values
            that are not positive are treated as 1.0.

    Returns:
        List of `entry_count` decoded strings. Entries that hit `entry_length`
        without sampling the pad token get a literal "[PAD]" appended.

    Raises:
        ValueError: If `prompt` encodes to no tokens.
    """
    model.eval()
    generated_list = []
    filter_value = -float("Inf")

    # Loop-invariant work, done once instead of on every sampling step.
    prompt_ids = tokenizer.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")
    pad_ids = set(tokenizer.encode("[PAD]"))
    scale = temperature if temperature > 0 else 1.0

    # Build the input on whatever device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():

        # `tqdm(range(...))` replaces `trange`, which this notebook never imports.
        for entry_idx in tqdm(range(entry_count)):

            entry_finished = False
            generated = torch.tensor(prompt_ids, dtype=torch.long, device=model_device).unsqueeze(0)

            for i in range(entry_length):
                # No labels are passed: only the logits are needed, not a loss.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / scale

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark every token after the cumulative mass exceeds top_p. The mask
                # is shifted right by one so the token that crosses the threshold is
                # kept, and the most likely token is always kept.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = False

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in pad_ids:
                    entry_finished = True
                    break

            # squeeze(0) drops only the batch axis, so the result is always a flat id list.
            output_text = tokenizer.decode(generated.squeeze(0).tolist())
            if not entry_finished:
                output_text = f"{output_text}[PAD]"
            generated_list.append(output_text)

    return generated_list

In [None]:
def text_generation(test_data, model=None, tokenizer=None):
    """Generate one continuation for every text in `test_data`.

    Args:
        test_data: Iterable of raw prompt strings. Pass the raw `test` list,
            not the pre-tokenized `test_dataset`.
        model: Language model to sample from. Defaults to the notebook-level
            `trained_model`.
        tokenizer: Tokenizer matching `model`. Defaults to
            `train_dataset.tokenizer`, i.e. the tokenizer the model was
            fine-tuned with.

    Returns:
        A list with one element per input text; each element is the list
        returned by `generate` for that text (a single string, since
        `entry_count=1`).
    """
    # The notebook never defines a global `tokenizer`, so resolve both
    # defaults from the objects that do exist at notebook level.
    if model is None:
        model = trained_model
    if tokenizer is None:
        tokenizer = train_dataset.tokenizer

    # Move the model to the CPU once up front instead of once per prompt.
    cpu_model = model.to('cpu')

    return [generate(cpu_model, tokenizer, prompt, entry_count=1) for prompt in test_data]

# Run the generation over the raw `test` texts (the list loaded from the pickle file,
# not the pre-tokenized `test_dataset`) and keep the generated descriptions.
generated = text_generation(test)