### General imports

In [None]:
!pip install matplotlib
!pip install tqdm
!pip install sklearn

In [1]:
import torch 
import torchvision
import torch.nn as nn 
import torch.nn.init as init
from IPython.display import Image 
from torchvision import transforms
import matplotlib.pyplot as plt
import random
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import os

import torch.optim as optim
import time
import torch.nn.functional as F

import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit

import pickle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 12345
random.seed(seed)
torch.manual_seed(seed)

%matplotlib inline

### Model

In [None]:
!pip install transformers

In [2]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import torch


### Loading the data

In [3]:
train = pickle.load(open('okcupid_train.pkl', 'rb'))
val = pickle.load(open('okcupid_val.pkl', 'rb'))
test = pickle.load(open('okcupid_test.pkl', 'rb'))

### Dataset to handle the data

In [4]:
train[0]

'age : 35 ,  sex : m ,  education : working on space camp ,  job : hospitality / travel ,  income : 80000 ,  body_type : average  => "i am a chef: this is what that means. 1. i am a workaholic. 2. i love to cook regardless of whether i am at work. 3. i love to drink and eat foods that are probably really bad for me. 4. i love being around people that resemble line 1-3. i love the outdoors and i am an avid skier. if its snowing i will be in tahoe at the very least. i am a very confident and friendly. i\'m not interested in acting or being a typical guy. i have no time or patience for rediculous acts of territorial pissing. overall i am a very likable easygoing individual. i am very adventurous and always looking forward to doing new things and hopefully sharing it with the right person."'

In [4]:
class GPTDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, batch_size):
        #tokenizers usually do the tokenization and numericalization in one step, use encode to get word->index and  
        # decode to get index->word
        self.tokenizer = GPT2TokenizerFast.from_pretrained('distilgpt2')
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.dataset = dataset
        self.batch_size = batch_size
        
        self.batches = []
        
        for i in range(int(len(self.dataset) / batch_size)):
            batch = self.dataset[i * self.batch_size : (i + 1) * self.batch_size]
            self.batches += [self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt")]
            
    def __getitem__(self, index):
        return self.batches[index]

    def __len__(self):
        return len(self.batches)

### Preparing the data for training. BATCH SIZE is specified here

In [5]:
batch_size = 3

In [6]:
train_dataset = GPTDataset(train, batch_size)
val_dataset = GPTDataset(val, batch_size)
test_dataset = GPTDataset(test, batch_size)

In [10]:
# train_dataset[0]

### Training loop

In [12]:
def train_loop(model, optimizer, scheduler, train_dataset, device, epoch=5, 
               val_dataset=None, save_model_at_epoch = False):   
    output_dir = '/model/'
    output_prefix = 'step'
    train_losses = []
    val_losses = []
    
    for t in tqdm(range(epoch)):
        batches = 0 
        total = 0
        model.train()       
        total_loss = 0
        for batch_idx, x in tqdm(list(enumerate(train_dataset))):
            batches += 1
            x = x.to(device)
            
            output = model(**x, labels=x['input_ids'])
            
            loss = output[0]
            total_loss += loss.sum().detach().item()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step(loss.sum().detach().item())
            
        train_losses += [total_loss / batches]
        
        if val_dataset is not None:
            model.eval()
            with torch.no_grad():
                total_loss = 0
                batches = 0 
                total = 0
                for batch_idx, x in enumerate(val_dataset):
                    batches += 1
                    x = x.to(device)

                    output = model(**x, labels=x['input_ids'])
                    loss = output[0]
                    total_loss += loss.sum().detach().item()

                val_losses += [total_loss / batches]
                
        print("[EPOCH]: %i, [TRAIN LOSS]: %.6f" % (t, train_losses[-1]))
        if val_dataset is not None:
            print("[EPOCH]: %i, [VAL LOSS]: %.6f" % (t, val_losses[-1]))
        
        if save_model_at_epoch:


            path = './checkpoints/'
            if not os.path.exists(path):
                os.makedirs(path)
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_losses[-1],
                'valid_loss': val_losses[-1],
                }, os.path.join(path, f"epoch-{epoch}.pt"))   
 #        if save_model_at_end:
 #            torch.save(
 #                model.state_dict(),
 # f"{output_prefix}-{t}.pt",
 #            )
    return model, train_losses, val_losses

### Training the model

In [13]:
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model = model.cuda()
model.resize_token_embeddings(len(train_dataset.tokenizer))

Embedding(50258, 768)

In [14]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sat Dec 11 15:50:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0    31W /  70W |  13747MiB / 15109MiB |      0%      Default |
|                               |            

In [None]:
# optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size = 4)
# device = torch.device('cuda')
# trained_model, train_losses, val_losses = train_loop(model=model, optimizer=optimizer,
#                                       scheduler=scheduler, 
#                                       train_dataset=train_dataset, 
#                                       device=device, epoch=20, val_dataset=val_dataset, 
#                                       save_model_at_end=True)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
device = torch.device('cuda')
trained_model, train_losses, val_losses = train_loop(model=model, optimizer=optimizer,
                                      scheduler=scheduler, 
                                      train_dataset=train_dataset, 
                                      device=device, epoch=20, val_dataset=val_dataset, save_model_at_epoch = True)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 0, [TRAIN LOSS]: 3.462656
[EPOCH]: 0, [VAL LOSS]: 3.340747


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 1, [TRAIN LOSS]: 3.403092
[EPOCH]: 1, [VAL LOSS]: 3.317709


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 2, [TRAIN LOSS]: 3.377835
[EPOCH]: 2, [VAL LOSS]: 3.294382


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 3, [TRAIN LOSS]: 3.353113
[EPOCH]: 3, [VAL LOSS]: 3.272411


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 4, [TRAIN LOSS]: 3.330984
[EPOCH]: 4, [VAL LOSS]: 3.251709


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 5, [TRAIN LOSS]: 3.308307
[EPOCH]: 5, [VAL LOSS]: 3.232215


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 6, [TRAIN LOSS]: 3.289156
[EPOCH]: 6, [VAL LOSS]: 3.213706


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 7, [TRAIN LOSS]: 3.269221
[EPOCH]: 7, [VAL LOSS]: 3.196211


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 8, [TRAIN LOSS]: 3.252000
[EPOCH]: 8, [VAL LOSS]: 3.179798


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 9, [TRAIN LOSS]: 3.234160
[EPOCH]: 9, [VAL LOSS]: 3.164723


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 10, [TRAIN LOSS]: 3.219480
[EPOCH]: 10, [VAL LOSS]: 3.150645


  0%|          | 0/2413 [00:00<?, ?it/s]

[EPOCH]: 11, [TRAIN LOSS]: 3.204081
[EPOCH]: 11, [VAL LOSS]: 3.137572


  0%|          | 0/2413 [00:00<?, ?it/s]

### Saving the Model


In [None]:
torch.save(
                trained_model.state_dict(),
                "./20e/lr-bs3-ep20-5e5.pt",
            )
pickle.dump(train_losses, open('./20e/train_loss.p', 'wb'))
pickle.dump(val_losses, open('./20e/valid_loss.p', 'wb'))


In [None]:
model.save_pretrained("./models/lr-bs3-ep20-5e5/tokenizer/")

In [None]:
import pickle


pickle.dump(train_dataset, open('./20e/train_dataset.p', 'wb'))
pickle.dump(val_dataset, open('./20e/val_dataset.p', 'wb'))
pickle.dump(test_dataset, open('./20e/test_dataset.p', 'wb'))