<h1>Fine-tuning DialoGPT on a Dialog Task</h1>

In [41]:
import torch
from torch import cuda
# Choose the compute device: the GPU when PyTorch can see one, otherwise the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Release any cached, currently unused GPU memory held by PyTorch's allocator.
cuda.empty_cache()

from torch.utils.data import Dataset

import pandas as pd
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer


### For displaying

import os
from rich.table import Column, Table
from rich import box
from rich.console import Console

# Notebook-wide rich console. record=True makes rich keep a copy of everything
# printed through it so the log can be exported afterwards.
console = Console(record=True)

def display_df(df):
  """Print the first two columns of ``df`` as an ASCII table.

  Every row of ``df`` becomes one table row. The first column is rendered
  under the "source_text" heading and the second under "target_text".

  Args:
    df: pandas DataFrame with at least two columns. Cell values that are not
      strings (numbers, NaN, ...) are converted with ``str`` before rendering.
  """

  # A fresh, non-recording console is used for this one-off display; it
  # intentionally shadows the module-level recording console.
  console = Console()
  table = Table(
    Column("source_text", justify="center"),
    Column("target_text", justify="center"),
    title="Sample Data",
    pad_edge=False,
    box=box.ASCII,
  )

  def _cell(value):
    # rich accepts only str / renderables / None as cells, so stringify
    # anything else instead of letting add_row raise.
    if value is None or isinstance(value, str):
      return value
    return str(value)

  for row in df.values.tolist():
    table.add_row(_cell(row[0]), _cell(row[1]))

  console.print(table)

# ASCII table used to report training progress: one column each for the
# epoch, the step count and the loss.
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

<h3>Load dataset</h3>

In [10]:
# Load the pickled DataFrame of dialogue turns.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
df = pd.read_pickle(
    'wizards-of-wikipedia-data-extraction/out.pkl'
)
print(df)

                                               prev_turn  \
0                                                          
1             I'm a huge fan of science fiction myself!    
2      I agree. One of my favorite forms of science f...   
3      And that's difficult to do when dealing with t...   
4      Thank you for the suggestion, I will definitel...   
...                                                  ...   
82717  I love heavy metal music. My favorite bands ar...   
82718  Awesome, it helps me relax and relieve my stre...   
82719  It's a great form of exercise as well. I usual...   
82720  Yeah, that would be awesome. At least Ozzy Osb...   
82721  Yeah metal wouldn't be the same without Ozzy. ...   

                                              human_sent  \
0      I think science fiction is an amazing genre fo...   
1      Awesome! I really love how sci-fi storytellers...   
2      It's not quite sci-fi, but my favorite version...   
3      If you really want a look at the

<h3>Model Initialization</h3>

In [8]:
# Load the tokenizer from the saved model directory.
model_path = '../models/DialoGPT-small/saves'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
# Register the special tokens; any that are not already in the vocabulary
# are appended to it and receive new ids.
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# Build the configuration the model is created from.
# len(tokenizer) includes the tokens added above, whereas tokenizer.vocab_size
# only counts the base vocabulary. Sizing the embedding with vocab_size would
# leave the newly added ids (e.g. <pad>) outside the embedding matrix.
config = GPT2Config(
  vocab_size=len(tokenizer),
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id,
  pad_token_id=tokenizer.pad_token_id
)
# Create the model.
# NOTE(review): constructing from a config yields randomly initialised weights;
# no pretrained checkpoint is loaded here. Confirm this is intended.
model = TFGPT2LMHeadModel(config)

2021-12-01 21:11:40.592965: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2021-12-01 21:11:40.592993: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: farjad-Lenovo-Y520
2021-12-01 21:11:40.593012: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: farjad-Lenovo-Y520
2021-12-01 21:11:40.593269: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.82.0
2021-12-01 21:11:40.593291: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.82.0
2021-12-01 21:11:40.593296: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.82.0
2021-12-01 21:11:40.593716: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in per

<h3>Dataset Class</h3>

In [48]:
class YourDataSetClass(Dataset):
    """
    Map-style dataset that tokenizes one (source, target) text pair per item.

    Each item is looked up by position in the given dataframe, whitespace
    normalised, tokenized to a fixed length and returned as a dict of
    ``torch.long`` tensors so it can be batched by a DataLoader.

    Args:
        dataframe: pandas DataFrame holding the text columns.
        tokenizer: object exposing ``batch_encode_plus`` (e.g. a Hugging Face tokenizer).
        source_len: fixed token length for the source text.
        target_len: fixed token length for the target text.
        source_text: name of the source column in ``dataframe``.
        target_text: name of the target column in ``dataframe``.
    """

    def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize a single text, padded/truncated to exactly ``max_length`` tokens."""
        # str() guards against non-string cells; split/join collapses runs of whitespace.
        cleaned = ' '.join(str(text).split())
        # padding="max_length" alone is sufficient; the deprecated
        # pad_to_max_length flag is redundant next to it.
        return self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the tokenized pair at position ``index``.

        Returns:
            dict with keys 'source_ids', 'source_mask', 'target_ids' and
            'target_ids_y' (the latter two hold the same target token ids).
        """
        # .iloc is positional, so lookups work even when the dataframe index
        # is not a fresh 0..n-1 range (e.g. after sampling without reset_index).
        source = self._encode(self.source_text.iloc[index], self.source_len)
        target = self._encode(self.target_text.iloc[index], self.summ_len)

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a length-1 sequence into a 0-d tensor.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

<h2>Model Implementation</h2>

In [78]:
def GPT(dataframe, source_text, target_text, model_params, output_dir="./outputs/"):
    """Set up tokenizer, model and training tensors for the dialog task.

    Builds a GPT-2 tokenizer/model pair, splits ``dataframe`` 80/20 into
    training and validation frames, tokenizes the training split and wraps it
    in a ``tf.data.Dataset``. The training loop is not implemented yet, so the
    function returns nothing.

    Args:
        dataframe: pandas DataFrame holding the text columns.
        source_text: name of the column used as model input.
        target_text: name of the column used as labels.
        model_params: dict of settings. Reads "SEED", "MAX_SOURCE_TEXT_LENGTH"
            and "MAX_TARGET_TEXT_LENGTH". "MODEL" (optional) is the tokenizer
            directory; when absent the notebook-level ``model_path`` is used.
        output_dir: currently unused.
    """
    # Prefer the path supplied by the caller; fall back to the notebook global
    # so existing calls without a "MODEL" entry keep working.
    tokenizer_path = model_params.get("MODEL") or model_path
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    tokenizer.add_special_tokens({
      "eos_token": "</s>",
      "bos_token": "<s>",
      "unk_token": "<unk>",
      "pad_token": "<pad>",
      "mask_token": "<mask>"
    })
    # Build the configuration the model is created from.
    # len(tokenizer) includes the special tokens added above; tokenizer.vocab_size
    # does not, which would leave ids such as <pad> outside the embedding matrix.
    config = GPT2Config(
      vocab_size=len(tokenizer),
      bos_token_id=tokenizer.bos_token_id,
      eos_token_id=tokenizer.eos_token_id,
      pad_token_id=tokenizer.pad_token_id
    )
    # NOTE(review): a model built from a config has randomly initialised
    # weights; no pretrained checkpoint is loaded. Confirm this is intended.
    model = TFGPT2LMHeadModel(config)

    # Keep only the two text columns and show a small sample.
    dataframe = dataframe[[source_text, target_text]]
    display_df(dataframe.head(2))

    # 80% of the rows go to training, the remainder to validation.
    train_size = 0.8
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"TEST Dataset: {val_dataset.shape}\n")

    # Hand the tokenizer plain lists of strings (astype(str) also covers NaN or
    # other non-string cells) and request TensorFlow tensors, since the result
    # feeds a tf.data pipeline and a TF model.
    inputs = tokenizer.batch_encode_plus(
        train_dataset[source_text].astype(str).tolist(),
        max_length=model_params["MAX_SOURCE_TEXT_LENGTH"],
        truncation=True,
        padding="max_length",
        return_tensors='tf',
    )
    labels = tokenizer.batch_encode_plus(
        train_dataset[target_text].astype(str).tolist(),
        max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
        truncation=True,
        padding="max_length",
        return_tensors='tf',
    )
    # Plain dicts of tensors are a structure from_tensor_slices slices per example.
    dataset = tf.data.Dataset.from_tensor_slices((dict(inputs), dict(labels)))

    # TODO: training/validation loop. `model`, `dataset` and `val_dataset` are
    # prepared above but not consumed yet; the PyTorch Dataset/DataLoader path
    # (YourDataSetClass with TRAIN_BATCH_SIZE / VALID_BATCH_SIZE) is not wired in.
    
    

<h3>Run the model</h3>

In [79]:
# Columns of `df` used as model input and as the reply to learn.
src_text = 'knowledge_sent'
out_text = 'human_sent'

# Settings passed to GPT().
model_params = {
    "MODEL": "../models/DialoGPT-small/saves",  # directory of the saved tokenizer/model
    "TRAIN_BATCH_SIZE": 1,                      # training batch size
    "VALID_BATCH_SIZE": 1,                      # validation batch size
    "TRAIN_EPOCHS": 3,                          # number of training epochs
    "VAL_EPOCHS": 1,                            # number of validation epochs
    "LEARNING_RATE": 2e-4,                      # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 512,              # max token length of source text
    "MAX_TARGET_TEXT_LENGTH": 512,              # max token length of target text
    "SEED": 42,                                 # seed for the reproducible train/val split
}

GPT(df, src_text, out_text, model_params, output_dir='myGPT_out')

Autism is a developmental disorder characterized by troubles with social interaction and communication.
66178
{'input_ids': tensor([[16541,  1042,   318,  ..., 50259, 50259, 50259],
        [   32,  1204, 14864,  ..., 50259, 50259, 50259],
        [   32,  4356, 39834,  ..., 50259, 50259, 50259],
        ...,
        [   39, 30921, 20351,  ..., 50259, 50259, 50259],
        [ 6653,   717,  1492,  ..., 50259, 50259, 50259],
        [28566,  1895,   373,  ..., 50259, 50259, 50259]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
Oh my, I'm sure it's good to find out. I know Autism is a developmental disorder, causing issues with communication and interaction in social situations, but not much more about it. Have you treatment plans?
66178
{'input_ids': tensor([[ 5812,   616,    11,  ..., 50259, 50259