<h1>Fine-tuning someGPT on Dialog task</h1>

In [1]:
import torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
#device = 'cpu'
cuda.empty_cache()

from torch.utils.data import Dataset

import pandas as pd
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer


### For displaying

import os
from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
console=Console(record=True)

def display_df(df):
  """display dataframe in ASCII format"""

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for i, row in enumerate(df.values.tolist()):
    table.add_row(row[0], row[1])

  console.print(table)

training_logger = Table(Column("Epoch", justify="center" ), 
                        Column("Steps", justify="center"),
                        Column("Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)

  return torch._C._cuda_getDeviceCount() > 0
2021-12-01 22:21:16.753182: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-01 22:21:16.753223: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


<h3>Load dataset</h3>

In [2]:
df = pd.read_pickle('wizards-of-wikipedia-data-extraction/out.pkl')
print(df)

                                               prev_turn  \
0                                                          
1             I'm a huge fan of science fiction myself!    
2      I agree. One of my favorite forms of science f...   
3      And that's difficult to do when dealing with t...   
4      Thank you for the suggestion, I will definitel...   
...                                                  ...   
82717  I love heavy metal music. My favorite bands ar...   
82718  Awesome, it helps me relax and relieve my stre...   
82719  It's a great form of exercise as well. I usual...   
82720  Yeah, that would be awesome. At least Ozzy Osb...   
82721  Yeah metal wouldn't be the same without Ozzy. ...   

                                              human_sent  \
0      I think science fiction is an amazing genre fo...   
1      Awesome! I really love how sci-fi storytellers...   
2      It's not quite sci-fi, but my favorite version...   
3      If you really want a look at the

<h3>Dataset Class</h3>

In [3]:
class YourDataSetClass(Dataset):
    """
    Creating a custom dataset for reading the dataset and 
    loading it into the dataloader to pass it to the neural network for finetuning the model

    """

    def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        return len(self.target_text)

    def __getitem__(self, index):
        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        #cleaning data so as to ensure data is in string type
        source_text = ' '.join(source_text.split())
        target_text = ' '.join(target_text.split())

        source = self.tokenizer.batch_encode_plus([source_text], max_length= self.source_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([target_text], max_length= self.summ_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        temp = {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }
        
        return temp

<h2>Model Implementation</h2>

In [36]:
def GPT(dataframe, source_text, target_text, model_params, output_dir="./outputs/" ):
    tokenizer = GPT2Tokenizer.from_pretrained(model_params["MODEL"])
    tokenizer.add_special_tokens({
      "eos_token": "</s>",
      "bos_token": "<s>",
      "unk_token": "<unk>",
      "pad_token": "<pad>",
      "mask_token": "<mask>"
    })
    # creating the configurations from which the model can be made
    config = GPT2Config(
      vocab_size=tokenizer.vocab_size,
      bos_token_id=tokenizer.bos_token_id,
      eos_token_id=tokenizer.eos_token_id
    )
    # creating the model
    model = TFGPT2LMHeadModel(config)
    
    # Prepare dataset
    dataframe = dataframe[[source_text,target_text]]
    display_df(dataframe.head(2))


    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest for validation. 
    train_size = 0.8
    train_dataset=dataframe.sample(frac=train_size,random_state = model_params["SEED"])
    val_dataset=dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"TEST Dataset: {val_dataset.shape}\n")

    inputs = tokenizer.batch_encode_plus(train_dataset[source_text], max_length= model_params["MAX_SOURCE_TEXT_LENGTH"], pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt').data
    print(train_dataset[source_text][0])
    print(len(inputs['input_ids']))
    print(inputs)
    labels = tokenizer.batch_encode_plus(train_dataset[target_text], max_length= model_params["MAX_TARGET_TEXT_LENGTH"], pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt').data
    print(train_dataset[target_text][0])
    print(len(labels['input_ids']))
    print(labels)
    dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
    #print(dataset)
    
    #return
    # Creating the Training and Validation dataset for further creation of Dataloader
    #training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
    #val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)

    """
    # Defining the parameters for creation of dataloaders
    train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }


    val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }
    
    print(training_set[0:2])
    """
    
    dataset = dataset.shuffle(1000).batch(model_params["TRAIN_BATCH_SIZE"], drop_remainder=True)
    
    
    # COMPILE MODEL
    # defining our optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    # definining our loss function
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # defining our metric which we want to observe
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    # compiling the model
    model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])
    
    
    # MODEL TRAINING
    history = model.fit(dataset, epochs=model_params["TRAIN_EPOCHS"])
    
    return model, tokenizer

<h3>Generate responses</h3>

In [37]:
def predict(model, tokenizer):
    # encoding the input text
    input_ids = tokenizer.encode(text, return_tensors='tf')
    # getting out output
    beam_output = model.generate(
      input_ids,
      max_length = model_params["MAX_TARGET_TEXT_LENGTH"],
      num_beams = 5,
      temperature = 0.7,
      no_repeat_ngram_size=2,
      num_return_sequences=5
    )

<h3>Run the model</h3>

In [39]:
src_text = 'knowledge_sent'
out_text = 'human_sent'
model_params={
    "MODEL":"../models/DialoGPT-small/saves",             # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE":12,          # training batch size
    "VALID_BATCH_SIZE":1,          # validation batch size
    "TRAIN_EPOCHS":1,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":2e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":512,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH":512,   # max length of target text
    "SEED": 42                     # set seed for reproducibility 
}
model, tokenizer = GPT(df[:2048], src_text, out_text, model_params, 'myGPT_out')

Some, including ice cream trucks, sell frozen or prepackaged food; others have on-board kitchens and prepare food from scratch.
1638
{'input_ids': tensor([[ 4366,    11,  1390,  ..., 50259, 50259, 50259],
        [ 3919,    62,  6603,  ..., 50259, 50259, 50259],
        [32697,  5278, 26698,  ..., 50259, 50259, 50259],
        ...,
        [ 2025, 18284,  1097,  ..., 50259, 50259, 50259],
        [14573,   318,   530,  ..., 50259, 50259, 50259],
        [ 1026,   318,  2408,  ..., 50259, 50259, 50259]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
I understand your skepticism. However most have on board kitchens so its just like a normal restaurant on wheels.
1638
{'input_ids': tensor([[   40,  1833,   534,  ..., 50259, 50259, 50259],
        [   69,  3532,  6473,  ..., 50259, 50259, 50259],
    

ValueError: in user code:

    /home/farjad/.local/lib/python3.8/site-packages/keras/engine/training.py:853 train_function  *
        return step_function(self, iterator)
    /home/farjad/.local/lib/python3.8/site-packages/keras/engine/training.py:842 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/farjad/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/farjad/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/farjad/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/farjad/.local/lib/python3.8/site-packages/keras/engine/training.py:835 run_step  **
        outputs = model.train_step(data)
    /home/farjad/.local/lib/python3.8/site-packages/transformers/modeling_tf_utils.py:772 train_step
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    /home/farjad/.local/lib/python3.8/site-packages/keras/engine/compile_utils.py:436 update_state
        self.build(y_pred, y_true)
    /home/farjad/.local/lib/python3.8/site-packages/keras/engine/compile_utils.py:358 build
        self._metrics = tf.__internal__.nest.map_structure_up_to(y_pred, self._get_metric_objects,
    /home/farjad/.local/lib/python3.8/site-packages/tensorflow/python/util/nest.py:1376 map_structure_up_to
        return map_structure_with_tuple_paths_up_to(
    /home/farjad/.local/lib/python3.8/site-packages/tensorflow/python/util/nest.py:1458 map_structure_with_tuple_paths_up_to
        assert_shallow_structure(
    /home/farjad/.local/lib/python3.8/site-packages/tensorflow/python/util/nest.py:1091 assert_shallow_structure
        raise ValueError(

    ValueError: The two structures don't have the same sequence length. Input structure has length 4, while shallow structure has length 2.


In [27]:
tokenizer = GPT2Tokenizer.from_pretrained("../models/DialoGPT-small/saves")
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
single_string = ''
single_string += "Hello everyone!" + tokenizer.eos_token
single_string += "How are you doing?" + tokenizer.eos_token
single_string += "I'm doing well, what about you?" + tokenizer.eos_token
single_string += "All good, thank you" + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)
print(tokenizer.decode(string_tokenized))

examples = []
block_size = 10
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])

print(examples)
inputs, labels = [], []
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])

print(f"\n{tokenizer.decode(inputs[0])}\n{inputs[0]}\n")
print(labels)
print("\n\n\n")
for num in inputs[0]:
    print(f"n: {num}, dec: {tokenizer.decode(num)}")
print("\n\n\n")
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


Hello everyone! </s> How are you doing? </s> I'm doing well, what about you? </s> All good, thank you </s>
[[15496, 2506, 0, 50257, 2437, 389, 345, 1804, 30, 50257], [40, 1101, 1804, 880, 11, 644, 546, 345, 30, 50257]]

Hello everyone! </s> How are you doing?
[15496, 2506, 0, 50257, 2437, 389, 345, 1804, 30]

[[2506, 0, 50257, 2437, 389, 345, 1804, 30, 50257], [1101, 1804, 880, 11, 644, 546, 345, 30, 50257]]




n: 15496, dec: Hello
n: 2506, dec:  everyone
n: 0, dec: !
n: 50257, dec: </s>
n: 2437, dec: How
n: 389, dec:  are
n: 345, dec:  you
n: 1804, dec:  doing
n: 30, dec: ?




