<h1>Fine-tuning DialoGPT on Sentence Generation</h1>

<h3>Import Libraries</h3>

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelWithLMHead

from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
console=Console(record=True)

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
cuda.empty_cache()
device = 'cpu'

import json
from timeit import default_timer as timer

def display_df(df):
  """display dataframe in ASCII format"""

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for i, row in enumerate(df.values.tolist()):
    table.add_row(row[0], row[1])

  console.print(table)

training_logger = Table(Column("Epoch", justify="center" ), 
                        Column("Steps", justify="center"),
                        Column("Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)

2021-12-04 17:11:18.475860: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-04 17:11:18.475900: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
cuda.is_available()

True

<h3>DataSetClass</h3>
<h4>Custom dataset class for loading the dataset and passing it to the model</h4>

In [3]:
class DataSetClass(Dataset):
    """
    Creating a custom dataset for reading the dataset and 
    loading it into the dataloader to pass it to the neural network for finetuning the model

    """

    def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        return len(self.target_text)

    def __getitem__(self, index):
        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        #cleaning data so as to ensure data is in string type
        source_text = ' '.join(source_text.split())
        target_text = ' '.join(target_text.split())

        #source = self.tokenizer.batch_encode_plus([source_text], return_tensors='pt')
        #target = self.tokenizer.batch_encode_plus([target_text], return_tensors='pt')
        
        source = self.tokenizer.batch_encode_plus([source_text], max_length= self.source_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([target_text], max_length= self.summ_len, pad_to_max_length=True, truncation=True, padding="max_length", return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        temp = {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }
        
        return temp

<h3>Train method</h3>

In [4]:
def train(epoch, tokenizer, model, device, loader, optimizer):

    """
    Function to be called for training with the parameters passed from main function

    """

    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, labels=y)
        loss = outputs[0]

        if _%10==0:
            training_logger.add_row(str(epoch), str(_), str(loss))
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

<h3>Validate method</h3>

In [5]:
def validate(epoch, tokenizer, model, device, loader):

    """
    Function to evaluate model for predictions

    """
    model.eval()
    prompts = []
    predictions = []
    actuals = []
    with torch.no_grad():
      for _, data in enumerate(loader, 0):
          y = data['target_ids'].to(device, dtype = torch.long)
          ids = data['source_ids'].to(device, dtype = torch.long)
          mask = data['source_mask'].to(device, dtype = torch.long)

          generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=64, 
              num_beams=2,
              repetition_penalty=2.5, 
              length_penalty=1.0, 
              early_stopping=True,
              pad_token_id=tokenizer.eos_token_id
              )
        
          prompt = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in ids]
          preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
          target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
          if _%10==0:
              console.print(f'Completed {_}')

          prompts.extend(prompt)
          predictions.extend(preds)
          actuals.extend(target)
            
          #print(f"Predictions: {predictions}\nActuals: {actuals}\nPrompt: {prompt}")
    return prompts, predictions, actuals

In [13]:
def predict(tokenizer, model, source_text_key, target_text_key, source_text, target_text, model_params):
    model.eval()
    predictions = []
    actuals = []
    
    val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }
    

    with torch.no_grad():
        step = 0
        while True:
            print(f"\nSTEP #{step}")
            if step == 0:
                user_input = source_text
            else:
                user_input = input(">> user:") + tokenizer.eos_token
            
            data_src = pd.DataFrame([{'knowledge_sent': user_input, 'human_sent': target_text}])
            data_loader = DataLoader(DataSetClass(data_src, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"],
                                                      model_params["MAX_TARGET_TEXT_LENGTH"], source_text_key, target_text_key), **val_params)
            #print(f"data type: {data.testthis('target')}")

            for _, data in enumerate(data_loader, 0):
                #y = data['target_ids'].to(device, dtype = torch.long)
                source_ids = data['source_ids'].to(device, dtype = torch.long)
                mask = data['source_mask'].to(device, dtype = torch.long)

                #print(f"\nIDs:\n{ids}\n\nMASKS:\n{mask}")

                input_ids = torch.cat([past_gen_ids, source_ids], dim=-1) if step > 0 else source_ids
                input_ids = input_ids.to(device, dtype = torch.long)
                
                print(f"\nGenerated ids len: {past_gen_ids.shape if step > 0 else source_ids.shape}\nsource_ids len: {source_ids.shape}\ninput_ids len: {input_ids.shape}\nINPUT IDS: {input_ids}\n")
                
                generated_ids = model.generate(
                    input_ids = input_ids,
                    attention_mask = mask, 
                    max_length=model_params["MAX_SOURCE_TEXT_LENGTH"] + model_params["MAX_TARGET_TEXT_LENGTH"], 
                    num_beams=2,
                    repetition_penalty=2.5, 
                    length_penalty=1.0, 
                    early_stopping=True,
                    temperature=5,
                    pad_token_id=tokenizer.eos_token_id,
                    num_return_sequences=1
                  ).to(device, dtype = torch.long)

                output_list = []
                for output in generated_ids:
                    output_list.append(
                        tokenizer.decode(
                            output[input_ids.shape[-1]:], skip_special_tokens=True
                        )
                     )
                    
                past_gen_ids = generated_ids[0][input_ids.shape[-1]:].unsqueeze(0)
                    
                prompt = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in input_ids]
                #target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
                #actuals.extend(target)

                print(f"Predictions: {output_list}\nPrompt: {prompt}")

                #print("DialoGPT: {}".format(tokenizer.decode(generated_ids[:, ids.shape[-1]:][0], 
                #                                             skip_special_tokens=True)))

                #for hm in generated_ids[:, ids.shape[-1]:]:
                #    print("DialoGPT: {}".format(tokenizer.decode(hm, skip_special_tokens=False)))
            step = step + 1

In [7]:
def DialoGPTTrainer(dataframe, source_text, target_text, model_params, output_dir="./outputs/" ):

    """
    T5 trainer

    """

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"]) # pytorch random seed
    np.random.seed(model_params["SEED"]) # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenzier for encoding the text
    #tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    tokenizer = AutoTokenizer.from_pretrained(model_params["MODEL"])
    """
    tokenizer.add_special_tokens({
      "eos_token": "</s>",
      "bos_token": "<s>",
      "unk_token": "<unk>",
      "mask_token": "<mask>",
      "pad_token": "<pad>"
    })
    """
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    #model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    #model = AutoModelForCausalLM.from_pretrained(model_params["MODEL"])
    model = AutoModelForCausalLM.from_pretrained(model_params["MODEL"], bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id)
    model = model.to(device)

    model.resize_token_embeddings(len(tokenizer))
    # logging
    console.log(f"[Data]: Reading data...\n")

    # Importing the raw dataset
    dataframe = dataframe[[source_text,target_text]]
    display_df(dataframe.head(2))


    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest for validation. 
    train_size = 0.8
    train_dataset=dataframe.sample(frac=train_size,random_state = model_params["SEED"])
    val_dataset=dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"TEST Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = DataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
    val_set = DataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)


    # Defining the parameters for creation of dataloaders
    train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }


    val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }


    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)
    
    """
    for eg in val_loader:
        print(eg)
        break
    return 1
    """


    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=model_params["LEARNING_RATE"])


    # Training loop
    console.log(f'[Initiating Fine Tuning]...\n')

    for epoch in range(model_params["TRAIN_EPOCHS"]):
      train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log(f"[Saving Model]...\n")
    #Saving the model after training
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)


    # evaluating test dataset
    console.log(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        knowledge_sent, predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Knowledge Sentence': knowledge_sent, 'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv(os.path.join(output_dir,'predictions.csv'))

    console.save_text(os.path.join(output_dir,'logs.txt'))

    console.log(f"[Validation Completed.]\n")
    console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
    console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")
    
    return {"tokenizer": tokenizer, "model": model}


<h3>Load dataset</h3>

In [8]:
#with open('wizards-of-wikipedia-data-extraction/out.json', 'r') as json_file:
#    raw_dataset = json.load(json_file)

In [9]:
df = pd.read_pickle('../wizards-of-wikipedia-data-extraction/out.pkl')
print(df[:500])

                                             prev_turn  \
0                                                        
1           I'm a huge fan of science fiction myself!    
2    I agree. One of my favorite forms of science f...   
3    And that's difficult to do when dealing with t...   
4    Thank you for the suggestion, I will definitel...   
..                                                 ...   
495  Do you have a preference as to the type of hot...   
496  I think I prefer corndogs. It's less unwieldy ...   
497  Ooh, that just happened today! Joey Chestnut a...   
498  I've never been to a Nathan's but I bet that w...   
499                                                      

                                            human_sent  \
0    I think science fiction is an amazing genre fo...   
1    Awesome! I really love how sci-fi storytellers...   
2    It's not quite sci-fi, but my favorite version...   
3    If you really want a look at the potential neg...   
4    It blend

<h2>Run the model</h2>

In [10]:
#distilgpt2
source_text_key = 'knowledge_sent'
target_text_key = 'human_sent'
output_dir = "./outputs"
model_params={
    "MODEL":"../../models/DialoGPT-medium",             # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE":1,          # training batch size
    "VALID_BATCH_SIZE":1,          # validation batch size
    "TRAIN_EPOCHS":1,              # number of training epochs
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":2e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH":64,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH":64,   # max length of target text
    "SEED": 42                     # set seed for reproducibility 
}

In [11]:
start = timer()
model_out = DialoGPTTrainer(dataframe=df[:1], source_text="knowledge_sent", target_text="human_sent", model_params=model_params, output_dir=output_dir)
end = timer()
print(f"\n\nTRAINING TIME ELAPSED: {end-start} seconds")
model = model_out['model']
tokenizer = model_out['tokenizer']



TRAINING TIME ELAPSED: 8.450122989001102 seconds


In [14]:
#test_src_text = "Science fiction (often shortened to SF or sci-fi) is a genre of speculative fiction, typically dealing with imaginative concepts such as futuristic science and technology, space travel, time travel, faster than light travel, parallel universes, and extraterrestrial life"
#test_target_text = "I think science fiction is an amazing genre for anything. Future science, technology, time travel, FTL trave;, they're all such interesting concepts"
test_src_text = "What do you think of my meme game?"
#"Situated on the bank of the Chenab River, Multan is Pakistan's 7th largest city and is the major cultural and economic centre of Southern Punjab. Multan's history stretches deep into antiquity. The ancient city was site of the renowned Hindu Multan Sun Temple, and was besieged by Alexander the Great during the Mallian Campaign."
#test_src_text = "Do you like cycling?"
test_target_text = "Of the three primary colors, Blue is my favorite"
predict(tokenizer, model, 'knowledge_sent', 'human_sent', test_src_text + " " + tokenizer.eos_token, test_target_text, model_params)


STEP #0

Generated ids len: torch.Size([1, 64])
source_ids len: torch.Size([1, 64])
input_ids len: torch.Size([1, 64])
INPUT IDS: tensor([[ 2061,   466,   345,   892,   286,   616, 25336,   983,    30,   220,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256]])

Predictions: ["I don't know, I'm not a memer."]
Prompt: ['What do you think of my meme game? ']

STEP #1
>> user:What is the weather like?

Generated ids len: torch.Size([1, 13])
source_ids len: torch.Size([1, 64])
input_ids len: torch.Size([1, 77])
INPUT IDS: tensor([[   40,   836,   470,   760,   837,   314,  1101,   407,   257,  1066,
           263,

RuntimeError: shape '[-1, 77]' is invalid for input of size 128

In [None]:
predict(tokenizer, model, 'knowledge_sent', 'human_sent', in_msg + " " + tokenizer.eos_token, model_params)

<h3>Basic DialoGPT run</h3>

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


t_tokenizer = AutoTokenizer.from_pretrained("../../models/DialoGPT-large")
t_model = AutoModelForCausalLM.from_pretrained("../../models/DialoGPT-large")

t_model.eval()

In [None]:
# Let's chat for 5 lines
for step in range(10):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = t_tokenizer.encode(input(">> User:") + t_tokenizer.eos_token, return_tensors='pt')
    hist_id = t_tokenizer.encode("Matt Damon and John Lennon starred in Fast and Furious" + t_tokenizer.eos_token, return_tensors='pt')
    
    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([hist_id, new_user_input_ids], dim=-1) if step > -1 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    print(f"\ninput ids len: {bot_input_ids.shape}\nchat hist id len: {chat_history_ids.shape if step > 0 else new_user_input_ids.shape}\nuser input id len: {new_user_input_ids.shape}\n")
    chat_history_ids = t_model.generate(bot_input_ids, max_length=1000, pad_token_id=t_tokenizer.eos_token_id,
                                       temperature=5, num_beams=4,
                    repetition_penalty=2.5, num_return_sequences=3)
    
    prompt = [t_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in bot_input_ids]

    #print(f"\nPROMPT: {prompt}\n{bot_input_ids}\n\n{chat_history_ids}")

    # pretty print last ouput tokens from bot
    for hm in chat_history_ids[:, bot_input_ids.shape[-1]:]:
        print("DialoGPT: {}".format(t_tokenizer.decode(hm, skip_special_tokens=False)))

In [None]:
t_tokenizer = AutoTokenizer.from_pretrained("../models/DialoGPT-small/saves")
t_model = AutoModelForCausalLM.from_pretrained("../models/DialoGPT-small/saves")
help(t_model)