In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch import cuda

# Importing the T5 modules from huggingface/transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

from rich.table import Column, Table
from rich import box
from rich.console import Console


training_logger = Table(Column("Epoch", justify="center"),
                        Column("Steps", justify="center"),
                        Column("Loss", justify="center"),
                        title="Training Status", pad_edge=False, box=box.ASCII)
console = Console(record=True)

class PTDataSetClass(Dataset):
    """
    Creating a custom dataset for reading the dataset and
    loading it into the dataloader to pass it to the neural network for fine-tuning the model
    """

    def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        return len(self.target_text)

    def __getitem__(self, index):
        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        # cleaning data so as to ensure data is in string type
        source_text = ' '.join(source_text.split())
        target_text = ' '.join(target_text.split())

        # source = self.tokenizer.batch_encode_plus([source_text], return_tensors='pt')
        # target = self.tokenizer.batch_encode_plus([target_text], return_tensors='pt')

        source = self.tokenizer.batch_encode_plus([source_text], max_length=self.source_len, pad_to_max_length=True,
                                                  truncation=True, padding="max_length", return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([target_text], max_length=self.summ_len, pad_to_max_length=True,
                                                  truncation=True, padding="max_length", return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        temp = {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

        return temp


def display_df(df):
    """display dataframe in ASCII format"""
    df_console = Console()
    table = Table(Column("source_text", justify="center"), Column("target_text", justify="center"),
                  title="Sample Data",
                  pad_edge=False, box=box.ASCII)

    for i, row in enumerate(df.values.tolist()):
        table.add_row(row[0], row[1])

    df_console.print(table)


class DialoGPTController:
    def __init__(self, model_path):
        # define a rich console logger
        self.TAG = 'DialoGPTController'

        # Not enough memory to train this model, which is why the cuda option is commented out
        # self.device = 'cuda' if cuda.is_available() else 'cpu'
        # cuda.empty_cache()
        self.device = 'cpu'

        # Conversation hasn't started, so set turn and chat history to None
        self.turn = None
        self.chat_history_ids = None

        self.model_path = model_path

        # Initial values for model and tokenizer
        self.model = None
        self.tokenizer = None

    def __initialize_model(self, refresh=False):
        # Loads model from disk into memory
        console.log(f"""[{self.TAG}]: Loading {self.model_path}...\n""")

        if self.tokenizer is None or refresh:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self.tokenizer.add_special_tokens({'pad_token': self.tokenizer.eos_token})
        if self.model is None or refresh:
            self.model = AutoModelForCausalLM.from_pretrained(self.model_path, bos_token_id=self.tokenizer.bos_token_id,
                                                              eos_token_id=self.tokenizer.eos_token_id)
            self.model = self.model.to(self.device)
            self.model.resize_token_embeddings(len(self.tokenizer))

        print(f"{self.TAG}: CUDA IS {'NOT' if not cuda.is_available() else ''} AVAILABLE")

    def predict(self, user_input, output_fragment="", new_dialog_session=False):
        # If this is the first turn, initialize the turn variable
        if new_dialog_session or self.turn is None:
            self.turn = 0

        # Ensure the model is initialized
        self.__initialize_model()

        # encode the new user input, add the eos_token and return a tensor in Pytorch
        new_user_input_ids = self.tokenizer.encode(
            user_input + self.tokenizer.eos_token + output_fragment, return_tensors='pt')

        # append the new user input tokens to the chat history
        bot_input_ids = torch.cat([self.chat_history_ids,
                                   new_user_input_ids], dim=-1) if self.turn > 0 else new_user_input_ids

        # generated a response while limiting the total chat history to 1000 tokens,
        # print(
        #    f"\ninput ids len: {bot_input_ids.shape}\nchat hist id len: "
        #    f"{self.chat_history_ids.shape if self.turn > 0 else new_user_input_ids.shape}"
        #    f"\nuser input id len: {new_user_input_ids.shape}\n")
        chat_history_ids = self.model.generate(bot_input_ids, max_length=1000, pad_token_id=self.tokenizer.eos_token_id,
                                               temperature=5, num_beams=4, repetition_penalty=2.5,
                                               num_return_sequences=3)

        # prompt = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        #          for g in bot_input_ids]
        # print(f"\nPROMPT: {prompt}\n{bot_input_ids}\n\n{chat_history_ids}")

        # print the decoded, generated output for this conversation turn
        # model may have returned multiple responses
        best_response = None
        for response_id, response in enumerate(chat_history_ids[:, bot_input_ids.shape[-1]:]):
            decoded_response = self.tokenizer.decode(response, skip_special_tokens=True)
            if response_id == 0:
                best_response = decoded_response
            print("DialoGPT: {}".format(decoded_response))

        return best_response

    def __train_step(self, loader, optimizer, epoch):
        """
        Function to be called for training with the parameters passed from main function

        """

        self.model.train()
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(self.device, dtype=torch.long)
            y_ids = y[:, :-1].contiguous()
            lm_labels = y[:, 1:].clone().detach()
            lm_labels[y[:, 1:] == self.tokenizer.pad_token_id] = -100
            ids = data['source_ids'].to(self.device, dtype=torch.long)
            mask = data['source_mask'].to(self.device, dtype=torch.long)

            outputs = self.model(input_ids=ids, attention_mask=mask, labels=y)
            loss = outputs[0]

            if _ % 10 == 0:
                training_logger.add_row(str(epoch), str(_), str(loss))
                console.print(training_logger)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    def train(self, train_batch_size, valid_batch_size, train_epochs, val_epochs, learning_rate,
              max_source_text_length, max_target_text_length, dataframe, source_text_key, target_text_key, output_dir):
        """
        T5 trainer

        """

        # Set random seeds and deterministic pytorch for reproducibility
        torch.manual_seed(42)  # pytorch random seed
        np.random.seed(42)  # numpy random seed
        torch.backends.cudnn.deterministic = True

        # Importing the raw dataset
        console.log(f"[{self.TAG}]: Reading data...\n")
        dataframe = dataframe[[source_text_key, target_text_key]]
        display_df(dataframe.head(2))

        # Creation of Dataset and Dataloader
        # Defining the train size. So 80% of the data will be used for training and the rest for validation.
        train_size = 0.8
        train_dataset = dataframe.sample(frac=train_size, random_state=42)
        val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
        train_dataset = train_dataset.reset_index(drop=True)

        console.print(f"FULL Dataset: {dataframe.shape}")
        console.print(f"TRAIN Dataset: {train_dataset.shape}")
        console.print(f"TEST Dataset: {val_dataset.shape}\n")

        # Creating the Training and Validation dataset for further creation of Dataloader
        training_set = PTDataSetClass(train_dataset, self.tokenizer, max_source_text_length,
                                      max_target_text_length, source_text_key, target_text_key)
        val_set = PTDataSetClass(val_dataset, self.tokenizer, self.max_source_text_length,
                                 self.max_target_text_length, source_text_key, target_text_key)

        # Defining the parameters for creation of data loaders
        train_params = {
            'batch_size': train_batch_size,
            'shuffle': True,
            'num_workers': 0
        }

        val_params = {
            'batch_size': valid_batch_size,
            'shuffle': False,
            'num_workers': 0
        }

        # Creation of Dataloaders for testing and validation. This will be used down for training and validation
        # stage for the model.
        training_loader = DataLoader(training_set, **train_params)
        val_loader = DataLoader(val_set, **val_params)

        # Defining the optimizer that will be used to tune the weights of the network in the training session.
        optimizer = torch.optim.Adam(params=self.model.parameters(), lr=learning_rate)

        # Training loop
        console.log(f'[Initiating Fine Tuning]...\n')

        for epoch in range(train_epochs):
            self.__train_step(training_loader, optimizer, epoch)

        console.log(f"[Saving Model]...\n")
        # Saving the model after training
        path = os.path.join(output_dir, "model_files")
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

        # evaluating test dataset
        console.log(f"[Initiating Validation]...\n")
        for epoch in range(val_epochs):
            knowledge_sent, predictions, actuals = self.validate(epoch, val_loader)
            final_df = pd.DataFrame(
                {'Knowledge Sentence': knowledge_sent, 'Generated Text': predictions, 'Actual Text': actuals})
            final_df.to_csv(os.path.join(output_dir, 'predictions.csv'))

        console.save_text(os.path.join(output_dir, 'logs.txt'))

        console.log(f"[Validation Completed.]\n")
        console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
        console.print(
            f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir, 'predictions.csv')}\n""")
        console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir, 'logs.txt')}\n""")

    def validate(self, epoch, loader):
        """
        Function to evaluate model for predictions

        """

        self.model.eval()
        prompts = []
        predictions = []
        actuals = []
        with torch.no_grad():
            for _, data in enumerate(loader, 0):
                y = data['target_ids'].to(self.device, dtype=torch.long)
                ids = data['source_ids'].to(self.device, dtype=torch.long)
                mask = data['source_mask'].to(self.device, dtype=torch.long)

                generated_ids = self.model.generate(
                    input_ids=ids,
                    attention_mask=mask,
                    max_length=64,
                    num_beams=2,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

                prompt = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in
                          ids]
                preds = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in
                         generated_ids]
                target = [self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in
                          y]
                if _ % 10 == 0:
                    console.print(f'Completed {_}')

                prompts.extend(prompt)
                predictions.extend(preds)
                actuals.extend(target)

                # print(f"Predictions: {predictions}\nActuals: {actuals}\nPrompt: {prompt}")
        return prompts, predictions, actuals


2021-12-05 16:14:25.421158: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-05 16:14:25.421198: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
model = DialoGPTController('../../models/DialoGPT-medium')

<h3>Model training</h3>

<h3>Inference: Dialog generation</h3>

In [4]:
model.predict(input(">> User:"))

>> User:South Africa


DialoGPTController: CUDA IS  AVAILABLE
DialoGPT: I'm from South Africa and I can confirm this.
DialoGPT: I'm in South Africa too!
DialoGPT: I'm from South Africa and I've never heard of it.


"I'm from South Africa and I can confirm this."

In [5]:
model.predict("Who was the host?")

DialoGPTController: CUDA IS  AVAILABLE
DialoGPT: I think it was a guy who looked like him, but I could be wrong.
DialoGPT: I think it was a guy who looked like him, but I'm not sure.
DialoGPT: I think it was a guy who looked like him, but I don't remember his name.


'I think it was a guy who looked like him, but I could be wrong.'

In [4]:
model.predict(input(">> User: "), output_fragment="Cinematography is an art, ")

>> User: What do you think about Cinematography?


DialoGPTController: CUDA IS  AVAILABLE
DialoGPT: ive only seen a few movies, but i like it.
DialoGPT: ive only seen a few movies, but i like the cinematography.
DialoGPT: ive only seen a few of the movies, but i love it.


'ive only seen a few movies, but i like it.'