## Assignment: Create a Chatbot

### Name: Nikiforos Mandilaras

### Email: nikiforosmandi@windowslive.com

### Date: 15/11/2019

In [1]:
import os
import pickle
import numpy as np
import glob
import copy
import statistics
import torch
import torch.optim as optim
import torch.nn.functional as F
from datetime import datetime
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, WEIGHTS_NAME, CONFIG_NAME
from itertools import chain
from ast import literal_eval
from itertools import zip_longest
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader 

In [2]:
def load_checkpoint(output_dir='openai-gpt'):
    """
    Loads GPT Model and the corresponding tokenizer from local checkpoint.
    If no path is specified the pretrained weights on language modelling task are downloaded.
    
    :param output_dir: path to checkpoint 
    :return : model and tokenizer
    """

    tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
    model = OpenAIGPTLMHeadModel.from_pretrained(output_dir)  
    
    return model, tokenizer

In [3]:
model, tokenizer = load_checkpoint() 

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [65]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Number of trainable model parameters: {}".format(trainable_params))

Number of trainable model parameters: 116538624


In [None]:
# TODO check number of parameters
# print(model) # check the architecture of the model

In [4]:
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ('<speaker1>', '<speaker2>')}

In [5]:
def add_special_tokens_(model, tokenizer):
    """
    Adds special tokens to the tokenizer and if they weren't present to the model also.
    """
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
    
    if num_added_tokens > 0:
        print("New tokens added to model: {}".format(num_added_tokens))
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens) 
    else:
        print("No new tokens found! Nothing added.")

In [6]:
# various constants needed

max_history = 2 # pairs of question/answer to be retained
min_sentence_length = 1
max_sentence_length = 20 # maximum length of a sentence produced by the model 

temperature = 0.75 # increases confidence in the most propable outputs 

use_cuda = False # whether to try to use cuda or not

SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
add_special_tokens_(model, tokenizer)  

SPECIAL_TOKENS_IDS = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
TIME_FORMAT = '%Y%m%d_%H%M'

New tokens added to model: 5


In [7]:
SPECIAL_TOKENS_IDS

[40478, 40479, 40481, 40482, 40480]

### Data Preprocessing

In [66]:
workspace = 'C:\\Users\\nikmand\\nikmand\\ncsr-chatbot\\'  # os.getcwd()

# TODO new function for the tokenization process

def parser(datafolder='metalwoz-v1/dialoguesTest/', cache_file='cache_folder/dialogs.txt'): 
    """
    Function that reads files, keeps only 'turns' from each entry and tokenizes them

    :param datafolder: path to the folder that contains the files
    :return: a list that contains dialogs, each dialog is a list of lists 
             where each of them represents the ids of a phrase,
    """
    try: 
        with open(cache_file, "rb") as f:
            print("Cache file found loading content.")
            dialogs = pickle.load(f)
            return dialogs
    except: # cache file not created yet
         print("Cache file not found. Start processing.")       
    
        dialogs, dialogs_len = [], []
        files = list(glob.glob(os.path.join(workspace, datafolder ,"*.txt")))

        for file in files:
            with open(file) as f:
                for line in f.readlines():

                    dialog = literal_eval(line)['turns'][1:] # keep only turns without the first sentence
                    dialog = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(phrase)) for phrase in dialog]  
                    dialogs.append(dialog) 

        with open(cache_file, "wb") as f:
            pickle.dump(dialogs, f)            
                
    return dialogs      

In [9]:
def extract_pairs(dialogs = None, cache_file='cache_folder/pairs.txt'):
    """
    Function that creates pairs of input, output from dialogs, each dialogs corresponds now to many pairs.
    
    :param dialogs: a list with all the dialogs 
    :param cache_file: path to save the result
    
    :return : a list whose elements are pairs of input(history), output(expected bot reply)  
    """
    try:         # try to open cache file
        
        with open(cache_file, "rb") as f:
            print("Cache file found loading content.")
            pairs = pickle.load(f)       
            return pairs
        
    except:      # cache file not created yet
        print("Cache file not found. Start processing.")
        
        pairs = [] 
        for dialog in dialogs:
            
            t_dict = {'input': []}
            if len(dialog) % 2 != 0: # discard the last phrase if it was said by the user
                dialog = dialog[:-1]    
            dialog_it = iter(dialog)
            
            for i_phrase, o_phrase in zip_longest(dialog_it, dialog_it): # process phrases two by two        
                try:
                    t_dict["input"].append(t_dict["output"])
                except:
                    pass 
                t_dict["input"].append(i_phrase) # history
                t_dict["output"] = o_phrase
                pairs.append(t_dict)
                t_dict = copy.deepcopy(t_dict) # so future changes address only the new dict
                
        with open(cache_file, "wb") as f:  # save result so future calls can retrieve it right away
            pickle.dump(pairs, f)
            
        return pairs

In [10]:
def adjust_history(pairs, max_history=2): # seq len reduced from 263 to 181
    """
    Reduces number of previous chat senteces that are going to be included in the input
    
    :param pairs: list with samples 
    :param max_history: Number of pairs of question/answer to be preserved (at least one is preserved)
    return : two lists, pairs with fixed history and theirs corresponding seq_lenghts 
    """
    
    pairs_len = []
    for pair in pairs:
        
        pair['input'] = pair['input'][-(2*max_history+1):] # at least one phrase is preserved
        pair_len = sum(len(phrase) for phrase in pair['input']) + len(pair['output'])
        pairs_len.append(pair_len)
        
    return pairs, pairs_len   

In [19]:
def filter_samples(samples, samples_len, percentile=90):
    """
    Filters samples based on sequence lengths.  
    
    :param samples: a list with samples
    :param samples_len: their corresponding lengths
    :param percentile: percentage of samples to preserve  
    
    :return : two lists, preserved samples and their lengths
    """
    
    samples_length = np.array(samples_len)
    reasonable_length = np.percentile(samples_length, percentile)
    print("{}% of the samples have sequence length less than {}".format(percentile, reasonable_length))
    
    samples_red, samples_len_red = [], []
    for sample, sample_len in zip(samples, samples_len):
        
        if sample_len <= reasonable_length:
            samples_red.append(sample)
            samples_len_red.append(sample_len)
    
    return samples_red, samples_len_red  

In [67]:
dialogs = parser()

print("Number of dialogs in the whole dataset: {}".format(len(dialogs)))

pairs = extract_pairs(dialogs) # list of dictionaries of input history and bot's reply

# keep only portion of the chat history to reduce seq_length
pairs, pairs_len = adjust_history(pairs, max_history=max_history) 

print("Number of pairs created: {}".format(len(pairs)))
print("Maximum seq_length observed: {}".format(max(pairs_len)))

pairs_reduced, pairs_len_reduced = filter_samples(pairs, pairs_len) # reduces from 181 to 81 (history 2) or from 263 to 108
# mean leangth with history 2 is  47 and max 181

Number of dialogs in the whole dataset: 3828
Cache file found loading content.
Number of pairs created: 19379
Maximum seq_length observed: 181
90% of the samples have sequence length less than 81.0


We instantiate a gpt pytorch model with pre-trained weights on language modelling task.

### Tokenizer

A helper class used to interact with the vocabulary in which our model has been pre-trained.
Our language model have been pre-trained with a vocabulary of {} words.

In [49]:
print("Our language model have been pre-trained with a vocabulary of {} words.".format(tokenizer.vocab_size))

Our language model have been pre-trained with a vocabulary of 40478 words.


Εμείς έχουμε πάντα περιττού πλήθους history που αρχίζει και τελειώνει με speaker1 και reply που το λέει ο speaker2

Για το input_ids: Η λογική είναι αναθέτει τον speaker2 κάθε φορά που μένει άρτιο πλήθος από διαλόγους(περιττό συνολικά μαζί με sos, βλέπε συνθήκη). Εμάς όλες μας οι λίστες έχουν άρτιο πλήθος οπότε θα ξεκινήσει με speaker2 ενώ θέλουμε speaker1. Τα επιμέρους αποτελεσματα όμως είναι συμβατά μεταξύ τους.

Στο input_ids το i στο iter παίρνει τιμή i = seq_len - 2 (αφού ξεκινήσαμε από το δεύτερο στοιχείο το iteration)

Για το token_type_ids: για κάθε μία λίστα κάνουμε iterate στα στοιχεία της, αν η θέση της λίστας είναι άρτια παίρνει speaker1 αλλιώς speaker2
Στο token_type_ids: επειδή το πλήθος είναι περιττό με την προσθήκη του sos θα αλλάξει η σειρά και η πρώτη πρόταση θα πάει speaker2 και το reply speaker1

Καταρχάς η αντιστοιχία που δίνουν οι ίδιοι στο δικό τους δεν ταιριάζει με αυτό που είχαμε σκεφτεί 
Κατά δεύτερο πρέπει να δούμε που θα μπει αν θα μπει το sos, αυτό μας δημιουργεί πρόβλημα αυτή τη στιγμή. Θα μπει μετά το tag του speaker ? 

είτε θα μπει μόνο του πριν τον speaker
σε αυτή την περίπτωση θα πρέπει να παίρνει το tag του speaker1 στα tokens αυτό δε συμβαίνει τώρα και μας μπερδεύει τη σειρά 

για το label βάζει σε όλα τα inputs εκτός του reply -1, στο speaker2 του reply -1 και βάζει τα tokens του reply.

Τι ακριβώς θα δούμε με το validation.

### Για διαχωρισμό σε train test
υπάρχει κάτι που να δημιουργεί πρόβλημα;
μπορεί να μας ενοχλεί ότι ζεύγη που έρχονται από διαφορετικούς διαλόγους θα χωριστούν; μας ενοχλεί αν δεν μπαίνουν με τη σειρά;
αν δεν κάνουμε τυχαίο split κάποια domains δε θα εμφανίζονται στο train set

In [36]:
pairs_train, pairs_eval = train_test_split(pairs_reduced, test_size=0.3, shuffle=True)      

print("Number of samples in train set: {} and in validation: {}".format(len(pairs_train), len(pairs_eval)))

Number of samples in train set: 12243 and in validation: 5248


"""
pad on batch level 

in order to avoid padding to the global max_len we can define our own collate_fn
which forms the samples into batches and call inside there the pad function.
Samples should be allocated to batches based on their sequence length in order to
minimize the need for padding.
""" 

In [28]:
def create_model_inputs(history, reply, tokenizer, with_eos=True):
    """
    Function that creates the various parts of the model input from input/output pairs.
    """
    
    bos, eos, speaker1, speaker2 = SPECIAL_TOKENS_IDS[:-1]
    sequence = [[bos]] + history + [reply + ([eos] if with_eos else [])]
    seq_len = len(sequence) # sequence: list of lists
    sequence = [sequence[0]] + [[speaker2 if (seq_len-i) % 2 != 1 else speaker1] + s 
                                for i, s in enumerate(sequence[1:])]
    
    instance = {}
    instance["input_ids"] = list(chain(*sequence)) # words
    instance["token_type_ids"] = [speaker1] + [speaker2 if i % 2 else speaker1 
                                               for i, s in enumerate(sequence[1:]) for _ in s] # for each word
    instance["mask"] = [1] * len(instance["input_ids"]) 
    instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:]
    # TODO positional embeddings
    
    return instance

In [29]:
class DialogDataset(Dataset):
    """
    """

    def __init__(self, dialog_pairs):
        self.dataset = self.create_segments(dialog_pairs)
        self.dataset = self.sort_on_seq_length()

    def __len__(self):
        return len(self.dataset)
    
    def create_segments(self, dialog_pairs):
        """
        """
        dataset = []
        for pair in dialog_pairs:
            instance = create_model_inputs(pair['input'], pair['output'], tokenizer)
            dataset.append(instance)
        return dataset
    
    def sort_on_seq_length(self):
        """
        Sorts dataset based on seq_len to minimize padding afterwards
        """
        return sorted(self.dataset, key=lambda x: len(x['input_ids']))
    

    def __getitem__(self, index):
        return  self.dataset[index]

In [30]:
def pad_sequenses(batch, pad_token=0):
    """
    """
    max_seq_len = max(len(entry["input_ids"]) for entry in batch)
    for entry in batch:
        for index_name in entry.keys():
            if index_name == "lm_labels":
                pad_token_ = -1
            elif index_name == "mask":
                pad_token_ = 0
            else:
                pad_token_ = pad_token
            entry[index_name] =  entry[index_name] + [pad_token_] * (max_seq_len - len(entry[index_name]))
  
    return batch  

In [31]:
def custom_collate_fn(batch):
    """
    """
    batch = pad_sequenses(batch, tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
    
    inputs = [torch.stack(list(map(lambda x: torch.from_numpy( \
        np.array(x[index_name])), batch)), dim=0) for index_name in batch[0].keys()]

    inputs = [input_tensor.type(torch.LongTensor) for input_tensor in inputs]

    if use_cuda and torch.cuda.is_available:
        inputs = [input_tensor.cuda() for input_tensor in inputs]
        
    # input_ids, mask, category_ids, label_ids = inputs    

    return inputs  

In [58]:
training_set = DialogDataset(pairs_train[:10]) 
validation_set = DialogDataset(pairs_eval[:32])

In [59]:
TRAIN_BATCH_SIZE = 32 
EVAL_BATCH_SIZE = 64 

dataloader_train = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=False, 
                              collate_fn=custom_collate_fn, num_workers=0) 

dataloader_valid = DataLoader(validation_set, batch_size=EVAL_BATCH_SIZE, shuffle=False,
                              collate_fn=custom_collate_fn, num_workers=0)

### Training procedure

loss_function = nn. check it την διαλέγει μόνο του?
αν δοθεί το labels αρχικοποιεί και χρησιμοποιεί εσωτερικά το crossEntropyLoss
να πούμε αναλυτικά τι κάνει σε πρώτη φάση, αν δε το γράψουμε χεράτα.

In [39]:
def train(model, dataloader, optimizer, print_period=400):
    """
    """
    epoch_loss = 0.0
    model.train()
    
    for i_batch, (input_ids, attention_mask, category_ids, label_ids) in enumerate(dataloader):
        
        loss, logits = model(input_ids, attention_mask, category_ids, labels=label_ids)
        optimizer.zero_grad() 
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()  
        
    return epoch_loss    

In [40]:
def evaluate(model, dataloader):
    """
    """
    epoch_loss = 0.0
    model.eval()
    with torch.no_grad():
        
        for i_batch, (input_ids, attention_mask, category_ids, label_ids) in enumerate(dataloader):
            
            loss, logits = model(input_ids, attention_mask, category_ids, labels=label_ids)
            epoch_loss += loss.item()
            
    return epoch_loss

In [34]:
def checkpoint_state(model, tokenizer, output_dir=None):
    """
    Function that saves models weights and configuration as well as tokenizer's voc.
    
    :param model: to checkpoint
    :param tokenizer: to checkpoint
    :param output_dir: directory where checkpointed files will be created
    :retun :
    """
    
    if output_dir is None:
        output_dir = workspace + 'metalwoz-v1\\checkpoint_{}'.format(datetime.now().strftime(TIME_FORMAT))
    
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        pass
    
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(output_dir, CONFIG_NAME)  
    
    torch.save(model.state_dict(), output_model_file) # checkpoint weights
    model.config.to_json_file(output_config_file)     # configuration
    tokenizer.save_vocabulary(output_dir)             # vocabulary

In [41]:
class EarlyStoppingException(Exception):
    def __init__(self, message):

        super().__init__(message)

In [60]:
def early_stopping(min_loss, cur_patience, max_patience, epoch_eval_loss):
    """
    """
    if (epoch_eval_loss >= min_loss):  
        cur_patience += 1
        if (cur_patience >= max_patience):
            raise EarlyStoppingException("Execution terminated due to Early Stopping")
    else:
        print("New min validation loss: \t epoch {} : {:.4f}".format(epoch, epoch_eval_loss))
        checkpoint_state(model, tokenizer) # checkpointing
        print("New checkpoint created")
        min_loss, cur_patience = epoch_eval_loss, 0    
        
    return min_loss, cur_patience    

In [61]:
epochs = 10
lr = 6.25e-5
min_loss, max_patience, cur_patience = np.inf, 3, 0

if use_cuda and torch.cuda.is_available:
    model.cuda()

optimizer = optim.Adam(model.parameters(), lr=lr) # , weight_decay=0.001 # TODO review those values
for epoch in range(epochs):

    train_loss = train(model, dataloader_train, optimizer) / len(dataloader_train)
    print("Loss on train set: \t\t epoch {} : {:.4f}".format(epoch, train_loss))
    
    eval_loss = evaluate(model, dataloader_valid) / len(dataloader_valid)     
    print("Loss on validation set: \t epoch {} : {:.4f}".format(epoch, eval_loss))    
 
    try:
        min_loss, cur_patience = early_stopping(min_loss, cur_patience, max_patience, eval_loss) 
    except EarlyStoppingException as e:
        print("{} at epoch: {}".format(e, epoch))
        break

76
Loss on train set: 		 epoch 0 : 5.0405
88
Loss on validation set: 	 epoch 0 : 6.1711
New min validation loss: 	 epoch 0 : 6.1711
New checkpoint created
76
Loss on train set: 		 epoch 1 : 4.8323
88
Loss on validation set: 	 epoch 1 : 6.0714
New min validation loss: 	 epoch 1 : 6.0714
New checkpoint created
76
Loss on train set: 		 epoch 2 : 4.5554
88
Loss on validation set: 	 epoch 2 : 6.2055
76
Loss on train set: 		 epoch 3 : 4.4879
88
Loss on validation set: 	 epoch 3 : 6.0672
New min validation loss: 	 epoch 3 : 6.0672
New checkpoint created
76
Loss on train set: 		 epoch 4 : 4.3195
88
Loss on validation set: 	 epoch 4 : 6.1503
76
Loss on train set: 		 epoch 5 : 4.2222
88
Loss on validation set: 	 epoch 5 : 6.3049
76
Loss on train set: 		 epoch 6 : 4.0674
88
Loss on validation set: 	 epoch 6 : 6.4207
Execution terminated due to Early Stopping at epoch: 6


## Interaction with the bot - Inference

In [361]:
def format_input(history, reply_so_far):
    """
    """
    history = [tokenizer.encode(phrase) for phrase in history]
    
    instance = create_model_inputs(history, reply_so_far, tokenizer, with_eos=False)
    
    input_ids = torch.tensor(instance["input_ids"]).unsqueeze(0)
    token_type_ids = torch.tensor(instance["token_type_ids"]).unsqueeze(0)
    
    return input_ids, token_type_ids

In [360]:
def decoding(probs, logits, method="top_p"):
    """
    Functions that selects the next token to be emmited. Three different approaches are implemented: 
    
    Greedy: the most probable token is selected.
    Top-k : 
    Top-p : 
    
    :param logits: 
    :param method: the decoding method to be used, Values={'greedy', 'top_k', 'top_p'}
    :return: the selected token
    """
    top_k = 40 # sample from the 100 most probable tokens based on their probs
    top_p = 0.9 # sample from the n most probable tokens that have a cumulative probability at least 0.9 
    
    if method == "greedy":
        return torch.argmax(probs).item()
    
    elif method == "top_k":        
        prob_k = probs.topk(top_k)[0][-1].item() # value of the 100th most probable
        probs[probs < prob_k] = 0   # cut off the tail  
        
    elif method == "top_p":
        probs_sorted, probs_indexes = probs.sort(dim=-1, descending=True) # start the cumulation from the most probable token in descending order
        cum_probs = probs_sorted.cumsum(dim=-1)
        
        indices = cum_probs > top_p 
        indices[1:] = indices[:-1].clone()
        indices[0] = 0 # at least one token is preserved 
        
        probs[probs_indexes[indices]] = 0
    
    word = torch.multinomial(probs, 1).item()
    # TODO handle the case that special token was emitted in the first pick
    
    return word

In [359]:
def infer_answer(history, model, tokenizer, method="top_p"):
    """
    Function that generates word by word the bot answer, based on user input and previous history.
    
    :param history: a list of past sentences and last user's input, in plain text
    :param model: the model to be used for inference
    :return: a list with the words of the answer in plain text 
    """
    model.eval()
    reply_so_far = []
    with torch.no_grad():
    
        for i in range(max_sentence_length):
            
            input_ids, category_ids = format_input(history, reply_so_far)
            # print("Inputs ids are {}".format(input_ids)) seems good
            # print("Category ids are {}".format(category_ids)) seems good
            outputs = model(input_ids=input_ids, token_type_ids=category_ids)
            logits = outputs[0]
            logits = logits[0, -1, :] / temperature # keep last 
            probs = F.softmax(logits, dim=-1) 
            word = decoding(probs, logits, method=method) 
            
            if word in SPECIAL_TOKENS_IDS: # we stop inference if we find a special token without emitting this token
                print("Bot terminate sentence!")
                break
            reply_so_far.append(word)
            
        answer_text = tokenizer.decode(reply_so_far, skip_special_tokens=True)    
        return answer_text

In [358]:
def interact_with_bot(model, tokenizer, method='top_p'):
    """
    """
    bot_prompt = "bot:>>> "
    user_prompt = "user:>>> "

    history = []
    print(bot_prompt + "Hello how may I help you?")
    user_input = input(user_prompt)
    
    while user_input != "\q": # TODO check if we need to truncate user input to not exceed max_length
        
        history.append(user_input)
        answer = infer_answer(history, model, tokenizer, method=method)
        history.append(answer)
        
        history = history[-(2*max_history+1):]  # keep the same history as in the training 
        
        print(bot_prompt + answer)
        
        user_input = input(user_prompt) 

In [344]:
model_loaded ,tokenizer_loaded  = load_checkpoint(workspace + "metalwoz-v1\\checkpoint_20191114_1250")

add_special_tokens_(model_loaded, tokenizer_loaded)

interact_with_bot(model_loaded, tokenizer_loaded, method='top_p')

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


C:\Users\nikmand\nikmand\ncsr-chatbot\metalwoz-v1\checkpoint_20191114_1250


In [363]:
test = "could you book a ticket for me?"

bot:>>> Hello how may I help you?
user:>>> i need toothpaste
Bot terminate sentence!
bot:>>> 
user:>>> hey how are you?
Bot terminate sentence!
bot:>>> yes
user:>>> tell me something.
Bot terminate sentence!
bot:>>> yes
user:>>> \q


In [356]:
tokenizer_loaded.convert_tokens_to_ids(SPECIAL_TOKENS) # just to check

[40478, 40479, 40481, 40482, 40480]