In [327]:
import os
import pickle
import numpy as np
import glob
import copy
import statistics
import torch
import torch.optim as optim
import torch.nn.functional as F
from datetime import datetime
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, WEIGHTS_NAME, CONFIG_NAME
from itertools import chain
from ast import literal_eval
from itertools import zip_longest
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader 

### Data Preprocessing

In [343]:
def load_checkpoint(output_dir='openai-gpt'):
    """
    Loads GPT Model and the corresponding tokenizer from local checkpoint.
    If no path is specified the pretrained weights on language modelling task are downloaded.
    
    :param output_dir: path to checkpoint 
    :return : model and tokenizer
    """

    tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
    model = OpenAIGPTLMHeadModel.from_pretrained(output_dir)  
    
    return model, tokenizer

In [24]:
model, tokenizer = load_checkpoint() 

In [None]:
# TODO check number of parameters
# print(model) # check the architecture of the model

In [4]:
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ('<speaker1>', '<speaker2>')}

In [5]:
def add_special_tokens_(model, tokenizer):
    """
    Adds special tokens to the tokenizer and if they weren't present to the model also.
    """
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
    
    if num_added_tokens > 0:
        print("New tokens added to model: {}".format(num_added_tokens))
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens) 
    else:
        print("No new tokens found! Nothing added.")

In [349]:
# various constants needed

max_history = 2 # pairs of question/answer to be retained
min_sentence_length = 1
max_sentence_length = 20 # maximum length of a sentence produced by the model 

temperature = 0.75 # increases confidence in the most propable outputs 

use_cuda = False # whether to try to use cuda or not

SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
add_special_tokens_(model, tokenizer)  

SPECIAL_TOKENS_IDS = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
TIME_FORMAT = '%Y%m%d_%H%M'

In [366]:
SPECIAL_TOKENS_IDS

[40478, 40479, 40481, 40482, 40480]

### Severe TODO 
H Parser είναι πιο χρονοβόρα από την extract pairs
refactor το που γίνεται τι ώστε κάθε συνάρτηση να έχει ένα ρόλο 
το tokenization δεν κολλάει πολύ στην parser.

In [368]:
workspace = 'C:\\Users\\nikmand\\nikmand\\ncsr-chatbot\\'  # os.getcwd()

# TODO new function for the tokenization process

def parser(datafolder='metalwoz-v1/dialoguesTest/', cache_file='cache_folder/pairs.txt'): 
    """
    Function that reads files, keeps only 'turns' from each entry and tokenizes them

    :param datafolder: path to the folder that contains the files
    :return: a list that contains dialogs, each dialog is a list of lists 
             where each of them represents the ids of a phrase,
             a second list with number of words in each dialog 
    """
    try: 
        with open(cache_file, "rb") as f:
            print("Cache file found loading content.")
            pairs = pickle.load(f)
            return pairs
    except: # cache file not created yet
    
    dialogs, dialogs_len = [], []
    files = list(glob.glob(os.path.join(workspace, datafolder ,"*.txt")))
    
    for file in files:
        with open(file) as f:
            for line in f.readlines():
                
                dialog = literal_eval(line)['turns'][1:] # keep only turns without the first sentence
                dialog = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(phrase)) for phrase in dialog]
                dialog_len = sum(len(phrase) for phrase in dialog)
                
                dialogs.append(dialog) 
                dialogs_len.append(dialog_len)
                
    return dialogs, dialogs_len        

In [88]:
def extract_pairs(dialogs = None, cache_file='cache_folder/pairs.txt'):
    """
    Function that creates pairs of input, output from dialogs, each dialogs corresponds now to many pairs.
    
    :param dialogs: a list with all the dialogs 
    :return a list whose elements are pairs of input, output  
    """
    try: 
        with open(cache_file, "rb") as f:
            print("Cache file found loading content.")
            pairs = pickle.load(f)
            return pairs
    except: # cache file not created yet
        print("Cache file not found. Start processing.")
        pairs = [] 
        for dialog in dialogs:
            t_dict = {'input': []}
            if len(dialog) % 2 != 0: # discard the last phrase if it was said by the user
                dialog = dialog[:-1]
            dialog_it = iter(dialog)
            for i_phrase, o_phrase in zip_longest(dialog_it, dialog_it): # process phrases two by two
                try:
                    t_dict["input"].append(t_dict["output"])
                except:
                    pass
                t_dict["input"].append(i_phrase) # history
                t_dict["output"] = o_phrase
                pairs.append(t_dict)
                t_dict = copy.deepcopy(t_dict) # so future changes address only the new dict
        with open(cache_file, "wb") as f:
            pickle.dump(pairs, f)
        return pairs

In [125]:
def adjust_history(pairs, max_history=2): # seq len reduced from 263 to 181
    """
    """
    pairs_len = []
    for pair in pairs:
        pair['input'] = pair['input'][-(2*max_history+1):] # at least one phrase is preserved
        pair_len = sum(len(phrase) for phrase in pair['input']) + len(pair['output'])
        pairs_len.append(pair_len)
    return pairs, pairs_len   

In [164]:
def filter_samples(samples, samples_len, percentile=90):
    """
    """
    
    samples_length = np.array(samples_len)
    reasonable_length = np.percentile(samples_length, percentile)
    print(reasonable_length)
    
    samples_red, samples_len_red = [], []
    for sample, sample_len in zip(samples, samples_len):
        if sample_len <= reasonable_length:
            samples_red.append(sample)
            samples_len_red.append(sample_len)
    
    return samples_red, samples_len_red  

In [369]:
dialogs, dialogs_len = parser()

In [371]:
print("Number of dialogs in the whole dataset: {}".format(len(dialogs)))

Number of dialogs in the whole dataset: 3828


In [148]:
pairs = extract_pairs(dialogs) # list of dictionaries of input history and bot's reply

pairs, pairs_len = adjust_history(pairs, max_history=2) # keep only portion of the chat history to reduce seq_length

pairs_reduced, pairs_len_reduced = filter_samples(pairs, pairs_len) # reduces from 181 to 81 (history 2) or from 263 to 108
# mean leangth with history 2 is  47 and max 181

Cache file found loading content.


In [156]:
print(len(pairs))
print(len(pairs_reduced))
print(pairs[3])
print(pairs_len[3])
print(max(pairs_len))
print(statistics.median(pairs_len))
miiii = sum(pairs_len) / len(pairs_len)

19379
17491
{'input': [[488, 249, 1074, 12361, 15354, 504, 481, 3361], [249, 2518, 512, 1074, 246, 5358, 500, 481, 3361], [668, 5611, 239, 249, 1074, 688, 504, 2306], [525, 256, 252, 246, 1875, 4778], [912, 249, 1048, 246, 16219, 267]], 'output': [249, 2310, 256, 241, 2153, 485, 699, 512]}
45
181
45


46.829093348469996

### TODO να σώζεται σε αρχείο στην πιο κατάλληλη μορφή. Να δούμε αν βολεύει Pandas ή κάτι άλλο 

We instantiate a gpt pytorch model with pre-trained weights on language modelling task.

### Tokenizer

A helper class used to interact with the vocabulary in which our model has been pre-trained.

In [49]:
print("Our language model have been pre-trained with a vocabulary of {} words.".format(tokenizer.vocab_size))

Our language model have been pre-trained with a vocabulary of 40478 words.


Εμείς έχουμε πάντα περιττού πλήθους history που αρχίζει και τελειώνει με speaker1 και reply που το λέει ο speaker2

Για το input_ids: Η λογική είναι αναθέτει τον speaker2 κάθε φορά που μένει άρτιο πλήθος από διαλόγους(περιττό συνολικά μαζί με sos, βλέπε συνθήκη). Εμάς όλες μας οι λίστες έχουν άρτιο πλήθος οπότε θα ξεκινήσει με speaker2 ενώ θέλουμε speaker1. Τα επιμέρους αποτελεσματα όμως είναι συμβατά μεταξύ τους.

Στο input_ids το i στο iter παίρνει τιμή i = seq_len - 2 (αφού ξεκινήσαμε από το δεύτερο στοιχείο το iteration)

Για το token_type_ids: για κάθε μία λίστα κάνουμε iterate στα στοιχεία της, αν η θέση της λίστας είναι άρτια παίρνει speaker1 αλλιώς speaker2
Στο token_type_ids: επειδή το πλήθος είναι περιττό με την προσθήκη του sos θα αλλάξει η σειρά και η πρώτη πρόταση θα πάει speaker2 και το reply speaker1

Καταρχάς η αντιστοιχία που δίνουν οι ίδιοι στο δικό τους δεν ταιριάζει με αυτό που είχαμε σκεφτεί 
Κατά δεύτερο πρέπει να δούμε που θα μπει αν θα μπει το sos, αυτό μας δημιουργεί πρόβλημα αυτή τη στιγμή. Θα μπει μετά το tag του speaker ? 

είτε θα μπει μόνο του πριν τον speaker
σε αυτή την περίπτωση θα πρέπει να παίρνει το tag του speaker1 στα tokens αυτό δε συμβαίνει τώρα και μας μπερδεύει τη σειρά 

για το label βάζει σε όλα τα inputs εκτός του reply -1, στο speaker2 του reply -1 και βάζει τα tokens του reply.

Τι ακριβώς θα δούμε με το validation.

In [80]:
pairs_array = np.array(pairs_reduced)

In [350]:
pairs_train_l = pairs[:int(len(pairs)*0.8)]
pairs_eval_l = pairs[int(len(pairs)*0.8):]
print(len(pairs_train_l), len(pairs_eval_l))

13996 3500


### Για διαχωρισμό σε train test
υπάρχει κάτι που να δημιουργεί πρόβλημα;
μπορεί να μας ενοχλεί ότι ζεύγη που έρχονται από διαφορετικούς διαλόγους θα χωριστούν; μας ενοχλεί αν δεν μπαίνουν με τη σειρά;
αν δεν κάνουμε τυχαίο split κάποια domains δε θα εμφανίζονται στο train set

In [175]:
pairs_train, pairs_eval, pairs_train_len, pairs_eval_len = train_test_split(pairs_reduced, pairs_len_reduced, test_size=0.3, shuffle=True)      

Θέλω στο μοντέλο μου να δίνω τρία inputs όπως το παράγει η συνάρτηση build, συνεπώς αυτό θέλω να μου γυρίζει η συνάρτηση get item 

"""
TODO: pad on batch level 

in order to avoid padding to the global max_len we can define our own collate_fn
which forms the samples into batches and call inside there the pad function.
Samples should be allocated to batches based on their sequence length in order to
minimize the need for padding.
""" 

In [318]:
def create_model_inputs(history, reply, tokenizer, with_eos=True):
    """ 
    """
    
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    sequence = [[bos]] + history + [reply + ([eos] if with_eos else [])]
    seq_len = len(sequence) # sequence λίστα από λίστες
    sequence = [sequence[0]] + [[speaker2 if (seq_len-i) % 2 != 1 else speaker1] + s for i, s in enumerate(sequence[1:])]
    
    instance = {}
    instance["input_ids"] = list(chain(*sequence)) # words
    instance["token_type_ids"] = [speaker1] + [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence[1:]) for _ in s] # for each word
    instance["mask"] = [1] * len(instance["input_ids"]) 
    # TODO positional embeddings
    instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:]
    
    return instance

In [317]:
class DialogDataset(Dataset):
    """
    """

    def __init__(self, dialog_pairs):
        self.dataset = self.create_segments(dialog_pairs)
        self.dataset = self.order_on_seq_length()

    def __len__(self):
        return len(self.dataset)
    
    def create_segments(self, dialog_pairs):
        dataset = []
        for pair in dialog_pairs:
            instance = create_model_inputs(pair['input'], pair['output'], tokenizer)
            dataset.append(instance)
        return dataset
    
    def order_on_seq_length(self):
        return sorted(self.dataset, key=lambda x: len(x['input_ids']))
    

    def __getitem__(self, index):
        return  self.dataset[index]

In [302]:
def pad_sequenses(batch, pad_token=0):
    """
    """
    max_seq_len = max(len(entry["input_ids"]) for entry in batch)
    print(max_seq_len) # debug remove
    for entry in batch:
        for index_name in entry.keys():
            if index_name == "lm_labels":
                pad_token_ = -1
            elif index_name == "mask":
                pad_token_ = 0
            else:
                pad_token_ = pad_token
            entry[index_name] =  entry[index_name] + [pad_token_] * (max_seq_len - len(entry[index_name]))
  
    return batch  

In [325]:
def custom_collate_fn(batch):
    """
    """
    batch = pad_sequenses(batch, tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
    
    inputs = [torch.stack(list(map(lambda x: torch.from_numpy( \
        np.array(x[index_name])), batch)), dim=0) for index_name in batch[0].keys()]

    inputs = [input_tensor.type(torch.LongTensor) for input_tensor in inputs]

    if use_cuda:
        inputs = [input_tensor.cuda() for input_tensor in inputs]
        
    # input_ids, mask, category_ids, label_ids = inputs    

    return inputs  

In [305]:
training_set = DialogDataset(pairs_train) 
validation_set = DialogDataset(pairs_eval)

In [334]:
TRAIN_BATCH_SIZE = 64 
EVAL_BATCH_SIZE = 100 

dataloader_train = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=False, 
                              collate_fn=custom_collate_fn, num_workers=0) 

dataloader_valid = DataLoader(validation_set, batch_size=EVAL_BATCH_SIZE, shuffle=False,
                              collate_fn=custom_collate_fn, num_workers=0)

In [367]:
def checkpoint_state(model, tokenizer, output_dir=None):
    """
    """
    
    if output_dir is None:
        output_dir = workspace + 'metalwoz-v1\\checkpoint_{}'.format(datetime.now().strftime(TIME_FORMAT))
    
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        pass
    
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(output_dir, CONFIG_NAME)  
    
    torch.save(model.state_dict(), output_model_file)
    model.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(output_dir)

### Training procedure

In [354]:
epochs = 1
min_loss, max_patience, cur_patience = np.inf, 5, 0

if use_cuda and torch.cuda.is_available :
    model.cuda()

# loss_function = nn. check it την διαλέγει μόνο του?
# αν δοθεί το labels αρχικοποιεί και χρησιμοποιεί εσωτερικά το crossEntropyLoss
# να πούμε αναλυτικά τι κάνει σε πρώτη φάση, αν δε το γράψουμε χεράτα.
optimizer = optim.Adam(model.parameters(), lr=1e-3) # , weight_decay=0.001 # TODO review those values
for epoch in range(epochs):
    epoch_train_loss = 0.0
    model.train()
    
    for i_batch, (input_ids, attention_mask, category_ids, label_ids) in enumerate(dataloader_train):
#         print(type(input_ids))
#         print(input_ids.shape)
#         print(category_ids.shape)
#         print(label_ids.shape)
        
        loss, logits = model(input_ids, attention_mask, category_ids, labels=label_ids)
        # print(type(outputs))
        #  = outputs[:2]
        optimizer.zero_grad() 
        loss.backward()
        optimizer.step()
        epoch_train_loss += loss.item()
        break  
    print("Loss on train set: \t\t epoch {} : {:.4f}".format(epoch, epoch_train_loss/(i_batch + 1)))
    
    epoch_eval_loss = 0.0
    with torch.no_grad():
        model.eval()
        
        for i_batch, (input_ids, attention_mask, category_ids, label_ids) in enumerate(dataloader_valid):
            
            loss, logits = model(input_ids, attention_mask, category_ids, labels=label_ids)
            epoch_eval_loss += loss.item()
            break
            
        print("Loss on validation set: \t epoch {} : {:.4f}".format(epoch, epoch_eval_loss/(i_batch + 1)))    
 
    if (epoch_eval_loss >= min_loss):     # early stopping
        cur_patience += 1
        if (cur_patience >= max_patience):
            print("Execution terminated due to Early Stopping at epoch: {}".format(epoch))
            break
    else:
        print("New min validation loss: \t epoch {} : {:.4f}".format(epoch, epoch_eval_loss/(i_batch + 1)))
        checkpoint_state(model, tokenizer) #torch.save(model.state_dict(), save_file) # checkpointing
        print("New checkpoint created")
        min_loss, cur_patience = epoch_eval_loss, 0

10
Loss on train set: 		 epoch 0 : 18.7202
13
Loss on validation set: 	 epoch 0 : 4.5075
New min validation loss: 	 epoch 0 : 4.5075
New checkpoint created


## Interaction with the bot - Inference

### TODO load model and tokenizer

In [361]:
def format_input(history, reply_so_far):
    """
    """
    # print("History is: {}".format(history))
    # print("Reply so far is: {}".format(reply_so_far))
    history = [tokenizer.encode(phrase) for phrase in history]
    
    instance = create_model_inputs(history, reply_so_far, tokenizer, with_eos=False)
    
    input_ids = torch.tensor(instance["input_ids"]).unsqueeze(0)
    token_type_ids = torch.tensor(instance["token_type_ids"]).unsqueeze(0)
    
    return input_ids, token_type_ids

In [360]:
def decoding(probs, logits, method="top_p"):
    """
    Functions that selects the next token to be emmited. Three different approaches are implemented: 
    
    Greedy: the most probable token is selected.
    Top-k : 
    Top-p : 
    
    :param logits: 
    :param method: the decoding method to be used, Values={'greedy', 'top_k', 'top_p'}
    :return: the selected token
    """
    top_k = 40 # sample from the 100 most probable tokens based on their probs
    top_p = 0.9 # sample from the n most probable tokens that have a cumulative probability at least 0.9 
    
    if method == "greedy":
        return torch.argmax(probs).item()
    
    elif method == "top_k":        
        prob_k = probs.topk(top_k)[0][-1].item() # value of the 100th most probable
#         print(probs.topk(top_k)[0])
#         print((probs < prob_k).nonzero().shape)
        probs[probs < prob_k] = 0   # cut off the tail  
        
    elif method == "top_p":
        probs_sorted, probs_indexes = probs.sort(dim=-1, descending=True) # start the cumulation from the most probable token in descending order
        cum_probs = probs_sorted.cumsum(dim=-1)
        
        indices = cum_probs > top_p 
#         print(indices)
#         print(indices.nonzero().shape)
#         print(indices.nonzero())
#         print(probs_sorted[:10])
        indices[1:] = indices[:-1].clone()
        indices[0] = 0 # at least one token is preserved 
        
        probs[probs_indexes[indices]] = 0
    
    word = torch.multinomial(probs, 1).item()
    # TODO handle the case that special token was emitted in the first pick
    
    return word

In [359]:
def infer_answer(history, model, tokenizer, method="top_p"):
    """
    Function that generates word by word the bot answer, based on user input and previous history.
    
    :param history: a list of past sentences and last user's input, in plain text
    :param model: the model to be used for inference
    :return: a list with the words of the answer in plain text 
    """
    model.eval()
    reply_so_far = []
    with torch.no_grad():
    
        for i in range(max_sentence_length):
            
            input_ids, category_ids = format_input(history, reply_so_far)
            # print("Inputs ids are {}".format(input_ids)) seems good
            # print("Category ids are {}".format(category_ids)) seems good
            outputs = model(input_ids=input_ids, token_type_ids=category_ids)
            logits = outputs[0]
            logits = logits[0, -1, :] / temperature # keep last 
            probs = F.softmax(logits, dim=-1) 
            word = decoding(probs, logits, method=method) 
            
            if word in SPECIAL_TOKENS_IDS: # we stop inference if we find a special token without emitting this token
                print("Bot terminate sentence!")
                break
            reply_so_far.append(word)
            
        answer_text = tokenizer.decode(reply_so_far, skip_special_tokens=True)    
        return answer_text

In [358]:
def interact_with_bot(model, tokenizer, method='top_p'):
    """
    """
    bot_prompt = "bot:>>> "
    user_prompt = "user:>>> "

    history = []
    print(bot_prompt + "Hello how may I help you?")
    user_input = input(user_prompt)
    
    while user_input != "\q": # TODO check if we need to truncate user input to not exceed max_length
        
        history.append(user_input)
        answer = infer_answer(history, model, tokenizer, method=method)
        history.append(answer)
        
        history = history[-(2*max_history+1):]  # keep the same history as in the training 
        
        print(bot_prompt + answer)
        
        user_input = input(user_prompt) 

In [344]:
model_loaded ,tokenizer_loaded  = load_checkpoint(workspace + "metalwoz-v1\\checkpoint_20191114_1250")

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


C:\Users\nikmand\nikmand\ncsr-chatbot\metalwoz-v1\checkpoint_20191114_1250


In [355]:
add_special_tokens_(model_loaded, tokenizer_loaded)

Tokens added to model: 5


In [356]:
tokenizer_loaded.convert_tokens_to_ids(SPECIAL_TOKENS) # θέλει ξανά να μπου τα σπέσιαλ

[40478, 40479, 40481, 40482, 40480]

In [363]:
interact_with_bot(model_loaded, tokenizer_loaded, method='top_p')
test = "could you book a ticket for me?"

bot:>>> Hello how may I help you?
user:>>> i need toothpaste
Bot terminate sentence!
bot:>>> 
user:>>> hey how are you?
Bot terminate sentence!
bot:>>> yes
user:>>> tell me something.
Bot terminate sentence!
bot:>>> yes
user:>>> \q
