In [327]:
import os
import pickle
import numpy as np
import glob
import copy
import statistics
import torch
import torchtext
import torch.optim as optim
import torch.nn.functional as F
from datetime import datetime
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, WEIGHTS_NAME, CONFIG_NAME
from itertools import chain
from ast import literal_eval
from itertools import zip_longest
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader 

In [64]:
%load_ext autoreload
%autoreload 2

### Data Preprocessing

In [23]:
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [24]:
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
# print(model) # check the architecture of the model

In [4]:
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ('<speaker1>', '<speaker2>')}

In [5]:
def add_special_tokens_(model, tokenizer):
    """
    """
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
    if num_added_tokens > 0:
        print("Tokens added to model: {}".format(num_added_tokens))
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens) 

In [349]:
# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences

max_history = 2 # pairs of question/answer to be retained
min_sentence_length = 1
max_sentence_length = 20 # maximum length of a sentence produced by the model 

temperature = 0.75 # increases confidence in the most propable outputs 

use_cuda = False # whether to try to use cuda or not

SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
add_special_tokens_(model, tokenizer)       
SPECIAL_TOKENS_IDS = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
TIME_FORMAT = '%Y%m%d_%H%M'


# bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"

# MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
# PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

In [347]:
len(tokenizer.encoder)

40478

In [350]:
SPECIAL_TOKENS_IDS

[40478, 40479, 40481, 40482, 40480]

### Severe TODO 
H Parser είναι πιο χρονοβόρα από την extract pairs
refactor το που γίνεται τι ώστε κάθε συνάρτηση να έχει ένα ρόλο 
το tokenization δεν κολλάει πολύ στην parser.

### TODO download with torchtext

In [84]:
workspace = 'C:\\Users\\nikmand\\nikmand\\ncsr-chatbot\\'  # os.getcwd()

# TODO new function for the tokenization process

def parser(datafolder='metalwoz-v1\\dialoguesTest\\'): # rename to parser
    """
    Function that reads files, keeps only 'turns' from each entry and tokenizes them

    :param datafolder: path to the folder that contains the files
    :return: a list that contains dialogs, each dialog is a list of lists where each of them represents the ids of a phrase 
    """
    dialogs = []
    dialogs_len = []
    files = list(glob.glob(workspace + datafolder + "*.txt"))
    for file in files:
        with open(file) as f:
            for line in f.readlines():
                dialog = literal_eval(line)['turns'][1:] # keep only turns without the first sentence
                dialog = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(phrase)) for phrase in dialog] # to lowercase is performed by the tokenizer 
                dialog_len = sum(len(phrase) for phrase in dialog)
                dialogs.append(dialog) 
                dialogs_len.append(dialog_len)
    return dialogs, dialogs_len        

In [166]:
dialogs, dialogs_len = parser()

In [86]:
print(len(dialogs))
print(max(dialogs_len), min(dialogs_len))

3828
269 17


In [164]:
def filter_samples(samples, samples_len, percentile=90):
    
    samples_length = np.array(samples_len)
    reasonable_length = np.percentile(samples_length, percentile)
    print(reasonable_length)
    
    samples_red, samples_len_red = [], []
    for sample, sample_len in zip(samples, samples_len):
        if sample_len <= reasonable_length:
            samples_red.append(sample)
            samples_len_red.append(sample_len)
    
    return samples_red, samples_len_red  

In [169]:
dialogs_reduced, dialogs_len_reduced = filter_samples(dialogs, dialogs_len)
print(dialogs_len[:30])
print(len(dialogs)) 
print(len(dialogs_reduced)) 
print(dialogs_len_reduced[:30])

141.0
[61, 103, 73, 63, 100, 92, 131, 79, 99, 53, 96, 58, 46, 108, 63, 51, 106, 162, 36, 85, 74, 170, 90, 120, 37, 69, 25, 71, 127, 46]
3828
3446
[61, 103, 73, 63, 100, 92, 131, 79, 99, 53, 96, 58, 46, 108, 63, 51, 106, 36, 85, 74, 90, 120, 37, 69, 25, 71, 127, 46, 51, 67]


In [88]:
def extract_pairs(dialogs = None, cache_file='cache_folder\\pairs.txt'):
    """
    Function that creates pairs of input, output from dialogs, each dialogs corresponds now to many pairs.
    
    :param dialogs: a list with all the dialogs 
    :return a list whose elements are pairs of input, output  
    """
    try: 
        with open(cache_file, "rb") as f:
            print("Cache file found loading content.")
            pairs = pickle.load(f)
            return pairs
    except: # cache file not created yet
        print("Cache file not found. Start processing.")
        pairs = [] 
        for dialog in dialogs:
            t_dict = {'input': []}
            if len(dialog) % 2 != 0: # discard the last phrase if it was said by the user
                dialog = dialog[:-1]
            dialog_it = iter(dialog)
            for i_phrase, o_phrase in zip_longest(dialog_it, dialog_it): # process phrases two by two
                try:
                    t_dict["input"].append(t_dict["output"])
                except:
                    pass
                t_dict["input"].append(i_phrase) # history
                t_dict["output"] = o_phrase
                pairs.append(t_dict)
                t_dict = copy.deepcopy(t_dict) # so future changes address only the new dict
        with open(cache_file, "wb") as f:
            pickle.dump(pairs, f)
        return pairs

In [148]:
pairs = extract_pairs(dialogs) #dialogs_reduced list of dictionaries of two keys

Cache file found loading content.


In [125]:
def adjust_history(pairs, max_history=2): # seq len reduced from 263 to 181
    pairs_len = []
    for pair in pairs:
        pair['input'] = pair['input'][-(2*max_history+1):] # at least one phrase is preserved
        pair_len = sum(len(phrase) for phrase in pair['input']) + len(pair['output'])
        pairs_len.append(pair_len)
    return pairs, pairs_len   

In [149]:
pairs, pairs_len = adjust_history(pairs, max_history=2) # 7 is practically all history

In [171]:
pairs_reduced, pairs_len_reduced = filter_samples(pairs, pairs_len) # reduces from 181 to 81 (history 2) or from 263 to 108
# mean leangth with history 2 is  47 and max 181

81.0


In [None]:
def filter_pairs(pairs):
   [] pair in pairs 

In [156]:
print(len(pairs))
print(len(pairs_reduced))
print(pairs[3])
print(pairs_len[3])
print(max(pairs_len))
print(statistics.median(pairs_len))
miiii = sum(pairs_len) / len(pairs_len)
miiii

19379
17491
{'input': [[488, 249, 1074, 12361, 15354, 504, 481, 3361], [249, 2518, 512, 1074, 246, 5358, 500, 481, 3361], [668, 5611, 239, 249, 1074, 688, 504, 2306], [525, 256, 252, 246, 1875, 4778], [912, 249, 1048, 246, 16219, 267]], 'output': [249, 2310, 256, 241, 2153, 485, 699, 512]}
45
181
45


46.829093348469996

### TODO να σώζεται σε αρχείο στην πιο κατάλληλη μορφή. Να δούμε αν βολεύει Pandas ή κάτι άλλο 

We instantiate a gpt pytorch model with pre-trained weights on language modelling task.

### Tokenizer

A helper class used to interact with the vocabulary in which our model has been pre-trained.

In [49]:
print("Our language model have been pre-trained with a vocabulary of {} words.".format(tokenizer.vocab_size))

Our language model have been pre-trained with a vocabulary of 40478 words.


Εμείς έχουμε πάντα περιττού πλήθους history που αρχίζει και τελειώνει με speaker1 και reply που το λέει ο speaker2

Για το input_ids: Η λογική είναι αναθέτει τον speaker2 κάθε φορά που μένει άρτιο πλήθος από διαλόγους(περιττό συνολικά μαζί με sos, βλέπε συνθήκη). Εμάς όλες μας οι λίστες έχουν άρτιο πλήθος οπότε θα ξεκινήσει με speaker2 ενώ θέλουμε speaker1. Τα επιμέρους αποτελεσματα όμως είναι συμβατά μεταξύ τους.

Στο input_ids το i στο iter παίρνει τιμή i = seq_len - 2 (αφού ξεκινήσαμε από το δεύτερο στοιχείο το iteration)

Για το token_type_ids: για κάθε μία λίστα κάνουμε iterate στα στοιχεία της, αν η θέση της λίστας είναι άρτια παίρνει speaker1 αλλιώς speaker2
Στο token_type_ids: επειδή το πλήθος είναι περιττό με την προσθήκη του sos θα αλλάξει η σειρά και η πρώτη πρόταση θα πάει speaker2 και το reply speaker1

Καταρχάς η αντιστοιχία που δίνουν οι ίδιοι στο δικό τους δεν ταιριάζει με αυτό που είχαμε σκεφτεί 
Κατά δεύτερο πρέπει να δούμε που θα μπει αν θα μπει το sos, αυτό μας δημιουργεί πρόβλημα αυτή τη στιγμή. Θα μπει μετά το tag του speaker ? 

είτε θα μπει μόνο του πριν τον speaker
σε αυτή την περίπτωση θα πρέπει να παίρνει το tag του speaker1 στα tokens αυτό δε συμβαίνει τώρα και μας μπερδεύει τη σειρά 

για το label βάζει σε όλα τα inputs εκτός του reply -1, στο speaker2 του reply -1 και βάζει τα tokens του reply.

Τι ακριβώς θα δούμε με το validation.

In [80]:
pairs_array = np.array(pairs_reduced)

In [350]:
pairs_train_l = pairs[:int(len(pairs)*0.8)]
pairs_eval_l = pairs[int(len(pairs)*0.8):]
print(len(pairs_train_l), len(pairs_eval_l))

13996 3500


### Για διαχωρισμό σε train test
υπάρχει κάτι που να δημιουργεί πρόβλημα;
μπορεί να μας ενοχλεί ότι ζεύγη που έρχονται από διαφορετικούς διαλόγους θα χωριστούν; μας ενοχλεί αν δεν μπαίνουν με τη σειρά;
αν δεν κάνουμε τυχαίο split κάποια domains δε θα εμφανίζονται στο train set

In [175]:
pairs_train, pairs_eval, pairs_train_len, pairs_eval_len = train_test_split(pairs_reduced, pairs_len_reduced, test_size=0.3, shuffle=True)      

Θέλω στο μοντέλο μου να δίνω τρία inputs όπως το παράγει η συνάρτηση build, συνεπώς αυτό θέλω να μου γυρίζει η συνάρτηση get item 

"""
TODO: pad on batch level 

in order to avoid padding to the global max_len we can define our own collate_fn
which forms the samples into batches and call inside there the pad function.
Samples should be allocated to batches based on their sequence length in order to
minimize the need for padding.
""" 

In [318]:
def create_model_inputs(history, reply, tokenizer, with_eos=True):
    """ 
    """
    
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    sequence = [[bos]] + history + [reply + ([eos] if with_eos else [])]
    seq_len = len(sequence) # sequence λίστα από λίστες
    sequence = [sequence[0]] + [[speaker2 if (seq_len-i) % 2 != 1 else speaker1] + s for i, s in enumerate(sequence[1:])]
    
    instance = {}
    instance["input_ids"] = list(chain(*sequence)) # words
    instance["token_type_ids"] = [speaker1] + [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence[1:]) for _ in s] # for each word
    instance["mask"] = [1] * len(instance["input_ids"]) 
    # TODO positional embeddings
    instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:]
    
    return instance

In [317]:
class DialogDataset(Dataset):
    """
    """

    def __init__(self, dialog_pairs):
        self.dataset = self.create_segments(dialog_pairs)
        self.dataset = self.order_on_seq_length()

    def __len__(self):
        return len(self.dataset)
    
    def create_segments(self, dialog_pairs):
        dataset = []
        for pair in dialog_pairs:
            instance = create_model_inputs(pair['input'], pair['output'], tokenizer)
            dataset.append(instance)
        return dataset
    
    def order_on_seq_length(self):
        return sorted(self.dataset, key=lambda x: len(x['input_ids']))
    

    def __getitem__(self, index):
        return  self.dataset[index]

In [302]:
def pad_sequenses(batch, pad_token=0):
    """
    """
    max_seq_len = max(len(entry["input_ids"]) for entry in batch)
    print(max_seq_len) # debug remove
    for entry in batch:
        for index_name in entry.keys():
            if index_name == "lm_labels":
                pad_token_ = -1
            elif index_name == "mask":
                pad_token_ = 0
            else:
                pad_token_ = pad_token
            entry[index_name] =  entry[index_name] + [pad_token_] * (max_seq_len - len(entry[index_name]))
  
    return batch  

In [325]:
def custom_collate_fn(batch):
    """
    """
    batch = pad_sequenses(batch, tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
    
    inputs = [torch.stack(list(map(lambda x: torch.from_numpy( \
        np.array(x[index_name])), batch)), dim=0) for index_name in batch[0].keys()]
    
    if use_cuda:
        inputs = [input_tensor.cuda() for input_tensor in inputs]
        
    input_ids, mask, category_ids, label_ids = inputs    
    
    return input_ids.type(torch.LongTensor), mask, category_ids.type(torch.LongTensor), label_ids.type(torch.LongTensor)   

In [305]:
training_set = DialogDataset(pairs_train) 
validation_set = DialogDataset(pairs_eval)

In [334]:
TRAIN_BATCH_SIZE = 64 
EVAL_BATCH_SIZE = 100 

dataloader_train = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=False, 
                              collate_fn=custom_collate_fn, num_workers=0) 

dataloader_valid = DataLoader(validation_set, batch_size=EVAL_BATCH_SIZE, shuffle=False,
                              collate_fn=custom_collate_fn, num_workers=0)

In [324]:
for i_batch, (input_ids, mask, category_ids, label_ids) in enumerate(dataloader_train):
    i+=1
    print(type(input_ids))
    print(input_ids.shape)
    print(input_ids)
    print(mask.shape)
    print(mask)
    print(category_ids.shape)
    print(category_ids)
    print(label_ids.shape)
    print(label_ids)
    break

10
<class 'torch.Tensor'>
torch.Size([64, 10])
tensor([[40478, 40481,  3569, 40482,  2229, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  3569, 40482,  3570, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  3570, 40482,  2229, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  3569, 40482,  3569, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  3569, 40482,  3570, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  2229, 40482,   685, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  3569, 40482,  3569, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  3569, 40482,  3570, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  3569, 40482,  3570, 40479, 40480, 40480, 40480, 40480],
        [40478, 40481,  3569,   267, 40482,  3570, 40479, 40480, 40480, 40480],
        [40478, 40481,  3569,   267, 40482,  3570, 40479, 40480, 40480, 40480],
        [40478, 40481,  2229,   655, 40482,  3570, 40479, 40480, 40480, 4

### Training procedure

In [354]:
epochs = 1
min_loss, max_patience, cur_patience = np.inf, 5, 0
save_file = "chatbot_{}.pkl".format(datetime.now().strftime(TIME_FORMAT))

if use_cuda and torch.cuda.is_available :
    model.cuda()

# loss_function = nn. check it την διαλέγει μόνο του?
# αν δοθεί το labels αρχικοποιεί και χρησιμοποιεί εσωτερικά το crossEntropyLoss
# να πούμε αναλυτικά τι κάνει σε πρώτη φάση, αν δε το γράψουμε χεράτα.
optimizer = optim.Adam(model.parameters(), lr=1e-3) # , weight_decay=0.001 # TODO review those values
for epoch in range(epochs):
    epoch_train_loss = 0.0
    model.train()
    
    for i_batch, (input_ids, attention_mask, category_ids, label_ids) in enumerate(dataloader_train):
#         print(type(input_ids))
#         print(input_ids.shape)
#         print(category_ids.shape)
#         print(label_ids.shape)
        
        loss, logits = model(input_ids, attention_mask, category_ids, labels=label_ids)
        # print(type(outputs))
        #  = outputs[:2]
        optimizer.zero_grad() 
        loss.backward()
        optimizer.step()
        epoch_train_loss += loss.item()
        break  
    print("Loss on train set: \t\t epoch {} : {:.4f}".format(epoch, epoch_train_loss/(i_batch + 1)))
    
    epoch_eval_loss = 0.0
    with torch.no_grad():
        model.eval()
        
        for i_batch, (input_ids, attention_mask, category_ids, label_ids) in enumerate(dataloader_valid):
            
            loss, logits = model(input_ids, attention_mask, category_ids, labels=label_ids)
            epoch_eval_loss += loss.item()
            break
            
        print("Loss on validation set: \t epoch {} : {:.4f}".format(epoch, epoch_eval_loss/(i_batch + 1)))    
 
    if (epoch_eval_loss >= min_loss):     # early stopping
        cur_patience += 1
        if (cur_patience >= max_patience):
            print("Execution terminated due to Early Stopping at epoch: {}".format(epoch))
            break
    else:
        print("New min validation loss: \t epoch {} : {:.4f}".format(epoch, epoch_eval_loss/(i_batch + 1)))
        checkpoint_state(model, tokenizer) #torch.save(model.state_dict(), save_file) # checkpointing
        print("New checkpoint created")
        min_loss, cur_patience = epoch_eval_loss, 0

10
Loss on train set: 		 epoch 0 : 18.7202
13
Loss on validation set: 	 epoch 0 : 4.5075
New min validation loss: 	 epoch 0 : 4.5075
New checkpoint created


In [353]:
def checkpoint_state(model, tokenizer, output_dir=None):
    """
    """
    
    if output_dir is None:
        output_dir = workspace + 'metalwoz-v1\\checkpoint_{}'.format(datetime.now().strftime(TIME_FORMAT))
    
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        pass
    
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(output_dir, CONFIG_NAME)  
    
    torch.save(model.state_dict(), output_model_file)
    model.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(output_dir)

In [343]:
def load_checkpoint(output_dir='openai-gpt'):
    """
    """
    print(output_dir)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
    model = OpenAIGPTLMHeadModel.from_pretrained(output_dir)  
    
    return model, tokenizer

## Interaction with the bot - Inference

### TODO load model and tokenizer

In [361]:
def format_input(history, reply_so_far):
    """
    """
    # print("History is: {}".format(history))
    # print("Reply so far is: {}".format(reply_so_far))
    history = [tokenizer.encode(phrase) for phrase in history]
    
    instance = create_model_inputs(history, reply_so_far, tokenizer, with_eos=False)
    
    input_ids = torch.tensor(instance["input_ids"]).unsqueeze(0)
    token_type_ids = torch.tensor(instance["token_type_ids"]).unsqueeze(0)
    
    return input_ids, token_type_ids

In [360]:
def decoding(probs, logits, method="top_p"):
    """
    Functions that selects the next token to be emmited. Three different approaches are implemented: 
    
    Greedy: the most probable token is selected.
    Top-k : 
    Top-p : 
    
    :param logits: 
    :param method: the decoding method to be used, Values={'greedy', 'top_k', 'top_p'}
    :return: the selected token
    """
    top_k = 40 # sample from the 100 most probable tokens based on their probs
    top_p = 0.9 # sample from the n most probable tokens that have a cumulative probability at least 0.9 
    
    if method == "greedy":
        return torch.argmax(probs).item()
    
    elif method == "top_k":        
        prob_k = probs.topk(top_k)[0][-1].item() # value of the 100th most probable
#         print(probs.topk(top_k)[0])
#         print((probs < prob_k).nonzero().shape)
        probs[probs < prob_k] = 0   # cut off the tail  
        
    elif method == "top_p":
        probs_sorted, probs_indexes = probs.sort(dim=-1, descending=True) # start the cumulation from the most probable token in descending order
        cum_probs = probs_sorted.cumsum(dim=-1)
        
        indices = cum_probs > top_p 
#         print(indices)
#         print(indices.nonzero().shape)
#         print(indices.nonzero())
#         print(probs_sorted[:10])
        indices[1:] = indices[:-1].clone()
        indices[0] = 0 # at least one token is preserved 
        
        probs[probs_indexes[indices]] = 0
    
    word = torch.multinomial(probs, 1).item()
    # TODO handle the case that special token was emitted in the first pick
    
    return word

In [359]:
def infer_answer(history, model, tokenizer, method="top_p"):
    """
    Function that generates word by word the bot answer, based on user input and previous history.
    
    :param history: a list of past sentences and last user's input, in plain text
    :param model: the model to be used for inference
    :return: a list with the words of the answer in plain text 
    """
    model.eval()
    reply_so_far = []
    with torch.no_grad():
    
        for i in range(max_sentence_length):
            
            input_ids, category_ids = format_input(history, reply_so_far)
            # print("Inputs ids are {}".format(input_ids)) seems good
            # print("Category ids are {}".format(category_ids)) seems good
            outputs = model(input_ids=input_ids, token_type_ids=category_ids)
            logits = outputs[0]
            logits = logits[0, -1, :] / temperature # keep last 
            probs = F.softmax(logits, dim=-1) 
            word = decoding(probs, logits, method=method) 
            
            if word in SPECIAL_TOKENS_IDS: # we stop inference if we find a special token without emitting this token
                print("Bot terminate sentence!")
                break
            reply_so_far.append(word)
            
        answer_text = tokenizer.decode(reply_so_far, skip_special_tokens=True)    
        return answer_text

In [358]:
def interact_with_bot(model, tokenizer, method='top_p'):
    """
    """
    bot_prompt = "bot:>>> "
    user_prompt = "user:>>> "

    history = []
    print(bot_prompt + "Hello how may I help you?")
    user_input = input(user_prompt)
    
    while user_input != "\q": # TODO check if we need to truncate user input to not exceed max_length
        
        history.append(user_input)
        answer = infer_answer(history, model, tokenizer, method=method)
        history.append(answer)
        
        history = history[-(2*max_history+1):]  # keep the same history as in the training 
        
        print(bot_prompt + answer)
        
        user_input = input(user_prompt) 

In [344]:
model_loaded ,tokenizer_loaded  = load_checkpoint(workspace + "metalwoz-v1\\checkpoint_20191114_1250")

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


C:\Users\nikmand\nikmand\ncsr-chatbot\metalwoz-v1\checkpoint_20191114_1250


In [355]:
add_special_tokens_(model_loaded, tokenizer_loaded)

Tokens added to model: 5


In [356]:
tokenizer_loaded.convert_tokens_to_ids(SPECIAL_TOKENS) # θέλει ξανά να μπου τα σπέσιαλ

[40478, 40479, 40481, 40482, 40480]

In [363]:
interact_with_bot(model_loaded, tokenizer_loaded, method='top_p')
test = "could you book a ticket for me?"

bot:>>> Hello how may I help you?
user:>>> i need toothpaste
Bot terminate sentence!
bot:>>> 
user:>>> hey how are you?
Bot terminate sentence!
bot:>>> yes
user:>>> tell me something.
Bot terminate sentence!
bot:>>> yes
user:>>> \q


# DEBUG AREA

In [61]:
print(pairs_train.shape, pairs_eval.shape)
print(pairs_train[0])
print("wwwwwww")
print(pairs_eval[0]['input'])

print(pairs_train.shape, pairs_eval.shape)
print(pairs_train[0])
print("wwwwwww")
print(pairs_eval[0]['input'])

(12547,) (3137,)
{'input': [[13659, 2679], [1304, 239, 718, 1272, 12286, 587, 512, 966, 507, 485, 580, 257], [277], [566, 2499, 13659, 239, 587, 512, 604, 246, 5052, 5855, 257], [13103, 3597, 246, 3592]], 'output': [1304, 886, 507, 239, 544, 655, 246, 7537, 1807, 512, 640, 1081, 491, 257]}
wwwwwww
[[249, 966, 485, 5838, 531, 4895]]


In [190]:
x =zip(pairs_train_len, pairs_train)

for y in x:
    print(y[0])
    
Z = [sample for length, sample in sorted(zip(pairs_train_len, pairs_train), key = lambda x: x[0])]
print(len(Z))    

training_set = DialogDataset(Z) 

In [81]:
# print(logits_e)
print(torch.topk(logits_e, 5))
log_sort, log_index = logits_e.sort(dim=-1, descending=True)
print(log_sort)
print(log_index)
logits_e.topk(5)

torch.return_types.topk(
values=tensor([8.1849, 7.7367, 5.3123, 5.3115, 4.0969], grad_fn=<TopkBackward>),
indices=tensor([509, 558, 535, 656, 980]))
tensor([  8.1849,   7.7367,   5.3123,  ..., -50.3099, -51.1483, -51.4548],
       grad_fn=<SortBackward>)
tensor([  509,   558,   535,  ..., 33567, 32509, 16443])


torch.return_types.topk(
values=tensor([8.1849, 7.7367, 5.3123, 5.3115, 4.0969], grad_fn=<TopkBackward>),
indices=tensor([509, 558, 535, 656, 980]))

In [86]:
cum_prob = probs.cumsum(dim=-1)

In [95]:
c = cum_prob > 0.9

In [96]:
c[1:] = c[:-1].clone()
#c[0] = 0

In [102]:
c[558]

tensor(False)

In [97]:
(c == True).nonzero()[0]

tensor([559])

In [72]:
loss, logits = outputs[:2]
print(loss)
print(logits.shape)

model.eval()
outputs_e = model(input_ids=input_ids.type(torch.LongTensor), token_type_ids=category_ids.type(torch.LongTensor))
print(type(outputs_e))

print(len(outputs_e))
print(outputs_e[0].shape)
logits_e = outputs_e[0]
logits_e = logits_e[0, -1, :] / temperature
print(logits_e.shape)
probs = F.softmax(logits_e, dim=-1)

word = torch.argmax(probs)
word_3 = torch.argmax(logits_e)
# word_2 = torch.topk(probs, 1)[1]
print(word.item()) 
print(word_3.item())

tensor(5.3508, grad_fn=<NllLossBackward>)
torch.Size([32, 124, 40483])
<class 'tuple'>
1
torch.Size([32, 124, 40483])
torch.Size([40483])
509
509


In [438]:
def create_segments(dialog_pairs):
    dataset = []
    for pair in dialog_pairs:
        instance = build_input_from_segments(pair['input'], pair['output'], tokenizer)
        dataset.append(instance)
    return dataset


def pad_sequenses(dataset, padding=0):
    # TODO create mask for each entry
    max_seq_len = max(len(entry["input_ids"]) for entry in dataset)
    print(max_seq_len)
    for entry in dataset:
        for index_name in entry.keys():
            entry[index_name] =  entry[index_name] + [padding if index_name != "lm_labels" else -1] * (max_seq_len - len(entry[index_name]))
        # entry[index_name] =  [ for index_name in entry.keys()]
    return dataset   

test_1 = create_segments(pairs_train)
test_2 = pad_sequenses(test_1)
test_1[0]
test_2[0]

In [458]:
# len(tokenizer.encoder)
# model.resize_token_embeddings(new_num_tokens=40478 + 5)

Embedding(40483, 768)

In [None]:
# model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
# print(model)
# model_raw = OpenAIGPTModel.from_pretrained('openai-gpt')
# print(model_raw) # without the last linear layer

In [37]:
a = [1,2,3]
it = iter(a)
testt = [it]*3

test_tok = tokenizer.tokenize('could you book me a ticket?')
print(test_tok)
print(tokenizer.convert_tokens_to_ids(test_tok)) # all tokens must be lowercase 
tokenizer.encode('i i i i i i i i i i i i i i i')

['could</w>', 'you</w>', 'book</w>', 'me</w>', 'a</w>', 'ticket</w>', '?</w>']
[635, 512, 1861, 510, 246, 8194, 257]


[249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249]

In [233]:
tokenizer.max_len # what does it means ?

512

In [372]:
for index_name in instance.keys():
    print(index_name)

input_ids
token_type_ids
lm_labels


In [41]:
# debug cell
def build_inputs(persona, history, reply):
    # Build our sequence by adding delimiters and concatenating
    sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    sequence = [sequence[0]] + [ [speaker2 if (len(sequence)-i) % 2 else speaker1] + s
                                for i, s in enumerate(sequence[1:])]
    # Build our word, segments and position inputs from the sequence
    words = list(chain(*sequence))                          # word tokens
    segments = [speaker2 if i % 2 else speaker1             # segment tokens
                for i, s in enumerate(sequence) for _ in s]
    position = list(range(len(words)))                      # position tokens
    return words, segments, position, sequence

persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]

sequence = [["<bos>"] + list(chain(*persona))] + history +  [reply + ["<eos>"]]
sequence = [sequence[0]] + [ [speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
print(sequence)
print(list(chain(*sequence))   )
[speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]

def build_input_from_segments_or(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    """ Build a sequence of input from 3 segments: persona, history and last reply. """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]  # chain: expands the lists  
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    instance = {}
    instance["input_ids"] = list(chain(*sequence)) # words
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    instance["lm_labels"] = [-1] * len(instance["input_ids"])
    if lm_labels:
        instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:]
    return instance

In [289]:
# debug cell
history = [[249, 1048, 6702], [498, 1385, 512, 640], [488, 249, 1074, 12361, 15354, 504, 481, 3361]]
reply =  [249, 2518, 512, 1074, 246, 5358, 500, 481, 3361]

instance = build_input_from_segments(history, reply, tokenizer)
instance_or = build_input_from_segments_or(persona, history, reply, tokenizer, lm_labels=True)

print(tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]))
instance_or["input_ids"]
instance_or["token_type_ids"]
instance["input_ids"] 
instance["token_type_ids"] 
instance = build_input_from_segments(persona, history, reply, tokenizer)

### ασυμβίβαστα μεταξύ τους στο input_ids βγαίνει ότι το reply το είπε ο speaker1, ενώ στο token_type_ids ότι το είπε ο speaker2 (μάλλον για τον κώδικα του medium μόνο)

lm_targets = ([-1] * sum(len(s) for s in sequence[:-1])) \
             + [-1] + tokenizer.convert_tokens_to_ids(sequence[-1][1:])

lm_targets # στα labels tou language model έχουν τιμές μόνο τα tokens του reply.
lm_distractor = [-1] * len(instance["input_ids"])
lm_distractor

5
[[40478], [40481, 249, 1048, 6702], [40482, 498, 1385, 512, 640], [40481, 488, 249, 1074, 12361, 15354, 504, 481, 3361], [40482, 249, 2518, 512, 1074, 246, 5358, 500, 481, 3361, 40479]]


In [59]:
def decoding_old(logits, method="greedy"):
    """
    Functions that selects the next token to be emmited. Three different approaches are implemented: 
    
    Greedy: the most probable token is selected.
    Top-k : 
    Top-p : 
    
    :param logits: 
    :param method: the decoding method to be used, Values={'greedy', 'top_k', 'top_p'}
    :return: the selected token
    """
    top_k = 100 # sample from the 100 most probable tokens based on their probs
    top_p = 0.9 # sample from the n most probable tokens that have a cumulative probability > 0.9 
    
    if method == "top_k":        
        logit_k = logits.topk(top_k)[0][-1].item() # value of the 100th most probable
        logits[logits < logit_k] = -float('Inf')   # cut off the tail  
        
    elif method == "top_p":
        logits_sorted, logits_indexes = logits.sort(dim=-1, descending=True) # start the cumulation from the most probable token in descending order
        probs = F.softmax(logits_sorted, dim=-1)
        cum_probs = probs.cumsum(dim=-1)
        
        indices = cum_probs > top_p 
        indices[1:] = indices[:-1].clone()
        indices[0] = 0 # at least one token is preserved 
        
        logits[logits_indexes[indices]] = -float('Inf')
    
    probs = F.softmax(logits, dim=-1)
    word = torch.multinomial(probs, 1).item()
    # TODO handle the case that special token was emitted in the first peek
    
    return word

In [371]:
instance['lm_labels']

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 249,
 2518,
 512,
 1074,
 246,
 5358,
 500,
 481,
 3361,
 40479]