In [1]:
import os
import pickle
import numpy as np
import glob
import copy
import torch
import torchtext
import torch.optim as optim
import torch.nn.functional as F
from transformers import OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer
from itertools import chain
from ast import literal_eval
from itertools import zip_longest
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader 

In [64]:
%load_ext autoreload
%autoreload 2

### Data Preprocessing

In [38]:
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [39]:
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
# print(model) # check the architecture of the model

In [14]:
# Mapping handed to `tokenizer.add_special_tokens` by `add_special_tokens_`:
# tokenizer attribute name -> literal special-token string(s) used in dialogs.
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ('<speaker1>', '<speaker2>')}

In [40]:
def add_special_tokens_(model, tokenizer):
    """Add the dialog special tokens to the tokenizer and keep the model in sync.

    The tokens listed in ``ATTR_TO_SPECIAL_TOKEN`` are registered on ``tokenizer``
    (tokens it already knows are not added again). The model's input embedding
    matrix is then resized whenever its size differs from the tokenizer's total
    vocabulary -- not only when this very call added tokens. This matters in a
    notebook: if the model is re-loaded while the tokenizer object already
    carries the special tokens, nothing is "added", yet the fresh model still
    needs the extra embedding rows.

    :param model: model exposing ``get_input_embeddings`` and ``resize_token_embeddings``
    :param tokenizer: tokenizer exposing ``add_special_tokens`` and ``__len__``
    :return: None (both arguments are updated in place)
    """
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there
    if num_added_tokens > 0:
        print("Tokens added to model: {}".format(num_added_tokens))

    # len(tokenizer) is the base vocabulary plus every added token, i.e. the
    # number of embedding rows the model must have to accept any token id.
    target_num_tokens = len(tokenizer)
    if model.get_input_embeddings().num_embeddings != target_num_tokens:
        model.resize_token_embeddings(new_num_tokens=target_num_tokens)

In [52]:
# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences

max_history = 2 # pairs of question/answer to be retained (2*max_history+1 phrases are kept)
max_sentence_length = 20 # maximum number of generation steps (tokens) per bot reply

temperature = 0.65 # logits are divided by this value before softmax; < 1 sharpens the distribution towards the most probable outputs

# Order matters: other cells slice this list ([:-1] drops <pad>, [-1] is <pad>).
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
add_special_tokens_(model, tokenizer)       
# Ids of the special tokens, in the same order as SPECIAL_TOKENS.
SPECIAL_TOKENS_IDS = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)


# bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"

# MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
# PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

In [42]:
SPECIAL_TOKENS_IDS

[40478, 40479, 40481, 40482, 40480]

### Severe TODO 
H Parser είναι πιο χρονοβόρα από την extract pairs
refactor το που γίνεται τι ώστε κάθε συνάρτηση να έχει ένα ρόλο 
το tokenization δεν κολλάει πολύ στην parser.

### TODO download with torchtext

In [7]:
workspace = 'C:\\Users\\nikmand\\nikmand\\ncsr-chatbot\\'  # os.getcwd()

# TODO new function for the tokenization process

def parser(datafolder='metalwoz-v1\\dialoguesTest\\'):
    """
    Read every ``*.txt`` file of a folder, keep only the 'turns' of each entry and tokenize them.

    Each non-empty line of a file is evaluated with ``ast.literal_eval`` and must
    yield a mapping with a 'turns' list; the first turn is dropped. Files are
    visited in sorted order so the result does not depend on the order in which
    the file system lists them. Files are read as UTF-8 rather than with the
    platform default encoding.

    :param datafolder: folder (relative to the module-level ``workspace``) that contains the files
    :return: a tuple ``(dialogs, dialogs_len)``; ``dialogs`` is a list of dialogs, each dialog
             being a list of phrases and each phrase a list of token ids; ``dialogs_len[i]``
             is the total number of token ids in ``dialogs[i]``
    """
    dialogs = []
    dialogs_len = []
    files = sorted(glob.glob(os.path.join(workspace, datafolder, "*.txt")))
    for file in files:
        with open(file, encoding="utf-8") as f:
            for line in f:  # stream the file instead of materialising it with readlines()
                if not line.strip():
                    continue  # blank lines (e.g. a trailing newline) carry no dialog
                turns = literal_eval(line)['turns'][1:]  # keep only turns without the first sentence
                # lower-casing is performed by the tokenizer
                dialog = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(phrase)) for phrase in turns]
                dialogs.append(dialog)
                dialogs_len.append(sum(len(phrase) for phrase in dialog))
    return dialogs, dialogs_len

In [6]:
dialogs, dialogs_len = parser()

NameError: name 'parser' is not defined

In [479]:
print(len(dialogs))
print(max(dialogs_len), min(dialogs_len))

3461
269 17


In [8]:
def filter_dialogs(dialogs, dialogs_len, percentile=90):
    """
    Drop the dialogs whose length lies above a percentile of all lengths.

    :param dialogs: list of dialogs
    :param dialogs_len: length of each dialog, aligned with ``dialogs``
    :param percentile: percentile of ``dialogs_len`` used as the (inclusive) length cut-off
    :return: list with the dialogs whose length does not exceed the cut-off
    """
    cutoff = np.percentile(np.array(dialogs_len), percentile)

    kept = []
    for dialog, length in zip(dialogs, dialogs_len):
        if length > cutoff:
            continue
        kept.append(dialog)
    return kept

In [9]:
dialogs_reduced = filter_dialogs(dialogs, dialogs_len)
print(len(dialogs_reduced)) 

NameError: name 'dialogs' is not defined

In [10]:
def extract_pairs(dialogs = None, cache_file='cache_folder\\pairs.txt'):
    """
    Create (input, output) pairs from dialogs; each dialog yields one pair per bot reply.

    Phrases are consumed two by two (user phrase, bot phrase). For every such
    couple a dict is produced whose 'input' is the whole dialog so far up to and
    including the user phrase, and whose 'output' is the bot phrase. A trailing
    user phrase without a reply is discarded.

    The result is cached with pickle: if ``cache_file`` can be loaded it is
    returned directly and ``dialogs`` is ignored; otherwise the pairs are computed
    and written to ``cache_file`` (its folder is created when missing).

    :param dialogs: a list with all the dialogs (needed only when no cache is available)
    :param cache_file: path of the pickle cache
    :return: a list whose elements are dicts with the keys 'input' and 'output'
    :raises TypeError: if there is no usable cache and ``dialogs`` was not given
    """
    try:
        with open(cache_file, "rb") as f:
            print("Cache file found loading content.")
            # NOTE(security): pickle.load can execute arbitrary code; only load caches you created yourself.
            return pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Cache missing or unreadable: fall through and rebuild it.
        # Any other error (e.g. permissions) is deliberately not swallowed.
        print("Cache file not found. Start processing.")

    if dialogs is None:
        raise TypeError("extract_pairs() needs 'dialogs' when no cache file is available: {}".format(cache_file))

    pairs = []
    for dialog in dialogs:
        history = []
        # zip stops at the shorter slice, so a trailing unanswered user phrase is dropped
        for i_phrase, o_phrase in zip(dialog[0::2], dialog[1::2]):
            history.append(i_phrase)
            # snapshot of the history so later turns do not alter this pair's input list
            pairs.append({'input': list(history), 'output': o_phrase})
            history.append(o_phrase)

    cache_dir = os.path.dirname(cache_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "wb") as f:
        pickle.dump(pairs, f)
    return pairs

In [11]:
pairs = extract_pairs() #dialogs_reduced list of dictionaries of two keys

Cache file found loading content.


In [19]:
def adjust_history(pairs, max_history=2): # seq len reduced from 153 to 124
    """
    Trim the 'input' of every pair to its most recent phrases.

    The pairs are modified in place; the same list is returned for convenience.

    :param pairs: list of dicts with an 'input' list of phrases
    :param max_history: number of question/answer couples to retain besides the last phrase
    :return: the ``pairs`` list that was passed in
    """
    window = 2 * max_history + 1  # at least one phrase is preserved
    for idx in range(len(pairs)):
        entry = pairs[idx]
        entry['input'] = entry['input'][-window:]
    return pairs

In [20]:
pairs = adjust_history(pairs)

In [None]:
def filter_pairs(pairs):
    """
    Placeholder for a filtering step over the (input, output) pairs.

    The function was left without a body, which is a syntax error and prevents
    the notebook from running top to bottom. Until the filtering rule is
    decided it fails loudly instead of silently returning something arbitrary.

    :param pairs: list of dicts with the keys 'input' and 'output'
    :raises NotImplementedError: always
    """
    raise NotImplementedError("filter_pairs is not implemented yet")

In [49]:
print(len(pairs))
print(pairs[3])

15684
{'input': [[488, 249, 1074, 12361, 15354, 504, 481, 3361], [249, 2518, 512, 1074, 246, 5358, 500, 481, 3361], [668, 5611, 239, 249, 1074, 688, 504, 2306], [525, 256, 252, 246, 1875, 4778], [912, 249, 1048, 246, 16219, 267]], 'output': [249, 2310, 256, 241, 2153, 485, 699, 512]}


### TODO να σώζεται σε αρχείο στην πιο κατάλληλη μορφή. Να δούμε αν βολεύει Pandas ή κάτι άλλο 

We instantiate a PyTorch GPT model with weights pre-trained on the language-modelling task.

### Tokenizer

A helper class used to interact with the vocabulary on which our model has been pre-trained.

In [49]:
print("Our language model have been pre-trained with a vocabulary of {} words.".format(tokenizer.vocab_size))

Our language model have been pre-trained with a vocabulary of 40478 words.


Εμείς έχουμε πάντα περιττού πλήθους history που αρχίζει και τελειώνει με speaker1 και reply που το λέει ο speaker2

Για το input_ids: Η λογική είναι αναθέτει τον speaker2 κάθε φορά που μένει άρτιο πλήθος από διαλόγους(περιττό συνολικά μαζί με sos, βλέπε συνθήκη). Εμάς όλες μας οι λίστες έχουν άρτιο πλήθος οπότε θα ξεκινήσει με speaker2 ενώ θέλουμε speaker1. Τα επιμέρους αποτελεσματα όμως είναι συμβατά μεταξύ τους.

Στο input_ids το i στο iter παίρνει τιμή i = seq_len - 2 (αφού ξεκινήσαμε από το δεύτερο στοιχείο το iteration)

Για το token_type_ids: για κάθε μία λίστα κάνουμε iterate στα στοιχεία της, αν η θέση της λίστας είναι άρτια παίρνει speaker1 αλλιώς speaker2
Στο token_type_ids: επειδή το πλήθος είναι περιττό με την προσθήκη του sos θα αλλάξει η σειρά και η πρώτη πρόταση θα πάει speaker2 και το reply speaker1

Καταρχάς η αντιστοιχία που δίνουν οι ίδιοι στο δικό τους δεν ταιριάζει με αυτό που είχαμε σκεφτεί 
Κατά δεύτερο πρέπει να δούμε που θα μπει αν θα μπει το sos, αυτό μας δημιουργεί πρόβλημα αυτή τη στιγμή. Θα μπει μετά το tag του speaker ? 

είτε θα μπει μόνο του πριν τον speaker
σε αυτή την περίπτωση θα πρέπει να παίρνει το tag του speaker1 στα tokens αυτό δε συμβαίνει τώρα και μας μπερδεύει τη σειρά 

για το label βάζει σε όλα τα inputs εκτός του reply -1, στο speaker2 του reply -1 και βάζει τα tokens του reply.

Τι ακριβώς θα δούμε με το validation.

In [21]:
pairs_array = np.array(pairs)

In [350]:
pairs_train_l = pairs[:int(len(pairs)*0.8)]
pairs_eval_l = pairs[int(len(pairs)*0.8):]
print(len(pairs_train_l), len(pairs_eval_l))

13996 3500


### Για διαχωρισμό σε train test
υπάρχει κάτι που να δημιουργεί πρόβλημα;
μπορεί να μας ενοχλεί ότι ζεύγη που έρχονται από διαφορετικούς διαλόγους θα χωριστούν; μας ενοχλεί αν δεν μπαίνουν με τη σειρά;
αν δεν κάνουμε τυχαίο split κάποια domains δε θα εμφανίζονται στο train set

In [22]:
# Fixed random_state so the shuffled 80/20 split (and every number reported
# below) is identical after Restart Kernel -> Run All.
pairs_train, pairs_eval = train_test_split(pairs_array, test_size=0.2, shuffle=True, random_state=42)

In [61]:
print(pairs_train.shape, pairs_eval.shape)
print(pairs_train[0])
print("wwwwwww")
print(pairs_eval[0]['input'])

(12547,) (3137,)
{'input': [[13659, 2679], [1304, 239, 718, 1272, 12286, 587, 512, 966, 507, 485, 580, 257], [277], [566, 2499, 13659, 239, 587, 512, 604, 246, 5052, 5855, 257], [13103, 3597, 246, 3592]], 'output': [1304, 886, 507, 239, 544, 655, 246, 7537, 1807, 512, 640, 1081, 491, 257]}
wwwwwww
[[249, 966, 485, 5838, 531, 4895]]


In [59]:
print(pairs_train.shape, pairs_eval.shape)
print(pairs_train[0])
print("wwwwwww")
print(pairs_eval[0]['input'])

(12547,) (3137,)
{'input': [[249, 966, 485, 34427, 531, 4895], [881], [1462, 547, 6307, 4895, 1572], [1304, 1256, 239], [1462, 481, 43, 10458, 6307, 4895, 485, 44, 10458, 504, 5513]], 'output': [1304, 239, 636, 512, 1362, 507, 257]}
wwwwwww
[[759, 512, 5838, 547, 292, 1048, 4895, 504, 5498, 488, 11072, 507, 485, 1099, 2433, 850, 257], [773, 512, 823, 704, 54, 1048, 4895, 617, 5498, 822, 5375, 257], [685, 240, 525, 256, 252, 770, 239], [773, 240, 4895, 544, 1233, 239, 544, 655, 1033, 1284, 249, 759, 587, 562, 512, 257], [2548, 239, 1359, 635, 512, 1233, 481, 8358, 485, 11477, 297, 562, 525, 257]]


Θέλω στο μοντέλο μου να δίνω τρία inputs όπως το παράγει η συνάρτηση build, συνεπώς αυτό θέλω να μου γυρίζει η συνάρτηση get item 

### TODO check if we can use torchtext

In [23]:
class DialogDataset(Dataset):
    """
    Dataset of model-ready dialog instances.

    Every (input, output) pair is converted with ``build_input_from_segments``
    and all instances are padded to the length of the longest ``input_ids``.
    """

    def __init__(self, dialog_pairs):
        segments = self.create_segments(dialog_pairs)
        pad_id = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])
        # pad_sequenses pads in place and returns the same list (is the re-assignment needed?)
        self.dataset = self.pad_sequenses(segments, pad_id)

    def __len__(self):
        return len(self.dataset)

    def create_segments(self, dialog_pairs):
        """Build one instance dict per pair from its 'input' history and 'output' reply."""
        return [
            build_input_from_segments(pair['input'], pair['output'], tokenizer)
            for pair in dialog_pairs
        ]

    def pad_sequenses(self, dataset, padding=0):
        """
        Right-pad every list of every entry up to the longest ``input_ids``.

        "lm_labels" is padded with -1, every other key with ``padding``.
        The entries are updated in place and ``dataset`` is returned.
        """
        # TODO create mask for each entry
        max_seq_len = max(len(entry["input_ids"]) for entry in dataset)
        print(max_seq_len)
        for entry in dataset:
            for index_name, values in list(entry.items()):
                filler = -1 if index_name == "lm_labels" else padding
                entry[index_name] = values + [filler] * (max_seq_len - len(values))
        return dataset

    def __getitem__(self, index):
        """Return ``(input_ids, token_type_ids, lm_labels)`` of one sample as numpy arrays."""
        sample = self.dataset[index]
        ordered_keys = ('input_ids', 'token_type_ids', 'lm_labels')
        return tuple(np.array(sample[key]) for key in ordered_keys)

In [24]:
def build_input_from_segments(history, reply, tokenizer, with_eos=True):
    """
    Build a model input from 2 segments: the history and the last reply.

    The sequence is ``<bos>``, then every history phrase and finally the reply,
    each prefixed by a speaker tag. Tags alternate and are anchored at the END
    of the sequence: the reply is always ``<speaker2>`` (the bot), the phrase
    before it ``<speaker1>``, and so on backwards.

    ``token_type_ids`` repeat, for every token of a segment, the speaker tag
    that was put in front of that segment. Previously they were computed with a
    parity anchored at the START of the sequence, which contradicted the tags
    in ``input_ids`` whenever ``history`` had an even number of phrases.

    :param history: list of phrases, each a list of token ids
    :param reply: list of token ids of the reply (may be empty during generation)
    :param tokenizer: object whose ``convert_tokens_to_ids`` maps the special tokens to ids
    :param with_eos: append ``<eos>`` to the reply when True
    :return: dict with the equally long lists "input_ids", "token_type_ids" and
             "lm_labels"; the labels are -1 everywhere except on the reply tokens
             (and ``<eos>``), the reply's speaker tag being -1 as well
    """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    segments = history + [reply + ([eos] if with_eos else [])]
    last = len(segments) - 1

    # sequence is a list of lists: [<bos>] followed by the speaker-tagged segments
    sequence = [[bos]]
    for i, s in enumerate(segments):
        # even distance from the reply -> same speaker as the reply (speaker2)
        speaker = speaker2 if (last - i) % 2 == 0 else speaker1
        sequence.append([speaker] + s)

    instance = {}
    instance["input_ids"] = list(chain(*sequence)) # words
    # <bos> is typed as speaker1; every other token takes the tag that opens its segment
    instance["token_type_ids"] = [speaker1] + [s[0] for s in sequence[1:] for _ in s] # for each word
    # TODO mask
    instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:]
    return instance

In [25]:
training_set = DialogDataset(pairs_train) 
validation_set = DialogDataset(pairs_eval)

124
124


In [26]:
BATCH_SIZE = 32 
"""
TODO: pad on batch level 

in order to avoid padding to the global max_len we can define our own collate_fn
which forms the samples into batches and call inside there the pad function.
Samples should be allocated to batches based on their sequence length in order to
minimize the need for padding.
""" 

dataloader_train = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0) # has shuffle any sideffects here?
dataloader_valid = DataLoader(validation_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

### Training procedure

In [27]:
# Fine-tuning loop for the language-model head.
# NOTE(review): min_loss / max_patience / cur_patience, save_file and use_cuda are
# defined here but not used anywhere in this cell.
epochs = 1
min_loss, max_patience, cur_patience = np.inf, 10, 0
save_file = "chatbot.pkl"
use_cuda = False

# loss_function = nn. check it -- does the model pick the loss by itself?
# if `labels` is given, it initialises and uses CrossEntropyLoss internally
# we should describe in detail what it does, as a first step, unless we write it by hand.
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.001) # TODO review those values
for epoch in range(epochs):
    # aux stuff here
    model.train()
    # Each batch is the (input_ids, token_type_ids, lm_labels) triple returned by DialogDataset.__getitem__.
    for i_batch, (input_ids, category_ids, label_ids) in enumerate(dataloader_train):
#         print(type(input_ids))
#         print(input_ids.shape)
#         print(category_ids.shape)
#         print(label_ids.shape)
        
        # The batches are cast to LongTensor before being passed to the model.
        outputs = model(input_ids=input_ids.type(torch.LongTensor), token_type_ids=category_ids.type(torch.LongTensor), labels=label_ids.type(torch.LongTensor))
        print(type(outputs))
        # With labels supplied, the first two outputs are unpacked as (loss, logits).
        loss, logits = outputs[:2]
        optimizer.zero_grad() # model.zero_grad() and optimizer.zero_grad() are the same IF all your model parameters are in that optimizer.
        loss.backward()
        optimizer.step()

        # Debug leftover: stops after the very first batch of the first epoch.
        break
        # as far as I understand, backward() can be called on the loss that is returned
        
        

<class 'tuple'>


### TODO SAVE MODEL and tokenizer

## Interaction with the bot - Inference

### TODO load model and tokenizer

In [46]:
def format_input(history, reply_so_far):
    """
    Turn a plain-text history and a partial reply into batched model inputs.

    :param history: list of phrases in plain text; each one is encoded with the tokenizer
    :param reply_so_far: list of token ids generated for the reply up to now
    :return: tuple ``(input_ids, token_type_ids)`` of tensors with a leading batch dimension of 1
    """
    encoded_history = []
    for phrase in history:
        encoded_history.append(tokenizer.encode(phrase))

    # No <eos>: the reply is still being generated.
    instance = build_input_from_segments(encoded_history, reply_so_far, tokenizer, with_eos=False)

    batched = [torch.tensor(instance[key]).unsqueeze(0) for key in ("input_ids", "token_type_ids")]
    return batched[0], batched[1]

In [30]:
def decoding(logits, method="greedy"):
    """
    Select the next token to be emitted. Three different approaches are implemented:

    Greedy: the most probable token is selected.
    Top-k : sample among the ``top_k`` most probable tokens, proportionally to their probabilities.
    Top-p : sample among the smallest set of most probable tokens whose cumulative
            probability exceeds ``top_p`` (nucleus sampling).

    The input tensor is never modified; filtering is done on a copy.

    :param logits: 1-D tensor with one (already temperature-scaled) logit per vocabulary entry
    :param method: the decoding method to be used, Values={'greedy', 'top_k', 'top_p'}
    :return: the id of the selected token, as a Python int
    :raises ValueError: if ``method`` is not one of the supported values
    """
    top_k = 100 # sample from the 100 most probable tokens based on their probs
    top_p = 0.9 # sample from the n most probable tokens that have a cumulative probability > 0.9

    if method == "greedy":
        return torch.argmax(logits, dim=-1).item()

    if method not in ("top_k", "top_p"):
        raise ValueError("Unknown decoding method: {!r}".format(method))

    filtered = logits.clone()  # keep the caller's tensor intact

    if method == "top_k":
        k = min(top_k, filtered.size(-1))  # torch.topk fails when k exceeds the vocabulary size
        kth_logit = torch.topk(filtered, k)[0][-1]  # value of the k-th most probable
        filtered[filtered < kth_logit] = -float('Inf')  # cut off the tail
    else:  # top_p
        sorted_logits, sorted_indices = torch.sort(filtered, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        to_remove = cumulative_probs > top_p
        # Shift right so the token that crosses the threshold is kept, and
        # always keep the single most probable token.
        to_remove[1:] = to_remove[:-1].clone()
        to_remove[0] = False
        filtered[sorted_indices[to_remove]] = -float('Inf')

    probs = F.softmax(filtered, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()

In [69]:
# Debug: inspect the logits of the 100 most probable next tokens (torch.topk
# returns them sorted in descending order). The cell used to assign `tst` and
# then print an undefined `ts`, which only worked thanks to leftover kernel state.
ts = torch.topk(logits[0, -1, :], 100)[0]
tst = ts[..., -1, None]  # the 100th largest logit, kept as a 1-element tensor
print(ts)
print(ts[-1].item())

tensor([ 3.6177,  3.1210,  2.3628,  2.2761,  2.1987,  1.5925,  1.2785,  1.2639,
         1.1791,  0.8849,  0.8654,  0.7551,  0.6730,  0.6603,  0.5451,  0.5403,
         0.4563,  0.3471,  0.2676,  0.1768,  0.0710,  0.0299,  0.0290,  0.0133,
         0.0039, -0.0358, -0.0556, -0.0771, -0.1768, -0.3003, -0.3305, -0.4515,
        -0.4820, -0.4945, -0.5143, -0.5508, -0.5748, -0.6180, -0.6758, -0.6847,
        -0.6874, -0.6942, -0.7083, -0.7232, -0.7593, -0.7905, -0.8563, -0.9661,
        -1.0109, -1.0398, -1.0596, -1.0603, -1.0604, -1.0776, -1.0814, -1.1205,
        -1.1241, -1.1291, -1.1345, -1.1431, -1.1492, -1.2057, -1.2991, -1.3368,
        -1.3590, -1.3609, -1.3838, -1.4533, -1.4547, -1.4550, -1.4569, -1.5742,
        -1.6001, -1.6469, -1.6515, -1.6707, -1.7707, -1.7743, -1.7827, -1.8013,
        -1.8057, -1.8740, -1.9021, -1.9383, -1.9434, -1.9744, -1.9816, -1.9875,
        -2.0014, -2.0252, -2.0295, -2.0455, -2.0740, -2.1106, -2.1302, -2.1389,
        -2.1672, -2.1854, -2.2004, -2.22

In [54]:
def infer_answer(history, model):
    """
    Generate the bot answer token by token, given the user input and the previous history.

    At most ``max_sentence_length`` tokens are produced; generation stops early,
    without emitting it, as soon as the chosen token is one of the special tokens.

    :param history: a list of past sentences and last user's input, in plain text
    :param model: the model to be used for inference
    :return: the decoded answer as plain text
    """
    model.eval()
    generated_ids = []
    with torch.no_grad():
        for _ in range(max_sentence_length):
            input_ids, category_ids = format_input(history, generated_ids)
            model_outputs = model(input_ids=input_ids, token_type_ids=category_ids)
            # logits of the last position of the single batch element, temperature-scaled
            next_token_logits = model_outputs[0][0, -1, :] / temperature
            next_token_probs = F.softmax(next_token_logits, dim=-1)
            word = torch.argmax(next_token_probs).item()  # greedy decoding

            # we stop inference if we find a special token without emitting this token
            if word in SPECIAL_TOKENS_IDS:
                print("Bot terminate sentence!")
                break
            generated_ids.append(word)

    return tokenizer.decode(generated_ids, skip_special_tokens=True)

In [44]:
def interact_with_bot(model):
    """
    Run an interactive console chat with the bot until the user types ``\\q``.

    Every user input is appended to the history, the history is trimmed to the
    last ``2*max_history+1`` phrases and only then handed to ``infer_answer``.
    Trimming before inference (instead of after appending the answer) means the
    model always sees the same window used when the training pairs were built:
    an odd number of phrases that ends with the user's phrase.

    :param model: the model to be used for inference
    :return: None
    """
    bot_prompt = "bot:>>> "
    user_prompt = "user:>>> "
    quit_command = "\\q"  # explicit backslash; "\q" is an invalid escape sequence

    history = []
    print(bot_prompt + "Hello how may I help you?")
    user_input = input(user_prompt)

    while user_input != quit_command:
        # TODO truncate user input to not exceed max_length
        history.append(user_input)
        history = history[-(2*max_history+1):]  # keep the same history as in the training

        answer = infer_answer(history, model)
        history.append(answer)

        print(bot_prompt + answer)

        user_input = input(user_prompt)

In [55]:
interact_with_bot(model)
test = "could you book a ticket for me?"

bot:>>> Hello how may I help you?
user:>>> We would need such derived metrics that literature considers as indicators.
bot:>>> " 
 " i'm not sure i understand. " 
 " i'm not sure i understand either
user:>>> what's the issue here?
bot:>>> , or what'e's the issue here. " 
 " i'm not sure i understand.
user:>>> \q


# DEBUG AREA

In [50]:
loss, logits = outputs[:2]
print(loss)
print(logits.shape)

model.eval()
outputs_e = model(input_ids=input_ids.type(torch.LongTensor), token_type_ids=category_ids.type(torch.LongTensor))
print(type(outputs_e))

print(len(outputs_e))
print(outputs_e[0].shape)
logits_e = outputs_e[0]
logits_e = logits_e[0, -1, :] / temperature
print(logits_e.shape)
probs = F.softmax(logits_e, dim=-1)

word = torch.argmax(probs)
word_3 = torch.argmax(logits_e)
# word_2 = torch.topk(probs, 1)[1]
print(word.item()) 
print(word_3.item())

In [438]:
def create_segments(dialog_pairs):
    """Debug copy of ``DialogDataset.create_segments``: one instance dict per (input, output) pair."""
    return [
        build_input_from_segments(pair['input'], pair['output'], tokenizer)
        for pair in dialog_pairs
    ]


def pad_sequenses(dataset, padding=0):
    """
    Debug copy of ``DialogDataset.pad_sequenses``.

    Right-pads every list of every entry up to the longest "input_ids";
    "lm_labels" is padded with -1, every other key with ``padding``.
    Entries are updated in place and ``dataset`` itself is returned.
    """
    # TODO create mask for each entry
    max_seq_len = max(len(entry["input_ids"]) for entry in dataset)
    print(max_seq_len)
    for entry in dataset:
        for index_name, values in list(entry.items()):
            filler = -1 if index_name == "lm_labels" else padding
            entry[index_name] = values + [filler] * (max_seq_len - len(values))
    return dataset

test_1 = create_segments(pairs_train)
test_2 = pad_sequenses(test_1)
test_1[0]
test_2[0]

In [458]:
# len(tokenizer.encoder)
# model.resize_token_embeddings(new_num_tokens=40478 + 5)

Embedding(40483, 768)

In [None]:
# model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
# print(model)
# model_raw = OpenAIGPTModel.from_pretrained('openai-gpt')
# print(model_raw) # without the last linear layer

In [37]:
a = [1,2,3]
it = iter(a)
testt = [it]*3

test_tok = tokenizer.tokenize('could you book me a ticket?')
print(test_tok)
print(tokenizer.convert_tokens_to_ids(test_tok)) # all tokens must be lowercase 
tokenizer.encode('i i i i i i i i i i i i i i i')

['could</w>', 'you</w>', 'book</w>', 'me</w>', 'a</w>', 'ticket</w>', '?</w>']
[635, 512, 1861, 510, 246, 8194, 257]


[249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249]

In [233]:
tokenizer.max_len # what does it means ?

512

In [372]:
for index_name in instance.keys():
    print(index_name)

input_ids
token_type_ids
lm_labels


In [41]:
# debug cell
def build_inputs(persona, history, reply):
    """
    Debug helper: assemble words, segments and positions from persona, history and reply.

    Relies on the module-level ``bos``, ``eos``, ``speaker1`` and ``speaker2``.

    :return: tuple ``(words, segments, position, sequence)``
    """
    # Concatenate: <bos> + flattened persona, the history phrases, the reply + <eos>
    untagged = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    total = len(untagged)

    # Prefix every segment after the first one with its speaker delimiter
    sequence = [untagged[0]]
    for offset, segment in enumerate(untagged[1:]):
        delimiter = speaker2 if (total - offset) % 2 else speaker1
        sequence.append([delimiter] + segment)

    words = list(chain(*sequence))                          # word tokens
    segments = []                                           # segment tokens
    for index, segment in enumerate(sequence):
        owner = speaker2 if index % 2 else speaker1
        segments.extend([owner] * len(segment))
    position = list(range(len(words)))                      # position tokens
    return words, segments, position, sequence

persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]

sequence = [["<bos>"] + list(chain(*persona))] + history +  [reply + ["<eos>"]]
sequence = [sequence[0]] + [ [speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
print(sequence)
print(list(chain(*sequence))   )
[speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]

def build_input_from_segments_or(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    """
    Build a sequence of input from 3 segments: persona, history and last reply.

    :return: dict with "input_ids", "token_type_ids", "mc_token_ids" (index of the
             last input token) and "lm_labels" (all -1 unless ``lm_labels`` is True,
             in which case the reply tokens after its speaker tag are kept)
    """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])

    reply_segment = reply + [eos] if with_eos else reply + []
    untagged = [[bos] + list(chain(*persona))] + history + [reply_segment]  # chain: expands the lists
    total = len(untagged)

    sequence = [untagged[0]]
    for offset, segment in enumerate(untagged[1:]):
        tag = speaker2 if (total - offset) % 2 else speaker1
        sequence.append([tag] + segment)

    input_ids = list(chain(*sequence)) # words
    token_type_ids = []
    for index, segment in enumerate(sequence):
        token_type_ids.extend([speaker2 if index % 2 else speaker1] * len(segment))

    if lm_labels:
        masked_prefix = sum(len(segment) for segment in sequence[:-1])
        labels = ([-1] * masked_prefix) + [-1] + sequence[-1][1:]
    else:
        labels = [-1] * len(input_ids)

    instance = {}
    instance["input_ids"] = input_ids
    instance["token_type_ids"] = token_type_ids
    instance["mc_token_ids"] = len(input_ids) - 1
    instance["lm_labels"] = labels
    return instance

In [289]:
# debug cell
history = [[249, 1048, 6702], [498, 1385, 512, 640], [488, 249, 1074, 12361, 15354, 504, 481, 3361]]
reply =  [249, 2518, 512, 1074, 246, 5358, 500, 481, 3361]

instance = build_input_from_segments(history, reply, tokenizer)
instance_or = build_input_from_segments_or(persona, history, reply, tokenizer, lm_labels=True)

print(tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]))
instance_or["input_ids"]
instance_or["token_type_ids"]
instance["input_ids"] 
instance["token_type_ids"] 
instance = build_input_from_segments(persona, history, reply, tokenizer)

### ασυμβίβαστα μεταξύ τους στο input_ids βγαίνει ότι το reply το είπε ο speaker1, ενώ στο token_type_ids ότι το είπε ο speaker2 (μάλλον για τον κώδικα του medium μόνο)

lm_targets = ([-1] * sum(len(s) for s in sequence[:-1])) \
             + [-1] + tokenizer.convert_tokens_to_ids(sequence[-1][1:])

lm_targets # στα labels tou language model έχουν τιμές μόνο τα tokens του reply.
lm_distractor = [-1] * len(instance["input_ids"])
lm_distractor

5
[[40478], [40481, 249, 1048, 6702], [40482, 498, 1385, 512, 640], [40481, 488, 249, 1074, 12361, 15354, 504, 481, 3361], [40482, 249, 2518, 512, 1074, 246, 5358, 500, 481, 3361, 40479]]


In [371]:
instance['lm_labels']

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 249,
 2518,
 512,
 1074,
 246,
 5358,
 500,
 481,
 3361,
 40479]