In [2]:
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

In [3]:
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

100%|██████████| 478750579/478750579 [00:20<00:00, 23263069.47B/s]
100%|██████████| 656/656 [00:00<00:00, 453064.95B/s]
100%|██████████| 815973/815973 [00:00<00:00, 1273602.56B/s]
100%|██████████| 458495/458495 [00:00<00:00, 829414.60B/s]
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


### 5 specials tokens are used:
- `<bos>` - beginning of sequence
- `<eos>` - end of sequence
- `<user>` - mark all the words uttered by user
- `<bot>` - mark all the words uttered by bot
- `<pad>` - pad token to make all the utterances equal size

In [10]:
SPECIAL_TOKENS = ["<bos>", "<eos>", "<user>", "<bot>", "<pad>"]

#Adding tokens to the vocabulary
tokenizer.set_special_tokens(SPECIAL_TOKENS)
model.set_num_special_tokens(len(SPECIAL_TOKENS))

In [12]:
from itertools import chain
# chain is used to iterate multidimensional list in element wise order without worrying about the indexes.

# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]

history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]

reply = ["great", "to", "hear"]
bos, eos, user, bot = "<bos>", "<eos>", "<user>", "<bot>"

In [34]:
def sample_print(sequence):
    for s in sequence:
        print(s)
        
def build_inputs(persona, history, reply):
    # create one single input sequence by concatenation and adding delimiters
    # one list for persona, n-list of n-utterance in history, one list for reply
    sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    '''
    ['<bos>', 'i', 'like', 'playing', 'football', '.', 'i', 'am', 'from', 'NYC', '.']
    ['hello', 'how', 'are', 'you', '?']
    ['i', 'am', 'fine', 'thanks', '.']
    ['great', 'to', 'hear', '<eos>']
    '''
    #sample_print(sequence)
    # adding <user> and <bot> token, assuming history starts from user and bot and user speaks alternatively 
    # Also reply is just continuation of history
    sequence = [sequence[0]] + [[user if i%2 else bot] + sequence[i] for i in range(1, len(sequence))]
    '''
    ['<bos>', 'i', 'like', 'playing', 'football', '.', 'i', 'am', 'from', 'NYC', '.']
    ['<user>', 'hello', 'how', 'are', 'you', '?']
    ['<bot>', 'i', 'am', 'fine', 'thanks', '.']
    ['<user>', 'great', 'to', 'hear', '<eos>']
    '''
    #sample_print(sequence)
    
    #build word, segments and positions token from sequence
    words = list(chain(*sequence))
    #persona is defined for bot so it belongs to bot segments
    segments = [user if i%2 else bot for i, s in enumerate(sequence) for _ in s]
    position = list(range(len(words)))
    
    '''
    words =    ['<bos>', 'i',     'like',  'playing', 'football', '.',     'i',     'am',    'from',  'NYC',   '.',     '<user>', 'hello',  'how',    'are',    'you',    '?',      '<bot>', 'i',     'am',    'fine',  'thanks', '.',     '<user>', 'great',  'to',     'hear',   '<eos>']
    segments = ['<bot>', '<bot>', '<bot>', '<bot>',   '<bot>',    '<bot>', '<bot>', '<bot>', '<bot>', '<bot>', '<bot>', '<user>', '<user>', '<user>', '<user>', '<user>', '<user>', '<bot>', '<bot>', '<bot>', '<bot>', '<bot>',  '<bot>', '<user>', '<user>', '<user>', '<user>', '<user>']
    position = [0,        1,       2,       3,         4,          5,       6,       7,       8,       9,      10,       11,       12,       13,       14,       15,       16,       17,      18,      19,      20,      21,       22,      23,       24,       25,       26,       27]
    '''
    
    return words, segments, position, sequence

In [39]:
words, segments, position, sequence = build_inputs(persona, history, reply)

#As words need to converted to numebers for processing, tokenizer inbuilt function is used to assign id to each word
words = tokenizer.convert_tokens_to_ids(words)
segments = tokenizer.convert_tokens_to_ids(segments)

In [40]:
print(words)

[40478, 11, 14594, 0, 0, 1, 11, 1574, 0, 0, 1, 40480, 0, 1991, 2183, 7159, 19, 40481, 11, 1574, 0, 12389, 1, 40480, 5201, 571, 863, 40479]


In [41]:
print(segments)

[40481, 40481, 40481, 40481, 40481, 40481, 40481, 40481, 40481, 40481, 40481, 40480, 40480, 40480, 40480, 40480, 40480, 40481, 40481, 40481, 40481, 40481, 40481, 40480, 40480, 40480, 40480, 40480]
