In [74]:
import re
import numpy as np
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Embedding, Input, LSTM, Lambda, Reshape, Dense, dot, Activation
import keras.backend as K
from keras.models import Model
from keras.optimizers import RMSprop

## Data Prep

In [2]:
# Relative path to the qa1 single-supporting-fact training file.
# NOTE(review): get_data() below builds its own paths from a template and does
# not read this module-level variable.
input_data_file = 'data/tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_train.txt'

In [3]:
def get_stories(input_data_file):
    """Parse a bAbI-style task file into (story, question, answer) samples.

    Every non-blank line is expected to look like ``"<id> <text>"``. A line
    whose id is 1 starts a new story. A line containing a tab character is a
    question line whose first two tab-separated fields are the question and
    the answer; any further fields are ignored. Every other line is a
    statement and is appended to the running story *including* its leading id.

    Parameters
    ----------
    input_data_file : str
        Path of the text file to read.

    Returns
    -------
    list of tuple
        One ``(story_so_far, question, answer)`` tuple per question line.
        ``story_so_far`` is a fresh list of the statement lines seen since the
        current story began; ``question`` and ``answer`` are stripped strings.
        Question lines whose question or answer is empty are dropped.

    Raises
    ------
    ValueError
        If a non-blank line has no space-separated id or the id is not an
        integer.
    """
    data = []
    # Bound up front so a file that does not begin with id 1 cannot hit a
    # NameError on the first statement/question line.
    story = []
    with open(input_data_file) as f:
        for line in f:
            # Blank lines carry no id; skip them instead of failing to unpack.
            if not line.strip():
                continue

            sid, sline = line.split(' ', 1)

            if int(sid) == 1:
                story = []

            if '\t' in line:
                fields = sline.split('\t')
                q, a = fields[0].strip(), fields[1].strip()
                if q and a:
                    # Snapshot the story: later statements must not leak into
                    # samples that were emitted earlier.
                    data.append((list(story), q, a))
            else:
                story.append(line.strip())

    return data

In [4]:
def flatten_flag(el):
    """Return True when ``el`` is a string, i.e. a leaf that must not be descended into."""
    is_leaf = isinstance(el, str)
    return is_leaf

def flatten_fn(l):
    """Lazily yield every string found in an arbitrarily nested iterable.

    Strings are yielded in depth-first, left-to-right order; any element that
    is not a string is iterated in turn.
    """
    # Explicit stack of iterators instead of recursion: the iterator on top is
    # the container currently being walked.
    pending = [iter(l)]
    while pending:
        for item in pending[-1]:
            if flatten_flag(item):
                yield item
            else:
                # Descend; the parent iterator keeps its position on the stack.
                pending.append(iter(item))
                break
        else:
            # Current container exhausted: resume its parent.
            pending.pop()
            
def tokenize(unique_sents):
    """Split each sentence into word and punctuation tokens.

    Every sentence is split on single non-word characters, which are kept as
    tokens of their own; empty strings and single spaces are discarded.

    Parameters
    ----------
    unique_sents : iterable of str
        Sentences to tokenize.

    Returns
    -------
    list of str
        Tokens of all sentences concatenated in iteration order (duplicates
        are kept).
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions. Compiled once, outside the loop.
    splitter = re.compile(r'(\W+?)')
    ignored = ('', ' ')
    vocab = []
    for sent in unique_sents:
        vocab.extend(tok for tok in splitter.split(sent) if tok not in ignored)
    return vocab

def get_vocab(all_stories):
    """Return the set of distinct tokens found in any string nested inside ``all_stories``."""
    # De-duplicate the raw strings first so each distinct string is tokenized once.
    distinct_strings = {text for text in flatten_fn(all_stories)}
    return set(tokenize(distinct_strings))

In [5]:
def vectorize_stories(all_stories, word2ind, max_story_len, max_query_len):
    """Convert (story, query, answer) samples into index sequences.

    Parameters
    ----------
    all_stories : iterable of (list of str, str, str)
        Samples as (story lines, query, answer).
    word2ind : mapping
        Token -> integer index; every token produced by ``tokenize`` must be a key.
    max_story_len : int
        Length passed to ``pad_sequences`` for each story line.
    max_query_len : int
        Length passed to ``pad_sequences`` for each query.

    Returns
    -------
    tuple
        ``(padded_stories, padded_queries, answers)``: a list with one
        ``pad_sequences`` result per story, the padded queries as an array,
        and the answer index lists as an array.
    """
    def to_indices(text):
        # Tokenize a single string and look every token up in the vocabulary.
        return [word2ind[token] for token in tokenize([text])]

    story_ids, query_ids, answer_ids = [], [], []
    for story_lines, query, answer in all_stories:
        story_ids.append([to_indices(sentence) for sentence in story_lines])
        query_ids.append(to_indices(query))
        answer_ids.append(to_indices(answer))

    # Stories are padded one at a time because each holds a different number of lines.
    padded_stories = [pad_sequences(lines, max_story_len) for lines in story_ids]
    padded_queries = np.array(pad_sequences(query_ids, max_query_len))

    return padded_stories, padded_queries, np.array(answer_ids)

def stack(stories, max_num_story, max_story_len):
    """Pack variable-length stories into one zero-padded 3-D array.

    Parameters
    ----------
    stories : sequence
        One entry per story; each entry is a sequence of rows (2-D array or
        list of equal-length lists) of width ``max_story_len``. An entry may
        be empty.
    max_num_story : int
        Number of row slots reserved per story.
    max_story_len : int
        Width of every row.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(stories), max_num_story, max_story_len)``.
        The rows of story ``i`` occupy ``output[i, :len(story)]``; the
        remaining rows are zero.

    Raises
    ------
    ValueError
        If a story has more than ``max_num_story`` rows or its rows are not
        all of width ``max_story_len``.
    """
    output = np.zeros((len(stories), max_num_story, max_story_len))
    for i, s in enumerate(stories):
        n_rows = len(s)
        if n_rows == 0:
            # A story without any lines stays all-zero (previously this crashed
            # when stacking an empty array onto the zero placeholder).
            continue

        s_arr = np.asarray(s)
        # Validate explicitly: plain slice assignment would silently broadcast
        # a width-1 row across the whole line.
        if n_rows > max_num_story or s_arr.shape != (n_rows, max_story_len):
            raise ValueError(
                "story {} has shape {}, expected at most {} rows of width {}".format(
                    i, s_arr.shape, max_num_story, max_story_len))

        # Write in place; the tail of output[i] is already zero.
        output[i, :n_rows] = s_arr

    return output

In [6]:
def get_data(challenge):
    """Load the train/test files of one challenge and vectorize them.

    Parameters
    ----------
    challenge : str
        Inserted into the file-name template, e.g. ``'single'``.

    Returns
    -------
    tuple
        ``(word2ind, vocab, max_num_story, max_story_len, max_query_len,
        train_stories, train_queries, train_answers,
        test_stories, test_queries, test_answers,
        train_data, test_data)``.
    """
    path_template = 'data/tasks_1-20_v1-2/en-10k/qa1_{0}-supporting-fact_{1}.txt'

    train_data = get_stories(path_template.format(challenge, 'train'))
    test_data = get_stories(path_template.format(challenge, 'test'))

    # Sizes and vocabulary are computed over train and test together so both
    # splits share the same padding lengths and index mapping.
    all_stories = train_data + test_data

    max_num_story = max(len(story) for story, _, _ in all_stories)
    max_story_len = max(
        len(tokenize([sentence]))
        for story, _, _ in all_stories
        for sentence in story
    )
    max_query_len = max(len(tokenize([query])) for _, query, _ in all_stories)

    # Index 0 is reserved for the padding token.
    vocab = ['<PAD>'] + sorted(get_vocab(all_stories))
    word2ind = dict(zip(vocab, range(len(vocab))))

    def encode(samples):
        # Vectorize one split, then pack its stories into a single 3-D array.
        stories, queries, answers = vectorize_stories(
            samples, word2ind, max_story_len, max_query_len)
        return stack(stories, max_num_story, max_story_len), queries, answers

    train_stories, train_queries, train_answers = encode(train_data)
    test_stories, test_queries, test_answers = encode(test_data)

    return (word2ind, vocab, max_num_story, max_story_len, max_query_len,
            train_stories, train_queries, train_answers,
            test_stories, test_queries, test_answers,
            train_data, test_data)

In [7]:
# Load and vectorize the 'single' supporting-fact challenge; every later cell
# reads these module-level names.
(word2ind, vocab, max_num_story, max_story_len, max_query_len,
train_stories, train_queries, train_answers,
test_stories, test_queries, test_answers,
train_data, test_data) = get_data('single')

# Displayed as the cell's result: story slots per sample and tokens per story line.
max_num_story, max_story_len

(10, 8)

In [8]:
# Sanity check: print the array shapes returned by get_data for both splits.
print('Train data - stories, queries, ans', train_stories.shape, train_queries.shape, train_answers.shape)
print('Test data - stories, queries, ans', test_stories.shape, test_queries.shape, test_answers.shape)

Train data - stories, queries, ans (10000, 10, 8) (10000, 4) (10000, 1)
Test data - stories, queries, ans (1000, 10, 8) (1000, 4) (1000, 1)


In [20]:
# Print the vocabulary, then show its size as the cell's displayed value.
# (Previously `print(vocab), len(vocab)` built a tuple whose first element was
# print's return value, so the cell displayed `(None, 32)`.)
print(vocab)
len(vocab)

['<PAD>', '.', '1', '10', '11', '13', '14', '2', '4', '5', '7', '8', '?', 'Daniel', 'John', 'Mary', 'Sandra', 'Where', 'back', 'bathroom', 'bedroom', 'garden', 'hallway', 'is', 'journeyed', 'kitchen', 'moved', 'office', 'the', 'to', 'travelled', 'went']


(None, 32)

## Creating the Model - 1 fact

In [10]:
# Width of each embedding vector for the first model.
EMBEDDING_DIM = 15
# Size of the index space for the Embedding/Dense layers; vocab includes the
# '<PAD>' entry that get_data inserts at index 0.
vocab_size = len(vocab)

In [11]:
# Story input: max_num_story rows, each holding max_story_len token indices.
input_story_ = Input(shape=(max_num_story, max_story_len))
# Look up an EMBEDDING_DIM-wide vector for every token index.
embedded_story = Embedding(input_dim = vocab_size, output_dim = EMBEDDING_DIM)(input_story_)

# treating each story line like a "bag of words"
# (summing over axis 2 collapses the token axis, leaving one vector per story line)
embedded_story = Lambda(lambda x: K.sum(x, axis=2))(embedded_story)

print("input_story_.shape, embedded_story.shape:", input_story_.shape, embedded_story.shape)




input_story_.shape, embedded_story.shape: (?, 10, 8) (?, 10, 15)


In [40]:
# Query input: max_query_len token indices per sample.
# NOTE(review): the output recorded under this cell shows an embedding width of
# 30, while EMBEDDING_DIM is 15 at this position in the notebook; the execution
# count (40) suggests the cell was re-run after EMBEDDING_DIM was later set to
# 30. Re-run the notebook top to bottom to confirm the shapes.
input_query_ = Input(shape=(max_query_len, ))
# The query gets its own Embedding layer, separate from the story's.
embedded_query = Embedding(input_dim = vocab_size, output_dim = EMBEDDING_DIM)(input_query_)
print(embedded_query.shape)
# treating each query like a "bag of words"
# (summing over axis 1 collapses the token axis, leaving one vector per query)
embedded_query = Lambda(lambda x: K.sum(x, axis=1))(embedded_query)

print("input_query_.shape, embedded_query.shape:", input_query_.shape, embedded_query.shape)

(?, 4, 30)
input_query_.shape, embedded_query.shape: (?, 4) (?, 30)


In [13]:
# for dot product, add dimension to query
embedded_query = Reshape((1, EMBEDDING_DIM))(embedded_query)
print("embedded_query.shape:", embedded_query.shape)

embedded_query.shape: (?, 1, 15)


In [14]:
# Score every story line against the query: dot product over axis 2 (the
# embedding axis) of both tensors.
x = dot([embedded_story, embedded_query], axes=2)
print(x.shape)
# Drop the trailing singleton axis so the softmax runs across the story lines.
x = Reshape((max_num_story, ))(x)
print(x.shape)
x = Activation('softmax')(x)
# Restore a trailing axis so the weights can be dotted with embedded_story later.
weights = Reshape((max_num_story, 1))(x)
print(weights.shape)

(?, 10, 1)
(?, 10)
(?, 10, 1)


In [15]:
# Weighted sum of the story-line embeddings: dot product over axis 1 (the
# story-line axis) of weights and embedded_story.
x = dot([weights, embedded_story], 1)
print(x.shape)
# Drop the singleton axis left by the dot product.
x = Reshape((EMBEDDING_DIM, ))(x)
print(x.shape)
# Probability distribution over the vocabulary for the answer word.
ans = Dense(vocab_size, activation='softmax')(x)

(?, 1, 15)
(?, 15)


In [16]:
# First model: (story, query) -> distribution over vocabulary words.
model = Model([input_story_, input_query_], ans)

# Sparse loss because the targets (train_answers) hold integer word indices
# rather than one-hot vectors.
model.compile(optimizer='adam',
             loss = 'sparse_categorical_crossentropy',
             metrics=['accuracy'])





In [17]:
# Train the first model; the test split is passed as validation data.
model.fit([train_stories, train_queries], train_answers, batch_size=32, epochs=20,
         validation_data=([test_stories, test_queries], test_answers))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 10000 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1389ee4d0>

In [18]:
# Inverse of word2ind: integer index -> token, used to decode predicted indices.
ind2word = {v:k for k,v in word2ind.items()}

In [31]:
# Check how we weight each input sentence given a story and question
debug_model = Model([input_story_, input_query_], weights)

# choose a random story
story_idx = np.random.choice(len(train_data))

# get weights from debug model
i = train_stories[story_idx:story_idx+1]
q = train_queries[story_idx:story_idx+1]
w = debug_model.predict([i, q]).flatten()

story, question, ans_ = train_data[story_idx]
print("story:\n")
for j, line in enumerate(story):
  print("{:1.5f}".format(w[j]), "\t", " ".join(line))

print("question:", " ".join(question))
print("answer:", ans_)

pred = model.predict([i, q])
ind = pred[0].argmax()
print("Predicted answer:", ind2word[ind])

story:

0.00000 	 1   J o h n   t r a v e l l e d   t o   t h e   o f f i c e .
0.00000 	 2   D a n i e l   w e n t   b a c k   t o   t h e   g a r d e n .
0.00000 	 4   D a n i e l   t r a v e l l e d   t o   t h e   o f f i c e .
0.00000 	 5   J o h n   t r a v e l l e d   t o   t h e   h a l l w a y .
0.00002 	 7   J o h n   j o u r n e y e d   t o   t h e   g a r d e n .
0.99997 	 8   M a r y   m o v e d   t o   t h e   h a l l w a y .
question: W h e r e   i s   M a r y ?
answer: hallway
Predicted answer: hallway


## Creating the Model - 2 fact

In [33]:
# Embedding width for the second model.
# NOTE(review): this rebinds the EMBEDDING_DIM used by the first model's cells;
# re-running any of those cells after this one picks up 30 instead of 15.
EMBEDDING_DIM = 30
vocab_size = len(vocab)
# Displayed as the cell's result.
vocab_size

32

In [44]:
def embedding_fn(emb_input, axis_=2):
    """Embed a tensor of token indices and sum the embeddings over one axis.

    A new Embedding layer (vocab_size -> EMBEDDING_DIM, both read from module
    scope) is created on every call, so separate calls do not share weights.

    Parameters
    ----------
    emb_input : tensor
        Token-index tensor to embed.
    axis_ : int, default 2
        Axis of the embedded tensor that is summed away.

    Returns
    -------
    tensor
        The embedded input with ``axis_`` removed by summation.
    """
    def _sum_over_axis(tensor):
        return K.sum(tensor, axis=axis_)

    embedded = Embedding(input_dim = vocab_size, output_dim = EMBEDDING_DIM)(emb_input)
    print('Intmd shape:', embedded.shape)
    return Lambda(_sum_over_axis)(embedded)

In [45]:
# New story input for the second model (rebinds input_story_ from the first model).
input_story_ = Input(shape=(max_num_story, max_story_len))
print(input_story_.shape)
# Embed and sum over axis 2 (the token axis): one vector per story line.
embedded_story = embedding_fn(input_story_, 2)

print("input_story_.shape, embedded_story.shape:", input_story_.shape, embedded_story.shape)

(?, 10, 8)
Intmd shape: (?, 10, 8, 30)
input_story_.shape, embedded_story.shape: (?, 10, 8) (?, 10, 30)


In [48]:
# New query input for the second model (rebinds input_query_ from the first model).
input_query_ = Input(shape=(max_query_len, ))
print(input_query_.shape)
# Embed and sum over axis 1 (the token axis): one vector per query.
embedded_query = embedding_fn(input_query_, 1)

print("input_query_.shape, embedded_query.shape:", input_query_.shape, embedded_query.shape)

(?, 4)
Intmd shape: (?, 4, 30)
input_query_.shape, embedded_query.shape: (?, 4) (?, 30)


In [49]:
# One Dense layer instance created outside hop(), so every hop() call applies
# the same layer (shared weights) to produce a distribution over the vocabulary.
dense_layer = Dense(vocab_size, activation='softmax')

In [58]:
def hop(story, query):
    """Run one attention step of the query over the story.

    Reads ``EMBEDDING_DIM``, ``max_num_story``, ``input_story_`` and
    ``dense_layer`` from module scope.

    Parameters
    ----------
    story : tensor
        Per-line story embeddings -- presumably (batch, max_num_story,
        EMBEDDING_DIM); TODO confirm against the caller.
    query : tensor
        Query embedding that can be reshaped to (1, EMBEDDING_DIM) per sample.

    Returns
    -------
    tuple
        ``(answer, weights, story_2)``: the output of ``dense_layer``, the
        softmax weights over story lines with a trailing singleton axis, and
        the fresh story embedding created inside this call.
    """
    # for dot product, add dimension to query
    query_3d = Reshape((1, EMBEDDING_DIM))(query)
    scores = dot([story, query_3d], axes=2)
    scores = Reshape((max_num_story, ))(scores)
    attention = Activation('softmax')(scores)
    weights = Reshape((max_num_story, 1))(attention)

    # make new embedding for second calculation
    story_2 = embedding_fn(input_story_, 2)
    pooled = dot([weights, story_2], 1)
    pooled = Reshape((EMBEDDING_DIM, ))(pooled)
    return dense_layer(pooled), weights, story_2

In [59]:
# First hop; embedded_story is rebound to the fresh embedding returned by hop().
# NOTE(review): ans1 is never used afterwards, and the second hop is fed the
# original embedded_query again. As written, weights1 (and the first story
# embedding behind it) is not on the path to ans2, so training model2 on ans2
# cannot update it -- confirm whether the second hop was meant to consume the
# first hop's output.
ans1, weights1, embedded_story = hop(embedded_story, embedded_query)
ans2, weights2, _ = hop(embedded_story, embedded_query)

Intmd shape: (?, 10, 8, 30)
Intmd shape: (?, 10, 8, 30)


In [68]:
# Second model: same two inputs, output is the second hop's answer distribution.
model2 = Model([input_story_, input_query_], ans2)

# Sparse loss because the targets (train_answers) hold integer word indices.
model2.compile(optimizer='adam',
             loss = 'sparse_categorical_crossentropy',
             metrics=['accuracy'])

In [69]:
# Train the second model; the test split is passed as validation data.
model2.fit([train_stories, train_queries], train_answers, batch_size=32, epochs=5,
         validation_data=([test_stories, test_queries], test_answers))

Train on 10000 samples, validate on 1000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x13acda650>

In [73]:
# Check how we weight each input sentence given a story and question
debug_model2 = Model([input_story_, input_query_], [weights1, weights2])

# choose a random story
story_idx = np.random.choice(len(train_data))

# get weights from debug model
i = train_stories[story_idx:story_idx+1]
q = train_queries[story_idx:story_idx+1]
w1, w2 = debug_model2.predict([i, q])
w1 = w1.flatten()
w2 = w2.flatten()

story, question, ans_ = train_data[story_idx]
print("story:\n")
for j, line in enumerate(story):
  print("{:1.5f}".format(w1[j]), "\t", "{:1.5f}".format(w2[j]), "\t", " ".join(line))

print("question:", " ".join(question))
print("answer:", ans_)

pred = model2.predict([i, q])
ind = pred[0].argmax()
print("Predicted answer:", ind2word[ind])

story:

0.11208 	 0.00000 	 1   M a r y   t r a v e l l e d   t o   t h e   k i t c h e n .
0.05016 	 0.00000 	 2   D a n i e l   t r a v e l l e d   t o   t h e   b e d r o o m .
0.06639 	 0.00021 	 4   M a r y   t r a v e l l e d   t o   t h e   b a t h r o o m .
0.06788 	 0.00000 	 5   S a n d r a   t r a v e l l e d   t o   t h e   g a r d e n .
0.10584 	 0.00000 	 7   D a n i e l   m o v e d   t o   t h e   h a l l w a y .
0.17139 	 0.99979 	 8   M a r y   j o u r n e y e d   t o   t h e   k i t c h e n .
question: W h e r e   i s   M a r y ?
answer: kitchen
Predicted answer: kitchen
