In [1]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, Masking
from keras.layers import add, dot, concatenate
from keras.layers import GRU, BatchNormalization, Concatenate, RepeatVector, LSTM
from keras.layers.wrappers import Bidirectional
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from functools import reduce
import tarfile
import numpy as np
import re
import os


def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    The sentence is split on runs of non-word characters. The separators are
    kept (the pattern uses a capturing group), every piece is stripped of
    surrounding whitespace, and pieces that end up empty are dropped.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    # Raw string: a backslash-W inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    pieces = (piece.strip() for piece in re.split(r'(\W+)', sent))
    return [piece for piece in pieces if piece]


def parse_stories(lines, only_supporting=False,n=True):
    '''Parse stories provided in the bAbi tasks format.

    lines: iterable of text lines.
    only_supporting: if true, only the sentences that support the answer
        are kept for each question; otherwise every sentence seen so far in
        the current story is kept. Requires the numbered format (n=True),
        because the supporting ids come from the tab-separated question line.
    n: if true (default) every line has the form "<id> <text>", an id of 1
        starts a new story, and a question line is
        "<question>\\t<answer>\\t<supporting ids>". If false, lines are plain
        unnumbered text, a line containing '?' is a question without an
        answer, and the story is reset after each question.

    Returns a list of (substory, question, answer) tuples when n is true and
    of (substory, question) tuples otherwise; substory is a list of tokenized
    sentences, question and answer are token lists.

    Raises ValueError when only_supporting is requested with n=False.
    '''
    if only_supporting and not n:
        # Previously this combination failed later with a NameError.
        raise ValueError('only_supporting=True requires the numbered format (n=True)')
    data = []
    story = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines carry no sentence and no line id; skipping them
            # avoids an unpacking error in the numbered format.
            continue
        if n:
            nid, line = line.split(' ', 1)
            if int(nid) == 1:
                story = []
        if '?' in line:
            if n:
                q, a, supporting = line.split('\t')
                # Comma-separated answers become several answer tokens.
                a = tokenize(a.replace(',',' '))
            else:
                q = line
            q = tokenize(q)
            if only_supporting:
                # Only select the related substory (ids are 1-based line numbers)
                substory = [story[int(i) - 1] for i in supporting.split()]
            else:
                # Provide all the substories, skipping question placeholders
                substory = [x for x in story if x]
            if n:
                data.append((substory, q, a))
                # Placeholder keeps story indices aligned with line ids.
                story.append('')
            else:
                data.append((substory, q))
                story = []
        else:
            story.append(tokenize(line))
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file,
    retrieve the stories,
    and then convert the sentences into a single story.

    f: path of a file in the numbered bAbi format.
    only_supporting: forwarded to parse_stories.
    max_length: if supplied (and non-zero), only stories with fewer than
        max_length tokens are kept; longer ones are discarded.

    Returns a list of (story_tokens, question_tokens, answer_tokens) tuples.
    '''
    with open(f) as file:
        parsed = parse_stories(file.readlines(), only_supporting=only_supporting)
    data = []
    for story, q, answer in parsed:
        # Flatten once per story. Unlike reduce() without an initial value,
        # this is linear and also works when the story has no sentences.
        tokens = [token for sentence in story for token in sentence]
        if not max_length or len(tokens) < max_length:
            data.append((tokens, q, answer))
    return data

def zarr(w,l):
    '''Return a zero array of shape l whose entries selected by index w are set to 1.'''
    indicator = np.zeros(l)
    indicator[w] = 1
    return indicator

def vectorize_stories(data, word_idx, story_maxlen, query_maxlen, answer_maxlen,n=False):
    '''Convert tokenized stories into padded integer arrays.

    data: iterable of (story, query, answer) token-list tuples, or of
        (story, query) tuples when n is true.
    word_idx: mapping from word to integer id.
    story_maxlen, query_maxlen, answer_maxlen: maxlen values passed to
        pad_sequences for the respective field.
    n: if true, the samples carry no answer and only stories and queries
        are vectorized.

    Returns (stories, queries, answers) as produced by pad_sequences, or
    (stories, queries) when n is true.

    Raises KeyError naming the field and the word when a token is missing
    from word_idx.
    '''
    def encode(tokens, field):
        # Translate one token list to ids, reporting which field held the
        # unknown word instead of a bare KeyError.
        try:
            return [word_idx[w] for w in tokens]
        except KeyError as err:
            raise KeyError('%s contains a word that is not in the vocabulary: %r'
                           % (field, err.args[0])) from None

    X = []
    Xq = []
    Y = []
    if n:
        for story, query in data:
            X.append(encode(story, 'story'))
            Xq.append(encode(query, 'query'))
        return (pad_sequences(X, maxlen=story_maxlen),
                pad_sequences(Xq, maxlen=query_maxlen))
    for story, query, answer in data:
        X.append(encode(story, 'story'))
        Xq.append(encode(query, 'query'))
        Y.append(encode(answer, 'answer'))
    return (pad_sequences(X, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=query_maxlen),
            pad_sequences(Y, maxlen=answer_maxlen))

# Maps a short challenge key to the stem of its data file name.
# Keys "1".."20" are the numbered bAbI tasks; "sh1".."sh20" point at the same
# stems under ../shuffled/; the remaining keys are special data sets.
challenges = {
    str(number): stem
    for number, stem in enumerate((
        "qa1_single-supporting-fact",
        "qa2_two-supporting-facts",
        "qa3_three-supporting-facts",
        "qa4_two-arg-relations",
        "qa5_three-arg-relations",
        "qa6_yes-no-questions",
        "qa7_counting",
        "qa8_lists-sets",
        "qa9_simple-negation",
        "qa10_indefinite-knowledge",
        "qa11_basic-coreference",
        "qa12_conjunction",
        "qa13_compound-coreference",
        "qa14_time-reasoning",
        "qa15_basic-deduction",
        "qa16_basic-induction",
        "qa17_positional-reasoning",
        "qa18_size-reasoning",
        "qa19_path-finding",
        "qa20_agents-motivations",
    ), start=1)
}
challenges.update({
    "MCTest": "MCTest",
    "19changed": "19changed",
    "joint": "all_shuffled",
})
# Shuffled variants reuse the numbered stems, so they are derived from them.
challenges.update({
    "sh" + key: "../shuffled/" + challenges[key]
    for key in [str(number) for number in range(1, 21)]
})
challenges["all"] = "all"

Using TensorFlow backend.


In [2]:
# Select the data set: the challenge key is looked up in `challenges` and the
# resulting stem is completed with the split name ('train' / 'test').
challenge_type = 'all'
challenge = challenges[challenge_type]+'_{}.txt'

DATA_DIR='dmn_simple/data/babi/en-10k'

print('Extracting stories for the challenge:', challenge_type)
train_stories = get_stories(os.path.join(DATA_DIR,challenge.format('train')),max_length=200)
test_stories = get_stories(os.path.join(DATA_DIR,challenge.format('test')),max_length=200)

# Concatenate the two splits once; the vocabulary and the maximum lengths
# below are all computed over both of them.
all_stories = train_stories + test_stories

vocab = set()
for story, q, answer in all_stories:
    vocab.update(story, q, answer)
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
story_maxlen = max(len(tokens) for tokens, _, _ in all_stories)
query_maxlen = max(len(tokens) for _, tokens, _ in all_stories)
answer_maxlen = max(len(tokens) for _, _, tokens in all_stories)

print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training stories:', len(train_stories))
print('Number of test stories:', len(test_stories))
print('-')
print('Here\'s what a "story" tuple looks like (input, query, answer):')
print(train_stories[0])
print('-')
print('Vectorizing the word sequences...')

# Word ids start at 1 because id 0 is the padding value.
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
inputs_train, queries_train, answers_train = vectorize_stories(train_stories,
                                                               word_idx,
                                                               story_maxlen,
                                                               query_maxlen,
                                                               answer_maxlen)
# One-hot encode every answer position separately, so each sample becomes an
# (answer_maxlen, vocab_size) matrix.
answers_train=np.array([to_categorical(row,vocab_size) for row in answers_train])
inputs_test, queries_test, answers_test = vectorize_stories(test_stories,
                                                            word_idx,
                                                            story_maxlen,
                                                            query_maxlen,
                                                            answer_maxlen)
answers_test=np.array([to_categorical(row,vocab_size) for row in answers_test])
print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
# The answers are sequences of one-hot vectors, not a single one-hot vector.
print('answers: binary (1 or 0) tensor of shape (samples, answer_maxlen, vocab_size)')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)
print('-')
print('Compiling...')

Extracting stories for the challenge: all
-
Vocab size: 175 unique words
Story max length: 199 words
Query max length: 12 words
Number of training stories: 191714
Number of test stories: 17164
-
Here's what a "story" tuple looks like (input, query, answer):
(['Fred', 'is', 'either', 'in', 'the', 'school', 'or', 'the', 'park', '.', 'Mary', 'went', 'back', 'to', 'the', 'office', '.'], ['Is', 'Mary', 'in', 'the', 'office', '?'], ['yes'])
-
Vectorizing the word sequences...
-
inputs: integer tensor of shape (samples, max_length)
inputs_train shape: (191714, 199)
inputs_test shape: (17164, 199)
-
queries: integer tensor of shape (samples, max_length)
queries_train shape: (191714, 12)
queries_test shape: (17164, 12)
-
answers: binary (1 or 0) tensor of shape (samples, vocab_size)
answers_train shape: (191714, 3, 175)
answers_test shape: (17164, 3, 175)
-
Compiling...


In [3]:
GLOVE_DIR='dmn_simple/data/glove/glove.6B/'
EMBEDDING_DIM=100

# Read the pre-trained GloVe vectors into a word -> vector dictionary.
# The file name is derived from EMBEDDING_DIM so the two cannot drift apart,
# and the file is opened as UTF-8 inside a context manager so it is closed
# even if parsing fails.
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Row i of the matrix holds the vector of the word with id i; row 0 (padding)
# stays all-zero.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # glove.6B is an uncased vocabulary, so capitalised tokens such as
        # names or sentence-initial words only match in lower case.
        embedding_vector = embeddings_index.get(word.lower())
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [4]:
# placeholders
input_sequence = Input((story_maxlen,))
question = Input((query_maxlen,))

hid_dim=50

# Story encoder: pre-trained embeddings followed by a bidirectional GRU that
# returns one output per story position.
# NOTE(review): Masking is applied to the raw integer ids and the Embedding is
# not created with mask_zero=True, so it is unclear whether padded positions
# are actually masked downstream -- confirm before relying on it.
context_rec=Masking()(input_sequence)
# output_dim must match the width of embedding_matrix, hence EMBEDDING_DIM.
context_rec=Embedding(input_dim=vocab_size,output_dim=EMBEDDING_DIM,weights=[embedding_matrix])(context_rec)
context_rec=BatchNormalization()(context_rec)
context_rec=Bidirectional(GRU(hid_dim,return_sequences=True),merge_mode='concat')(context_rec)
context_rec=BatchNormalization()(context_rec)
context_rec=Dropout(0.15)(context_rec)

# Question encoder: same layer stack, but only the final GRU output is kept.
question_rec=Masking()(question)
question_rec=Embedding(input_dim=vocab_size,output_dim=EMBEDDING_DIM,weights=[embedding_matrix])(question_rec)
question_rec=BatchNormalization()(question_rec)
question_rec=Bidirectional(GRU(hid_dim),merge_mode='concat')(question_rec)
question_rec=BatchNormalization()(question_rec)
# Repeat the question vector once per story position so it can be
# concatenated with the story encoding step by step.
question_rec=RepeatVector(story_maxlen)(question_rec)
question_rec=Dropout(0.15)(question_rec)

answer=concatenate([context_rec,question_rec])

# Answer decoder: summarise the combined sequence into one vector, repeat it
# for every answer position and decode a word distribution per position.
answer=GRU(hid_dim)(answer)
answer=BatchNormalization()(answer)
answer=RepeatVector(answer_maxlen)(answer)
answer=Dropout(0.2)(answer)
answer=GRU(hid_dim,return_sequences=True)(answer)
answer=BatchNormalization()(answer)
answer=Dropout(0.2)(answer)
answer = Dense(vocab_size)(answer)
answer = Activation('softmax')(answer)

# build the final model
model = Model([input_sequence, question], answer)
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])

In [5]:
# Resume from a previously saved checkpoint when one exists. On a fresh
# "Restart & Run All" the file is only written by the save cell further down,
# so a missing file is reported instead of aborting the notebook here.
if os.path.exists("weights.h5"):
    model.load_weights("weights.h5")
else:
    print("weights.h5 not found; continuing with the freshly built model")

In [None]:
# Train on the vectorised training stories; the test split is passed as
# validation data and evaluated after every epoch.
model.fit(x=[inputs_train, queries_train],
          y=answers_train,
          validation_data=([inputs_test, queries_test], answers_test),
          epochs=500,
          batch_size=2000)

Train on 191714 samples, validate on 17164 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500

In [96]:
# Persist the current weights under the same file name that the
# load_weights cell reads.
model.save_weights(filepath="weights.h5")

In [6]:
# Inverse vocabulary (integer id -> word), used to turn predictions back
# into text.
voc = dict((index, word) for word, index in word_idx.items())
# Id 0 is the padding value and decodes to an empty string.
voc[0] = ""

def ans(st):
    '''Answer the question contained in a plain-text story.

    st: unnumbered sentences separated by newlines; a line containing '?'
        is treated as the question.

    Returns the words predicted for the first question, joined by single
    spaces (padding positions are left out).

    Raises ValueError when st contains no question line.
    '''
    parsed = parse_stories(st.split("\n"),n=False)
    if not parsed:
        raise ValueError("no question (a line containing '?') found in the story text")
    # A nested comprehension, unlike reduce() without an initial value, also
    # works for a question that has no preceding sentences.
    tst=[([token for sentence in story for token in sentence], q) for story, q in parsed]
    inputs_val, queries_val = vectorize_stories(tst,word_idx,story_maxlen,query_maxlen,answer_maxlen,n=True)
    res=model.predict([inputs_val,queries_val])
    # Most likely word id per answer position of the first sample.
    words = (voc[index] for index in np.argmax(res,axis=2)[0])
    # Dropping the empty padding words avoids doubled spaces inside the answer.
    return " ".join(word for word in words if word)

In [7]:
st="""Mary went to the kitchen.
Mary got the apple there.
Mary went to the bathroom.
Mice are in a rectangle.
John took the milk.
John went to the hallway.
Mary discarded the apple.
John took the apple there.
John went to the garden.
Cats are red.
John is a pink rectangle.
John picked up the football.
What is John carrying?"""

In [8]:
st="""Mary is red.
Mary is a rectangle.
Mary went to Fred.
What color is Mary?"""

In [9]:
# Ask the model the question in `st`; the bare call is the cell's last
# expression, so the notebook displays the returned answer string.
ans(st)

'garden'

In [10]:
# Loss and accuracy (the metric configured in compile) on the test stories.
model.evaluate(x=[inputs_test, queries_test], y=answers_test, batch_size=1500)



[0.095633176408447748, 0.97586024678467431]

In [11]:
# Loss and accuracy (the metric configured in compile) on the training stories.
model.evaluate(x=[inputs_train, queries_train], y=answers_train, batch_size=1500)



[0.016675329478823154, 0.99338765165340381]