In [1]:
from __future__ import print_function

import io
import json
import re
from functools import reduce
from pprint import pprint

import numpy as np
from keras import backend as K
from keras import layers
from keras.layers import recurrent, Bidirectional, Dense, Flatten, Conv1D, Dropout
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
from nltk.tokenize import word_tokenize

Using TensorFlow backend.


In [2]:
def get_dictionary(vocab):
    """Load a whitespace-separated word-vector text file into a dict.

    Every non-blank line is expected to hold a token followed by the
    components of its vector (the notebook passes a GloVe ``.txt`` file).

    Parameters
    ----------
    vocab : str
        Path to the embeddings file. It is decoded as UTF-8.

    Returns
    -------
    dict
        Maps each token to a 1-D ``float32`` NumPy array. When a token
        appears on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    d = dict()
    # io.open accepts ``encoding`` on both Python 2 and Python 3; an explicit
    # UTF-8 decode avoids depending on the platform's default locale.
    with io.open(vocab, encoding='utf-8') as f:
        # Iterate the file object lazily rather than calling readlines(),
        # so the raw text is never held in memory all at once.
        for line in f:
            values = line.strip().split()
            if not values:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            d[values[0]] = np.asarray(values[1:], dtype='float32')
    return d

In [3]:
# Word -> 300-d GloVe vector lookup used to embed paragraphs and questions.
d = get_dictionary(vocab="glove/glove.6B.300d.txt")

In [4]:
# Read the training set. The file is decoded explicitly as UTF-8 so the
# result does not depend on the platform's default locale encoding.
with io.open('train.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

In [5]:
# Flatten the nested articles -> paragraphs -> questions structure into
# per-question lookups keyed by question id; `ids` keeps the encounter order.
ids = []
titles, contexts, questions = {}, {}, {}
answers_text, answers_start = {}, {}

for article in data:
    paragraphs = article["paragraphs"]
    title = article["title"]
    for paragraph in paragraphs:
        context = paragraph["context"]
        qas = paragraph["qas"]
        for qa in qas:
            # Read every field first so a malformed entry fails before any
            # of the lookups is touched.
            id_ = qa["id"]
            answer = qa["answer"]
            question = qa["question"]

            ids.append(id_)
            titles[id_] = title
            contexts[id_] = context
            # `answer_start` is used later as a character offset into `context`.
            answers_start[id_] = answer["answer_start"]
            answers_text[id_] = answer["text"]
            questions[id_] = question

In [6]:
# Fixed tensor sizes: token slots per paragraph, token slots per question,
# and the width of each word vector (the 300-d GloVe file loaded earlier).
max_para, max_q, dimension = 600, 50, 300
# Number of question/answer examples gathered in `ids`.
train_len = len(ids)

In [7]:
# Zero-initialised embedding tensors, one row per example:
#   paras -> (train_len, max_para, dimension), qns -> (train_len, max_q, dimension).
# np.zeros defaults to float64; the word vectors are parsed as float32 and the
# Keras inputs are declared float32, so allocating float32 here halves the
# memory footprint without changing any value that reaches the model.
paras = np.zeros((train_len, max_para, dimension), dtype='float32')
qns = np.zeros((train_len, max_q, dimension), dtype='float32')

In [8]:
# Fill the pre-allocated tensors with the word vector of every known token.
# Tokens missing from `d` keep their all-zero row, and only the first
# `max_para - 1` / `max_q - 1` positions are ever written, so the final slot
# of each sequence always stays zero.
for i in range(train_len):
    if i % 1000 == 0:
        print(i)  # coarse progress indicator

    example_id = ids[i]
    words = word_tokenize(contexts[example_id].lower())
    qs = word_tokenize(questions[example_id].lower())

    # zip() stops at the shorter of the position range and the token list.
    for j, token in zip(range(max_para - 1), words):
        if token in d:
            paras[i, j] = d[token]
    for j, token in zip(range(max_q - 1), qs):
        if token in d:
            qns[i, j] = d[token]

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000


KeyboardInterrupt: 

In [9]:
def create_one_hot_answer(para, answer, answer_start, option, max_length):
    """Encode an answer boundary as a one-hot vector over token positions.

    The character offset of the boundary is converted to a token index by
    tokenizing the paragraph text that precedes it and counting the tokens.

    Parameters
    ----------
    para : str
        Paragraph text.
    answer : str
        Answer text; only its length is used, to locate the answer's end.
    answer_start : int
        Character offset in ``para`` where the answer begins.
    option : str
        ``"s"`` marks the first token of the answer; any other value marks
        its last token.
    max_length : int
        Length of the returned vector. Token indices past the end are
        clamped to ``max_length - 1``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``max_length`` with a single 1 at the boundary
        token index and 0 elsewhere.
    """
    if option == "s":
        prefix = para[:answer_start]
    else:
        prefix = para[:answer_start + len(answer)]
    n_tokens = len(word_tokenize(prefix))

    # Start: the answer's first token follows the n_tokens prefix tokens.
    # End: the answer's last token is the last token of the prefix.
    index = n_tokens if option == "s" else n_tokens - 1

    # Clamp against the requested vector length (not a module-level constant)
    # and never below 0, so an empty prefix cannot wrap around to the last slot.
    index = max(0, min(max_length - 1, index))

    one_hot = np.zeros(max_length)
    one_hot[index] = 1
    return one_hot
    

In [10]:
# One-hot start-token target for every example, in `ids` order.
ans_starts = [
    create_one_hot_answer(contexts[qid], answers_text[qid], answers_start[qid], "s", max_para)
    for qid in ids
]

In [11]:
# One-hot end-token target for every example, in `ids` order.
ans_ends = [
    create_one_hot_answer(contexts[qid], answers_text[qid], answers_start[qid], "e", max_para)
    for qid in ids
]

In [48]:
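# NOTE: disabled experiment kept for reference; `create_bin_answer` and `ind` are not defined in the cells shown here.
# ans = [create_bin_answer(contexts[ids[ind[i]]], answers_text[ids[ind[i]]], answers_start[ids[ind[i]]], max_para) for i in range(len(ind))]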

In [12]:

# --- Hyper-parameters -------------------------------------------------------
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 300
BATCH_SIZE = 128
EPOCHS = 2


def _span_head(features, output_name):
    """Build one answer-boundary classifier on top of `features`.

    A small recurrent layer is flattened and passed through two
    ReLU + dropout blocks, ending in a softmax over the `max_para`
    paragraph positions. `output_name` names the final layer.
    """
    head = RNN(EMBED_HIDDEN_SIZE // 15, return_sequences=True)(features)
    head = Flatten()(head)
    for _block in range(2):
        head = Dense(max_para, activation='relu')(head)
        head = Dropout(.5)(head)
    return Dense(max_para, activation='softmax', name=output_name)(head)


# --- Encoders ---------------------------------------------------------------
# Inputs are pre-embedded token sequences: (max_para, dimension) for the
# paragraph and (max_q, dimension) for the question.
sentence = layers.Input(shape=(max_para, dimension), dtype='float32')
encoded_sentence = RNN(EMBED_HIDDEN_SIZE, return_sequences=True)(sentence)
print(encoded_sentence.shape)

question = layers.Input(shape=(max_q, dimension), dtype='float32')
encoded_question = RNN(EMBED_HIDDEN_SIZE, return_sequences=True)(question)

# --- Paragraph/question attention --------------------------------------------
# Affinity between every paragraph state and every question state, computed
# in both orientations and passed through a softmax.
# NOTE(review): each softmax normalises over the last axis, while the dot
# products that consume A_Q / A_D below contract axis 1, i.e. the weights are
# not normalised along the axis being summed. Confirm this is intended.
merge_1 = layers.Dot(axes=2)([encoded_sentence, encoded_question])
A_Q = layers.Activation("softmax")(merge_1)
merge_2 = layers.Dot(axes=2)([encoded_question, encoded_sentence])
A_D = layers.Activation("softmax")(merge_2)

# Question-side context, concatenated with the question encoding ...
C_Q = layers.Dot(axes=1)([A_Q, encoded_sentence])
C_Q = layers.Concatenate(axis=2)([encoded_question, C_Q])
# ... then projected back onto paragraph positions and joined with the
# paragraph encoding.
C_D = layers.Dot(axes=1)([A_D, C_Q])
C_ = layers.Concatenate(axis=2)([encoded_sentence, C_D])

# --- Fusion and output heads --------------------------------------------------
U = Bidirectional(RNN(EMBED_HIDDEN_SIZE, return_sequences=True))(C_)

# The two heads share the architecture but not the weights.
start = _span_head(U, 'output_1')
end = _span_head(U, 'output_2')

(?, ?, 300)


In [25]:
# Two-input / two-output model: (paragraph, question) -> (start, end)
# distributions. Both heads are trained with categorical cross-entropy.
model = Model(inputs=[sentence, question], outputs=[start, end])
model.compile(
    optimizer='adam',
    loss={head: 'categorical_crossentropy' for head in ('output_1', 'output_2')},
    metrics=['accuracy'],
)

In [29]:
# Stack the per-example one-hot vectors into 2-D target arrays for Keras.
ans_s, ans_e = (np.array(targets) for targets in (ans_starts, ans_ends))

In [31]:
print('Training')
# Two rounds of EPOCHS epochs each; Keras holds out the last 20% of the
# samples for validation in every round.
for i in range(2):
    model.fit(
        x=[paras, qns],
        y=[ans_s, ans_e],
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=0.2,
    )
    # Optional checkpoint after each round (disabled):
#     model.save("/home/users/nus/a0112066/keras/model")

Training


KeyboardInterrupt: 

In [35]:
# Print a layer-by-layer overview of the model (output shapes, parameter counts).
# NOTE(review): the saved output below lists input shapes ending in 301, while
# `dimension` is set to 300 earlier in this notebook; the output looks like it
# came from a different run. Re-run to refresh it.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 600, 301)      0                                            
____________________________________________________________________________________________________
input_6 (InputLayer)             (None, 50, 301)       0                                            
____________________________________________________________________________________________________
lstm_11 (LSTM)                   (None, 600, 300)      722400      input_5[0][0]                    
____________________________________________________________________________________________________
lstm_12 (LSTM)                   (None, 50, 300)       722400      input_6[0][0]                    
___________________________________________________________________________________________