In [1]:
from __future__ import print_function

import io
import json
import re
from functools import reduce
from pprint import pprint

import numpy as np
from keras import backend as K
from keras import layers
from keras.layers import recurrent, Bidirectional
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
from nltk.tokenize import word_tokenize

Using TensorFlow backend.


In [2]:
def get_dictionary(vocab):
    """Load a whitespace-separated embedding file into a ``word -> vector`` dict.

    Each non-blank line is expected to look like ``<token> <float> <float> ...``.

    Args:
        vocab: Path to the embedding text file.

    Returns:
        dict mapping each token (first field of a line) to a 1-D
        ``np.float32`` array built from the remaining fields.

    Raises:
        ValueError: if a field after the token cannot be parsed as a float.
    """
    embeddings = dict()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding; io.open behaves the same on Python 2 and 3.
    with io.open(vocab, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of materialising it with
        # readlines(); embedding files can be hundreds of megabytes.
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank (or whitespace-only) line: nothing to index.
                continue
            embeddings[fields[0]] = np.array(
                [float(value) for value in fields[1:]], dtype=np.float32
            )
    return embeddings

In [3]:
# Module-level word -> vector lookup; tokenize_sequence reads `d` as a global.
# The file name suggests 50-dimensional GloVe vectors — TODO confirm it matches `dimension`.
d = get_dictionary("glove/glove.6B.50d.txt")

In [4]:
def tokenize_sequence(seq, max_length, dim):
    """Turn a text into a fixed-size matrix of word vectors.

    The text is lower-cased, tokenized with ``word_tokenize`` and each token is
    looked up in the module-level embedding dict ``d``. Tokens missing from
    ``d`` and positions past the end of the text are left as zero vectors.

    Args:
        seq: Input text.
        max_length: Number of rows in the result. Texts with more tokens are
            truncated so that every result has the same shape and can be
            stacked into one array.
        dim: Length of each word vector; must match the vectors stored in ``d``.

    Returns:
        ``np.float32`` array of shape ``(max_length, dim)``.
    """
    tokens = word_tokenize(seq.lower())[:max_length]
    # Pre-allocate in float32: mixing float32 embeddings with default float64
    # zero vectors would silently promote the whole matrix to float64.
    word_vectors = np.zeros((max_length, dim), dtype=np.float32)
    for row, token in enumerate(tokens):
        vector = d.get(token)
        if vector is not None:
            word_vectors[row] = vector
    return word_vectors

In [5]:
# Parse the training set from train.json (path is relative to the working directory).
# The cell that flattens `data` indexes it as a list of articles, each with
# "title" and "paragraphs" keys.
with open('train.json') as data_file:    
    data = json.load(data_file)

In [6]:
# Flatten the nested article -> paragraph -> question structure into flat
# lookups keyed by question id; `ids` keeps the ids in encounter order.
ids = []
titles = {}
contexts = {}
questions = {}
answers_text = {}
answers_start = {}
for article in data:
    paragraphs = article["paragraphs"]
    title = article["title"]
    for paragraph in paragraphs:
        context = paragraph["context"]
        qas = paragraph["qas"]
        for qa in qas:
            id_ = qa["id"]
            answer = qa["answer"]
            question = qa["question"]
            ids.append(id_)
            # Title and context are repeated for every question of a paragraph.
            titles[id_] = title
            contexts[id_] = context
            answers_start[id_] = answer["answer_start"]
            answers_text[id_] = answer["text"]
            questions[id_] = question

In [7]:
# Fixed sizes used when padding inputs and building targets.
# presumably the largest paragraph / question token counts in the data — TODO confirm
max_para = 766   # rows per paragraph matrix and length of each one-hot answer vector
max_q = 125      # rows per question matrix
dimension = 50   # length of each word vector passed to tokenize_sequence

In [8]:
# Draw a random subset of question indices to train on.
# A seeded generator makes the subset reproducible across kernel restarts,
# and permutation() replaces the sort-by-random-key shuffle.
SAMPLE_SIZE = 5000
SEED = 42
rng = np.random.RandomState(SEED)
# `t` is a full random permutation of positions into `ids` (plain Python ints).
t = rng.permutation(len(ids)).tolist()
# Slicing tolerates datasets smaller than SAMPLE_SIZE.
ind = t[:SAMPLE_SIZE]

In [9]:
def create_one_hot_answer(para, answer, answer_start, option, max_length):
    """Build a one-hot vector marking the answer's start or end token.

    The token position is obtained by tokenizing the paragraph prefix that
    ends at the answer start (``option == "s"``) or at the answer end (any
    other ``option``) and counting the tokens.

    Args:
        para: Paragraph text.
        answer: Answer text; only its length is used.
        answer_start: Character offset of the answer inside ``para``.
        option: ``"s"`` selects the first answer token, anything else the last.
        max_length: Length of the returned vector.

    Returns:
        1-D array of ``max_length`` zeros with a single 1 at the token index.

    Note:
        For the end option an empty prefix yields index -1, which marks the
        last slot of the vector. An index >= ``max_length`` raises IndexError.
    """
    is_start = option == "s"
    prefix_end = answer_start if is_start else answer_start + len(answer)
    token_count = len(word_tokenize(para[0:prefix_end]))
    # Start: the answer's first token follows the prefix tokens.
    # End: the answer's last token is the last token of the prefix.
    target = token_count if is_start else token_count - 1
    one_hot = np.zeros(max_length)
    one_hot[target] = 1
    return one_hot
    

In [10]:
# def create_bin_answer(para, answer, answer_start, max_length):
#     from_begin_s = para[0:answer_start]
#     from_begin_e = para[0:answer_start +len(answer)]
#     l_s = len(word_tokenize(from_begin_s))
#     l_e = len(word_tokenize(from_begin_e))
#     bin_ = np.zeros(max_length)
#     for i in range(l_s, l_e):
#         bin_[i] = 1
#     return bin_
    

In [11]:
# One padded word-vector matrix per sampled question's paragraph.
paras = [
    tokenize_sequence(contexts[ids[pos]], max_para, dimension)
    for pos in ind
]

In [12]:
# One padded word-vector matrix per sampled question text.
qns = [
    tokenize_sequence(questions[ids[pos]], max_q, dimension)
    for pos in ind
]

In [13]:
# One-hot start-token targets, in the same order as `ind`.
ans_starts = [
    create_one_hot_answer(contexts[qid], answers_text[qid], answers_start[qid], "s", max_para)
    for qid in (ids[pos] for pos in ind)
]

In [14]:
# One-hot end-token targets, in the same order as `ind`.
ans_ends = [
    create_one_hot_answer(contexts[qid], answers_text[qid], answers_start[qid], "e", max_para)
    for qid in (ids[pos] for pos in ind)
]

In [15]:
# ans = [create_bin_answer(contexts[ids[ind[i]]], answers_text[ids[ind[i]]], answers_start[ids[ind[i]]], max_para) for i in range(len(ind))]

In [17]:
# Model hyper-parameters.
RNN = recurrent.LSTM          # recurrent cell type used for every encoder/decoder layer
EMBED_HIDDEN_SIZE = 50        # hidden units of the encoder LSTMs
# The decoder width is compared against one-hot targets of length `max_para`,
# so derive it from that constant instead of repeating the literal 766.
ANS_SIZE = max_para
BATCH_SIZE = 32
EPOCHS = 40

In [18]:
# Paragraph branch: input of pre-embedded paragraphs, (max_para, dimension) per sample.
sentence = layers.Input(shape=(max_para,dimension), dtype='float32')
# return_sequences=True keeps one hidden state per paragraph token.
encoded_sentence = RNN(EMBED_HIDDEN_SIZE, return_sequences=True)(sentence)
print(encoded_sentence.shape)

(?, ?, 50)


In [19]:
# Question branch: input of pre-embedded questions, (max_q, dimension) per sample.
# Note: this rebinds `question`, which the data-loading loop used for the raw question string.
question = layers.Input(shape=(max_q,dimension), dtype='float32')
# Separate RNN instance from the paragraph encoder; one hidden state per question token.
encoded_question = RNN(EMBED_HIDDEN_SIZE, return_sequences=True)(question)

In [20]:
# Affinity between paragraph and question states: dot product over the feature axis (axis 2).
# presumably merge_1 is (batch, max_para, max_q) and merge_2 its transpose — TODO confirm
merge_1 = layers.dot([encoded_sentence, encoded_question], axes = 2 )
A_Q = layers.Activation("softmax")(merge_1)
merge_2 = layers.dot([encoded_question, encoded_sentence], axes = 2 )
A_D = layers.Activation("softmax")(merge_2)
# NOTE(review): A_Q is computed above but never used; C_Q is built from the
# un-normalised merge_1. Confirm whether A_Q was intended here.
C_Q = layers.dot([merge_1, encoded_sentence], axes = 1 )
print(C_Q.shape)

(?, ?, 50)


In [21]:
# NOTE(review): C_Q is rebound to a concatenation that includes its previous value,
# so re-running this cell without re-running the previous one concatenates again.
C_Q = layers.concatenate([encoded_question, C_Q], axis=2)
# Combine the question-side summary with the A_D attention weights (contracting axis 1).
C_D = layers.dot([A_D, C_Q], axes=1)
# Final per-token paragraph features: encoder states plus the attended context (last dim 150, see output).
C_ = layers.concatenate([encoded_sentence, C_D], axis=2)
print(C_.shape)

(?, ?, 150)


In [22]:
# Bidirectional RNN over the fused features; the printed last dimension is 100
# (2 x EMBED_HIDDEN_SIZE), i.e. forward and backward states side by side.
U = Bidirectional(RNN(EMBED_HIDDEN_SIZE, return_sequences=True))(C_)
print(U.shape)

(?, ?, 100)


In [None]:
# Summarise the fused sequence into one ANS_SIZE-wide vector (final RNN state only).
decoder = RNN(ANS_SIZE)(U)
# Give each output its own trainable projection. Applying softmax directly to
# the same `decoder` tensor twice makes the start and end distributions always
# identical, so the model could never predict different start and end tokens.
start_scores = layers.Dense(ANS_SIZE)(decoder)
end_scores = layers.Dense(ANS_SIZE)(decoder)
# The layer names are referenced by the loss dict passed to model.compile.
decoder_s = layers.Activation("softmax", name="output_1")(start_scores)
decoder_e = layers.Activation("softmax", name="output_2")(end_scores)
print(decoder_s.shape)

In [27]:
# Two-input (paragraph, question), two-output (start, end distribution) model.
model = Model([sentence, question],[decoder_s, decoder_e])
# The loss dict keys must match the names given to the two output Activation layers.
model.compile(optimizer='adam',
              loss={'output_1': 'categorical_crossentropy', 'output_2': 'categorical_crossentropy'},
              metrics=['accuracy'])

In [28]:
# Stack the per-sample lists into arrays for model.fit.
# assumes every element of each list has the same shape, otherwise np.array cannot
# build a regular numeric array — TODO confirm
x = np.array(paras)        # paragraph inputs
xq = np.array(qns)         # question inputs
ans_s = np.array(ans_starts)  # start-token targets
ans_e = np.array(ans_ends)    # end-token targets

In [None]:
# Train on the sampled subset; input and target order matches the Model definition.
# validation_split=0.1 holds out 10% of the samples (4500 train / 500 validate in the log below).
print('Training')
model.fit([x, xq], [ans_s, ans_e],
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.1)

Training
Train on 4500 samples, validate on 500 samples
Epoch 1/40
  96/4500 [..............................] - ETA: 3216s - loss: 14.7120 - output_1_loss: 7.4625 - output_2_loss: 7.2495 - output_1_acc: 0.0000e+00 - output_2_acc: 0.0000e+00

In [68]:
# Scratch experiment: tanh of a (2, 2) array plus a length-2 array, added to
# tanh of the length-2 array.
# NOTE(review): `tf` is not imported in any visible cell, so this fails on a fresh
# kernel. The execution count (68) also shows it was run out of order.
a = tf.tanh(np.array([[1.2,2.3], [2,3]]) + np.array([1.0,2.0]))
b = tf.tanh(np.array([1.0,2.0]))
c = a + b

In [69]:
# Session used to evaluate the scratch tensors; it is never closed in the visible cells.
# NOTE(review): depends on `tf`, which is not imported in any visible cell.
sess = tf.Session()

In [70]:
# Evaluate `c` with `sess` installed as the default session; `s` is bound but unused.
with sess.as_default() as s:
    print(c.eval())

[[ 1.73733729  1.96365944]
 [ 1.75664891  1.96393678]]


In [59]:
# NOTE(review): the output below comes from execution count 59, when `a` was still a
# plain NumPy array. After the count-68 cell, `a` is a TensorFlow tensor, so this
# output is stale.
a

array([[ 2.2,  4.3],
       [ 3. ,  5. ]])