In [1]:
from os import environ

# Expose only the GPU with index 3 to CUDA-based libraries.
# NOTE(review): presumably this has to run before the backend is imported in
# the next cell for it to take effect -- confirm.
environ.update({'CUDA_VISIBLE_DEVICES': '3'})

In [2]:
from __future__ import print_function
import json
from pprint import pprint
import numpy as np
from functools import reduce
import re
from nltk.tokenize import word_tokenize
from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import recurrent, Bidirectional, Dense, Flatten, Dropout, LSTM, GRU, Conv1D, MaxPooling1D, UpSampling1D,BatchNormalization
from keras.layers import Activation
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
import string
from keras.models import load_model

Using TensorFlow backend.


In [3]:
def normalize(s):
    """Return a canonical form of ``s`` for string comparison.

    The steps are applied in this order: lower-case the text, drop every
    character found in ``string.punctuation``, replace the standalone words
    "a", "an" and "the" with a space, and finally collapse all runs of
    whitespace into single spaces (also trimming both ends).
    """
    punctuation = set(string.punctuation)

    lowered = s.lower()
    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)
    # Articles are removed only after punctuation so that e.g. "the," matches.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())

def get_dictionary(vocab):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a token followed by its vector
    components, all separated by whitespace.

    Args:
        vocab: path of the embedding text file.

    Returns:
        dict mapping each token (first field of a line) to a 1-D ``float32``
        numpy array built from the remaining fields. If a token occurs on
        several lines, the last occurrence wins.

    Raises:
        ValueError: if a component cannot be parsed as a float.
    """
    embeddings = dict()
    with open(vocab, 'r') as f:
        # Iterate lazily: embedding files can be very large, so avoid holding
        # every raw line in memory at once (as readlines() would).
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to index.
                continue
            token = fields[0]
            components = [float(value) for value in fields[1:]]
            embeddings[token] = np.array(components, dtype=np.float32)
    return embeddings
def tokenize_sequence(seq, max_length, dim):
    """Turn a text into a fixed-size matrix of word vectors.

    The text is lower-cased, split with ``word_tokenize`` and each token is
    looked up in the module-level embedding dictionary ``d`` (which must be
    defined before this function is called).

    Args:
        seq: input text.
        max_length: number of rows of the result; longer token sequences are
            truncated, shorter ones are padded with all-zero rows.
        dim: width of every word vector in ``d``.

    Returns:
        ``float32`` array of shape ``(max_length, dim)``. Tokens missing from
        ``d`` are represented by an all-zero row.
    """
    words = word_tokenize(seq.lower())
    # Allocate the padded result once, in the same float32 precision as the
    # stored embeddings. Mixing float64 zero rows with float32 vectors would
    # silently upcast the whole matrix and double its memory footprint.
    vectors = np.zeros((max_length, dim), dtype=np.float32)
    for position, word in enumerate(words[:max_length]):
        vector = d.get(word)
        if vector is not None:
            vectors[position] = vector
    return vectors
def create_one_hot_answer(para, answer, answer_start, option, max_length):
    """Build a one-hot vector marking an answer boundary in token space.

    The boundary is located by tokenising the part of ``para`` that precedes
    it and counting the tokens.

    Args:
        para: full context text.
        answer: answer text; only its length is used.
        answer_start: offset into ``para`` (used as a slice bound) at which
            the answer begins.
        option: ``"s"`` marks the first answer token; any other value marks
            the last answer token.
        max_length: length of the returned vector.

    Returns:
        1-D numpy array of length ``max_length`` containing a single 1. The
        marked index is clamped to ``[0, max_length - 1]``.
    """
    # NOTE(review): tokenize_sequence lower-cases before tokenising while this
    # function does not -- confirm both always yield the same token count.
    if option == "s":
        # Tokens strictly before the answer: their count is the start index.
        index = len(word_tokenize(para[0:answer_start]))
    else:
        # Tokens up to and including the answer: the last one is the end index.
        index = len(word_tokenize(para[0:answer_start + len(answer)])) - 1
    # Clamp on both sides. Without the lower bound an empty prefix in end mode
    # gives -1, which numpy would interpret as "last position".
    index = max(0, min(index, max_length - 1))
    one_hot = np.zeros(max_length)
    one_hot[index] = 1
    return one_hot

In [4]:
# Load the GloVe vectors (200-dimensional according to the file name) into the
# module-level lookup `d`, which tokenize_sequence reads as a global.
d = get_dictionary("glove/glove.6B.200d.txt")

In [5]:
# Read the training set and flatten its article -> paragraph -> question
# nesting into per-question lookup tables keyed by question id.
with open('train.json') as data_file:
    data = json.load(data_file)

ids = []                 # question ids in file order
titles = dict()          # id -> title of the enclosing article
contexts = dict()        # id -> paragraph text
questions = dict()       # id -> question text
answers_text = dict()    # id -> answer text
answers_start = dict()   # id -> "answer_start" value of the answer

for article in data:
    paragraphs = article["paragraphs"]
    title = article["title"]
    for paragraph in paragraphs:
        context = paragraph["context"]
        qas = paragraph["qas"]
        for qa in qas:
            id_ = qa["id"]
            answer = qa["answer"]
            question = qa["question"]
            ids.append(id_)
            titles[id_] = title
            contexts[id_] = context
            answers_start[id_] = answer["answer_start"]
            answers_text[id_] = answer["text"]
            questions[id_] = question

In [6]:
# Fixed sizes shared by preprocessing and the network definition.
# Rows per context matrix; also the width of the one-hot targets and of the
# two softmax output layers.
max_para = 664
# Rows per question matrix.
max_q = 50
# Width of each word vector; passed as `dim` to tokenize_sequence, so it has
# to agree with the embedding file that was loaded.
dimension = 200


In [7]:
# Shuffle the example order with a fixed seed. The training cells hold out
# data with `validation_split`, and the model is saved and reloaded between
# runs, so the shuffle must be reproducible across kernel restarts; otherwise
# the held-out examples change from one session to the next.
SHUFFLE_SEED = 0
t = np.random.RandomState(SHUFFLE_SEED).permutation(len(ids)).tolist()
# Keep as many positions as there are distinct question ids.
ind = t[:len(questions)]

In [8]:
# Resolve the shuffled positions to question ids once, then build the model
# inputs (context / question matrices) and targets (one-hot start / end).
shuffled_ids = [ids[position] for position in ind]

paras = [tokenize_sequence(contexts[qid], max_para, dimension)
         for qid in shuffled_ids]
qns = [tokenize_sequence(questions[qid], max_q, dimension)
       for qid in shuffled_ids]
ans_starts = [create_one_hot_answer(contexts[qid], answers_text[qid],
                                    answers_start[qid], "s", max_para)
              for qid in shuffled_ids]
ans_ends = [create_one_hot_answer(contexts[qid], answers_text[qid],
                                  answers_start[qid], "e", max_para)
            for qid in shuffled_ids]


In [9]:
# Hyper-parameter constants.
# NOTE(review): none of these names is referenced by the cells visible in this
# notebook -- the layers are built with GRU/LSTM directly and the fit() calls
# hard-code their own batch_size/epochs. Confirm whether they can be removed
# or should be wired in.
RNN = recurrent.LSTM
EMBED_HIDDEN_SIZE = 50
BATCH_SIZE = 512
EPOCHS = 10

In [26]:
# Context input: one pre-embedded word vector per token position, padded or
# truncated to max_para rows.
sentence = layers.Input(shape=(max_para,dimension), dtype='float32')
# Encode the context with a GRU that returns its 200-wide state at every position.
encoded_sentence = GRU(200, return_sequences=True)(sentence)
print(encoded_sentence.shape)
# Question input and encoder, built the same way but with max_q rows and a
# separate GRU instance (no weight sharing with the context encoder).
question = layers.Input(shape=(max_q,dimension), dtype='float32')
encoded_question = GRU(200, return_sequences=True)(question)

(?, ?, 200)


In [27]:
# Affinity between context and question positions: batched dot product of the
# two encodings over their feature axis (axes=2), context positions first.
merge_1 = layers.dot([encoded_sentence, encoded_question], axes = 2 )
# Softmax-normalised affinities.
# Presumably normalised over the last (question) axis -- TODO confirm against
# the Keras softmax activation documentation.
A_Q = layers.Activation("softmax")(merge_1)
# The same affinity with the operands swapped (question positions first).
merge_2 = layers.dot([encoded_question, encoded_sentence], axes = 2 )
A_D = layers.Activation("softmax")(merge_2)
# Contract A_Q with the context encoding along axis 1, producing a 200-wide
# summary per question position (see the printed shape).
# NOTE(review): A_Q appears to be normalised over the question axis but is
# contracted over the context axis here, so the weights being summed are not a
# distribution. Confirm this is intended rather than swapped softmax axes.
C_Q = layers.dot([A_Q, encoded_sentence], axes = 1 )
print(C_Q.shape)

(?, ?, 200)


In [28]:
# Concatenate the question encoding with the question-side summaries along the
# feature axis. The result gets its own name instead of rebinding C_Q:
# rebinding made this cell non-idempotent, because every re-run concatenated
# onto the previous result and widened the tensor further.
Q_and_C_Q = layers.concatenate([encoded_question, C_Q], axis=2)
# Contract with A_D along axis 1 to carry the combined features over to the
# context positions.
C_D = layers.dot([A_D, Q_and_C_Q], axes=1)
# Final per-position context representation: encoder output plus C_D
# (printed feature width: 600).
C_ = layers.concatenate([encoded_sentence, C_D], axis=2)
print(C_.shape)

(?, ?, 600)


In [29]:
# Run a bidirectional LSTM (100 units per direction, one output per position)
# over the combined context representation; the printed feature width is 200.
U = Bidirectional(LSTM(100, return_sequences=True))(C_)
# Dropout with rate 0.5 on the BiLSTM output.
U = Dropout(0.5)(U)
print(U.shape)

(?, ?, 200)


In [30]:
# Convolution, batch norm, PReLU unit (strided variant)
def convBN_pool(input_layer, conv_channels):
    """Strided Conv1D -> BatchNormalization -> PReLU.

    The convolution uses kernel size 1 with stride 2 and 'valid' padding.

    Args:
        input_layer: tensor the unit is applied to.
        conv_channels: number of convolution filters.

    Returns:
        Output tensor of the PReLU activation.
    """
    bn_options = dict(axis=-1, momentum=0.99, epsilon=0.001,
                      center=True, scale=True,
                      beta_initializer='zeros', gamma_initializer='ones',
                      moving_mean_initializer='zeros')

    strided = Conv1D(conv_channels, 1, padding='valid', strides=2)(input_layer)
    normalized = BatchNormalization(**bn_options)(strided)
    return layers.PReLU()(normalized)

def convBN(input_layer, conv_channels):
    """Conv1D -> BatchNormalization -> PReLU.

    The convolution uses kernel size 3 with 'same' padding and the default stride.

    Args:
        input_layer: tensor the unit is applied to.
        conv_channels: number of convolution filters.

    Returns:
        Output tensor of the PReLU activation.
    """
    bn_options = dict(axis=-1, momentum=0.99, epsilon=0.001,
                      center=True, scale=True,
                      beta_initializer='zeros', gamma_initializer='ones',
                      moving_mean_initializer='zeros',
                      moving_variance_initializer='ones',
                      beta_regularizer=None, gamma_regularizer=None,
                      beta_constraint=None, gamma_constraint=None)

    convolved = Conv1D(conv_channels, 3, padding='same')(input_layer)
    normalized = BatchNormalization(**bn_options)(convolved)
    return layers.PReLU()(normalized)

def RU(input_layer, conv_channels, d_rate):
    """Residual unit: two stacked dilated Conv1D layers plus a skip connection.

    Both convolutions use kernel size 3, 'same' padding and dilation rate
    ``d_rate``; no activation argument is passed to either of them.

    Args:
        input_layer: tensor entering the unit. It is added element-wise to the
            convolution output, so presumably its channel count has to equal
            ``conv_channels`` -- TODO confirm.
        conv_channels: number of filters of each convolution.
        d_rate: dilation rate of both convolutions.

    Returns:
        Tensor holding ``input_layer`` plus the output of the second convolution.
    """
    branch = input_layer
    for _ in range(2):
        branch = Conv1D(conv_channels, 3, padding='same', dilation_rate=d_rate)(branch)
    # Identity shortcut around the two convolutions.
    return layers.add([input_layer, branch])

In [31]:
def span_head(features, output_name):
    """Build one answer-boundary head on top of ``features``.

    The start and end heads use the exact same stack of convolutional /
    residual units, so it is defined once here instead of being copy-pasted.
    Layers are created in the same order as before, so auto-generated layer
    names are unchanged.

    Args:
        features: per-position tensor the head reads from.
        output_name: name given to the final softmax Dense layer; the loss
            dictionary passed to compile() refers to this name.

    Returns:
        Tensor of width ``max_para`` with softmax activation.
    """
    h = convBN(features, 100)
    h = RU(h, 100, 1)
    h = Dropout(0.5)(h)
    h = convBN_pool(h, 64)
    h = RU(h, 64, 1)
    h = convBN_pool(h, 64)
    h = RU(h, 64, 2)
    h = convBN_pool(h, 128)
    h = RU(h, 128, 1)
    h = layers.PReLU()(h)
    h = RU(h, 128, 2)
    h = Dropout(0.5)(h)
    h = convBN_pool(h, 256)
    h = RU(h, 256, 2)
    h = Dropout(0.5)(h)
    h = convBN_pool(h, 128)
    h = RU(h, 128, 1)
    h = layers.PReLU()(h)
    # Collapse the (positions, channels) map into one vector for the classifier.
    h = Flatten()(h)
    h = Dropout(0.5)(h)
    return Dense(max_para, activation='softmax', name=output_name)(h)


# Start-of-answer head reads the fused representation directly.
start = span_head(U, 'output_1')


# End-of-answer head first passes the fused representation through an extra GRU.
end = GRU(100, return_sequences=True)(U)
end = span_head(end, 'output_2')


In [32]:
# Two-input (context, question), two-output (start, end) model. Both heads are
# trained with categorical cross-entropy against their one-hot targets.
span_losses = dict.fromkeys(('output_1', 'output_2'), 'categorical_crossentropy')
model = Model(inputs=[sentence, question], outputs=[start, end])
model.compile(optimizer='adam', loss=span_losses, metrics=['accuracy'])

In [10]:
# Stack the per-example matrices into dense training arrays. float32 is
# requested explicitly: the data consists of float32 embedding values plus
# zeros and ones, so the conversion is exact, and it avoids the default
# float64 result that would double the memory needed for these large arrays.
x = np.array(paras, dtype=np.float32)
xq = np.array(qns, dtype=np.float32)
ans_s = np.array(ans_starts, dtype=np.float32)
ans_e = np.array(ans_ends, dtype=np.float32)

In [33]:
# Print the layer-by-layer table (output shapes, parameter counts, connections).
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 664, 200)      0                                            
____________________________________________________________________________________________________
input_4 (InputLayer)             (None, 50, 200)       0                                            
____________________________________________________________________________________________________
gru_4 (GRU)                      (None, 664, 200)      240600                                       
____________________________________________________________________________________________________
gru_5 (GRU)                      (None, 50, 200)       240600                                       
___________________________________________________________________________________________

In [36]:
print('Training')
# Five rounds of 10 epochs each. The first four rounds hold out 10% of the
# data for validation; the last round trains on everything.
for val_split in (0.1, 0.1, 0.1, 0.1, 0):
    model.fit([x, xq], [ans_s, ans_e],
              batch_size=256,
              epochs=10,
              validation_split=val_split)
    # Checkpoint after every round so an interrupted run loses at most one round.
    model.save("model_second.h5")

Training
Train on 55241 samples, validate on 6138 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 3584/55241 [>.............................] - ETA: 1159s - loss: 9.2979 - output_1_loss: 4.6220 - output_2_loss: 4.6758 - output_1_acc: 0.0357 - output_2_acc: 0.0198

KeyboardInterrupt: 

In [12]:
# Write the current model to the same checkpoint file the training loops use.
model.save("model_second.h5")

In [11]:
# Rebind `model` to the network restored from the checkpoint file.
# NOTE(review): the execution counts show this cell ran *before* the save cell
# above it (In [11] vs In [12]); run top-to-bottom it only reloads what was
# just written. Confirm the intended order.
model = load_model("model_second.h5")

In [13]:
print('Training')
# Five rounds of 5 epochs each. The first three rounds hold out 10% of the
# data for validation; the last two rounds train on everything.
for val_split in (0.1, 0.1, 0.1, 0, 0):
    model.fit([x, xq], [ans_s, ans_e],
              batch_size=200,
              epochs=5,
              validation_split=val_split)
    # Checkpoint after every round so an interrupted run loses at most one round.
    model.save("model_second.h5")

Training
Train on 55241 samples, validate on 6138 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 55241 samples, validate on 6138 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 55241 samples, validate on 6138 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
