In [1]:
# Expose only GPU index 2 to this kernel. This runs in the first cell so the
# variable is already set when Keras is imported by the cell that follows.
from os import environ

environ.update({'CUDA_VISIBLE_DEVICES': '2'})

In [82]:
from __future__ import print_function

import json
import re
import unicodedata
from functools import reduce
from pprint import pprint

import numpy as np
from nltk.tokenize import word_tokenize

import keras
from keras import backend as K
from keras.layers import recurrent, Bidirectional, Dense, Flatten, Conv1D, Dropout, Embedding, GRU, LSTM, Input
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import get_file

In [3]:
def get_dictionary(vocab):
    """Load a whitespace-separated word-vector file into a dictionary.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``the 0.1 -0.2 0.3``. Blank or whitespace-only lines are
    skipped.

    Parameters
    ----------
    vocab : str
        Path of the text file to read.

    Returns
    -------
    dict
        Maps each token (first field of a line) to a 1-D ``np.float32`` array
        built from the remaining fields. If a token occurs more than once, the
        last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    d = dict()
    with open(vocab, 'r') as f:
        # Iterate the file lazily instead of materialising it with
        # readlines(); embedding files can be very large.
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): there is no token to
                # index, so skip it rather than fail on fields[0].
                continue
            d[fields[0]] = np.array([float(x) for x in fields[1:]], dtype=np.float32)
    return d
def tokenize_sequence(seq, max_length, dim):
    """Turn a text into a stack of word vectors, zero-padded up to ``max_length``.

    The text is lower-cased and split with ``word_tokenize``. Each token is
    looked up in the module-level mapping ``d`` (not defined inside this
    function; it must exist before the call). Tokens missing from ``d`` are
    replaced by a zero vector of length ``dim``.

    Parameters
    ----------
    seq : str
        Text to convert.
    max_length : int
        Minimum number of vectors in the result. Shorter inputs are padded
        with zero vectors; longer inputs are returned as-is (no truncation).
    dim : int
        Length of the zero vectors used for unknown tokens and for padding.

    Returns
    -------
    numpy.ndarray
        Array built from the per-token vectors followed by any padding vectors.
    """
    tokens = word_tokenize(seq.lower())
    vectors = [d[token] if token in d else np.zeros(dim) for token in tokens]
    # A non-positive count yields an empty range, so nothing is appended when
    # the text already has max_length tokens or more.
    vectors.extend(np.zeros(dim) for _ in range(max_length - len(vectors)))
    return np.array(vectors)
def create_one_hot_answer(para, answer, answer_start, option, max_length):
    """Build a one-hot vector marking an answer boundary in token positions.

    The paragraph prefix that ends at the boundary is split with
    ``word_tokenize``; the resulting token count selects the position to set.

    Parameters
    ----------
    para : str
        Paragraph text containing the answer.
    answer : str
        Answer text. Only its length is used, and only for the end boundary.
    answer_start : int
        Character offset of the answer inside ``para``.
    option : str
        ``"s"`` marks the start boundary; any other value marks the end.
    max_length : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Zero vector of length ``max_length`` with a single 1. For ``"s"`` the 1
        is at index ``n``, the number of tokens before the answer; otherwise it
        is at ``n - 1``, where ``n`` counts the tokens up to the end of the
        answer.

    Raises
    ------
    IndexError
        If the computed index falls outside the vector.
    """
    marking_start = option == "s"
    prefix_end = answer_start if marking_start else answer_start + len(answer)
    token_count = len(word_tokenize(para[0:prefix_end]))
    # NOTE: for the end boundary an empty prefix gives index -1, which numpy
    # resolves to the last position of the vector.
    position = token_count if marking_start else token_count - 1
    one_hot = np.zeros(max_length)
    one_hot[position] = 1
    return one_hot
    

In [69]:
# Load the pre-trained GloVe vectors into a word -> vector lookup table.
# Each non-blank line of the file is "<word> <v1> <v2> ...".
embeddings_index = dict()
# `with` closes the file even if a malformed line raises part-way through.
with open('glove/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        word = values[0]
        # np.asarray: the bare name `asarray` is never imported in this
        # notebook, so the unqualified call fails on a fresh kernel.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [95]:
# Flatten the nested train.json structure (articles -> paragraphs -> questions)
# into per-question lists and lookup tables.
with open('train.json') as data_file:
    data = json.load(data_file)


def _to_ascii(text):
    """NFKD-normalise ``text`` and drop every character that is not ASCII."""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')


ids = []                       # question ids, in file order
contexts = list()              # ASCII-encoded paragraph per question, same order as `ids`
questions = list()             # ASCII-encoded question text, same order as `ids`
titles = dict()                # question id -> article title
contexts_uni = dict()          # question id -> original (un-encoded) paragraph
questions_uni = dict()         # question id -> original (un-encoded) question
answers_text = dict()          # question id -> answer text
answers_start = dict()         # question id -> character offset of the answer

for article in data:
    paragraphs = article["paragraphs"]
    title = article["title"]
    for paragraph in paragraphs:
        context = paragraph["context"]
        # The paragraph is stored once per question so that every container
        # has exactly one entry per question id.
        for qa in paragraph["qas"]:
            id_ = qa["id"]
            answer = qa["answer"]
            question = qa["question"]
            ids.append(id_)
            titles[id_] = title
            contexts.append(_to_ascii(context))
            contexts_uni[id_] = context
            answers_start[id_] = answer["answer_start"]
            answers_text[id_] = answer["text"]
            questions.append(_to_ascii(question))
            questions_uni[id_] = question

max_para = 766    # padded paragraph length / length of the one-hot targets
max_q = 125       # padded question length
dimension = 100   # width of the word-embedding vectors

# Random ordering of the example positions: every position gets an independent
# random sort key. The generator is not seeded here, so the order differs
# between runs.
t = sorted(range(len(ids)), key=lambda _: np.random.random())
# Keep at most 61379 positions (the recorded run has exactly that many rows,
# see the padded_para.shape output further down).
ind = t[:61379]

In [None]:
# One-hot start/end targets. Row i belongs to question ids[ind[i]], i.e. the
# rows follow the shuffled order in `ind`, not the order of `ids`/`contexts`.
# NOTE(review): positions are counted in nltk word_tokenize tokens (inside
# create_one_hot_answer) while the model inputs are produced by
# Tokenizer.texts_to_sequences; confirm the two tokenisations give matching
# positions.
def _boundary_targets(option):
    """Stack one create_one_hot_answer vector per entry of `ind` for `option`."""
    rows = []
    for position in ind:
        qid = ids[position]
        rows.append(create_one_hot_answer(contexts_uni[qid], answers_text[qid],
                                          answers_start[qid], option, max_para))
    return np.array(rows)


ans_starts = _boundary_targets("s")
ans_ends = _boundary_targets("e")

In [66]:
# Integer-encode the paragraphs and pad every sequence to max_para entries.
# `Tokenizer` is keras.preprocessing.text.Tokenizer (see the imports cell).
para_tokenizer = Tokenizer()
para_tokenizer.fit_on_texts(contexts)
# Tokenizer never assigns index 0 to a word, hence the +1 for the vocabulary size.
para_vocab_size = len(para_tokenizer.word_index) + 1
encoded_para = para_tokenizer.texts_to_sequences(contexts)
# padding='post' appends zeros after the tokens.
# NOTE(review): sequences longer than max_para are cut by pad_sequences
# (default truncating='pre', i.e. from the front) - confirm none exceed max_para.
padded_para = pad_sequences(encoded_para, maxlen=max_para, padding='post')
# The embedding-matrix cell that follows reads this tokenizer through the name
# `t`, so keep that name bound to it.
t = para_tokenizer

In [71]:
# Embedding matrix for the paragraph vocabulary: row `row` receives the
# pre-trained vector of the word whose tokenizer index is `row`. Words without a
# pre-trained vector keep an all-zero row.
# `t` must still be the tokenizer fitted on `contexts` when this cell runs.
# np.zeros: the bare name `zeros` is never imported in this notebook, so the
# unqualified call fails on a fresh kernel.
embedding_para = np.zeros((para_vocab_size, dimension))
for word, row in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_para[row] = embedding_vector

In [72]:
# Integer-encode the questions and pad every sequence to max_q entries.
# `Tokenizer` is keras.preprocessing.text.Tokenizer (see the imports cell).
qns_tokenizer = Tokenizer()
qns_tokenizer.fit_on_texts(questions)
# Tokenizer never assigns index 0 to a word, hence the +1 for the vocabulary size.
qns_vocab_size = len(qns_tokenizer.word_index) + 1
encoded_qns = qns_tokenizer.texts_to_sequences(questions)
# padding='post' appends zeros after the tokens.
padded_qns = pad_sequences(encoded_qns, maxlen=max_q, padding='post')
# The embedding-matrix cell that follows reads this tokenizer through the name
# `t`; from here on `t` no longer refers to the paragraph tokenizer.
t = qns_tokenizer

In [73]:
# Embedding matrix for the question vocabulary: row `row` receives the
# pre-trained vector of the word whose tokenizer index is `row`. Words without a
# pre-trained vector keep an all-zero row.
# `t` must be the tokenizer fitted on `questions` when this cell runs.
# np.zeros: the bare name `zeros` is never imported in this notebook, so the
# unqualified call fails on a fresh kernel.
embedding_qns = np.zeros((qns_vocab_size, dimension))
for word, row in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_qns[row] = embedding_vector

In [78]:
# Sanity check: pad_sequences was called with maxlen=max_para, so the second
# dimension should equal max_para.
padded_para.shape

(61379, 766)

In [87]:
# Embed
# Two integer-sequence inputs: paragraph token ids and question token ids.
input_para = Input(shape=(max_para,))
input_qns = Input(shape=(max_q,))
# Frozen (trainable=False) embeddings initialised from the pre-built matrices.
em_paras = Embedding(para_vocab_size, dimension, weights=[embedding_para], input_length=max_para, trainable=False)(input_para)
em_qns = Embedding(qns_vocab_size, dimension, weights=[embedding_qns], input_length=max_q, trainable=False)(input_qns)

# Encode each sequence with its own GRU, join the two outputs along axis 1 and
# run a bidirectional GRU over the joined sequence.
paras = GRU(100, return_sequences=True)(em_paras)
qns = GRU(100, return_sequences=True)(em_qns)
combine = keras.layers.concatenate([paras, qns], axis=1)
combine = Bidirectional(GRU(100, return_sequences=True))(combine)


def _span_head(features, output_name):
    """Conv stack -> flatten -> dropout -> softmax over max_para positions."""
    x = Conv1D(100, 5, padding='same', activation='relu')(features)
    x = Conv1D(50, 3, padding='same', dilation_rate=2, activation='relu')(x)
    x = Conv1D(5, 3, padding='same', dilation_rate=2, activation='relu')(x)
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    # One probability per paragraph position. Sized from max_para instead of a
    # hard-coded 766 so it always matches the length of the one-hot targets.
    return Dense(max_para, activation='softmax', name=output_name)(x)


# Start-position head works directly on the joined encoding.
start = _span_head(combine, 'output_1')
# End-position head first passes the joined encoding through an extra GRU.
end = _span_head(GRU(200, return_sequences=True)(combine), 'output_2')

In [88]:
# Assemble the two-input / two-output model and configure training. Both
# softmax heads ('output_1' is the start head, 'output_2' the end head) are
# trained with categorical cross-entropy; accuracy is reported as a metric.
span_losses = dict.fromkeys(['output_1', 'output_2'], 'categorical_crossentropy')
model = Model(inputs=[input_para, input_qns], outputs=[start, end])
model.compile(optimizer='adam', loss=span_losses, metrics=['accuracy'])

In [89]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_7 (InputLayer)             (None, 766)           0                                            
____________________________________________________________________________________________________
input_8 (InputLayer)             (None, 125)           0                                            
____________________________________________________________________________________________________
embedding_13 (Embedding)         (None, 766, 100)      6844000                                      
____________________________________________________________________________________________________
embedding_14 (Embedding)         (None, 125, 100)      3168500                                      
___________________________________________________________________________________________

In [90]:
# Render the model graph to the file model.png.
from keras.utils import plot_model
plot_model(model, to_file='model.png')

In [None]:
# Train both heads jointly; validation_split=0.2 holds out the last 20% of the
# rows passed in.
# ans_starts/ans_ends were built in the shuffled order `ind` (row i belongs to
# ids[ind[i]]), whereas padded_para/padded_qns follow the original order of
# `ids`. Re-order the inputs with the same permutation so that every input row
# is paired with its own labels instead of another example's.
model.fit([padded_para[ind], padded_qns[ind]], [ans_starts, ans_ends],
          batch_size=32,
          epochs=10,
          validation_split=0.2)

Train on 49103 samples, validate on 12276 samples
Epoch 1/10
 2656/49103 [>.............................] - ETA: 6203s - loss: 11.0302 - output_1_loss: 5.4712 - output_2_loss: 5.5590 - output_1_acc: 0.0200 - output_2_acc: 0.0083