In [1]:
import json
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Embedding, Dropout, Dense, Activation, CuDNNLSTM
from keras.layers import LSTM, Bidirectional,Input
from keras.layers import concatenate
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = json.load(open('train-v2.0.json'))

In [3]:
dev = json.load(open('dev-v2.0.json'))

In [4]:
from string import punctuation
from nltk.corpus import stopwords
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('english'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [word for word in words if word and word not in stops]

    return words

In [5]:
train['data'][1]['paragraphs'][0]['qas'][0]

{'answers': [{'answer_start': 182, 'text': 'Polish and French'}],
 'id': '56cbd2356d243a140015ed66',
 'is_impossible': False,
 'question': "What was Frédéric's nationalities?"}

In [6]:
contexts = []
questions = []

starts = []
ends = []

for instance in train['data']:
    for paragraph in instance['paragraphs']:
        context = paragraph['context']
        
        for qas in paragraph['qas']:
            if qas['is_impossible']:
                continue
            question = qas['question']
            for answer in qas['answers']:
                start = answer['answer_start']
                end = start + len(answer['text'])
                contexts.append(normalize(context))
                questions.append(normalize(question))
                starts.append(start)
                ends.append(end)

In [7]:
contexts_dev = []
questions_dev = []

starts_dev = []
ends_dev = []

for instance in dev['data']:
    for paragraph in instance['paragraphs']:
        context = paragraph['context']
        
        for qas in paragraph['qas']:
            if qas['is_impossible']:
                continue
            question = qas['question']
            
            for answer in qas['answers']:
                
                start = answer['answer_start']
                end = start + len(answer['text'])
                contexts_dev.append(normalize(context))
                questions_dev.append(normalize(question))
                starts_dev.append(start)
                ends_dev.append(end)

In [8]:
vocab = set()

for context in contexts:
    vocab.update(context)

for question in questions:
    vocab.update(question)

id2word = {i:word for i, word in enumerate(vocab)}
word2id = {word:i for i, word in id2word.items()}

In [9]:
contexts_le = [[word2id[word] for word in context] for context in contexts]
max_len = max([len(c) for c in contexts])

X_train_context = pad_sequences(contexts_le, max_len, padding='post')

In [10]:
contexts_le_dev = [[word2id.get(word, 0) for word in context] for context in contexts_dev]

X_dev_context = pad_sequences(contexts_le_dev, max_len, padding='post')

In [11]:
X_train_context.shape

(86821, 410)

In [12]:
questions_le = [[word2id[word] for word in question] for question in questions]
max_len_q = max([len(c) for c in questions])

X_train_question = pad_sequences(questions_le, max_len_q, padding='post')

In [13]:
questions_le_dev = [[word2id.get(word, 0) for word in question] for question in questions_dev]
X_dev_question = pad_sequences(questions_le_dev, max_len_q, padding='post')

In [14]:
vocab_size = len(vocab)
embedding_vector_length = 50

max_span_begin = np.max(starts)
max_span_end = np.max(ends)
batch = 64


# slice of data to be used as one epoch training on full data is expensive
slce = 1000

In [15]:
starts = np.array(starts)
ends = np.array(ends)

starts_dev = np.array(starts_dev)
ends_dev = np.array(ends_dev)

In [18]:


# model1
context_input = Input(shape=(max_len, ), dtype='int32', name='context_input')
x = Embedding(input_dim=vocab_size, output_dim=50,
              input_length=max_len)(context_input)
lstm_out = Bidirectional(CuDNNLSTM(50, return_sequences=True), merge_mode='concat')(x)
drop_1 = Dropout(0.1)(lstm_out)

# model2
ques_input = Input(shape=(max_len_q, ), dtype='int32', name='ques_input')
x = Embedding(input_dim=vocab_size, output_dim=50,
              input_length=max_len_q)(ques_input)
lstm_out = Bidirectional(CuDNNLSTM(50, return_sequences=True), merge_mode='concat')(x)
drop_2 = Dropout(0.1)(lstm_out)

# merger model
merge_layer = concatenate([drop_1, drop_2], axis=1)
biLSTM = Bidirectional(CuDNNLSTM(50), merge_mode='mul')(merge_layer)
drop_3 =  Dropout(0.1)(biLSTM)
softmax_1 = Dense(max_span_begin, activation='softmax')(biLSTM)
softmax_2 = Dense(max_span_end, activation='softmax')(biLSTM)

model = Model(inputs=[context_input, ques_input], outputs=[softmax_1, softmax_2])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
context_input (InputLayer)      (None, 410)          0                                            
__________________________________________________________________________________________________
ques_input (InputLayer)         (None, 31)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 410, 50)      5319400     context_input[0][0]              
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 31, 50)       5319400     ques_input[0][0]                 
__________________________________________________________________________________________________
bidirectio

In [19]:
validation_data=({'context_input': X_dev_context,
                  'ques_input':X_dev_question}, 
                 {'dense_1': starts_dev,
                  'dense_2': ends_dev})


training_data=({'context_input': X_train_context,
                'ques_input':X_train_question}, 
                 {'dense_1': starts,
                  'dense_2': ends})

model.fit(training_data[0], training_data[1], batch_size=128, epochs=10,
          validation_data=(validation_data[0], validation_data[1]))

ValueError: No data provided for "dense_3". Need data for each key in: ['dense_3', 'dense_4']

In [20]:
questions

['kathmandu',
 'metropolitan',
 'city',
 'kmc',
 'order',
 'promote',
 'international',
 'relations',
 'established',
 'international',
 'relations',
 'secretariat',
 'irc',
 "kmc's",
 'first',
 'international',
 'relationship',
 'established',
 '1975',
 'city',
 'eugene',
 'oregon',
 'united',
 'states',
 'activity',
 'enhanced',
 'establishing',
 'formal',
 'relationships',
 '8',
 'cities',
 'motsumoto',
 'city',
 'japan',
 'rochester',
 'usa',
 'yangon',
 'formerly',
 'rangoon',
 'myanmar',
 "xi'an",
 "people's",
 'republic',
 'china',
 'minsk',
 'belarus',
 'pyongyang',
 'democratic',
 'republic',
 'korea',
 "kmc's",
 'constant',
 'endeavor',
 'enhance',
 'interaction',
 'saarc',
 'countries',
 'international',
 'agencies',
 'many',
 'major',
 'cities',
 'world',
 'achieve',
 'better',
 'urban',
 'management',
 'developmental',
 'programs',
 'kathmandu']