## Setup

Load Python modules.

In [2]:
import numpy as np
import pandas as pd
import os
from tensorflow.python.client import device_lib
import tensorflow as tf
from tensorflow.keras.layers import Dropout,Lambda,Dense,Input,LSTM,Concatenate,Flatten,Add,Reshape,GlobalAveragePooling1D,GlobalAveragePooling2D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
import sys
assert sys.version_info[0] >= 3
print("Set up!")

Set up!


## Preprocessing

For the preprocessing step, we will create two dictionaries. One will be used to map questions to articles, while the other will be used to map questions and the contents of the articles to the answers.

### Loading JSON datasets

In [3]:
from google.colab import drive
driveBase = os.path.join(os.path.dirname(os.path.abspath("train-v1.1.json")),"drive")
drive.mount(driveBase)
hardDrive = os.path.join(driveBase,"My Drive")
os.chdir(hardDrive)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import json
def readFile(filename):
  with open(filename) as file:
    fields = []
    JSON = json.loads(file.read())
    for article in JSON["data"]:
      articleTitle = article["title"]
      for paragraph in article["paragraphs"]:
        paragraphContext = paragraph["context"]
        for qas in paragraph["qas"]:
          question = qas["question"]
          for answer in qas["answers"]:
            fields.append({"question":question,"answer_text":answer["text"],"answer_start":answer["answer_start"],"paragraph_context":paragraphContext,"article_title":articleTitle})
  return pd.DataFrame(fields)

In [7]:
trainingData = readFile("train-v1.1.json")
trainingData

Unnamed: 0,answer_start,answer_text,article_title,paragraph_context,question
0,515,Saint Bernadette Soubirous,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...
1,188,a copper statue of Christ,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?
2,279,the Main Building,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...
3,381,a Marian place of prayer and reflection,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?
4,92,a golden statue of the Virgin Mary,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...
5,248,September 1876,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...
6,441,twice,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",How often is Notre Dame's the Juggler published?
7,598,The Observer,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",What is the daily student paper at Notre Dame ...
8,126,three,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",How many student news papers are found at Notr...
9,908,1987,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",In what year did the student paper Common Sens...


In [0]:

trainingData["question"] = trainingData["question"].str.lower()
trainingData["article_title"] = trainingData["article_title"].str.lower()
trainingData["paragraph_context"] = trainingData["paragraph_context"].str.lower()
trainingData["answer_text"] = trainingData["answer_text"].str.lower()
trainingData["answer_start"] = pd.to_numeric(trainingData["answer_start"])

In [9]:
from sys import getsizeof
def summaryStatistics(series):
    numberOfWords = series.apply(lambda x: len(str(x).split(" ")))
    averageNumberOfWords = sum(numberOfWords) / len(numberOfWords)
    return "average: " + str(averageNumberOfWords) + "maximum: " + str(max(numberOfWords)) + " minimum: " +str(min(numberOfWords))
print("Words in each question: ",summaryStatistics(trainingData["question"]))
print("Words in each article title: ",summaryStatistics(trainingData["article_title"]))
print("Words in each context: ",summaryStatistics(trainingData["paragraph_context"]))
print("Words in each answer: ",summaryStatistics(trainingData["answer_text"]))

Words in each question:  average: 10.397504537723034maximum: 25601 minimum: 1
Words in each article title:  average: 1.0maximum: 1 minimum: 1
Words in each context:  average: 119.76870740533568maximum: 653 minimum: 20
Words in each answer:  average: 3.1621822166919715maximum: 43 minimum: 1


In [10]:
devData = readFile("dev-v1.1.json")
devData["question"] = devData["question"].str.lower()
devData["article_title"] = devData["article_title"].str.lower()
devData["paragraph_context"] = devData["paragraph_context"].str.lower()
devData["answer_text"] = devData["answer_text"].str.lower()
devData["answer_start"] = pd.to_numeric(devData["answer_start"])
print("Finished loading dev data and lowering appropriate columns.")

Finished loading dev data and lowering appropriate columns.


## Assemble data into X and Y datasets.

In [0]:
#X_1 training sets.
X_1_train_text = trainingData["question"]
Y_1_train_text = trainingData["article_title"]

In [0]:

X_2_train_text = trainingData[["question","paragraph_context"]]
Y_2_train = trainingData["answer_start"]

In [13]:
X_2_train_text[0:10]

Unnamed: 0,question,paragraph_context
0,to whom did the virgin mary allegedly appear i...,"architecturally, the school has a catholic cha..."
1,what is in front of the notre dame main building?,"architecturally, the school has a catholic cha..."
2,the basilica of the sacred heart at notre dame...,"architecturally, the school has a catholic cha..."
3,what is the grotto at notre dame?,"architecturally, the school has a catholic cha..."
4,what sits on top of the main building at notre...,"architecturally, the school has a catholic cha..."
5,when did the scholastic magazine of notre dame...,"as at most other universities, notre dame's st..."
6,how often is notre dame's the juggler published?,"as at most other universities, notre dame's st..."
7,what is the daily student paper at notre dame ...,"as at most other universities, notre dame's st..."
8,how many student news papers are found at notr...,"as at most other universities, notre dame's st..."
9,in what year did the student paper common sens...,"as at most other universities, notre dame's st..."


####Get maximum answer start.



In [0]:

max_answer_start = int(pd.concat((trainingData["answer_text"],devData["answer_text"])).str.len().max())


####Metrics

In [0]:
from tensorflow.keras import backend as K
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### Integer encode text

Used for manual encoding of text into integers.

In [0]:
strings = pd.concat((trainingData,devData)).drop("answer_start",axis=1)
strings = strings.values.flatten()
textTokenizer = Tokenizer()
textTokenizer.fit_on_texts(strings)

In [0]:
questionsTokenized_train = textTokenizer.texts_to_sequences(trainingData["question"])
contextTokenized_train = textTokenizer.texts_to_sequences(trainingData["paragraph_context"])

## Transfer learning RNN

Often, in machine learning, it is useful to use similar, pre-trained models. This can reduce training time and produce a better model.

In [0]:
# Load previous model.

In [0]:
# LSTM with Dropout for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 88000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)


In [0]:
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           2816000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 2,840,897
Trainable params: 2,840,897
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# Compile and fit the transfer model.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
from keras.callbacks import ModelCheckpoint
model.fit(X_train, y_train, epochs=3, validation_split=0.2, batch_size=64,callbacks=[ModelCheckpoint('lstm-best.h5', verbose=1, monitor='val_f1',save_best_only=True, mode='auto')])

Train on 20000 samples, validate on 5000 samples
Epoch 1/3


KeyboardInterrupt: ignored

In [0]:
from keras.models import load_model

In [0]:
transfer_model = load_model("lstm-best.h5",custom_objects={'f1':f1})
transfer_model.evaluate(X_test,y_test)



[0.349451700668335, 0.8505832393836975]

In [0]:
new_model = Sequential()

new_model.add(transfer_model.layers[0])
new_model.add(transfer_model.layers[1])
new_model.add(transfer_model.layers[2])
new_model.add(transfer_model.layers[3])
new_model.add(Dense(239,activation="softmax"))
new_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           2816000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 239)               15535     
Total params: 2,856,367
Trainable params: 2,856,367
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [0]:
new_model.compile("adam","categorical_crossentropy",metrics=['accuracy',f1])
new_model_checkpoint = ModelCheckpoint('answers_network-rnn-best.h5', verbose=1, monitor='val_f1',save_best_only=True, mode='auto')
new_model.fit(x=pad_sequences(,y=Y_2_train,callbacks=[new_model_checkpoint])

ValueError: ignored

## Second neural network - non-recurrent/no transfer learning

This neural network is used to generate answers from the questions and articles. It works by first reading the relevant article and using the question to find the answer.

In [0]:
#Build the neural network.
from math import log
inputShape_second = X_2_train_num.shape[1:3]
print(inputShape_second)
answers_shape = Y_2_train_num.shape
print(answers_shape[1])
# Find the vocabulary length.
vocabularyLength = np.concatenate((X_2_train_num,X_2_dev_num)).max() + 1

NameError: ignored

In [0]:
questions = Input(shape=(questionsTrain.shape[1],))
context = Input(shape=(contextTrain.shape[1],))
embedding_1 = Embedding(vocabularyLength,16)(questions)
#answers_network.add(Dense(16))
flatten_1 = Flatten()(embedding_1)
hidden = Dense(16)(Dense(8)(flatten_1))
hidden_2 = Dense(16)(hidden)
hidden_3 = Dense(16)(hidden_2)
dropout = Dropout(0.45)(hidden_3)
output = Dense(answers_shape[1],activation='softmax')(dropout)
answers_network = Model(inputs=[questions,context],outputs=output)
answers_network.summary()

NameError: ignored

In [0]:
answers_network.compile("adam","binary_crossentropy",metrics=[f1])

#### Train the neural network.

In [0]:
answers_network_checkpoint = ModelCheckpoint('answers_network-non-rnn-best.h5', verbose=1, monitor='val_f1',save_best_only=True, mode='auto') 

In [0]:
print(answers_network.metrics_names)

In [0]:

answers_network.fit(x=[questionsTrain,contextTrain],y=Y_2_train_num,callbacks=[answers_network_checkpoint],validation_split=0.2,verbose=True,epochs=9)
#print("Weights: ",questions_article_model.get_weights())

#### Loading the model with best fit.

In [0]:
answers_network.load_weights('answers_network-non-rnn-best.h5')

In [0]:

answers_network.evaluate([questionsDev,contextDev],Y_2_dev_num)

## Transfer learning using Google's Bert

Our model has a very poor F1 score. Let's see if we can't build a better model. We'll use Google's BERT deep learning network, which is so good that the networks with SQuaD 2.0's highest recorded F1 score use it.  

The basic idea behind BERT is this: Neural networks rely on numerical vectors. Similar sentences and phrases should produce similar vectors.

"It is not possible to train bidirectional models by simply conditioning each word on words before and after it. Doing this would allow the word that’s being predicted to indirectly see itself in a multi-layer model. To solve this, Google researchers used a straightforward technique of masking out some words in the input and condition each word bidirectionally in order to predict the masked words. This idea is not new, but BERT is the first technique where it was successfully used to pre-train a deep neural network." (packtpub.com)

#### Preparing data for BERT.

In [12]:
# BERT data
num_rows = X_2_train_text.shape[0]
ids = np.arange(num_rows)
print(ids.shape)
labels = Y_2_train


(87599,)


TypeError: ignored

### Downloading from TensorFlow Hub

In [0]:
import tensorflow_hub as hub
from tensorflow.dtypes import as_string
embedding_shape = (32,32,32)
def embedding(x):
    module = hub.Module("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1", trainable=True)
    inputs = dict(
    input_ids=ids,
    input_mask=masks,
    segment_ids=x)
    return_tensor = module(inputs, signature="tokens", as_dict=True)
    try:
      return_tensor = tf.cast(return_tensor,tf.float32)
    except ValueError:
      print("Embedding function could not cast return_tensor to float")
    print ("Returning tensor with shape",return_tensor.get_shape())
    return return_tensor


### The neural network.

In [0]:
input_questions = Input(shape=(1,),dtype=tf.string)
questions_embedded = Lambda(embedding)(input_questions)
input_context = Input(shape=(1,),dtype=tf.string)
context_embedded = Lambda(embedding)(input_context)
dense_layer_1 = Dense(1024)(questions_embedded)
dense_layer_2 = Dense(1024)(context_embedded)
main_branch_inputs = Add()([dense_layer_1,dense_layer_2])
main_outputs = Dense(max_answer_start+1,activation="softmax")(Dropout(0.5)(Flatten()(main_branch_inputs)))
answers_network = tf.keras.Model(inputs=[input_questions,input_context],outputs=main_outputs)

TypeError: ignored

In [0]:

answers_network.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
lambda_8 (Lambda)               (None, 128)          0           input_10[0][0]                   
__________________________________________________________________________________________________
lambda_9 (Lambda)               (None, 128)          0           input_11[0][0]                   
__________________________________________________________________________________________________
dense_11 (

In [0]:
answers_network.compile("rmsprop","categorical_crossentropy",metrics=[f1])
answers_network_checkpoint = ModelCheckpoint('answers_network-rnn-best.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  y=session.run(tf.one_hot(Y_2_train,max_answer_start+1))
  print(X_2_train_text.shape)
  print(y.shape)
  x = np.hsplit(X_2_train_text,2)
  answers_network.fit(x=x,y=y,callbacks=[answers_network_checkpoint],epochs=8,validation_split=0.2)

(87599, 2)
(87599, 240)
Train on 70079 samples, validate on 17520 samples
Epoch 1/8
Epoch 00001: val_loss improved from inf to 2.33924, saving model to answers_network-rnn-best.h5
Epoch 2/8
Epoch 00002: val_loss did not improve from 2.33924
Epoch 3/8
Epoch 00003: val_loss did not improve from 2.33924
Epoch 4/8
Epoch 00004: val_loss did not improve from 2.33924
Epoch 5/8
Epoch 00005: val_loss did not improve from 2.33924
Epoch 6/8
Epoch 00006: val_loss did not improve from 2.33924
Epoch 7/8
Epoch 00007: val_loss did not improve from 2.33924
Epoch 8/8
Epoch 00008: val_loss did not improve from 2.33924
