## Setup

Load Python modules.

In [3]:
import numpy as np
import pandas as pd
import os
from tensorflow.python.client import device_lib
import tensorflow as tf
from tensorflow.keras.layers import Embedding,Dropout,Lambda,Dense,Input,LSTM,Concatenate,Flatten,Add,Reshape,GlobalAveragePooling1D,GlobalAveragePooling2D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
import sys
assert sys.version_info[0] >= 3
!pip install keras_bert
print("Set up!")

Collecting keras_bert
  Downloading https://files.pythonhosted.org/packages/1e/f1/02896ce76c132761ba21fed5ea9461ffa7c69e7a9beb76ff7657d69f98f3/keras-bert-0.55.1.tar.gz
Collecting keras-pos-embd==0.10.0 (from keras_bert)
  Downloading https://files.pythonhosted.org/packages/01/e4/8b8519779d84c412c2c1bbf4637066800c3b04463da059a5d220fa904133/keras-pos-embd-0.10.0.tar.gz
Collecting keras-transformer==0.23.0 (from keras_bert)
  Downloading https://files.pythonhosted.org/packages/31/0d/0b62504dc9a6377d035b651b9222c95f9094c002bdcb17162ad4863de6ba/keras-transformer-0.23.0.tar.gz
Collecting keras-multi-head==0.20.0 (from keras-transformer==0.23.0->keras_bert)
  Downloading https://files.pythonhosted.org/packages/3b/0f/f1a66974db9c328ba675c1df63f8a68c5c0f3e181f1e74db4f3b1a72a6df/keras-multi-head-0.20.0.tar.gz
Collecting keras-layer-normalization==0.12.0 (from keras-transformer==0.23.0->keras_bert)
  Downloading https://files.pythonhosted.org/packages/95/76/42878fe46bff8458d8aa1da50bfdf705d632d33

## Preprocessing

For the preprocessing step, we will create two dictionaries. One will be used to map questions to articles, while the other will be used to map questions and the contents of the articles to the answers.

### Loading JSON datasets

In [4]:
currentDirectory = os.getcwd()
from google.colab import drive
driveBase = os.path.join(os.path.dirname(os.path.abspath("train-v1.1.json")),"drive")
drive.mount(driveBase)
hardDrive = os.path.join(driveBase,"My Drive")
os.chdir(hardDrive)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import json
import re
regex = re.compile(r'\W+')
def readFile(filename):
  with open(filename) as file:
    fields = []
    JSON = json.loads(file.read())
    for article in JSON["data"]:
      articleTitle = article["title"]
      for paragraph in article["paragraphs"]:
        paragraphContext = paragraph["context"]
        for qas in paragraph["qas"]:
          question = qas["question"]
          for answer in qas["answers"]:
            fields.append({"question":question,"answer_text":answer["text"],"answer_start":answer["answer_start"],"paragraph_context":paragraphContext,"article_title":articleTitle})
  fields = pd.DataFrame(fields)
  fields["question"] = fields["question"].str.replace(regex," ")
  assert not (fields["question"].str.contains("catalanswhat").any())
  fields["paragraph_context"] = fields["paragraph_context"].str.replace(regex," ")
  fields["answer_text"] = fields["answer_text"].str.replace(regex," ")
  assert not (fields["paragraph_context"].str.contains("catalanswhat").any())
  fields["article_title"] = fields["article_title"].str.replace("_"," ")
  assert not (fields["article_title"].str.contains("catalanswhat").any())
  return fields

In [0]:
trainingData = readFile("train-v1.1.json")


In [0]:
trainingData

Unnamed: 0,answer_start,answer_text,article_title,paragraph_context,question
0,515,Saint Bernadette Soubirous,University of Notre Dame,Architecturally the school has a Catholic char...,To whom did the Virgin Mary allegedly appear i...
1,188,a copper statue of Christ,University of Notre Dame,Architecturally the school has a Catholic char...,What is in front of the Notre Dame Main Building
2,279,the Main Building,University of Notre Dame,Architecturally the school has a Catholic char...,The Basilica of the Sacred heart at Notre Dame...
3,381,a Marian place of prayer and reflection,University of Notre Dame,Architecturally the school has a Catholic char...,What is the Grotto at Notre Dame
4,92,a golden statue of the Virgin Mary,University of Notre Dame,Architecturally the school has a Catholic char...,What sits on top of the Main Building at Notre...
5,248,September 1876,University of Notre Dame,As at most other universities Notre Dame s stu...,When did the Scholastic Magazine of Notre dame...
6,441,twice,University of Notre Dame,As at most other universities Notre Dame s stu...,How often is Notre Dame s the Juggler published
7,598,The Observer,University of Notre Dame,As at most other universities Notre Dame s stu...,What is the daily student paper at Notre Dame ...
8,126,three,University of Notre Dame,As at most other universities Notre Dame s stu...,How many student news papers are found at Notr...
9,908,1987,University of Notre Dame,As at most other universities Notre Dame s stu...,In what year did the student paper Common Sens...


In [0]:

trainingData["question"] = trainingData["question"].str.lower()
trainingData["article_title"] = trainingData["article_title"].str.lower()
trainingData["paragraph_context"] = trainingData["paragraph_context"].str.lower()
trainingData["answer_text"] = trainingData["answer_text"].str.lower()
trainingData["answer_start"] = pd.to_numeric(trainingData["answer_start"])

In [0]:
devData = readFile("dev-v1.1.json")
devData["question"] = devData["question"].str.lower()
devData["article_title"] = devData["article_title"].str.lower()
devData["paragraph_context"] = devData["paragraph_context"].str.lower()
devData["answer_text"] = devData["answer_text"].str.lower()
devData["answer_start"] = pd.to_numeric(devData["answer_start"])
print("Finished loading dev data and lowering appropriate columns.")

Finished loading dev data and lowering appropriate columns.


In [0]:
from sys import getsizeof
def vocabulary():
  from sklearn.feature_extraction.text import CountVectorizer
  data_frame = trainingData + devData
  data_frame = data_frame.astype("str")
  phrases = []
  for idx,row in data_frame.iterrows():
    phrases.append(row["question"])
    phrases.append(row["paragraph_context"])
    phrases.append(row["article_title"])
  for string in phrases:
    assert "catalanswhat" not in str(string)
  words = CountVectorizer().fit(phrases).get_feature_names()
  assert "raisedin" not in words
  return words
def vocabularySize():
  return len(vocabulary())
def summaryStatistics(series):
    numberOfWords = series.apply(lambda x: len(str(x).split(" ")))
    averageNumberOfWords = sum(numberOfWords) / len(numberOfWords)
    return "average: " + str(averageNumberOfWords) + "maximum: " + str(max(numberOfWords)) + " minimum: " +str(min(numberOfWords))
print("Size of vocabulary: ", vocabularySize())
print("Words in each question: ",summaryStatistics(trainingData["question"]))
print("Words in each article title: ",summaryStatistics(trainingData["article_title"]))
print("Words in each context: ",summaryStatistics(trainingData["paragraph_context"]))
print("Words in each answer: ",summaryStatistics(trainingData["answer_text"]))

Size of vocabulary:  53662
Words in each question:  average: 11.217582392493066maximum: 41 minimum: 1
Words in each article title:  average: 2.006621080149317maximum: 9 minimum: 1
Words in each context:  average: 123.7916528727497maximum: 678 minimum: 21
Words in each answer:  average: 3.356339684242971maximum: 43 minimum: 1


####Get maximum ** possible** answer start.



In [0]:

max_possible_answer_start = int(pd.concat((trainingData["answer_text"],devData["answer_text"])).str.len().max())


####Metrics

In [0]:
from tensorflow.keras import backend as K
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### Integer encode text

Used for manual encoding of text into integers.

In [0]:
strings = pd.concat((trainingData,devData)).drop("answer_start",axis=1)
strings = strings.values.flatten()
textTokenizer = Tokenizer()
textTokenizer.fit_on_texts(strings)

In [0]:
# Get length of vocabulary.
vocabulary_length = len(textTokenizer.word_index) + 1

In [0]:
# Get maximum length of all questions and contexts.
max_length_questions = pd.concat((trainingData["question"],devData["question"])).str.split().len().max
max_length_context = pd.concat((trainingData["paragraph_context"],devData["paragraph_context"])).str.len().max()

AttributeError: ignored

In [0]:
questionsTokenized_train = pad_sequences(np.array(textTokenizer.texts_to_sequences(trainingData["question"])),maxlen=max_length_questions)
contextTokenized_train = pad_sequences(np.array(textTokenizer.texts_to_sequences(trainingData["paragraph_context"])),maxlen=max_length_context)

In [0]:
questionsTokenized_dev = textTokenizer.texts_to_sequences(devData["question"])
contextTokenized_dev = textTokenizer.texts_to_sequences(devData["paragraph_context"])

In [0]:
# Pad sequences.
questionsTokenized_train = pad_sequences(questionsTokenized_train,maxlen=max_length_questions)
contextTokenized_train = pad_sequences(contextTokenized_train,maxlen=max_length_context)

## Second neural network - non-recurrent/no transfer learning

This neural network is used to generate answers from the questions and articles. It works by first reading the relevant article and using the question to find the answer.

In [0]:
#Build the neural network.
from math import log
inputShape_second = X_2_train_num.shape[1:3]
print(inputShape_second)
answers_shape = Y_2_train_num.shape
print(answers_shape[1])
# Find the vocabulary length.
vocabularyLength = np.concatenate((X_2_train_num,X_2_dev_num)).max() + 1

NameError: ignored

In [0]:
questions = Input(shape=(questionsTrain.shape[1],))
context = Input(shape=(contextTrain.shape[1],))
embedding_1 = Embedding(vocabularyLength,16)(questions)
#answers_network.add(Dense(16))
flatten_1 = Flatten()(embedding_1)
hidden = Dense(16)(Dense(8)(flatten_1))
hidden_2 = Dense(16)(hidden)
hidden_3 = Dense(16)(hidden_2)
dropout = Dropout(0.45)(hidden_3)
output = Dense(answers_shape[1],activation='softmax')(dropout)
answers_network = Model(inputs=[questions,context],outputs=output)
answers_network.summary()

NameError: ignored

In [0]:
answers_network.compile("adam","binary_crossentropy",metrics=[f1])

#### Train the neural network.

In [0]:
answers_network_checkpoint = ModelCheckpoint('answers_network-non-rnn-best.h5', verbose=1, monitor='val_f1',save_best_only=True, mode='auto') 

In [0]:
print(answers_network.metrics_names)

In [0]:

answers_network.fit(x=[questionsTrain,contextTrain],y=Y_2_train_num,callbacks=[answers_network_checkpoint],validation_split=0.2,verbose=True,epochs=9)
#print("Weights: ",questions_article_model.get_weights())

#### Loading the model with best fit.

In [0]:
answers_network.load_weights('answers_network-non-rnn-best.h5')

In [0]:

answers_network.evaluate([questionsDev,contextDev],Y_2_dev_num)

## Transfer learning using Google's Bert

Our model has a very poor F1 score. Let's see if we can't build a better model. We'll use Google's BERT deep learning network, which is so good that the networks with SQuaD 2.0's highest recorded F1 score use it.  

The basic idea behind BERT is this: Neural networks rely on numerical vectors. Similar sentences and phrases should produce similar vectors.

"It is not possible to train bidirectional models by simply conditioning each word on words before and after it. Doing this would allow the word that’s being predicted to indirectly see itself in a multi-layer model. To solve this, Google researchers used a straightforward technique of masking out some words in the input and condition each word bidirectionally in order to predict the masked words. This idea is not new, but BERT is the first technique where it was successfully used to pre-train a deep neural network." (packtpub.com)



#### Downloading 


### The neural network.

In [0]:
os.chdir(currentDirectory)

In [21]:
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [0]:
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [0]:
# TF_KERAS must be added to environment variables in order to use TPU
os.environ['TF_KERAS'] = '1'

In [24]:
# @title Load Basic Model
import codecs
from keras_bert import load_trained_model_from_checkpoint
assert os.path.exists(pretrained_path), str(pretrained_path + " does not exist")
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [25]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 512)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 512, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 512, 768)     1536        Input-Segment[0][0]              
__________________________________________________________________________________________________
Embedding-

In [0]:

answers_network.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
lambda_8 (Lambda)               (None, 128)          0           input_10[0][0]                   
__________________________________________________________________________________________________
lambda_9 (Lambda)               (None, 128)          0           input_11[0][0]                   
__________________________________________________________________________________________________
dense_11 (

In [0]:
answers_network.compile("rmsprop","categorical_crossentropy",metrics=[f1])
answers_network_checkpoint = ModelCheckpoint('answers_network-rnn-best.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  y=session.run(tf.one_hot(Y_2_train,max_answer_start+1))
  print(X_2_train_text.shape)
  print(y.shape)
  x = np.hsplit(X_2_train_text,2)
  answers_network.fit(x=x,y=y,callbacks=[answers_network_checkpoint],epochs=8,validation_split=0.2)

(87599, 2)
(87599, 240)
Train on 70079 samples, validate on 17520 samples
Epoch 1/8
Epoch 00001: val_loss improved from inf to 2.33924, saving model to answers_network-rnn-best.h5
Epoch 2/8
Epoch 00002: val_loss did not improve from 2.33924
Epoch 3/8
Epoch 00003: val_loss did not improve from 2.33924
Epoch 4/8
Epoch 00004: val_loss did not improve from 2.33924
Epoch 5/8
Epoch 00005: val_loss did not improve from 2.33924
Epoch 6/8
Epoch 00006: val_loss did not improve from 2.33924
Epoch 7/8
Epoch 00007: val_loss did not improve from 2.33924
Epoch 8/8
Epoch 00008: val_loss did not improve from 2.33924
