## Setup

Load Python modules.

In [1]:
!pip3 install symspellpy
!pip3 install keras_bert
from symspellpy.symspellpy import SymSpell
import numpy as np
import pandas as pd
import os
from tensorflow.python.client import device_lib
import tensorflow as tf
from tensorflow.keras.layers import Embedding,Dropout,Lambda,Dense,Input,LSTM,Concatenate,Flatten,Add,Reshape,GlobalAveragePooling1D,GlobalAveragePooling2D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model,Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
import sys
assert sys.version_info[0] >= 3
print(tf.VERSION)
print("Modules loaded!")

Collecting symspellpy
  Downloading https://files.pythonhosted.org/packages/2f/c2/8a15e2d16d22644afa208317a445f46b1e3157ad681dc5f31d6a25a8113e/symspellpy-6.3.9-py3-none-any.whl
Installing collected packages: symspellpy
Successfully installed symspellpy-6.3.9
Collecting keras_bert
  Downloading https://files.pythonhosted.org/packages/3c/1a/54cd6e0832fc457de5fe1d0f703bd519690a466060a886bea4e74fb47771/keras-bert-0.75.0.tar.gz
Collecting keras-transformer>=0.30.0 (from keras_bert)
  Downloading https://files.pythonhosted.org/packages/83/4c/972325395b38547df8a74be89e980922c1dc9f921cc2eb613e086c6bc632/keras-transformer-0.30.0.tar.gz
Collecting keras-pos-embd>=0.10.0 (from keras-transformer>=0.30.0->keras_bert)
  Downloading https://files.pythonhosted.org/packages/09/70/b63ed8fc660da2bb6ae29b9895401c628da5740c048c190b5d7107cadd02/keras-pos-embd-0.11.0.tar.gz
Collecting keras-multi-head>=0.22.0 (from keras-transformer>=0.30.0->keras_bert)
  Downloading https://files.pythonhosted.org/packages/4

## Preprocessing

For the preprocessing step, we will create two dictionaries. One will be used to map questions to articles, while the other will be used to map questions and the contents of the articles to the answers.

### Loading JSON datasets

In [3]:
!wget -nc https://github.com/moonman239/Capstone-project/raw/master/data.zip -O data.zip
!unzip data.zip
assert os.path.isfile("train-v1.1.json"),"Non-existent file"
print("JSON datasets downloaded.")

File ‘data.zip’ already there; not retrieving.
Archive:  data.zip
replace dev-v1.1.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: JSON datasets downloaded.


In [0]:
import json
import re
#regex = re.compile(r'\W+')
def readFile(filename):
  with open(filename) as file:
    fields = []
    JSON = json.loads(file.read())
    articles = []
    for article in JSON["data"]:
      articleTitle = article["title"]
      article_body = []
      for paragraph in article["paragraphs"]:
        paragraphContext = paragraph["context"]
        article_body.append(paragraphContext)
        for qas in paragraph["qas"]:
          question = qas["question"]
          answer = qas["answers"][0]
          fields.append({"question":question,"answer_text":answer["text"],"answer_start":answer["answer_start"],"paragraph_context":paragraphContext,"article_title":articleTitle})
      article_body = "\\n".join(article_body)
      article = {"title":articleTitle,"body":article_body}
      articles.append(article)
  fields = pd.DataFrame(fields)
  #fields["question"] = fields["question"].str.replace(regex," ")
  assert not (fields["question"].str.contains("catalanswhat").any())
  #fields["paragraph_context"] = fields["paragraph_context"].str.replace(regex," ")
  #fields["answer_text"] = fields["answer_text"].str.replace(regex," ")
  assert not (fields["paragraph_context"].str.contains("catalanswhat").any())
  fields["article_title"] = fields["article_title"].str.replace("_"," ")
  assert not (fields["article_title"].str.contains("catalanswhat").any())
  return fields,articles

In [0]:
trainingData,training_articles = readFile("train-v1.1.json")
from sklearn.model_selection import train_test_split
trainingData,crossValidationData = train_test_split(trainingData)
devData,dev_articles = readFile("dev-v1.1.json")

In [6]:
trainingData

Unnamed: 0,answer_start,answer_text,article_title,paragraph_context,question
0,515,Saint Bernadette Soubirous,University of Notre Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...
1,188,a copper statue of Christ,University of Notre Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?
2,279,the Main Building,University of Notre Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...
3,381,a Marian place of prayer and reflection,University of Notre Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?
4,92,a golden statue of the Virgin Mary,University of Notre Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...
5,248,September 1876,University of Notre Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...
6,441,twice,University of Notre Dame,"As at most other universities, Notre Dame's st...",How often is Notre Dame's the Juggler published?
7,598,The Observer,University of Notre Dame,"As at most other universities, Notre Dame's st...",What is the daily student paper at Notre Dame ...
8,126,three,University of Notre Dame,"As at most other universities, Notre Dame's st...",How many student news papers are found at Notr...
9,908,1987,University of Notre Dame,"As at most other universities, Notre Dame's st...",In what year did the student paper Common Sens...


In [7]:
devData

Unnamed: 0,answer_start,answer_text,article_title,paragraph_context,question
0,177,Denver Broncos,Super Bowl 50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...
1,249,Carolina Panthers,Super Bowl 50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...
2,403,"Santa Clara, California",Super Bowl 50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?
3,177,Denver Broncos,Super Bowl 50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?
4,488,gold,Super Bowl 50,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...
5,487,"""golden anniversary""",Super Bowl 50,Super Bowl 50 was an American football game to...,What was the theme of Super Bowl 50?
6,334,"February 7, 2016",Super Bowl 50,Super Bowl 50 was an American football game to...,What day was the game played on?
7,133,American Football Conference,Super Bowl 50,Super Bowl 50 was an American football game to...,What is the AFC short for?
8,487,"""golden anniversary""",Super Bowl 50,Super Bowl 50 was an American football game to...,What was the theme of Super Bowl 50?
9,133,American Football Conference,Super Bowl 50,Super Bowl 50 was an American football game to...,What does AFC stand for?


### Convert strings to lowercase.

In [0]:
#@title
trainingData["question"] = trainingData["question"].str.lower()
trainingData["article_title"] = trainingData["article_title"].str.lower()
trainingData["paragraph_context"] = trainingData["paragraph_context"].str.lower()
trainingData["answer_text"] = trainingData["answer_text"].str.lower()
trainingData["answer_start"] = pd.to_numeric(trainingData["answer_start"])

In [0]:
#@title
devData["question"] = devData["question"].str.lower()
devData["article_title"] = devData["article_title"].str.lower()
devData["paragraph_context"] = devData["paragraph_context"].str.lower()
devData["answer_text"] = devData["answer_text"].str.lower()
devData["answer_start"] = pd.to_numeric(devData["answer_start"])
print("Finished loading dev data and lowering appropriate columns.")

Finished loading dev data and lowering appropriate columns.


In [0]:
X_2_train = trainingData[["question","paragraph_context"]]
Y_2_train = trainingData["answer_start"]

### Statistics

In [0]:
from sys import getsizeof
def vocabulary():
  from sklearn.feature_extraction.text import CountVectorizer
  data_frame = trainingData + devData
  data_frame = data_frame.astype("str")
  phrases = []
  for idx,row in data_frame.iterrows():
    phrases.append(row["question"])
    phrases.append(row["paragraph_context"])
    phrases.append(row["article_title"])
  words = CountVectorizer().fit(phrases).get_feature_names()
  return words
def vocabularySize():
  return len(vocabulary())
def summaryStatistics(series):
    numberOfWords = series.apply(lambda x: len(str(x).split(" ")))
    averageNumberOfWords = sum(numberOfWords) / len(numberOfWords)
    return "average: " + str(averageNumberOfWords) + "maximum: " + str(max(numberOfWords)) + " minimum: " +str(min(numberOfWords))
print("Size of vocabulary: ", vocabularySize())
print("Words in each question: ",summaryStatistics(trainingData["question"]))
print("Words in each article title: ",summaryStatistics(trainingData["article_title"]))
print("Words in each context: ",summaryStatistics(trainingData["paragraph_context"]))
print("Words in each answer: ",summaryStatistics(trainingData["answer_text"]))

Size of vocabulary:  34236
Words in each question:  average: 10.397504537723034maximum: 25601 minimum: 1
Words in each article title:  average: 2.006621080149317maximum: 9 minimum: 1
Words in each context:  average: 119.76870740533568maximum: 653 minimum: 20
Words in each answer:  average: 3.1621822166919715maximum: 43 minimum: 1


####Get maximum ** possible** answer start.



In [0]:

max_possible_answer_start = int(pd.concat((trainingData["answer_text"],devData["answer_text"])).str.len().max())


####Metrics

In [0]:
from tensorflow.keras import backend as K
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### Integer encode text

Used for manual encoding of text into integers.

In [0]:
strings = pd.concat((trainingData,devData)).drop("answer_start",axis=1)
strings = strings.values.flatten()
textTokenizer = Tokenizer()
textTokenizer.fit_on_texts(strings)

In [0]:
# Get length of vocabulary.
vocabulary_length = len(textTokenizer.word_index) + 1

In [0]:
# Get maximum length of all questions and contexts.
max_length_questions = pd.concat((trainingData["question"],devData["question"])).str.split().len().max
max_length_context = pd.concat((trainingData["paragraph_context"],devData["paragraph_context"])).str.len().max()

AttributeError: ignored

In [0]:
questionsTokenized_train = pad_sequences(np.array(textTokenizer.texts_to_sequences(trainingData["question"])),maxlen=max_length_questions)
contextTokenized_train = pad_sequences(np.array(textTokenizer.texts_to_sequences(trainingData["paragraph_context"])),maxlen=max_length_context)

In [0]:
questionsTokenized_dev = textTokenizer.texts_to_sequences(devData["question"])
contextTokenized_dev = textTokenizer.texts_to_sequences(devData["paragraph_context"])

In [0]:
# Pad sequences.
questionsTokenized_train = pad_sequences(questionsTokenized_train,maxlen=max_length_questions)
contextTokenized_train = pad_sequences(contextTokenized_train,maxlen=max_length_context)

## Second neural network - non-recurrent/no transfer learning

This neural network is used to generate answers from the questions and articles. It works by first reading the relevant article and using the question to find the answer.

In [0]:
#Build the neural network.
from math import log
inputShape_second = X_2_train_num.shape[1:3]
print(inputShape_second)
answers_shape = Y_2_train_num.shape
print(answers_shape[1])
# Find the vocabulary length.
vocabularyLength = np.concatenate((X_2_train_num,X_2_dev_num)).max() + 1

NameError: ignored

In [0]:
questions = Input(shape=(questionsTrain.shape[1],))
context = Input(shape=(contextTrain.shape[1],))
embedding_1 = Embedding(vocabularyLength,16)(questions)
#answers_network.add(Dense(16))
flatten_1 = Flatten()(embedding_1)
hidden = Dense(16)(Dense(8)(flatten_1))
hidden_2 = Dense(16)(hidden)
hidden_3 = Dense(16)(hidden_2)
dropout = Dropout(0.45)(hidden_3)
output = Dense(answers_shape[1],activation='softmax')(dropout)
answers_network = Model(inputs=[questions,context],outputs=output)
answers_network.summary()

NameError: ignored

In [0]:
answers_network.compile("adam","binary_crossentropy",metrics=[f1])

#### Train the neural network.

In [0]:
answers_network_checkpoint = ModelCheckpoint('answers_network-non-rnn-best.h5', verbose=1, monitor='val_f1',save_best_only=True, mode='auto') 

In [0]:
print(answers_network.metrics_names)

In [0]:

answers_network.fit(x=[questionsTrain,contextTrain],y=Y_2_train_num,callbacks=[answers_network_checkpoint],validation_split=0.2,verbose=True,epochs=9)
#print("Weights: ",questions_article_model.get_weights())

#### Loading the model with best fit.

In [0]:
answers_network.load_weights('answers_network-non-rnn-best.h5')

In [0]:

answers_network.evaluate([questionsDev,contextDev],Y_2_dev_num)

## Transfer learning using Google's Bert

Our model has a very poor F1 score. Let's see if we can't build a better model. We'll use Google's BERT deep learning network, which is so good that the networks with SQuaD 2.0's highest recorded F1 score use it.  

The basic idea behind BERT is this: Neural networks rely on numerical vectors. Similar sentences and phrases should produce similar vectors.

"It is not possible to train bidirectional models by simply conditioning each word on words before and after it. Doing this would allow the word that’s being predicted to indirectly see itself in a multi-layer model. To solve this, Google researchers used a straightforward technique of masking out some words in the input and condition each word bidirectionally in order to predict the masked words. This idea is not new, but BERT is the first technique where it was successfully used to pre-train a deep neural network." (packtpub.com)

In [8]:
# @title Preparation
!pip install -q keras-bert
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip -o uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [0]:
# @title Environment
import os

pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

# TF_KERAS must be added to environment variables in order to use TPU
os.environ['TF_KERAS'] = '1'

In [10]:
# @title Initialize TPU Strategy

import tensorflow as tf
from keras_bert import get_custom_objects

TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)
tf.contrib.distribute.initialize_tpu_system(resolver)
strategy = tf.contrib.distribute.TPUStrategy(resolver)

W0828 21:53:01.076310 139887942354816 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [0]:
# @title Load Basic Model
import codecs
from keras_bert import load_trained_model_from_checkpoint

token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

In [57]:
# @title Tokenization
import numpy as np
from keras_bert import Tokenizer
tokenizer = Tokenizer(token_dict)
def tokenize(text):
  tokens = tokenizer.tokenize(text)
  indices, segments = tokenizer.encode(first=text, max_len=512)
  return indices,segments
def feature_extraction(texts):
  return_values = []
  for text_ in texts:
    try:
      text_.split(" ")
    except AttributeError as e:
      raise TypeError("Expected array of strings.")
    try:
      indices,segments = tokenize(text_)
      predicts = model.predict([np.array([indices] * 8), np.array([segments] * 8)])[0]
      return_values.append(predicts)
    except ValueError as v:
      print(v)
  return_values = np.array(return_values)
  return return_values
text = "She sells seashells by the seashore."
text_array = np.asarray([text,"She does not sell seashells by the seashore"],dtype='object')
print(text_array.dtype)
print("text")
print(feature_extraction(text_array).shape)

object
text


KeyboardInterrupt: ignored

In [0]:
def batch_generator(dataset,batch_size):
  while True:
    batch = dataset.sample(n=batch_size,replace=True)
    try:
      batch_features = feature_extraction(batch["question"].values)
    except ValueError as v:
      print("Oops, I'm getting a ValueError for batch_features.")
      print(v)
    try:
      batch_targets = batch["answer_start"]
    except ValueError as v:
      print("Oops, I'm getting a ValueError for batch_targets.")
      print(v)
    
    yield batch_features,batch_targets


In [43]:
test_dataset = pd.DataFrame({"question":["Does she sell seashells by the seashore"],"paragraph_context":["She sells seashells by the seashore"],"answer_start":[0]})
test_dataset

Unnamed: 0,question,paragraph_context,answer_start
0,Does she sell seashells by the seashore,She sells seashells by the seashore,0


In [49]:
for x,y in batch_generator(test_dataset,2):
  print(x.shape)
  print(y.shape)
  assert x.shape[0] == y.shape[0], "Shape mismatch."
  print(x.dtype)
  print(x)
  print(y)
  break

Prediction shape:  (512, 768)
Prediction shape:  (512, 768)
Return values shape:  (2, 512, 768)
(2, 512, 768)
(2,)
float32
[[[-0.31885183 -0.12924516 -0.18322109 ... -0.35949844  0.18972889
   -0.10933438]
  [ 0.311414    0.17972124 -0.10798444 ... -0.11876195  0.56598294
   -0.31934875]
  [ 0.5496158  -1.0750685  -0.24597827 ... -0.9492761   0.16425355
   -0.2893064 ]
  ...
  [ 0.29522124 -0.09924655  0.21650063 ... -0.04205173  0.06311297
   -0.4185968 ]
  [ 0.1601033  -0.13200733  0.1676978  ...  0.00439339  0.05115967
   -0.49891245]
  [ 0.05326451 -0.19191569  0.1559972  ... -0.06486911  0.10209841
   -0.3616868 ]]

 [[-0.31885183 -0.12924516 -0.18322109 ... -0.35949844  0.18972889
   -0.10933438]
  [ 0.311414    0.17972124 -0.10798444 ... -0.11876195  0.56598294
   -0.31934875]
  [ 0.5496158  -1.0750685  -0.24597827 ... -0.9492761   0.16425355
   -0.2893064 ]
  ...
  [ 0.29522124 -0.09924655  0.21650063 ... -0.04205173  0.06311297
   -0.4185968 ]
  [ 0.1601033  -0.13200733  0.167

In [24]:

#@title Model Summary
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 512)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 512, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 512, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

In [25]:
#@title Incorporate model into answers network.
answers_network = Sequential()
answers_network.add(Dense(32,input_shape=(512,768)))
answers_network.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512, 32)           24608     
Total params: 24,608
Trainable params: 24,608
Non-trainable params: 0
_________________________________________________________________


In [0]:
nn_2_train = trainingData[["question","paragraph_context","answer_start"]]
nn_2_cross_validation = crossValidationData[["question","paragraph_context","answer_start"]]


In [0]:
nn_2_train_features = feature_extraction(nn_2_train)
nn_2_cross_validation_features = feature_extraction(nn_2_cross_validation)

In [51]:
#@title Fitting neural network
print("Done creating features.")
answers_network.compile("rmsprop","categorical_crossentropy")
answers_network_checkpoint = ModelCheckpoint('answers_network-rnn-best.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto')
answers_network.fit(x=callbacks=[answers_network_checkpoint])

Done creating features.
Epoch 1/8


InvalidArgumentError: ignored