<a href="https://colab.research.google.com/github/moonman239/Capstone-project/blob/master/Capstone_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

Load Python modules.

In [1]:
import numpy as np
import pandas as pd
import os
# Verify we are using GPU.
from tensorflow.python.client import device_lib
import tensorflow as tf
from tensorflow.keras.layers import Lambda,Dense,Input,LSTM,concatenate
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
import sys
assert sys.version_info[0] >= 3
print("Set up!")

Set up!


## Preprocessing

For the preprocessing step, we will create two dictionaries. One will be used to map questions to articles, while the other will be used to map questions and the contents of the articles to the answers.

### Loading JSON datasets

In [2]:
from google.colab import drive
driveBase = os.path.join(os.path.dirname(os.path.abspath("train-v1.1.json")),"drive")
drive.mount(driveBase)
hardDrive = os.path.join(driveBase,"My Drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import json
def readFile(filename):
  with open(os.path.join(hardDrive,filename)) as file:
    fields = []
    JSON = json.loads(file.read())
    for article in JSON["data"]:
      articleTitle = article["title"]
      for paragraph in article["paragraphs"]:
        paragraphContext = paragraph["context"]
        for qas in paragraph["qas"]:
          question = qas["question"]
          for answer in qas["answers"]:
            fields.append({"question":question,"answer_text":answer["text"],"answer_start":answer["answer_start"],"paragraph_context":paragraphContext,"article_title":articleTitle})
  return pd.DataFrame(fields)

In [4]:
trainingData = readFile("train-v1.1.json")
trainingData

Unnamed: 0,answer_start,answer_text,article_title,paragraph_context,question
0,515,Saint Bernadette Soubirous,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...
1,188,a copper statue of Christ,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?
2,279,the Main Building,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...
3,381,a Marian place of prayer and reflection,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?
4,92,a golden statue of the Virgin Mary,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...
5,248,September 1876,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...
6,441,twice,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",How often is Notre Dame's the Juggler published?
7,598,The Observer,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",What is the daily student paper at Notre Dame ...
8,126,three,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",How many student news papers are found at Notr...
9,908,1987,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",In what year did the student paper Common Sens...


In [0]:

trainingData["question"] = trainingData["question"].str.lower()
trainingData["article_title"] = trainingData["article_title"].str.lower()
trainingData["paragraph_context"] = trainingData["paragraph_context"].str.lower()
trainingData["answer_text"] = trainingData["answer_text"].str.lower()
trainingData["answer_start"] = pd.to_numeric(trainingData["answer_start"])

In [6]:
from sys import getsizeof
def summaryStatistics(series):
    numberOfWords = series.apply(lambda x: len(str(x).split(" ")))
    averageNumberOfWords = sum(numberOfWords) / len(numberOfWords)
    return "average: " + str(averageNumberOfWords) + "maximum: " + str(max(numberOfWords)) + " minimum: " +str(min(numberOfWords))
print("Words in each question: ",summaryStatistics(trainingData["question"]))
print("Words in each article title: ",summaryStatistics(trainingData["article_title"]))
print("Words in each context: ",summaryStatistics(trainingData["paragraph_context"]))
print("Words in each answer: ",summaryStatistics(trainingData["answer_text"]))

Words in each question:  average: 10.397504537723034maximum: 25601 minimum: 1
Words in each article title:  average: 1.0maximum: 1 minimum: 1
Words in each context:  average: 119.76870740533568maximum: 653 minimum: 20
Words in each answer:  average: 3.1621822166919715maximum: 43 minimum: 1


In [7]:
devData = readFile("dev-v1.1.json")
devData["question"] = devData["question"].str.lower()
devData["article_title"] = devData["article_title"].str.lower()
devData["paragraph_context"] = devData["paragraph_context"].str.lower()
devData["answer_text"] = devData["answer_text"].str.lower()
devData["answer_start"] = pd.to_numeric(devData["answer_start"])
print("Finished loading dev data and lowering appropriate columns.")

Finished loading dev data and lowering appropriate columns.


## Assemble data into X and Y datasets.

In [0]:
X_1_train_text = trainingData["question"]
Y_1_train_text = trainingData["article_title"]
X_2_train_text = trainingData[["question","paragraph_context"]]
Y_2_train = trainingData["answer_start"]

In [9]:
X_2_train_text[0:10]

Unnamed: 0,question,paragraph_context
0,to whom did the virgin mary allegedly appear i...,"architecturally, the school has a catholic cha..."
1,what is in front of the notre dame main building?,"architecturally, the school has a catholic cha..."
2,the basilica of the sacred heart at notre dame...,"architecturally, the school has a catholic cha..."
3,what is the grotto at notre dame?,"architecturally, the school has a catholic cha..."
4,what sits on top of the main building at notre...,"architecturally, the school has a catholic cha..."
5,when did the scholastic magazine of notre dame...,"as at most other universities, notre dame's st..."
6,how often is notre dame's the juggler published?,"as at most other universities, notre dame's st..."
7,what is the daily student paper at notre dame ...,"as at most other universities, notre dame's st..."
8,how many student news papers are found at notr...,"as at most other universities, notre dame's st..."
9,in what year did the student paper common sens...,"as at most other universities, notre dame's st..."


In [0]:
#One-hot encode answer_starts.
max_answer_start = int(pd.concat((trainingData["answer_text"],devData["answer_text"])).str.len().max())


### Assemble numerical data.

### Integer encode text

Used for manual encoding of text into integers.

In [0]:
words = set()
trainingData.applymap(lambda x: words.update(str(x).split(" ")))
devData.applymap(lambda x: words.update(str(x).split(" ")))
print("Finished adding words.")

In [0]:
numberOfWords = len(words)
questionTokenizer = Tokenizer()
questionTokenizer.fit_on_texts(pd.concat((trainingData["question"],devData["question"])))

In [0]:
# Test tokenizer.
questionTokenizer.texts_to_sequences(trainingData["question"])

In [0]:
# Transform questions.
questionsTrain = np.array(pad_sequences(questionTokenizer.texts_to_sequences(trainingData["question"])))
questionsDev = np.array(pad_sequences(questionTokenizer.texts_to_sequences(devData["question"]),maxlen=questionsTrain.shape[1]))
print(type(questionsTrain[0]))
print(questionsTrain.shape)
print(questionsDev.shape)
print("Done!")

In [0]:
articleTitleTokenizer = Tokenizer()

articleTitleTokenizer.fit_on_texts(pd.concat((trainingData["article_title"],devData["article_title"])))

In [0]:
# Tokenize article titles.
articleTitleTrain = np.array(pad_sequences(articleTitleTokenizer.texts_to_sequences(trainingData["article_title"])))
articleTitleDev = np.array(pad_sequences(articleTitleTokenizer.texts_to_sequences(devData["article_title"]),maxlen=articleTitleTrain.shape[1]))

In [0]:
print(articleTitleTrain.shape)
print(articleTitleDev.shape)

In [0]:
contextTokenizer = Tokenizer()
contextTokenizer.fit_on_texts(pd.concat((trainingData["paragraph_context"],devData["paragraph_context"])))

In [0]:
# Transform context.
contextTrain = np.array(pad_sequences(contextTokenizer.texts_to_sequences(trainingData["paragraph_context"])))
contextDev = np.array(pad_sequences(contextTokenizer.texts_to_sequences(devData["paragraph_context"]),maxlen=contextTrain.shape[1]))

In [0]:
print(contextTrain.shape)
print(contextDev.shape)

In [0]:
#Transform answers.
answersTrain = np.array(pad_sequences(contextTokenizer.texts_to_sequences(trainingData["answer_text"].astype(str))))
answersDev = np.array(pad_sequences(contextTokenizer.texts_to_sequences(devData["answer_text"].astype(str)),maxlen=answersTrain.shape[1]))

In [0]:
print(answersTrain.shape)
print(answersDev.shape)

In [0]:
#Y_2_dev = answersDev
Y_2_dev_num = tf.Session().run(tf.one_hot(devData["answer_start"],max_answer_start))
X_1_train_num = questionsTrain
X_2_train_num = np.hstack((questionsTrain,contextTrain))
X_1_dev_num = questionsDev
X_2_dev_num = np.hstack((questionsDev,contextDev))
Y_1_train_num = articleTitleTrain
Y_1_dev_num = articleTitleDev

#Y_2_train = answersTrain

In [0]:
#Metrics
from tensorflow.keras import backend as K
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

## Second neural network - non-recurrent/no transfer learning

This neural network is used to generate answers from the questions and articles. It works by first reading the relevant article and using the question to find the answer.

In [11]:
#Build the neural network.
from math import log
inputShape_second = X_2_train_num.shape[1:3]
print(inputShape_second)
answers_shape = Y_2_train_num.shape
print(answers_shape[1])
# Find the vocabulary length.
vocabularyLength = np.concatenate((X_2_train_num,X_2_dev_num)).max() + 1

NameError: ignored

In [0]:
questions = Input(shape=(questionsTrain.shape[1],))
context = Input(shape=(contextTrain.shape[1],))
embedding_1 = Embedding(vocabularyLength,16)(questions)
#answers_network.add(Dense(16))
flatten_1 = Flatten()(embedding_1)
hidden = Dense(16)(Dense(8)(flatten_1))
hidden_2 = Dense(16)(hidden)
hidden_3 = Dense(16)(hidden_2)
dropout = Dropout(0.45)(hidden_3)
output = Dense(answers_shape[1],activation='softmax')(dropout)
answers_network = Model(inputs=[questions,context],outputs=output)
answers_network.summary()

In [0]:
answers_network.compile("adam","binary_crossentropy",metrics=[f1])

#### Train the neural network.

In [0]:
answers_network_checkpoint = ModelCheckpoint('answers_network-non-rnn-best.h5', verbose=1, monitor='val_f1',save_best_only=True, mode='auto') 

In [0]:
print(answers_network.metrics_names)

In [0]:

answers_network.fit(x=[questionsTrain,contextTrain],y=Y_2_train_num,callbacks=[answers_network_checkpoint],validation_split=0.2,verbose=True,epochs=9)
#print("Weights: ",questions_article_model.get_weights())

#### Loading the model with best fit.

In [0]:
answers_network.load_weights('answers_network-non-rnn-best.h5')

In [0]:

answers_network.evaluate([questionsDev,contextDev],Y_2_dev_num)

## Transfer learning RNN using Google's Bert

Our model has a very poor F1 score. Let's see if we can't build a better model. We'll use Google's BERT deep learning network, which is so good that the networks with SQuaD 2.0's highest recorded F1 score use it.
Additionally, we would like to add a Long Short-Term Memory Network, which allows our neural network to act a little bit more like a hidden Markov chain model. Such a network has been proven to provide better results overall than a simple dense neural network in a natural language processing task.

### Downloading from TensorFlow Hub

In [0]:
import tensorflow_hub as hub
from tensorflow.dtypes import as_string
embedding_shape = (64,2,2)
def embedding(x):
    module = hub.Module("https://tfhub.dev/google/nnlm-en-dim128/1")
    return_tensor = tf.reshape(module(tf.squeeze(tf.cast(x, tf.string)),signature="default", as_dict=True)["default"],embedding_shape)
    print ("Returning tensor with shape",return_tensor.get_shape())
    return return_tensor


### The neural network.

In [23]:

input_questions = Input(shape=(1,), dtype=tf.string)
input_context = Input(shape=(1,), dtype=tf.string)
questions_embedded = Lambda(embedding,output_shape=embedding_shape)(input_questions)
questions_rnn = LSTM(128)(questions_embedded)
context_embedded = Lambda(embedding,output_shape=embedding_shape)(input_context)
context_rnn = LSTM(128)(context_embedded)
concatenate_layer = concatenate([questions_rnn,context_rnn])
dense = Dense(16)(concatenate_layer)
outputs = Dense(max_answer_start)(dense)
answers_network_rnn = tf.keras.Model(inputs=[input_questions,input_context], outputs=outputs)



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0410 05:42:49.086495 140595007182720 saver.py:1483] Saver not created because there are no variables in the graph to restore


Returning tensor with shape (64, 2, 2)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0410 05:42:49.638114 140595007182720 saver.py:1483] Saver not created because there are no variables in the graph to restore


Returning tensor with shape (64, 2, 2)


In [30]:
answers_network_rnn.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 128)          0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 128)          0           input_2[0][0]                    
__________________________________________________________________________________________________
concatenat

In [33]:
answers_network_rnn.compile("adam","categorical_crossentropy",metrics=['accuracy',f1])
answers_network_rnn_checkpoint = ModelCheckpoint('answers_network-rnn-best.h5', verbose=1, monitor='val_f1',save_best_only=True, mode='auto')
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  answers_network_rnn.fit(x=[trainingData["question"].values,trainingData["paragraph_context"].values],y=session.run(tf.one_hot(trainingData["answer_start"],max_answer_start)))



KeyboardInterrupt: ignored

In [0]:
#Build the neural network.
from math import log
answers_network_rnn = Sequential()
vocabularyLength = X_2_RNN_train.max() +1
answers_network_rnn.add(Embedding(vocabularyLength,3,input_length=inputShape_second))
#
#answers_network.add(Flatten())
answers_network_rnn.add(LSTM(16))
answers_network_rnn.add(Dense(16))
answers_network_rnn.add(Dense(answers_shape[1]))
answers_network_rnn.add(Activation("softmax"))
answers_network_rnn.summary()

In [0]:
answers_network_rnn_checkpoint = ModelCheckpoint('answers_network-rnn-best.h5', verbose=1, monitor='val_f1',save_best_only=True, mode='auto') 

In [0]:
answers_network_rnn.compile("adagrad","binary_crossentropy",metrics=['accuracy',f1])

In [0]:
answers_network_rnn.fit(X_2_train,tf.Session().run(Y_2_train),batch_size=100,callbacks=[answers_network_rnn_checkpoint],validation_split=0.2,verbose=True,epochs=4)
