## Setup

Load Python modules.

In [2]:
!pip3 install symspellpy
!pip3 install keras_bert
!pip3 install wordcloud
!wget -nc https://github.com/moonman239/Capstone-project/raw/master/data.zip -O data.zip
!unzip data.zip
!pip3 install -q keras-bert
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip -o uncased_L-12_H-768_A-12.zip

Collecting symspellpy
[?25l  Downloading https://files.pythonhosted.org/packages/6d/0b/2daa14bf1ed649fff0d072b2e51ae98d8b45cae6cf8fdda41be01ce6c289/symspellpy-6.5.2-py3-none-any.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 2.7MB/s 
Installing collected packages: symspellpy
Successfully installed symspellpy-6.5.2
Collecting keras_bert
  Downloading https://files.pythonhosted.org/packages/df/fe/bf46de1ef9d1395cd735d8df5402f5d837ef82cfd348a252ad8f32feeaef/keras-bert-0.80.0.tar.gz
Collecting keras-transformer>=0.30.0
  Downloading https://files.pythonhosted.org/packages/0a/57/496b1eab888171b0801a0a44d3245a7874b8d1cc04c1fbfdbb5e3327fc7a/keras-transformer-0.31.0.tar.gz
Collecting keras-pos-embd>=0.10.0
  Downloading https://files.pythonhosted.org/packages/09/70/b63ed8fc660da2bb6ae29b9895401c628da5740c048c190b5d7107cadd02/keras-pos-embd-0.11.0.tar.gz
Collecting keras-multi-head>=0.22.0
  Downloading https://files.pythonhosted.org/packages/40/3e/d0a64bb2ac5217928effe4507c26bbd

In [3]:
import numpy as np
import pandas as pd
import os
# TF_KERAS must be added to environment variables in order to use TPU
os.environ['TF_KERAS'] = '1'
from tensorflow.python.client import device_lib
import tensorflow.compat.v1 as tf
#import keras
from tensorflow.keras.layers import Embedding,Dropout,Lambda,Dense,Input,InputLayer,LSTM,Concatenate,Flatten,Add,Reshape,GlobalAveragePooling1D,GlobalAveragePooling2D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model,Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
import sys
assert sys.version_info[0] >= 3
from keras.backend import slice
print(tf.VERSION)
print(tf.keras.__version__)
print("Modules loaded!")

1.15.0
2.2.4-tf
Modules loaded!


Using TensorFlow backend.


In [3]:
import sys
print(sys.version)

3.6.9 (default, Nov  7 2019, 10:44:02) 
[GCC 8.3.0]


# Preprocessing

For the preprocessing step, we will create two Pandas DataFrames - one for the training data, and another for the test (dev) data.

## Data preprocessing techniques

In order to ensure the invariance of the text, I will need to train my deep & LSTM neural networks on lower-cased data, with punctuation removed if feasibly possible.

Natural language processing suffers from a dearth of data. I have heard that a useful technique for overcoming this problem is to create inverted data. For example, given sentence "I used to live all over Europe. In fact, I spent a significant amount of time in France, and now I speak fluent French.", we might also add the sentence "French fluent speak I now and France in time of amount significant a spent I fact in Europe over all live to used I"

### Loading JSON datasets

In [0]:
import json
import re
regex = re.compile(r'\W+')
def readFile(filename):
  with open(filename) as file:
    fields = []
    JSON = json.loads(file.read())
    articles = []
    for article in JSON["data"]:
      articleTitle = article["title"]
      article_body = []
      for paragraph in article["paragraphs"]:
        paragraphContext = paragraph["context"]
        article_body.append(paragraphContext)
        for qas in paragraph["qas"]:
          question = qas["question"]
          answer = qas["answers"][0]
          fields.append({"question":question,"answer_text":answer["text"],"answer_start":answer["answer_start"],"paragraph_context":paragraphContext,"article_title":articleTitle})
      article_body = "\\n".join(article_body)
      article = {"title":articleTitle,"body":article_body}
      articles.append(article)
  fields = pd.DataFrame(fields)
  #Remove punctuation.
  fields["question"] = fields["question"].str.replace(regex," ")
  assert not (fields["question"].str.contains("catalanswhat").any())
  fields["paragraph_context"] = fields["paragraph_context"].str.replace(regex," ")
  fields["answer_text"] = fields["answer_text"].str.replace(regex," ")
  assert not (fields["paragraph_context"].str.contains("catalanswhat").any())
  fields["article_title"] = fields["article_title"].str.replace("_"," ")
  assert not (fields["article_title"].str.contains("catalanswhat").any())
  return fields


In [0]:
trainingData = readFile("train-v1.1.json")
devData = readFile("dev-v1.1.json")

Summary statistics (credit to thushv89):

In [19]:
pd.Series(trainingData["question"]).str.split(' ').str.len().describe()

count    87599.000000
mean        11.217582
std          3.597356
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         41.000000
Name: question, dtype: float64

In [20]:
pd.Series(trainingData["paragraph_context"]).str.split(' ').str.len().describe()

count    87599.000000
mean       123.791653
std         50.541385
min         21.000000
25%         92.000000
50%        114.000000
75%        147.000000
max        678.000000
Name: paragraph_context, dtype: float64

## Punctuation check/removal

In [0]:
from string import punctuation
def punctuationCheck(series):
    for string in series:
        if any(p in punctuation for p in string):
            print(string + "\n")


In [0]:
punctuationCheck(trainingData["question"])
punctuationCheck(devData["question"])

In [0]:
punctuationCheck(trainingData["paragraph_context"])

In [0]:
punctuationCheck(trainingData["article_title"])

### Convert strings to lowercase.

In [0]:
#@title
def lowercase(data):
  data["question"] = data["question"].str.lower()
  data["article_title"] = data["article_title"].str.lower()
  data["paragraph_context"] = data["paragraph_context"].str.lower()
  data["answer_text"] = data["answer_text"].str.lower()
  return data


In [0]:
trainingData = lowercase(trainingData)
devData = lowercase(devData)

## Create more data

In [0]:
def invert_string(string):
	string_array = string.split(" ")
	string_array.reverse()
	return " ".join(string_array)
invert_string = np.vectorize(invert_string)
def invert_series_of_strings(series):
  return pd.Series(invert_string(series.values))
def createMoreData(dataFrame):
	newDataFrameQuestions = dataFrame["question"].append(invert_series_of_strings(dataFrame["question"]),ignore_index=True)
	newDataFrameArticleTitles = dataFrame["article_title"].append(invert_series_of_strings(dataFrame["article_title"]),ignore_index=True)
	newDataFrameParagraphContexts = dataFrame["paragraph_context"].append(dataFrame["paragraph_context"],ignore_index=True)
	newDataFrameAnswerStarts = dataFrame["answer_start"].append(dataFrame["answer_start"],ignore_index=True)
	return pd.DataFrame(data={"question":newDataFrameQuestions,"paragraph_context":newDataFrameParagraphContexts,"article_title":newDataFrameArticleTitles,"answer_start":newDataFrameAnswerStarts})

In [0]:
trainingData = createMoreData(trainingData)

In [0]:
devData = createMoreData(devData)

In [0]:
print(trainingData.shape)

(175198, 4)


## Convert answer_starts to numeric.

In [0]:
trainingData["answer_start"] = pd.to_numeric(trainingData["answer_start"])
devData["answer_start"] = pd.to_numeric(devData["answer_start"])

## Dataset output

In [0]:
trainingData[["question","paragraph_context","article_title","answer_start"]]

In [0]:
devData

In [0]:
print(trainingData.dtypes)

In [0]:
#@title
devData["question"] = devData["question"].str.lower()
devData["article_title"] = devData["article_title"].str.lower()
devData["paragraph_context"] = devData["paragraph_context"].str.lower()
#devData["answer_text"] = devData["answer_text"].str.lower()
devData["answer_start"] = pd.to_numeric(devData["answer_start"])
print("Finished loading dev data and lowering appropriate columns.")

## Getting training data?

In [0]:
X_2_train = trainingData[["question","paragraph_context"]]
Y_2_train = trainingData["answer_start"]

### Integer encode text

Used for manual encoding of text into integers.

In [0]:
strings = pd.concat((trainingData,devData)).drop("answer_start",axis=1)
strings = strings.values.flatten()
textTokenizer = Tokenizer(num_words=50000, oov_token='unk') # 37000 words appear less than 10 times, so exclude them. Credit thushv89 on StackOverflow for this suggestion and code.
textTokenizer.fit_on_texts(strings)

In [0]:
# Get length of vocabulary.
vocabulary_length = len(textTokenizer.word_index) + 1

In [59]:
print(vocabulary_length)

87490


In [0]:
questionsTokenized_train = pad_sequences(textTokenizer.texts_to_sequences(trainingData["question"]))
contextTokenized_train = pad_sequences(textTokenizer.texts_to_sequences(trainingData["paragraph_context"]))

In [0]:
questionsTokenized_dev = pad_sequences(textTokenizer.texts_to_sequences(devData["question"]))
contextTokenized_dev = pad_sequences(textTokenizer.texts_to_sequences(devData["paragraph_context"]))

In [0]:
articleTitles_train = pad_sequences(textTokenizer.texts_to_sequences(trainingData["article_title"]))

In [0]:
articleTitles_dev = pad_sequences(textTokenizer.texts_to_sequences(devData["article_title"]))

**One-hot encoding answer_start**

In [0]:
answer_start_one_hot = pd.get_dummies(pd.concat((trainingData["answer_start"],devData["answer_start"])))

In [0]:
answer_start_train_one_hot = pd.get_dummies(trainingData["answer_start"])

In [15]:
#@title Shape of answer_start
print(trainingData["answer_start"].values.shape)

(87599,)


# Code to strip punctuation from strings

In [0]:
import string
def removePunctuation(s):
  return s.translate(str.maketrans('', '', string.punctuation))

#Exploratory Visualizations & Statistics

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
data_frame = trainingData + devData
data_frame = data_frame.astype("str")

In [0]:
question_vectorizer = CountVectorizer().fit(data_frame["question"])
context_vectorizer = CountVectorizer().fit(data_frame["paragraph_context"])
title_vectorizer = CountVectorizer().fit(data_frame["article_title"])

In [0]:
def removeNonAlphanumericCharacters(string):
  import re
  regex = re.compile('[^a-zA-Z]')
  #First parameter is the replacement, second parameter is your input string
  return regex.sub('', string)
  #Out: 'abdE'

In [0]:
def vocabulary_array(vocabulary_dictionary,strip_chars=False): # Returns an np.array with the word in the first column and the frequency in the second column.
  vocabulary = []
  for word,frequency in vocabulary_dictionary.items():
    if (strip_chars):
      word = removeNonAlphanumericCharacters(word)
    vocabulary.append([word,str(frequency)])
  vocabulary = np.array(vocabulary)
  return vocabulary[np.argsort(vocabulary[:,1])[::-1]]

In [0]:
question_words = vocabulary_array(question_vectorizer.vocabulary_,strip_chars=True)
print(question_words[:,1][0:10].astype(np.uint64))

In [0]:
print(question_words[:,0][0:10])

## Bar charts

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline
fig = plt.figure()
fig.subplots_adjust(top=0.8)

In [0]:
#@title Bar plot of most frequent words.
from wordcloud import WordCloud,STOPWORDS
wordcloud = WordCloud(
    width=800,height=800,
    stopwords = set(STOPWORDS),
    min_font_size = 10,
    background_color='white'
).generate(" ".join(pd.concat((trainingData["answer_text"],devData["answer_text"])).values.tolist())) 
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud,interpolation="bilinear") 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

In [0]:
#@title Bar plots of lengths.
plt.xlabel("Length")
plt.ylabel("Frequency")
plt.title("Histogram of question lengths")
plt.hist(trainingData["question"].str.len().values[0:10000])

In [0]:
wordcloud = WordCloud(
    width=800,height=800,
    stopwords = set(STOPWORDS),
    min_font_size = 10,
    background_color='white'
).generate(" ".join(pd.concat((trainingData["answer_text"],devData["answer_text"])).values.tolist())) 
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud,interpolation="bilinear") 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

In [0]:
plt.xlabel("Length")
plt.ylabel("Frequency")
plt.title("Histogram of answer lengths")
plt.hist(trainingData["answer_text"].str.len().values[0:10000])

### Other statistics

In [0]:
from sys import getsizeof
def vocabularySize():
  return len(vocabulary())
def summaryStatistics(series):
    numberOfWords = series.apply(lambda x: len(str(x).split(" ")))
    averageNumberOfWords = sum(numberOfWords) / len(numberOfWords)
    return "average: " + str(averageNumberOfWords) + "maximum: " + str(max(numberOfWords)) + " minimum: " +str(min(numberOfWords))
print("Size of vocabulary: ", vocabularySize())
print("Words in each question: ",summaryStatistics(trainingData["question"]))
print("Words in each article title: ",summaryStatistics(trainingData["article_title"]))
print("Words in each context: ",summaryStatistics(trainingData["paragraph_context"]))
print("Words in each answer: ",summaryStatistics(trainingData["answer_text"]))

In [0]:
#@title Top ten most frequently occuring words:
def wordFrequencies(series):
  split_strings = series.str.split(" ")
  frequencies = {}
  for split_string in split_strings:
    for word in split_string:
      word = word.lower()
      word = removePunctuation(word)
      try:
        frequencies[word] = frequencies[word] + 1
      except KeyError as k:
        frequencies[word] = 1
  return frequencies
pd.DataFrame.from_dict(wordFrequencies(trainingData["question"]))

#Building neural networks

This kind of task requires a neural network to find the answer from the relevant article. In this section, we look at two kinds of neural networks.

## Dense Neural Networks

This is the most basic, unremarkable kind of network. Put simply, each layer of this network takes inputs, applies weights and biases, then outputs a value between 0 and 1 - usually either a sigmoid value or a linear function.

## Long Short-Term Memory Networks


Long short-term memory networks are a type of recurrent neural networs that work by recalling words it previously read to get a good feel for the context of a word or phrase.

The most fundamental unit of an LSTM is the cell state - a little unit that holds information the LSTM deems important to remember. For example, consider the following piece of text:

"I used to live all over Europe. In fact, I spent a significant amount of time in France, and now I speak fluent _____." To fill in this blank, we only need to remember two things:

1) The preceding phrase "I speak fluent" - this indicates that the word that follows is the name of a language.
2) That the speaker lived in France; the fact that he lived "all over Europe" is irrelevant to deducing the name of the language he is about to mention he is fluent in.

We can thus deduce that the word that fits in the blank is "French". 

Part of the training process of an LSTM that would fill in the blank would involve learning which of these words would be important to deducing the missing word. For example, on the first training iteration, the LSTM might decide that all of the words are important. However, we might hope that as the LSTM progresses, it narrows down the list of words to remember to "France","I","speak", and "fluent". (https://medium.com/@ageitgey/natural-language-processing-is-fun-9a0bff37854e)

## Google BERT


As we will see, my LSTM model performed very poorly. One of the best models for learning relationships between words is Google's "Bidirectional Encoder Representations from Transformers" model.

One of the problems that BERT attempts to solve is the dearth of sufficient reliable training data for natural language processing tasks. It does this by manipulating existing data to create more data. For example, if our training data includes the sentence "I used to live all over Europe. In fact, I spent a significant amount of time in France, and now I speak fluent French," BERT will generate another sentence, such as: French fluent speak I now and France in time of amount significant a spent I fact in Europe over all live to used I". (https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html)

## Benchmark
  

Since my simplest model is a Dense model, I will use that as a benchmark.

## Defining metrics



First, we need to define a metric we can use to score our neural network.

We'll use the F1 score. This score emphasizes false positives and false negatives, and therefore should be used when they are most important. (https://medium.com/analytics-vidhya/accuracy-vs-f1-score-6258237beca2#:~:targetText=Accuracy%20is%20used%20when%20the,and%20False%20Positives%20are%20crucial&targetText=In%20most%20real%2Dlife%20classification,to%20evaluate%20our%20model%20on.)

In [0]:
#@title F1
from tensorflow.keras import backend as K
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

##Mapping Questions to Article Titles



While this step is likely important in a regular search, I do not expect that Stanford evaluates a program's ability to search for article titles; therefore, I am omitting this step for now.

## Mapping Questions and Paragraph Contexts to Answers

This neural network is used to generate answers from the questions and articles. It works by first reading the relevant article and using the question to find the answer.

## Word2Vec

In [0]:

from tensorflow.keras.preprocessing.text import *
from tensorflow.keras.preprocessing.sequence import skipgrams,make_sampling_table
def skipgrams_labels(sequence,vocabulary_length,window_size=3):
    couples,labels = skipgrams(sequence,vocabulary_length,window_size=window_size)
    assert len(couples) > 0
    target_words,contexts = zip(*couples)
    target_words = np.array(target_words).astype("int32")
    contexts = np.array(contexts).astype("int32")
    return target_words,contexts,labels
# This is to train word2vec.
def word2vec_selection(sequences,vocabulary_length,window_size=3,batch_index=-1):
    assert sequences.ndim == 2
    if (batch_index == -1):
        batch_index = np.random.choice(sequences.shape[0], 1, replace=False)
    sequence = sequences[batch_index,:].flatten()
    target_words,contexts,labels = skipgrams_labels(sequence,vocabulary_length,window_size=window_size)
    return target_words,contexts,labels

In [0]:
#@title Training word2vec
def word2vec_model(vector_dim=256):
    input_target = Input((1,))
    input_context = Input((1,))
    embedding = Embedding(vocabulary_length, vector_dim, input_length=1)
    target = embedding(input_target)
    context = embedding(input_context)
    from tensorflow.keras.layers import Dot,dot
    dot_product = dot([target,context],1)
    flat = Flatten()(dot_product)
    output = Dense(1)(flat)
    model = Model(inputs=[input_target,input_context],outputs=[output])
    return model


In [0]:
word2vec = word2vec_model()
word2vec.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 256)       22394368    input_1[0][0]                    
                                                                 input_2[0][0]               

In [0]:
def word2vec_batch_generator(sequences,vocabulary_length,batch_size=1):
    while True:
        target_words,contexts,labels = word2vec_selection(sequences,vocabulary_length)
        x = np.hstack([target_words,contexts])
        yield [target_words,contexts],np.array(labels)

In [0]:
word2vec.compile("rmsprop","binary_crossentropy",metrics=[f1])

In [0]:
word2vec.fit_generator(
    word2vec_batch_generator(
        contextTokenized_train,
        vocabulary_length),
    validation_data=word2vec_batch_generator(
        contextTokenized_train,
        vocabulary_length),
    validation_steps=2,
    steps_per_epoch=100,
    epochs=20,callbacks=[contexts_word2vec_checkpoint])

In [0]:
test = word2vec_selection(questionsTokenized_dev,vocabulary_length)

In [0]:
word2vec.load_weights("questions_word2vec.h5")

In [0]:
word2vec.evaluate(x=[test[0],test[1]],y=[test[2]])



[0.738932599623998, 0.795977]

In [0]:
word2vec.evaluate(x=[context_tests[0],context_tests[1]],y=[context_tests[2]])

In [0]:
#Freeze models.
word2vec.trainable = False

In [0]:
def positive_skipgrams(sequence,vocabulary_length):
    targets = []
    contexts = []
    for i in range(len(sequence)):
        lower_bound = 0
        if (i > 0):
            lower_bound = i - 1
        upper_bound = i + 1
        if (upper_bound >= len(sequence)):
            upper_bound = i
        word = sequence[i]
        skipgram_1_word = sequence[lower_bound]
        targets.append(word)
        contexts.append(skipgram_1_word)
        skipgram_2_word = sequence[upper_bound]
        targets.append(word)
        contexts.append(skipgram_2_word)
    return targets,contexts

In [0]:
question_skipgrams_train = [positive_skipgrams(sequence,vocabulary_length) for sequence in questionsTokenized_train]

In [0]:
contexts_skipgrams_train = [positive_skipgrams(sequence,vocabulary_length) for sequence in contextTokenized_train]

In [0]:
word2vec_embeddings = Model(inputs=[word2vec.inputs],outputs=[word2vec.get_layer("embedding_2").output])

In [0]:
question_embeddings = word2vec_embeddings.predict(question_skipgrams_train)

## Non-word2vec stuff.

In [0]:
answers_network_checkpoint = ModelCheckpoint('answers_network-best.h5', verbose=1, monitor='val_loss',save_best_only=True, mode='auto') 

In [30]:
print(answer_start_train_one_hot.shape)

(87599, 1604)


## Dense Network

In [0]:
questions_embedding = questions_word2vec.layers[-2].output
contexts_embedding = contexts_word2vec.layers[-2].output
combined_layers = Concatenate()([questions_embedding,contexts_embedding])
answers_network_2_dense_5 = Dense(answer_start_train_one_hot.values.shape[1],activation="sigmoid")(combined_layers)
#answers_network_2.add(Dense(32)) Removing this line gives us more trainable parameters.
answers_network_2 = Model(inputs=[questions_word2vec.input,contexts_word2vec.input],outputs=[answers_network_2_dense_5])

In [0]:
answers_network_2.summary()

In [0]:
answers_network_2.compile("adam","binary_crossentropy",metrics=[f1])

In [0]:
def answers_network_batch_generator(question_skipgrams,context_skipgrams,vocabulary_length,answer_starts):
    while True:
        index = np.random.choice(question_skipgrams.shape[0], 1, replace=False)
        question_targets,question_contexts = zip(question_skipgrams[index,:])
        context_targets,context_contexts = zip(context_skipgrams[index,:])
        question
        yield [question_targets,question_contexts,context_targets,context_contexts],answer_starts[index,:]

In [0]:
answers_network_2.fit_generator(answers_network_batch_generator(
    np.array(question_skipgrams_train),
    np.array(contexts_skipgrams_train),
    vocabulary_length,
    answer_start_train_one_hot.values),
                                steps_per_epoch=20)

## Long Short-Term Memory Network (LSTM)

In [0]:
#@title One-hot encode answer starts.
outputTrain = pd.get_dummies(trainingData["answer_start"]).values
print(outputTrain.shape)

In [0]:
def goodNumber(array):
  return array.shape[1]
def actualNumber(features,time_steps):
  return features * time_steps
def isGoodNumber(array,features,time_steps):
  return goodNumber(array) == actualNumber(features,time_steps)

In [0]:
answers_questions_features = 8
answers_questions_time_steps = int(questionsTokenized_train.shape[1] / answers_questions_features)
assert isGoodNumber(questionsTokenized_train,answers_questions_features,answers_questions_time_steps)

In [0]:
print(contextTokenized_train.shape)

In [0]:
answers_contexts_features = 677
answers_contexts_time_steps = int(contextTokenized_train.shape[1] / answers_contexts_features)
assertion_error_message = "Bad number: " + str(actualNumber(answers_contexts_features,answers_contexts_time_steps)) + " Good number: " + str(goodNumber(contextTokenized_train))
assert isGoodNumber(contextTokenized_train,answers_contexts_features,answers_contexts_time_steps),assertion_error_message

In [0]:
answers_questions_input = Input(shape=(answers_questions_features,answers_questions_time_steps,))
answers_contexts_input = Input(shape=(answers_contexts_features,answers_contexts_time_steps,))
answers_questions_lstm = LSTM(256)(answers_questions_input)
answers_contexts_lstm = LSTM(256)(answers_contexts_input)
#answers_network_1.add(Embedding(715,128,input_length=715))
answers_combined_lstm = Add()([answers_questions_lstm,answers_contexts_lstm])
answers_combined_flattened=Flatten()(answers_combined_lstm)
answers_hidden_1 = Dense(124)(answers_combined_flattened)
answers_output = Dense(outputTrain.shape[1])(answers_hidden_1)
answers_network_1 = Model(inputs=[answers_questions_input,answers_contexts_input],outputs=[answers_output])
answers_network_1.summary()

In [0]:
answers_network_1.compile("adam","binary_crossentropy",metrics=[f1])

In [0]:
#@title Reshape the inputs for LSTM.
def reshape_for_lstm(inputs,features,time_steps):
  assert features * time_steps == inputs.shape[1], "Bad shape."
  return np.reshape(inputs,(inputs.shape[0],features,time_steps))

In [0]:
answers_network_1.fit(x=[reshape_for_lstm(questionsTokenized_train,answers_questions_features,answers_questions_time_steps),reshape_for_lstm(contextTokenized_train,answers_contexts_features,answers_contexts_time_steps)],y=[outputTrain],validation_split=0.1,callbacks=[answers_network_checkpoint],verbose=True,epochs=9)

#### Train the neural network.

In [0]:
print(answers_network_1.metrics_names)

In [0]:
def shape_for_lstm(array,features,time_steps):
  try:
    new_shape = (array.shape[0],features,time_steps)
  except:
    raise ValueError("Bad inputs.")
    return
  try:
    array_reshaped = np.reshape(array,new_shape)
  except TypeError as t:
    from traceback import print_stack
    raise TypeError("Traceback: " + str(print_stack()) + " bad types: " + str(features) + " " + str(time_steps))
  except AttributeError as a:
    raise AttributeError("What?")
  return array_reshaped

In [0]:
questionsTokenized_train_lstm = shape_for_lstm(questionsTokenized_train,answers_questions_features,answers_questions_time_steps)
contextTokenized_train_lstm = shape_for_lstm(contextTokenized_train,answers_contexts_features,answers_contexts_time_steps)

In [0]:

answers_network_1.fit(x=[questionsTokenized_train_lstm,contextTokenized_train_lstm],y=[outputTrain],validation_split=0.2,callbacks=[answers_network_checkpoint],verbose=True,epochs=9)
#print("Weights: ",questions_article_model.get_weights())

##Results

In [0]:
inputDev = np.pad(pd.get_dummies(devData[["question","paragraph_context"]].values),((0,0),(0,trainingData[["question","paragraph_context"]].values.shape[1])))

In [0]:
outputDev = np.pad(pd.get_dummies(devData["answer_start"]).values,((0,0),(0,431)))
print(outputDev.shape)

The F1 score was very poor (0.0e+00). Strangely, a network of just Dense layers may have performed better.

## Transfer learning using Google's BERT

The previous model has a very poor F1 score. Let's see if we can't build a better model. We'll use Google's BERT deep learning network, which is so good that 5 neural networks that use it made it to the top 10 winning neural networks for the SQuAD v2.0 dataset. These five neural networks had an average of 86.7% exact matches. For comparison, the humans that Stanford tested scored an average of 86.8% - a difference of 0.1%.
The basic idea behind BERT is this: Neural networks rely on numerical vectors. Similar sentences and phrases should produce similar vectors.

"It is not possible to train bidirectional models by simply conditioning each word on words before and after it. Doing this would allow the word that’s being predicted to indirectly see itself in a multi-layer model. To solve this, Google researchers used a straightforward technique of masking out some words in the input and condition each word bidirectionally in order to predict the masked words. This idea is not new, but BERT is the first technique where it was successfully used to pre-train a deep neural network." (packtpub.com)

In [0]:
#@title Clear session.
from tensorflow.keras import backend as K
K.clear_session()

In [0]:
#@title Memory Management
tf.keras.backend.set_floatx("float32")

In [0]:
# @title Environment
import os
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')
# Use TF_Keras
os.environ["TF_KERAS"] = "1"

In [9]:
# @title Load Basic Model
import codecs
from keras_bert import load_trained_model_from_checkpoint
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

model = load_trained_model_from_checkpoint(config_path, checkpoint_path)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:

#@title Model Summary
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 512)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 512, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 512, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

In [0]:
#@title Create tokenization stuff.
import keras_bert
tokenizer = keras_bert.Tokenizer(token_dict)
def tokenize(text,max_len):
  tokenizer.tokenize(text)
  return tokenizer.encode(first=text,max_len=max_len)
def tokenize_array(texts,max_len=512):
  indices = np.zeros((texts.shape[0],max_len))
  segments = np.zeros((texts.shape[0],max_len))
  for i in range(texts.shape[0]):
    tokens = tokenize(texts[i],max_len)
    indices[i] = tokens[0]
    segments[i] = tokens[1]
  #print(indices.shape)
  #print(segments.shape)
  return segments,indices

In [0]:
#@ Tokenize inputs.
def X_Y(dataset,answer_start_one_hot,batch_size=10,max_len=512):
    batch_indices = np.random.choice(np.arange(0,dataset.shape[0]),size=batch_size)
    dataset = dataset.iloc[batch_indices]
    questions = dataset["question"]
    contexts = dataset["paragraph_context"]
    question_indices,question_segments = tokenize_array(questions.values,max_len=max_len)
    context_indices,context_segments = tokenize_array(contexts.values,max_len=max_len)
    X = [question_indices,question_segments,context_indices,context_segments]
    Y = answer_start_one_hot.iloc[batch_indices]
    return X,Y
def X_Y_generator(dataset,answer_start_one_hot,batch_size=10,max_len=512):
    while True:
        try:
            X,Y = X_Y(dataset,answer_start_one_hot,batch_size=batch_size,max_len=max_len)
            #max_int = pd.concat((trainingData["answer_start"],devData["answer_start"])).max()
            yield X,Y
        except Exception as e:
            print("Unhandled exception in X_Y_generator: ",e)
            raise

In [16]:
X,Y = X_Y(trainingData,answer_start_train_one_hot,max_len=150) # Credit to the aforementioned StackOverflow user for his suggestion to use 150 words for brevity.
print(X[0].shape)
print(X[1].shape)
print(X[2].shape)
print(X[3].shape)

(10, 150)
(10, 150)
(10, 150)
(10, 150)


In [0]:
model.trainable = False

In [18]:

question_indices_layer = Input(shape=(150,))
question_segments_layer = Input(shape=(150,))
context_indices_layer = Input(shape=(150,))
context_segments_layer = Input(shape=(150,))
questions_bert_layer = model([question_indices_layer,question_segments_layer])
print("Questions bert layer loaded.")
context_bert_layer = model([context_indices_layer,context_segments_layer])
print("Context bert layer loaded.")

#Credit to the following block goes to thushv89 @ StackOverflow.com
questions_flattened = Flatten(dtype=tf.float16)(questions_bert_layer)
questions_flattened = Dense(128, activation='relu',dtype=tf.float16)(questions_flattened)
questions_flattened = Dense(128, activation='relu',dtype=tf.float16)(questions_flattened)
contexts_flattened = Flatten(dtype=tf.float16)(context_bert_layer)
contexts_flattened = Dense(128,activation="relu",dtype=tf.float16)(contexts_flattened)
contexts_flattened = Dense(128,activation="relu",dtype=tf.float16)(contexts_flattened)
combined = Concatenate(dtype=tf.float16)([questions_flattened,contexts_flattened])


#bert_dense_questions = Dense(256,activation="sigmoid")(questions_flattened)
#bert_dense_context = Dense(256,activation="sigmoid")(context_flattened)
answers_network_output = Dense(1604,activation="softmax",dtype=tf.float16)(combined)
#answers_network = Model(inputs=[input_layer],outputs=[questions_bert_layer,context_bert_layer])
answers_network = Model(inputs=[question_indices_layer,question_segments_layer,context_indices_layer,context_segments_layer],outputs=[answers_network_output])
answers_network.summary()


Questions bert layer loaded.
Context bert layer loaded.
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 150)]        0                                            
____________________________________

In [0]:
from tensorflow.keras.optimizers import SGD
opt = SGD(lr=0.01)

In [0]:
answers_network.compile("adam","categorical_crossentropy",metrics=[f1])

In [27]:
print(answers_network.metrics_names)

['loss', 'acc']


In [31]:
# Credit goes to thushv89 on StackOverflow
# for suggesting a maximum length of 150
answers_network.fit_generator(
    X_Y_generator(
        trainingData,
        answer_start_train_one_hot,
        batch_size=32,max_len=150),
    steps_per_epoch=100,
    epochs=10,
    callbacks=[answers_network_checkpoint])

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: ignored

In [47]:
evaluate_generator = answers_network.evaluate_generator(X_Y_generator(devData,answer_start_train_one_hot,batch_size=32,max_len=150),steps = 10)

j=0
for i in evaluate_generator:
  print(i)
  j += 1
  if (j==10):
    break

nan
1.9987539


In [44]:
answers_network.evaluate_generator
(
    X_Y_generator(
        devData,
        answer_start_train_one_hot,
        batch_size=32,max_len=150),
 steps=10
)


SyntaxError: ignored