## Setup

 Just to verify that everything's working.

In [25]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from keras.models import Sequential
from keras.layers import Dense,LSTM,Activation,InputLayer,Input,BatchNormalization
from keras.callbacks import ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display
# NOTE(review): device_lib is imported for a GPU check, but nothing in this cell calls it,
# so GPU availability is not actually verified here.
from tensorflow.python.client import device_lib
import sys
# Show sys.maxsize for the running interpreter.
print(sys.maxsize)
# Stop immediately when the kernel is older than Python 3.
assert sys.version_info[0] >= 3
print("Set up!")

9223372036854775807
Set up!


## Preprocessing

For the preprocessing step, we will create two dictionaries. One will be used to map questions to articles, while the other will be used to map questions and the contents of the articles to the answers.

### Loading CSV datasets

In [26]:
# Read every column as text. `str` replaces the `np.unicode_` alias, which NumPy 2.0 removed.
trainingData = pd.read_csv("training_data.csv",dtype=str)
display(trainingData)
# getsizeof is also used by the dev-data cell that follows.
from sys import getsizeof
print("Finished loading training data.")
# getsizeof reports bytes; dividing by 1024**2 expresses the figure in MB.
print("Size of training data:" + str(getsizeof(trainingData) / 1024**2) + " MB")

Unnamed: 0.1,Unnamed: 0,question,title,answer_start,answer_text,context
0,0,To whom did the Virgin Mary allegedly appear i...,University of Notre Dame,515,Saint Bernadette Soubirous,Architecturally the school has a Catholic char...
1,1,What is in front of the Notre Dame Main Building,University of Notre Dame,188,a copper statue of Christ,Architecturally the school has a Catholic char...
2,2,The Basilica of the Sacred heart at Notre Dame...,University of Notre Dame,279,the Main Building,Architecturally the school has a Catholic char...
3,3,What is the Grotto at Notre Dame,University of Notre Dame,381,a Marian place of prayer and reflection,Architecturally the school has a Catholic char...
4,4,What sits on top of the Main Building at Notre...,University of Notre Dame,92,a golden statue of the Virgin Mary,Architecturally the school has a Catholic char...
5,5,When did the Scholastic Magazine of Notre dame...,University of Notre Dame,248,September 1876,As at most other universities Notre Dames stud...
6,6,How often is Notre Dames the Juggler published,University of Notre Dame,441,twice,As at most other universities Notre Dames stud...
7,7,What is the daily student paper at Notre Dame ...,University of Notre Dame,598,The Observer,As at most other universities Notre Dames stud...
8,8,How many student news papers are found at Notr...,University of Notre Dame,126,three,As at most other universities Notre Dames stud...
9,9,In what year did the student paper Common Sens...,University of Notre Dame,908,1987,As at most other universities Notre Dames stud...


Finished loading training data.
Size of training data:119.34291648864746 MB


In [27]:
# Read every column as text. `str` replaces the `np.unicode_` alias, which NumPy 2.0 removed.
devData = pd.read_csv("dev_data.csv",dtype=str)
print("Finished loading dev data.")
# getsizeof reports bytes; dividing by 1024**2 expresses the figure in MB.
print("Size of dev data:" + str(getsizeof(devData) / 1024**2) + " MB")

Finished loading dev data.
Size of dev data:14.254668235778809 MB


### Text preprocessing using scikit-learn and TensorFlow

### Get n-grams

The following section turns inputs into n-grams. Note that order is not necessarily preserved.

In [28]:
#Custom skip-gram vectorizer
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramVectorizer(CountVectorizer):
    """CountVectorizer whose features are word 1-skip-2-grams.

    Every run of three consecutive tokens contributes one feature made of the
    first and third token joined by a space; the middle token is skipped.
    """

    def build_analyzer(self):
        """Return a callable that turns one raw document into its skip-grams."""
        to_text = self.build_preprocessor()
        excluded = self.get_stop_words()
        to_tokens = self.build_tokenizer()

        def analyze(doc):
            # decode -> preprocess -> tokenize, then pair up the outer tokens.
            words = to_tokens(to_text(self.decode(doc)))
            return self._word_skip_grams(words, excluded)

        return analyze

    def _word_skip_grams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of 1-skip-2-grams after stop words filtering"""
        if stop_words is not None:
            tokens = [tok for tok in tokens if tok not in stop_words]

        # Slide a window of three tokens, keep positions 0 and 2, join them with a space.
        windows = sliding_window(3)(tokens)
        outer_pairs = pluck([0, 2])(windows)
        return cmap(' '.join)(outer_pairs)

#### Vectorize questions

In [29]:
def vectorize(trainData,devData):
    """Vectorize two text collections with one shared skip-gram vocabulary.

    A single SkipGramVectorizer is fitted on the concatenation of both inputs,
    then each input is transformed separately.

    Args:
        trainData: first collection of documents (passed to ``pd.concat``).
        devData: second collection of documents.

    Returns:
        Tuple of (transformed trainData, transformed devData, fitted vocabulary_).
    """
    skipgram_vectorizer = SkipGramVectorizer()
    skipgram_vectorizer.fit(pd.concat((trainData, devData)))
    print("Vectorizer fit.")

    train_matrix = skipgram_vectorizer.transform(trainData)
    print("First text vectorized")

    dev_matrix = skipgram_vectorizer.transform(devData)
    print("Second text vectorized")

    return train_matrix, dev_matrix, skipgram_vectorizer.vocabulary_

In [30]:

# Vectorize all questions.
# vectorize() fits on both question columns together and returns the two
# transformed matrices plus the fitted vocabulary.
training_questions,dev_questions,vocabulary1 = vectorize(trainingData["question"],devData["question"])
# Despite the label, this prints the full shape tuple of the training matrix.
print("Number of features:",training_questions.shape)

Vectorizer fit.
First text vectorized
Second text vectorized
Number of features: (87355, 371849)


#### Useful function

In [31]:
def series_split(series):
    """Split every string of a pandas Series into a list of words on single spaces."""
    return series.apply(lambda text: text.split(" "))

#### Vectorize article titles

In [32]:
# Fit one binarizer on the titles of both splits so train and dev share a column layout.
from sklearn.preprocessing import MultiLabelBinarizer
combinedArticleTitles = series_split(pd.concat((trainingData["title"],devData["title"])))
# sparse_output=True: later cells call .toarray() on these matrices, hstack them with the
# sparse question vectors and persist them with save_npz, all of which need SciPy sparse input.
articleTitlesBinarizer = MultiLabelBinarizer(sparse_output=True).fit(combinedArticleTitles)
# Later cells read these three names, so they have to be defined on a fresh kernel.
training_articleTitles = articleTitlesBinarizer.transform(series_split(trainingData["title"]))
dev_articleTitles = articleTitlesBinarizer.transform(series_split(devData["title"]))
articleTitles_vocabulary = articleTitlesBinarizer.classes_
print("Article titles binarized")

Article titles binarized


#### Replace numbers with text.

In [46]:
from num2words import num2words

def convertOneNumberToWord(word):
    """Spell out ``word`` in English when ``int()`` can convert it.

    Args:
        word: a token, normally a str. Anything ``int()`` cannot convert
            (ordinary words, None, NaN, infinity) is returned unchanged.

    Returns:
        The ``num2words`` spelling for convertible input, otherwise ``word`` itself.
    """
    try:
        number = int(word)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN. TypeError: None and other non-numbers.
        # OverflowError: infinite floats. None of these is a number to spell out.
        return word
    return num2words(number)
def n2w(string):
    """Replace the integer tokens of a space-separated string with English words.

    Values without a ``split`` method are handed to convertOneNumberToWord
    as a single token.
    """
    try:
        tokens = string.split(" ")
    except AttributeError:
        # Not string-like: treat the whole value as one token.
        return convertOneNumberToWord(string)
    return " ".join([convertOneNumberToWord(token) for token in tokens])

In [34]:
# Test code.
# Smoke check: the standalone token "2" should come back spelled out as "two".
print(n2w("I have 2 tomatoes."))

I have two tomatoes.


In [35]:
# Overwrite the "context" column of both frames with n2w applied to every entry.
# The columns are replaced in place, so the original text is no longer in the frames afterwards.
trainingData["context"] = trainingData["context"].apply(n2w)
devData["context"] = devData["context"].apply(n2w)
print("Successfully converted numbers to words")

Successfully converted numbers to words


#### Vectorize article content

In [36]:

# Pool the context passages of both splits so the binarizer is fitted on all of them.
contentTotal = pd.concat((trainingData["context"],devData["context"]))
print("Gathered content")
# Each passage becomes a list of space-separated words.
contentTotal = series_split(contentTotal)
contentBinarizer = MultiLabelBinarizer()
contentBinarizer.fit(contentTotal)
# Round-trip a two-word sample through transform and inverse_transform as a sanity check.
test = contentBinarizer.inverse_transform(contentBinarizer.transform(series_split(pd.Series("Saint Bernadette"))))
print("Test",test)
print("Content binarizer fit.")

Gathered content
Test [('Bernadette', 'Saint')]
Content binarizer fit.


In [37]:
# Encode every training context passage with the fitted content binarizer.
# NOTE(review): contentBinarizer is built with default arguments, so this is presumably a dense array — confirm memory use.
training_context = contentBinarizer.transform(series_split(trainingData["context"]))
print("Training context binarized.")

Training context binarized.


In [38]:
# Encode every dev context passage with the same fitted content binarizer.
dev_context = contentBinarizer.transform(series_split(devData["context"]))
print("Dev context binarized")

Dev context binarized


#### Num2words on answers

In [47]:

# Apply the same number-to-word conversion to the training answers as was applied to the contexts.
trainingData["answer_text"] = trainingData["answer_text"].apply(n2w)
print("Converted training answers text numbers to words")

Converted training answers text numbers to words


In [None]:
# Apply the same number-to-word conversion to the dev answers.
devData["answer_text"] = devData["answer_text"].apply(n2w)
print("Converted dev answers text numbers to words")

#### Binarize the answers.

In [None]:
def findErrorSentences(series):
    """Collect answers that use words missing from the context vocabulary.

    Each entry of ``series`` is split on single spaces and every word is looked
    up in the classes learned by ``contentBinarizer``. An offending answer is
    printed once and added to the result.

    Args:
        series: pandas Series of answer strings.

    Returns:
        list of str: the offending answers, in order of appearance.
    """
    # A set gives constant-time lookups instead of scanning the classes_ array for every word.
    vocabulary = set(contentBinarizer.classes_)
    error_sentences = []
    for sentence in series_split(series):
        if any(word not in vocabulary for word in sentence):
            joined = " ".join(sentence)
            print("Found an error sentence!")
            print(joined)
            error_sentences.append(joined)
    return error_sentences

# Keep the returned list: the summary line below reads it.
error_sentences = findErrorSentences(trainingData["answer_text"])
print("Error sentences: " + str(error_sentences))
training_answers = contentBinarizer.transform(series_split(trainingData["answer_text"]))
print("Training answers binarized")

In [None]:
# Encode the dev answers with the content binarizer, the same encoder used for the training answers.
dev_answers = contentBinarizer.transform(series_split(devData["answer_text"]))
print("Dev answers binarized")

In [None]:
# Report the width of every encoded representation.
print("Number of features for questions vectorizer:",len(vocabulary1))
# Read the title vocabulary straight from the fitted binarizer.
print("Number of features in article titles:",len(articleTitlesBinarizer.classes_))
print("Number of features in context:",len(contentBinarizer.classes_))
# Answers are encoded with contentBinarizer (no separate answers binarizer exists),
# so their feature count is the column count of the binarized answers.
print("Number of features in answers: ",training_answers.shape[1])

## Data normalization

Now that we've vectorized the questions and binarized the article context, we have two problems. First, the questions are represented by ordinal numbers. Our neural network will work best if the questions are represented as binary numbers; i.e., they contain only ones and zeroes. Second, each question in the dataset has 371,849 features, while the dataset itself contains approximately 90,000 entries. This is a common problem in data science; having more features than data can lead the algorithm to miss significant relationships between the features and the end result.

In [None]:
from sklearn.preprocessing import normalize
# Rescale both question matrices with scikit-learn's normalize, using its default arguments.
# NOTE(review): the defaults presumably scale each row to unit norm rather than producing
# the ones-and-zeroes encoding described in the text above — confirm this is intended.
training_questions = normalize(training_questions)
dev_questions = normalize(dev_questions)

### More setup

In [None]:
# Get the input shapes straight from the sparse matrices. Calling .toarray() just to read
# .shape would first allocate a dense copy of every row and column.
inputShape_first = training_questions.shape[1:] # Input shape of the first neural network.
article_titles_shape = training_articleTitles.shape
print("First neural network X shape: " + str(training_questions.shape))
print("First neural network input shape: " + str(inputShape_first))
print("Article titles array shape: " + str(article_titles_shape))

#### Splitting the data into training and cross-validation sets.


In [None]:
# 20% of data will be set aside for cross-validation
from sklearn.model_selection import train_test_split
# test_size is passed explicitly because train_test_split holds out 25% by default.
# random_state pins the shuffle so a kernel restart reproduces the same split.
X_1_train,X_1_cross_validation,Y_1_train,Y_1_cross_validation = train_test_split(
    training_questions,training_articleTitles,test_size=0.2,random_state=42)
# Sanity-check the container type and the shapes of the four pieces.
print(type(X_1_train))
print(X_1_train.shape)
print(Y_1_train.shape)
print(X_1_cross_validation.shape)
print(Y_1_cross_validation.shape)

In [None]:
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
# Input of the second network: question features with the title features appended column-wise.
X_2 = hstack((training_questions,training_articleTitles))
print("X 2 horizontally stacked.")
# Hold out 20% for cross-validation, as stated for this section; the library default is 25%.
# random_state pins the shuffle so a kernel restart reproduces the same split.
X_2_train,X_2_cross_validation,Y_2_train,Y_2_cross_validation = train_test_split(
    X_2,training_answers,test_size=0.2,random_state=42)

In [None]:
# Build the dev-set input the same way as X_2: question features followed by title features.
X_2_dev = hstack((dev_questions,dev_articleTitles))
print("X 2 dev horizontally stacked.")

#### Save preprocessed datasets to a file.

In [None]:
from scipy.sparse import save_npz
# Persist the sparse input matrices of both networks, one file per matrix.
# NOTE(review): save_npz presumably appends the ".npz" extension to these names — confirm before loading.
save_npz("X_1_train",X_1_train)
save_npz("X_1_cross_validation",X_1_cross_validation)
save_npz("dev_questions",dev_questions)
save_npz("X_2_train",X_2_train)
save_npz("X_2_cross_validation",X_2_cross_validation)
save_npz("X_2_dev",X_2_dev)

In [None]:
from scipy.sparse import save_npz
# Persist the target matrices of both networks, one file per matrix.
save_npz("Y_1_train",Y_1_train)
save_npz("Y_1_cross_validation",Y_1_cross_validation)
save_npz("dev_articleTitles",dev_articleTitles)
# The answer targets are wrapped in csr_matrix before saving.
# NOTE(review): presumably because contentBinarizer returns dense arrays — confirm.
save_npz("Y_2_train",csr_matrix(Y_2_train))
save_npz("Y_2_cross_validation",csr_matrix(Y_2_cross_validation))
save_npz("dev_answers",csr_matrix(dev_answers))
print("Done!")

## Post-preprocessing analysis

In [None]:
# Get vocabularies from preprocessors above.


#### N-gram pyplot