## Setup

 Just to verify that everything's working.

In [50]:
import numpy as np
from scipy.sparse import csr_matrix
from keras.models import Sequential
from keras.layers import Dense,LSTM,Activation,InputLayer,Input,BatchNormalization
from keras.callbacks import ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display
# Verify we are using GPU.
from tensorflow.python.client import device_lib
import sys
print(sys.maxsize)
assert sys.version_info[0] >= 3
print("Set up!")

9223372036854775807
Set up!


## Preprocessing

For the preprocessing step, we will create two dictionaries. One will be used to map questions to articles, while the other will be used to map questions and the contents of the articles to the answers.

### Loading JSON datasets

In [None]:
p = re.compile('[1-9]*[1-9]')
def n2w(_string):
    isInt = True
    stringToReturn = ""
    try:
        stringToReturn = num2words(int(_string))
    except:
        stringToReturn = _string
        #assert isinstance(stringToReturn,str)
        return stringToReturn
def convertNumbersToWords(_string):
    #Error: expected string?
    #assert isinstance(_string,str)
    _string_copy = p.sub(lambda x: n2w(x.group()),_string)
    return _string_copy

In [97]:
import json
import re
import string
import pandas as pd
def readFile(filename):
    with open(filename) as file:
        data = json.load(file)["data"]
        paragraphs = pd.io.json.json_normalize(data,record_path="paragraphs")
        # Map paragraph IDs and qas IDs.
        paragraphIDs_questions = []
        lastParagraphId = 0
        for index,paragraph in paragraphs.iterrows():
            for qasElement in paragraph["qas"]:
                paragraphIDs_questions.append({"paragraphID":lastParagraphId,"question":qasElement["question"]})
            lastParagraphId = lastParagraphId + 1
        paragraphIDs_questions = pd.DataFrame(paragraphIDs_questions)
        print("Finished mapping paragraph IDs to qas questions. Now generating qas dataframe and doing other things.")
        qas = pd.io.json.json_normalize(data,record_path=["paragraphs","qas"],meta=["title"])
        paragraphs["id"] = paragraphs.index
        #print(qas["question"])
        #Gather a list of where all answers should be so we can shove them into a DataFrame.
        # Haven't found a more efficient way to do this yet.
        answer_ids = set()
        answerId = 0
        for index,row in qas.iterrows():
            answer_ids.add(answerId)
            answerId = answerId + len(row["answers"])
        print("Finished with answer ids.")
        # Map qas pair IDs to answer IDs.
        answer_ids = pd.DataFrame(list(answer_ids))
        print("Finished converting answer_ids to DataFrame.")
        question_answerId = pd.DataFrame(qas["question"]).join(answer_ids,how="outer")
        question_answerId.columns = ["question","answer_id"]
        #print("Id-answerID columns: ",id_answerId.columns)
        print("finished creating intermediary table.")
        # Load answers into a data frame.
        answers = pd.io.json.json_normalize(data,record_path=["paragraphs","qas","answers"])
        answers.rename(columns={"text":"answer_text"},inplace=True)
        # Give each answer an ID.
        answers["id"] = answers.index
        print("Finished creating answers dataframe.")
        qas = qas.drop(labels=["answers"],axis=1) # Not needed any longer; we have the answers!
        #print("Dropped column 'answers' from qas.")
        # Map qas dataframe to answer table via id_answerId
        qas_answerId = pd.merge(qas,question_answerId,how="inner",on="question")
        print("Finished joining qas to answer id")
        # Merge qas_answerId with answers.
        qas = pd.merge(qas_answerId,answers,how="inner",left_on="answer_id",right_on="id")
        #print("Returned data frame: ",returnDataFrame)
        # Finally, include context.
        context = paragraphs[["context","id"]]
        qasWithParagraphID = pd.merge(qas,paragraphIDs_questions,how="inner",on="question")
        qas = pd.merge(qasWithParagraphID,context,how="inner",left_on="paragraphID",right_on="id")
        qas = qas.drop_duplicates("question")
        assert qas.duplicated("question").any() == False
        # Prune the dataframe.
        # NOTE: Answer_start column may end up being useful.
        qas = qas.drop(labels=["id_x","id_y","answer_id","paragraphID","id"],axis=1)
        # Remove punctuation from all text columns.
        qas["title"] = qas['title'].str.replace('_',' ')
        qas["title"] = qas['title'].str.replace('[{}]'.format(string.punctuation), '')
        qas["question"] = qas['question'].str.replace('[{}]'.format(string.punctuation), '')
        qas["context"] = qas['context'].str.replace('[{}]'.format(string.punctuation), '')
        qas["answer_text"] = qas['answer_text'].str.replace('[{}]'.format(string.punctuation), '')
        print("Done!")
        return qas

In [98]:
trainingData = readFile("train-v1.1.json")
display(trainingData)
from sys import getsizeof
print("Finished loading training data.")
print("Size of training data:" + str(getsizeof(trainingData) / 1024**2) + " MB")

Finished mapping paragraph IDs to qas questions. Now generating qas dataframe and doing other things.
Finished with answer ids.
Finished converting answer_ids to DataFrame.
finished creating intermediary table.
Finished creating answers dataframe.
Finished joining qas to answer id
Done!


Unnamed: 0,question,title,answer_start,answer_text,context
0,To whom did the Virgin Mary allegedly appear i...,University of Notre Dame,515,Saint Bernadette Soubirous,Architecturally the school has a Catholic char...
1,What is in front of the Notre Dame Main Building,University of Notre Dame,188,a copper statue of Christ,Architecturally the school has a Catholic char...
2,The Basilica of the Sacred heart at Notre Dame...,University of Notre Dame,279,the Main Building,Architecturally the school has a Catholic char...
3,What is the Grotto at Notre Dame,University of Notre Dame,381,a Marian place of prayer and reflection,Architecturally the school has a Catholic char...
4,What sits on top of the Main Building at Notre...,University of Notre Dame,92,a golden statue of the Virgin Mary,Architecturally the school has a Catholic char...
5,When did the Scholastic Magazine of Notre dame...,University of Notre Dame,248,September 1876,As at most other universities Notre Dames stud...
6,How often is Notre Dames the Juggler published,University of Notre Dame,441,twice,As at most other universities Notre Dames stud...
7,What is the daily student paper at Notre Dame ...,University of Notre Dame,598,The Observer,As at most other universities Notre Dames stud...
8,How many student news papers are found at Notr...,University of Notre Dame,126,three,As at most other universities Notre Dames stud...
9,In what year did the student paper Common Sens...,University of Notre Dame,908,1987,As at most other universities Notre Dames stud...


Finished loading training data.
Size of training data:104.9325761795044 MB


In [None]:
assert len(dev_questions) == len(dev_answers)
print("Amount of dev data: " + str(len(dev_questions)))

### Text preprocessing using scikit-learn and TensorFlow

### Get n-grams

The following section turns inputs into n-grams. Note that order is not necessarily preserved.

In [34]:
#Custom skip-gram vectorizer
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):    
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        return lambda doc: self._word_skip_grams(
                compose(tokenize, preprocess, self.decode)(doc),
                stop_words)
    
    def _word_skip_grams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of 1-skip-2-grams after stop words filtering"""
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens)

#### Vectorize questions

In [58]:
def vectorize(trainData,devData):
    vectorizer = SkipGramVectorizer()
    combinedData = pd.concat((trainData,devData))
    vectorizer.fit(combinedData)
    print("Vectorizer fit.")
    trainData_vectorized = vectorizer.transform(trainData)
    print("First text vectorized")
    devData_vectorized = vectorizer.transform(devData)
    print("Second text vectorized")
    return trainData_vectorized,devData_vectorized,vectorizer.vocabulary_

In [101]:

# Vectorize all questions.
training_questions,dev_questions,vocabulary1 = vectorize(trainingData["question"],devData["question"])
print("Number of features:",training_questions.shape)

Vectorizer fit.
First text vectorized
Second text vectorized
Number of features: (87355, 371624)


#### Vectorize article titles

In [102]:
# Training set.
training_articleTitles,dev_articleTitles,articleTitles_vocabulary = vectorize(trainingData["title"],devData["title"])
print("Article titles vectorized")

Vectorizer fit.
First text vectorized
Second text vectorized
Article titles vectorized


#### Vectorize article content

In [123]:
from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer
contentTotal = pd.concat((trainingData["context"],devData["context"]))
print("Gathered content")
series_split = lambda series: series.apply(lambda x: x.split(" "))
print(series_split(pd.Series("Who are you"))) # Test code.
contentTotal = series_split(contentTotal)
contentBinarizer = MultiLabelBinarizer()
contentBinarizer.fit(contentTotal)
test = contentBinarizer.inverse_transform(contentBinarizer.transform(series_split(pd.Series("Saint Bernadette"))))
print("Test",test)
print("Content binarizer fit.")

Gathered content
0    [Who, are, you]
dtype: object
Test [('Bernadette', 'Saint')]
Content binarizer fit.


In [127]:
training_context = contentBinarizer.transform(series_split(trainingData["context"]))
print("Training context binarized.")

Training answers binarized.


In [130]:
dev_context = contentBinarizer.transform(series_split(devData["context"]))
print("Dev context binarized")

Dev context binarized


#### Binarize the answers.

In [137]:
answersBinarizer = MultiLabelBinarizer()
combinedAnswers = pd.concat((trainingData["answer_text"],devData["answer_text"]))
answersBinarizer.fit(series_split(combinedAnswers))

MultiLabelBinarizer(classes=None, sparse_output=False)

In [138]:
training_answers = answersBinarizer.transform(series_split(trainingData["answer_text"]))
print("Training answers binarized")

Training answers binarized


In [139]:
dev_answers = answersBinarizer.transform(series_split(devData["answer_text"]))
print("Dev answers binarized")

Dev answers binarized


In [141]:
print("Number of features for questions vectorizer:",len(vocabulary1))
print("Number of features in article titles:",len(articleTitles_vocabulary))
print("Number of features in context:",len(contentBinarizer.classes_))
print("Number of features in answers: ",len(answersBinarizer.classes_))

Number of features for questions vectorizer: 371624
Number of features in article titles: 188
Number of features in context: 117836
Number of features in answers:  50518


## Data normalization

Now that we've vectorized the questions and binarized the article context, we now have two more problems. First, the questions are represented by ordinal numbers. Our neural network will work best if the questions are represented as binary numbers; i.e, they contain only ones and zeroes. Second, each question in the dataset has 371,624 features; each item in the article context has 117,836 features. The dataset itself contains approximately 90,000 entries. This is a common problem in data science; having more features than data can lead the algorithm to miss significant relationships between the features and the end result.

### More setup

In [None]:
# convert data to numpy arrays, get input shape
inputShape_first = training_questions.toarray().shape[1:] # Input shape of the first neural network.
article_titles_shape = training_articleTitles.toarray().shape
print("First neural network X shape: " + str(training_questions.shape))
print("First neural network input shape: " + str(inputShape_first))
print("Article titles array shape: " + str(article_titles_shape))

#### Splitting the data into testing and cross-validation sets.


In [None]:
# 20% of data will be set aside for cross-validation
from sklearn.model_selection import train_test_split
X_1_train,X_1_cross_validation,Y_1_train,Y_1_cross_validation = train_test_split(training_questions,training_articleTitles)
print(type(X_1_train))
print(X_1_train.shape)
print(Y_1_train.shape)
print(X_1_cross_validation.shape)
print(Y_1_cross_validation.shape)

In [None]:
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
X_2 = hstack((training_questions,training_articleTitles))
print("X 2 horizontally stacked.")
X_2_train,X_2_cross_validation,Y_2_train,Y_2_cross_validation = train_test_split(X_2,training_answers)

In [None]:
X_2_dev = hstack((dev_questions,dev_articleTitles))
print("X 2 dev horizontally stacked.")

#### Save preprocessed datasets to a file.

In [None]:
from scipy.sparse import save_npz
save_npz("X_1_train",X_1_train)
save_npz("X_1_cross_validation",X_1_cross_validation)
save_npz("dev_questions",dev_questions)
save_npz("X_2_train",X_2_train)
save_npz("X_2_cross_validation",X_2_cross_validation)
save_npz("X_2_dev",X_2_dev)

In [None]:
from scipy.sparse import save_npz
save_npz("Y_1_train",Y_1_train)
save_npz("Y_1_cross_validation",Y_1_cross_validation)
save_npz("dev_articleTitles",dev_articleTitles)
save_npz("Y_2_train",csr_matrix(Y_2_train))
save_npz("Y_2_cross_validation",csr_matrix(Y_2_cross_validation))
save_npz("dev_answers",csr_matrix(dev_answers))
print("Done!")

## Post-preprocessing analysis

In [None]:
# Get vocabularies from preprocessors above.


#### N-gram pyplot