## Setup

Import the libraries used throughout the notebook and check that the environment is working.

In [None]:
import numpy as np
from scipy.sparse import csr_matrix
from keras.models import Sequential
from keras.layers import Dense,LSTM,Activation,InputLayer,Input,BatchNormalization
from keras.callbacks import ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer
from IPython import display
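# device_lib can list the devices TensorFlow sees (e.g. to check for a GPU); note that nothing in this cell actually calls it.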
from tensorflow.python.client import device_lib
import sys

# Show the largest native container size supported by this interpreter build.
print(sys.maxsize)
# The rest of the notebook requires Python 3 or newer.
assert sys.version_info.major >= 3
print("Set up!")

## Preprocessing

For the preprocessing step, we will create two dictionaries. One will be used to map questions to articles, while the other will be used to map questions and the contents of the articles to the answers.

### Loading JSON datasets

In [None]:
import re  # imported here because this cell runs before the cell that imports `re`

# Matches every run of decimal digits. The previous pattern ('[1-9]*[1-9]') could not
# match the digit 0, so a number such as "2010" was split into "2" and "1" and the
# zeros were left behind in the text.
p = re.compile('[0-9]+')
def n2w(_string):
    """Spell out a string of digits as words.

    Parameters
    ----------
    _string : str
        Text expected to hold an integer (e.g. "12").

    Returns
    -------
    The value of ``num2words(int(_string))`` when the conversion succeeds,
    otherwise ``_string`` unchanged.
    """
    try:
        # Return the converted value directly; previously the only `return` lived in
        # the `except` branch, so successful conversions yielded None.
        return num2words(int(_string))
    except Exception:
        # Best effort: anything that cannot be converted is passed through untouched.
        # NOTE(review): `num2words` is not imported in the cells visible here; if it is
        # missing, the resulting NameError is swallowed as well and the input is returned.
        return _string
def convertNumbersToWords(_string):
    """Return `_string` with every match of the module-level pattern `p` replaced by `n2w` of the matched text."""
    def spell_out(match):
        # Hand only the matched text to n2w.
        return n2w(match.group())

    return p.sub(spell_out, _string)

In [2]:
import json
import re
import string
import pandas as pd
def readFile(filename):
    """Load a SQuAD-style JSON file into one flat DataFrame, one row per distinct question.

    The file must hold a top-level "data" list. Each entry has a "title" and a list of
    "paragraphs"; each paragraph has a "context" and a list of "qas"; each qas entry has
    a "question" and a list of "answers" (whose "text" becomes "answer_text").

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        One row per distinct question text (first occurrence kept), carrying the
        question's first answer and the context of its paragraph. Among the columns are
        "question", "title", "answer_id", "answer_text", "paragraphID" and "context";
        the colliding "id" columns of the merged tables receive pandas' "_x"/"_y" suffixes.
    """
    # pandas >= 1.0 exposes json_normalize at the top level; older releases only have
    # it under pd.io.json.
    json_normalize = getattr(pd, "json_normalize", None)
    if json_normalize is None:
        json_normalize = pd.io.json.json_normalize

    # Only parsing needs the file handle. JSON text is UTF-8, so do not rely on the
    # platform's default encoding.
    with open(filename, encoding="utf-8") as file:
        data = json.load(file)["data"]

    paragraphs = json_normalize(data, record_path="paragraphs")
    # Map paragraph IDs and qas questions. A paragraph's ID is its position in `paragraphs`.
    paragraphIDs_questions = pd.DataFrame([
        {"paragraphID": paragraphID, "question": qasElement["question"]}
        for paragraphID, qasList in enumerate(paragraphs["qas"])
        for qasElement in qasList
    ])
    print("Finished mapping paragraph IDs to qas questions. Now generating qas dataframe and doing other things.")
    qas = json_normalize(data, record_path=["paragraphs", "qas"], meta=["title"])
    paragraphs["id"] = paragraphs.index

    # For every question, record the row its first answer will occupy in the flattened
    # answers table built below (this relies on both tables being flattened in the same
    # question order). A list keeps the offsets aligned with the question rows; the
    # previous set gave no ordering guarantee.
    first_answer_ids = []
    next_answer_id = 0
    for answer_list in qas["answers"]:
        first_answer_ids.append(next_answer_id)
        next_answer_id += len(answer_list)
    print("Finished with answer ids.")
    print("Finished converting answer_ids to DataFrame.")
    # Map each question (by row position) to the ID of its first answer.
    question_answerId = qas[["question"]].assign(answer_id=first_answer_ids)
    print("finished creating intermediary table.")

    # Load answers into a data frame and give each answer an ID (its row number).
    answers = json_normalize(data, record_path=["paragraphs", "qas", "answers"])
    answers = answers.rename(columns={"text": "answer_text"})
    answers["id"] = answers.index
    print("Finished creating answers dataframe.")

    qas = qas.drop(labels=["answers"], axis=1)  # Not needed any longer; we have the answers!
    # Attach the first-answer ID to every qas row. The join key is the question text, so
    # repeated question texts fan out here and are collapsed again by drop_duplicates below.
    qas_answerId = pd.merge(qas, question_answerId, how="inner", on="question")
    print("Finished joining qas to answer id")
    # Pull in the answer rows themselves.
    qas = pd.merge(qas_answerId, answers, how="inner", left_on="answer_id", right_on="id")

    # Finally, include context.
    context = paragraphs[["context", "id"]]
    qasWithParagraphID = pd.merge(qas, paragraphIDs_questions, how="inner", on="question")
    qas = pd.merge(qasWithParagraphID, context, how="inner", left_on="paragraphID", right_on="id")
    qas = qas.drop_duplicates("question")
    assert not qas.duplicated("question").any()
    print("Done!")
    return qas

In [3]:
from sys import getsizeof
# An earlier `from IPython import display` binds the name `display` to the
# IPython.display *module*, which is not callable; import the function explicitly so
# this cell works regardless of which cells ran before it.
from IPython.display import display

trainingData = readFile("train-v1.1.json")
# Show only a preview rather than the whole frame.
display(trainingData.head(10))
print("Finished loading training data.")
print("Size of training data:",getsizeof(trainingData))

Finished mapping paragraph IDs to qas questions. Now generating qas dataframe and doing other things.
Finished with answer ids.
Finished converting answer_ids to DataFrame.
finished creating intermediary table.
Finished creating answers dataframe.
Finished joining qas to answer id
Done!


Unnamed: 0,id_x,question,title,answer_id,answer_start,answer_text,id_y,paragraphID,context,id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,University_of_Notre_Dame,0,515,Saint Bernadette Soubirous,0,0,"Architecturally, the school has a Catholic cha...",0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,University_of_Notre_Dame,1,188,a copper statue of Christ,1,0,"Architecturally, the school has a Catholic cha...",0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,University_of_Notre_Dame,2,279,the Main Building,2,0,"Architecturally, the school has a Catholic cha...",0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,University_of_Notre_Dame,3,381,a Marian place of prayer and reflection,3,0,"Architecturally, the school has a Catholic cha...",0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,University_of_Notre_Dame,4,92,a golden statue of the Virgin Mary,4,0,"Architecturally, the school has a Catholic cha...",0
5,5733bf84d058e614000b61be,When did the Scholastic Magazine of Notre dame...,University_of_Notre_Dame,5,248,September 1876,5,1,"As at most other universities, Notre Dame's st...",1
6,5733bf84d058e614000b61bf,How often is Notre Dame's the Juggler published?,University_of_Notre_Dame,6,441,twice,6,1,"As at most other universities, Notre Dame's st...",1
7,5733bf84d058e614000b61c0,What is the daily student paper at Notre Dame ...,University_of_Notre_Dame,7,598,The Observer,7,1,"As at most other universities, Notre Dame's st...",1
8,5733bf84d058e614000b61bd,How many student news papers are found at Notr...,University_of_Notre_Dame,8,126,three,8,1,"As at most other universities, Notre Dame's st...",1
9,5733bf84d058e614000b61c1,In what year did the student paper Common Sens...,University_of_Notre_Dame,9,908,1987,9,1,"As at most other universities, Notre Dame's st...",1


Finished loading training data.
Size of training data: 122352056


In [None]:
# The cells below use training_questions, training_articleTitles, training_articleTexts
# and training_answers, but nothing defined them; derive them from the DataFrame that
# readFile returned.
# NOTE(review): the column-to-variable mapping is inferred from the names - confirm.
training_questions = trainingData["question"].tolist()
training_articleTitles = trainingData["title"].tolist()
training_articleTexts = trainingData["context"].tolist()
training_answers = trainingData["answer_text"].tolist()
print("Amount of training data: ",len(training_questions))

In [None]:
from sys import getsizeof

# readFile returns a single DataFrame, so it cannot be unpacked into four names; load
# the frame and take the four collections from its columns instead.
# NOTE(review): the column-to-variable mapping is inferred from the names - confirm.
devData = readFile("dev-v1.1.json")
dev_questions = devData["question"].tolist()
dev_articleTitles = devData["title"].tolist()
dev_articleTexts = devData["context"].tolist()
dev_answers = devData["answer_text"].tolist()
print("Finished loading dev data.")
# getsizeof reports bytes; dividing by 1024 prints kilobytes.
print("Size of dev questions",getsizeof(dev_questions) / 1024)
print("Size of dev titles",getsizeof(dev_articleTitles) / 1024)
print("Size of dev texts",getsizeof(dev_articleTexts) / 1024)
print("Size of dev answers",getsizeof(dev_answers) / 1024)

In [None]:
# The question and answer collections must have the same length.
dev_question_count = len(dev_questions)
assert dev_question_count == len(dev_answers)
print("Amount of dev data: " + str(dev_question_count))

### Text preprocessing using scikit-learn and TensorFlow

### Get n-grams

The following section turns inputs into n-grams. Note that order is not necessarily preserved.

In [None]:
#Custom skip-gram vectorizer
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramVectorizer(CountVectorizer):
    """CountVectorizer whose features are 1-skip-2-grams: the first and third token of every 3-token window."""

    def build_analyzer(self):
        """Return a callable that turns one raw document into its skip-gram features."""
        preprocess = self.build_preprocessor()
        tokenize = self.build_tokenizer()
        stop_words = self.get_stop_words()

        def analyze(doc):
            # decode -> preprocess -> tokenize, then build the skip-grams.
            tokens = tokenize(preprocess(self.decode(doc)))
            return self._word_skip_grams(tokens, stop_words)

        return analyze

    def _word_skip_grams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of 1-skip-2-grams after stop words filtering"""
        # Drop stop words first so that windows are formed over the remaining tokens.
        if stop_words is not None:
            tokens = [token for token in tokens if token not in stop_words]

        windows = sliding_window(3, tokens)
        # Keep the outer two tokens of each window, skipping the middle one.
        outer_pairs = pluck([0, 2], windows)
        return map(' '.join, outer_pairs)

#### Vectorize questions

In [None]:
def vectorize(trainData,devData):
    """Fit one SkipGramVectorizer on both collections and transform each of them with it.

    Returns a tuple: (vectorized trainData, vectorized devData, the fitted vocabulary_).
    """
    skip_gram_vectorizer = SkipGramVectorizer()
    # The vocabulary is learned from the concatenation of both inputs.
    combined = trainData + devData
    skip_gram_vectorizer.fit(combined)
    print("Vectorizer fit.")
    train_matrix = skip_gram_vectorizer.transform(trainData)
    print("First text vectorized")
    dev_matrix = skip_gram_vectorizer.transform(devData)
    print("Second text vectorized")
    return train_matrix, dev_matrix, skip_gram_vectorizer.vocabulary_

In [None]:

# Vectorize all questions.
# Both names are rebound from the raw inputs to the vectorized outputs, so re-running
# this cell would feed the already-vectorized values back into vectorize().
training_questions,dev_questions,question_vocabulary = vectorize(training_questions,dev_questions)
# Prints the whole shape tuple of the vectorized training questions.
print("Number of features:",training_questions.shape)

#### Vectorize article titles

In [None]:
# Vectorize the article titles of both the training and the dev set with one shared vectorizer.
# Both names are rebound to the vectorized outputs, so re-running this cell would feed
# the already-vectorized values back into vectorize().
training_articleTitles,dev_articleTitles,articleTitles_vocabulary = vectorize(training_articleTitles,dev_articleTitles)
print("Article titles vectorized")

#### Vectorize article content

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# One binarizer is fit on the article texts of both sets; later cells reuse it for the answers.
contentBinarizer = MultiLabelBinarizer()
print(getsizeof(contentBinarizer))
# Concatenate the training and dev article texts.
contentTotal = training_articleTexts + dev_articleTexts
print("Gathered content")
# NOTE(review): MultiLabelBinarizer expects an iterable of label collections. If the
# elements of contentTotal are plain strings, each string is treated as a collection of
# single characters - confirm that this is intended.
contentBinarizer.fit(contentTotal)
print("Content binarizer fit.")

In [None]:
# Encode the training answers with the binarizer that was fit on the article texts.
# training_answers is rebound to the encoded result, so re-running this cell would
# transform the already-encoded values.
training_answers = contentBinarizer.transform(training_answers)
print("Training answers binarized.")

In [None]:
# Encode the dev answers with the binarizer that was fit on the article texts.
# dev_answers is rebound to the encoded result, so re-running this cell would
# transform the already-encoded values.
dev_answers = contentBinarizer.transform(dev_answers)
print("Dev answers binarized")

In [None]:
# Report the second dimension (column count) of each preprocessed matrix.
print("Number of features in questions:",training_questions.shape[1])
print("Number of features in answers:",training_answers.shape[1])
print("Number of features in article titles:",training_articleTitles.shape[1])
# NOTE(review): training_articleContent is not assigned in any cell shown in this
# notebook, so this line raises NameError unless it is defined elsewhere - confirm.
print("Number of features in content:",training_articleContent.shape[1])

## Data normalization

### More setup

In [None]:
# Get the input shapes. The shapes are read straight from the sparse matrices: calling
# .toarray() first would build full dense copies only to look at .shape, which is the
# same tuple either way.
inputShape_first = training_questions.shape[1:] # Input shape of the first neural network.
article_titles_shape = training_articleTitles.shape
print("First neural network X shape: " + str(training_questions.shape))
print("First neural network input shape: " + str(inputShape_first))
print("Article titles array shape: " + str(article_titles_shape))

#### Splitting the data into training and cross-validation sets.


In [None]:
# 20% of data will be set aside for cross-validation
from sklearn.model_selection import train_test_split
# test_size is passed explicitly: without it train_test_split holds out 25%, not the
# 20% stated above.
X_1_train,X_1_cross_validation,Y_1_train,Y_1_cross_validation = train_test_split(training_questions,training_articleTitles,test_size=0.2)
print(type(X_1_train))
print(X_1_train.shape)
print(Y_1_train.shape)
print(X_1_cross_validation.shape)
print(Y_1_cross_validation.shape)

In [None]:
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
# Inputs of the second network: question features and article-title features side by side.
X_2 = hstack((training_questions,training_articleTitles))
print("X 2 horizontally stacked.")
# Hold out 20% for cross-validation, the fraction this section states for the split of
# the first network; without test_size the default of 25% would be used.
X_2_train,X_2_cross_validation,Y_2_train,Y_2_cross_validation = train_test_split(X_2,training_answers,test_size=0.2)

In [None]:
# Dev-set inputs of the second network: question features and article-title features side by side.
X_2_dev = hstack((dev_questions,dev_articleTitles))
print("X 2 dev horizontally stacked.")

#### Save preprocessed datasets to a file.

In [None]:
from scipy.sparse import save_npz

# Persist the input matrices of both networks, one file per matrix, in this order.
for file_name, matrix in (
    ("X_1_train", X_1_train),
    ("X_1_cross_validation", X_1_cross_validation),
    ("dev_questions", dev_questions),
    ("X_2_train", X_2_train),
    ("X_2_cross_validation", X_2_cross_validation),
    ("X_2_dev", X_2_dev),
):
    save_npz(file_name, matrix)

In [None]:
from scipy.sparse import save_npz

# Targets of the first network are saved as they are.
for file_name, matrix in (
    ("Y_1_train", Y_1_train),
    ("Y_1_cross_validation", Y_1_cross_validation),
    ("dev_articleTitles", dev_articleTitles),
):
    save_npz(file_name, matrix)

# Targets of the second network are wrapped in csr_matrix right before being saved.
for file_name, target in (
    ("Y_2_train", Y_2_train),
    ("Y_2_cross_validation", Y_2_cross_validation),
    ("dev_answers", dev_answers),
):
    save_npz(file_name, csr_matrix(target))
print("Done!")

## Post-preprocessing analysis

In [None]:
# Get vocabularies from preprocessors above.


#### N-gram pyplot