## Setup

 Just to verify that everything's working.

In [13]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from keras.models import Sequential
from keras.layers import Dense,LSTM,Activation,InputLayer,Input,BatchNormalization
from keras.callbacks import ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display
import nltk
nltk.download("punkt")
import sys
assert sys.version_info[0] >= 3
print("Set up!")

Set up!


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocessing

For the preprocessing step, we will create two dictionaries. One will be used to map questions to articles, while the other will be used to map questions and the contents of the articles to the answers.

### Loading JSON datasets

In [14]:
trainingData = pd.read_csv("training_data.csv",dtype=object)

In [15]:
from sys import getsizeof
def summaryStatistics(series):
    numberOfWords = series.apply(lambda x: len(str(x).split(" ")))
    averageNumberOfWords = sum(numberOfWords) / len(numberOfWords)
    return "average: " + str(averageNumberOfWords) + "maximum: " + str(max(numberOfWords)) + " minimum: " +str(min(numberOfWords))
print("Finished loading training data.")
print("Words in each question: ",summaryStatistics(trainingData["question"]))
print("Words in each article title: ",summaryStatistics(trainingData["title"]))
print("Words in each context: ",summaryStatistics(trainingData["context"]))
print("Words in each answer: ",summaryStatistics(trainingData["answer_text"]))

Finished loading training data.
Words in each question:  average: 11.29124835441589maximum: 60 minimum: 1
Words in each article title:  average: 2.1835040924961366maximum: 10 minimum: 1
Words in each context:  average: 137.88847804933891maximum: 766 minimum: 22
Words in each answer:  average: 3.3740827657260604maximum: 46 minimum: 1


In [16]:
devData = pd.read_csv("dev_data.csv",dtype=object)
print("Finished loading dev data.")
print("Size of dev data:" + str(getsizeof(devData) / 1024**2) + " MB")

Finished loading dev data.
Size of dev data:14.796387672424316 MB


Let's create new columns in the training and dev datasets for where in the context strings the answers end.

In [17]:
sum_training = trainingData["answer_start"].astype("int64") + trainingData["answer_text"].astype("str").str.len()
trainingData["answer_end"] = sum_training
print("Training answer end column created.")

Training answer end column created.


In [18]:
sum_dev = devData["answer_start"].astype("int64") + devData["answer_text"].astype("str").str.len()
devData["answer_end"] = sum_dev
print("Dev answer end column created.")

Dev answer end column created.


Now let's convert answer_start to an integer.

In [19]:
trainingData["answer_start"] = trainingData["answer_start"].astype("int64")
devData["answer_start"] = devData["answer_start"].astype("int64")
print("Converted back to int64")

Converted back to int64


Let's make sure there are no NaNs:

In [20]:
trainingNans = pd.concat((trainingData["answer_start"].loc[trainingData["answer_start"].isnull()],trainingData["answer_end"].loc[trainingData["answer_end"].isnull()]))
assert trainingNans.shape[0] == 0,"NANs detected in training set: " + str(trainingNans)
devNans = pd.concat((devData["answer_start"].loc[devData["answer_start"].isnull()],devData["answer_end"].loc[devData["answer_end"].isnull()]))
assert devNans.shape[0] == 0, "NANs detected in dev set: " + str(devNans)
print("Everything's all good!")

Everything's all good!



### Text preprocessing using scikit-learn and TensorFlow

### Get n-grams

The following section turns inputs into n-grams. Note that order is not necessarily preserved.

In [21]:
#Custom skip-gram vectorizer
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):    
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        return lambda doc: self._word_skip_grams(
                compose(tokenize, preprocess, self.decode)(doc),
                stop_words)
    
    def _word_skip_grams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of 1-skip-2-grams after stop words filtering"""
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens)

#### Vectorize questions

In [22]:
def vectorize(trainData,devData):
    vectorizer = SkipGramVectorizer()
    combinedData = pd.concat((trainData,devData))
    vectorizer.fit(combinedData)
    print("Vectorizer fit.")
    trainData_vectorized = vectorizer.transform(trainData)
    print("First text vectorized")
    devData_vectorized = vectorizer.transform(devData)
    print("Second text vectorized")
    return trainData_vectorized,devData_vectorized,vectorizer.vocabulary_

In [23]:

# Vectorize all questions.
training_questions,dev_questions,questionsVocabulary = vectorize(trainingData["question"],devData["question"])
print("Shape:",training_questions.shape)

Vectorizer fit.
First text vectorized
Second text vectorized
Shape: (87355, 370435)


In [24]:
keys = list(questionsVocabulary.keys())
print("# of keys", len(keys))
print(keys[0:10])

# of keys 370435
['knots it', 'provisions the', 'about electron', 'antibiotics food', 'in earn', 'composer producer', 'are per', 'veneration images', 'lower music', 'have postoperative']


#### Vectorize article titles

In [25]:
# Training set.
from sklearn.preprocessing import MultiLabelBinarizer
combinedArticleTitles = pd.concat((trainingData["title"],devData["title"]))
binarizer = MultiLabelBinarizer().fit(combinedArticleTitles.str.split(" "))
print("Article titles binarized")

Article titles binarized


In [26]:
print("Number of classes: ",len(binarizer.classes_))

Number of classes:  761


#### Vectorize article content (including answers)

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer
contextTotal = pd.concat((trainingData["context"],devData["context"]))
print("Gathered content; fitting binarizer")
contextBinarizer = MultiLabelBinarizer().fit(contextTotal)
print("Binarizer fit. Transforming data.")
trainingContextBinarized = contextBinarizer.transform(trainingData["context"])
devContextBinarized = contextBinarizer.transform(devData["context"])
print("Data transformed.")
#print("Content vectorizer fit.")
#print("Vocabulary: " + str(articleContentVocabulary))

Gathered content; fitting binarizer
Binarizer fit. Transforming data.
Data transformed.


In [28]:
print("Number of classes",len(contextBinarizer.classes_))
#print(pd.Series(contextBinarizer.classes_))

Number of classes 1420


#### Binarize the answers.

In [29]:
import math
print("Gathering answers and formatting them.")
print("Processed answers. Binarizing answers")
# Convert answer_start and answer_end columns to equal-width arrays containing ones and zeros.
# Get the highest possible number across the two columns.
highestNumber = pd.concat((trainingData["answer_end"],devData["answer_end"])).max()
binaryNumberWidth = int(math.log(highestNumber,2)) + 1
trainingAnswerStartsBinarized = trainingData["answer_start"].map(lambda x: [int(y) for y in np.binary_repr(int(x),width=binaryNumberWidth)])
trainingAnswerEndsBinarized = trainingData["answer_end"].map(lambda x: [int(y) for y in np.binary_repr(int(x),width=binaryNumberWidth)])
devAnswerStartsBinarized = devData["answer_start"].map(lambda x: [int(y) for y in np.binary_repr(int(x),width=binaryNumberWidth)])
devAnswerEndsBinarized = devData["answer_end"].map(lambda x: [int(y) for y in np.binary_repr(int(x),width=binaryNumberWidth)])
#trainingAnswerEndsBinarized = pd.get_dummies(trainingAnswers["answer_end"])
#devAnswerStartsBinarized = pd.get_dummies(devAnswers["answer_start"])
#devAnswerEndsBinarized = pd.get_dummies(devAnswers["answer_end"])
print("Finished binarizing answers.")

Gathering answers and formatting them.
Processed answers. Binarizing answers
Finished binarizing answers.


#### Splitting the data into testing and cross-validation sets.


In [None]:
# 20% of data will be set aside for cross-validation
from sklearn.model_selection import train_test_split
X_1_train,X_1_cross_validation,Y_1_train,Y_1_cross_validation = train_test_split(training_questions,training_articleTitles)
print(type(X_1_train))
print(X_1_train.shape)
print(Y_1_train.shape)
print(X_1_cross_validation.shape)
print(Y_1_cross_validation.shape)

In [None]:
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
X_2 = hstack((training_questions,training_articleTitles))
print("X 2 horizontally stacked.")
X_2_train,X_2_cross_validation,Y_2_train,Y_2_cross_validation = train_test_split(X_2,training_answers)

In [None]:
X_2_dev = hstack((dev_questions,dev_articleTitles))
print("X 2 dev horizontally stacked.")

#### Save preprocessed datasets to a file.

In [None]:
from scipy.sparse import save_npz
save_npz("X_1_train",X_1_train)
save_npz("X_1_cross_validation",X_1_cross_validation)
save_npz("dev_questions",dev_questions)
save_npz("X_2_train",X_2_train)
save_npz("X_2_cross_validation",X_2_cross_validation)
save_npz("X_2_dev",X_2_dev)

In [None]:
from scipy.sparse import save_npz
save_npz("Y_1_train",Y_1_train)
save_npz("Y_1_cross_validation",Y_1_cross_validation)
save_npz("dev_articleTitles",dev_articleTitles)
save_npz("Y_2_train",csr_matrix(Y_2_train))
save_npz("Y_2_cross_validation",csr_matrix(Y_2_cross_validation))
save_npz("dev_answers",csr_matrix(dev_answers))
print("Done!")

## Post-preprocessing analysis

In [None]:
# Get vocabularies from preprocessors above.


#### N-gram pyplot