In [2]:
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.python.keras.utils import to_categorical

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

import csv
import string

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/noahbilgrien/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Reading the data

In [3]:
with open('Harvard_QA_dataset.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    data = [*reader]
    
    questions_raw = [row[0] for row in data]
    answers_raw = [row[1] for row in data]

# Cleaning the data

Future work
* Use google's Active Question Reformulation to generate alternate forms of the question

## 1) removing punctuation

In [4]:
table = str.maketrans({key: None for key in string.punctuation})
def remove_punct(s):
    return s.translate(table)

# 2) converting to lowercase

In [5]:
def lowercase(s):
    return s.lower()

# Stemming and lemmatization
Reducing the dimensionality of the data

Future work
* Parse the sentence to get POS tagging and feed the POS into the lemmatizer

In [6]:
lem = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    return " ".join([lem.lemmatize(w) for w in sentence.split(" ")])

In [7]:
def preprocess(sentence):
    s = remove_punct(sentence)
    s = lowercase(s)
    s = lemmatize_sentence(s)
    return s

In [8]:
questions = [preprocess(s) for s in questions_raw]

# Creating the embedded representation

## 1) create the vocabulary

In [9]:
from collections import defaultdict

vocab = set()
for q in questions:
    for w in q.split(" "):
        if w == "":
            continue
        vocab.add(w)

vocab_size = len(vocab) + 1
vocab_to_id = defaultdict(lambda: 0)
id_to_vocab = {}
for i, w in enumerate(vocab):
    vocab_to_id[w] = i + 1

## 2) create the encoding function

In [46]:
def encode(sentence, length=128):
    sentence = preprocess(sentence)
    sentence = sentence.split(" ")
    encoding = np.zeros((length))
    for i, word in enumerate(sentence):
        if i >= 128:
            break
        encoding[i] = vocab_to_id[word]
    return encoding

def one_hot_encode(sentence, length=128):
    sentence = preprocess(sentence)
    sentence = sentence.split(" ")
    onehot = np.zeros((length, vocab_size))
    for i, word in enumerate(sentence):
        onehot[i, vocab_to_id[word]] = 1
    return onehot

# Assemble the training data

In [64]:
X = np.array([encode(s) for s in questions_raw])
y = np.array([encode(s) for s in answers_raw])

# Build the model

In [65]:
model = keras.Sequential()

embedding = keras.layers.Embedding(vocab_size, 64, input_length=128)
encoder = keras.layers.LSTM(64, return_sequences=True)
decoder = keras.layers.LSTM(64, return_sequences=True)

model.add(embedding)
model.add(encoder)
model.add(decoder)
model.add(keras.layers.TimeDistributed(keras.layers.Dense(vocab_size)))
model.add(keras.layers.Activation('softmax'))

In [66]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 128, 64)           31360     
_________________________________________________________________
lstm_16 (LSTM)               (None, 128, 64)           33024     
_________________________________________________________________
lstm_17 (LSTM)               (None, 128, 64)           33024     
_________________________________________________________________
time_distributed_5 (TimeDist (None, 128, 490)          31850     
_________________________________________________________________
activation_8 (Activation)    (None, 128, 490)          0         
Total params: 129,258
Trainable params: 129,258
Non-trainable params: 0
_________________________________________________________________


In [68]:
model.fit(
    X,
    X,
    epochs=2
)

ValueError: Error when checking target: expected activation_8 to have 3 dimensions, but got array with shape (215, 128)

In [14]:
trained_encoder = keras.Sequential([
    embedding,
    encoder
])

In [17]:
output = model.predict(np.array([X[12]]))

In [18]:
output[0][0]

array([0.00213578, 0.00204282, 0.00204018, 0.00203818, 0.00204271,
       0.0020423 , 0.00203741, 0.0020354 , 0.0020338 , 0.00203398,
       0.00207542, 0.0020351 , 0.00203524, 0.00204594, 0.00204224,
       0.00203967, 0.00203117, 0.00204013, 0.00203694, 0.00204547,
       0.00204741, 0.00204424, 0.00203958, 0.00204145, 0.00204056,
       0.0020396 , 0.00203801, 0.00203601, 0.0020399 , 0.00203982,
       0.00204174, 0.00203375, 0.00203445, 0.00204293, 0.00204878,
       0.00204439, 0.00204126, 0.0020326 , 0.00205158, 0.00204121,
       0.00204232, 0.00205911, 0.0020332 , 0.00203299, 0.00203442,
       0.00204505, 0.00204459, 0.00203896, 0.00203502, 0.0020398 ,
       0.00204497, 0.00203636, 0.00203661, 0.00204199, 0.0020393 ,
       0.00204295, 0.00203745, 0.00204424, 0.00204622, 0.0020427 ,
       0.00203543, 0.00203956, 0.00204635, 0.00204172, 0.00203976,
       0.00203667, 0.00204454, 0.00203826, 0.00204585, 0.00204058,
       0.00204407, 0.00202867, 0.00203954, 0.00204082, 0.00204

In [19]:
encode("How can I get a receipt for my application fee?")

array([205.,  14., 428., 242., 451., 235., 407., 132., 317.,  22.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.])

In [51]:
np.array([X[0]]).shape

(1, 128)

In [23]:
X

array([[ 41., 420., 289., ...,   0.,   0.,   0.],
       [205., 220., 428., ...,   0.,   0.,   0.],
       [ 14., 352., 200., ...,   0.,   0.,   0.],
       ...,
       [ 86., 117., 428., ...,   0.,   0.,   0.],
       [ 86., 477., 117., ...,   0.,   0.,   0.],
       [ 86., 477.,  10., ...,   0.,   0.,   0.]])

In [30]:
y = to_categorical(X, num_classes=vocab_size)

In [31]:
y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [35]:
y[1] == y[0]

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])