In [120]:
import os, re, sys, time, json, codecs
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from inspect import getargspec

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from sklearn import metrics

from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

from keras import backend as K
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Conv1D, Flatten, concatenate, multiply, Dense, Input, InputLayer, LSTM, Embedding, Dropout, Activation, Masking, RepeatVector, Reshape
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import Concatenate
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.recurrent import GRU
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D, MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Working directory for the rest of the notebook: all dataset/model paths used
# later are relative. Set the QA_PROJECT_DIR environment variable to run on
# another machine; the original location remains the default.
# NOTE(review): the `Code.layers` imports that follow presumably rely on this
# directory being the current working directory — confirm.
PROJECT_DIR = os.environ.get("QA_PROJECT_DIR", "/Users/meif/Desktop/SI 630 NLP/Project/")
os.chdir(PROJECT_DIR)

from Code.layers.SharedWeight import SharedWeight
from Code.layers.VariationalDropout import VariationalDropout
from Code.layers.QuestionAttnGRU import QuestionAttnGRU
from Code.layers.SelfAttnGRU import SelfAttnGRU
from Code.layers.QuestionPooling import QuestionPooling

% matplotlib inline 

In [43]:
# Initialization

# Dataset variant tag; substituted into the dataset folder and file names.
N_TRAININGPOINTS = "10000_reidx"
# Label-encoding tag; substituted into the y_* file names.
OUTPUT_TYPE = "multi"

# Maximum sequence lengths (original notes kept alongside each value).
MAX_SEQUENCE_LENGTH_NEWS, MAX_SEQUENCE_LENGTH_QUES = (
    300,  # median ~ 650
    37,   # max ~ 37
)

# Embedding widths; the combined width is the word part plus the char part.
EMBEDDING_DIM_WORD, EMBEDDING_DIM_CHAR = 100, 25
EMBEDDING_DIM = sum((EMBEDDING_DIM_WORD, EMBEDDING_DIM_CHAR))

# Placeholder symbols (presumably for out-of-vocabulary words, characters and
# entities — confirm against the preprocessing code).
UNK_WORD, UNK_CHAR, UNK_ENTITY = "<UNK_WORD>", "^", "<UNK_ENTITY>"

# Training hyper-parameters.
DROPOUT_RATE = 0.3
H_DIM = 24
BATCH_SIZE = 50
NB_EPOCHS = 50

# Output file names for saved models.
MODEL_PATH, BST_MODEL_PATH = 'QA_model.h5', 'QA_model.model'

# 0. Helper Functions

In [3]:
def generate_data(N_data, Q_data, y_data, batch_size=BATCH_SIZE):
    """Yield ``([N_batch, Q_batch], y_batch)`` mini-batches forever.

    The three inputs are sliced in lock-step along their first axis and each
    slice is converted to a float32 NumPy array. After the last full batch the
    generator wraps around to the start, so it never runs dry.

    Args:
        N_data: First model input; must expose ``.shape[0]`` and slicing.
        Q_data: Second model input, sliced with the same indices as ``N_data``.
        y_data: Targets, sliced with the same indices as ``N_data``.
        batch_size: Number of samples per batch; must be positive.

    Yields:
        A pair ``([N_batch, Q_batch], y_batch)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.

    Note:
        Each pass covers ``len(N_data) // batch_size`` full batches; trailing
        samples that do not fill a batch are skipped. When the data is shorter
        than one batch, a single partial batch holding all samples is yielded.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    samples_per_epoch = N_data.shape[0]
    # Always at least one batch per pass, otherwise short inputs would only
    # ever produce empty slices.
    number_of_batches = max(1, samples_per_epoch // batch_size)
    counter = 0
    while True:
        start = batch_size * counter
        stop = start + batch_size
        N_batch = np.array(N_data[start:stop]).astype('float32')
        Q_batch = np.array(Q_data[start:stop]).astype('float32')
        y_batch = np.array(y_data[start:stop]).astype('float32')
        counter += 1
        # Wrap around once every full batch has been served; the reset has to
        # live inside the loop to ever be reached.
        if counter >= number_of_batches:
            counter = 0
        yield [N_batch, Q_batch], y_batch

# 1. Load Train, Validation, and Test Sets

In [107]:
# Load Dataset
# Every file name embeds the dataset variant tag (N_TRAININGPOINTS); the label
# files additionally embed OUTPUT_TYPE.
print("Loading dataset")


def _load_array(path_template, *extra_fields):
    """Fill the variant tag (and any extra fields) into the template and np.load it."""
    return np.load(path_template.format(N_TRAININGPOINTS, *extra_fields))


def _load_json(path_template):
    """Fill the variant tag into the template and parse the file as JSON."""
    with open(path_template.format(N_TRAININGPOINTS), "r") as handle:
        return json.load(handle)


# Model inputs for each split.
N_train = _load_array("Dataset/GRU/{0}/N_train{0}.npy")
Q_train = _load_array("Dataset/GRU/{0}/Q_train{0}.npy")
N_val = _load_array("Dataset/GRU/{0}/N_val{0}.npy")
Q_val = _load_array("Dataset/GRU/{0}/Q_val{0}.npy")
N_test = _load_array("Dataset/GRU/{0}/N_test{0}.npy")
Q_test = _load_array("Dataset/GRU/{0}/Q_test{0}.npy")

# Pre-built embedding matrix and the option input.
embedding_matrix = _load_array("Dataset/GRU/{0}/embedding_input_matrix{0}.npy")
# NOTE(review): 337 presumably equals MAX_SEQUENCE_LENGTH_NEWS +
# MAX_SEQUENCE_LENGTH_QUES (300 + 37) — confirm before changing either constant.
option_input = _load_array("Dataset/GRU/{0}/option_input{0}.npy").reshape(-1, 337, 1)

# Vocabularies; the number of entities defines the label size.
word_index = _load_json("Dataset/GRU/{0}/word_index{0}.json")
entity_index = _load_json("Dataset/GRU/{0}/entity_index{0}.json")
LABEL_SIZE = len(entity_index)

# Targets for each split.
y_train = _load_array("Dataset/GRU/{0}/y_train_{1}{0}.npy", OUTPUT_TYPE)
y_val = _load_array("Dataset/GRU/{0}/y_val_{1}{0}.npy", OUTPUT_TYPE)
y_test = _load_array("Dataset/GRU/{0}/y_test_{1}{0}.npy", OUTPUT_TYPE)

# Report split sizes and the embedding matrix shape.
split_reports = (
    ("Found {} questions in trainset", y_train),
    ("Found {} questions in valset", y_val),
    ("Found {} questions in testset", y_test),
)
for report_template, split_targets in split_reports:
    print(report_template.format(len(split_targets)))
print("Embedding dim: {}".format(embedding_matrix.shape))

Loading dataset
Found 8716 questions in trainset
Found 912 questions in valset
Found 931 questions in testset
Embedding dim: (20182, 125)


In [45]:
# Sanity check: print the first row of N_train, then the first row of Q_train.
for input_array in (N_train, Q_train):
    print(input_array[0])

[   51    34    50    10    10   356  1018     2    13 14586     7 17696
     6    15     8  8089    23  4039  2339    15     3   121  5221   102
     1  4139    10   295  8354  7011     2  1186     1  6675     6    41
   165    10   295   881     3   376  1077   106    18    15    23   934
    82    10   660     3   436     1   662  5823     6   110  2950  2243
     2     1   798  1440    10  4842     6    41   165    10   295   881
    30   466     4    15     7  4733  2107   149     3    15    30   573
   863     6  1588   475     3    23  2728  2971     9    13     2    55
    46  1713  4843     3   131  1309  3015  1769    54  2322     2     8
  1602  3015    28  2662     6 15971     3     4  1507    99   348    17
  3015     9     1   165    10   295     2  2894   417   291     7 11142
     2  2918   132   131  2782  2547    43    28    51   363    12  6676
    17     1 15256    50     2     8   420     1 17697    41  4388     4
   269   177   437    23     7  1284 11143     3  2

# 2. Model

## 2.1 RNN

In [122]:
class RNNModel(Model):
    """Two-input recurrent classifier built on frozen pre-trained embeddings.

    Graph built by the constructor:
      1. Two integer index sequences (lengths ``N`` and ``M``) are looked up in
         the same frozen embedding matrix.
      2. Each embedded sequence is masked, encoded by a bidirectional GRU that
         returns the full sequence, and passed through ``VariationalDropout``.
      3. The two encodings are concatenated along the time axis and summarised
         by a second bidirectional GRU that returns only its final state.
      4. A softmax ``Dense`` layer maps the summary to ``label_size`` classes.

    The model takes exactly two inputs, in the order ``[P_vecs, Q_vecs]``.
    Character-level inputs are not part of this graph.
    TODO(review): wire in a character-level branch once its hyper-parameters
    (filter count, window length, maximum characters per word) are defined.
    """

    def __init__(self, inputs=None, outputs=None,
                       N=MAX_SEQUENCE_LENGTH_NEWS, M=MAX_SEQUENCE_LENGTH_QUES, C=EMBEDDING_DIM_CHAR,
                       word2vec_dim=EMBEDDING_DIM, label_size=LABEL_SIZE, embedding_matrix=embedding_matrix,
                       hdim=H_DIM, dropout_rate=DROPOUT_RATE, output_type=OUTPUT_TYPE,
                       unroll=False, **kwargs):
        """Build the graph, or wrap an existing one when both tensors are given.

        Args:
            inputs: Existing input tensors; when given together with
                ``outputs`` the graph is reused as-is (e.g. loading from config).
            outputs: Existing output tensors; see ``inputs``.
            N: Length of the first input sequence.
            M: Length of the second input sequence.
            C: Accepted for interface compatibility; not used by this graph.
            word2vec_dim: Width of one embedding vector; it must match the
                second dimension of ``embedding_matrix``.
            label_size: Number of output classes.
            embedding_matrix: 2-D array of pre-trained embeddings, one row per
                vocabulary index; loaded as frozen weights.
            hdim: Number of units per GRU direction.
            dropout_rate: Rate used for GRU input dropout and for
                ``VariationalDropout``.
            output_type: Accepted for interface compatibility; not used by
                this graph.
            unroll: Forwarded to every GRU layer.
            **kwargs: Forwarded to ``keras.models.Model``.
        """
        # Load model from config
        if inputs is not None and outputs is not None:
            super(RNNModel, self).__init__(inputs=inputs,
                                           outputs=outputs,
                                           **kwargs)
            return

        '''Dimensions'''
        H = hdim
        W = word2vec_dim
        vocab_size = len(embedding_matrix)

        # Inputs are sequences of vocabulary indices, one integer per position;
        # Embedding layers expect 2-D integer input of shape (batch, length).
        P_vecs = Input(shape=(N,), dtype='int32', name='P_vecs')
        Q_vecs = Input(shape=(M,), dtype='int32', name='Q_vecs')
        input_placeholders = [P_vecs, Q_vecs]

        # Frozen lookups into the pre-trained matrix.
        P = Embedding(vocab_size, W,
                      weights=[embedding_matrix], trainable=False,
                      input_length=N, name='P') (P_vecs)
        Q = Embedding(vocab_size, W,
                      weights=[embedding_matrix], trainable=False,
                      input_length=M, name='Q') (Q_vecs)

        # Masking skips time steps whose embedded vector is all zeros
        # (presumably the padding index maps to a zero row — confirm).
        uP = Masking() (P)
        uP = Bidirectional(GRU(units=H,
                               return_sequences=True,
                               dropout=dropout_rate, unroll=unroll)) (uP)
        uP = VariationalDropout(rate=dropout_rate, noise_shape=(None, 1, 2 * H), name='uP') (uP)

        uQ = Masking() (Q)
        uQ = Bidirectional(GRU(units=H,
                               return_sequences=True,
                               dropout=dropout_rate, unroll=unroll)) (uQ)
        uQ = VariationalDropout(rate=dropout_rate, noise_shape=(None, 1, 2 * H), name='uQ') (uQ)

        # Join both encodings along the time axis, then reduce the joint
        # sequence to a single vector.
        merged = concatenate([uP, uQ], axis=1)
        merged = Bidirectional(GRU(units=H,
                                   return_sequences=False,
                                   dropout=dropout_rate, unroll=unroll)) (merged)

        preds = Dense(label_size, activation='softmax')(merged)

        inputs = input_placeholders
        outputs = preds

        super(RNNModel, self).__init__(inputs=inputs, outputs=outputs, **kwargs)

In [None]:
# Build the network with hdim=32 and the configured dropout rate.
model = RNNModel(dropout_rate=DROPOUT_RATE, hdim=32)

# Optimizer described as a config dict: SGD, learning rate 0.1, Nesterov flag on.
optimizer_config = dict(
    class_name='sgd',
    config=dict(lr=0.1, nesterov=True),
)

# Categorical cross-entropy loss, tracking accuracy.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer_config,
)
model.summary()

In [89]:
# Write the full model (not just weights) to disk whenever validation accuracy
# improves, checking after every epoch.
checkpoint = ModelCheckpoint(
    "model_QA_onehot.h5",
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)

# Stop once validation accuracy has not improved by at least 1e-5 for 10 epochs.
early = EarlyStopping(
    monitor='val_acc',
    min_delta=1e-5,
    patience=10,
    verbose=1,
    mode='auto',
)

In [92]:
# Train on the in-memory arrays; the `checkpoint` callback writes the best
# model to disk and `early` stops the run when validation stops improving.
hist = model.fit(x=[N_train, Q_train], y=y_train, 
                 batch_size=BATCH_SIZE, epochs=NB_EPOCHS, 
                 validation_data=([N_val, Q_val], y_val),
                 callbacks=[checkpoint, early])

# Generator-based alternative, kept for reference:
# hist = model.fit_generator(generator=generate_data(N_train, Q_train, y_train, BATCH_SIZE),
#                     steps_per_epoch=len(N_train) // BATCH_SIZE,
#                     validation_data=generate_data(N_val, Q_val, y_val, BATCH_SIZE),
#                     validation_steps=len(N_val) // BATCH_SIZE,
#                     epochs=NB_EPOCHS,
#                     callbacks=[
#                         EarlyStopping(monitor='val_loss', patience=10),
#                         ModelCheckpoint(MODEL_PATH, verbose=1, save_best_only=True)
#                     ])

# Save the final-epoch model under its own file name. The `checkpoint` callback
# already keeps the best model in "model_QA_onehot.h5"; saving the final model
# to that same path would overwrite the best one.
FINAL_MODEL_PATH = "model_QA_onehot_final.h5"
model.save(FINAL_MODEL_PATH)
print("Model saved to {}".format(FINAL_MODEL_PATH))
print(min(hist.history['val_loss']))

Train on 8716 samples, validate on 912 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50

KeyboardInterrupt: 

# 3. Prediction

In [110]:
# Rebuild the architecture with hdim=16, compile it, and restore saved weights.
model = RNNModel(dropout_rate=DROPOUT_RATE, hdim=16)
optimizer_config = dict(class_name='sgd', config=dict(lr=0.1))
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer_config,
)
model.load_weights("Models/model_QA.h5")

# Class probabilities for the test split.
print('Predicting QA model...')
test_inputs = [N_test, Q_test]
y_pred = model.predict(x=test_inputs, verbose=1)

# `score` holds the loss first, then the accuracy metric.
score = model.evaluate(test_inputs, y_test)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# One row per test sample: column index of the maximum in y_test and in
# y_pred. The accuracy is embedded in the CSV file name.
print("Writing Results")
true_label_idx = pd.DataFrame(y_test).idxmax(axis=1)
pred_label_idx = pd.DataFrame(y_pred).idxmax(axis=1)
df = pd.DataFrame({"y_test": true_label_idx, "y_pred": pred_label_idx})
df.to_csv("Models/model_QA_acc_{0:.4f}.csv".format(test_accuracy), header=None)

Predicting QA model...
Test loss: 2.2620138351182293
Test accuracy: 0.24597207303974222
Writing Results


In [104]:
# Same evaluation as for the hdim=16 model, here with hdim=32 and the
# "model_QA_onehot2" weights.
model = RNNModel(dropout_rate=DROPOUT_RATE, hdim=32)
optimizer_config = dict(class_name='sgd', config=dict(lr=0.1))
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer_config,
)
model.load_weights("Models/model_QA_onehot2.h5")

# Class probabilities for the test split.
print('Predicting QA model...')
test_inputs = [N_test, Q_test]
y_pred = model.predict(x=test_inputs, verbose=1)

# `score` holds the loss first, then the accuracy metric.
score = model.evaluate(test_inputs, y_test)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# One row per test sample: column index of the maximum in y_test and in
# y_pred. The accuracy is embedded in the CSV file name.
print("Writing Results")
true_label_idx = pd.DataFrame(y_test).idxmax(axis=1)
pred_label_idx = pd.DataFrame(y_pred).idxmax(axis=1)
df = pd.DataFrame({"y_test": true_label_idx, "y_pred": pred_label_idx})
df.to_csv("Models/model_QA2_acc_{0:.4f}.csv".format(test_accuracy), header=None)

Test loss: 2.2818907052438573
Test accuracy: 0.24704618689581095
Writing Results


# Appendix

In [None]:
# CNN
# Three Conv1D + MaxPooling1D stages followed by a dense softmax classifier.
# NOTE(review): MAX_SEQUENCE_LENGTH, embedding_layer, labels_index, x_train,
# x_val are not defined in the visible part of this notebook — define them
# before running this cell.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
# The output tensor must carry the same name that Model(...) receives below.
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=2, batch_size=128)