In [3]:
import os, re, sys, time, json, codecs
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from inspect import getargspec

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from sklearn import metrics

from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

from keras import backend as K
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Lambda, merge, concatenate, multiply, Masking, Reshape, RepeatVector
from keras.layers import Input, InputLayer, LSTM, Conv1D, Flatten, Dense, Embedding, Dropout, Activation
from keras.layers.merge import Concatenate
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.recurrent import GRU
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D, MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Working directory for the notebook: the relative "Dataset/..." paths used in
# later cells are resolved against it. Set the QA_PROJECT_DIR environment
# variable to run on another machine; the default is the original location.
PROJECT_DIR = os.environ.get("QA_PROJECT_DIR", "/Users/meif/Desktop/SI 630 NLP/Project/")
os.chdir(PROJECT_DIR)

from Code.layers.SharedWeight import SharedWeight
from Code.layers.VariationalDropout import VariationalDropout
from Code.layers.QuestionAttnGRU import QuestionAttnGRU
from Code.layers.SelfAttnGRU import SelfAttnGRU
from Code.layers.QuestionPooling import QuestionPooling

% matplotlib inline 

In [6]:
# Initialization: dataset tag, tensor sizes, special tokens, training
# hyper-parameters and output file names used by the cells below.

# --- Dataset selection ---
N_TRAININGPOINTS = "100000_reidx"  # tag that appears in every dataset file path
OUTPUT_TYPE = "multi"              # picks which y_* label files get loaded

# --- Sequence and word lengths ---
MAX_SEQUENCE_LENGTH_NEWS = 300  # median ~ 650
MAX_SENTENCE_LENGTH_NEWS = 19
MAX_SEQUENCE_LENGTH_QUES = 46  # max ~ 37
MAX_WORD_LENGTH = 15

# --- Embedding sizes ---
EMBEDDING_DIM_WORD, EMBEDDING_DIM_CHAR = 100, 25
EMBEDDING_DIM = EMBEDDING_DIM_WORD + EMBEDDING_DIM_CHAR

# --- Placeholder tokens for out-of-vocabulary items ---
UNK_WORD, UNK_CHAR, UNK_ENTITY = "<UNK_WORD>", "^", "<UNK_ENTITY>"

# --- Training hyper-parameters ---
DROPOUT_RATE = 0.2
BATCH_SIZE = 35
H_DIM = 45
NB_EPOCHS = 50

# --- Output files ---
BST_MODEL_PATH = 'QA_model.model'
MODEL_PATH = 'QA_model.h5'

# 0. Helper Functions

# 1. Load Training, Validation & Test Sets

In [7]:
# Load Dataset
print("Loading dataset")


def _dataset_path(stem, ext):
    """Return the path of the dataset file `stem` with extension `ext`.

    Every file lives in "Dataset/GRU/<tag>/" and carries the tag
    (N_TRAININGPOINTS) as a suffix of its base name, e.g.
    "Dataset/GRU/<tag>/N_train<tag>.npy".
    """
    return "Dataset/GRU/{0}/{1}{0}.{2}".format(N_TRAININGPOINTS, stem, ext)


def _load_npy(stem):
    """Load the NumPy array saved under `stem` for the current dataset tag."""
    return np.load(_dataset_path(stem, "npy"))


def _load_json(stem):
    """Load the JSON document saved under `stem` for the current dataset tag."""
    with open(_dataset_path(stem, "json"), "r") as f:
        return json.load(f)


# Each split provides five arrays: N_*, Q_*, O_* and the character-level
# N_*_char / Q_*_char arrays.
N_train = _load_npy("N_train")
Q_train = _load_npy("Q_train")
O_train = _load_npy("O_train")
N_train_char = _load_npy("N_train_char")
Q_train_char = _load_npy("Q_train_char")

N_val = _load_npy("N_val")
Q_val = _load_npy("Q_val")
O_val = _load_npy("O_val")
N_val_char = _load_npy("N_val_char")
Q_val_char = _load_npy("Q_val_char")

N_test = _load_npy("N_test")
Q_test = _load_npy("Q_test")
O_test = _load_npy("O_test")
N_test_char = _load_npy("N_test_char")
Q_test_char = _load_npy("Q_test_char")

embedding_matrix = _load_npy("embedding_word_matrix")

word_index = _load_json("word_index")
entity_index = _load_json("entity_index")
# One output class per entry of the entity index.
LABEL_SIZE = len(entity_index)

# Label arrays; the file name also encodes OUTPUT_TYPE.
y_train = _load_npy("y_train_{}".format(OUTPUT_TYPE))
y_val = _load_npy("y_val_{}".format(OUTPUT_TYPE))
y_test = _load_npy("y_test_{}".format(OUTPUT_TYPE))
# (An earlier, disabled step reshaped y_* to (n_samples, 300, 1).)

print("Found {} questions in trainset".format(len(y_train)))
print("Found {} questions in valset".format(len(y_val)))
print("Found {} questions in testset".format(len(y_test)))
print("Embedding dim: {}".format(embedding_matrix.shape))

Loading dataset
Found 87355 questions in trainset
Found 3579 questions in valset
Found 2992 questions in testset
Embedding dim: (20109, 100)


In [8]:
# Sanity check: print the first row of N_train, then the first row of Q_train.
for first_row in (N_train[0], Q_train[0]):
    print(first_row)

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [9]:
# Display the embedding row stored for the "@entity0" token.
entity0_row = word_index["@entity0"]
embedding_matrix[entity0_row]

array([1. , 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])

# 2. Model

## 2.1 RNet

In [12]:
class RNet(Model):
    """Passage/question classifier built from word + character features and BiGRUs.

    Calling the constructor without ``inputs``/``outputs`` builds this graph:

    1. Word ids go through a frozen ``Embedding`` initialised with
       ``embedding_matrix``. Character ids go through a character embedding
       followed by width-3/4/5 convolutions, each globally max-pooled.
    2. Word and character features are concatenated per token and encoded by
       ``Masking`` -> bidirectional GRU -> ``VariationalDropout`` (the passage
       and the question each get their own encoder).
    3. The two encoded sequences are concatenated along the time axis,
       summarised by one more bidirectional GRU and classified by a softmax
       ``Dense`` layer with ``label_size`` units.

    The model takes its inputs in the order
    ``[passage words, passage chars, question words, question chars]``.
    """

    def __init__(self, inputs=None, outputs=None,
                       N=MAX_SEQUENCE_LENGTH_NEWS, M=MAX_SEQUENCE_LENGTH_QUES,
                       C=MAX_WORD_LENGTH, W=EMBEDDING_DIM_WORD, label_size=LABEL_SIZE,
                       embedding_matrix=embedding_matrix,
                       hdim=H_DIM, dropout_rate=DROPOUT_RATE, output_type=OUTPUT_TYPE,
                       unroll=False, **kwargs):
        """Build the network, or wrap an existing graph.

        Args:
            inputs, outputs: when both are given, they are passed straight to
                ``Model.__init__`` and no layers are created.
            N: number of tokens per passage.
            M: number of tokens per question.
            C: number of characters per token.
            W: word-embedding width; must match ``embedding_matrix``'s columns.
            label_size: number of output classes.
            embedding_matrix: initial (frozen) word-embedding weights.
            hdim: units per GRU direction.
            dropout_rate: rate used by every dropout in the network.
            output_type: accepted for compatibility; not used when building.
            unroll: forwarded to every GRU layer.
            **kwargs: forwarded to ``Model.__init__``.
        """
        # Load model from config
        if inputs is not None and outputs is not None:
            super(RNet, self).__init__(inputs=inputs,
                                       outputs=outputs,
                                       **kwargs)
            return

        H = hdim
        vocab_size = len(embedding_matrix)

        # INPUTS
        P_word_in = Input(shape=(N, ), dtype='int32')
        Q_word_in = Input(shape=(M, ), dtype='int32')
        P_char_in = Input(shape=(N, C), dtype='int32')
        Q_char_in = Input(shape=(M, C), dtype='int32')
        input_placeholders = [P_word_in, P_char_in, Q_word_in, Q_char_in]

        # Word embedding (pre-trained weights, not updated during training)
        P_word = Embedding(vocab_size, W, weights=[embedding_matrix], trainable=False,
                           input_length=N, name='P_word') (P_word_in)
        Q_word = Embedding(vocab_size, W, weights=[embedding_matrix], trainable=False,
                           input_length=M, name='Q_word') (Q_word_in)

        # Char embedding + CNN, one feature vector per token
        P_char = RNet._char_features(P_char_in, C, dropout_rate, 'Pc')
        Q_char = RNet._char_features(Q_char_in, C, dropout_rate, 'Qc')

        # concat word + char
        # With the defaults this gives W + 3 * 30 = 190 features per token:
        # P_vecs, Q_vecs shape: [(None, 300, 190), (None, 46, 190)]
        P_vecs = Concatenate() ([P_word, P_char])
        Q_vecs = Concatenate() ([Q_word, Q_char])

        # context embedding
        # uP, uQ shape with the defaults: [(None, 300, 90), (None, 46, 90)]
        uP = RNet._encode_context(P_vecs, H, dropout_rate, unroll, 'uP')
        uQ = RNet._encode_context(Q_vecs, H, dropout_rate, unroll, 'uQ')

        # Join passage and question along the time axis and summarise the
        # joint sequence into a single vector.
        P = Concatenate(axis=1) ([uP, uQ])
        gP = Bidirectional(GRU(units=H,
                               return_sequences=False,
                               dropout=dropout_rate, unroll=unroll)) (P)

        preds = Dense(label_size, activation='softmax')(gP)

        super(RNet, self).__init__(inputs=input_placeholders, outputs=preds, **kwargs)

    @staticmethod
    def _char_features(char_ids, word_length, dropout_rate, tag,
                       vocab_size=67, embedding_dim=50,
                       filters=30, filter_widths=(3, 4, 5)):
        """Map per-token character ids to one CNN feature vector per token.

        ``tag`` ('Pc' or 'Qc') is embedded in the conv/pooling layer names so
        the passage and question branches get distinct names.
        """
        embedded = TimeDistributed(Embedding(input_dim=vocab_size,
                                             output_dim=embedding_dim,
                                             input_length=word_length)) (char_ids)
        pooled = []
        for filter_width in filter_widths:
            conv = TimeDistributed(Conv1D(filters=filters,
                                          kernel_size=filter_width,
                                          activation='relu',
                                          padding='same',
                                          name='Conv1D_{}_{}'.format(tag, filter_width))) (embedded)
            conv = TimeDistributed(
                GlobalMaxPooling1D(name='GlobalMaxPooling_{}_{}'.format(tag, filter_width))) (conv)
            pooled.append(conv)
        # Concatenate on the last (feature) axis, as the legacy
        # merge(..., mode='concat') call did.
        features = Concatenate() (pooled)
        return TimeDistributed(Dropout(dropout_rate)) (features)

    @staticmethod
    def _encode_context(vectors, hdim, dropout_rate, unroll, name):
        """Encode a token sequence: Masking -> BiGRU -> VariationalDropout.

        ``name`` is given to the final VariationalDropout layer.
        """
        encoded = Masking() (vectors)
        encoded = Bidirectional(GRU(units=hdim,
                                    return_sequences=True,
                                    dropout=dropout_rate, unroll=unroll)) (encoded)
        return VariationalDropout(rate=dropout_rate,
                                  noise_shape=(None, 1, 2 * hdim),
                                  name=name) (encoded)

In [13]:
# Build the network using the hidden size and dropout rate from the
# configuration constants; every other argument keeps its default.
model = RNet(dropout_rate=DROPOUT_RATE, hdim=H_DIM)



Instructions for updating:
keep_dims is deprecated, use keepdims instead


  name=name)


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [14]:
# Adadelta, described as a {class_name, config} dict and handed to compile().
optimizer_config = dict(class_name='Adadelta', config=dict(epsilon=1e-6))

model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer_config,
)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [15]:
# Print the per-layer table (type, output shape, parameter count, inbound layers).
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_7 (InputLayer)             (None, 300, 15)       0                                            
____________________________________________________________________________________________________
input_8 (InputLayer)             (None, 46, 15)        0                                            
____________________________________________________________________________________________________
time_distributed_1 (TimeDistribu (None, 300, 15, 50)   3350        input_7[0][0]                    
____________________________________________________________________________________________________
time_distributed_9 (TimeDistribu (None, 46, 15, 50)    3350        input_8[0][0]                    
___________________________________________________________________________________________

In [16]:
# Training callbacks, both monitoring 'val_acc'.

# Writes the full model (not only the weights) to "model_QA_CNN.h5", keeping
# only the best version seen so far.
checkpoint = ModelCheckpoint(
    "model_QA_CNN.h5",
    monitor='val_acc',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1,
)

# Stops training once 'val_acc' has failed to improve by at least 1e-5 for
# 10 epochs.
early = EarlyStopping(
    monitor='val_acc',
    mode='auto',
    min_delta=0.00001,
    patience=10,
    verbose=1,
)

In [20]:
# Number of leading training examples to fit on. 10 reproduces the original
# quick smoke test; set it to None to train on the whole training set.
TRAIN_SUBSET = 10
train_rows = slice(None, TRAIN_SUBSET)

# Input order must match the model: passage words, passage chars,
# question words, question chars.
hist = model.fit(x=[N_train[train_rows], N_train_char[train_rows],
                    Q_train[train_rows], Q_train_char[train_rows]],
                 y=y_train[train_rows],
                 batch_size=BATCH_SIZE, epochs=NB_EPOCHS,
                 validation_data=([N_val, N_val_char, Q_val, Q_val_char], y_val),
                 callbacks=[checkpoint, early])

# Save the final-epoch model under MODEL_PATH. Saving it to "model_QA_CNN.h5"
# would overwrite the best-so-far model that the checkpoint callback keeps in
# that file with whatever the last epoch produced.
model.save(MODEL_PATH)
print("Model saved to {}".format(MODEL_PATH))
print(min(hist.history['val_loss']))

Train on 10 samples, validate on 3579 samples
Epoch 1/50


KeyboardInterrupt: 

# 3. Prediction

In [None]:
# NOTE(review): `predict_green`, `sentences_array`, `is_green`, `n_words` and
# `idx2word` are not defined in any cell visible in this notebook, so this
# cell cannot run as-is. It looks like a pasted example; confirm whether it
# should be replaced by prediction code for `model` or removed.

# fit the model to predict what color each person is
# (`epochs` is the keyword used by the other fit() calls in this notebook.)
predict_green.fit([sentences_array], [is_green], epochs=5000, verbose=1)
# Read the layer's weights through get_weights() instead of the
# Theano-specific `.W.get_value()`. Assumes the embedding matrix is the
# layer's first weight array - TODO confirm for the actual layer type.
embeddings = predict_green.layers[1].get_weights()[0]

# print out the embedding vector associated with each word
for i in range(n_words):
    print('{}: {}'.format(idx2word[i], embeddings[i]))

# Appendix

In [None]:
# CNN
# NOTE(review): MAX_SEQUENCE_LENGTH, embedding_layer, labels_index, x_train and
# x_val are not defined in the cells visible here, and this cell rebinds
# `model` (the name used for the RNet instance above) - confirm before running.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D(128, 5) + max-pooling stages; the last pool size (35) was
# labelled "global max pooling" by the original author.
x = embedded_sequences
for pool_size in (5, 5, 35):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_size)(x)

# Classifier head
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

# Short training run with validation after each epoch.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=2,
    validation_data=(x_val, y_val),
)

In [None]:
        # NOTE(review): draft fragment - every line is indented but the cell has
        # no enclosing `def`, and N, M, W and dropout_rate are not defined at
        # notebook level (they are parameters of RNet.__init__). Presumably an
        # earlier version of that method's body; it cannot run as a cell.

        # Inputs: word-id sequences plus 15 character ids per token.
        P_vecs = Input(shape=(N, ), name='P_vecs')
        Q_vecs = Input(shape=(M, ), name='Q_vecs')
        P_str = Input(shape=(N,15, ), dtype='int32', name='P_str')
        Q_str = Input(shape=(M,15, ), dtype='int32', name='Q_str')
        input_placeholders = [P_vecs, P_str, Q_vecs, Q_str]
       
        # Embeddings. Each Embedding reuses the `name` already given to its
        # Input layer above - NOTE(review): likely a layer-name clash; confirm.
        P_vecs = Embedding(len(embedding_matrix), W, weights=[embedding_matrix], trainable=False, input_length=N, name='P_vecs') (P_vecs)
        Q_vecs = Embedding(len(embedding_matrix), W, weights=[embedding_matrix], trainable=False, input_length=M, name='Q_vecs') (Q_vecs)
        P_str = Embedding(65, 25, dropout=dropout_rate, input_length=2100, name='P_str')(P_str)
        Q_str = Embedding(65, 25, dropout=dropout_rate, input_length=276, name='Q_str')(Q_str)

        # Passage characters: three parallel convolutions (kernel sizes 3, 5, 7),
        # each max-pooled, then concatenated on axis 1 and passed through dropout.
        # NOTE(review): `nb_filter` / `pool_length` are mixed with
        # `kernel_size` / `padding` here, while the rest of the notebook uses
        # `filters=` - confirm the intended Keras version.
        uPc1 = Conv1D(nb_filter=25, kernel_size=3, padding='same', activation='relu') (P_str) 
        uPc2 = Conv1D(nb_filter=25, kernel_size=5, padding='same', activation='relu') (P_str) 
        uPc3 = Conv1D(nb_filter=25, kernel_size=7, padding='same', activation='relu') (P_str) 
        uPc1 = MaxPooling1D(pool_length=21) (uPc1)
        uPc2 = MaxPooling1D(pool_length=21) (uPc2)
        uPc3 = MaxPooling1D(pool_length=21) (uPc3)
        uPc = Concatenate(axis=1)([uPc1, uPc2, uPc3])
        uPc = Dropout(dropout_rate)(uPc)
        
        # Question characters.
        # NOTE(review): uQc1, uQc2 and uQc3 are never assigned in this cell
        # (only `uQc` is), so the next lines would raise NameError.
        uQc = Conv1D(nb_filter=25, kernel_size=5, padding='same', activation='relu') (Q_str)
        uQc = MaxPooling1D(pool_length=6) (uQc1)
        uQc = Concatenate(axis=1)([uQc1, uQc2, uQc3])
        uQc = Dropout(dropout_rate)(uQc)
               