In [1]:
import keras.preprocessing.text as kpt
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, Flatten
from keras.layers.core import Dense
from keras.models import load_model, Sequential
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

Using TensorFlow backend.


In [2]:
# ----------------------------------------------
# Constants
# ----------------------------------------------
num_units = 300
batch_size = 50
embedding_dim = 300
story_length = 4
scale = 1.0
# num_vocab = 20 # 20000

In [3]:
# ----------------------------------------------
# Preprocess the sentences (Modified from Hong's code)
# ----------------------------------------------
##### Load data #####
# TRAIN_URL = 'train_stories.csv'
TRAIN_URL = 'eval_stories.csv'

# InputStoryid,InputSentence1,InputSentence2,InputSentence3,InputSentence4,RandomFifthSentenceQuiz1,RandomFifthSentenceQuiz2,AnswerRightEnding
df_train_sep = pd.read_csv(TRAIN_URL).iloc[:, 1:]

df_train_sep['correct'] = df_train_sep[['InputSentence1','InputSentence2','InputSentence3',\
                                        'InputSentence4','RandomFifthSentenceQuiz1','RandomFifthSentenceQuiz2']].apply(lambda x: ' '.join(x), axis=1)

df_train_sep['right_endings'] = df_train_sep['RandomFifthSentenceQuiz1']
df_train_sep['right_endings'].update(df_train_sep.loc[df_train_sep['AnswerRightEnding'] == 2,'RandomFifthSentenceQuiz2'])

df_train_sep['wrong_endings'] = df_train_sep['RandomFifthSentenceQuiz2']
df_train_sep['wrong_endings'].update(df_train_sep.loc[df_train_sep['AnswerRightEnding'] == 2,'RandomFifthSentenceQuiz1'])

print(df_train_sep.shape)


##### Prepare dictionary #####
VOCAB_SIZE = 20000

tokenizer = kpt.Tokenizer(oov_token='<UNK>', num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df_train_sep['correct'])
tokenizer.word_index = {w:i for w,i in tokenizer.word_index.items() if i < VOCAB_SIZE}

vocab_dict = tokenizer.word_index
print(len(vocab_dict)+1)

##### Prepare for training set #####
# X_train = pd.concat([df_train_sep['sentence1'], df_train_sep['sentence2'],\
# 	df_train_sep['sentence3'], df_train_sep['sentence4'], df_train_sep['sentence5']])

X_train = pd.concat([df_train_sep['InputSentence1'], df_train_sep['InputSentence2'],\
	df_train_sep['InputSentence3'], df_train_sep['InputSentence4'], \
                     df_train_sep['RandomFifthSentenceQuiz1'], df_train_sep['RandomFifthSentenceQuiz2']])

print(len(X_train))
X_train = tokenizer.texts_to_sequences(X_train)

seq_len = max(len(s) for s in X_train)
n_correct = df_train_sep.shape[0]
# n_samples = n_correct * 2

X_train = np.zeros((n_correct,6,seq_len),dtype=np.int32)

for i in range(1,5):
	X_train_temp = tokenizer.texts_to_sequences(df_train_sep["InputSentence"+str(i)])

	X_train_temp = pad_sequences(X_train_temp, maxlen=seq_len, padding='post')
	X_train[:,i-1,:] = X_train_temp


X_train_temp = tokenizer.texts_to_sequences(df_train_sep["right_endings"])
X_train_temp = pad_sequences(X_train_temp, maxlen=seq_len, padding='post')
X_train[:,4,:] = X_train_temp

X_train_temp = tokenizer.texts_to_sequences(df_train_sep["wrong_endings"])
X_train_temp = pad_sequences(X_train_temp, maxlen=seq_len, padding='post')
X_train[:,5,:] = X_train_temp


print(X_train.shape)
print(X_train[n_correct:,4,:].shape)

X_ori = X_train

(1871, 10)
7432
11226
(1871, 6, 18)
(0, 18)


In [4]:
##### load embeddings #####
embedding_dim = 300
GLOVE_URL = 'glove.6B.300d.txt'

embeddings = {}
with open(GLOVE_URL, 'r', encoding='utf-8') as f:
	for line in f:
		values = line.split()
		w = values[0]
		coefs = np.asarray(values[1:], dtype='float32')
		embeddings[w] = coefs


print(len(embeddings))

embedding_matrix = np.random.uniform(-1, 1, size=(VOCAB_SIZE, embedding_dim))
num_loaded = 0
for w, i in vocab_dict.items():
	if w in embeddings and i < VOCAB_SIZE:
		embedding_matrix[i] = embeddings[w]
		num_loaded += 1

print(embedding_matrix.shape, num_loaded)



400000
(20000, 300) 7105


In [5]:
# ----------------------------------------------
# Set up the graph
# ----------------------------------------------
tf.reset_default_graph()

##### embed sentences #####
lstm_forward1 = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
							   initializer=tf.contrib.layers.xavier_initializer())
lstm_backward1 = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
							   initializer=tf.contrib.layers.xavier_initializer())

# should use GloVe to embed words first (not finished!)
# same story are close to each other

embedding = tf.placeholder('float32', [batch_size*(story_length+2), seq_len, embedding_dim])

x = tf.unstack(embedding, seq_len, 1)

state_forward1 = lstm_forward1.zero_state(batch_size*(story_length+2), 'float')
state_backward1 = lstm_backward1.zero_state(batch_size*(story_length+2), 'float')

states_forward1 = []
states_backward1 = []
for t in range(seq_len):

	p, state_forward1 = lstm_forward1(x[t], state_forward1)
	states_forward1.append(state_forward1.h)

	p, state_backward1 = lstm_backward1(x[seq_len-1-t], state_backward1)
	states_backward1.append(state_backward1.h)

states_backward1.reverse()

states_forward1 = tf.stack(states_forward1)
states_backward1 = tf.stack(states_backward1)

# state: 	seq_len x batch_size*(story_length+2) x num_units
# sentences: batch_size*(story_length+2) x num_units
# h_layer: seq_len x batch_size*(story_length+2) x num_units
print(states_forward1.shape,states_backward1.shape)
h_layer = tf.add(states_forward1,states_backward1)
print(h_layer.shape)
sentences = tf.reduce_mean(h_layer,axis=0)
print(sentences.shape)

(18, 300, 300) (18, 300, 300)
(18, 300, 300)
(300, 300)


In [7]:
##### Attention  Layer #####

# attention: seq_len x batch_size*story_length x num_units
h_layer = tf.reshape(h_layer,[seq_len,batch_size,story_length+2,num_units])

attention = tf.layers.dense(tf.reshape(h_layer[:,:,:story_length,:],\
	[seq_len*batch_size*story_length,num_units]), 
	num_units, kernel_initializer=tf.contrib.layers.xavier_initializer())

# alpha_correct: seq_len x batch_size*story_length
attention = tf.reshape(attention, [seq_len, batch_size*story_length,num_units])
alpha_correct = tf.reduce_sum(tf.multiply(attention,sentences[story_length]),axis=2)
alpha_wrong = tf.reduce_sum(tf.multiply(attention,sentences[story_length+1]),axis=2)
# alpha_wrong = tf.reduce_sum(tf.multiply(attention,sentences[story_length+1]),axis=1)
print(alpha_correct.shape)

alpha_correct = tf.transpose(tf.reshape(alpha_correct,[seq_len,batch_size,story_length]),perm=[1,2,0])
alpha_wrong = tf.transpose(tf.reshape(alpha_wrong,[seq_len,batch_size,story_length]),perm=[1,2,0])

# alpha_correct: batch_size x story_length x seq_len 
print(alpha_correct.shape)

# beta_correct: batch_size x story_length x seq_len
beta_correct = tf.nn.softmax(alpha_correct) 
beta_wrong = tf.nn.softmax(alpha_wrong) 

beta_correct = tf.transpose(tf.reshape(beta_correct,[batch_size*story_length,seq_len]))
beta_wrong = tf.transpose(tf.reshape(beta_wrong,[batch_size*story_length,seq_len]))
print(beta_correct.shape)

# beta_correct: seq_len x batch_size*story_length
# h_layer: seq_len x batch_size*(story_length+2) x num_units
sentences_atten_correct = tf.multiply(tf.reshape(h_layer[:,:,:story_length,:],[-1,num_units]),tf.reshape(beta_correct,[-1,1]))
sentences_atten_correct = tf.reduce_sum(tf.reshape(sentences_atten_correct,[seq_len,batch_size*story_length,-1]),axis=0)

sentences_atten_wrong = tf.multiply(tf.reshape(h_layer[:,:,:story_length,:],[-1,num_units]),tf.reshape(beta_wrong,[-1,1]))
sentences_atten_wrong = tf.reduce_sum(tf.reshape(sentences_atten_wrong,[seq_len,batch_size*story_length,-1]),axis=0)

print(sentences_atten_correct.shape)
# print(tf.multiply(tf.reshape(h_layer[:,:,:story_length,:],[-1,num_units]),tf.reshape(beta_correct,[-1,1])).shape)

(18, 200)
(50, 4, 18)
(18, 200)
(200, 300)


In [8]:
##### Embed Stories #####
# correct and wrong sentences are putted together respectively

sentences_atten_correct = tf.reshape(sentences_atten_correct,[batch_size,story_length,num_units])
sentences_atten_wrong = tf.reshape(sentences_atten_wrong,[batch_size,story_length,num_units])

endings = tf.reshape(sentences,[batch_size,story_length+2,num_units])[:,story_length:,:]
print(endings[:,0,:].shape)
endings = tf.concat([endings[:,0,:], endings[:,1,:]],axis=0)
print(endings.shape)
endings = tf.reshape(endings,[batch_size*2,-1])

lstm_forward2 = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
                               initializer=tf.contrib.layers.xavier_initializer())
lstm_backward2 = tf.nn.rnn_cell.LSTMCell(num_units=num_units,
                               initializer=tf.contrib.layers.xavier_initializer())

state_forward2 = lstm_forward2.zero_state(batch_size*2, 'float')
state_backward2 = lstm_backward2.zero_state(batch_size*2, 'float')

states_forward2 = []
states_backward2 = []

print(sentences_atten_correct.shape)

sentence_att = tf.concat([sentences_atten_correct,sentences_atten_wrong],axis=0)


print(sentence_att.shape)
sentence_att_unstack = tf.unstack(sentence_att,story_length,1)

print(len(sentence_att_unstack))

for t in range(story_length):
    
	p, state_forward2 = lstm_forward2(sentence_att_unstack[t], state_forward2)
	states_forward2.append(state_forward2.h)

	p, state_backward2 = lstm_backward2(sentence_att_unstack[story_length-1-t], state_backward2)
	states_backward2.append(state_backward2.h)

# states_backward2.reverse()

state_forward2 = lstm_forward2.zero_state(batch_size*2, 'float')
state_backward2 = lstm_backward2.zero_state(batch_size*2, 'float')

p, endings_forward = lstm_forward2(endings, state_forward2)
p, endings_backward = lstm_backward2(endings, state_backward2)

stories = tf.add(states_forward2[-1],states_backward2[-1])
endings_embed = tf.add(endings_forward.h,endings_backward.h)

print(stories.shape)


(50, 300)
(100, 300)
(50, 4, 300)
(100, 4, 300)
4
(100, 300)


In [9]:
##### Calculate Score, loss, and do updating #####
hier_stories = tf.concat([stories,endings_embed],axis=1)
print(hier_stories.shape)

regularizer = tf.contrib.layers.l2_regularizer(scale)
hidden_layer = tf.layers.dense(hier_stories, 512, kernel_initializer=tf.contrib.layers.xavier_initializer(),
                               activation=tf.nn.relu,kernel_regularizer=regularizer)

scores = tf.layers.dense(hidden_layer, 1, kernel_regularizer=regularizer,#activation=tf.nn.tanh,
                kernel_initializer=tf.contrib.layers.xavier_initializer())
# logits = tf.layers.dense(
#     states, num_vocab, kernel_initializer=tf.contrib.layers.xavier_initializer())

losses = tf.maximum(tf.add(tf.subtract(scores[batch_size:],scores[:batch_size]),1),0)
print(losses.shape)
loss = tf.reduce_sum(losses)

l2_loss = tf.losses.get_regularization_loss()
loss += l2_loss

optimizer = tf.train.AdamOptimizer()  # select optimizer and set learning rate
train_step = optimizer.minimize(loss)


(100, 600)
(50, 1)


In [11]:
# ----------------------------------------------
# Train the model
# ----------------------------------------------
epoch = 10
fold_num = 5

# X_Train.shape: (88161, 6, 19)
print(X_ori.shape)

single_fold = X_train.shape[0]//10

###### remove the weird 2nd part #####
# X_train = np.concatenate((X_train[:1*single_fold],X_train[2*single_fold:])) 
# single_fold = X_ori.shape[0]//10
# X_train = X_ori[:9*single_fold] 
# X_test = X_ori[9*single_fold:] 
# print(X_test.shape)

X_train = X_ori
############################################

np.random.shuffle(X_train)


with tf.Session() as sess:    
    single_fold = X_train.shape[0]//fold_num
    
    for k in range(fold_num):         
        init = tf.global_variables_initializer()
        sess.run(init)
        
        X_train_temp = np.concatenate((X_train[:k*single_fold],X_train[(k+1)*single_fold:]))   
        
        X_val = X_train[k*single_fold:(k+1)*single_fold]
        
        for i in range(epoch):
            accuracy = 0
            loss_mean = 0
            for j in range(X_train.shape[0]//batch_size):
                         
                X_batch = X_train_temp[np.random.choice(X_train_temp.shape[0],batch_size)]

                X_batch_embed = np.zeros((batch_size,(story_length+2), seq_len, embedding_dim))
                X_batch_embed = embedding_matrix[X_batch]
                X_batch_embed = np.reshape(X_batch_embed,(batch_size*(story_length+2), seq_len, embedding_dim))

                score, loss_sofar, _ = sess.run([scores,loss,train_step],feed_dict = {embedding: X_batch_embed})
                accuracy += np.mean((score[:batch_size]-score[batch_size:])>0)
                loss_mean += loss_sofar
    #             print(score.reshape([-1])[:batch_size])
    #             print(score.reshape([-1])[batch_size:])

                if(j % 10 == 0):
                    print("epoch:",i,j, X_train.shape[0]//batch_size,"loss:",loss_mean/(j+1),"accuracy:",accuracy/(j+1))
            
            saver = tf.train.Saver()
            saver.save(sess, './saved/model_biLSTM_att_val_test_'+str(k)+'.ckpt')
        
            # validation
            accuracy = 0
            loss_mean = 0
            for j in range(single_fold//batch_size):
                X_val_temp = X_val[j*batch_size:(j+1)*batch_size]
                X_val_embed = embedding_matrix[X_val_temp]
                
                X_val_embed = np.reshape(X_val_embed,(batch_size*(story_length+2), seq_len, embedding_dim))
                
            
                score, loss_sofar = sess.run([scores,loss],feed_dict = {embedding: X_val_embed})
                accuracy += np.mean((score[:batch_size]-score[batch_size:])>0)
                loss_mean += loss_sofar
            
            accuracy /= (single_fold//batch_size)
            loss_mean /= (single_fold//batch_size)
            print("epoch_val:",i, "loss:",loss_mean,"accuracy:",accuracy)
           
        
#             accuracy = 0
#             loss_mean = 0
#             for j in range(X_test.shape[0]//batch_size):
#                 X_test_temp = X_test[j*batch_size:(j+1)*batch_size]
#                 X_test_embed = embedding_matrix[X_test_temp]

#                 X_test_embed = np.reshape(X_test_embed,(batch_size*(story_length+2), seq_len, embedding_dim))


#                 score, loss_sofar = sess.run([scores,loss],feed_dict = {embedding: X_test_embed})
#                 accuracy += np.mean((score[:batch_size]-score[batch_size:])>0)
#                 loss_mean += loss_sofar

#             accuracy /= (X_test.shape[0]//batch_size)
#             loss_mean /= (X_test.shape[0]//batch_size)
#             print("epoch_test:", "loss:",loss_mean,"accuracy:",accuracy)
        
            

(1871, 6, 18)
epoch: 0 0 37 loss: 327.55389404296875 accuracy: 0.26
epoch: 0 10 37 loss: 279.87127962979406 accuracy: 0.4854545454545454
epoch: 0 20 37 loss: 244.5553232828776 accuracy: 0.5161904761904762
epoch: 0 30 37 loss: 215.25428722750755 accuracy: 0.5335483870967742
epoch_val: 0 loss: 108.54530334472656 accuracy: 0.6142857142857142
epoch: 1 0 37 loss: 114.09099578857422 accuracy: 0.56
epoch: 1 10 37 loss: 94.10007268732244 accuracy: 0.6545454545454545
epoch: 1 20 37 loss: 83.2124613807315 accuracy: 0.6676190476190477
epoch: 1 30 37 loss: 75.3611941183767 accuracy: 0.6793548387096774
epoch_val: 1 loss: 54.59133802141462 accuracy: 0.64
epoch: 2 0 37 loss: 39.004493713378906 accuracy: 0.8
epoch: 2 10 37 loss: 41.033627249977805 accuracy: 0.770909090909091
epoch: 2 20 37 loss: 39.6150392804827 accuracy: 0.7761904761904762
epoch: 2 30 37 loss: 37.89852068501134 accuracy: 0.7767741935483873
epoch_val: 2 loss: 43.92912728445871 accuracy: 0.6714285714285715
epoch: 3 0 37 loss: 33.617012

epoch: 5 30 37 loss: 28.78261375427246 accuracy: 0.7716129032258064
epoch_val: 5 loss: 39.333244051252095 accuracy: 0.6857142857142857
epoch: 6 0 37 loss: 21.1915340423584 accuracy: 0.86
epoch: 6 10 37 loss: 21.954714688387785 accuracy: 0.8254545454545454
epoch: 6 20 37 loss: 21.551562808808825 accuracy: 0.8323809523809526
epoch: 6 30 37 loss: 20.822664137809507 accuracy: 0.8412903225806451
epoch_val: 6 loss: 40.535323006766184 accuracy: 0.6828571428571429
epoch: 7 0 37 loss: 25.809383392333984 accuracy: 0.8
epoch: 7 10 37 loss: 21.13849232413552 accuracy: 0.8418181818181819
epoch: 7 20 37 loss: 21.555408704848517 accuracy: 0.8438095238095239
epoch: 7 30 37 loss: 20.05642057234241 accuracy: 0.8535483870967744
epoch_val: 7 loss: 38.299046652657644 accuracy: 0.72
epoch: 8 0 37 loss: 24.82274627685547 accuracy: 0.78
epoch: 8 10 37 loss: 28.718185251409356 accuracy: 0.8090909090909091
epoch: 8 20 37 loss: 33.382087071736656 accuracy: 0.7638095238095237
epoch: 8 30 37 loss: 31.8552861982776