In [1]:
import pandas as pd
from IPython.display import display
import skipthoughts
import numpy as np
import time
import math
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score as accuracy
tf.set_random_seed(5)
np.random.seed(5)



[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
# Read the provided CSV files: training stories plus the validation and test cloze sets
train = pd.read_csv('data/train_stories.csv')
val = pd.read_csv('data/cloze_test_val__spring2016 - cloze_test_ALL_val.csv')
test = pd.read_csv('data/cloze_test_test__spring2016 - cloze_test_ALL_test.csv')

# Drop the title and id columns from the training stories
train = train.drop(["storytitle", "storyid"], axis=1)

# Validation set: drop the story id, then separate the answer column from the sentence columns
val = val.drop("InputStoryid", axis=1)
val_answer = val["AnswerRightEnding"]
val_sentences = val.drop("AnswerRightEnding", axis=1)

# Test set: same treatment as the validation set
test = test.drop("InputStoryid", axis=1)
test_answer = test["AnswerRightEnding"]
test_sentences = test.drop("AnswerRightEnding", axis=1)

# Preview the first rows of every frame / answer column
display(train.head())

display(val_sentences.head(), val_answer.head())

display(test_sentences.head(), test_answer.head())

## Skipthoughts model from: https://github.com/ryankiros/skip-thoughts

In [3]:
# Load the skip-thoughts model, wrap it in an Encoder, and report how long loading took.
# NOTE(review): the notebook-level name `model` is rebound to the Keras classifier in the
# training cell further down; `encoder` is what the encoding cells use.
a = time.time()
print("Loading model ...")
model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)
print("Done in {} s".format(time.time() - a))

Loading model ...
Loading model parameters...
Compiling encoders...
Loading tables...
Packing up...
Done in 39.923095703125 s


In [4]:
# a = time.time()
# print("Reading stories to memory ...")
# stories = []
# stories_flat = []
# for index, row in train.iterrows():
#     story = []
#     for col in train.columns:
#         story.append(row[col])
#         stories_flat.append(row[col])
#     stories.append(story)
# print("Done in {} s".format(time.time() - a))

In [5]:
# print("Nb of stories:", len(stories))
# print("Example story:", stories[0])
# print("Nb of sentences per story:", len(stories[0]))

In [6]:
# # testing skipthoughts encoder
# a = time.time()
# print("Encoding story ...")
# vectors = encoder.encode(stories[0], verbose=False)
# print("Done in {} s".format(time.time() - a))
# print("Encoded story:\n", vectors)
# print(type(vectors))
# print(vectors.shape)

In [7]:
# # encoding only some sentences of the story to see if the same encoding outputted
# a = time.time()
# print("Encoding part of story ...")
# vectors = encoder.encode(stories[0][0:3], verbose=False)
# print("Done in {} s".format(time.time() - a))
# print("Encoded story:\n", vectors)
# print(type(vectors))
# print(vectors.shape)

In [8]:
# encoded_sentences = np.zeros([len(stories_flat), 4800]) # each story having 5 sentences, each sentence encoded to a 
#                                                     # 4800-dim vector

# print(encoded_sentences)
# encoding_batch_size = 2000
# nb_batches = int(math.ceil(len(stories_flat) / encoding_batch_size))
# print("nb sentences per batch: {}, nb batches: {}".format(encoding_batch_size, nb_batches))
# for i in range(nb_batches):
#     a = time.time()
#     print("Encoding batch {} of sentences ...".format(i))
#     encoded_sentences[i*encoding_batch_size : (i+1)*encoding_batch_size] = encoder.encode(stories_flat[i*encoding_batch_size : (i+1)*encoding_batch_size], verbose=False)
#     print("Done in {} s".format(time.time() - a))
# print(encoded_sentences)

In [9]:
# encoded_stories = encoded_sentences.reshape([-1, 5, 4800])
# print(encoded_stories.shape)

In [10]:
def get_stories_as_lists(dataframe):
    """Convert a DataFrame of stories into nested and flat Python lists.

    Every row is treated as one story and every column as one sentence slot.

    Returns:
        A tuple `(stories, stories_flat)`: `stories` holds one list per row with
        that row's values in column order, and `stories_flat` holds the same
        values as a single list, row after row.
    """
    start_time = time.time()
    print("Reading stories to memory ...")
    column_names = list(dataframe.columns)
    stories = [
        [row[name] for name in column_names]
        for _, row in dataframe.iterrows()
    ]
    # Flatten in row order so consecutive entries belong to the same story
    stories_flat = [sentence for story in stories for sentence in story]
    print("Done in {} s".format(time.time() - start_time))
    return stories, stories_flat

def encode_stories(encoder, stories_flat, encoding_batch_size=2000, sentences_per_story=6, encoding_dim=4800):
    """Encode a flat list of sentences in batches and group the vectors per story.

    Args:
        encoder: object exposing `encode(sentences, verbose=False)` that returns one
            `encoding_dim`-sized vector per input sentence.
        stories_flat: flat list of sentences, story after story.
        encoding_batch_size: number of sentences passed to `encoder.encode` per call.
        sentences_per_story: number of consecutive sentences that make up one story.
        encoding_dim: size of each sentence vector.

    Returns:
        Array of shape (nb_stories, sentences_per_story, encoding_dim). An empty
        input yields an array with zero stories.

    Raises:
        ValueError: if a size argument is not positive, or if the number of
            sentences is not a multiple of `sentences_per_story`.
    """
    if encoding_batch_size <= 0 or sentences_per_story <= 0 or encoding_dim <= 0:
        raise ValueError("encoding_batch_size, sentences_per_story and encoding_dim must be positive")

    nb_sentences = len(stories_flat)
    # Fail before the (slow) encoding instead of at the final reshape
    if nb_sentences % sentences_per_story != 0:
        raise ValueError(
            "Number of sentences ({}) is not a multiple of sentences_per_story ({})".format(
                nb_sentences, sentences_per_story))

    encoded_sentences = np.zeros([nb_sentences, encoding_dim]) # one row per sentence

    nb_batches = int(math.ceil(nb_sentences / float(encoding_batch_size)))
    print("nb sentences per batch: {}, nb batches: {}".format(encoding_batch_size, nb_batches))
    for i in range(nb_batches):
        a = time.time()
        print("Encoding batch {} of sentences ...".format(i))
        start = i * encoding_batch_size
        stop = start + encoding_batch_size # slicing clips the last, possibly shorter, batch
        encoded_sentences[start:stop] = encoder.encode(stories_flat[start:stop], verbose=False)
        print("Done in {} s".format(time.time() - a))

    # Explicit story count (instead of -1) so that an empty input reshapes cleanly
    nb_stories = nb_sentences // sentences_per_story
    return encoded_sentences.reshape([nb_stories, sentences_per_story, encoding_dim])

def get_train_valid_split(x_data, y_data, valid_percent=0.1, shuffle=True):
    """Split paired arrays into a training part and a validation part.

    Args:
        x_data: array of samples, indexed along its first axis.
        y_data: array of targets aligned with `x_data`.
        valid_percent: fraction of the samples placed in the validation part.
        shuffle: when True, apply one random permutation (drawn from the global
            NumPy random state) to both arrays before splitting.

    Returns:
        `(x_train, x_valid, y_train, y_valid)`.
    """
    n_samples = x_data.shape[0]
    if shuffle:
        # The same index order is applied to both arrays so pairs stay aligned
        order = np.random.permutation(np.arange(n_samples))
        x_data, y_data = x_data[order], y_data[order]
    cut = int((1 - valid_percent) * n_samples)
    x_train, x_valid = x_data[:cut], x_data[cut:]
    y_train, y_valid = y_data[:cut], y_data[cut:]
    return x_train, x_valid, y_train, y_valid

def split_pos_neg_endings(encoded_stories):
    """Turn each two-ending story into two single-ending stories.

    Args:
        encoded_stories: array of shape (nb_stories, 6, dim) where sentences 0-3
            are the context and sentences 4 and 5 are the two candidate endings.

    Returns:
        Array of shape (2 * nb_stories, 5, dim). Row 2*i is story i's context
        followed by its first candidate ending, row 2*i+1 is the same context
        followed by its second candidate ending. The intermediate shapes are
        printed along the way.

    Raises:
        ValueError: if the input is not 3-dimensional or has fewer than 6
            sentences per story.
    """
    if encoded_stories.ndim != 3 or encoded_stories.shape[1] < 6:
        raise ValueError(
            "Expected an array of shape (nb_stories, 6, dim), got {}".format(encoded_stories.shape))

    # Take the vector size from the data instead of assuming 4800
    embedding_dim = encoded_stories.shape[-1]

    encoded_stories_context = encoded_stories[:, 0:4] # first 4 sentences (i.e. the context)
    print(encoded_stories_context.shape)
    encoded_stories_context = np.repeat(encoded_stories_context, 2, axis=0) # repeat each context twice (once per possible ending)
    print(encoded_stories_context.shape)
    encoded_stories_endings = encoded_stories[:, 4:6] # the 2 possible endings
    print(encoded_stories_endings.shape)
    encoded_stories_endings = encoded_stories_endings.reshape([-1, 1, embedding_dim]) # one ending per row
    print(encoded_stories_endings.shape)
    encoded_stories_split = np.concatenate([encoded_stories_context, encoded_stories_endings], axis=1)
    print(encoded_stories_split.shape)
    return encoded_stories_split

def create_stories_labels(answers):
    """Expand one right-ending answer per story into one 0/1 label per ending.

    Args:
        answers: 1-D sequence with a `shape` attribute holding, per story, the
            number of the right ending. Any value other than 1 is treated as 2.

    Returns:
        Float array of shape (2 * nb_stories, 1). Rows 2*i and 2*i+1 are the
        labels of story i's first and second ending: 1. for the right ending,
        0. for the wrong one.
    """
    labels = np.zeros([2 * answers.shape[0], 1])
    print("Answers shape:", labels.shape)
    first_is_right = np.array([bool(answer == 1) for answer in answers], dtype=bool)
    labels[0::2, 0] = first_is_right   # even rows: first ending
    labels[1::2, 0] = ~first_is_right  # odd rows: second ending
    return labels

In [None]:
# --------------------------------------------- Labeled Train Data -----------------------------------------------

In [11]:
# Turn the validation DataFrame into a list of stories (one list of sentences per row)
# and a flat list of all sentences in row order.
val_stories, val_stories_flat = get_stories_as_lists(val_sentences)

Reading stories to memory ...
Done in 0.20423507690429688 s


In [12]:
# Sanity check: story count, one example story, and the number of sentences per story
print("Nb of stories:", len(val_stories))
print("Example story with right and wrong ending:", val_stories[0])
print("Nb of sentences per story:", len(val_stories[0]))

Nb of stories: 1871
Example story with right and wrong ending: ['Rick grew up in a troubled household.', 'He never found good support in family, and turned to gangs.', "It wasn't long before Rick got shot in a robbery.", 'The incident caused him to turn a new leaf.', 'He is happy now.', 'He joined a gang.']
Nb of sentences per story: 6


In [13]:
# Encode every validation sentence with the skip-thoughts encoder and group the vectors per story
val_encoded_stories = encode_stories(encoder, val_stories_flat)
print(val_encoded_stories.shape)
print(val_encoded_stories[0]) # print story 0 (both endings in the 5th and 6th sentence)

nb sentences per batch: 2000, nb batches: 6
Encoding batch 0 of sentences ...
Done in 23.839473724365234 s
Encoding batch 1 of sentences ...
Done in 23.39590334892273 s
Encoding batch 2 of sentences ...
Done in 22.656227350234985 s
Encoding batch 3 of sentences ...
Done in 23.053677558898926 s
Encoding batch 4 of sentences ...
Done in 23.683773040771484 s
Encoding batch 5 of sentences ...
Done in 14.791567325592041 s
(1871, 6, 4800)
[[ 0.00013156 -0.00460475  0.00544821 ... -0.01273214  0.00150634
  -0.00127442]
 [-0.00038869 -0.03386274 -0.00209803 ... -0.02370408  0.00628775
   0.00622181]
 [ 0.00784618  0.02995787 -0.00370612 ... -0.01775567  0.00297994
   0.01153545]
 [-0.00213756 -0.00957396 -0.00261727 ... -0.01276501  0.00276876
  -0.00384201]
 [-0.00714206 -0.0227032   0.01572856 ... -0.00376865  0.00276918
   0.00608158]
 [ 0.00167412 -0.01099871 -0.00196931 ... -0.03177596  0.0022835
   0.00621052]]


In [14]:
# Shuffle the encoded validation-set stories and hold out 10% of them as a validation split;
# the remaining 90% serve as the labeled training data.
res = get_train_valid_split(val_encoded_stories, np.array(val_answer), valid_percent=0.1, shuffle=True)
train_stories, valid_stories, train_answers, valid_answers = res
print(train_stories.shape)
print(train_answers.shape)
print(valid_stories.shape)
print(valid_answers.shape)
print(train_stories[0]) # print train story 0 (both endings in the 5th and 6th sentence)

(1683, 6, 4800)
(1683,)
(188, 6, 4800)
(188,)
[[ 0.00257789 -0.01078219 -0.00668491 ... -0.03345989  0.00243997
   0.00695168]
 [ 0.00311376 -0.02050668  0.01805543 ... -0.00992052  0.00328235
   0.00122506]
 [ 0.00829043 -0.01595514 -0.00169544 ... -0.00554999  0.00448585
  -0.00629614]
 [ 0.01419469 -0.02284871 -0.00301768 ...  0.00108492  0.00252766
  -0.005558  ]
 [ 0.00201218 -0.02382531  0.01359426 ... -0.01885595  0.00522397
   0.00062915]
 [ 0.00917428 -0.00796457  0.00079475 ... -0.03304717  0.00208425
   0.00075519]]


In [15]:
# Expand every training story into two rows (context + first ending, context + second ending)
# and build the matching per-row 0/1 labels.
train_stories_split = split_pos_neg_endings(train_stories)
print(train_stories_split[0:2]) # print same story as before with right and wrong endings on different rows
train_answers_split = create_stories_labels(train_answers) # 0 means the corresponding story in train_stories_split 
                                                      # has a wrong ending, and 1 means it has a right ending
print(train_answers_split[0:10])

(1683, 4, 4800)
(3366, 4, 4800)
(1683, 2, 4800)
(3366, 1, 4800)
(3366, 5, 4800)
[[[ 0.00257789 -0.01078219 -0.00668491 ... -0.03345989  0.00243997
    0.00695168]
  [ 0.00311376 -0.02050668  0.01805543 ... -0.00992052  0.00328235
    0.00122506]
  [ 0.00829043 -0.01595514 -0.00169544 ... -0.00554999  0.00448585
   -0.00629614]
  [ 0.01419469 -0.02284871 -0.00301768 ...  0.00108492  0.00252766
   -0.005558  ]
  [ 0.00201218 -0.02382531  0.01359426 ... -0.01885595  0.00522397
    0.00062915]]

 [[ 0.00257789 -0.01078219 -0.00668491 ... -0.03345989  0.00243997
    0.00695168]
  [ 0.00311376 -0.02050668  0.01805543 ... -0.00992052  0.00328235
    0.00122506]
  [ 0.00829043 -0.01595514 -0.00169544 ... -0.00554999  0.00448585
   -0.00629614]
  [ 0.01419469 -0.02284871 -0.00301768 ...  0.00108492  0.00252766
   -0.005558  ]
  [ 0.00917428 -0.00796457  0.00079475 ... -0.03304717  0.00208425
    0.00075519]]]
Answers shape: (3366, 1)
[[1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]]


In [16]:
# Expand every held-out validation story into two rows, one per candidate ending.
# No per-row labels are built here: valid_answers stays as one answer per story.
valid_stories_split = split_pos_neg_endings(valid_stories)

(188, 4, 4800)
(376, 4, 4800)
(188, 2, 4800)
(376, 1, 4800)
(376, 5, 4800)


In [None]:
# ----------------------------------------------- Test Data ------------------------------------------------------

In [17]:
# Turn the test DataFrame into a list of stories (one list of sentences per row)
# and a flat list of all sentences in row order.
test_stories, test_stories_flat = get_stories_as_lists(test_sentences)

Reading stories to memory ...
Done in 0.22569847106933594 s


In [18]:
# Sanity check: story count, one example story, and the number of sentences per story
print("Nb of stories:", len(test_stories))
print("Example story with right and wrong ending:", test_stories[0])
print("Nb of sentences per story:", len(test_stories[0]))

Nb of stories: 1871
Example story with right and wrong ending: ['My friends all love to go to the club to dance.', "They think it's a lot of fun and always invite.", 'I finally decided to tag along last Saturday.', "I danced terribly and broke a friend's toe.", 'My friends decided to keep inviting me out as I am so much fun.', 'The next weekend, I was asked to please stay home.']
Nb of sentences per story: 6


In [19]:
# Encode every test sentence with the skip-thoughts encoder and group the vectors per story
test_encoded_stories = encode_stories(encoder, test_stories_flat)
print(test_encoded_stories.shape)
print(test_encoded_stories[0]) # print story 0 (both endings in the 5th and 6th sentence)

nb sentences per batch: 2000, nb batches: 6
Encoding batch 0 of sentences ...
Done in 22.885762929916382 s
Encoding batch 1 of sentences ...
Done in 23.074304580688477 s
Encoding batch 2 of sentences ...
Done in 23.042985677719116 s
Encoding batch 3 of sentences ...
Done in 23.387568473815918 s
Encoding batch 4 of sentences ...
Done in 23.373992681503296 s
Encoding batch 5 of sentences ...
Done in 15.123126983642578 s
(1871, 6, 4800)
[[-0.01622678 -0.01267685 -0.00449958 ... -0.01958963  0.0025401
   0.00582471]
 [ 0.00351208  0.01862131  0.01466644 ... -0.02046947  0.00296836
  -0.00029696]
 [-0.00341649 -0.02440572  0.02018603 ... -0.03569067  0.00320727
  -0.00057357]
 [ 0.02745583 -0.0154904  -0.00947571 ...  0.0108288   0.00337028
   0.01885112]
 [-0.00548592 -0.03024452  0.02510787 ... -0.00660004  0.00243402
  -0.00326103]
 [ 0.00078507 -0.02019084  0.01723271 ... -0.03500823  0.00463203
  -0.00075694]]


In [20]:
# Expand every test story into two rows, one per candidate ending.
test_stories_split = split_pos_neg_endings(test_encoded_stories)
print(test_stories_split[0:2]) # print same story as before with right and wrong ending on different rows
# Test answers stay as one answer per story; AccuracyCallback compares them with predicted ending numbers
test_answers = np.array(test_answer)

(1871, 4, 4800)
(3742, 4, 4800)
(1871, 2, 4800)
(3742, 1, 4800)
(3742, 5, 4800)
[[[-0.01622678 -0.01267685 -0.00449958 ... -0.01958963  0.0025401
    0.00582471]
  [ 0.00351208  0.01862131  0.01466644 ... -0.02046947  0.00296836
   -0.00029696]
  [-0.00341649 -0.02440572  0.02018603 ... -0.03569067  0.00320727
   -0.00057357]
  [ 0.02745583 -0.0154904  -0.00947571 ...  0.0108288   0.00337028
    0.01885112]
  [-0.00548592 -0.03024452  0.02510787 ... -0.00660004  0.00243402
   -0.00326103]]

 [[-0.01622678 -0.01267685 -0.00449958 ... -0.01958963  0.0025401
    0.00582471]
  [ 0.00351208  0.01862131  0.01466644 ... -0.02046947  0.00296836
   -0.00029696]
  [-0.00341649 -0.02440572  0.02018603 ... -0.03569067  0.00320727
   -0.00057357]
  [ 0.02745583 -0.0154904  -0.00947571 ...  0.0108288   0.00337028
    0.01885112]
  [ 0.00078507 -0.02019084  0.01723271 ... -0.03500823  0.00463203
   -0.00075694]]]


In [None]:
# ------------------------------------------ Model and Training -----------------------------------------------

In [21]:
def create_model():
    """Build the feed-forward classifier trained in this notebook.

    Returns:
        An uncompiled Sequential model taking 4800-dimensional inputs, with three
        ReLU hidden layers of 2400, 1200 and 600 units and a 2-unit softmax output.
    """
    layers = [
        Dense(units=2400, activation='relu', input_dim=4800),
        Dense(units=1200, activation='relu'),
        Dense(units=600, activation='relu'),
        Dense(units=2, activation='softmax'),  # output layer
    ]
    return Sequential(layers)

In [22]:
# train: last context sentence and ending combined into one feature vector.
# The "+" is an element-wise sum of the two sentence vectors (index 3 = last context
# sentence, index 4 = candidate ending), not a concatenation.
train_stories_ls = train_stories_split[:, 3] + train_stories_split[:, 4]
print(train_stories_ls.shape)
# One-hot encode the 0/1 labels: column 1 is set for a right ending, column 0 for a wrong one
train_answers_categorical = keras.utils.to_categorical(train_answers_split, num_classes=2, dtype='float32')
print(train_answers_split)
print(train_answers_categorical)

(3366, 4800)
[[1.]
 [0.]
 [1.]
 ...
 [1.]
 [1.]
 [0.]]
[[0. 1.]
 [1. 0.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]


In [23]:
# valid: last context sentence and ending combined into one feature vector
# (element-wise sum of the vectors at index 3 and index 4, not a concatenation)
valid_stories_ls = valid_stories_split[:, 3] + valid_stories_split[:, 4]
print(valid_stories_ls.shape)

(376, 4800)


In [24]:
# test: last context sentence and ending combined into one feature vector
# (element-wise sum of the vectors at index 3 and index 4, not a concatenation)
test_stories_ls = test_stories_split[:, 3] + test_stories_split[:, 4]
print(test_stories_ls.shape)

(3742, 4800)


In [25]:
# TensorBoard callback configured with ./logs as its log directory
tb_callback = keras.callbacks.TensorBoard(log_dir='./logs')

In [26]:
class AccuracyCallback(keras.callbacks.Callback):
    """Keras callback that prints ending-selection accuracy after every epoch.

    Reads the notebook-level arrays `valid_stories_ls` / `valid_answers` and
    `test_stories_ls` / `test_answers`. The feature arrays hold stories in pairs:
    row 2*i is a story with its first candidate ending, row 2*i+1 is the same
    story with its second candidate ending.
    """

    def on_epoch_end(self, batch, logs=None):
        """Print validation and test accuracy of the model being trained.

        Keras passes the epoch index as the first positional argument; the
        parameter keeps its original name `batch`. Neither argument is used.
        """
        valid_pred = self.get_predicted_right_endings(valid_stories_ls)
        test_pred = self.get_predicted_right_endings(test_stories_ls)

        print("Validation:")
        acc = accuracy(valid_answers, valid_pred)
        print("   Accuracy: {}".format(acc))

        print("Test:")
        acc = accuracy(test_answers, test_pred)
        print("   Accuracy: {}".format(acc))

    def get_predicted_right_endings(self, data):
        """Predict, for each story, which of its two endings is the right one.

        Args:
            data: feature rows in (first ending, second ending) pairs per story.

        Returns:
            Array with one entry per story: 1 if the first ending gets the higher
            "right ending" score, otherwise 2. A trailing unpaired row is ignored.

        Raises:
            RuntimeError: if no model has been attached to this callback yet.
        """
        # Use the model Keras attached to this callback rather than whatever the
        # notebook-level name `model` happens to point to at call time.
        attached_model = getattr(self, 'model', None)
        if attached_model is None:
            raise RuntimeError("AccuracyCallback has no model attached; pass it to model.fit(callbacks=[...]) first")
        preds = attached_model.predict(data)
        nb_pairs = preds.shape[0] // 2
        # Column 1 is the score of the "right ending" class (label 1 one-hot encodes to [0, 1])
        right_scores = preds[:2 * nb_pairs, 1].reshape(nb_pairs, 2)
        # The first ending wins only if it is strictly "more right" than the second ending
        return np.where(right_scores[:, 0] > right_scores[:, 1], 1, 2)

AccCb = AccuracyCallback()

In [27]:
# Build a fresh classifier, compile it with Adam and categorical cross-entropy,
# and train it for 50 epochs while logging to TensorBoard and printing accuracies.
model = create_model()
model.compile(optimizer=keras.optimizers.Adam(lr=0.001),
              loss='categorical_crossentropy')
model.fit(train_stories_ls, train_answers_categorical,
          batch_size=32, epochs=50,
          callbacks=[tb_callback, AccCb])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Validation:
   Accuracy: 0.6968085106382979
Test:
   Accuracy: 0.7113842864778194
Epoch 2/50
Validation:
   Accuracy: 0.7074468085106383
Test:
   Accuracy: 0.7151256012827365
Epoch 3/50
Validation:
   Accuracy: 0.723404255319149
Test:
   Accuracy: 0.7268840192410476
Epoch 4/50
Validation:
   Accuracy: 0.7393617021276596
Test:
   Accuracy: 0.7343666488508819
Epoch 5/50
Validation:
   Accuracy: 0.723404255319149
Test:
   Accuracy: 0.738107963655799
Epoch 6/50
Validation:
   Accuracy: 0.7340425531914894
Test:
   Accuracy: 0.7343666488508819
Epoch 7/50
Validation:
   Accuracy: 0.7712765957446809
Test:
   Accuracy: 0.7621592731159808
Epoch 8/50
Validation:
   Accuracy: 0.7712765957446809
Test:
   Accuracy: 0.757883484767504
Epoch 9/50
Validation:
   Accuracy: 0.7819148936170213
Test:
   Accuracy: 0.7744521646178514
Epoch 10/50
Validation:
   Accuracy: 0.76595744

Validation:
   Accuracy: 0.7553191489361702
Test:
   Accuracy: 0.7359700694815606
Epoch 48/50
Validation:
   Accuracy: 0.7659574468085106
Test:
   Accuracy: 0.7498663816141101
Epoch 49/50
Validation:
   Accuracy: 0.7553191489361702
Test:
   Accuracy: 0.743452699091395
Epoch 50/50
Validation:
   Accuracy: 0.7553191489361702
Test:
   Accuracy: 0.7386424371993586


<keras.callbacks.History at 0x7f3c0ef05668>