In [1]:
# Detect whether the notebook runs on Google Colab. Only the import is used as
# the probe: a missing `google.colab` package means "local run". Errors raised
# by the mount itself are allowed to propagate instead of being mistaken for a
# local environment.
try:
  from google.colab import drive
except ImportError:
  COLAB = False
else:
  drive.mount('/content/drive')
  COLAB = True

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import keras.preprocessing.text as kpt
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, Flatten
from keras.layers.core import Dense
from keras.models import load_model, Sequential
from keras.preprocessing.sequence import pad_sequences

# Seed NumPy's global random generator so that draws made through
# `np.random` are repeatable from one run to the next.
seed = 7
np.random.seed(seed=seed)

Using TensorFlow backend.


In [3]:
if COLAB:
    TRAIN_URL = '/content/drive/My Drive/Coding/nlu-story/train_stories.csv'
else:
    TRAIN_URL = '../train_stories.csv'

# Drop the first two columns; the remaining ones hold sentence1..sentence5.
df_train = pd.read_csv(TRAIN_URL).iloc[:, 2:]

# Build a second candidate ending from the fifth sentence of the following row;
# the last row wraps around and receives the first row's fifth sentence.
fifth_sentence = df_train.iloc[:, 4]
df_train['sentence6'] = fifth_sentence.shift(periods=-1, fill_value=fifth_sentence.iloc[0])

n_samples = len(df_train)

# Draw a 0/1 label per row; for rows labelled 1 the two candidate endings
# trade places, so y is the position (0 or 1) of the row's own fifth sentence.
df_train = df_train.assign(y=np.random.randint(2, size=n_samples))
swap_idx = df_train['y'].eq(1)
ending_cols = ['sentence5', 'sentence6']
df_train.loc[swap_idx, ending_cols] = df_train.loc[swap_idx, ending_cols[::-1]].values

# Concatenate the four context sentences and both endings into one input text.
text_cols = ['sentence1', 'sentence2', 'sentence3', 'sentence4'] + ending_cols
df_train['X'] = df_train[text_cols].apply(' '.join, axis=1)
df_train = df_train[['X', 'y']]

print(df_train.shape)
print(df_train.iloc[0])

(88161, 2)
X    Kelly found her grandmother's pizza recipe in ...
y                                                    1
Name: 0, dtype: object


In [4]:
VOCAB_SIZE = 20000

tokenizer = kpt.Tokenizer(num_words=VOCAB_SIZE, oov_token='<UNK>')
tokenizer.fit_on_texts(df_train['X'])

# Keep only the entries whose index is below VOCAB_SIZE and install the
# trimmed mapping back on the tokenizer, so both names share one dict.
vocab_dict = {}
for word, idx in tokenizer.word_index.items():
    if idx < VOCAB_SIZE:
        vocab_dict[word] = idx
tokenizer.word_index = vocab_dict

# +1 accounts for index 0, which is not present in the word index.
print(len(vocab_dict)+1)

20000


In [5]:
# Encode every training text as a list of word indices.
train_sequences = tokenizer.texts_to_sequences(df_train['X'])

# Pad (at the end) to the length of the longest training sequence.
seq_len = max(map(len, train_sequences))
X_train = pad_sequences(train_sequences, maxlen=seq_len, padding='post')

y_train = df_train['y']

print(X_train.shape, y_train.shape)

(88161, 87) (88161,)


In [6]:
EMB_SIZE = 100
GLOVE_URL = '/content/drive/My Drive/Coding/nlu-story/glove.6B.100d.txt' if COLAB else '../glove.6B.100d.txt'

# Parse the GloVe text file into {word: float32 vector}. Each line is expected
# to hold a token followed by its whitespace-separated coefficients.
# The encoding is given explicitly so that parsing does not depend on the
# platform's default locale encoding.
embeddings = {}
with open(GLOVE_URL, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        w = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[w] = coefs

print(len(embeddings))

400000


In [7]:
# Start from uniform(-1, 1) noise; rows that are never overwritten below keep
# these random values.
embedding_matrix = np.random.uniform(-1, 1, size=(VOCAB_SIZE, EMB_SIZE))

# Copy the pretrained vector into the row of every vocabulary word that has one.
num_loaded = 0
for word, idx in vocab_dict.items():
    vector = embeddings.get(word)
    if vector is None or idx >= VOCAB_SIZE:
        continue
    embedding_matrix[idx] = vector
    num_loaded += 1

print(embedding_matrix.shape, num_loaded)

(20000, 100) 18894


In [8]:
# Location of the saved model file: on Drive when running in Colab, otherwise
# next to the notebook.
if COLAB:
    MODEL_URL = '/content/drive/My Drive/Coding/nlu-story/model.h5'
else:
    MODEL_URL = './model.h5'

def create_model():
    """Build the uncompiled classifier network.

    The stack is: an embedding layer initialised from `embedding_matrix`
    (and left trainable), a flatten layer, a 512-unit ReLU dense layer and a
    single sigmoid output unit.

    Returns:
        A Keras `Sequential` model; the caller is responsible for compiling it.
    """
    layers = [
        Embedding(
            VOCAB_SIZE,
            EMB_SIZE,
            weights=[embedding_matrix],
            input_length=seq_len,
            trainable=True,
        ),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

# Resume from a previously saved model when one can be loaded; otherwise start
# from a fresh one. `Exception` (rather than a bare `except`) keeps the
# best-effort fallback while letting KeyboardInterrupt/SystemExit through, and
# the reason for the fallback is reported instead of being silently discarded.
try:
    model = load_model(MODEL_URL)
    print('Loaded previous model.')
except Exception as err:
    print('Could not load %s (%s: %s).' % (MODEL_URL, type(err).__name__, err))
    model = create_model()
    print('Created new model.')

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" line).
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Created new model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 87, 100)           2000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 8700)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               4454912   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 513       
Total params: 6,455,425
Trainable params: 6,455,425
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
BATCH_SIZE = 32
EPOCHS = 10

# Checkpoint written to MODEL_URL with save_best_only enabled.
# NOTE(review): the checkpoint on disk and the in-memory `model` may differ
# after training (the latter ends with the last epoch's weights) - confirm
# which one later cells are meant to evaluate.
ckpt = ModelCheckpoint(filepath=MODEL_URL, save_best_only=True, verbose=0)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Hold out 20% of the training arrays for validation while fitting.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[ckpt],
)

Instructions for updating:
Use tf.cast instead.
Train on 70528 samples, validate on 17633 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f01414549e8>

In [10]:
if COLAB:
    EVAL_URL = '/content/drive/My Drive/Coding/nlu-story/eval_stories.csv'
else:
    EVAL_URL = '../eval_stories.csv'

# Keep columns 1..7 of the evaluation file (the first column is dropped).
df_eval = pd.read_csv(EVAL_URL).iloc[:, 1:8]

# Same input layout as for training: four context sentences, then both endings.
eval_text_cols = [
    'InputSentence1',
    'InputSentence2',
    'InputSentence3',
    'InputSentence4',
    'RandomFifthSentenceQuiz1',
    'RandomFifthSentenceQuiz2',
]
df_eval['X'] = df_eval[eval_text_cols].apply(' '.join, axis=1)

# Shift the answer column down by one to obtain the label y.
df_eval['y'] = df_eval['AnswerRightEnding'] - 1
df_eval = df_eval[['X', 'y']]

print(df_eval.shape)
print(df_eval.iloc[0])

(1871, 2)
X    Rick grew up in a troubled household. He never...
y                                                    0
Name: 0, dtype: object


In [11]:
# Encode and pad the whole evaluation set once, then score it with a single
# batched predict call instead of one call per story.
X_eval = tokenizer.texts_to_sequences(df_eval['X'])
X_eval = pad_sequences(X_eval, maxlen=seq_len, padding='post')
y_pred = model.predict(X_eval).ravel()  # one sigmoid output per story
y_true = df_eval['y'].values

# A story is counted as correct when the output thresholded at 0.5 agrees with
# its label. Both cases are spelled out with explicit parentheses so that the
# result does not hinge on and/or precedence.
is_correct = ((y_pred >= 0.5) & (y_true == 1)) | ((y_pred < 0.5) & (y_true == 0))

correct = int(is_correct.sum())
total = len(df_eval)
acc = correct / total

print(correct, total, acc)

949 1871 0.5072153928380545
