In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import keras.preprocessing.text as kpt
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding, Flatten
from keras.layers.core import Dense
from keras.models import load_model, Sequential
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

Using TensorFlow backend.


In [3]:
TRAIN_URL = '/content/drive/My Drive/Coding/nlu-story/train_stories.csv'

df_train = pd.read_csv(TRAIN_URL).iloc[:, 2:]
df_train['fake'] = df_train.iloc[:, 4].shift(periods=-1, fill_value=df_train.iloc[0, 4])
df_train['correct'] = df_train[['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']].apply(lambda x: ' '.join(x), axis=1)
df_train['wrong'] = df_train[['sentence1', 'sentence2', 'sentence3', 'sentence4', 'fake']].apply(lambda x: ' '.join(x), axis=1)
df_train = df_train[['correct','wrong']]

print(df_train.shape)

(88161, 2)


In [4]:
VOCAB_SIZE = 20000

tokenizer = kpt.Tokenizer(oov_token='<UNK>', num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df_train['correct'])
tokenizer.word_index = {w:i for w,i in tokenizer.word_index.items() if i < VOCAB_SIZE}

vocab_dict = tokenizer.word_index
print(len(vocab_dict)+1)

20000


In [5]:
X_train = pd.concat([df_train['correct'], df_train['wrong']])
X_train = tokenizer.texts_to_sequences(X_train)

seq_len = max(len(s) for s in X_train)
n_correct = len(df_train)
n_samples = n_correct * 2

X_train = pad_sequences(X_train, maxlen=seq_len, padding='post')

y_train = np.ones(n_samples)
y_train[n_correct:] = 0

X_train, y_train = shuffle(X_train, y_train)

print(X_train.shape, y_train.shape, n_correct, n_samples)

(176322, 74) (176322,) 88161 176322


In [6]:
EMB_SIZE = 100
GLOVE_URL = '/content/drive/My Drive/Coding/nlu-story/glove.6B.100d.txt'

embeddings = {}
with open(GLOVE_URL, 'r') as f:
    for line in f:
        values = line.split()
        w = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[w] = coefs

print(len(embeddings))

400000


In [7]:
embedding_matrix = np.random.uniform(-1, 1, size=(VOCAB_SIZE, EMB_SIZE))
num_loaded = 0
for w, i in vocab_dict.items():
    if w in embeddings and i < VOCAB_SIZE:
        embedding_matrix[i] = embeddings[w]
        num_loaded += 1

print(embedding_matrix.shape, num_loaded)

(20000, 100) 18955


In [8]:
MODEL_URL = '/content/drive/My Drive/Coding/nlu-story/model.h5'

def create_model():
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, EMB_SIZE, weights=[embedding_matrix], input_length=seq_len, trainable=False))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

try:
    model = load_model(MODEL_URL)
    print('Loaded previous model.')
except:
    model = create_model()
    print('Created new model.')

print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
Created new model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 74, 100)           2000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 7400)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               3789312   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 513       
Total params: 5,789,825
Trainable params: 3,789,825
Non-trainable params: 2,000,000
_________________________________________________________________
None


In [0]:
BATCH_SIZE = 32
EPOCHS = 10

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

ckpt = ModelCheckpoint(filepath=MODEL_URL, verbose=0, save_best_only=True)

model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[ckpt]
)

Train on 141057 samples, validate on 35265 samples
Epoch 1/10
Epoch 2/10

In [10]:
EVAL_URL = '/content/drive/My Drive/Coding/nlu-story/eval_stories.csv'

df_eval = pd.read_csv(EVAL_URL).iloc[:, 1:8]
df_eval['Option1'] = df_eval[['InputSentence1', 'InputSentence2', 'InputSentence3', 'InputSentence4', 'RandomFifthSentenceQuiz1']].apply(lambda x: ' '.join(x), axis=1)
df_eval['Option2'] = df_eval[['InputSentence1', 'InputSentence2', 'InputSentence3', 'InputSentence4', 'RandomFifthSentenceQuiz2']].apply(lambda x: ' '.join(x), axis=1)
df_eval = df_eval[['Option1','Option2', 'AnswerRightEnding']]

print(df_eval.shape)

(1871, 3)


In [14]:
same = 0
correct = 0
rel_correct = 0
total = len(df_eval)
for i, row in df_eval.iterrows():
    option1 = tokenizer.texts_to_sequences([row['Option1']])
    option1 = pad_sequences(option1, maxlen=seq_len, padding='post')
    option2 = tokenizer.texts_to_sequences([row['Option2']])
    option2 = pad_sequences(option2, maxlen=seq_len, padding='post')
    y_pred1 = model.predict(option1)
    y_pred2 = model.predict(option2)
    if y_pred1 > 0.5 and row['AnswerRightEnding'] == 1:
      correct += 1
    if row['AnswerRightEnding'] == 1 and y_pred1 > y_pred2:
      rel_correct += 1
    elif y_pred1 == y_pred2:
      same += 1

acc = correct / total
rel_acc = rel_correct / total

print(same, total, correct, acc, rel_correct, rel_acc)

1871 1871 962 0.5141635489043292 0 0.0
