In [37]:
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Embedding, LSTM
from keras.optimizers import RMSprop
from keras import backend as K

In [8]:
import re

from collections import Counter
import tensorflow as tf
import numpy as np
import pandas as pd
import gensim
from scipy.spatial.distance import cosine

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [73]:
train_df = pd.read_csv('data/dev_train.csv')

In [74]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [75]:
train_df = train_df[train_df['question1'].notnull()]
train_df = train_df[train_df['question2'].notnull()]

question1_sentences = []
question2_sentences = []
labels = []
for index, row in train_df.iterrows():
    question1_sentences.append(text_to_wordlist(row['question1']))
    question2_sentences.append(text_to_wordlist(row['question2']))
    labels.append(int(row['is_duplicate']))

assert (len(question1_sentences) == len(question2_sentences)), "Num of q1 and q2 are not equal"

In [76]:
wordsList = []

for row in question1_sentences:
    wordsList.extend(row.split())
    
for row in question2_sentences:
    wordsList.extend(row.split())
        
counts = Counter(wordsList)

vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [78]:
question1_int = []
question2_int = []
labels_int = []

for index, row in enumerate(question1_sentences):
    question1 = []
    question2 = []
    
    question1_words = question1_sentences[index].split()
    question2_words = question2_sentences[index].split()
    
    if (len(question1_words) > 3) and (len(question2_words) > 3):
        labels_int.append(labels[index])
        for word in question1_words:
            question1.append(vocab_to_int.get(word))

        question1_int.append(question1)

        for word in question2_words:
            question2.append(vocab_to_int.get(word))

        question2_int.append(question2)
        
assert (len(question1_int) == len(question2_int)), "Num of q1 and q2 ints are not equal"

In [79]:
seq_len = 200

In [80]:
question1_features = np.zeros((len(question1_int), seq_len), dtype=int)
for i, row in enumerate(question1_int):
    question1_features[i, -len(row):] = np.array(row)[:seq_len]
    
question2_features = np.zeros((len(question2_int), seq_len), dtype=int)
for i, row in enumerate(question2_int):
    question2_features[i, -len(row):] = np.array(row)[:seq_len]
    
assert (len(question1_features) == len(question2_features)), "Num of q1 and q2 features are not equal"
assert (len(question1_features) == len(labels_int)), "Num of questions and labels are not equal"

In [19]:
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [81]:
embedding_matrix = np.zeros((len(vocab_to_int)+1, 300), dtype=float)

for word, i in vocab_to_int.items():
    if word in word_embedding_model.vocab:
        embedding_matrix[i] = word_embedding_model.word_vec(word)

In [82]:
from sklearn.model_selection import StratifiedShuffleSplit

ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

labels = np.asarray(labels_int)
train_idx, val_idx = next(ss.split(question1_features, labels))

half_val_len = int(len(val_idx)/2)
val_idx, test_idx = val_idx[:half_val_len], val_idx[half_val_len:]

question1_train, question2_train, label_train = question1_features[train_idx], question2_features[train_idx], labels[train_idx]
question1_val, question2_val, label_val = question1_features[val_idx], question2_features[val_idx], labels[val_idx]
question1_test, question2_test, label_test = question1_features[test_idx], question2_features[test_idx], labels[test_idx]

In [83]:
len(vocab)

26076

In [84]:
lstm_size = 64
lstm_layers = 1
batch_size = 128
learning_rate = 0.001

In [85]:
print(embedding_matrix.shape)

(26077, 300)


In [86]:
def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

In [87]:
def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

In [88]:
def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

In [89]:
def compute_accuracy(predictions, labels):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return labels[predictions.ravel() < 0.5].mean()

In [92]:
input_q1 = Input(shape=(seq_len, ), dtype='float32')
input_q2 = Input(shape=(seq_len, ), dtype='float32')

embedding_layer_q1 = Embedding(input_dim=len(vocab_to_int) + 1, 
                               output_dim=300,
                            weights=[embedding_matrix],
                            input_length=seq_len,
                            trainable=False)(input_q1)

embedding_layer_q2 = Embedding(input_dim=len(vocab_to_int) + 1, 
                               output_dim=300,
                                weights=[embedding_matrix],
                                input_length=seq_len,
                                trainable=False)(input_q2)

shared_lstm = LSTM(128)

q1 = shared_lstm(embedding_layer_q1)
q2 = shared_lstm(embedding_layer_q2)

distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([q1, q2])

model = Model([input_q1, input_q2], distance)

# train
rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms)
model.fit([question1_train, question2_train], label_train,
          batch_size=128,
          epochs=10,
          validation_data=([question1_val, question2_val], label_val))


Train on 23676 samples, validate on 2960 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x263c4b518>

In [118]:
# compute final accuracy on training and test sets
pred = model.predict([question1_train, question2_train])
tr_acc = compute_accuracy(pred, label_train)

pred = model.predict([question1_test, question2_test])
te_acc = compute_accuracy(pred, label_test)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

* Accuracy on training set: 82.83%
* Accuracy on test set: 74.26%
