In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the files found there.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Download the pretrained GoogleNews word2vec embeddings (gzip-compressed binary).
import os
import requests
# Alternative mirrors:
#url = 'https://s3.amazonaws.com/mordecai-geo/GoogleNews-vectors-negative300.bin.gz'
#url = 'https://github.com/eyaler/word2vec-slim/blob/master/GoogleNews-vectors-negative300-SLIM.bin.gz?raw=true'
url = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
embedding_path = 'GoogleNews-vectors-negative300.bin.gz'
partial_path = embedding_path + '.part'

# Skip the download when a previous run already fetched the file.
if not os.path.exists(embedding_path):
    # stream=True + iter_content writes the archive chunk by chunk instead of
    # holding the whole response body in memory (as r.content would).
    with requests.get(url, allow_redirects=True, stream=True) as r:
        # Fail loudly on an HTTP error instead of saving an error page as the model file.
        r.raise_for_status()
        with open(partial_path, 'wb') as out_file:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)
    # Rename only after the download completed, so an interrupted run never
    # leaves a truncated archive under the final name.
    os.replace(partial_path, embedding_path)

**Data Preprocessing**

In [None]:
#read the data as is via pandas dataframe
import re
import nltk


# Locations of the competition data and of the downloaded embedding archive
TRAIN_CSV = '../input/train.csv'
TEST_CSV = '../input/test.csv'
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'

# Read the full training and test sets.
# Tip: chain .sample(n, random_state=0) onto a read_csv call to iterate on a subset.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))
test_df.shape

In [None]:
#data clean-up
# Request the English list explicitly: stopwords.words() without a language
# argument returns the stop words of every language shipped with the corpus.
# NOTE(review): assumes the questions are English — confirm against the dataset.
STOP_WORDS = nltk.corpus.stopwords.words('english') # load English stop words from nltk library

"""
remove chars that are not letters or numbers, lowercase, then remove stop words
Input: sentence
Returns: cleaned up sentence
"""
# Matches runs of characters that are neither whitespace nor word characters,
# plus underscores (which \w would otherwise keep). Compiled once, and written
# as a raw string so that \s and \w are not treated as string escapes.
_NON_ALNUM_RE = re.compile(r'([^\s\w]|_)+')


def clean_sentence(val, stop_words=None):
    """Strip punctuation/underscores from a sentence and lowercase it.

    Matched characters are deleted (not replaced by a space), so "e-mail"
    becomes "email". Whitespace is left exactly as it was.

    Args:
        val: the sentence to clean (str).
        stop_words: optional collection of lowercase words to drop. When it is
            None or empty (the default) no words are removed.

    Returns:
        The cleaned-up sentence (str).
    """
    sentence = _NON_ALNUM_RE.sub('', val).lower()

    if stop_words:
        # Split/join on a single space so the remaining spacing is preserved.
        sentence = " ".join(
            word for word in sentence.split(" ") if word not in stop_words
        )

    return sentence
"""
drop nans, then apply 'clean_sentence' function to question1 and 2
Input: dataframe
Returns: dataframe with questions column cleaned-up
"""
def clean_dataframe(data):
    """Drop incomplete rows and clean both question columns.

    Rows containing a NaN in any column are removed, then clean_sentence is
    applied to every value of 'question1' and 'question2'.

    Args:
        data: DataFrame with 'question1' and 'question2' columns. It is not
            modified.

    Returns:
        A new DataFrame with the question columns cleaned up.
    """
    complete_rows = data.dropna(how="any")

    # assign() builds a new frame instead of writing into the dropna() result
    # through .loc, which can raise SettingWithCopyWarning.
    return complete_rows.assign(**{
        col: complete_rows[col].apply(clean_sentence)
        for col in ['question1', 'question2']
    })





In [None]:
# Clean the training questions; clean_dataframe also drops every row that contains a NaN.
train_clean_df = clean_dataframe(train_df)
# NOTE(review): the same call for the test set is disabled — confirm that is intended.
#test_clean_df = clean_dataframe(test_df)

In [None]:
# Preview the first rows of the cleaned training frame.
train_clean_df.head(5)
#train_clean_df.shape

In [None]:
# Pull the two question columns (forced to str) and the label column out of the cleaned frame
question1, question2 = (
    train_clean_df[column].astype('str') for column in ("question1", "question2")
)
is_duplicate = train_clean_df["is_duplicate"]

# Number of labelled question pairs
print(len(is_duplicate))

**Embedding Matrix**

In [None]:
from keras.preprocessing.text import Tokenizer

MAX_WORDS = 200000 # consider only the 200,000 most frequent words in the dataset
MAX_SEQUENCE_LENGTH = 50 # same value as the padding cell uses, so the constant has a single meaning
EMBEDDING_DIM = 300

# question1 / question2 are pandas Series, so `question1 + question2` would
# concatenate the two strings of each row (gluing the last word of question 1
# to the first word of question 2) instead of listing all questions.
# pd.concat puts the second Series after the first, giving one entry per question.
questions = pd.concat([question1, question2], ignore_index=True)

# Build the vocabulary from every question, then encode each side as word-index sequences.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))



In [None]:
# Spot-check: show the index the tokenizer assigned to the word 'quora'.
word_index['quora']

In [None]:
from gensim.models import word2vec, KeyedVectors

# Load the pretrained vectors under their own name so that the imported
# `word2vec` module is not shadowed.
# NOTE(review): `.vocab` / `.word_vec` are kept from the original code — confirm
# they match the installed gensim version.
w2v_vectors = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Row i holds the vector of the word whose tokenizer index is i. Only indices
# below MAX_WORDS are filled, and the vocabulary may be smaller than that, so
# allocate just the rows that can be filled (index 0 is never assigned to a word).
num_rows = min(MAX_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_rows, EMBEDDING_DIM))
for word, i in word_index.items():
    # Words without a pretrained vector keep their all-zero row.
    if i < num_rows and word in w2v_vectors.vocab:
        embedding_matrix[i] = w2v_vectors.word_vec(word)

# Report the number of pretrained vectors (not the stored count of the token 'word').
print('Word embeddings: %d' % len(w2v_vectors.vocab))
print('Embedding matrix shape: %s' % (str(embedding_matrix.shape)))
del w2v_vectors # free the memory held by the pretrained vectors

**Prepare Train / Test data**

In [None]:
 from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 50  # length passed to pad_sequences for every question

# Pad both sides of every pair to the same fixed length
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (question1_word_sequences, question2_word_sequences)
)
labels = np.array(is_duplicate, dtype=int)

print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Stack the two padded question arrays along a new axis 1 so that each pair
# stays together when the rows are shuffled and split.
X = np.stack((q1_data, q2_data), axis=1)
y = is_duplicate
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Separate the pairs again into their question-1 / question-2 halves
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

# Labels as plain numpy arrays
y_train, y_test = y_train.values, y_test.values

# Sanity checks on the training split
assert Q1_train.shape == Q2_train.shape
assert len(X_train) == len(y_train)


**Define Siamese Manhattan LSTM Model**

References: <br>
http://www.mit.edu/~jonasm/info/MuellerThyagarajan_AAAI16.pdf<br>
https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM,TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from keras.optimizers import Adadelta

# Model variables
n_hidden = 50  # passed to LSTM(...) when the shared LSTM is created
gradient_clipping_norm = 1.25  # passed to Adadelta(clipnorm=...)
batch_size = 64  # passed to fit(batch_size=...)
n_epoch = 15  # passed to fit(epochs=...)

def exponent_neg_manhattan_distance(left, right):
    """Similarity of two LSTM outputs: exp(-sum(|left - right|)) along axis 1.

    The summed axis is kept (keepdims=True), so the result has size 1 on axis 1.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

# The visible layer: one integer-id input of length MAX_SEQUENCE_LENGTH per question
left_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
right_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# A single embedding layer, initialised from embedding_matrix and frozen
# (trainable=False), used for both inputs.
embedding_layer = Embedding(len(embedding_matrix), EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model.
# The Lambda receives [left_output, right_output]; output_shape takes the first
# dimension of the first input shape (presumably the batch dimension) and a width of 1.
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model: two question inputs -> one similarity output
malstm = Model([left_input, right_input], [malstm_distance])


# Print the layer-by-layer summary of the assembled model
malstm.summary()

**Run the model**

In [None]:
from time import time
import datetime

MODEL_WEIGHTS = 'quora_question_pairs_weights.h5'

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)
# Keep only the checkpoint with the best monitored 'val_acc'
callbacks = [ModelCheckpoint(MODEL_WEIGHTS, monitor='val_acc', save_best_only=True)]

malstm.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train, validating on the held-out split after every epoch, and time the run
training_start_time = time()
malstm_trained = malstm.fit(
    [Q1_train, Q2_train],
    y_train,
    batch_size=batch_size,
    epochs=n_epoch,
    validation_data=([Q1_test, Q2_test], y_test),
    callbacks=callbacks,
)
elapsed = datetime.timedelta(seconds=time() - training_start_time)

print("Training time finished.\n{} epochs in {}".format(n_epoch, elapsed))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One figure per metric: (train key, validation key, title, y-axis label, legend position)
for train_key, val_key, curve_title, y_label, legend_loc in (
    ('acc', 'val_acc', 'Model Accuracy', 'Accuracy', 'upper left'),
    ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
):
    plt.plot(malstm_trained.history[train_key])
    plt.plot(malstm_trained.history[val_key])
    plt.title(curve_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_loc)
    plt.show()

In [None]:
# Restore the checkpoint that ModelCheckpoint saved to MODEL_WEIGHTS during training.
# load_weights updates `malstm` in place and its return value is not the model,
# so bind `model` to the restored network itself.
malstm.load_weights(MODEL_WEIGHTS)
model = malstm

In [None]:
# Preview the last rows of the raw test frame.
test_df.tail(5)

**Prepare Test Dataset**

In [None]:
# Encode the test questions the same way as the training questions:
# cast to str (so missing values become the string 'nan' and no row is dropped,
# keeping one prediction per test row), then apply the same clean_sentence
# normalisation the tokenizer was fitted on.
test_question1 = test_df["question1"].astype('str').apply(clean_sentence)
test_question2 = test_df["question2"].astype('str').apply(clean_sentence)
t_question1_word_sequences = tokenizer.texts_to_sequences(test_question1)
t_question2_word_sequences = tokenizer.texts_to_sequences(test_question2)

# Pad to the length the model inputs were built with, then score every pair.
t_q1_data = pad_sequences(t_question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
t_q2_data = pad_sequences(t_question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
predictions = malstm.predict([t_q1_data, t_q2_data])
predictions.shape

In [None]:
# Flatten the model output into a 1-D array so it can be used as a DataFrame column.
pred = predictions.flatten()
pred.shape


**Create a submission file**

In [None]:
# Build the submission frame (test_id, is_duplicate) and write it to CSV.
file_name = 'submission_v2.csv'
# Hard 0/1 labels at a 0.5 threshold.
# NOTE(review): pred_updated is not written to the file — the raw scores in
# `pred` are submitted instead; confirm this is intended.
pred_updated = np.where(pred > 0.5, 1, 0)
# Name the columns explicitly so their order in the CSV does not depend on how
# the installed pandas orders dict keys.
submission = pd.DataFrame(
    {'test_id': test_df['test_id'], 'is_duplicate': pred},
    columns=['test_id', 'is_duplicate'],
)
submission.to_csv(file_name, index=False)

submission.shape

In [None]:
# Preview the first rows of the submission frame.
submission.head(10)

In [None]:
#tests
#x = test_df.loc[test_df['test_id'] == 4085]
#x
#a =x['question1'].astype('str')
#b =x['question2'].astype('str')

In [None]:
#credit: https://www.kaggle.com/dansbecker/submitting-from-a-kernel/ (Dan B)
# import the modules we'll need
"""
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# function that takes in a dataframe and creates a text link to  
# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "submission_v1.csv"):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

# create a link to download the dataframe
create_download_link(submission)
"""