# Data Preparation

In [7]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from gensim.models import Word2Vec

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

stops = set(stopwords.words("english"))
stemmer = SnowballStemmer('english')

In [3]:
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\0k ", "0000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text)
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text)
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)

    # Remove punctuation from text
    word_list = ''.join([c for c in text if c not in punctuation]).split()

    # Optionally, remove stop words
    if remove_stop_words:
        word_list = [w for w in word_list if w not in stops]

    # Optionally, shorten words to their stems
    if stem_words:
        word_list = [stemmer.stem(word) for word in word_list]

    # Return a list of words
    return " ".join(word_list)

In [4]:
# Text Cleaning
df_train = pd.read_csv('data/train.csv')
df_train["q1"] = df_train["question1"].astype(str).apply(text_to_wordlist)
df_train["q2"] = df_train["question2"].astype(str).apply(text_to_wordlist)
df_test = pd.read_csv('data/test.csv')
df_test["q1"] = df_test["question1"].astype(str).apply(text_to_wordlist)
df_test["q2"] = df_test["question2"].astype(str).apply(text_to_wordlist)

In [11]:
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1,q2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What step step guide invest share market India,What step step guide invest share market
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,What story Kohinoor Koh Noor Diamond,What would happen Indian government stole Kohi...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,How I increase speed internet connection using...,How Internet speed increased hacking DNS
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,Why I mentally lonely How I solve,find remainder math 23 24 math divided 24 23
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,Which one dissolve water quickly sugar salt me...,Which fish would survive salt water


In [12]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df_train["q1"].tolist() + df_train["q2"].tolist() + df_test["q1"].tolist() + df_test["q2"].tolist())

sequences_1 = tokenizer.texts_to_sequences(df_train["q1"])
sequences_2 = tokenizer.texts_to_sequences(df_train["q2"])
test_sequences_1 = tokenizer.texts_to_sequences(df_test["q1"])
test_sequences_2 = tokenizer.texts_to_sequences(df_test["q2"])

In [17]:
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 120347 unique tokens


In [26]:
MAX_SEQUENCE_LENGTH = 30
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)

data_1_train, data_1_val = data_1[2000:], data_1[:2000]
data_2_train, data_2_val = data_2[2000:], data_2[:2000]
label_train, label_val = df_train["is_duplicate"][2000:], df_train["is_duplicate"][:2000]
print('Shape of data tensor:', data_1_validate.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)

Shape of data tensor: (2000, 30)


# Model Design

In [27]:
num_lstm = 225
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.4
rate_drop_dense = 0.4
act = 'relu'

embedding_layer = Embedding(20000, 100, input_length=MAX_SEQUENCE_LENGTH)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])

# Model Training

In [28]:
re_weight = True

if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None
    
early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = "model/lstm.h5"
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train], label_train, \
        validation_data=([data_1_val, data_2_val], label_val), \
        epochs=1, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

Train on 402290 samples, validate on 2000 samples
Epoch 1/1
 51200/402290 [==>...........................] - ETA: 1046s - loss: 0.6957 - acc: 0.5973

KeyboardInterrupt: 

# Making Submission

In [None]:
preds = model.predict([test_data_1, test_data_2], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=8192, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':df_test["test_id"], 'is_duplicate':preds.ravel()})
submission.to_csv("submission/basic_lstm.csv", index=False)