In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GlobalAveragePooling1D, Lambda
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization.batch_normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K

In [2]:
# Notebook configuration: file names, tokenisation limits and model hyper-parameters

# data_clean_type picks which pre-processed variant of the CSV files is loaded.
# It is appended verbatim to the file names below, so it must be either the empty
# string (no characters at all) or ONE leading space followed by one of these
# suffixes, parentheses included:
#   (cleaned)
#   (cleaned)(hyper_cleaned)
#   (cleaned)(hyper_cleaned)(punctuation_removed)
#   (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)
#   (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(words_shortened)
#   (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)
#   (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)(words_shortened)
data_clean_type = " (cleaned)"

EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin'
Train_Data_File = ''.join(['train_with_features', data_clean_type, '.csv'])
Test_Data_File = ''.join(['test_with_features', data_clean_type, '.csv'])

Max_Sequence_Length = 60   # every question is padded / truncated to this many tokens
# Vocabulary cap handed to the tokenizer. Original author's note: there are about
# 201000 unique words in the training dataset, so 200000 is enough for tokenization.
Max_Num_Words = 200000
Embedding_Dim = 300
Validation_Split_Ratio = 0.2

# Randomly drawn per run. The draw order (two randint calls, then two rand calls)
# determines which values come out of NumPy's global random stream.
Num_Lstm = np.random.randint(175, 275)
Num_Dense = np.random.randint(100, 150)
Rate_Drop_Lstm = 0.15 + np.random.rand() * 0.25
Rate_Drop_Dense = 0.15 + np.random.rand() * 0.25

# Short tag summarising the four values drawn above
Lstm_Struc = 'lstm_{:d}_{:d}_{:.2f}_{:.2f}'.format(
    Num_Lstm, Num_Dense, Rate_Drop_Lstm, Rate_Drop_Dense
)
print(Lstm_Struc)

act_f = 'relu'
# Whether to re-weight classes to fit the 17.4% share in the test set
re_weight = False
# If True, the extra hand-crafted features are used in addition to the leaky
# features; if False, only the leaky features are used.
use_more_features = False

lstm_178_124_0.31_0.26


In [3]:
# Load the pre-trained word2vec vectors from the binary file named in EMBEDDING_FILE
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
num_word_vectors = len(word2vec)
print('Found %s word vectors of word2vec' % num_word_vectors)

Indexing word vectors
Found 3000000 word vectors of word2vec


In [4]:
# Read the train / test CSV files and pull out question texts, labels and ids
print('Processing text dataset')

# Training data: rows with any missing value are dropped before the columns are read
df_train = pd.read_csv(Train_Data_File, encoding='utf-8').dropna()
train_texts_1, train_texts_2, train_labels = (
    df_train[column].tolist()
    for column in ('question1', 'question2', 'is_duplicate')
)

# Test data: same treatment, but the id column is kept instead of the label
df_test = pd.read_csv(Test_Data_File, encoding='utf-8').dropna()
test_texts_1, test_texts_2, test_ids = (
    df_test[column].tolist()
    for column in ('question1', 'question2', 'test_id')
)

Processing text dataset


In [5]:
# Fit one tokenizer on every question (train and test) so all four text lists
# share the same word -> index mapping
tokenizer = Tokenizer(num_words=Max_Num_Words)
all_texts = train_texts_1 + train_texts_2 + test_texts_1 + test_texts_2
tokenizer.fit_on_texts(all_texts)

train_sequences_1, train_sequences_2, test_sequences_1, test_sequences_2 = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_texts_1, train_texts_2, test_texts_1, test_texts_2)
)

word_index = tokenizer.word_index
print('{} unique tokens are found'.format(len(word_index)))


def pad_to_max_length(sequences):
    """Pad / truncate every sequence to Max_Sequence_Length tokens."""
    return pad_sequences(sequences, maxlen=Max_Sequence_Length)


# Training tensors
train_data_1 = pad_to_max_length(train_sequences_1)
train_data_2 = pad_to_max_length(train_sequences_2)
train_labels = np.array(train_labels)
print('Shape of train data tensor:', train_data_1.shape)
print('Shape of train labels tensor:', train_labels.shape)

# Test tensors
test_data_1 = pad_to_max_length(test_sequences_1)
test_data_2 = pad_to_max_length(test_sequences_2)
test_ids = np.array(test_ids)
print('Shape of test data tensor:', test_data_2.shape)
print('Shape of test ids tensor:', test_ids.shape)

85496 unique tokens are found
Shape of train data tensor: (399922, 60)
Shape of train labels tensor: (399922,)
Shape of test data tensor: (4290, 60)
Shape of test ids tensor: (4290,)


In [6]:
# Leaky features: the same three columns are taken from the train and test frames
leak_columns = ['q1_q2_intersect', 'q1_freq', 'q2_freq']
leaks = df_train[leak_columns]
test_leaks = df_test[leak_columns]

# Standardise both sets with a scaler fitted on train and test rows stacked together
ss = StandardScaler().fit(np.vstack((leaks, test_leaks)))
leaks, test_leaks = ss.transform(leaks), ss.transform(test_leaks)



In [7]:
# Optional extra feature input (original note: optimal features returned by XGBoost).
# Only prepared when use_more_features is True.
if use_more_features:
    # One shared column list, so the train and test matrices cannot drift apart
    extra_feature_columns = [
        'word_match_share',
        'tfidf_word_match',
        'jaccard',
        'total_unique_words',
        'total_unq_words_stop',
        'wc_diff',
        'wc_ratio',
        'wc_diff_unique',
        'wc_ratio_unique',
        'same_start_word',
        'char_diff',
        'char_diff_unique_stop',
        'q1_to_q2_wc_ratio_unique',
        'q1_to_q2_char_diff',
        'q1_to_q2_char_diff_unique_stop',
        'word_match_share_alternative_stop',
        'common_words_alternative_stop',
        'total_unq_words_alternative_stop',
        'wc_diff_unique_alternative_stop',
        'char_diff_unique_alternative_stop',
        'q1_to_q2_wc_diff_unique_alternative_stop',
        'q1_to_q2_wc_ratio_unique_alternative_stop',
        'q1_to_q2_char_diff_unique_alternative_stop',
    ]
    extra_features = df_train[extra_feature_columns]
    extra_features_test = df_test[extra_feature_columns]

    # Standardise with a scaler fitted on train and test rows stacked together
    ss = StandardScaler().fit(np.vstack((extra_features, extra_features_test)))
    extra_features = ss.transform(extra_features)
    extra_features_test = ss.transform(extra_features_test)

In [8]:
# Create embedding matrix for embedding layer: row i holds the word2vec vector of
# the token whose tokenizer index is i; tokens missing from word2vec stay all-zero.
print('Preparing embedding matrix')

# +1 because the Keras Tokenizer never assigns index 0 to a word.
num_words = min(Max_Num_Words, len(word_index))+1

embedding_matrix = np.zeros((num_words, Embedding_Dim))
for word, i in word_index.items():
    # word_index lists every token seen during fitting, including those beyond the
    # Max_Num_Words cap. Such indices have no row in the matrix, so skip them
    # instead of failing with an IndexError on large vocabularies.
    if i >= num_words:
        continue
    if word in word2vec:
        embedding_matrix[i] = word2vec.get_vector(word)
# Counts rows whose components sum to zero (includes the unused row 0)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 37372


In [9]:
# Train / validation split over a random permutation of the training rows
n_samples = len(train_data_1)
split_at = int(n_samples*(1-Validation_Split_Ratio))
perm = np.random.permutation(n_samples)
idx_train, idx_val = perm[:split_at], perm[split_at:]

# Training portion. Every pair is used twice, once as (q1, q2) and once as
# (q2, q1), so the per-pair features and labels are simply repeated.
data_1_train = np.concatenate([train_data_1[idx_train], train_data_2[idx_train]], axis=0)
data_2_train = np.concatenate([train_data_2[idx_train], train_data_1[idx_train]], axis=0)
leaks_train = np.tile(leaks[idx_train], (2, 1))
if use_more_features:
    feature_train = np.tile(extra_features[idx_train], (2, 1))
labels_train = np.tile(train_labels[idx_train], 2)

# Validation portion, built the same way
data_1_val = np.concatenate([train_data_1[idx_val], train_data_2[idx_val]], axis=0)
data_2_val = np.concatenate([train_data_2[idx_val], train_data_1[idx_val]], axis=0)
leaks_val = np.tile(leaks[idx_val], (2, 1))
if use_more_features:
    feature_val = np.tile(extra_features[idx_val], (2, 1))
labels_val = np.tile(train_labels[idx_val], 2)

# Per-sample validation weights: all ones unless class re-weighting is enabled.
# NOTE(review): these two constants differ slightly from the class_weight values
# used when the model is fitted (1.309028344 / 0.472001959) - confirm which pair
# is intended.
if re_weight:
    weight_val = np.where(labels_val == 0, 1.309033281, 0.471544715)
else:
    weight_val = np.ones(len(labels_val))

In [10]:
# Siamese CNN: both questions go through the same frozen embedding layer and the
# same six Conv1D layers; the pooled outputs are compared and combined with the
# leaky (and optionally the extra) features before a small MLP.

# The embedding layer containing the word vectors (pre-trained, not trainable)
emb_layer = Embedding(
    input_dim=num_words,
    output_dim=Embedding_Dim,
    weights=[embedding_matrix],
    input_length=Max_Sequence_Length,
    trainable=False
)

# 1D convolutions that can iterate over the word vectors (kernel sizes 1 to 6)
conv1 = Conv1D(filters=128, kernel_size=1, padding='same', activation='relu')
conv2 = Conv1D(filters=128, kernel_size=2, padding='same', activation='relu')
conv3 = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')
conv4 = Conv1D(filters=128, kernel_size=4, padding='same', activation='relu')
conv5 = Conv1D(filters=32, kernel_size=5, padding='same', activation='relu')
conv6 = Conv1D(filters=32, kernel_size=6, padding='same', activation='relu')
conv_layers = [conv1, conv2, conv3, conv4, conv5, conv6]

# Define inputs. The length must match the padding length and the embedding
# layer's input_length, so it is taken from the same constant.
seq1 = Input(shape=(Max_Sequence_Length,))
seq2 = Input(shape=(Max_Sequence_Length,))

# Run inputs through embedding
emb1 = emb_layer(seq1)
emb2 = emb_layer(seq2)

# Run through CONV + GAP layers: each shared convolution is applied to both
# questions, and each result is averaged over the sequence axis.
pooled_a, pooled_b = [], []
for conv in conv_layers:
    pooled_a.append(GlobalAveragePooling1D()(conv(emb1)))
    pooled_b.append(GlobalAveragePooling1D()(conv(emb2)))

mergea = concatenate(pooled_a)
mergeb = concatenate(pooled_b)

# Width of each merged vector: 4 * 128 + 2 * 32 filters
merged_dim = sum(conv.filters for conv in conv_layers)

# We take the explicit absolute difference between the two sentences.
# Furthermore we take the element-wise product to get a different measure of equalness.
diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(merged_dim,))([mergea, mergeb])
mul = Lambda(lambda x: x[0] * x[1], output_shape=(merged_dim,))([mergea, mergeb])

# Add the magic (leaky) features
magic_input = Input(shape=(leaks.shape[1],))
magic_dense = BatchNormalization()(magic_input)
magic_dense = Dense(64, activation='relu')(magic_dense)

# Inputs / tensors shared by both configurations
model_inputs = [seq1, seq2, magic_input]
merge_parts = [diff, mul, magic_dense]
train_inputs = [data_1_train, data_2_train, leaks_train]
val_inputs = [data_1_val, data_2_val, leaks_val]

# Optionally add the extra hand-crafted features as a fourth input branch
if use_more_features:
    feature_input = Input(shape=(extra_features.shape[1],))
    feature_dense = BatchNormalization()(feature_input)
    feature_dense = Dense(128, activation='relu')(feature_dense)

    model_inputs.append(feature_input)
    merge_parts.append(feature_dense)
    train_inputs.append(feature_train)
    val_inputs.append(feature_val)

# Merge the feature branches with the difference / product layers
merge = concatenate(merge_parts)

if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

# The MLP that determines the outcome
x = Dropout(0.2)(merge)
x = BatchNormalization()(x)
x = Dense(300, activation='relu')(x)

x = Dropout(0.2)(x)
x = BatchNormalization()(x)
pred = Dense(1, activation='sigmoid')(x)

model = Model(inputs=model_inputs, outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# Early stopping and checkpointing both track validation accuracy, so the weights
# restored below are the ones that produced the reported best score.
early_stopping = EarlyStopping(monitor='val_acc', mode='max', patience=6)
bst_model_path = Lstm_Struc + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_acc', mode='max',
                                   save_best_only=True, save_weights_only=True)

hist = model.fit(train_inputs, labels_train,
                 validation_data=(val_inputs, labels_val, weight_val),
                 epochs=200, batch_size=2048, shuffle=True,
                 class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

# Restore the best weights saved by the checkpoint callback
model.load_weights(bst_model_path)
# Best validation accuracy is the maximum over epochs (higher is better)
bst_val_score = max(hist.history['val_acc'])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200


In [11]:
# Predict on the test set
print('Making the submission')

# Feature inputs that follow the two question tensors
feature_inputs = [test_leaks]
if use_more_features:
    feature_inputs.append(extra_features_test)

# Average the predictions obtained with the questions in both orders
preds = model.predict([test_data_1, test_data_2] + feature_inputs, batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1] + feature_inputs, batch_size=8192, verbose=1)
preds /= 2

Making the submission


In [12]:
# Convert predicted probabilities to binary predictions: below 0.5 -> 0, otherwise 1
binary_preds = np.where(preds[:, 0] < 0.5, 0, 1)
sub_result = binary_preds.tolist()      # flat list of 0/1 predictions
result = binary_preds.reshape(1, -1)    # same predictions as a single-row array

# Get the accuracy on the test data.
# The comparison is positional and vectorised; the label column is converted to an
# array once instead of being rebuilt as a list on every loop iteration.
true_values = df_test["is_duplicate (Ture Value)"]

score = int(np.count_nonzero(binary_preds == true_values.values))
accuracy = score / len(sub_result)
print("Accuracy on test data: {}%".format(round(accuracy*100, 3)))

# Write the predictions next to their test ids.
# NOTE(review): the literal "(accuracy: + " puts a stray "+ " (and a ":") into the
# file name; it is kept unchanged here so existing output names stay the same.
submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':result.ravel()})
submission.to_csv("Word2Vec + CNN_with_features" + data_clean_type + "(accuracy: + " + str(round(accuracy*100, 3)) + ")" + ".csv", index=False)

Accuracy on test data: 88.974%
