In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python

# Any results you write to the current directory are saved as output.

# Import package

import os
import re
import csv
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import codecs

from string import punctuation
from collections import defaultdict
# from tqdm import tqdm

from sklearn.preprocessing import StandardScaler

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Activation, LSTM, Lambda
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization.batch_normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
# from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalAveragePooling1D
import keras.backend as K

from BertEmbeddings import BertEmbeddings

In [2]:
# Notebook configuration: input files, tokenisation limits and model hyper-parameters

# data_clean_type picks which pre-processed CSV variant gets loaded. It is either the empty
# string, or ONE leading space followed by one of these tags (keep the parentheses):
#   (cleaned)
#   (cleaned)(hyper_cleaned)
#   (cleaned)(hyper_cleaned)(punctuation_removed)
#   (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)
#   (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(words_shortened)
#   (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)
#   (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)(words_shortened)
data_clean_type = " (cleaned)"

# The tag is spliced into both file names just before the extension
Train_Data_File = 'train_with_features{}.csv'.format(data_clean_type)
Test_Data_File = 'test_with_features{}.csv'.format(data_clean_type)

Max_Sequence_Length = 60     # every question is padded/truncated to this many tokens
Max_Num_Words = 200000       # vocabulary cap handed to the tokenizer
Embedding_Dim = 1024         # width of one BERT-large embedding vector
Validation_Split_Ratio = 0.2 # fraction of the training rows held out for validation

# Layer sizes and dropout rates are drawn at random on every run (no seed is set)
Num_Lstm = np.random.randint(175, 275)
Num_Dense = np.random.randint(100, 150)
Rate_Drop_Lstm = 0.15 + np.random.rand() * 0.25
Rate_Drop_Dense = 0.15 + np.random.rand() * 0.25

# Tag describing the sampled architecture; printed here and reused to name the checkpoint file
Lstm_Struc = 'lstm_%d_%d_%.2f_%.2f' % (Num_Lstm, Num_Dense, Rate_Drop_Lstm, Rate_Drop_Dense)
print(Lstm_Struc)

act_f = 'relu'             # activation of the dense layers
re_weight = False          # True: re-weight classes to match the 17.4% positive share of the test set
use_more_features = False  # True: also feed the TF-IDF / unigram features; False: leaky features only
fine_tuned_model = False   # True: fine-tuned BERT checkpoint; False: original bert-large-uncased from Google

lstm_237_113_0.36_0.26


In [3]:
# Read the training and test CSVs and extract the question / label / id columns as lists
print('Processing text dataset')

# Training rows with any missing value are dropped before the columns are pulled out
df_train = pd.read_csv(Train_Data_File, encoding='utf-8').dropna()
train_texts_1, train_texts_2, train_labels = (
    df_train[column].tolist() for column in ('question1', 'question2', 'is_duplicate')
)

# Same treatment for the test set; test_id replaces the label column
df_test = pd.read_csv(Test_Data_File, encoding='utf-8').dropna()
test_texts_1, test_texts_2, test_ids = (
    df_test[column].tolist() for column in ('question1', 'question2', 'test_id')
)

Processing text dataset


In [4]:
# Fit a single tokenizer on every question (train and test) and turn each question
# into a fixed-length integer sequence
tokenizer = Tokenizer(num_words=Max_Num_Words)
tokenizer.fit_on_texts(train_texts_1 + train_texts_2 + test_texts_1 + test_texts_2)

# Encode the four question lists in the same order they were fed to the tokenizer
train_sequences_1, train_sequences_2, test_sequences_1, test_sequences_2 = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_texts_1, train_texts_2, test_texts_1, test_texts_2)
)

word_index = tokenizer.word_index
print('{} unique tokens are found'.format(len(word_index)))

# Training side: pad to Max_Sequence_Length and move the labels into an array
train_data_1, train_data_2 = (
    pad_sequences(sequences, maxlen=Max_Sequence_Length)
    for sequences in (train_sequences_1, train_sequences_2)
)
train_labels = np.array(train_labels)
print('Shape of train data tensor:', train_data_1.shape)
print('Shape of train labels tensor:', train_labels.shape)

# Test side: same padding, ids moved into an array
test_data_1, test_data_2 = (
    pad_sequences(sequences, maxlen=Max_Sequence_Length)
    for sequences in (test_sequences_1, test_sequences_2)
)
test_ids = np.array(test_ids)
print('Shape of test data tensor:', test_data_2.shape)
print('Shape of test ids tensor:', test_ids.shape)

85496 unique tokens are found
Shape of train data tensor: (399922, 60)
Shape of train labels tensor: (399922,)
Shape of test data tensor: (4290, 60)
Shape of test ids tensor: (4290,)


In [5]:
# Load the BERT embedder: the SQuAD fine-tuned checkpoint when fine_tuned_model is set,
# otherwise the plain whole-word-masking bert-large-uncased checkpoint.
bert_model_name = (
    'bert-large-uncased-whole-word-masking-finetuned-squad'
    if fine_tuned_model
    else 'bert-large-uncased-whole-word-masking'
)
bert_embeddings = BertEmbeddings(model_name=bert_model_name)

# Build word -> vector by embedding every vocabulary word on its own.
# An earlier variant stored the values of output[0]['embeddings_map'] instead; per the
# author's note that produced a tensor too large for a single computer, so the sum of
# four 'hidden_states' entries is used here.
# NOTE(review): [0] presumably selects the first token and [-1]..[-4] the last four
# layers -- confirm against the BertEmbeddings implementation.
embeddings_index = {}
for progress, word in enumerate(word_index, start=1):
    output = bert_embeddings([word])
    # Convert the hidden states to an array once per word instead of once per summand
    hidden_states = np.array(output[0]['hidden_states'])[0]
    # Explicit left-to-right sum keeps the floating-point result bit-identical to the
    # original four-term expression
    embeddings_index[word] = (
        hidden_states[-1] + hidden_states[-2] + hidden_states[-3] + hidden_states[-4]
    )
    if progress % 1000 == 0:
        print(str(progress) + " words embeded")

1000 words embeded
2000 words embeded
3000 words embeded
4000 words embeded
5000 words embeded
6000 words embeded
7000 words embeded
8000 words embeded
9000 words embeded
10000 words embeded
11000 words embeded
12000 words embeded
13000 words embeded
14000 words embeded
15000 words embeded
16000 words embeded
17000 words embeded
18000 words embeded
19000 words embeded
20000 words embeded
21000 words embeded
22000 words embeded
23000 words embeded
24000 words embeded
25000 words embeded
26000 words embeded
27000 words embeded
28000 words embeded
29000 words embeded
30000 words embeded
31000 words embeded
32000 words embeded
33000 words embeded
34000 words embeded
35000 words embeded
36000 words embeded
37000 words embeded
38000 words embeded
39000 words embeded
40000 words embeded
41000 words embeded
42000 words embeded
43000 words embeded
44000 words embeded
45000 words embeded
46000 words embeded
47000 words embeded
48000 words embeded
49000 words embeded
50000 words embeded
51000 wor

In [6]:
# Sanity check: show the shape of one embedding vector.
# 'the' is used when present (same output as before); the stop-word-removed dataset
# variants may not contain it, so fall back to any embedded word instead of raising KeyError.
sample_word = 'the' if 'the' in embeddings_index else next(iter(embeddings_index), None)
if sample_word is None:
    print('No word embeddings available')
else:
    print(embeddings_index[sample_word].shape)

(1024,)


In [7]:
# Standardise the three "leaky" columns of the training and test frames
leak_columns = ['q1_q2_intersect', 'q1_freq', 'q2_freq']
leaks = df_train[leak_columns]
test_leaks = df_test[leak_columns]

# A single scaler is fitted on the train and test rows stacked together,
# then applied to each set separately
ss = StandardScaler().fit(np.vstack((leaks, test_leaks)))
leaks, test_leaks = ss.transform(leaks), ss.transform(test_leaks)



In [8]:
# optimized_feature_array = [1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1]

In [9]:
# Optional hand-crafted feature input (column selection described by the author as the
# optimal set returned by XGBoost); only prepared when use_more_features is set
if use_more_features:
    # Previous selection, kept for reference:
    # ['word_match_share', 'tfidf_word_match_share', 'tfidf_word_match', 'unigrams_common_count', 'unigrams_common_ratio',
    #  'jaccard', 'common_words', 'common_words_stop', 'total_unique_words', 'total_unq_words_stop', 'wc_diff', 'wc_ratio',
    #  'wc_diff_unique', 'wc_ratio_unique', 'wc_diff_unique_stop', 'wc_ratio_unique_stop', 'same_start_word', 'char_diff',
    #  'char_ratio', 'char_diff_unique_stop']
    extra_feature_columns = [
        'word_match_share',
        'tfidf_word_match',
        'jaccard',
        'total_unique_words',
        'total_unq_words_stop',
        'wc_diff',
        'wc_ratio',
        'wc_diff_unique',
        'wc_ratio_unique',
        'same_start_word',
        'char_diff',
        'char_diff_unique_stop',
        'q1_to_q2_wc_ratio_unique',
        'q1_to_q2_char_diff',
        'q1_to_q2_char_diff_unique_stop',
        'word_match_share_alternative_stop',
        'common_words_alternative_stop',
        'total_unq_words_alternative_stop',
        'wc_diff_unique_alternative_stop',
        'char_diff_unique_alternative_stop',
        'q1_to_q2_wc_diff_unique_alternative_stop',
        'q1_to_q2_wc_ratio_unique_alternative_stop',
        'q1_to_q2_char_diff_unique_alternative_stop',
    ]
    # The same columns, in the same order, are taken from both frames
    extra_features = df_train[extra_feature_columns]
    extra_features_test = df_test[extra_feature_columns]

    # One scaler fitted on train and test rows stacked together, applied to each set
    ss = StandardScaler().fit(np.vstack((extra_features, extra_features_test)))
    extra_features = ss.transform(extra_features)
    extra_features_test = ss.transform(extra_features_test)

In [10]:
# Create the weight matrix for the embedding layer: row i holds the vector of the
# word whose tokenizer index is i; row 0 (padding) and unknown words stay all-zero.
print('Preparing embedding matrix')

# +1 because tokenizer indices start at 1 (index 0 is reserved for padding)
num_words = min(Max_Num_Words, len(word_index)) + 1

embedding_matrix = np.zeros((num_words, Embedding_Dim))
for word, i in word_index.items():
    # word_index lists every word seen, even beyond the Max_Num_Words cap;
    # skip indices that have no row in the matrix instead of raising IndexError
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Rows summing to exactly zero are counted as words without an embedding.
# The '{}' placeholder was missing, so the count was silently dropped from the message.
print('Null word embeddings: {}'.format(np.sum(np.sum(embedding_matrix, axis=1) == 0)))

Preparing embedding matrix
Null word embeddings: 


In [11]:
# Random train / validation split; every pair is used twice, once as (q1, q2) and once
# as (q2, q1), so features and labels are stacked twice to stay aligned
perm = np.random.permutation(len(train_data_1))
n_train = int(len(train_data_1) * (1 - Validation_Split_Ratio))
idx_train, idx_val = perm[:n_train], perm[n_train:]

# Training portion
q1_train, q2_train = train_data_1[idx_train], train_data_2[idx_train]
data_1_train = np.vstack((q1_train, q2_train))
data_2_train = np.vstack((q2_train, q1_train))
leaks_train = np.vstack((leaks[idx_train],) * 2)
if use_more_features:
    feature_train = np.vstack((extra_features[idx_train],) * 2)
labels_train = np.concatenate((train_labels[idx_train],) * 2)

# Validation portion, built the same way
q1_val, q2_val = train_data_1[idx_val], train_data_2[idx_val]
data_1_val = np.vstack((q1_val, q2_val))
data_2_val = np.vstack((q2_val, q1_val))
leaks_val = np.vstack((leaks[idx_val],) * 2)
if use_more_features:
    feature_val = np.vstack((extra_features[idx_val],) * 2)
labels_val = np.concatenate((train_labels[idx_val],) * 2)

# Per-sample validation weights: all ones unless class re-weighting is requested,
# in which case label 0 and every other label get their own constants
if re_weight:
    weight_val = np.where(labels_val == 0, 1.309033281, 0.471544715)
else:
    weight_val = np.ones(len(labels_val))

In [12]:
# Build the network graph: two question inputs share one frozen embedding layer and one
# LSTM; their encodings are concatenated with a dense projection of the leaky features
# (and, optionally, of the extra features) and fed to a dense classifier head.

# Embedding layer initialised with the precomputed word vectors and kept frozen
emb_layer = Embedding(
    input_dim=num_words,
    output_dim=Embedding_Dim,
    weights=[embedding_matrix],
    input_length=Max_Sequence_Length,
    trainable=False
)

# A single LSTM instance, so both questions are encoded with the same weights
lstm_layer = LSTM(Num_Lstm, dropout=Rate_Drop_Lstm, recurrent_dropout=Rate_Drop_Lstm)

# Define inputs: one padded token-id sequence per question
seq1 = Input(shape=(Max_Sequence_Length,), dtype='int32')
seq2 = Input(shape=(Max_Sequence_Length,), dtype='int32')

# Run inputs through the shared embedding
emb1 = emb_layer(seq1)
emb2 = emb_layer(seq2)

# Run both embedded questions through the shared LSTM
lstm_a = lstm_layer(emb1)
lstm_b = lstm_layer(emb2)

# Leaky-feature branch: batch-normalise, then project.
# The Dense layer previously consumed magic_input directly, which silently discarded the
# BatchNormalization output; it now consumes the normalised tensor, matching the
# extra-feature branch below.
magic_input = Input(shape=(leaks.shape[1],))
magic_dense = BatchNormalization()(magic_input)
magic_dense = Dense(int(Num_Dense/2), activation=act_f)(magic_dense)

# Branches that are always part of the merged representation
merge_branches = [lstm_a, lstm_b, magic_dense]

# Optional extra-feature branch, only created when those features were prepared
if use_more_features:
    feature_input = Input(shape=(extra_features.shape[1],))
    feature_dense = BatchNormalization()(feature_input)
    feature_dense = Dense(128, activation='relu')(feature_dense)
    merge_branches.append(feature_dense)

merged = concatenate(merge_branches)
merged = BatchNormalization()(merged)
merged = Dropout(Rate_Drop_Dense)(merged)

merged = Dense(Num_Dense, activation=act_f)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(Rate_Drop_Dense)(merged)

# Single sigmoid unit: one score in [0, 1] per question pair
preds = Dense(1, activation='sigmoid')(merged)

# Class weights used during training; None leaves both classes unweighted
if re_weight:
    class_weight = {0: 1.309033281, 1: 0.471544715}
else:
    class_weight = None



In [13]:
# Assemble, compile and train the model, then restore the checkpointed weights.

# Input lists are built once; the extra-feature tensors are appended only when enabled,
# so the model, the training call and the validation data cannot drift apart.
model_inputs = [seq1, seq2, magic_input]
train_inputs = [data_1_train, data_2_train, leaks_train]
val_inputs = [data_1_val, data_2_val, leaks_val]
if use_more_features:
    model_inputs.append(feature_input)
    train_inputs.append(feature_train)
    val_inputs.append(feature_val)

model = Model(inputs=model_inputs, outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

# Stop once validation accuracy has not improved for 6 epochs
early_stopping = EarlyStopping(monitor='val_acc', patience=6)
bst_model_path = Lstm_Struc + '.h5'
# NOTE(review): no monitor is passed here, so the checkpoint uses the callback's default
# metric rather than the 'val_acc' watched by early stopping -- confirm this is intended.
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

# NOTE(review): the recorded run of this cell ended with a GPU out-of-memory error during
# the first epoch at batch_size=2048; a smaller batch may be required on similar hardware.
hist = model.fit(
    train_inputs, labels_train,
    validation_data=(val_inputs, labels_val, weight_val),
    epochs=200, batch_size=2048, shuffle=True,
    class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

# Reload the weights written by the checkpoint callback
model.load_weights(bst_model_path)
# Accuracy is higher-is-better, so the best epoch is the maximum (min() gave the worst one)
bst_val_score = max(hist.history['val_acc'])

Epoch 1/200


ResourceExhaustedError:  OOM when allocating tensor with shape[2048,1024] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/lstm/while/body/_1/model/lstm/while/lstm_cell/dropout_3/random_uniform/RandomUniform}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_8737]

Function call stack:
train_function


In [None]:
# Score the test pairs in both question orders and average the two predictions
print('Making the submission')

# Inputs that do not depend on the question order
order_independent_inputs = [test_leaks]
if use_more_features:
    order_independent_inputs.append(extra_features_test)

# First pass: (question1, question2); second pass: (question2, question1)
preds = model.predict([test_data_1, test_data_2] + order_independent_inputs, batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1] + order_independent_inputs, batch_size=8192, verbose=1)
preds /= 2


In [None]:
# Turn the averaged scores into 0/1 predictions, measure accuracy against the true
# labels stored in the test frame, and write the submission CSV.

# Scores strictly below 0.5 become 0, everything else 1 (same rule as the former loop)
sub_result = np.where(preds[:, 0] < 0.5, 0, 1)
result = sub_result

# Ground-truth labels of the test rows (the column name is spelled this way in the CSV)
true_values = df_test["is_duplicate (Ture Value)"]

# Vectorised comparison; the previous loop rebuilt true_values.tolist() on every iteration
score = int(np.sum(sub_result == np.asarray(true_values)))
accuracy = score / len(sub_result)
print("Accuracy on test data: {}%".format(round(accuracy*100, 3)))

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':result.ravel()})
# NOTE(review): the file name still says "GloVe" although this notebook uses BERT embeddings,
# and contains a stray "+" after "accuracy:"; left unchanged in case other tooling expects it.
submission.to_csv("GloVe + LSTM_with_features" + data_clean_type + "(accuracy: + " + str(round(accuracy*100, 3)) + ")" + ".csv", index=False)