In [1]:
'''
Example of an LSTM model with GloVe embeddings along with magic features

Tested under Keras 2.0 with Tensorflow 1.0 backend

Single model may achieve LB scores at around 0.18+, average ensembles can get 0.17+
'''

########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler

import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

########################################
## set directories and parameters
########################################
BASE_DIR = './'
EMBEDDING_FILE = BASE_DIR + 'glove.840B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + './examples_v2.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

Using TensorFlow backend.


In [3]:
#these should accomodate the context and abstruct

'''
Example of an LSTM model with GloVe embeddings along with magic features

Tested under Keras 2.0 with Tensorflow 1.0 backend

Single model may achieve LB scores at around 0.18+, average ensembles can get 0.17+
'''

########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler

import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

########################################
## set directories and parameters
########################################
BASE_DIR = './'
EMBEDDING_FILE = BASE_DIR + '../../embeddings/glove.840B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'examples_v2.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

########################################
## index word vectors
########################################
print('Indexing word vectors')

embeddings_index = {}
f = open(EMBEDDING_FILE)
count = 0
for line in f:
    values = line.split()
    count+=1
    #print(count)
    #if (count != 52344) and (count!=128262) and (count!=151103)and (count!=200669):
    word = values[0]
    if (not '@' in line) and (not '.' in word)and (not '.com' in line):
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
f.close()

print('Found %d word vectors of glove.' % len(embeddings_index))

########################################
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

Indexing word vectors
Found 2055631 word vectors of glove.
Processing text dataset


In [4]:
phrases = []
contexts = [] 
abstructs = []
labels = []
pages = []
#i=0
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        contexts.append(text_to_wordlist(values[3]))
        abstructs.append(text_to_wordlist(values[6]))
        labels.append(int(values[-1]))
        phrases.append(text_to_wordlist(values[1]))
        pages.append(text_to_wordlist(values[5]))
        #if i > 4 :
        #    break
        #i+=1
print('Found %s texts in train.csv' % len(contexts))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(contexts + abstructs)

sequences_1 = tokenizer.texts_to_sequences(contexts)
sequences_2 = tokenizer.texts_to_sequences(abstructs)
#test_sequences_1 = tokenizer.texts_to_sequences(test_contexts)
#test_sequences_2 = tokenizer.texts_to_sequences(test_abstructs)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 214604 texts in train.csv
Found 183299 unique tokens


In [5]:
len(embeddings_index)

2055631

In [6]:


data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

'''
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)
'''


Shape of data tensor: (214604, 30)
Shape of label tensor: (214604,)


'\ntest_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)\ntest_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)\ntest_ids = np.array(test_ids)\n'

In [7]:
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 72416


In [8]:
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

In [9]:
merged = concatenate([x1, y1])
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

preds = Dense(1, activation='sigmoid')(merged)

In [15]:
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
#leaks_train = np.vstack((leaks[idx_train], leaks[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
#leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

In [18]:
model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = 'lstm_initial' + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, ], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        callbacks=[early_stopping, model_checkpoint])

lstm_176_126_0.27_0.33
Train on 386286 samples, validate on 42922 samples
Epoch 1/200
 12288/386286 [..............................] - ETA: 859s - loss: nan - acc: 0.0000e+00

KeyboardInterrupt: 

In [27]:
np.save('context_train.npy',data_1_train)
np.save('abstruct_train.npy',data_2_train)
np.save('context_val.npy',data_1_val)
np.save('abstruct_val.npy',data_2_val)
np.save('labels_train.npy',labels_train)
np.save('labels_val.npy',labels_val)

In [None]:
# model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

In [None]:
preds = model.predict([test_data_1, test_data_2, test_leaks], batch_size=8192, verbose=1)

In [None]:
submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

In [29]:
import xgboost as xgb

dtrain = xgb.DMatrix(np.concatenate([data_1_train,data_2_train],axis=1),label=labels_train)
dval = xgb.DMatrix(np.concatenate([data_1_val,data_2_val],axis=1),label=labels_val)

In [None]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['silent']=0
params['eta'] = 0.02
params['max_depth'] = 4

watchlist = [(dtrain, 'train'), (dval, 'valid')]

bst = xgb.train(params, dtrain, 400, watchlist, early_stopping_rounds=50, verbose_eval=1)

[0]	train-auc:0.528155	valid-auc:0.522039
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.528167	valid-auc:0.522114
[2]	train-auc:0.528407	valid-auc:0.522114
[3]	train-auc:0.528377	valid-auc:0.522025
[4]	train-auc:0.528357	valid-auc:0.521981
[5]	train-auc:0.576319	valid-auc:0.577272
[6]	train-auc:0.576933	valid-auc:0.576736
[7]	train-auc:0.577125	valid-auc:0.576673
[8]	train-auc:0.587648	valid-auc:0.579242
[9]	train-auc:0.591858	valid-auc:0.58545
[10]	train-auc:0.593272	valid-auc:0.586497
[11]	train-auc:0.593462	valid-auc:0.587756
[12]	train-auc:0.598837	valid-auc:0.58898
[13]	train-auc:0.598925	valid-auc:0.588045
[14]	train-auc:0.597864	valid-auc:0.587882
[15]	train-auc:0.601871	valid-auc:0.590087
[16]	train-auc:0.603362	valid-auc:0.591167
[17]	train-auc:0.603145	valid-auc:0.591233
[18]	train-auc:0.603354	valid-auc:0.591194
[19]	train-auc:0.603883	valid-auc:0.593086
[20]	trai

In [None]:
bst.predict