In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports

In [None]:
import os
import re
import csv
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import codecs
from string import punctuation
from collections import defaultdict
# from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Activation, LSTM, Lambda
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
# from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalAveragePooling1D
import keras.backend as K

# Settings

In [None]:
PATH = '/content/drive/MyDrive/Colab Notebooks/skillfactory/Project-Final/'
PATH_DATA = PATH + 'data/'
embedding_file = PATH_DATA + 'glove.840B.300d.txt'
train_data_file = PATH_DATA + 'train.csv'
test_data_file = PATH_DATA + 'test.csv'
max_sequence_length = 60
max_num_words = 200_000 # There are about 201000 unique words in training dataset, 200000 is enough for tokenization
embedding_dim = 300
validation_split_ratio = 0.1
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
lstm_name = 'lstm_{:d}_{:d}_{:.2f}_{:.2f}'.format(num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)
print(lstm_name)

act_f = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.4% share in test set
np.random.seed(21)

lstm_185_135_0.32_0.37


# Functions

In [None]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        text = [w for w in text if not w in stop_words]
    
    text = " ".join(text)
    
    # Remove punctuation from text
    # text = "".join([c for c in text if c not in punctuation])

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # text = re.sub(r"\0s", "0", text) # It doesn't make sense to me
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

# Create word embedding dictionary
from 'glove.840B.300d.txt'

In [None]:
print('Create word embedding dictionary')

embeddings_index = {}
f = open(embedding_file, encoding='utf-8')

# for line in tqdm(f):
for line in f:
    values = line.split()
    # word = values[0]
    word = ''.join(values[:-300])   
    # coefs = np.asarray(values[1:], dtype='float32')
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print(f'Found {len(embeddings_index)} word vectors of glove.')

Create word embedding dictionary
Found 2195892 word vectors of glove.


# Process text in dataset

In [None]:
print('Processing text in dataset')

# load data and process with text_to_wordlist
train_texts_1 = [] 
train_texts_2 = []
train_labels = []

df_train = pd.read_csv(train_data_file, encoding='utf-8')
# df_train = df_train.sample(5000) # train data sample to test code
df_train = df_train.fillna('empty')
train_q1 = df_train['question1'].values
train_q2 = df_train['question2'].values
train_labels = df_train['is_duplicate'].values

Processing text in dataset


In [None]:
for text in train_q1:
    train_texts_1.append(text_to_wordlist(text, remove_stopwords=False, stem_words=False))
    
for text in train_q2:
    train_texts_2.append(text_to_wordlist(text, remove_stopwords=False, stem_words=False))

'''
with open(Train_Data_File, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader) # Skip header row
    for values in reader:
        train_texts_1.append(text_to_wordlist(values[3], remove_stopwords=False, stem_words=False))
        train_texts_2.append(text_to_wordlist(values[4], remove_stopwords=False, stem_words=False))
        train_labels.append(int(values[5]))
'''
print(f'{len(train_texts_1)} texts are found in train.csv')

404290 texts are found in train.csv


In [None]:
test_texts_1 = []
test_texts_2 = []
test_ids = []

df_test = pd.read_csv(test_data_file, encoding='utf-8')
# df_test = df_test.sample(5000) # test data sample to test code
df_test = df_test.fillna('empty')
test_q1 = df_test['question1'].values
test_q2 = df_test['question2'].values
test_ids = df_test['test_id'].values

'''
with open(Test_Data_File, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1], remove_stopwords=False, stem_words=False))
        test_texts_2.append(text_to_wordlist(values[2], remove_stopwords=False, stem_words=False))
        test_ids.append(values[0])
'''

  exec(code_obj, self.user_global_ns, self.user_ns)


"\nwith open(Test_Data_File, encoding='utf-8') as f:\n    reader = csv.reader(f, delimiter=',')\n    header = next(reader)\n    for values in reader:\n        test_texts_1.append(text_to_wordlist(values[1], remove_stopwords=False, stem_words=False))\n        test_texts_2.append(text_to_wordlist(values[2], remove_stopwords=False, stem_words=False))\n        test_ids.append(values[0])\n"

In [None]:
for text in test_q1:
    test_texts_1.append(text_to_wordlist(text, remove_stopwords=False, stem_words=False))
    
for text in test_q2:
    test_texts_2.append(text_to_wordlist(text, remove_stopwords=False, stem_words=False))

print(f'{len(test_texts_1)} texts are found in test.csv')

3563475 texts are found in test.csv


# Tokenize words in all sentences

In [None]:
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train_texts_1 + train_texts_2 + test_texts_1 + test_texts_2)

train_sequences_1 = tokenizer.texts_to_sequences(train_texts_1)
train_sequences_2 = tokenizer.texts_to_sequences(train_texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print(f'{len(word_index)} unique tokens are found')

120499 unique tokens are found


In [None]:
# pad all train with Max_Sequence_Length
train_data_1 = pad_sequences(train_sequences_1, maxlen=max_sequence_length)
train_data_2 = pad_sequences(train_sequences_2, maxlen=max_sequence_length)
# train_labels = np.array(train_labels)
print(f'Shape of train data tensor: {train_data_1.shape}')
print(f'Shape of train labels tensor: {train_labels.shape}')

# pad all test with Max_Sequence_Length
test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)
# test_ids = np.array(test_ids)
print(f'Shape of test data tensor: {test_data_2.shape}')
print(f'Shape of test ids tensor:{test_ids.shape}')

Shape of train data tensor: (404290, 60)
Shape of train labels tensor: (404290,)
Shape of test data tensor: (3563475, 60)
Shape of test ids tensor:(3563475,)


# Leaky features

In [None]:
questions = pd.concat([df_train[['question1', 'question2']], \
                        df_test[['question1', 'question2']]], axis=0).reset_index(drop='index')
q_dict = defaultdict(set)
for i in range(questions.shape[0]):
    q_dict[questions.question1[i]].add(questions.question2[i])
    q_dict[questions.question2[i]].add(questions.question1[i])

In [None]:
def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

def q1_freq(row):
    return(len(q_dict[row['question1']]))
    
def q2_freq(row):
    return(len(q_dict[row['question2']]))

df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1)

df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1)
df_test['q1_freq'] = df_test.apply(q1_freq, axis=1)
df_test['q2_freq'] = df_test.apply(q2_freq, axis=1)

In [None]:
leaks = df_train[['q1_q2_intersect', 'q1_freq', 'q2_freq']]
test_leaks = df_test[['q1_q2_intersect', 'q1_freq', 'q2_freq']]

In [None]:
ss = StandardScaler()
ss.fit(np.vstack((leaks, test_leaks)))
leaks = ss.transform(leaks)
test_leaks = ss.transform(test_leaks)

  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


# Create embedding matrix for embedding layer

In [None]:
print('Preparing embedding matrix')
num_words = min(max_num_words, len(word_index))+1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(f'Null word embeddings: {np.sum(np.sum(embedding_matrix, axis=1) == 0)}')

Preparing embedding matrix
Null word embeddings: 33233


# Train Validation split

In [None]:
perm = np.random.permutation(len(train_data_1))
idx_train = perm[:int(len(train_data_1)*(1-validation_split_ratio))]
idx_val = perm[int(len(train_data_1)*(1-validation_split_ratio)):]

data_1_train = np.vstack((train_data_1[idx_train], train_data_2[idx_train]))
data_2_train = np.vstack((train_data_2[idx_train], train_data_1[idx_train]))
leaks_train = np.vstack((leaks[idx_train], leaks[idx_train]))
labels_train = np.concatenate((train_labels[idx_train], train_labels[idx_train]))

data_1_val = np.vstack((train_data_1[idx_val], train_data_2[idx_val]))
data_2_val = np.vstack((train_data_2[idx_val], train_data_1[idx_val]))
leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))
labels_val = np.concatenate((train_labels[idx_val], train_labels[idx_val]))

In [None]:
weight_val = np.ones(len(labels_val))
if re_weight:
    weight_val *= 0.471544715
    weight_val[labels_val==0] = 1.309033281

# The embedding layer containing the word vectors

In [None]:
'''
emb_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=Max_Sequence_Length,
    trainable=False
)
'''

emb_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False
)    

# LSTM layer
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

# Define inputs
seq1 = Input(shape=(max_sequence_length,), dtype='int32')
seq2 = Input(shape=(max_sequence_length,), dtype='int32')

# Run inputs through embedding
emb1 = emb_layer(seq1)
emb2 = emb_layer(seq2)

# Run through LSTM layers
lstm_a = lstm_layer(emb1)
# glob1a = GlobalAveragePooling1D()(lstm_a)
lstm_b = lstm_layer(emb2)
# glob1b = GlobalAveragePooling1D()(lstm_b)

magic_input = Input(shape=(leaks.shape[1],))
# magic_dense = BatchNormalization()(magic_input)
magic_dense = Dense(int(num_dense/2), activation=act_f)(magic_input)

merged = concatenate([lstm_a, lstm_b, magic_dense])
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

merged = Dense(num_dense, activation=act_f)(merged)
merged = BatchNormalization()(merged)
merged = Dropout(rate_drop_dense)(merged)

preds = Dense(1, activation='sigmoid')(merged)



In [None]:
# Add class weight
if re_weight:
    class_weight = {0: 1.309033281, 1: 0.471544715}
else:
    class_weight = None

# Train the model

In [None]:
model = Model(inputs=[seq1, seq2, magic_input], outputs=preds)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])

In [None]:
# Set early stopping (large patience should be useful)
early_stopping =EarlyStopping(monitor='val_loss', patience=10)
bst_model_path = PATH + 'result/' + lstm_name + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

In [None]:
hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

Epoch 1/200
Epoch 2/200
Epoch 3/200

In [None]:
model.load_weights(bst_model_path) # store model parameters in .h5 file
bst_val_score = min(hist.history['val_loss'])

# Make the submission

In [None]:
preds = model.predict([test_data_1, test_data_2, test_leaks], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1, test_leaks], batch_size=8192, verbose=1)
preds /= 2



In [62]:
submission = pd.DataFrame({'test_id': test_ids,
                           'is_duplicate': preds.ravel()})
submission[:2345796].to_csv(PATH+ 'result/{:.4f}_'.format(bst_val_score)+lstm_name+'_with_GloVe_Embedding.csv', index=False)