In [1]:
from __future__ import division
from __future__ import print_function

import datetime, time, json, os, math, pickle, sys
import glob
import re
from collections import defaultdict
from string import punctuation

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.models import load_model
from keras.layers import concatenate, Embedding, Dense, Input, Dropout, BatchNormalization, TimeDistributed, Lambda, Activation, LSTM, Flatten, Convolution1D, GRU, MaxPooling1D
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from keras import initializers
from keras import backend as K
from keras.optimizers import Adam

Using TensorFlow backend.


In [24]:
# --- Locations -------------------------------------------------------------
DATA_DIR = '../data/'
MODEL = 'Baseline'
CHECKPOINT_DIR = './checkpoint/'
LOG_DIR = './log/'
OUTPUT_DIR = './output/'

# The relative paths above only make sense when the notebook is run from the
# directory named after the model, so warn (but do not stop) otherwise.
if os.getcwd().rsplit('/', 1)[-1] != MODEL:
    print('WRONG MODEL DIR!!!')

# Create the working directories on first run.
for run_dir in (CHECKPOINT_DIR, LOG_DIR, OUTPUT_DIR):
    if not os.path.exists(run_dir):
        os.mkdir(run_dir)

# --- Hyperparameters -------------------------------------------------------
MAX_LEN = 36          # sequence length passed to pad_sequences
EMBEDDING_DIM = 300   # width of the embedding matrix
BATCH_SIZE = 256
VALID_FRAC = 0.95     # used by the split cell as the share of rows kept for training

def get_best_model(checkpoint_dir = CHECKPOINT_DIR):
    """Load the saved checkpoint with the lowest validation loss.

    Checkpoint files are expected to be named ``<prefix>-<val_loss>.hdf5``
    (the ModelCheckpoint pattern used in this notebook is
    ``weights.{epoch:03d}-{val_loss:.4f}.hdf5``). Files in ``checkpoint_dir``
    that do not follow this pattern are ignored.

    Args:
        checkpoint_dir: directory that holds the ``.hdf5`` checkpoints.

    Returns:
        A ``(model, model_name)`` tuple: the loaded Keras model and the file
        name (without directory) it was loaded from.

    Raises:
        ValueError: if no checkpoint with a parsable validation loss exists.
    """
    candidates = []
    for path in glob.glob(os.path.join(checkpoint_dir, '*.hdf5')):
        stem = os.path.splitext(os.path.basename(path))[0]
        try:
            # The validation loss is everything after the last '-'.
            candidates.append((float(stem.rsplit('-', 1)[-1]), path))
        except ValueError:
            # Not one of our checkpoints (no parsable loss in the name).
            continue

    if not candidates:
        raise ValueError('No checkpoint files found in ' + checkpoint_dir)

    _, best_path = min(candidates)
    print('Loading model from checkpoint file ' + best_path)
    model = load_model(best_path)
    model_name = os.path.basename(best_path)
    print('Loading model Done!')
    return (model, model_name)

In [3]:
# Read both CSVs and replace every missing value with the literal string
# 'empty' (fillna is applied to the whole frame, not only the question columns).
# To inspect the affected rows, look at pd.read_csv(...).isnull().any(axis=1)
# before the fill.
trainval_df = pd.read_csv(DATA_DIR+"train.csv").fillna('empty')
test_df = pd.read_csv(DATA_DIR+"test.csv").fillna('empty')

# fillna does not change the shape, so these are the raw file dimensions.
print(trainval_df.shape)
print(test_df.shape)

(404290, 6)
(2345796, 3)


In [25]:
# data cleaning: re.sub, rm punctuation, stop_words, stem_words

stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']
print('stop_words:', len(stop_words))

# # nltk.download("stopwords")
# stop_words = stopwords.words('english')
# print('stop_words:', len(stop_words))

# Rewrites whose patterns contain an apostrophe or hyphen. They must run
# BEFORE non-alphanumeric characters are blanked out, otherwise they can
# never match.
_PUNCTUATION_RULES = (
    (r"what's", ""),
    (r"What's", ""),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"I'm", "I am"),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"e-mail", "email"),
)

# Rewrites applied AFTER punctuation has been turned into single spaces
# (several patterns, e.g. " 9 11 " or " u s ", rely on that).
_WORD_RULES = (
    (r" m ", " am "),
    # "10k" -> "10000". (A pattern written as "\0k" is a NUL escape in Python
    # regexes and never matches, so the digit is captured explicitly.)
    (r"(\d)k\b", r"\g<1>000"),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    # "1990s" -> "1990"
    (r"0s\b", "0"),
    # Keep the surrounding spaces so neighbouring words are not glued together.
    (r" 9 11 ", " 911 "),
    (r"quikly", "quickly"),
    (r" usa ", " America "),
    (r" USA ", " America "),
    (r" u s ", " America "),
    (r" uk ", " England "),
    (r" UK ", " England "),
    (r"india", "India"),
    (r"china", "China"),
    (r"chinese", "Chinese"),
    (r"imrovement", "improvement"),
    (r"intially", "initially"),
    (r"quora", "Quora"),
    (r" dms ", " direct messages "),
    (r"demonitization", "demonetization"),
    (r"actived", "active"),
    (r"kms", " kilometers "),
    (r"KMs", " kilometers "),
    (r" cs ", " computer science "),
    (r" upvotes ", " up votes "),
    (r" iPhone ", " phone "),
    # "100rs" -> "100 rs"
    (r"(\d)rs\b", r"\g<1> rs"),
    (r"calender", "calendar"),
    # Whole word only: a bare "ios" also matches inside "scenarios", "curiosity", ...
    (r"\bios\b", "operating system"),
    (r"gps", "GPS"),
    (r"gst", "GST"),
    (r"programing", "programming"),
    (r"bestfriend", "best friend"),
    (r"dna", "DNA"),
    (r"III", "3"),
    (r"the US", "America"),
    (r"Astrology", "astrology"),
    (r"Method", "method"),
    (r"Find", "find"),
    (r"banglore", "Banglore"),
    (r" J K ", " JK "),
)


def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    """Clean one question string and return it as a single cleaned string.

    Steps, in order:
      1. expand contractions / "e-mail" while apostrophes and hyphens exist;
      2. replace every character outside ``[A-Za-z0-9]`` with a space and
         collapse runs of whitespace;
      3. apply the word-level spelling/abbreviation rewrites;
      4. optionally drop words found in the module-level ``stop_words``
         (case-sensitive membership test);
      5. optionally stem each word with the English Snowball stemmer.

    Case is not changed here.

    Args:
        text: the raw question text (a ``str``).
        remove_stop_words: drop words listed in ``stop_words`` when True.
        stem_words: stem the remaining words when True.

    Returns:
        The cleaned text as one space-joined string (not a list).

    Note:
        Cached ``*_df.pickle`` files in OUTPUT_DIR hold the output of whichever
        version of this function wrote them; delete them to re-clean the data
        after changing any rule.
    """
    for pattern, replacement in _PUNCTUATION_RULES:
        text = re.sub(pattern, replacement, text)

    # Everything that is not a letter or digit becomes a space. After this no
    # punctuation is left, so no separate punctuation filter is needed.
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)

    for pattern, replacement in _WORD_RULES:
        text = re.sub(pattern, replacement, text)

    # Optionally, remove stop words
    if remove_stop_words:
        text = " ".join(w for w in text.split() if w not in stop_words)

    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(w) for w in text.split())

    return text


# question to word list by data cleaning

def _load_or_clean(df, file_name):
    """Return ``df`` with cleaned ``question1_WL`` / ``question2_WL`` columns.

    If ``OUTPUT_DIR + file_name`` already exists it is loaded and returned
    as-is (``df`` is ignored). Otherwise both question columns are passed
    through ``text_to_wordlist``, the result is pickled to that path and
    returned. ``df`` itself is not modified.
    """
    path = OUTPUT_DIR+file_name
    if os.path.exists(path):
        print ('Loading from file '+file_name)
        return pd.read_pickle(path)

    print ('Generating file '+file_name)
    # Series.apply calls the cleaner once per value, without building a row
    # object for every record as DataFrame.apply(axis=1) does.
    cleaned = df.assign(
        question1_WL=df['question1'].apply(text_to_wordlist),
        question2_WL=df['question2'].apply(text_to_wordlist),
    )
    cleaned.to_pickle(path)
    return cleaned


trainval_df = _load_or_clean(trainval_df, 'trainval_df.pickle')
test_df = _load_or_clean(test_df, 'test_df.pickle')
    
# VALID_FRAC is the share of rows kept for training. The training row count is
# rounded up to a whole multiple of 1024; whatever is left becomes validation.
n_rows = trainval_df.shape[0]
n_train_rows = int(math.ceil(n_rows*VALID_FRAC/1024)*1024)
test_size = n_rows - n_train_rows
train_df, valid_df = train_test_split(
    trainval_df,
    test_size=test_size,
    random_state=1986,
    stratify=trainval_df['is_duplicate'],
)

stop_words: 35
Loading from file trainval_df.pickle
Loading from file test_df.pickle


In [27]:
# tokenize and pad

# The vocabulary is fitted on every cleaned question, train+valid and test.
question_columns = [
    trainval_df['question1_WL'],
    trainval_df['question2_WL'],
    test_df['question1_WL'],
    test_df['question2_WL'],
]
all_questions = pd.concat(question_columns, axis=0)

tokenizer = Tokenizer(num_words=None, lower=True)
tokenizer.fit_on_texts(all_questions)
word_index = tokenizer.word_index
print("Words in index: %d" % len(word_index))


def _encode(texts):
    """Map texts to integer sequences of length MAX_LEN (pad_sequences defaults)."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen = MAX_LEN)


train_q1, train_q2 = _encode(train_df['question1_WL']), _encode(train_df['question2_WL'])
valid_q1, valid_q2 = _encode(valid_df['question1_WL']), _encode(valid_df['question2_WL'])
y_train, y_valid = train_df.is_duplicate, valid_df.is_duplicate

Words in index: 120355


In [11]:
# load data

file_name = 'word_embedding_matrix.pickle'
# Vocabulary size. Defined before the branch because it is needed whether the
# matrix is loaded from the cache or rebuilt.
nb_words = len(word_index)

if os.path.exists(OUTPUT_DIR+file_name):
    print ('Loading from file '+file_name)
    with open(OUTPUT_DIR+file_name, 'rb') as f:
        word_embedding_matrix = pickle.load(f)
    # A matrix cached for a different vocabulary would map word ids to the
    # wrong vectors, so refuse to use it.
    if word_embedding_matrix.shape != (nb_words + 1, EMBEDDING_DIM):
        raise ValueError('Cached ' + file_name + ' has shape ' + str(word_embedding_matrix.shape) +
                         ' but the current vocabulary needs ' + str((nb_words + 1, EMBEDDING_DIM)) +
                         '; delete the file to regenerate it.')
else:
    print ('Generating file '+file_name)
    # Load GloVe to use pretrained vectors
    embeddings_index = {}
    with open(os.path.join(DATA_DIR, 'glove', 'glove.840B.300d.txt')) as f:
        for line in f:
            # Split from the right: the last EMBEDDING_DIM fields are the
            # vector, everything before them is the token (which may itself
            # contain spaces).
            values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding
    print('Word embeddings:', len(embeddings_index)) #151,250

    # Need to use EMBEDDING_DIM for embedding dimensions to match GloVe's vectors.
    # Row 0 is left as zeros; word_index ids start from 1.
    null_embedding_words = []
    word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            word_embedding_matrix[i] = embedding_vector
        else:
            # words not found in embedding index stay all-zeros.
            null_embedding_words.append(word)
    print('Null word embeddings: %d' %len(null_embedding_words)) #75,334

    with open(OUTPUT_DIR+file_name, 'wb') as f:
        pickle.dump(word_embedding_matrix, f)

Loading from file word_embedding_matrix.pickle


In [28]:
# Shared question encoder: frozen pretrained embeddings followed by an LSTM.
# The vocabulary size is read from the embedding matrix itself, so this cell
# works whether the matrix was rebuilt or loaded from the cache.
sequence_input = Input(shape=(MAX_LEN,))
embedded_sequence = Embedding(word_embedding_matrix.shape[0],
                              EMBEDDING_DIM,
                              weights=[word_embedding_matrix],
                              input_length=MAX_LEN,
                              trainable=False)(sequence_input)
encoded = LSTM(64)(embedded_sequence)
encode_model = Model(sequence_input, encoded)

# Both questions go through the same encoder instance (shared weights); the
# two encodings are concatenated and fed to a single sigmoid unit.
sequence1_input = Input(shape=(MAX_LEN,), name='q1')
sequence2_input = Input(shape=(MAX_LEN,), name='q2')
encoded_1 = encode_model(sequence1_input)
encoded_2 = encode_model(sequence2_input)
merged_vector = concatenate([encoded_1, encoded_2], axis=-1)
predictions = Dense(1, activation='sigmoid')(merged_vector)
model = Model(inputs=[sequence1_input, sequence2_input], outputs=predictions)

optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# All callbacks watch val_loss. Checkpoint file names embed the epoch and the
# validation loss, which get_best_model() later parses.
callbacks = [ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1),
             EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1),
             ModelCheckpoint(filepath=CHECKPOINT_DIR+'weights.{epoch:03d}-{val_loss:.4f}.hdf5', monitor='val_loss', verbose=1, save_best_only=True),
             TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=False, write_images=True)]

print('BATCH_SIZE:', BATCH_SIZE)
model.fit({'q1': train_q1, 'q2': train_q2}, y_train, batch_size=BATCH_SIZE, epochs=100, verbose=2, callbacks=callbacks, 
          validation_data=({'q1': valid_q1, 'q2': valid_q2}, y_valid), shuffle=True, initial_epoch=0)

Train on 385024 samples, validate on 19266 samples
Epoch 1/100
Epoch 00000: val_loss improved from inf to 0.51147, saving model to ./checkpoint/weights.000-0.5115.hdf5
151s - loss: 0.5351 - acc: 0.7317 - val_loss: 0.5115 - val_acc: 0.7503
Epoch 2/100
Epoch 00001: val_loss improved from 0.51147 to 0.49540, saving model to ./checkpoint/weights.001-0.4954.hdf5
150s - loss: 0.4925 - acc: 0.7624 - val_loss: 0.4954 - val_acc: 0.7619
Epoch 3/100
Epoch 00002: val_loss improved from 0.49540 to 0.48336, saving model to ./checkpoint/weights.002-0.4834.hdf5
150s - loss: 0.4705 - acc: 0.7766 - val_loss: 0.4834 - val_acc: 0.7704
Epoch 4/100
Epoch 00003: val_loss improved from 0.48336 to 0.47801, saving model to ./checkpoint/weights.003-0.4780.hdf5
150s - loss: 0.4524 - acc: 0.7880 - val_loss: 0.4780 - val_acc: 0.7752
Epoch 5/100
Epoch 00004: val_loss improved from 0.47801 to 0.47471, saving model to ./checkpoint/weights.004-0.4747.hdf5
150s - loss: 0.4365 - acc: 0.7976 - val_loss: 0.4747 - val_acc: 

KeyboardInterrupt: 

In [None]:
#resume training

model, model_name = get_best_model()
# model = load_model(CHECKPOINT_DIR + 'weights.025-0.4508.hdf5')
# model_name = 'weights.025-0.4508.hdf5'
# print('model_name', model_name)

# # optionally recompile with a different learning rate
# optimizer = Adam(lr=1e-4)
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# callbacks = [ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1),
#              EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1),
#              ModelCheckpoint(filepath=CHECKPOINT_DIR+'weights.{epoch:03d}-{val_loss:.4f}.hdf5', monitor='val_loss', verbose=1, save_best_only=True),
#              TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=False, write_images=True)]

# Checkpoints are named 'weights.<epoch>-<val_loss>.hdf5'. In the training log
# the file written during "Epoch 1/100" is weights.000-..., i.e. the number in
# the name is zero-based, so training continues at the following index.
# Falls back to 0 when the name does not follow the pattern.
epoch_match = re.match(r'weights\.(\d+)-', model_name)
initial_epoch = int(epoch_match.group(1)) + 1 if epoch_match else 0
print('Resuming from', model_name, 'at initial_epoch', initial_epoch)

# NOTE: `callbacks` is reused from the training cell unless redefined above.
print('BATCH_SIZE:', BATCH_SIZE)
model.fit({'q1': train_q1, 'q2': train_q2}, y_train, batch_size=BATCH_SIZE, epochs=100, verbose=2, callbacks=callbacks, 
          validation_data=({'q1': valid_q1, 'q2': valid_q2}, y_valid), shuffle=True, initial_epoch=initial_epoch)

In [None]:
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy] rather than a single number; unpack both so that val_loss
# is a plain float.
val_loss, val_acc = model.evaluate({'q1': valid_q1, 'q2': valid_q2}, y_valid, batch_size=BATCH_SIZE, verbose=1)
print('val_loss: {:.4f} - val_acc: {:.4f}'.format(val_loss, val_acc))

In [None]:
#Create submission

# Encode the test questions exactly like the train/valid inputs: same
# tokenizer, same MAX_LEN and the same (default) padding/truncating side.
# The model never saw sequences padded or truncated at the other end.
test_q1 = pad_sequences(tokenizer.texts_to_sequences(test_df['question1_WL']), maxlen = MAX_LEN)
test_q2 = pad_sequences(tokenizer.texts_to_sequences(test_df['question2_WL']), maxlen = MAX_LEN)
predictions = model.predict({'q1': test_q1, 'q2': test_q2}, batch_size=BATCH_SIZE, verbose=1)

submission = pd.DataFrame(predictions, columns=['is_duplicate'])
# .values: insert by position rather than by index alignment.
submission.insert(0, 'test_id', test_df.test_id.values)

# val_loss may be a bare float or the [loss, metric, ...] list that
# model.evaluate returns when metrics are compiled; take the loss either way.
loss_for_name = float(np.ravel(val_loss)[0])
file_name = MODEL+'_valloss{:.4f}.csv'.format(loss_for_name)
submission.to_csv(OUTPUT_DIR+file_name, index=False)
submission.head(10)