In [1]:
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

import spacy

import utils 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from __future__ import absolute_import
from __future__ import print_function

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, merge, BatchNormalization, Activation, Input, Merge
from keras import backend as K

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 75.0% of memory, cuDNN 5103)


### Loading data

In [2]:
os.getcwd()

'/home/ubuntu/quora'

In [3]:
path = '/home/ubuntu/quora/'
data_home = path +"data/"

In [4]:
df = pd.read_csv(data_home+"train.csv", encoding='utf-8' )
 
# encode questions to unicode
# df['question1'] = df['question1'].apply(lambda x: x.encode('utf-8'))# unicode(str(x),"utf-8"))
# df['question2'] = df['question2'].apply(lambda x: x.encode('utf-8'))#unicode(str(x),"utf-8"))

In [5]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


#### loading pre-trained  GLOVE model which comes free with Spacy. 
It is trained on Wikipedia and therefore, it is strong in terms of word semantics.  

In [31]:
from spacy.en import English

nlp = spacy.load('en')


In [32]:
doc = nlp(u'This is a sentence.')

In [38]:
doc.vector.shape

(300,)

In [23]:
t = nlp.pipe(df['question1'][1], n_threads=50)

In [25]:
vecs1 = [doc.vector for doc in t]


In [30]:
df['question1'][1]

u'What is the story of Kohinoor (Koh-i-Noor) Diamond?'

In [29]:
len(vecs1[0])

300

In [20]:
for doc in t:
    print ((doc.vector.shape))

In [21]:
vecs1 = [doc.vector for doc in nlp.pipe(df['question1'], n_threads=200)]
vecs1 =  np.array(vecs1)
df['q1_feats'] = list(vecs1)

vecs2 = [doc.vector for doc in nlp.pipe(df['question2'], n_threads=200)]
vecs2 =  np.array(vecs2)
df['q2_feats'] = list(vecs2)


KeyboardInterrupt: 

In [22]:
vecs1.shape

NameError: name 'vecs1' is not defined

In [23]:
# save features
pd.to_pickle(df, data_home+'cache/df_q12_glove_spacy_wiki1.pkl')

#### Tfidf

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False )


In [44]:
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

#### TFIDF weight normalization

After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores. The below code does this for just "question1" column

In [71]:

nlp = spacy.load('en')

def normalize_by_tfidf(question):
    progress = progressbar.ProgressBar()

    normalized_vector = []
    idf_zeros = set()
    for qu in df[question]:
        doc = nlp(qu) 
        mean_vec = np.zeros([len(doc), 300])
        for word in doc:
            # word2vec
            vec = word.vector
            
            # fetch df score            
            if word2tfidf.has_key(str(word)):
                idf = word2tfidf[str(word)]
            else:
#                 print (word, ", setting idf to 0")
                idf_zeros.add(word)
                idf = 0
        
            # compute final vec
            mean_vec += vec * idf
        mean_vec = mean_vec.mean(axis=0)
        normalized_vector.append(mean_vec)
        
    return normalized_vector, idf_zeros    

    


In [72]:
#ToDo
# df['q1_feats'], idf_zeros1= normalize_by_tfidf('question1')
# df['q2_feats'], idf_zeros2= normalize_by_tfidf('question2')


#### Train test split

In [73]:
# shuffle df
df = df.reindex(np.random.permutation(df.index))

# set number of train and test instances
num_train = int(df.shape[0] * 0.88)
num_test = df.shape[0] - num_train                 
print("Number of training pairs: %i"%(num_train))
print("Number of testing pairs: %i"%(num_test))

# init data arrays
X_train = np.zeros([num_train, 2, 300])
X_test  = np.zeros([num_test, 2, 300])
Y_train = np.zeros([num_train]) 
Y_test = np.zeros([num_test])

# format data 
b = [a[None,:] for a in list(df['q1_feats'].values)]
q1_feats = np.concatenate(b, axis=0)

b = [a[None,:] for a in list(df['q2_feats'].values)]
q2_feats = np.concatenate(b, axis=0)

# fill data arrays with features
X_train[:,0,:] = q1_feats[:num_train]
X_train[:,1,:] = q2_feats[:num_train]
Y_train = df[:num_train]['is_duplicate'].values
            
X_test[:,0,:] = q1_feats[num_train:]
X_test[:,1,:] = q2_feats[num_train:]
Y_test = df[num_train:]['is_duplicate'].values

# remove useless variables
del b
del q1_feats
del q2_feats

Number of training pairs: 355775
Number of testing pairs: 48515


#### Model definition

In [75]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, merge, BatchNormalization, Activation, Input, Merge
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_base_network(input_dim):
    '''
    Base network for feature extraction.
    '''
    input = Input(shape=(input_dim, ))
    dense1 = Dense(128)(input)
    bn1 = BatchNormalization(mode=2)(dense1)
    relu1 = Activation('relu')(bn1)

    dense2 = Dense(128)(relu1)
    bn2 = BatchNormalization(mode=2)(dense2)
    res2 = merge([relu1, bn2], mode='sum')
    relu2 = Activation('relu')(res2)    

    dense3 = Dense(128)(relu2)
    bn3 = BatchNormalization(mode=2)(dense3)
    res3 = Merge(mode='sum')([relu2, bn3])
    relu3 = Activation('relu')(res3)   
    
    feats = merge([relu3, relu2, relu1], mode='concat')
    bn4 = BatchNormalization(mode=2)(feats)

    model = Model(input=input, output=bn4)

    return model


def compute_accuracy(predictions, labels):
    '''
    Compute classification accuracy with a fixed threshold on distances.
    '''
    return labels[predictions.ravel() < 0.5].mean()


def create_network(input_dim):
    # network definition
    base_network = create_base_network(input_dim)
    
    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))
    
    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    
    model = Model(input=[input_a, input_b], output=distance)
    return model


In [77]:
from keras.optimizers import RMSprop, SGD, Adam

net = create_network(300)

# train
optimizer = Adam(lr=0.001)
net.compile(loss=contrastive_loss, optimizer=optimizer)

In [79]:
net.fit([X_train[:,0,:], X_train[:,1,:]], Y_train,
          validation_data=([X_test[:,0,:], X_test[:,1,:]], Y_test),
          batch_size=128, nb_epoch=5, shuffle=True )

Train on 355775 samples, validate on 48515 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5a44b22c50>

In [80]:
net.optimizer.lr = 1e-4

net.fit([X_train[:,0,:], X_train[:,1,:]], Y_train,
          validation_data=([X_test[:,0,:], X_test[:,1,:]], Y_test),
          batch_size=128, nb_epoch=10, shuffle=True )

Train on 355775 samples, validate on 48515 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5975006c50>

In [81]:
net.optimizer.lr = 1e-5

net.fit([X_train[:,0,:], X_train[:,1,:]], Y_train,
          validation_data=([X_test[:,0,:], X_test[:,1,:]], Y_test),
          batch_size=128, nb_epoch=10, shuffle=True )

Train on 355775 samples, validate on 48515 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5a29c9ec90>

In [83]:
net.save_weights(path+"models/siamese_nonorm_nopreproc_.h5")

In [85]:
pred = net.predict([X_test[:,0,:], X_test[:,1,:]], batch_size=128)
te_acc = compute_accuracy(pred, Y_test)

In [86]:
te_acc

0.7297312782902653

### Submission

In [89]:
df_test.head(1)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...


In [91]:

df_test = pd.read_csv(data_home+'test.csv')

 
# encode questions to unicode
df_test['question1'] = df_test['question1'].apply(lambda x: unicode(str(x),"utf-8"))
df_test['question2'] = df_test['question2'].apply(lambda x: unicode(str(x),"utf-8"))

vecs1 = [doc.vector for doc in nlp.pipe(df_test['question1'], n_threads=50)]
vecs1 =  np.array(vecs1)
df_test['q1_feats'] = list(vecs1)

vecs2 = [doc.vector for doc in nlp.pipe(df_test['question2'], n_threads=50)]
vecs2 =  np.array(vecs2)
df_test['q2_feats'] = list(vecs2)

In [92]:
pd.to_pickle(df_test, data_home+'cache/test_q12_glove_spacy_wiki1.pkl')

In [107]:
df_test['q1_feats'] = vecs1
df_test['q2_feats'] = vecs2

In [112]:
len(df_test), len(vecs1)

(2345796, 2345796)

In [None]:
save_array(path+"cache/test")

In [113]:
pred = net.predict([vecs1, vecs2], batch_size=128)

In [135]:
clip =0.82 
submission_name = "subm/siamese_nopreproc_glove_1.csv"
flattend = pred.flatten()
clipped = np.clip(flattend,1-clip,clip)


In [137]:
sub = pd.DataFrame({'test_id': df_test['test_id'], 'is_duplicate': clipped})
sub.head(2)

Unnamed: 0,is_duplicate,test_id
0,0.82,0
1,0.82,1


In [138]:
sub.to_csv(path+submission_name, index=False)


In [139]:
from IPython.lib.display import FileLink

FileLink(path+submission_name)