## Siamese Network 


In [1]:
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

import spacy

import utils 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from __future__ import absolute_import
from __future__ import print_function

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, merge, BatchNormalization, Activation, Input, Merge
from keras import backend as K


%matplotlib inline
from __future__ import print_function
import numpy as np
import pandas as pd
import datetime, time, json

from keras.models import Sequential,Model
from keras.layers import Embedding, Dense, Dropout, Reshape, Merge, BatchNormalization, TimeDistributed, Lambda
from keras.layers import Conv1D , Flatten, Input
from keras.layers.pooling import MaxPooling1D



from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K





from sklearn.model_selection import StratifiedShuffleSplit



Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 75.0% of memory, cuDNN 5103)


### Loading data

In [2]:


path = '/home/ubuntu/quora/'
data_home = path +"data/"

Q1_TRAINING_DATA_FILE = data_home+'cache/q1_train.npy'
Q2_TRAINING_DATA_FILE = data_home+'cache/q2_train.npy'
LABEL_TRAINING_DATA_FILE = data_home+'cache/label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = data_home+'cache/word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = data_home+'cache/nb_words.json'
Q1_TESTING_DATA_FILE = 'q1_test.npy'
Q2_TESTING_DATA_FILE = 'q2_test.npy'


MODEL_WEIGHTS_FILE = path+'weights/conv_weights_v1.h5'
MAX_SEQUENCE_LENGTH = 35
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 25

In [3]:
path = '/home/ubuntu/quora/'
data_home = path +"data/"

In [4]:
word_embedding_matrix = np.load(open(WORD_EMBEDDING_MATRIX_FILE, 'rb'))
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']
 

In [5]:
   
def prepare_data():
    
    q1_data = np.load(open(Q1_TRAINING_DATA_FILE, 'rb'))
    q2_data = np.load(open(Q2_TRAINING_DATA_FILE, 'rb'))
    labels = np.load(open(LABEL_TRAINING_DATA_FILE, 'rb'))



    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=2019)
    X = np.stack((q1_data, q2_data), axis=1)
    y = labels

    for train_index, test_index in sss.split(X, y):
#         print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]


    return X_train,X_test,y_train,y_test



In [6]:
X_train,X_test,y_train,y_test = prepare_data()


#### Model definition

In [21]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, merge, BatchNormalization, Activation, Input, Merge
from keras import backend as K


def euclidean_distance(vects):
    #x.shape  >  (None,35,300)
    
    x, y = vects
    print (x.shape)
    
    batch_size, sentence_length, = x.shape
    word_diffs = np.zeros(sentence_length).astype(float)
    

#     for i in range(sentence_length):
#         word_diffs[i] = K.sqrt(K.sum(K.square(x[i] - y[i]), axis=1, keepdims=True))
        
#     return np.sum(word_diffs)
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_base_network(input_dim):
    '''
    Base network for feature extraction.
    '''
    graph_input = Input(shape=(MAX_SEQUENCE_LENGTH,))#EMBEDDING_DIM))#shape=(input_dim, ))
    
    embd = Embedding(nb_words + 1, 
                     EMBEDDING_DIM, 
                     weights=[word_embedding_matrix], 
                     input_length=MAX_SEQUENCE_LENGTH, 
                     trainable=False)(graph_input)
                      
    dense1 = Dense(128)(embd)
    bn1 = BatchNormalization()(dense1)
    relu1 = Activation('relu')(bn1)

    dense2 = Dense(128)(relu1)
    bn2 = BatchNormalization()(dense2)
    res2 = merge([relu1, bn2], mode='sum')
    relu2 = Activation('relu')(res2)    

    dense3 = Dense(128)(relu2)
    bn3 = BatchNormalization()(dense3)
    res3 = Merge(mode='sum')([relu2, bn3])
    relu3 = Activation('relu')(res3)   
    
    feats = merge([relu3, relu2, relu1], mode='concat')
    bn4 = BatchNormalization()(feats)

    model = Model(inputs=graph_input, outputs=bn4)
    
    return model


def compute_accuracy(predictions, labels):
    '''
    Compute classification accuracy with a fixed threshold on distances.
    '''
    return labels[predictions.ravel() < 0.5].mean()


def create_network(input_dim):
    # network definition
    base_network = create_base_network(input_dim)
    
    input_a = Input(shape=(MAX_SEQUENCE_LENGTH,))
    input_b = Input(shape=(MAX_SEQUENCE_LENGTH,))
    
    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    
    Q1 = base_network(input_a)
    Q2 = base_network(input_b)


    distance = Lambda(euclidean_distance)([Q1, Q2])
    
    model = Model(inputs=[input_a, input_b], outputs=distance)
    return model


In [22]:
from keras.optimizers import RMSprop, SGD, Adam

net = create_network(300)

# train
# optimizer = Adam(lr=0.001)
net.compile(loss=contrastive_loss, optimizer='nadam')

  .format(self.name, input_shape))


In [23]:
# X_train,X_test,y_train,y_test = prepare_data()
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]


In [24]:
MODEL_WEIGHTS_FILE = path+'weights/siamese_v1_epoch_{epoch:02d}_val_loss_{val_loss:.2f}.h5'

callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_loss', save_best_only=True)]


print("Starting training at", datetime.datetime.now())
t0 = time.time()




history = net.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=45,
                    batch_size=256,
                    # validation_split=VALIDATION_SPLIT,
                    validation_data = ([Q1_test, Q2_test],y_test),
                    callbacks=callbacks)


t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2017-04-26 09:14:11.485678
Train on 363861 samples, validate on 40429 samples
Epoch 1/45


TypeError: Bad input argument to theano function with name "/home/ubuntu/anaconda2/envs/qenv/lib/python2.7/site-packages/keras/backend/theano_backend.py:1118" at index 2 (0-based).  
Backtrace when that variable is created:

  File "/home/ubuntu/anaconda2/envs/qenv/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ubuntu/anaconda2/envs/qenv/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ubuntu/anaconda2/envs/qenv/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ubuntu/anaconda2/envs/qenv/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2827, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/ubuntu/anaconda2/envs/qenv/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-22-8b34cd7450ad>", line 7, in <module>
    net.compile(loss=contrastive_loss, optimizer='nadam')
  File "/home/ubuntu/anaconda2/envs/qenv/lib/python2.7/site-packages/keras/engine/training.py", line 878, in compile
    dtype=K.dtype(self.outputs[i]))
  File "/home/ubuntu/anaconda2/envs/qenv/lib/python2.7/site-packages/keras/backend/theano_backend.py", line 184, in placeholder
    x = T.TensorType(dtype, broadcast)(name)
Wrong number of dimensions: expected 3, got 2 with shape (256, 1).

In [83]:
net.save_weights(path+"models/siamese_nonorm_nopreproc_.h5")

In [85]:
pred = net.predict([X_test[:,0,:], X_test[:,1,:]], batch_size=128)
te_acc = compute_accuracy(pred, Y_test)

In [86]:
te_acc

0.7297312782902653

### Submission

In [89]:
df_test.head(1)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...


In [91]:

df_test = pd.read_csv(data_home+'test.csv')

 
# encode questions to unicode
df_test['question1'] = df_test['question1'].apply(lambda x: unicode(str(x),"utf-8"))
df_test['question2'] = df_test['question2'].apply(lambda x: unicode(str(x),"utf-8"))

vecs1 = [doc.vector for doc in nlp.pipe(df_test['question1'], n_threads=50)]
vecs1 =  np.array(vecs1)
df_test['q1_feats'] = list(vecs1)

vecs2 = [doc.vector for doc in nlp.pipe(df_test['question2'], n_threads=50)]
vecs2 =  np.array(vecs2)
df_test['q2_feats'] = list(vecs2)

In [92]:
pd.to_pickle(df_test, data_home+'cache/test_q12_glove_spacy_wiki1.pkl')

In [107]:
df_test['q1_feats'] = vecs1
df_test['q2_feats'] = vecs2

In [112]:
len(df_test), len(vecs1)

(2345796, 2345796)

In [None]:
save_array(path+"cache/test")

In [113]:
pred = net.predict([vecs1, vecs2], batch_size=128)

In [135]:
clip =0.82 
submission_name = "subm/siamese_nopreproc_glove_1.csv"
flattend = pred.flatten()
clipped = np.clip(flattend,1-clip,clip)


In [137]:
sub = pd.DataFrame({'test_id': df_test['test_id'], 'is_duplicate': clipped})
sub.head(2)

Unnamed: 0,is_duplicate,test_id
0,0.82,0
1,0.82,1


In [138]:
sub.to_csv(path+submission_name, index=False)


In [139]:
from IPython.lib.display import FileLink

FileLink(path+submission_name)