In [1]:
import numpy as np

In [2]:
import pandas as pd 
import keras 
from keras.layers import   Merge
import keras.backend as K
from keras.models import Model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [57]:
df = pd.read_csv("quora_duplicate_questions.tsv",sep='\t', error_bad_lines=False)
df = df.sample(frac = 0.1)
df = df.reset_index(drop = True )
#df =df[:20]

In [58]:
len(df)

40429

In [59]:
from keras.layers import Input

In [60]:
MAX_SEQUENCE_LENGTH = 10
EMBEDDING_SIZE = 50 
MAX_NB_WORDS = 20000

In [61]:
all_texts = list(set(df['question1'].tolist() + df['question2'].tolist()) )

In [62]:
all_texts = [str(i) for  i in all_texts ]

In [63]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import LSTM, GRU

In [64]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts( all_texts ) 

In [65]:
word_index = tokenizer.word_index

In [66]:
len(word_index)

31929

In [67]:
df['question1'] = [ str(qsn) for qsn in df['question1'].tolist() ]
df['question2'] = [ str(qsn) for qsn in df['question2'].tolist() ]

In [68]:
seqs1  = tokenizer.texts_to_sequences( df['question1'].tolist()) 
seqs2 = tokenizer.texts_to_sequences(df['question2'].tolist() )

In [69]:
seqs1 = keras.preprocessing.sequence.pad_sequences(seqs1, maxlen= MAX_SEQUENCE_LENGTH )
seqs2 = keras.preprocessing.sequence.pad_sequences(seqs2, maxlen= MAX_SEQUENCE_LENGTH )

In [70]:
reverse_word_index = {word_index[val]: val  for val in word_index.keys()  }

In [71]:
Y_vals = np.array(df['is_duplicate'].tolist())
print Y_vals.shape

(40429,)


### What I though initialy: Input format: use 20 lines, each line of 50 words given index. When you use embeddings, it gets converted to 3D.  Turns out this is wrong, siamese based on [original paper](http://www.mit.edu/~jonasm/info/MuellerThyagarajan_AAAI16.pdf), takes as input only single sentences. So for a sentence, when words are represented as indexes. they are 1D vectors and 2D matrices when usign one-hot/word-embedddings

In [72]:
embeddings_index = {}
f = open('glove.6B.50d.txt')

In [73]:
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [74]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [75]:
sentence1 = Input(shape = (MAX_SEQUENCE_LENGTH,))
sentence2 = Input(shape = (MAX_SEQUENCE_LENGTH,))

In [76]:
embedding = Embedding(len(word_index)+1,  EMBEDDING_SIZE , weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,
                        trainable= True ) 

In [77]:
encoded1 = embedding(sentence1)
encoded2 = embedding(sentence2)

In [78]:
lstm = LSTM(32)

In [79]:
op1 = lstm(encoded1)
op2 = lstm(encoded2)

In [99]:
def exponent_neg_manhattan_distance(left, right):
    """ 
    print "left: ",  left
    print "right: ", right
    print "sum no-axis, no-dims  : ", (K.sum(K.abs(left-right)  ))
    print "sum axis, no-dims  : ", (K.sum(K.abs(left-right), axis=1 ))
    print "sum no-axis, dims  : ", (K.sum(K.abs(left-right), keepdims= True  ))
    print "sum axis, dims  : ", (K.sum(K.abs(left-right), axis=1 , keepdims= True  ))
    print "actual ", K.exp(-K.sum(K.abs(left-right),axis =1 ,  keepdims= True  )) 
    """  
    return K.exp(-K.sum(K.abs(left-right),axis =1 ,  keepdims= True  )) 
    
    #return K.exp(-K.sum(K.abs(left-right), keepdims= True  ))
    #this is actually exp( (|(x1 - y1)| + |(x2 - y2)| + ...|(x_lstm_units - y_lstm_units)|)*(-1) )
    #return -K.sum(K.abs(left-right), axis=1, keepdims=True)

In [100]:
ml_dist = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([op1, op2])

  """Entry point for launching an IPython kernel.


In [169]:
malstm = Model([sentence1,sentence2 ], [op1 ])

In [170]:
from keras.optimizers import Adam, SGD,Adadelta,rmsprop,adam

In [171]:
optimizer = Adam( lr = 0.1) 

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

In [None]:
malstm_trained = malstm.fit([ seqs1, seqs2 ], Y_vals , epochs= 50 , batch_size= 256 , shuffle=True  ) 

In [172]:
malstm.summary( )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 50)            1596500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                10624     
Total params: 1,607,124
Trainable params: 1,607,124
Non-trainable params: 0
_________________________________________________________________


In [185]:
op1_vals = malstm.predict([seqs1[:20], seqs1[:20] ] )

In [186]:
op1_vals.shape 

(20, 32)

In [None]:
 K.exp(-K.sum(K.abs(left-right),axis =1 ,  keepdims= True  

In [194]:
np.sum( np.abs(op1_vals + op1_vals)  )

101.595856

In [204]:
np.sum( np.abs(op1_vals + op1_vals) , keepdims= True )

array([[101.595856]], dtype=float32)

In [126]:
malstm.weights[3]

<tf.Variable 'lstm_1/recurrent_kernel:0' shape=(32, 128) dtype=float32_ref>

In [145]:
wt_l2 = np.asarray(malstm.layers[2].get_weights())
wt_l3 = np.asarray(malstm.layers[3].get_weights())

In [143]:
for idx,ll in enumerate(malstm.layers): 
    print idx, ll 

0 <keras.engine.topology.InputLayer object at 0x7fa8ea3b7e90>
1 <keras.engine.topology.InputLayer object at 0x7fa8ea4409d0>
2 <keras.layers.embeddings.Embedding object at 0x7fa8ee89eb50>
3 <keras.layers.recurrent.LSTM object at 0x7fa8ea3e4410>
4 <keras.legacy.layers.Merge object at 0x7fa730456dd0>


In [146]:
wt_l2.shape 

(1, 31930, 50)

In [153]:
wt_l3[1].shape  

(32, 128)

In [166]:
print len(wt_l3[0])
print wt_l3[0].shape 
print wt_l3[0][0].shape 
print wt_l3[0][0][0]

50
(50, 128)
(128,)
-3.79013


In [168]:
print len(wt_l3[1])
print wt_l3[1].shape 
print wt_l3[1][0].shape 
print wt_l3[1][0][0]

32
(32, 128)
(128,)
1.0780965


In [163]:
print len(wt_l3[2])
print wt_l3[2].shape 
print wt_l3[2][0]

128
(128,)
-0.5543394


In [93]:
1 - (np.sum(Y_vals)+0.0001)/len(Y_vals)

0.6288555220262683