In [1]:
import numpy as np

In [2]:
import keras
import keras.backend as K
import pandas as pd
from keras.layers import Lambda, Merge
from keras.models import Model
from keras.optimizers import Adam, SGD, Adadelta, rmsprop, adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Load the Quora question-pairs TSV; error_bad_lines=False skips rows that fail to parse.
df = pd.read_csv("quora_duplicate_questions.tsv", sep='\t', error_bad_lines=False)
# Work on a 10% subsample. The fixed random_state makes the sample (and every
# number derived from it further down) reproducible across kernel restarts.
df = df.sample(frac=0.1, random_state=42)
df = df.reset_index(drop=True)

In [4]:
# Number of question pairs left after the 10% subsample.
len(df)

40429

In [5]:
from keras.layers import Input

In [6]:
# Number of token ids kept per question after padding/truncation.
MAX_SEQUENCE_LENGTH = 10
# Width of each word vector (the GloVe file used in this notebook is 50-d).
EMBEDDING_SIZE = 50
# Vocabulary cap passed to the Tokenizer as num_words.
MAX_NB_WORDS = 20000

In [7]:
# Pool both question columns and drop exact duplicates before fitting the tokenizer.
questions = df['question1'].tolist() + df['question2'].tolist()
all_texts = list(set(questions))

In [8]:
# Force every entry to a string so non-text values (e.g. missing questions) do not break the tokenizer.
all_texts = list(map(str, all_texts))

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import LSTM, GRU

In [10]:
# Fit a word-level tokenizer on every distinct question; num_words caps the vocabulary it will emit.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts( all_texts ) 

In [11]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [12]:
# Number of distinct words in the fitted word index.
len(word_index)

31761

In [13]:
# Cast both question columns to plain strings, element by element.
for col in ('question1', 'question2'):
    df[col] = list(map(str, df[col].tolist()))

In [14]:
# Turn each question into its list of word indices, one list per row, for both columns.
seqs1, seqs2 = [
    tokenizer.texts_to_sequences(df[column].tolist())
    for column in ('question1', 'question2')
]

In [15]:
# Pad/truncate every question to exactly MAX_SEQUENCE_LENGTH token ids.
seqs1 = pad_sequences(seqs1, maxlen=MAX_SEQUENCE_LENGTH)
seqs2 = pad_sequences(seqs2, maxlen=MAX_SEQUENCE_LENGTH)

In [16]:
# Inverse lookup: integer index -> word.
reverse_word_index = {index: token for token, index in word_index.items()}

In [17]:
# Duplicate / not-duplicate labels as a 1-D array, one entry per question pair.
Y_vals = np.array(df['is_duplicate'].tolist())
# Call-style print: same output on Python 2, and also valid on Python 3.
print(Y_vals.shape)

(40429,)


###### What I thought initially: Input format: use 20 lines, each line of 50 words given as indices; when you use embeddings, this gets converted to 3D. This turns out to be wrong: the siamese network, based on the [original paper](http://www.mit.edu/~jonasm/info/MuellerThyagarajan_AAAI16.pdf), takes only a single sentence as input on each side. So for a sentence whose words are represented as indices, the input is a 1D vector; it becomes a 2D matrix when using one-hot vectors or word embeddings.

In [18]:
# word -> GloVe vector (float32 array).
embeddings_index = {}
# Each line of the GloVe text file is a token followed by its vector components.
# Opening and reading happen in one cell under a context manager, so the file is
# closed even if a line fails to parse and the cell can be re-run on its own.
with open('glove.6B.50d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [20]:
# One row per word index (row 0 stays unused by word_index); rows default to all-zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: leave its row as zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [21]:
# One input per question: a sequence of MAX_SEQUENCE_LENGTH word indices.
sentence1 = Input(shape = (MAX_SEQUENCE_LENGTH,))
sentence2 = Input(shape = (MAX_SEQUENCE_LENGTH,))

In [22]:
# Embedding layer initialised from the GloVe-based matrix and fine-tuned during training.
embedding = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_SIZE,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)

In [23]:
# The same Embedding layer object is applied to both inputs, so both questions share its weights.
encoded1 = embedding(sentence1)
encoded2 = embedding(sentence2)

In [24]:
# A single LSTM layer with 32 units.
lstm = LSTM(32)

In [25]:
# Run both embedded questions through the same LSTM object (shared weights).
op1 = lstm(encoded1)
op2 = lstm(encoded2)

###### Below is the actual implementation as given in the paper. I did not find it simple to understand. Check the numpy_tests notebook for intuition on how `np.sum( )` works.

In [27]:
def exponent_neg_manhattan_distance(left, right):
    """Return exp(-L1 distance) between two batches of vectors.

    Args:
        left: backend tensor; the reduction runs over axis 1, so it is
            expected to be (batch, features).
        right: backend tensor compatible with ``left`` for subtraction.

    Returns:
        Tensor whose axis 1 is reduced to size 1 (``keepdims=True``),
        holding exp(-sum(|left - right|)) for each row; identical rows
        give 1 and the value shrinks as the rows move apart.
    """
    # Manhattan (L1) distance per row; keepdims keeps the result as a column.
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

In [28]:
# Combine the two sentence encodings into one similarity score per pair.
# Lambda replaces the deprecated Merge(mode=<callable>) form: it calls the same
# function on the [op1, op2] list, and output_shape maps the input shapes to (batch, 1).
ml_dist = Lambda(
    lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1),
)([op1, op2])

  """Entry point for launching an IPython kernel.


##### Keeping the model outputs as op1 and op2, instead of the actual distance values, to understand how the distance function works:

In [36]:
# Debug model: exposes the two LSTM encodings directly instead of the distance output.
malstm = Model([sentence1,sentence2 ], [op1,op2 ])  

In [38]:
# Compile the debug model; it is only used for predict() in the cells that follow.
optimizer = Adam(lr=0.1)
malstm.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['accuracy'],
)

In [39]:
# Print the layer table to confirm both inputs feed the same embedding and LSTM layers.
malstm.summary( )

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 50)       1588100     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 32)           10624       embedding_1[0][0]                
          

In [50]:
# Encode the first 20 question pairs with the debug model.
op_vals = malstm.predict([seqs1[:20], seqs2[:20] ] )

In [51]:
# One entry per model output (op1 and op2).
len(op_vals  )

2

In [52]:
# Split the prediction list into the encodings of each branch.
op1_vals, op2_vals = op_vals[0], op_vals[1]

In [53]:
# Inspect row 1 of each branch's output: its shape and its first component.
# Call-style print gives the same output on Python 2 and is valid on Python 3.
print(op1_vals[1].shape)
print(op1_vals[1][0])
print(op2_vals[1].shape)
print(op2_vals[1][0])

(32,)
-0.4173376
(32,)
-0.43812448


### Now we need the element-wise difference, i.e. for 20 sentences we have 2 matrices, each of shape 20*32. First we subtract element-wise and take the absolute value.

In [57]:
# Element-wise absolute difference between the two encoding matrices.
diff_ = np.absolute(op1_vals - op2_vals)

In [59]:
# Same shape as each encoding matrix: one row per sentence pair.
diff_.shape 

(20, 32)

#### For each sentence pair, we now have an array of 32 elements. If we sum directly and keep dims, we get:

In [63]:
# With no axis given, the sum collapses every entry into one total; keepdims keeps it 2-D.
res_ = np.sum(diff_, keepdims=True)
# Call-style print: same output on Python 2, and also valid on Python 3.
print(res_.shape)

(1, 1)


#### Now coming to the actual, correct implementation:

In [65]:
# Summing along axis 1 yields one L1 distance per sentence pair; keepdims keeps a column shape.
res_ = np.sum(diff_, axis=1, keepdims=True)
# Call-style print: same output on Python 2, and also valid on Python 3.
print(res_.shape)

(20, 1)


###### i.e. for each pair of sentences, one output is present.

##### For the actual model : 

In [76]:
# Final model: both questions in, one similarity score per pair out.
malstm = Model([sentence1, sentence2], ml_dist)
# NOTE(review): lr=0.1 looks very high for Adam — confirm it is intentional.
optimizer = Adam(lr=0.1)
malstm.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train on all sampled pairs; the value returned by fit() is kept in malstm_trained.
malstm_trained = malstm.fit(
    [seqs1, seqs2],
    Y_vals,
    batch_size=256,
    epochs=50,
    shuffle=True,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50

KeyboardInterrupt: 