In [72]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [73]:
dataset=pd.read_csv("sample_data.csv", skipinitialspace=True)

In [74]:
dataset.head(10)

Unnamed: 0,id,sentences1,sentences2,is_similar
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [75]:
dataset.dtypes

id             int64
sentences1    object
sentences2    object
is_similar     int64
dtype: object

In [76]:
np.sum(dataset["is_similar"]), len(dataset["is_similar"])

(188, 499)

In [77]:
import gensim.downloader as api

In [78]:
embedding_model=api.load("glove-wiki-gigaword-100")

In [79]:
embedding_model.most_similar("orange")

[('yellow', 0.7358633279800415),
 ('red', 0.7140780091285706),
 ('blue', 0.7118036150932312),
 ('green', 0.7111418843269348),
 ('pink', 0.677507221698761),
 ('purple', 0.6774231791496277),
 ('black', 0.6709616780281067),
 ('colored', 0.665260910987854),
 ('lemon', 0.6251963973045349),
 ('peach', 0.6168624758720398)]

In [80]:
len(embedding_model["president"])

100

In [81]:
def load_data(df):
    sentences1=df["sentences1"].astype(str).values
    sentences2=df["sentences2"].astype(str).values
    #combine to get the tokens
    df["combined"]=df["sentences1"] + df["sentences2"]
    labels=df["is_similar"].values
    return sentences1, sentences2, labels

In [82]:
sentences1, sentences2, labels=load_data(dataset)

In [83]:
sentences1[0], sentences2[0], labels[0]

('What is the step by step guide to invest in share market in india?',
 'What is the step by step guide to invest in share market?',
 0)

In [84]:
combined=list(sentences1) + list(sentences2)

In [85]:
dataset["combined"][0]

'What is the step by step guide to invest in share market in india?What is the step by step guide to invest in share market?'

In [86]:
len(combined)

998

In [87]:
def clean_ascii(text):
    return ''.join( i for i in text if ord(i)<128)

In [88]:
cleaned = clean_ascii('Mad%sk')
cleaned

'Mad%sk'

In [89]:
max_words=10000

In [90]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [91]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [92]:
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')

In [93]:
tokenizer.fit_on_texts(combined)

In [94]:
sequences=tokenizer.texts_to_sequences(combined)

In [95]:
sequences_padded=pad_sequences(sequences, maxlen=300, padding='post')

In [96]:
sequences_padded[0];

In [97]:
embedding_dimension=100
embedding_matrix=np.zeros((max_words, embedding_dimension))
embedding_matrix.shape

(10000, 100)

In [98]:
#create embedding matrix
def get_embedding_matrix():
    for word, i in tokenizer.word_index.items():
        if embedding_model.has_index_for(word):
            embedding_vector=embedding_model[word]
            if embedding_vector is not None:
                embedding_matrix[i]=embedding_vector
    return embedding_matrix

In [99]:
emb_matrix=get_embedding_matrix()

In [100]:
emb_matrix.shape

(10000, 100)

In [101]:
from tensorflow.keras.layers import Bidirectional, LSTM, Input, Lambda, Dense

In [102]:
from tensorflow.keras.models import Model

In [103]:
from sklearn.model_selection import train_test_split

In [104]:
import tensorflow.keras.backend as K

In [105]:
lstm_layer=Bidirectional(LSTM(10, dropout=0.2, recurrent_dropout=0.2))

In [106]:
type(lstm_layer)

keras.layers.wrappers.Bidirectional

In [107]:
emb = tf.keras.layers.Embedding(max_words, embedding_dimension, input_length=300, weights=[embedding_matrix],trainable=False)

In [108]:
input1=Input(shape=(300,), name="input1")

In [109]:
e1=emb(input1)
x1=lstm_layer(e1)

In [110]:
input2=Input(shape=(300,), name="input2")

In [111]:
e2=emb(input2)
x2=lstm_layer(e2)

In [112]:
manhattan_distance=lambda x: tf.keras.backend.abs(x[0] - x[1])

In [113]:
def euclidean_distance(x):
	sumSquared = K.sum(K.square(x[0] - x[1]), axis=1,
		keepdims=True)
	return K.sqrt(K.maximum(sumSquared, K.epsilon()))

In [114]:
?euclidean_distance

In [115]:
merged = Lambda(function=euclidean_distance, output_shape=lambda x: x[0],  name="L1_distance")([x1, x2])

In [116]:
preds=Dense(1, activation="sigmoid")(merged)

In [117]:
model=Model(inputs=[input1, input2], outputs=preds)

In [118]:
def contrastive_loss(y_true, y_pred):
    y_true=tf.dtypes.cast(y_true, tf.float64)
    y_pred=tf.dtypes.cast(y_pred, tf.float64)
    margin = 1
    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * margin_square)

In [119]:
model.compile(loss="mse", optimizer="adam",  metrics=["accuracy"])

In [120]:
model.compile(loss=contrastive_loss, optimizer="adam",  metrics=["accuracy"])

In [121]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input1 (InputLayer)            [(None, 300)]        0           []                               
                                                                                                  
 input2 (InputLayer)            [(None, 300)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 300, 100)     1000000     ['input1[0][0]',                 
                                                                  'input2[0][0]']                 
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 20)          8880        ['embedding_1[0][0]',      

In [122]:
def create_data():
    features, labels=dataset.drop(columns=["id","is_similar","combined"]).values, dataset["is_similar"].values
    x_train, x_test, y_train, y_test=train_test_split(features, labels, test_size=0.2, random_state=42)
    x_train, x_val, y_train, y_val=train_test_split(x_train, y_train, test_size=0.25, random_state=42)
    return x_train, y_train, x_test, y_test, x_val, y_val

In [123]:
x_train, y_train, x_test,y_test, x_val, y_val=create_data()

In [124]:
def convert_to_sequences(sentences):
    sequences=tokenizer.texts_to_sequences(sentences)
    seq_padded=pad_sequences(sequences, maxlen=300, padding="post")
    return seq_padded

In [125]:
x_converted=convert_to_sequences(x_train[:,0])

In [126]:
x_converted.shape

(299, 300)

In [127]:
x_train[:,0].shape

(299,)

In [128]:
history=model.fit([convert_to_sequences(x_train[:,0]), 
                   convert_to_sequences(x_train[:,1])], 
                  y_train, epochs=10, verbose=1,
                  validation_data=([
                      convert_to_sequences(x_val[:,0]),
                      convert_to_sequences(x_val[:,1])], y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [129]:
model.evaluate([convert_to_sequences(x_test[:,0]), convert_to_sequences(x_test[:,1])], y_test)



[0.21987484395503998, 0.3100000023841858]

In [130]:
import matplotlib.pyplot as plt

In [131]:
history.model

<keras.engine.functional.Functional at 0x1f5cbd13460>

In [132]:
model.weights[3]

<tf.Variable 'bidirectional_1/forward_lstm_1/lstm_cell_4/bias:0' shape=(40,) dtype=float32, numpy=
array([-4.40658390e-04, -3.10018193e-04, -1.21247569e-04, -5.58193242e-05,
       -9.37078294e-05, -9.06131390e-05, -4.24698141e-04, -1.14506714e-04,
       -1.18305557e-04,  1.28492570e-06,  9.99420762e-01,  9.99589086e-01,
        9.99849021e-01,  9.99929786e-01,  9.99915481e-01,  9.99858797e-01,
        9.99516606e-01,  9.99843121e-01,  9.99836385e-01,  1.00002789e+00,
        6.25121361e-03, -2.66479235e-03, -3.01838876e-03, -3.05540022e-03,
        3.15140979e-03, -1.92969292e-03, -1.50299538e-03, -2.53746682e-03,
        9.14265634e-04,  5.39921457e-03, -4.40717267e-04, -3.10077856e-04,
       -1.21275662e-04, -5.59139844e-05, -9.36218639e-05, -9.05395646e-05,
       -4.24642581e-04, -1.14596325e-04, -1.18449374e-04,  1.29601801e-06],
      dtype=float32)>

In [133]:
test_instance=x_test[3]

In [134]:
test_instance[0], test_instance[1], y_test[3]

('Which is the best fixed income fund?',
 'What is the best fixed income fund?',
 1)

In [135]:
test_label=y_test[0]

In [136]:
model.predict([convert_to_sequences([test_instance[0]]), convert_to_sequences([test_instance[1]])])

array([[0.6100971]], dtype=float32)

In [137]:
predicted = model.predict([convert_to_sequences(x_test[:,0]), convert_to_sequences(x_test[:,1])])

In [138]:
predicted

array([[0.62034684],
       [0.65968287],
       [0.6531271 ],
       [0.6100971 ],
       [0.62533605],
       [0.5335964 ],
       [0.5897736 ],
       [0.6226137 ],
       [0.57938975],
       [0.6035603 ],
       [0.5703286 ],
       [0.6092944 ],
       [0.53517056],
       [0.5648673 ],
       [0.5569642 ],
       [0.74707615],
       [0.555616  ],
       [0.5366752 ],
       [0.6280457 ],
       [0.51855916],
       [0.6834405 ],
       [0.56233984],
       [0.6059477 ],
       [0.54446745],
       [0.5507968 ],
       [0.6023898 ],
       [0.58029217],
       [0.615251  ],
       [0.5547946 ],
       [0.5795827 ],
       [0.5302872 ],
       [0.68535596],
       [0.6994402 ],
       [0.66497123],
       [0.563051  ],
       [0.5633359 ],
       [0.521888  ],
       [0.65115464],
       [0.58998597],
       [0.6226219 ],
       [0.6408579 ],
       [0.5965718 ],
       [0.67808485],
       [0.5593966 ],
       [0.6050842 ],
       [0.52781814],
       [0.65829605],
       [0.590