In [138]:
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import random as rnd
import tensorflow as tf

rnd.seed(34)

In [139]:
data = pd.read_csv('questions.csv')
N = len(data)
print(f'Numbers of questions: {N}')
data.head()

Numbers of questions: 404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [140]:
data[data['is_duplicate'] == 1].head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [141]:
N_train = 300000
N_test = 10240
data_train = data[:N_train]
data_test = data[N_train:N_train + N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))

Train set: 300000 Test set: 10240


### Select only the question pairs that are duplicate to train the model

In [142]:
td_index = data_train['is_duplicate'] == 1
td_index

0         False
1         False
2         False
3         False
4         False
          ...  
299995    False
299996     True
299997    False
299998     True
299999    False
Name: is_duplicate, Length: 300000, dtype: bool

In [143]:
td_index = [i for i, is_duplicate in enumerate(td_index) if is_duplicate]

In [144]:
print('Number of duplicate questions: ', len(td_index))
print('Indexes of first ten duplicate questions:', td_index[:10])

Number of duplicate questions:  111486
Indexes of first ten duplicate questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]


### Split the data into a training and test set

In [145]:
print(data_train['question1'][5])
print(data_train['question2'][5])
print('is_duplicate: ', data_train['is_duplicate'][5])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
is_duplicate:  1


In [146]:
Q1_train = data_train['question1'][td_index].to_numpy()
Q2_train = data_train['question2'][td_index].to_numpy()

Q1_test = data_test['question1'].to_numpy()
Q2_test = data_test['question2'].to_numpy()
y_test = data_test['is_duplicate'].to_numpy()

In [147]:
print('TRAINING QUESTIONS:\n')
print('Question 1: ', Q1_train[0])
print('Question 2: ', Q2_train[0], '\n')
print('Question 1: ', Q1_train[5])
print('Question 2: ', Q2_train[5], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test[0])
print('Question 2: ', Q2_test[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How do I prepare for interviews for cse?
Question 2:  What is the best way to prepare for cse? 

is_duplicate = 0 



### Split your training set into training/validation sets so that you can use them at training time.

In [148]:
cut_off = int(len(Q1_train) * 0.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off:], Q2_train[cut_off:]

print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  111486
The length of the training set is:   89188
The length of the validation set is:  22298


### Learning question encoding

In [149]:
tf.random.set_seed(0)
text_vectorization = tf.keras.layers.TextVectorization(output_mode='int', split='whitespace', standardize='strip_punctuation')
text_vectorization.adapt(np.concatenate([Q1_train, Q2_train]))

In [150]:
print(f'Vocabulary size: {text_vectorization.vocabulary_size()}')

Vocabulary size: 36224


In [151]:
print('first question in the train set:\n')
print(Q1_train[0], '\n') 
print('encoded version:')
print(text_vectorization(Q1_train[0]),'\n')

print('first question in the test set:\n')
print(Q1_test[0], '\n')
print('encoded version:')
print(text_vectorization(Q1_test[0]) )

first question in the train set:

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 

encoded version:
tf.Tensor(
[ 6984     6   178    10  8988  2442 35393   761    13  6636 28205    31
    28   483    45    98], shape=(16,), dtype=int64) 

first question in the test set:

How do I prepare for interviews for cse? 

encoded version:
tf.Tensor([    4     8     6   160    17  2079    17 11775], shape=(8,), dtype=int64)


### Creating the Siamese network

In [152]:
def Siamese(text_vectorizer, vocab_size=36224, d_feature=128):
    branch = tf.keras.models.Sequential(name='sequential')
    branch.add(text_vectorizer)
    branch.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=d_feature, name='embedding'))
    branch.add(tf.keras.layers.LSTM(units = d_feature, return_sequences=True, name='LSTM'))
    branch.add(tf.keras.layers.GlobalAveragePooling1D(name='mean'))
    branch.add(tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x), name='out'))

    input1 = tf.keras.layers.Input(shape=(1,), dtype=tf.dtypes.string, name='input_1')
    input2 = tf.keras.layers.Input(shape=(1,), dtype=tf.dtypes.string, name='input_2')

    branch1 = branch(input1)
    branch2 = branch(input2)

    conc = tf.keras.layers.Concatenate(axis=1, name='conc_1_2')([branch1, branch2])

    return tf.keras.models.Model(inputs = [input1, input2], outputs=conc, name='SiameseModel')

The Siamese model you're asking about is designed using TensorFlow and consists of several layers where two inputs go through the same series of transformations and are then combined. Let's dissect the model, focusing on the input and output dimensions of each layer, and explain the role of the LSTM layer and GlobalAveragePooling1D.

**Input Layer**

The model begins with two separate input layers, each accepting a single string:

```python
input1 = tf.keras.layers.Input(shape=(1,), dtype=tf.dtypes.string, name='input_1')
input2 = tf.keras.layers.Input(shape=(1,), dtype=tf.dtypes.string, name='input_2')
```

Here, shape=(1,) indicates each input is a single string.


**Text Vectorization**

Following the inputs, the first layer within the shared branch is a text vectorizer:

```python
branch.add(text_vectorizer)
```

This layer transforms each string input into a sequence of integers, representing words or tokens from the text. The output dimension depends on the configuration of the vectorizer but typically results in a shape like (batch_size, sequence_length) where sequence_length depends on the length of the longest sequence after vectorization.


**Embedding Layer**

The next layer is an Embedding layer:

```python
branch.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=d_feature, name='embedding'))
```

The embedding layer converts the integer-encoded text into dense vectors of fixed size (128 in this case). It takes an input of shape (batch_size, sequence_length) and outputs an array of shape (batch_size, sequence_length, d_feature).


**LSTM Layer**

The LSTM layer follows:

```python
branch.add(tf.keras.layers.LSTM(units=d_feature, return_sequences=True, name='LSTM'))
```

This layer is configured to return sequences, meaning instead of returning just the final output of the LSTM (common in many applications like classification), it returns an output for each timestep. This is critical for the next pooling layer to compute features across all timesteps. The output shape here is (batch_size, sequence_length, d_feature).


**GlobalAveragePooling1D**

```python
branch.add(tf.keras.layers.GlobalAveragePooling1D(name='mean'))
```

GlobalAveragePooling1D works by taking the average over the time dimension (sequence_length), effectively reducing the features of each timestep into a single vector. This operation reduces the output shape from (batch_size, sequence_length, d_feature) to (batch_size, d_feature). It helps to reduce the model's complexity by summarizing the information in the sequence, making it useful for capturing the overall sentiment or characteristics of the input sequence without overfitting to specific words.


**Normalization Layer**

A Lambda layer normalizes the data:

```python
branch.add(tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x)), name='out')
```

Normalization (here L2 normalization) ensures that the vector length is scaled to 1, often useful for improving the stability of the learning process.


Combining Branches
The outputs from the two input branches are then concatenated:

```python
conc = tf.keras.layers.Concatenate(axis=1, name='conc_1_2')([branch1, branch2])
```

Since each branch outputs a normalized vector of shape (batch_size, d_feature), concatenating these along axis=1 results in an output shape of (batch_size, 2*d_feature).


**Final Model Assembly**

The final model combines everything:

```python
return tf.keras.models.Model(inputs=[input1, input2], outputs=conc, name='SiameseModel')
```

This setup is typical for tasks where the relationship between two inputs needs to be learned, such as comparing the similarity between texts, which is a common use case in Siamese networks.


**Summary**

The use of return_sequences=True in the LSTM layer is critical for the GlobalAveragePooling1D layer to function correctly, as it averages over all the timestep outputs from the LSTM, providing a robust summary of the entire input sequence. This model structure is effective for learning complex patterns in sequences and comparing them in a normalized feature space.

In [153]:
model = Siamese(text_vectorization, vocab_size=text_vectorization.vocabulary_size())
model.build(input_shape=(1,))
model.summary()
model.get_layer(name='sequential').summary()

In [154]:
tf.keras.utils.plot_model(model,
                          to_file='my_model.png',
                          show_shapes=True,
                          show_dtype=True,
                          show_layer_names=True,
                          rankdir="TB",
                          expand_nested=True)

You must install pydot (`pip install pydot`) for `plot_model` to work.


### Hard negative mining

In [155]:
def TripletLossFn(v1, v2, margin=0.25):
    """Custom Loss function.

    Args:
        v1 (numpy.ndarray or Tensor): Array with dimension (batch_size, model_dimension) associated to Q1.
        v2 (numpy.ndarray or Tensor): Array with dimension (batch_size, model_dimension) associated to Q2.
        margin (float, optional): Desired margin. Defaults to 0.25.

    Returns:
        triplet_loss (numpy.ndarray or Tensor)
    """
    scores = tf.linalg.matmul(a=v2, b=v1, transpose_b=True)
    batch_size = tf.cast(tf.shape(v1)[0], scores.dtype)
    
    positive = tf.linalg.diag_part(scores)
    negative_zero_on_duplicate = scores - tf.linalg.diag(positive)
    mean_negative = tf.reduce_sum(negative_zero_on_duplicate, axis=1) / (batch_size - 1)

    mask_exclude_positives = tf.cast((tf.eye(batch_size) == 1)|(negative_zero_on_duplicate > tf.expand_dims(positive, 1)),
                                    scores.dtype)
    negative_without_positive = negative_zero_on_duplicate - 2.0 * mask_exclude_positives
    closest_negative = tf.math.reduce_max(negative_without_positive, axis=1)
    
    triplet_loss1 = tf.maximum(closest_negative -  positive + margin, 0)
    triplet_loss2 = tf.maximum(mean_negative - positive + margin, 0)
    triplet_loss = tf.reduce_sum(triplet_loss1 + triplet_loss2)
    return triplet_loss
    



In [156]:
v1 = np.array([[0.26726124, 0.53452248, 0.80178373],[0.5178918 , 0.57543534, 0.63297887]])
v2 = np.array([[ 0.26726124,  0.53452248,  0.80178373],[-0.5178918 , -0.57543534, -0.63297887]])
print("Triplet Loss:", TripletLossFn(v1,v2))

Triplet Loss: tf.Tensor(0.703507682515891, shape=(), dtype=float64)


In [157]:
def TripletLoss(labels, out, margin=0.25):
    _, out_size = out.shape # get embedding size
    v1 = out[:,:int(out_size/2)] # Extract v1 from out
    v2 = out[:,int(out_size/2):] # Extract v2 from out
    return TripletLossFn(v1, v2, margin=margin)

### Train

In [158]:
train_dataset = tf.data.Dataset.from_tensor_slices(((train_Q1, train_Q2),tf.constant([1]*len(train_Q1))))
val_dataset = tf.data.Dataset.from_tensor_slices(((val_Q1, val_Q2),tf.constant([1]*len(val_Q1))))

In [159]:
def train_model(Siamese, TripletLoss, text_vectorizer, train_dataset, val_dataset, d_feature=128, lr=0.01, train_steps=5):
    """Training the Siamese Model

    Args:
        Siamese (function): Function that returns the Siamese model.
        TripletLoss (function): Function that defines the TripletLoss loss function.
        text_vectorizer: trained instance of `TextVecotrization` 
        train_dataset (tf.data.Dataset): Training dataset
        val_dataset (tf.data.Dataset): Validation dataset
        d_feature (int, optional) = size of the encoding. Defaults to 128.
        lr (float, optional): learning rate for optimizer. Defaults to 0.01
        train_steps (int): number of epochs
        
    Returns:
        tf.keras.Model
    """
    ## START CODE HERE ###

    # Instantiate your Siamese model
    model = Siamese(text_vectorizer,
                    vocab_size = len(text_vectorizer.get_vocabulary()), #set vocab_size accordingly to the size of your vocabulary
                    d_feature = d_feature)
    # Compile the model
    model.compile(loss=TripletLoss,
                  optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
            )
    # Train the model 
    model.fit(train_dataset,
              epochs = train_steps,
              validation_data = val_dataset,
             )
             
    ### END CODE HERE ###

    return model

In [160]:
train_steps = 6
batch_size = 256
train_generator = train_dataset.shuffle(len(train_Q1),
                                        seed=7, 
                                        reshuffle_each_iteration=True).batch(batch_size=batch_size)
val_generator = val_dataset.shuffle(len(val_Q1), 
                                   seed=7,
                                   reshuffle_each_iteration=True).batch(batch_size=batch_size)
model = train_model(Siamese, TripletLoss,text_vectorization, 
                                            train_generator, 
                                            val_generator, 
                                            train_steps=train_steps,)

Epoch 1/6
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 135ms/step - loss: 127.2549 - val_loss: 126.9564
Epoch 2/6
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 172ms/step - loss: 126.8748 - val_loss: 126.3775
Epoch 3/6
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 173ms/step - loss: 126.5148 - val_loss: 126.3211
Epoch 4/6
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 179ms/step - loss: 126.4039 - val_loss: 126.2875
Epoch 5/6
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 179ms/step - loss: 126.3733 - val_loss: 126.2338
Epoch 6/6
[1m349/349[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 179ms/step - loss: 126.3391 - val_loss: 126.1884


### Evaluation

In [161]:
model.summary()

In [162]:
def classify(test_Q1, test_Q2, y_test, threshold, model, batch_size=64, verbose=True):
    """Function to test the accuracy of the model.

    Args:
        test_Q1 (numpy.ndarray): Array of Q1 questions. Each element of the array would be a string.
        test_Q2 (numpy.ndarray): Array of Q2 questions. Each element of the array would be a string.
        y_test (numpy.ndarray): Array of actual target.
        threshold (float): Desired threshold
        model (tensorflow.Keras.Model): The Siamese model.
        batch_size (int, optional): Size of the batches. Defaults to 64.

    Returns:
        float: Accuracy of the model
        numpy.array: confusion matrix
    """
    y_pred = []
    test_gen = tf.data.Dataset.from_tensor_slices(((test_Q1, test_Q2),None)).batch(batch_size=batch_size)
    
    ### START CODE HERE ###

    pred = model.predict(test_gen)
    _, n_feat = pred.shape
    v1 = pred[:,:int(n_feat/2)]
    v2 = pred[:,int(n_feat/2):]
    
    # Compute the cosine similarity. Using `tf.math.reduce_sum`. 
    # Don't forget to use the appropriate axis argument.
    dot = tf.math.reduce_sum(v1*v2, axis = 1)
    norm = tf.math.sqrt(
        tf.math.reduce_sum(v1 * v1, axis=1)
        *
        tf.math.reduce_sum(v2 * v2, axis=1))
    d  = dot / norm
    # Check if d>threshold to make predictions
    y_pred = tf.cast(d>threshold, tf.float64)
    # take the average of correct predictions to get the accuracy
    accuracy = tf.math.reduce_mean(tf.cast(tf.equal(y_pred, y_test), tf.float64))
    # compute the confusion matrix using `tf.math.confusion_matrix`
    cm = tf.math.confusion_matrix(y_test, y_pred)
    
    ### END CODE HERE ###
    
    return accuracy, cm

In [163]:
# this takes around 1 minute
accuracy, cm = classify(Q1_test,Q2_test, y_test, 0.7, model,  batch_size = 512) 
print("Accuracy", accuracy.numpy())
print(f"Confusion matrix:\n{cm.numpy()}")

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 75ms/step
Accuracy 0.668359375
Confusion matrix:
[[4237 2145]
 [1251 2607]]


In [164]:
def predict(question1, question2, threshold, model, verbose=False):
    """Function for predicting if two questions are duplicates.

    Args:
        question1 (str): First question.
        question2 (str): Second question.
        threshold (float): Desired threshold.
        model (tensorflow.keras.Model): The Siamese model.
        verbose (bool, optional): If the results should be printed out. Defaults to False.

    Returns:
        bool: True if the questions are duplicates, False otherwise.
    """
    generator = tf.data.Dataset.from_tensor_slices((([question1], [question2]),None)).batch(batch_size=1)
    
    ### START CODE HERE ###
    
    # Call the predict method of your model and save the output into v1v2
    v1v2 = model.predict(generator)
    # Extract v1 and v2 from the model output
    v1 = v1v2[:,:int(v1v2.shape[1]/2)]
    v2 = v1v2[:,int(v1v2.shape[1]/2):]
    # Take the dot product to compute cos similarity of each pair of entries, v1, v2
    # Since v1 and v2 are both vectors, use the function tf.math.reduce_sum instead of tf.linalg.matmul
    dot = tf.math.reduce_sum(v1*v2, axis = 1)
    norm = tf.math.sqrt(
        tf.math.reduce_sum(v1 * v1, axis=1)
        *
        tf.math.reduce_sum(v2 * v2, axis=1))
    d = dot / norm
    # Is d greater than the threshold?
    res = d > threshold

    ### END CODE HERE ###
    
    if(verbose):
        print("Q1  = ", question1, "\nQ2  = ", question2)
        print("d   = ", d.numpy())
        print("res = ", res.numpy())

    return res.numpy()

In [165]:
# Feel free to try with your own questions
question1 = "When will I see you?"
question2 = "When can I see you again?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, verbose = True)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
Q1  =  When will I see you? 
Q2  =  When can I see you again?
d   =  [0.6716684]
res =  [False]


array([False])

In [166]:
# Feel free to try with your own questions
question1 = "Do they enjoy eating the dessert?"
question2 = "Do they like hiking in the desert?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, verbose=True)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Q1  =  Do they enjoy eating the dessert? 
Q2  =  Do they like hiking in the desert?
d   =  [0.01285205]
res =  [False]


array([False])