<a href="https://colab.research.google.com/github/mr-alamdari/NLP-Question-Duplicates-Beginner/blob/main/NLP_Question_Duplicates_Beginner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import nltk
import numpy as np
import pandas as pd
import random as rnd

In [None]:
!pip install trax
import trax
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp

Collecting trax
  Downloading trax-1.4.1-py2.py3-none-any.whl (637 kB)
[?25l[K     |▌                               | 10 kB 21.4 MB/s eta 0:00:01[K     |█                               | 20 kB 13.5 MB/s eta 0:00:01[K     |█▌                              | 30 kB 9.3 MB/s eta 0:00:01[K     |██                              | 40 kB 3.8 MB/s eta 0:00:01[K     |██▋                             | 51 kB 3.8 MB/s eta 0:00:01[K     |███                             | 61 kB 4.5 MB/s eta 0:00:01[K     |███▋                            | 71 kB 4.6 MB/s eta 0:00:01[K     |████                            | 81 kB 4.7 MB/s eta 0:00:01[K     |████▋                           | 92 kB 5.2 MB/s eta 0:00:01[K     |█████▏                          | 102 kB 4.4 MB/s eta 0:00:01[K     |█████▋                          | 112 kB 4.4 MB/s eta 0:00:01[K     |██████▏                         | 122 kB 4.4 MB/s eta 0:00:01[K     |██████▊                         | 133 kB 4.4 MB/s eta 0:00:01[K   

In [8]:
def data_generator(Q1, Q2, batch_size, pad=1, shuffle=True):
    """Endlessly yield padded batches of aligned question pairs.

    Args:
        Q1: indexable collection of questions, each a list of token ids.
        Q2: collection aligned with Q1 (Q2[i] is paired with Q1[i]).
        batch_size: number of question pairs in every yielded batch.
        pad: token id appended to the right of shorter questions.
        shuffle: if True, visit the pairs in random order and reshuffle
            every time the data is exhausted.

    Yields:
        (b1, b2): two numpy arrays built from `batch_size` questions each,
        right-padded with `pad` to a common length that is the smallest
        power of two >= the longest question in the batch.

    Raises:
        ValueError: on first iteration, if Q1 is empty or batch_size < 1.
    """
    len_q = len(Q1)
    if len_q == 0:
        raise ValueError("data_generator needs at least one question pair")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    question_indexes = [*range(len_q)]
    if shuffle:
        rnd.shuffle(question_indexes)

    input1, input2 = [], []
    idx = 0
    while True:
        if idx >= len_q:
            # End of an epoch: restart from the first position. (Resetting to
            # len_q here would index past the end of question_indexes.)
            idx = 0
            if shuffle:
                rnd.shuffle(question_indexes)

        position = question_indexes[idx]
        idx += 1
        input1.append(Q1[position])
        input2.append(Q2[position])

        if len(input1) == batch_size:
            longest = max(max(len(q) for q in input1), max(len(q) for q in input2))
            # Smallest power of two >= longest, computed exactly on integers;
            # clamping to 1 keeps a batch of empty questions from failing.
            max_len = 1 << (max(longest, 1) - 1).bit_length()
            b1 = [q + [pad] * (max_len - len(q)) for q in input1]
            b2 = [q + [pad] * (max_len - len(q)) for q in input2]
            yield np.array(b1), np.array(b2)
            input1, input2 = [], []
            

In [9]:
# Smoke test: pull one batch of two question pairs from the training data.
# train_Q1 / train_Q2 are not defined in the cells shown here; they must already
# exist in the kernel when this cell runs.
batch_size = 2
res1, res2 = next(data_generator(train_Q1, train_Q2, batch_size))

In [10]:
def normalize(x):
    """Divide `x` by its L2 norm taken along the last axis (norm kept as a size-1 axis)."""
    squared_norm = fastnp.sum(x * x, axis=-1, keepdims=True)
    return x / fastnp.sqrt(squared_norm)

In [11]:
def Siamese(vocab_size=None, d_model=128, mode='train'):
    """Build the two-branch question encoder.

    Each branch is Embedding -> LSTM -> mean over axis 1 -> L2 normalisation.
    The same layer instance is placed on both branches of `tl.Parallel`.

    Args:
        vocab_size: number of rows in the embedding table. When omitted, the
            size of the notebook-level `vocab` mapping is used, so that the
            zero-argument `Siamese()` calls elsewhere in the notebook work.
            (Assumes `vocab` is defined before the first such call.)
        d_model: embedding width and LSTM size.
        mode: accepted for interface compatibility; not used by this body.

    Returns:
        A `tl.Parallel` layer that maps a pair of inputs to a pair of outputs.
    """
    if vocab_size is None:
        # Resolved at call time, not definition time, so this cell can be run
        # before the vocabulary is built.
        vocab_size = len(vocab)

    q_processor = tl.Serial(
        tl.Embedding(vocab_size, d_model),
        tl.LSTM(d_model),
        tl.Mean(axis=1),
        tl.Fn('Normalize', normalize),
    )

    return tl.Parallel(q_processor, q_processor)

In [12]:
def TripletLossFn(v1, v2, margin=0.25):
    """Triplet loss with hard-negative and mean-negative terms.

    Row i of `v1` and row i of `v2` form a positive pair; every other row
    of `v2` is treated as a negative for row i of `v1`.

    Args:
        v1: 2-D array, one vector per row.
        v2: 2-D array with the same shape as `v1`.
        margin: required gap between positive and negative similarity.

    Returns:
        Scalar: mean over rows of
        max(0, margin - positive + closest_negative)
        + max(0, margin - positive + mean_negative).
    """
    scores = fastnp.dot(v1, v2.T)
    batch_size = len(scores)
    identity = fastnp.eye(batch_size)

    positive = fastnp.diagonal(scores)

    # Subtracting 2 on the diagonal pushes the positive score down so the
    # row-wise max lands on an off-diagonal (negative) entry.
    # Assumes scores lie in [-1, 1], i.e. inputs are unit-normalised.
    closest_negative = (scores - 2.0 * identity).max(axis=1)

    # Zero out the diagonal, then average over the off-diagonal entries.
    # Use fastnp (not plain numpy) like the rest of the function, and never
    # divide by zero when the batch holds a single pair.
    off_diagonal = scores * (1.0 - identity)
    n_negatives = max(batch_size - 1, 1)
    mean_negative = fastnp.sum(off_diagonal, axis=1) / n_negatives

    triplet_loss1 = fastnp.maximum(0.0, margin - positive + closest_negative)
    triplet_loss2 = fastnp.maximum(0.0, margin - positive + mean_negative)
    return fastnp.mean(triplet_loss1 + triplet_loss2)

In [None]:
# Sanity check of TripletLossFn on two hand-written pairs of vectors.
# The loss is evaluated once and printed; the earlier bare call whose result
# was discarded has been dropped.
v1 = np.array([[0.26726124, 0.53452248, 0.80178373],[0.5178918 , 0.57543534, 0.63297887]])
v2 = np.array([[ 0.26726124,  0.53452248,  0.80178373],[-0.5178918 , -0.57543534, -0.63297887]])
print("Triplet Loss:", TripletLossFn(v2,v1))

In [None]:
from functools import partial
def TripletLoss(margin=0.25):
    """Return a `tl.Fn` layer named 'TripletLoss' that calls TripletLossFn with `margin` pre-bound."""
    # The margin is bound as a keyword through functools.partial before the
    # function is handed to tl.Fn.
    return tl.Fn('TripletLoss', partial(TripletLossFn, margin=margin))

In [None]:
# Rebinds the notebook-level batch_size (an earlier cell set it to 2) for training.
batch_size = 256
# Endless batch generators over the training and validation pairs, padded with
# the id stored under vocab['<PAD>'].
train_generator = data_generator(train_Q1, train_Q2, batch_size, vocab['<PAD>'])
val_generator = data_generator(val_Q1, val_Q2, batch_size, vocab['<PAD>'])
# Uses .shape, so train_Q1 / val_Q1 must be array-like objects exposing it.
print('train_Q1.shape ', train_Q1.shape)
print('val_Q1.shape   ', val_Q1.shape)

In [14]:
# Learning-rate schedule handed to the training task below.
# NOTE(review): presumably 400 warm-up steps and a peak value of 0.01 — confirm
# the argument order against the trax.lr documentation.
lr_schedule = trax.lr.warmup_and_rsqrt_decay(400, 0.01)

def train_model(Siamese, TripletLoss, lr_schedule, train_generator, val_generator, output_dir='model/'):
    """Assemble a trax training loop for the Siamese network.

    Args:
        Siamese: callable invoked with no arguments to build the model.
        TripletLoss: callable invoked with no arguments to build the loss layer;
            it is called once for training and once for evaluation.
        lr_schedule: learning-rate schedule passed to the training task.
        train_generator: data source for the training task.
        val_generator: data source for the evaluation task.
        output_dir: directory handed to the loop; a leading '~' is expanded.

    Returns:
        The constructed `training.Loop`. It is not run here.
    """
    checkpoint_dir = os.path.expanduser(output_dir)

    fit_task = training.TrainTask(
        labeled_data=train_generator,
        loss_layer=TripletLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=lr_schedule,
    )

    validation_task = training.EvalTask(
        labeled_data=val_generator,
        metrics=[TripletLoss()],
    )

    return training.Loop(
        Siamese(),
        fit_task,
        eval_task=validation_task,
        output_dir=checkpoint_dir,
    )

In [15]:
train_steps = 100
# train_model declares train_generator and val_generator as required
# parameters, so they have to be passed explicitly here.
training_loop = train_model(Siamese, TripletLoss, lr_schedule, train_generator, val_generator)
training_loop.run(train_steps)

In [None]:
# Build a fresh Siamese network and initialise it from 'model.pkl.gz',
# a path relative to the current working directory.
# NOTE(review): train_model writes to output_dir='model/' by default — confirm
# that this file is the intended checkpoint and that it exists at this location.
model = Siamese()
model.init_from_file('model.pkl.gz')

In [17]:
def classify(test_Q1, test_Q2, y, threshold, model, vocab, data_generator=data_generator, batch_size=64):
    """Compute duplicate-detection accuracy over a test set.

    The questions are processed in chunks of at most `batch_size` pairs. A
    pair is predicted as duplicate when the dot product of its two encoded
    vectors is greater than `threshold`.

    Args:
        test_Q1: sliceable collection of first questions (lists of token ids).
        test_Q2: collection of second questions aligned with test_Q1.
        y: labels aligned with test_Q1, compared against the boolean prediction.
        threshold: similarity above which a pair counts as duplicate.
        model: callable taking a (q1_batch, q2_batch) tuple and returning (v1, v2).
        vocab: mapping that provides the padding id under '<PAD>'.
        data_generator: batch generator factory used for padding.
        batch_size: maximum number of pairs per chunk.

    Returns:
        Number of correct predictions divided by len(test_Q1).

    Raises:
        ZeroDivisionError: if test_Q1 is empty.
    """
    n_total = len(test_Q1)
    n_correct = 0
    for start in range(0, n_total, batch_size):
        q1_chunk = test_Q1[start:start + batch_size]
        q2_chunk = test_Q2[start:start + batch_size]
        y_chunk = y[start:start + batch_size]
        # The last chunk may hold fewer than batch_size pairs. Ask the generator
        # for exactly that many and score only those rows, so no row is missing
        # or duplicated.
        chunk_size = len(q1_chunk)
        q1, q2 = next(data_generator(q1_chunk, q2_chunk, chunk_size, vocab['<PAD>'], shuffle=False))
        v1, v2 = model((q1, q2))
        for j in range(chunk_size):
            d = np.dot(v1[j], v2[j].T)
            res = d > threshold
            n_correct += (y_chunk[j] == res)
    return n_correct / n_total

In [18]:
# Score the test pairs with a similarity threshold of 0.7, in chunks of 512.
# Q1_test, Q2_test and y_test are not defined in the cells shown here; they must
# already exist in the kernel when this cell runs.
accuracy = classify(Q1_test,Q2_test, y_test, 0.7, model, vocab, batch_size = 512) 

In [19]:
def predict(question1, question2, threshold, model, vocab, data_generator=data_generator, verbose=False):
    """Decide whether two raw question strings are duplicates.

    Both strings are tokenised with nltk, mapped to ids through `vocab`,
    padded into a one-pair batch by `data_generator`, and encoded by `model`.
    The pair is a duplicate when the dot product of the two encodings is
    greater than `threshold`.

    Args:
        question1: first question as text.
        question2: second question as text.
        threshold: similarity above which the pair counts as duplicate.
        model: callable taking a (q1_batch, q2_batch) tuple and returning (v1, v2).
        vocab: mapping from token to id; also provides the '<PAD>' id.
        data_generator: batch generator factory used for padding.
        verbose: if True, print the padded inputs, the similarity and the result.

    Returns:
        The result of `similarity > threshold`.
    """
    tokens1 = nltk.word_tokenize(question1)
    tokens2 = nltk.word_tokenize(question2)
    ids1 = [vocab[token] for token in tokens1]
    ids2 = [vocab[token] for token in tokens2]

    # A batch of size one: the generator only has to pad the two id lists.
    batch1, batch2 = next(data_generator([ids1], [ids2], 1, vocab['<PAD>']))
    v1, v2 = model((batch1, batch2))
    similarity = np.dot(v1[0], v2[0].T)
    is_duplicate = similarity > threshold

    if verbose:
        print("Q1  = ", batch1, "\nQ2  = ", batch2)
        print("d   = ", similarity)
        print("res = ", is_duplicate)

    return is_duplicate

In [None]:
# Try the trained model on one hand-written pair, using threshold 0.7 and
# printing the intermediate values (verbose = True).
question1 = "When will I see you?"
question2 = "When can I see you again?"
predict(question1 , question2, 0.7, model, vocab, verbose = True)