<a href="https://colab.research.google.com/github/mabdullah1994/deep-learning/blob/master/Siamese_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
from string import punctuation

import torch
import gensim
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, WeightedRandomSampler
from torch.autograd import Variable

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print ('Device: ', device)

Device:  cuda:0


**Global Variables**

In [0]:
ROOT_PATH = '/content/drive/My Drive/QuoraQuestionsSimilarity/'
TRAIN_FILE_PATH = ROOT_PATH + 'train.csv'
EMBEDDING_PATH = '/content/drive/My Drive/GoogleNews-vectors-negative300.bin'
EMBEDDING_DIMENSION = 300
EMBEDDING_REQUIRES_GRAD = False
HIDDEN_CELLS = 50
NUM_LAYERS = 1

**Load Train File and check the distribution of Duplicate Questions**

In [19]:
df_train = pd.read_csv(TRAIN_FILE_PATH)
print ('Percentage of Duplicate Questions Pair: ', df_train['is_duplicate'].mean() * 100)

Percentage of Duplicate Questions Pair:  36.9197853026293


**Data Cleansing**

In [0]:
# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    text = text.strip()
    return text

**Convert train data into list of tuples where each tuple is of the form (question1, question2)**

In [21]:
train_questions_pair = []
train_labels = []
for _, row in df_train.iterrows():
    
    q1 = text_to_wordlist(str(row['question1']))
    q2 = text_to_wordlist(str(row['question2']))
    label = int(row['is_duplicate'])
    if q1 and q2:
        train_questions_pair.append((
                q1, q2
            ))
        train_labels.append(label)

print ('Train Data Question Pairs: ', len(train_questions_pair))


Train Data Question Pairs:  404270


**Create a Language class that will keep track of the dataset vocabulary and corresponding indices**

In [0]:
class Language:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words + 1
            self.word2count[word] = 1
            self.index2word[self.n_words + 1] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

language = Language()
for data in [train_questions_pair]:
    for question_pair in data:
        q1 = question_pair[0]
        q2 = question_pair[1]
        language.addSentence(q1)
        language.addSentence(q2)

**Create a dataset class which can be indexed to retrieve Questions Pair along with corresponding Label**

In [0]:
class QuestionsDataset(Dataset):
    def __init__(self, questions_list, word2index, labels):
        self.questions_list = questions_list
        self.labels = labels
        self.word2index = word2index
        
    def __len__(self):
        return len(self.questions_list)
    
    def __getitem__(self, index):
        questions_pair = self.questions_list[index]
        q1 = questions_pair[0]
        q1_indices = []
        for word in q1.split():
            q1_indices.append(self.word2index[word])
            
        q2 = question_pair[1]
        q2_indices = []
        for word in q2.split():
            q2_indices.append(self.word2index[word])
            
        # q1_indices and q2_indices are lists of indices against words used in the sentence 
        return q1_indices, q2_indices, self.labels[index]
    
train_dataset = QuestionsDataset(train_questions_pair, language.word2index, train_labels)

In [25]:
n_vocabulary_words = len(language.word2index)
print ('Total Unique Vocabulary Words: ', n_vocabulary_words)

Total Unique Vocabulary Words:  86001


**Custom Collate is implemented to adjust the data in the desired format and calculate lengths which will later be used for padding and packing.**

In [0]:
class CustomCollate:
    def custom_collate(self, batch):

        # batch = list of tuples where each tuple is of the form ([i1, i2, i3], [j1, j2, j3], label)
        q1_list = []
        q2_list = []
        labels = []
        for training_example in batch:
          q1_list.append(training_example[0])
          q2_list.append(training_example[1])
          labels.append(training_example[2])
          
        
        q1_lengths = [len(q) for q in q1_list]
        q2_lengths = [len(q) for q in q2_list]
        
        return q1_list, q1_lengths, q2_list, q2_lengths, labels

    def __call__(self, batch):
        return self.custom_collate(batch)

**Split Training Data into Train and Validation Set**

In [38]:
validation_split = 0.2
dataset_size = len(train_dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
shuffle_dataset = True
random_seed = 32

if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_indices)
validation_sampler = SubsetRandomSampler(val_indices)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, sampler=train_sampler, collate_fn=CustomCollate())
val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, sampler=validation_sampler, collate_fn=CustomCollate())

print ('Training Set Size {}, Validation Set Size {}'.format(len(train_indices), len(val_indices)))

Training Set Size 323416, Validation Set Size 80854


**Create Embeding Matrix for the dataset vocabulary using pre-trained Word2Vec Embeddings**

In [0]:
# Load pre-trained embeddings from word2vec
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=True)
# Convert word2vec embeddings into FloatTensor
word2vec_weights = torch.FloatTensor(word2vec_model.vectors)

# Create a random weight tensor of the shape (n_vocabulary_words + 1, EMBEDDING_DIMENSION) and place each word's embedding from word2vec at the index assigned to that word
# Two key points:
# 1. Weights tensor has been initialized randomly so that the words which are part of our dataset vocabulary but are not present in word2vec are given a random embedding
# 2. Embedding at 0 index is all zeros. This is the embedding for the padding that we will do for batch processing
weights = torch.randn(n_vocabulary_words + 1, EMBEDDING_DIMENSION)
weights[0] = torch.zeros(EMBEDDING_DIMENSION)
for word, lang_word_index in language.word2index.items():
    if word in word2vec_model:
        weights[lang_word_index] = torch.FloatTensor(word2vec_model.word_vec(word))

del word2vec_model
del word2vec_weights

**Siamese Network with single LSTM**

In [0]:
class SiameseNetwork(nn.Module):
    def __init__(self, pretrained_weights):
        super(SiameseNetwork, self).__init__()
        # Creating embedding object from the pre-trained weights
        self.embedding = nn.Embedding.from_pretrained(pretrained_weights)
        self.embedding.weight.requires_grad = EMBEDDING_REQUIRES_GRAD
        # Create a single LSTM since this is a Siamese Network and the weights are shared
        self.lstm = nn.LSTM(input_size=EMBEDDING_DIMENSION, hidden_size=HIDDEN_CELLS, num_layers = NUM_LAYERS, batch_first = True)
    
    # Manhattan Distance Calculator
    def exponent_neg_manhattan_distance(self, x1, x2):
        return torch.exp(-torch.sum(torch.abs(x1 - x2), dim=0)).to(device)

    def forward_once(self, x, input_lengths):
      
        # x is of the shape (batch_dim, sequence)
        # e.g. x = [
        #  [i1, i2, i3],
        #  [j1, j2, j3, j4]
        # ]
        
        # input_lengths is the list that contains the sequence lengths for each sequence
        # e.g. input_lengths = [3, 4]
        
        # Reverse sequence lengths indices in decreasing order as per the requirement from PyTorch before Padding and Packing
        sorted_indices = np.flipud(np.argsort(input_lengths))
        input_lengths = np.flipud(np.sort(input_lengths))
        input_lengths = input_lengths.copy() # https://github.com/facebookresearch/InferSent/issues/99
        
        # Reorder questions in the decreasing order of their lengths
        ordered_questions = [torch.LongTensor(x[i]).to(device) for i in sorted_indices]
        # Pad sequences with 0s to the max length sequence in the batch
        ordered_questions = torch.nn.utils.rnn.pad_sequence(ordered_questions, batch_first=True)
        # Retrieve Embeddings
        embeddings = self.embedding(ordered_questions).to(device)
        # Pack the padded sequences and pass it through LSTM
        packed = torch.nn.utils.rnn.pack_padded_sequence(embeddings, input_lengths, batch_first=True)
        out, (hn, cn) = self.lstm(packed)
        unpacked, unpacked_len = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=int(input_lengths[0]))
        
        # The following step reorders the calculated activations to the original order in which questions were passed
        result = torch.FloatTensor(unpacked.size())
        for i, encoded_matrix in enumerate(unpacked):
            result[sorted_indices[i]] = encoded_matrix
        return result

    def forward(self, q1, q1_lengths, q2, q2_lengths):
        output1 = self.forward_once(q1, q1_lengths)
        output2 = self.forward_once(q2, q2_lengths)
        similarity_score = torch.zeros(output1.size()[0]).to(device)
        # Calculate Similarity Score between both questions in a single pair
        for index in range(output1.size()[0]):
            # Sequence lenghts are being used to index and retrieve the activations before the zero padding since they were not part of original question
            q1 = output1[index, q1_lengths[index] - 1, :]
            q2 = output2[index, q2_lengths[index] - 1, :]
            similarity_score[index] = self.exponent_neg_manhattan_distance(q1, q2)
        return similarity_score
    
model = SiameseNetwork(weights).to(device)

**Create a Loss function and an Optimizer. In this case, we have created Mean Squared Error as our Loss function and Adam as an Optimizer.**

In [0]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01 )
num_epochs = 50

**Let's train the model !!!**

In [0]:
total_step = len(train_loader)
# Threshold 0.5. Since similarity score will be a value between 0 and 1, we will consider all question pair with values greater than threshold as Duplicate
threshold = torch.Tensor([0.5]).to(device)

for epoch in range(num_epochs):
    loss_history = []
    model.train(True)
    train_correct_total = 0
    for i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels) in enumerate(train_loader):

        labels = torch.FloatTensor(labels).to(device)
        
        # Clear grads
        optimizer.zero_grad()
        
        # Run the forward pass
        similarity_score = model(q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths)
        predictions = (similarity_score > threshold).float() * 1
        total = labels.size()[0]
        correct = (predictions == labels).sum().item()
        train_correct_total += correct
        
        # Calculate Loss
        loss = criterion(similarity_score, labels)
        
        # Calculate gradients
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        if (i + 1) % 100 == 0:
            loss_history.append(loss.item())
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.4f}'.format(epoch + 1, num_epochs, i + 1, total_step, np.mean(loss_history), (correct / total) * 100))
            
    print('Training Loss: {:.4f}, Training Accuracy: {:.4f}'.format(np.mean(loss_history), (train_correct_total / len(train_indices)) * 100))
    
    model.train(False)
    val_correct_total = 0
    with torch.no_grad():
        for i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels) in enumerate(val_loader):

            labels = torch.FloatTensor(labels).to(device)

            similarity_score = model(q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths)
            predictions = (similarity_score > threshold).float() * 1
            total = labels.size()[0]
            correct = (predictions == labels).sum().item()
            val_correct_total += correct
        
        avg_acc_val =  val_correct_total * 100 / len(val_indices)
        print ('Validation Set Size {}, Correct in Validation {}, Validation Accuracy {:.2f}%'.format(len(val_indices), val_correct_total, avg_acc_val))
