In [1]:
import pandas as pd
from gensim import models
import numpy as np
import gensim
import gensim.downloader
from tqdm import tqdm
import time
import math

## Dataset

In [2]:
df = pd.read_csv('data/qqp/train_v1.csv', header=0, index_col=0)
df

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,64665,92664,What is the total number of MBBS seats availab...,What is the total number of MBBS seats (govt a...,0
1,514255,177254,What are the difference between the hard drive...,"Which is the best external hard drive, Seagate...",0
2,250407,468939,Does IQ increase with age?,I got glasses around age 14 which is about -1 ...,0
3,9610,9611,How did caterpillars evolve to digest themselv...,What is the evolutionary advantage of metamorp...,0
4,297688,442567,My brother had hemorrhage on his right brain a...,"A teenaged cousin brother, who is sharp and in...",0
...,...,...,...,...,...
404282,397063,397064,What's the best thing to do in Goa?,What is the best thing we can do in Goa?,1
404283,210926,210927,Can we write in our own words in IPCC theory s...,What's the saddest story you can write in six ...,0
404284,67643,67644,What is an aristocracy?,What is aristocracy?,1
404285,143972,143973,How does Stripes compare to Spring MVC?,"Which is better, Play Framework or Spring MVC?...",0


In [3]:
df['question'] = (df['question1'] + "|||" + df['question2'])

In [4]:
df[df.is_duplicate == 1].shape[0]

149263

In [5]:
df[df.is_duplicate == 0].shape[0]

255024

In [6]:
question_list = df['question'].to_list()

question_list = [question.lower().split() for question in question_list]

labels = df['is_duplicate'].to_list()

In [7]:
max_length = -np.inf
for sentence in question_list:
    if len(sentence) > max_length:
        max_length = len(sentence)
        
for sentence in question_list:
    if len(sentence) > max_length:
        max_length = len(sentence)

In [8]:
max_length

269

## Word Embeddings

In [9]:
vector_size = 50
window_size = 5
negative_size = 15

wv_model_file = 'qqp_wv.pth'

In [10]:
# wv_model = gensim.downloader.load('glove-wiki-gigaword-50')
# wv_model = models.Word2Vec(sentences=sentences, vector_size=vector_size, window=window_size, negative=negative_size).wv
# wv_model = models.Word2Vec(corpus_file='data/corpus.txt', vector_size=vector_size, window=window_size, negative=negative_size).wv

# wv_model.save(wv_model_file)
# del wv_model

In [11]:
def vectorize_sentences(sentences, wv, sentence_size):
    voc = wv.key_to_index.keys()
    unk = wv['<unk>']
    eos = wv['<eos>']
    lengths = []
    for i, sentence in enumerate(sentences):
        lengths.append(len(sentence))
        
        for i, token in enumerate(sentence):
            if token in voc:
                sentence[i] = wv[token]
            else:
                sentence[i] = unk
        
        
        while len(sentence) < sentence_size:
            sentence.append(eos)
        
        
    return sentences, lengths

In [12]:
wv = models.KeyedVectors.load(wv_model_file)

In [13]:
wv.add_vectors(
    ['<unk>', '<eos>'],
    [np.zeros(wv.vector_size), np.ones(wv.vector_size)]
)

## BiLSTM

In [14]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [15]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [16]:
def save_model(model, file_name):
    torch.save(model.state_dict(), file_name)
def load_model(model, file_name):
    return model.load_state_dict(torch.load(file_name))

In [17]:
hidden_size = 50
sentence_size = 275
num_layers = 2
bidirectional = True

batch_size = 128

lr = 0.001
num_epochs = 20


eval_rate = 0.1

model_file = 'logs/qqp/lstm_qqp_v2.pth'

In [18]:
question_vec, question_lengths = vectorize_sentences(question_list, wv, sentence_size)

In [19]:
question_lengths = np.array(question_lengths)
labels = np.array(labels)

In [20]:
# vec_labels = np.zeros((labels.size, labels.max()+1))
# vec_labels[np.arange(labels.size), labels] = 1

In [21]:
# vectors = vec_sentences.reshape((-1, 50))

# mu = vectors.mean(axis=0)
# sigma = np.sqrt(((vectors - mu) ** 2).mean(axis=0))

# vec_sentences = (vec_sentences - mu) / sigma

In [22]:
eval_index = int(len(question_vec) * eval_rate)

question_train = question_vec[eval_index:]
question_eval = question_vec[:eval_index]

question_len_train = question_lengths[eval_index:]
question_len_eval = question_lengths[:eval_index]


label_train = labels[eval_index:]
label_eval = labels[:eval_index]

In [23]:
# question1_train = torch.tensor(question1_train, dtype=torch.float)
# question1_eval = torch.tensor(question1_eval, dtype=torch.float)

# question2_train = torch.tensor(question2_train, dtype=torch.float)
# question2_eval = torch.tensor(question2_eval, dtype=torch.float)

question_len_train = torch.tensor(question_len_train, dtype=torch.int)
question_len_eval = torch.tensor(question_len_eval, dtype=torch.int)

label_train = torch.tensor(label_train, dtype=torch.long)
label_eval = torch.tensor(label_eval, dtype=torch.long)

In [24]:
class Classifier(nn.Module):
    
    def __init__(self):
        super(Classifier, self).__init__()
        lstm_dim = hidden_size * 2 * (2 if bidirectional else 1)
        
        
        self.lstm = nn.LSTM(input_size=vector_size,
                              hidden_size=hidden_size,
                              num_layers=num_layers,
                              bidirectional=bidirectional,
                              batch_first=True
                             )

        
        self.fcnn_1 = nn.Linear(in_features=lstm_dim, out_features=64)
        
        self.fcnn_2 = nn.Linear(in_features=64, out_features=2)
        
    def forward(self, question, question_len):
        
        question = pack_padded_sequence(question, question_len.cpu(), batch_first=True, enforce_sorted=False)
        question, _ = self.lstm(question)
        question, _ = pad_packed_sequence(question, batch_first=True)
        
        
        avg_pool = torch.mean(question, 1)
        max_pool, _ = torch.max(question, 1)
        output = torch.cat([avg_pool, max_pool], dim=1)
        
        
        output = self.fcnn_1(output)
        output = torch.relu(output)
        
        output = self.fcnn_2(output)
        return output

In [25]:
# weights = torch.log(1/(train_y.sum(dim=0) / train_y.sum()))
# weights = weights.detach().to(device)
# weights

In [26]:
num_train_batch = int(len(question_train) / batch_size)
num_eval_batch = int(len(question_eval) / batch_size)

In [27]:
classifier = Classifier().to(device)

In [28]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

In [29]:
min_loss = np.inf

for i in range(num_epochs):
    print(f'---> Epoch {i} <---')
    time.sleep(0.5)
    
    classifier.train()
    loader = tqdm(range(num_train_batch), postfix={'Epoch': i})
    train_losses = []
    
    for i_batch in loader:
        question, question_len, targets = (
            question_train[i_batch*batch_size:(i_batch+1)*batch_size],
            question_len_train[i_batch*batch_size:(i_batch+1)*batch_size],
            label_train[i_batch*batch_size:(i_batch+1)*batch_size]
        )
        
        question = torch.tensor(np.array(question), dtype=torch.float, device=device)
        
        
        targets = targets.to(device)
        
        optimizer.zero_grad()
        
        outputs = classifier(question, question_len)
        
        loss = criterion(outputs, targets)
        
        loss.backward()
        optimizer.step()
        
        with torch.no_grad():
            train_losses.append(loss.item())
            loader.set_postfix({
                'Epoch': i,
                'Train loss': np.mean(train_losses)
            }, refresh=True)
    
    
    time.sleep(0.5)
    
    with torch.no_grad():
        classifier.eval()
        loader = tqdm(range(num_eval_batch), postfix={'Epoch': i,}, colour='green')
        eval_losses = []
        eval_scores = []

        for i_batch in loader:
            question, question_len, targets = (
                question_eval[i_batch*batch_size:(i_batch+1)*batch_size],
                question_len_eval[i_batch*batch_size:(i_batch+1)*batch_size],
                label_eval[i_batch*batch_size:(i_batch+1)*batch_size]
            )
            
            question = torch.tensor(question, dtype=torch.float, device=device)
            targets = targets.to(device)
            

            outputs = classifier(question, question_len)
            
            loss = criterion(outputs, targets)
            
            score = (outputs.argmax(dim=1) == targets).detach().cpu().numpy()
            eval_scores.append(score)
            eval_losses.append(loss.item())
            loader.set_postfix({
                'Epoch': i,
                'Eval loss': np.mean(eval_losses),
                'Eval score': np.concatenate(eval_scores).mean()
            }, refresh=True)


        eval_loss = np.mean(eval_losses)
        if eval_loss <= min_loss:
            min_loss = eval_loss
            save_model(classifier, model_file)
            loader.write('*** save ***')
        
    time.sleep(0.5)

---> Epoch 0 <---


100%|██████████| 2842/2842 [01:51<00:00, 25.51it/s, Epoch=0, Train loss=0.533]
  question = torch.tensor(question, dtype=torch.float, device=device)
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.98it/s, Epoch=0, Eval loss=0.496, Eval score=0.752]


*** save ***
---> Epoch 1 <---


100%|██████████| 2842/2842 [01:55<00:00, 24.57it/s, Epoch=1, Train loss=0.474]
100%|[32m██████████[0m| 315/315 [00:44<00:00,  7.06it/s, Epoch=1, Eval loss=0.47, Eval score=0.767] 


*** save ***
---> Epoch 2 <---


100%|██████████| 2842/2842 [01:53<00:00, 25.13it/s, Epoch=2, Train loss=0.446]
100%|[32m██████████[0m| 315/315 [00:44<00:00,  7.08it/s, Epoch=2, Eval loss=0.458, Eval score=0.774]


*** save ***
---> Epoch 3 <---


100%|██████████| 2842/2842 [01:53<00:00, 25.11it/s, Epoch=3, Train loss=0.424]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.88it/s, Epoch=3, Eval loss=0.451, Eval score=0.781]


*** save ***
---> Epoch 4 <---


100%|██████████| 2842/2842 [01:54<00:00, 24.77it/s, Epoch=4, Train loss=0.407]
100%|[32m██████████[0m| 315/315 [00:46<00:00,  6.78it/s, Epoch=4, Eval loss=0.448, Eval score=0.785]


*** save ***
---> Epoch 5 <---


100%|██████████| 2842/2842 [01:56<00:00, 24.48it/s, Epoch=5, Train loss=0.391]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.94it/s, Epoch=5, Eval loss=0.448, Eval score=0.788]


*** save ***
---> Epoch 6 <---


100%|██████████| 2842/2842 [01:54<00:00, 24.90it/s, Epoch=6, Train loss=0.377]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.97it/s, Epoch=6, Eval loss=0.449, Eval score=0.788]


---> Epoch 7 <---


100%|██████████| 2842/2842 [01:54<00:00, 24.88it/s, Epoch=7, Train loss=0.364]
100%|[32m██████████[0m| 315/315 [00:44<00:00,  7.01it/s, Epoch=7, Eval loss=0.45, Eval score=0.79]  


---> Epoch 8 <---


100%|██████████| 2842/2842 [01:53<00:00, 25.14it/s, Epoch=8, Train loss=0.353]
100%|[32m██████████[0m| 315/315 [00:46<00:00,  6.83it/s, Epoch=8, Eval loss=0.452, Eval score=0.79] 


---> Epoch 9 <---


100%|██████████| 2842/2842 [01:54<00:00, 24.75it/s, Epoch=9, Train loss=0.342]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.86it/s, Epoch=9, Eval loss=0.459, Eval score=0.789]


---> Epoch 10 <---


100%|██████████| 2842/2842 [01:53<00:00, 24.99it/s, Epoch=10, Train loss=0.333]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.96it/s, Epoch=10, Eval loss=0.466, Eval score=0.79] 


---> Epoch 11 <---


100%|██████████| 2842/2842 [01:50<00:00, 25.63it/s, Epoch=11, Train loss=0.324]
100%|[32m██████████[0m| 315/315 [00:43<00:00,  7.27it/s, Epoch=11, Eval loss=0.476, Eval score=0.79] 


---> Epoch 12 <---


100%|██████████| 2842/2842 [01:51<00:00, 25.41it/s, Epoch=12, Train loss=0.315]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.98it/s, Epoch=12, Eval loss=0.491, Eval score=0.788]


---> Epoch 13 <---


100%|██████████| 2842/2842 [01:53<00:00, 24.94it/s, Epoch=13, Train loss=0.307]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.94it/s, Epoch=13, Eval loss=0.504, Eval score=0.789]


---> Epoch 14 <---


100%|██████████| 2842/2842 [01:54<00:00, 24.88it/s, Epoch=14, Train loss=0.3]  
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.88it/s, Epoch=14, Eval loss=0.515, Eval score=0.788]


---> Epoch 15 <---


100%|██████████| 2842/2842 [01:52<00:00, 25.16it/s, Epoch=15, Train loss=0.292]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.92it/s, Epoch=15, Eval loss=0.528, Eval score=0.786]


---> Epoch 16 <---


100%|██████████| 2842/2842 [01:55<00:00, 24.69it/s, Epoch=16, Train loss=0.286]
100%|[32m██████████[0m| 315/315 [00:46<00:00,  6.77it/s, Epoch=16, Eval loss=0.547, Eval score=0.786]


---> Epoch 17 <---


100%|██████████| 2842/2842 [01:53<00:00, 24.94it/s, Epoch=17, Train loss=0.279]
100%|[32m██████████[0m| 315/315 [00:44<00:00,  7.04it/s, Epoch=17, Eval loss=0.569, Eval score=0.787]


---> Epoch 18 <---


100%|██████████| 2842/2842 [01:53<00:00, 25.14it/s, Epoch=18, Train loss=0.274]
100%|[32m██████████[0m| 315/315 [00:46<00:00,  6.72it/s, Epoch=18, Eval loss=0.564, Eval score=0.786]


---> Epoch 19 <---


100%|██████████| 2842/2842 [01:54<00:00, 24.75it/s, Epoch=19, Train loss=0.269]
100%|[32m██████████[0m| 315/315 [00:45<00:00,  6.90it/s, Epoch=19, Eval loss=0.587, Eval score=0.784]
