In [95]:
# Notebook metadata: assignment title, author name, and student id.
__title__ = "IR HW2"
__author__ = "Mohammadreza Ghofrani"
__stdid__ = "400131076"

In [97]:
import re
from copy import deepcopy
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim

import gensim
import gensim.downloader
from gensim.matutils import cossim

from transformers import AutoTokenizer, AutoModel

# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(1)

<torch._C.Generator at 0x7f125995d4d0>

In [98]:
class config:
    """Notebook-wide evaluation settings."""
    # Ranking depth: MAP inspects the first `top_k` predictions and the
    # RoBERTa retrieval cell keeps `top_k` documents per query.
    top_k = 10

# Evaluation Functions

In [99]:
def reciprocal_rank(ref, pred):
    """Return 1/rank of the first item of ``pred`` that occurs in ``ref``.

    Ranks are 1-based. Returns 0 when no predicted item is in ``ref``
    (including when ``pred`` is empty).
    """
    return next(
        (1 / position
         for position, candidate in enumerate(pred, start=1)
         if candidate in ref),
        0,
    )

def MRR(ref_set, pred_set):
    """Mean reciprocal rank over all queries in ``ref_set``.

    ``ref_set`` maps a query id to its relevant document ids and ``pred_set``
    maps the same query ids to ranked predictions. Every key of ``ref_set``
    must be present in ``pred_set``.
    """
    n_queries = len(ref_set)
    running_mean = 0
    for query_id, relevant in ref_set.items():
        # Each query contributes its reciprocal rank divided by the query count.
        running_mean += reciprocal_rank(relevant, pred_set[query_id]) / n_queries
    return running_mean

In [100]:
def p_at_k(ref, k_pred):
    """Precision of an already-truncated prediction prefix.

    Args:
        ref: iterable of relevant (hashable) document ids.
        k_pred: the first k predicted document ids.

    Returns:
        Fraction of distinct ids in ``k_pred`` that are relevant, divided by
        ``len(k_pred)``; ``0.0`` when ``k_pred`` is empty (previously this
        raised ``ZeroDivisionError``).
    """
    if len(k_pred) == 0:
        return 0.0
    n_relevant = len(set(k_pred).intersection(ref))
    return n_relevant / len(k_pred)

def MAP(ref_set, pred_set, k=None):
    """Mean average precision over the first ``k`` predictions of each query.

    Args:
        ref_set: mapping query id -> list of relevant document ids.
        pred_set: mapping query id -> ranked list of predicted document ids;
            must contain every key of ``ref_set``.
        k: ranking depth to inspect. Defaults to ``config.top_k`` so existing
            two-argument calls behave as before.

    Returns:
        The mean over queries of sum(precision@rank at each hit) / len(ref).
        A query with an empty prediction list contributes 0 (previously this
        raised ``IndexError``).
    """
    cutoff = config.top_k if k is None else k
    n_queries = len(ref_set)
    mean_average_precision = 0
    for query_id in ref_set:
        ref, pred = ref_set[query_id], pred_set[query_id]

        # Relevant ids not matched yet, so a repeated prediction is only
        # credited as often as the id occurs in `ref`.
        unmatched = list(ref)
        sum_avg_precision = 0
        # Iterating over the real prefix means a short or empty `pred` simply
        # yields fewer ranks instead of re-reading the last element.
        for rank, doc_id in enumerate(pred[:cutoff], start=1):
            if doc_id in unmatched:
                unmatched.remove(doc_id)
                sum_avg_precision += (p_at_k(ref, pred[:rank]) / len(ref))
        mean_average_precision += (sum_avg_precision / n_queries)

    return mean_average_precision

def AP(ref_set, pred_set, k):
    """Precision@k averaged over every query in ``ref_set``.

    For each query the first ``k`` entries of its prediction list are scored
    with ``p_at_k`` against the query's relevant ids.
    """
    n_queries = len(ref_set)
    running_mean = 0
    for query_id, relevant in ref_set.items():
        top_k_pred = pred_set[query_id][:k]
        running_mean += (p_at_k(relevant, top_k_pred) / n_queries)

    return running_mean

# Reading Dataset

In [None]:
# Load the train / validation / test splits (paths are relative to the working directory).
# Later cells read the columns qid1, question1, qid2, question2 and is_duplicate.
train_df = pd.read_csv('data/train_data.csv')
val_df = pd.read_csv('data/valid_data.csv')
test_df = pd.read_csv('data/test_data.csv')

In [None]:
# Reference answers for the test split: qid1 -> list of every qid2 paired with it.
# NOTE(review): `is_duplicate` is not consulted here, so every paired qid2 counts
# as relevant — confirm the split only contains positive pairs.
query_answer_ref = dict()

test_qid1_grouped = test_df.groupby('qid1')
for qid1 in test_df.qid1.unique():
    group = test_qid1_grouped.get_group(qid1)
    query_answer_ref[qid1] = group['qid2'].values.tolist()

# Same mapping for the validation split.
query_answer_val_ref = dict()
val_qid1_grouped = val_df.groupby('qid1')
for qid1 in val_df.qid1.unique():
    group = val_qid1_grouped.get_group(qid1)
    query_answer_val_ref[qid1] = group['qid2'].values.tolist()

# Part 1

In [165]:
# Pretrained RoBERTa-base tokenizer and encoder used as the Part 1 sentence embedder.
# NOTE: the name `model` is rebound to a SiameseNN in the Part 2 cells.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [166]:
# Embed every candidate document (question2 of the training split) as the mean of
# RoBERTa's last hidden states. Result: doc_repr[qid2] -> 1-D tensor on the GPU.
with torch.no_grad():
    model.eval()
    model.to('cuda')

    doc_repr = dict()
    # Plain column iteration avoids building a Series per row as iterrows() does.
    doc_pairs = zip(train_df['qid2'], train_df['question2'])
    for doc_id, doc in tqdm(doc_pairs, total=len(train_df)):
        # A document id that is already embedded is not encoded again
        # (assumes a given qid2 always carries the same question text).
        if doc_id in doc_repr:
            continue
        # truncation=True caps the input at the tokenizer's maximum length so
        # an unusually long question cannot overflow the position embeddings.
        inputs = tokenizer(doc, return_tensors="pt", truncation=True).to('cuda')
        outputs = model(**inputs)
        # Batch of one: mean over the token axis, then drop the batch axis.
        doc_repr[doc_id] = outputs.last_hidden_state.mean(dim=1).squeeze(0)

100%|██████████| 37250/37250 [06:14<00:00, 99.41it/s] 


In [167]:
# Rank the embedded documents for every distinct test query by cosine similarity
# of mean-pooled RoBERTa vectors. Result: query_answer_bert[qid1] -> top-k qid2 list.
with torch.no_grad():
    model.eval()
    model.to('cuda')

    # Stack the cached document vectors once so each query is scored with one
    # batched cosine similarity instead of one GPU round-trip per document.
    bert_doc_ids = list(doc_repr)
    bert_doc_matrix = (
        torch.stack([doc_repr[doc_id] for doc_id in bert_doc_ids])
        if bert_doc_ids else None
    )
    n_keep = min(config.top_k, len(bert_doc_ids))

    query_answer_bert = dict()
    query_pairs = zip(test_df['qid1'], test_df['question1'])
    for qid, qtext in tqdm(query_pairs, total=len(test_df)):
        # The same query id appears once per candidate pair; rank it only once.
        if qid in query_answer_bert:
            continue
        if bert_doc_matrix is None:
            query_answer_bert[qid] = []
            continue

        inputs = tokenizer(qtext, return_tensors="pt", truncation=True).to('cuda')
        output = model(**inputs)
        query_vec = output.last_hidden_state.mean(dim=1).squeeze(0)

        scores = F.cosine_similarity(
            query_vec.unsqueeze(0).expand_as(bert_doc_matrix), bert_doc_matrix, dim=1)
        # topk returns indices by descending score; the relative order of
        # exactly tied scores is not specified.
        best = torch.topk(scores, k=n_keep).indices.tolist()
        query_answer_bert[qid] = [bert_doc_ids[i] for i in best]

100%|██████████| 980/980 [04:13<00:00,  3.87it/s]


## Evaluation

In [168]:
# Score the RoBERTa ranking against the test references.
# MAP reads the first config.top_k predictions; AP is mean precision@k.
map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer_bert)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer_bert)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer_bert, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer_bert, k=10)
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

map= 0.45695022228266463
mrr= 0.7656936290497952
ap@5= 0.4999999999999992
ap@10= 0.3458904109589043


# Part 2

In [115]:
# Part 2 settings.
DEVICE = 'cuda'  # device passed to train / embedding / retrieval helpers
N_EPOCHS = 10  # NOTE: not referenced below; training cells set their own `n_epochs`
THRESHOLD = 0.5  # similarity above this counts as "duplicate" when computing accuracy

MAX_LENGTH = 64  # every sentence is padded / truncated to this many tokens
HIDDEN_DIM = 128  # LSTM hidden size handed to SiameseNN
EMBEDDING_DIM = 300  # width of one word vector (also used for OOV / padding rows)

N_LOGGING_STEPS = 1000  # NOTE: not referenced anywhere in the visible cells

In [152]:
class SiameseDataset(Dataset):
    """Sentence-pair dataset that turns each question into a fixed-size matrix of word vectors.

    Args:
        embedding: word -> vector lookup supporting ``word in embedding`` and
            ``embedding[word]`` (the notebook passes gensim vectors); each
            vector must have ``EMBEDDING_DIM`` entries.
        dataframe: frame with the columns ``qid1``, ``question1``, ``qid2``,
            ``question2`` and ``is_duplicate``.

    Each item is a dict with the keys ``id1``, ``sentence1``, ``sentence1_len``,
    ``id2``, ``sentence2``, ``sentence2_len`` and ``label``.
    """

    # Splits on runs of non-word characters. Raw string: '\W' is not a valid
    # Python string escape, so the non-raw literal triggers a warning.
    _TOKEN_SPLIT = re.compile(r'\W+')

    def __init__(self, embedding, dataframe):
        self.id1_list = dataframe['qid1'].to_list()
        self.sent1_list = dataframe['question1'].tolist()
        self.id2_list = dataframe['qid2'].tolist()
        self.sent2_list = dataframe['question2'].tolist()
        self.label_list = dataframe['is_duplicate'].tolist()
        self.embedding = embedding

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.sent1_list)

    def _tokenize(self, sentence):
        """Lower-case ``sentence`` and split it into non-empty word tokens."""
        return [token for token in self._TOKEN_SPLIT.split(sentence.lower()) if token]

    @staticmethod
    def _packed_length(n_tokens):
        """Length to report for a sentence of ``n_tokens`` tokens.

        The matrix built by ``prepare_sequence`` only has ``MAX_LENGTH`` rows, so
        the reported length is capped there; it is also at least 1 because
        ``pack_padded_sequence`` rejects zero-length sequences (a token-less
        sentence is then represented by a single all-zero padding row).
        """
        return max(1, min(n_tokens, MAX_LENGTH))

    def prepare_sequence(self, seq):
        """Return a ``(MAX_LENGTH, EMBEDDING_DIM)`` float tensor for the tokens in ``seq``.

        Known words use their embedding, unknown words an all-ones row, and
        positions past the end of ``seq`` stay all-zero. Tokens beyond
        ``MAX_LENGTH`` are dropped.
        """
        matrix = np.zeros((MAX_LENGTH, EMBEDDING_DIM), dtype=np.float32)
        for position, word in enumerate(seq[:MAX_LENGTH]):
            if word in self.embedding:
                matrix[position] = self.embedding[word]
            else:
                matrix[position] = 1.0
        return torch.tensor(matrix, dtype=torch.float)

    def __getitem__(self, index):
        sent1_seq = self._tokenize(self.sent1_list[index])
        sent2_seq = self._tokenize(self.sent2_list[index])
        label_tensor = torch.tensor([self.label_list[index]], dtype=torch.float)

        return {
            'id1': self.id1_list[index],
            'sentence1': self.prepare_sequence(sent1_seq),
            'sentence1_len': self._packed_length(len(sent1_seq)),
            'id2': self.id2_list[index],
            'sentence2': self.prepare_sequence(sent2_seq),
            'sentence2_len': self._packed_length(len(sent2_seq)),
            'label': label_tensor
        }

In [153]:
class SiameseNN(nn.Module):
    """Siamese LSTM encoder with a pluggable pair-scoring head.

    Both sentences are encoded by the same LSTM (final hidden state, with
    dropout). The pair is then scored by ``output_composition``:

    * ``'cosine'``: cosine similarity of the two encodings, clamped to [0, 1].
    * ``'linear'``: sigmoid of one linear layer over the concatenated encodings.
      NOTE: this score is sigmoid(w1·e1 + w2·e2 + b), so for a fixed first
      sentence the ordering of candidate second sentences depends only on
      w2·e2; used for retrieval, every query gets the same document ranking.

    Args:
        embedding_dim: width of the input word vectors.
        hidden_dim: LSTM hidden size (per direction).
        bidirectional: use a bidirectional LSTM; the encoding is then the
            concatenation of both directions' final hidden states.
        initial_embedding: accepted for call compatibility; not used.
        output_composition: ``'linear'`` or ``'cosine'``.

    Raises:
        NotImplementedError: for any other ``output_composition``.
    """

    def __init__(self, embedding_dim, hidden_dim,
                 bidirectional=False, initial_embedding=None, output_composition='linear'):
        super(SiameseNN, self).__init__()
        target_size = 1

        self.bidirectional = bidirectional
        self.lstm_layer = nn.LSTM(embedding_dim, hidden_dim,
                                  bidirectional=bidirectional, batch_first=True)
        self.dropout = nn.Dropout(p=0.2)

        self.output_composition = output_composition
        if self.output_composition == 'cosine':
            self.cls = nn.CosineSimilarity(dim=1)
        elif self.output_composition == 'linear':
            # Two encodings are concatenated; each is 2*hidden_dim wide when bidirectional.
            in_features = 4*hidden_dim if bidirectional else 2*hidden_dim
            self.cls = nn.Linear(in_features, target_size)
        else:
            raise NotImplementedError("Only 'cosine' and 'linear' are implemented for output_composition")

    def get_text_embedding(self, sent):
        """Unsupported: this model has no vocabulary or word-embedding layer.

        The previous body looked up ``self.word2id`` (never defined on this
        class) and called ``_text_embedder`` without its length argument, so it
        could not succeed. It now fails with an explicit message instead.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "SiameseNN consumes pre-computed word vectors; build them with "
            "SiameseDataset.prepare_sequence and call _text_embedder instead")

    def get_text_similarity(self, sent1_embed, sent2_embed):
        """Score encoded pairs; both inputs are ``(batch, features)``, output is ``(batch, 1)`` in [0, 1]."""
        if self.output_composition == 'cosine':
            sim_score = self.cls(sent1_embed, sent2_embed)
            # Negative cosine values are clipped so the score can be used with BCELoss.
            sim_score = torch.clamp(sim_score, min=0.0, max=1.0)
            sim_score = sim_score.unsqueeze(dim=1)

        elif self.output_composition == 'linear':
            cls_out = self.cls(torch.cat((sent1_embed, sent2_embed), dim=1))
            sim_score = torch.sigmoid(cls_out)

        else:
            # Only reachable if the attribute was changed after construction.
            raise NotImplementedError("Only 'cosine' and 'linear' are implemented for output_composition")

        return sim_score

    def _text_embedder(self, sent_embed, sent_len):
        """Encode a padded batch ``(batch, time, embedding_dim)`` into ``(batch, features)``.

        ``sent_len`` holds the unpadded length of each sequence. It is moved to
        the CPU as int64 and clamped to ``[1, time]`` because
        ``pack_padded_sequence`` requires CPU lengths, rejects zero, and cannot
        read past the padded width.
        """
        lengths = torch.as_tensor(sent_len, dtype=torch.int64).cpu()
        lengths = lengths.clamp(min=1, max=sent_embed.size(1))
        pack_pad_embed = pack_padded_sequence(sent_embed, lengths, batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm_layer(pack_pad_embed)
        # hidden is (num_directions, batch, hidden_dim); squeeze only drops the
        # leading axis in the uni-directional case.
        embedding = self.dropout(hidden.squeeze(dim=0))
        if self.bidirectional:
            embedding = torch.cat((embedding[0], embedding[1]), dim=1)
        return embedding

    def forward(self, sent1_embed, sent1_length, sent2_embed, sent2_length):
        """Return the ``(batch, 1)`` similarity score of each sentence pair."""
        embed1 = self._text_embedder(sent1_embed, sent1_length)
        embed2 = self._text_embedder(sent2_embed, sent2_length)

        return self.get_text_similarity(embed1, embed2)

In [154]:
def train_document_embedder(model, train_loader, device):
    """Encode every distinct second-sentence ("document") produced by ``train_loader``.

    Args:
        model: object exposing ``_text_embedder(embeddings, lengths)``; it is put
            in eval mode and moved to ``device``.
        train_loader: iterable of batches with the keys ``id2`` (integer
            tensor), ``sentence2`` and ``sentence2_len``.
        device: device the inputs are moved to.

    Returns:
        ``(doc_ids, doc_embedding)``: ids in first-seen order and a tensor whose
        i-th row is the encoding of ``doc_ids[i]``. Only the first occurrence of
        an id is kept.
    """
    model.eval()
    with torch.no_grad():
        model.to(device)

        doc_ids = list()
        seen_ids = set()  # O(1) duplicate check; `doc_ids` keeps the order
        doc_embedding = list()
        for data in tqdm(train_loader):
            # Ids are only used as Python keys, so they never need to visit the device.
            batch_ids = data['id2'].tolist()
            sent2_embedding = data['sentence2'].to(device)
            sent2_length = data['sentence2_len'].to(torch.int64)
            batch_doc_reprs = model._text_embedder(sent2_embedding, sent2_length)
            for doc_id, doc_embed in zip(batch_ids, batch_doc_reprs):
                if doc_id in seen_ids:
                    continue
                seen_ids.add(doc_id)
                doc_ids.append(doc_id)
                doc_embedding.append(doc_embed.unsqueeze(dim=0))

    doc_embedding = torch.cat(doc_embedding, dim=0)
    return doc_ids, doc_embedding

def get_query_answer(model, test_loader, doc_ids, doc_embedding, device):
    """Rank all documents for every distinct query produced by ``test_loader``.

    Args:
        model: object exposing ``_text_embedder`` and ``get_text_similarity``;
            it is put in eval mode and moved to ``device``.
        test_loader: iterable of batches with the keys ``id1`` (integer
            tensor), ``sentence1`` and ``sentence1_len``.
        doc_ids: document ids, aligned with the rows of ``doc_embedding``.
        doc_embedding: ``(n_docs, features)`` tensor of document encodings.
        device: device the inputs are moved to.

    Returns:
        dict mapping each query id to the full list of ``doc_ids`` sorted by
        decreasing similarity (ties broken by decreasing doc id). Only the
        first occurrence of a query id is ranked.
    """
    model.eval()
    with torch.no_grad():
        model.to(device)
        n_docs = doc_embedding.shape[0]
        query_answer = dict()
        for data in tqdm(test_loader):
            # Ids are only used as dict keys, so they stay on the CPU.
            batch_qids = data['id1'].tolist()
            query_embedding = data['sentence1'].to(device)
            query_length = data['sentence1_len'].to(torch.int64)
            batch_query_reprs = model._text_embedder(query_embedding, query_length)
            for qid, query_vec in zip(batch_qids, batch_query_reprs):
                if qid in query_answer:
                    continue

                # expand() pairs the query with every document without copying it.
                tiled_query = query_vec.unsqueeze(0).expand(n_docs, -1)
                # reshape(-1) keeps a 1-D result even with a single document,
                # where a bare squeeze() would collapse to a scalar.
                similarities = model.get_text_similarity(tiled_query, doc_embedding).reshape(-1).tolist()
                ranked = sorted(zip(similarities, doc_ids), reverse=True)
                query_answer[qid] = [doc_id for _, doc_id in ranked]
    return query_answer

In [157]:
def eval(model, loss_function, train_loader, dev_loader, device, epoch):
    """Print accuracy and loss of ``model`` on the train and validation loaders.

    One line is printed: ``epoch N train-acc … train-loss … val-acc … val-loss …``.
    A prediction is positive when the similarity exceeds ``THRESHOLD``.

    Metrics are weighted by batch size, so a smaller final batch no longer
    counts as much as a full one. This assumes ``loss_function`` returns the
    batch mean, as ``nn.BCELoss()`` does by default. An empty loader reports
    ``nan`` instead of raising ``ZeroDivisionError``.

    The model is evaluated in eval mode and then returned to whichever mode it
    was in on entry.

    Args:
        model: callable as ``model(sentence1, len1, sentence2, len2)``.
        loss_function: callable ``(scores, labels) -> scalar tensor``.
        train_loader, dev_loader: iterables of batches with the keys ``label``,
            ``sentence1``, ``sentence1_len``, ``sentence2``, ``sentence2_len``.
        device: device the tensors are moved to.
        epoch: zero-based epoch index (printed as ``epoch + 1``).
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            print(f'epoch {epoch+1}', end=' ')
            for desc, data_loader in [('train', train_loader), ('val', dev_loader)]:
                n_samples = 0
                n_correct = 0
                loss_sum = 0.0
                for data in data_loader:
                    y = data['label'].to(device)
                    sentence1 = data['sentence1'].to(device)
                    sent1_length = data['sentence1_len'].to(torch.int64)
                    sentence2 = data['sentence2'].to(device)
                    sent2_length = data['sentence2_len'].to(torch.int64)
                    sim_score = model(sentence1, sent1_length, sentence2, sent2_length)
                    y_pred = (sim_score > THRESHOLD).float()

                    batch_size = y.shape[0]
                    n_samples += batch_size
                    n_correct += (y_pred == y).sum().item()
                    loss_sum += loss_function(sim_score, y).item() * batch_size
                acc = n_correct / n_samples if n_samples else float('nan')
                total_loss = loss_sum / n_samples if n_samples else float('nan')
                print(f'{desc}-acc {acc:.3f} {desc}-loss {total_loss:.3f}', end=' ')
            print()
    finally:
        model.train(was_training)

def train(model, n_epochs, train_loader, val_loader, device, lr=1e-4):
    """Train ``model`` with binary cross-entropy and Adam, evaluating after every epoch.

    Args:
        model: module called as ``model(sentence1, len1, sentence2, len2)`` and
            returning scores in [0, 1] with the same shape as the labels.
        n_epochs: number of passes over ``train_loader``.
        train_loader, val_loader: iterables of batches with the keys ``label``,
            ``sentence1``, ``sentence1_len``, ``sentence2``, ``sentence2_len``.
        device: device the model and tensors are moved to.
        lr: Adam learning rate (default keeps the previous fixed value).

    Returns:
        The same ``model`` object, trained in place.
    """
    # Move the model first so the optimizer is created over the parameters as
    # they live on the target device.
    model.to(device)
    model.train()
    loss_function = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        for data in train_loader:
            optimizer.zero_grad()

            y = data['label'].to(device)
            sentence1 = data['sentence1'].to(device)
            # Lengths are only converted to int64, never moved to `device`.
            sent1_length = data['sentence1_len'].to(torch.int64)
            sentence2 = data['sentence2'].to(device)
            sent2_length = data['sentence2_len'].to(torch.int64)

            sim_score = model(sentence1, sent1_length, sentence2, sent2_length)

            loss = loss_function(sim_score, y)
            loss.backward()
            optimizer.step()

        # Prints train/validation accuracy and loss for this epoch.
        eval(model, loss_function, train_loader, val_loader, device, epoch)
    return model

## Word2Vec

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
word2vec = gensim.downloader.load('word2vec-google-news-300')



In [None]:
# Wrap the three splits with Word2Vec lookups.
# NOTE: the FastText section rebinds these same names, so later cells use
# whichever of the two loader cells ran last.
train_set = SiameseDataset(word2vec, train_df)
val_set = SiameseDataset(word2vec, val_df)
test_set = SiameseDataset(word2vec, test_df)

# Only the training loader is shuffled.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False, num_workers=0)
test_loader = DataLoader(test_set, batch_size=16, shuffle=False, num_workers=0)

### Uni-directional

#### Linear

In [162]:
# Uni-directional LSTM with the linear (concatenate -> sigmoid) head.
# NOTE: `model` rebinds the name that held RoBERTa in Part 1.
n_epochs = 10
model = SiameseNN(EMBEDDING_DIM, HIDDEN_DIM, bidirectional=False, output_composition='linear')
model = train(model, n_epochs, train_loader, val_loader, DEVICE)

epoch 1 train-acc 0.802 train-loss 0.432 val-acc 0.899 val-loss 0.340 
epoch 2 train-acc 0.829 train-loss 0.392 val-acc 0.872 val-loss 0.349 
epoch 3 train-acc 0.852 train-loss 0.353 val-acc 0.900 val-loss 0.275 
epoch 4 train-acc 0.866 train-loss 0.333 val-acc 0.900 val-loss 0.292 
epoch 5 train-acc 0.870 train-loss 0.324 val-acc 0.895 val-loss 0.298 
epoch 6 train-acc 0.875 train-loss 0.313 val-acc 0.909 val-loss 0.266 
epoch 7 train-acc 0.883 train-loss 0.300 val-acc 0.910 val-loss 0.253 
epoch 8 train-acc 0.887 train-loss 0.294 val-acc 0.884 val-loss 0.305 
epoch 9 train-acc 0.892 train-loss 0.283 val-acc 0.896 val-loss 0.279 
epoch 10 train-acc 0.895 train-loss 0.280 val-acc 0.899 val-loss 0.293 


In [163]:
# Encode all training documents with the current `model`, rank them for every test
# query, then score the rankings.
# NOTE: the linear head scores a pair as sigmoid(w1·query + w2·doc + b), so for a
# fixed query the document ordering depends only on w2·doc; every query
# receives the same ranking.
train_doc_ids, train_doc_embedding = train_document_embedder(model, train_loader, DEVICE)
query_answer = get_query_answer(model, test_loader, train_doc_ids, train_doc_embedding, DEVICE)

map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=10)

print()
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

100%|██████████| 1165/1165 [00:17<00:00, 68.36it/s]
100%|██████████| 62/62 [00:01<00:00, 36.15it/s]



map= 0.0006030270132897983
mrr= 0.0052720079008686934
ap@5= 0.00410958904109589
ap@10= 0.0027397260273972603


#### Cosine

In [169]:
# Uni-directional LSTM scored by clamped cosine similarity.
n_epochs = 10
model = SiameseNN(EMBEDDING_DIM, HIDDEN_DIM,bidirectional=False, output_composition='cosine')
model = train(model, n_epochs, train_loader, val_loader, DEVICE)

epoch 1 train-acc 0.800 train-loss 0.990 val-acc 0.899 val-loss 0.524 
epoch 2 train-acc 0.802 train-loss 0.967 val-acc 0.899 val-loss 0.492 
epoch 3 train-acc 0.809 train-loss 0.936 val-acc 0.900 val-loss 0.464 
epoch 4 train-acc 0.815 train-loss 0.842 val-acc 0.907 val-loss 0.358 
epoch 5 train-acc 0.826 train-loss 0.810 val-acc 0.910 val-loss 0.335 
epoch 6 train-acc 0.833 train-loss 0.788 val-acc 0.909 val-loss 0.310 
epoch 7 train-acc 0.849 train-loss 0.760 val-acc 0.915 val-loss 0.294 
epoch 8 train-acc 0.846 train-loss 0.784 val-acc 0.923 val-loss 0.283 
epoch 9 train-acc 0.846 train-loss 0.645 val-acc 0.915 val-loss 0.389 
epoch 10 train-acc 0.853 train-loss 0.750 val-acc 0.919 val-loss 0.292 


In [170]:
# Encode all training documents with the current `model`, rank them for every test
# query by cosine similarity, then score the rankings.
train_doc_ids, train_doc_embedding = train_document_embedder(model, train_loader, DEVICE)
query_answer = get_query_answer(model, test_loader, train_doc_ids, train_doc_embedding, DEVICE)

map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=10)

print()
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

100%|██████████| 1165/1165 [00:17<00:00, 66.06it/s]
100%|██████████| 62/62 [00:01<00:00, 33.69it/s]


map= 0.26169488741973956
mrr= 0.5703474654126499
ap@5= 0.29863013698630103
ap@10= 0.21232876712328744





### Bi-directional

#### Linear

In [171]:
# Bi-directional LSTM with the linear (concatenate -> sigmoid) head.
n_epochs = 10
model = SiameseNN(EMBEDDING_DIM, HIDDEN_DIM, bidirectional=True, output_composition='linear')
model = train(model, n_epochs, train_loader, val_loader, DEVICE)

epoch 1 train-acc 0.827 train-loss 0.391 val-acc 0.902 val-loss 0.294 
epoch 2 train-acc 0.851 train-loss 0.353 val-acc 0.916 val-loss 0.272 
epoch 3 train-acc 0.868 train-loss 0.334 val-acc 0.899 val-loss 0.280 
epoch 4 train-acc 0.882 train-loss 0.308 val-acc 0.905 val-loss 0.267 
epoch 5 train-acc 0.887 train-loss 0.298 val-acc 0.895 val-loss 0.295 
epoch 6 train-acc 0.892 train-loss 0.288 val-acc 0.917 val-loss 0.256 
epoch 7 train-acc 0.898 train-loss 0.273 val-acc 0.912 val-loss 0.259 
epoch 8 train-acc 0.901 train-loss 0.268 val-acc 0.907 val-loss 0.268 
epoch 9 train-acc 0.904 train-loss 0.260 val-acc 0.913 val-loss 0.268 
epoch 10 train-acc 0.906 train-loss 0.257 val-acc 0.923 val-loss 0.241 


In [172]:
# Encode all training documents with the current `model`, rank them for every test
# query, then score the rankings.
# NOTE: as with every linear-head model here, the score is sigmoid(w1·query + w2·doc + b),
# so the document ordering does not depend on the query.
train_doc_ids, train_doc_embedding = train_document_embedder(model, train_loader, DEVICE)
query_answer = get_query_answer(model, test_loader, train_doc_ids, train_doc_embedding, DEVICE)

map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=10)

print()
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

100%|██████████| 1165/1165 [00:17<00:00, 65.89it/s]
100%|██████████| 62/62 [00:01<00:00, 34.65it/s]



map= 7.872775940796725e-05
mrr= 0.003760231778857556
ap@5= 0.0013698630136986301
ap@10= 0.0006849315068493151


#### Cosine

In [None]:
# Bi-directional LSTM scored by clamped cosine similarity.
n_epochs = 10
model = SiameseNN(EMBEDDING_DIM, HIDDEN_DIM, bidirectional=True, output_composition='cosine')
model = train(model, n_epochs, train_loader, val_loader, DEVICE)

In [174]:
# Encode all training documents with the current `model`, rank them for every test
# query by cosine similarity, then score the rankings.
train_doc_ids, train_doc_embedding = train_document_embedder(model, train_loader, DEVICE)
query_answer = get_query_answer(model, test_loader, train_doc_ids, train_doc_embedding, DEVICE)

map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=10)

print()
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

100%|██████████| 1165/1165 [00:18<00:00, 62.68it/s]
100%|██████████| 62/62 [00:02<00:00, 30.20it/s]


map= 0.21823231551812347
mrr= 0.5327716366396211
ap@5= 0.2671232876712327
ap@10= 0.18630136986301346





## FastText

In [128]:
# Load the pretrained 'fasttext-wiki-news-subwords-300' vectors through gensim's downloader.
fasttext = gensim.downloader.load('fasttext-wiki-news-subwords-300')



In [130]:
# Rebuild the datasets and loaders with FastText lookups.
# NOTE: this overwrites the Word2Vec datasets/loaders of the same names.
train_set = SiameseDataset(fasttext, train_df)
val_set = SiameseDataset(fasttext, val_df)
test_set = SiameseDataset(fasttext, test_df)

# Only the training loader is shuffled.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False, num_workers=0)
test_loader = DataLoader(test_set, batch_size=16, shuffle=False, num_workers=0)

### Uni-directional

#### Linear

In [179]:
# Uni-directional LSTM with the linear (concatenate -> sigmoid) head, trained on
# whichever train/val loaders are currently defined.
n_epochs = 10
model = SiameseNN(EMBEDDING_DIM, HIDDEN_DIM,bidirectional=False, output_composition='linear')
model = train(model, n_epochs, train_loader, val_loader, DEVICE)

epoch 1 train-acc 0.806 train-loss 0.415 val-acc 0.910 val-loss 0.298 
epoch 2 train-acc 0.842 train-loss 0.370 val-acc 0.894 val-loss 0.314 
epoch 3 train-acc 0.859 train-loss 0.347 val-acc 0.879 val-loss 0.307 
epoch 4 train-acc 0.864 train-loss 0.333 val-acc 0.907 val-loss 0.262 
epoch 5 train-acc 0.867 train-loss 0.334 val-acc 0.873 val-loss 0.333 
epoch 6 train-acc 0.870 train-loss 0.319 val-acc 0.915 val-loss 0.244 
epoch 7 train-acc 0.880 train-loss 0.311 val-acc 0.883 val-loss 0.312 
epoch 8 train-acc 0.883 train-loss 0.296 val-acc 0.901 val-loss 0.264 
epoch 9 train-acc 0.886 train-loss 0.291 val-acc 0.911 val-loss 0.251 
epoch 10 train-acc 0.880 train-loss 0.302 val-acc 0.922 val-loss 0.227 


In [181]:
# Encode all training documents with the current `model`, rank them for every test
# query, then score the rankings.
# NOTE: the linear head scores a pair as sigmoid(w1·query + w2·doc + b), so the
# document ordering does not depend on the query.
train_doc_ids, train_doc_embedding = train_document_embedder(model, train_loader, DEVICE)
query_answer = get_query_answer(model, test_loader, train_doc_ids, train_doc_embedding, DEVICE)

map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=10)

print()
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

100%|██████████| 1165/1165 [00:18<00:00, 63.49it/s]
100%|██████████| 62/62 [00:01<00:00, 34.00it/s]


map= 0.00020924886416955632
mrr= 0.003706966052230346
ap@5= 0.0
ap@10= 0.0020547945205479454





#### Cosine

In [177]:
# Uni-directional LSTM scored by clamped cosine similarity, trained on whichever
# train/val loaders are currently defined.
n_epochs = 10
model = SiameseNN(EMBEDDING_DIM, HIDDEN_DIM,bidirectional=False, output_composition='cosine')
model = train(model, n_epochs, train_loader, val_loader, DEVICE)

epoch 1 train-acc 0.800 train-loss 1.069 val-acc 0.899 val-loss 0.525 
epoch 2 train-acc 0.801 train-loss 0.998 val-acc 0.899 val-loss 0.414 
epoch 3 train-acc 0.804 train-loss 0.900 val-acc 0.898 val-loss 0.386 
epoch 4 train-acc 0.813 train-loss 0.756 val-acc 0.903 val-loss 0.357 
epoch 5 train-acc 0.830 train-loss 0.825 val-acc 0.910 val-loss 0.330 
epoch 6 train-acc 0.835 train-loss 0.688 val-acc 0.913 val-loss 0.294 
epoch 7 train-acc 0.853 train-loss 0.769 val-acc 0.909 val-loss 0.276 
epoch 8 train-acc 0.847 train-loss 0.598 val-acc 0.919 val-loss 0.255 
epoch 9 train-acc 0.854 train-loss 0.657 val-acc 0.920 val-loss 0.250 
epoch 10 train-acc 0.803 train-loss 1.059 val-acc 0.900 val-loss 0.405 


In [178]:
# Encode all training documents with the current `model`, rank them for every test
# query by cosine similarity, then score the rankings.
train_doc_ids, train_doc_embedding = train_document_embedder(model, train_loader, DEVICE)
query_answer = get_query_answer(model, test_loader, train_doc_ids, train_doc_embedding, DEVICE)

map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=10)

print()
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

100%|██████████| 1165/1165 [00:17<00:00, 66.01it/s]
100%|██████████| 62/62 [00:01<00:00, 33.92it/s]


map= 0.18713290257962042
mrr= 0.5076387898892228
ap@5= 0.22328767123287657
ap@10= 0.16506849315068473





### Bi-directional

#### Linear

In [160]:
# Bi-directional LSTM with the linear (concatenate -> sigmoid) head, trained on
# whichever train/val loaders are currently defined.
n_epochs = 10
model = SiameseNN(EMBEDDING_DIM, HIDDEN_DIM,bidirectional=True, output_composition='linear')
model = train(model, n_epochs, train_loader, val_loader, DEVICE)

epoch 1 train-acc 0.818 train-loss 0.407 val-acc 0.906 val-loss 0.334 
epoch 2 train-acc 0.856 train-loss 0.357 val-acc 0.878 val-loss 0.321 
epoch 3 train-acc 0.875 train-loss 0.322 val-acc 0.908 val-loss 0.280 
epoch 4 train-acc 0.878 train-loss 0.313 val-acc 0.934 val-loss 0.243 
epoch 5 train-acc 0.891 train-loss 0.289 val-acc 0.902 val-loss 0.273 
epoch 6 train-acc 0.898 train-loss 0.273 val-acc 0.914 val-loss 0.251 
epoch 7 train-acc 0.905 train-loss 0.259 val-acc 0.912 val-loss 0.263 
epoch 8 train-acc 0.905 train-loss 0.256 val-acc 0.894 val-loss 0.278 
epoch 9 train-acc 0.907 train-loss 0.248 val-acc 0.935 val-loss 0.229 
epoch 10 train-acc 0.917 train-loss 0.229 val-acc 0.922 val-loss 0.259 


In [161]:
# Encode all training documents with the current `model`, rank them for every test
# query, then score the rankings.
# NOTE: the linear head scores a pair as sigmoid(w1·query + w2·doc + b), so the
# document ordering does not depend on the query.
train_doc_ids, train_doc_embedding = train_document_embedder(model, train_loader, DEVICE)
query_answer = get_query_answer(model, test_loader, train_doc_ids, train_doc_embedding, DEVICE)

map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=10)

print()
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

100%|██████████| 1165/1165 [00:17<00:00, 67.66it/s]
100%|██████████| 62/62 [00:01<00:00, 36.10it/s]



map= 0.0
mrr= 0.0021738120170075375
ap@5= 0.0
ap@10= 0.0


#### Cosine

In [158]:
# Bi-directional LSTM scored by clamped cosine similarity, trained on whichever
# train/val loaders are currently defined.
n_epochs = 10
model = SiameseNN(EMBEDDING_DIM, HIDDEN_DIM, bidirectional=True, output_composition='cosine')
model = train(model, n_epochs, train_loader, val_loader, DEVICE)

epoch 1 train-acc 0.802 train-loss 0.996 val-acc 0.898 val-loss 0.392 
epoch 2 train-acc 0.802 train-loss 0.908 val-acc 0.898 val-loss 0.376 
epoch 3 train-acc 0.812 train-loss 0.871 val-acc 0.892 val-loss 0.345 
epoch 4 train-acc 0.825 train-loss 0.821 val-acc 0.901 val-loss 0.323 
epoch 5 train-acc 0.833 train-loss 0.789 val-acc 0.908 val-loss 0.310 
epoch 6 train-acc 0.843 train-loss 0.707 val-acc 0.909 val-loss 0.293 
epoch 7 train-acc 0.855 train-loss 0.710 val-acc 0.914 val-loss 0.469 
epoch 8 train-acc 0.855 train-loss 0.663 val-acc 0.912 val-loss 0.469 
epoch 9 train-acc 0.859 train-loss 0.761 val-acc 0.920 val-loss 0.449 
epoch 10 train-acc 0.856 train-loss 0.630 val-acc 0.917 val-loss 0.254 


In [159]:
# Encode all training documents with the current `model`, rank them for every test
# query by cosine similarity, then score the rankings.
train_doc_ids, train_doc_embedding = train_document_embedder(model, train_loader, DEVICE)
query_answer = get_query_answer(model, test_loader, train_doc_ids, train_doc_embedding, DEVICE)

map_ = MAP(ref_set=query_answer_ref, pred_set=query_answer)
mrr = MRR(ref_set=query_answer_ref, pred_set=query_answer)
avg_p_at_5 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=5)
avg_p_at_10 = AP(ref_set=query_answer_ref, pred_set=query_answer, k=10)

print()
print('map=', map_)
print('mrr=', mrr)
print('ap@5=', avg_p_at_5)
print('ap@10=', avg_p_at_10)

100%|██████████| 1165/1165 [00:17<00:00, 66.36it/s]
100%|██████████| 62/62 [00:01<00:00, 33.37it/s]


map= 0.22808093448946196
mrr= 0.5569746806544218
ap@5= 0.27123287671232854
ap@10= 0.19726027397260248



