# TAR Project 2024 - Team Mojave

# Google colab prerequisites

In [None]:
## Uncomment these lines if you are running the notebook from Google Colab
#
# !git clone -b feature/sbert https://github.com/opacicmarko/duplicate-question-identification.git
# %cd duplicate-question-identification/src/data/
# !wget https://sbert.net/datasets/quora-IR-dataset.zip
# !unzip quora-IR-dataset.zip
# !mv classification/* sbert/
# %cd ..
# !pip install sentence-transformers

# Imports

In [None]:
import logging
import os
import time

import numpy as np
import pandas as pd
import torch

from utils import log

# Config

In [None]:
MODEL_SAVE_DIR = './saved_models/'
LOG_DIR = './logs/'

# Create the output directories up front: the training cells open log files in
# LOG_DIR and save checkpoints under MODEL_SAVE_DIR, and both operations fail
# when the target directory does not exist yet (e.g. on a fresh checkout).
for _output_dir in (MODEL_SAVE_DIR, LOG_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Configure the root logger at DEBUG level; the training cells later replace
# its handlers with a per-run file handler.
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


# Baseline models

## Loading the questions and embeddings

In [None]:
from preprocessing import load_glove_embeddings, load_and_embed_questions, get_max_len

# Load the GloVe vectors from disk. With calculate_average=True the loader also
# returns an average embedding, which is handed to load_and_embed_questions in
# the next cell.
# NOTE(review): the file name suggests 300-dimensional vectors; the models below
# hard-code 300 as the embedding size — keep the two in sync.
glove_embeddings, glove_avg_embedding = load_glove_embeddings('glove-wikipedia/glove.6B.300d.txt', calculate_average=True)

In [None]:
# Process the full question file with the GloVe vectors. The returned object
# exposes `.vocab` and `.embedding`, which the rest of the notebook relies on.
# NOTE(review): the second positional argument is passed as None; its meaning is
# defined in preprocessing.load_and_embed_questions — confirm.
result = load_and_embed_questions('data/train.csv', None, glove_embeddings, glove_avg_embedding)
# Number of entries in the vocabulary's index -> word table.
print(len(result.vocab.idx_words))


In [None]:
# Vocabulary size again (same value the previous cell prints).
print(len(result.vocab.idx_words))

# Sanity check: look up the row index of 'year' and print its embedding row
# (should look like [-0.36886 0.16665 0.053452 ... -0.030849 -0.031811])
print(result.embedding[result.vocab.words_idx['year']])

In [None]:
# Compute max_len from the full question file. The baseline datasets defined
# further down receive it and pad every index sequence up to this length.
print('Calculating maximum question length...')
max_len = get_max_len(pd.read_csv('data/train.csv'))
print('Maximum length:', max_len)

## Dataset split generation

In [None]:
from datasplit import make_dataset_split

# Seed forwarded to make_dataset_split as `random_state`.
RANDOM_STATE = 73

# Source file and the three split files. The baseline dataset cells read
# TRAIN_PATH / VALIDATION_PATH / TEST_PATH back from disk.
ORIGINAL_DATA_PATH = 'data/train.csv'
TRAIN_PATH = 'data/mojave/mojave_train.csv'
VALIDATION_PATH = 'data/mojave/mojave_validation.csv'
TEST_PATH = 'data/mojave/mojave_test.csv'

# Returns the three splits as objects bound to train_df / valid_df / test_df.
# NOTE(review): presumably this also writes the splits to the given paths —
# confirm in datasplit.make_dataset_split.
train_df, valid_df, test_df = make_dataset_split(
    data_path=ORIGINAL_DATA_PATH,
    train_path=TRAIN_PATH,
    validation_path=VALIDATION_PATH,
    test_path=TEST_PATH,
    random_state=RANDOM_STATE
)

In [None]:
# Histogram of the training split grouped by the `is_duplicate` label; the
# trailing semicolon suppresses the textual repr of the plot object.
train_df.plot.hist(by='is_duplicate');

## Models

In [None]:
from preprocessing import ProcessedResult, preprocess_text

def embedding_for_word(word: str, result: 'ProcessedResult') -> np.ndarray:
    """Return the embedding vector for ``word``.

    Args:
        word: Token to look up.
        result: Processed data exposing ``vocab.words_idx`` (word -> row index),
            ``embedding`` (indexable by row index) and ``avg_embedding``.

    Returns:
        ``result.embedding[idx]`` when the word is in the vocabulary, otherwise
        ``result.avg_embedding`` as the out-of-vocabulary fallback.
    """
    idx = result.vocab.words_idx.get(word)
    # Compare against None explicitly: index 0 is a valid row and must not
    # trigger the fallback.
    if idx is None:
        return result.avg_embedding
    return result.embedding[idx]

def question_embedding_tensor(text: str, result: ProcessedResult, device: torch.device) -> torch.Tensor:
    """Embed a question as the element-wise sum of its token embeddings.

    The text is tokenised with ``preprocess_text``; each token is mapped through
    ``embedding_for_word``. A question that yields no tokens is represented by
    ``result.avg_embedding`` alone. The summed vector is returned as a tensor on
    ``device``.
    """
    vectors = [embedding_for_word(tok, result) for tok in preprocess_text(text)]
    if not vectors:
        # No tokens survived preprocessing: fall back to the average embedding.
        vectors = [np.array(result.avg_embedding)]
    summed = np.vstack(vectors).sum(axis=0)
    return torch.from_numpy(summed).to(device)

## Dataset splits loading

In [None]:
from torch import nn
from torch.utils.data import Dataset
from preprocessing import MojaveVocab
from typing import Any

# Column -> dtype mapping handed to pd.read_csv when QuestionPairDataset loads a
# split file, so ids/labels are parsed as ints and question text as strings.
questions_dtype = {
    'id': int,
    'qid1': int,
    'qid2': int,
    'question1': str,
    'question2': str,
    'is_duplicate': int
}

class QuestionPairDataset(Dataset):
    """Map-style dataset yielding fixed-length index tensors for question pairs.

    Each item is a ``(question1_idxs, question2_idxs, label)`` triple of
    ``torch.long`` tensors created on ``device``. Both index sequences have
    exactly ``max_len`` entries: shorter questions are right-padded with
    ``vocab.pad_idx`` and longer ones are truncated, so batches can always be
    stacked by the default collate function.
    """

    def __init__(self, questions_path: str, vocab: 'MojaveVocab', max_len: int, device: torch.device):
        """Read ``questions_path`` as CSV (using ``questions_dtype``) and store the settings.

        Args:
            questions_path: Path of the split CSV file.
            vocab: Vocabulary exposing ``words_idx``, ``unk_idx`` and ``pad_idx``.
            max_len: Length of every returned index sequence.
            device: Device on which the item tensors are created.
        """
        self.path = questions_path
        self.vocab = vocab
        self.max_len = max_len
        self.device = device
        self.questions = pd.read_csv(questions_path, dtype=questions_dtype)

    def __len__(self):
        """Number of question pairs in the split."""
        return len(self.questions)

    def _encode(self, text) -> list:
        """Tokenise ``text`` and return exactly ``max_len`` vocabulary indices."""
        idxs = [self.vocab.words_idx.get(word, self.vocab.unk_idx) for word in preprocess_text(text)]
        # Truncate first: without this, a question longer than max_len would be
        # returned unpadded and over-long, producing ragged batches.
        idxs = idxs[:self.max_len]
        return idxs + [self.vocab.pad_idx] * (self.max_len - len(idxs))

    def __getitem__(self, index) -> Any:
        """Return ``(question1_idxs, question2_idxs, label)`` for row ``index``."""
        row = self.questions.iloc[index]
        return (
            torch.tensor(self._encode(row['question1']), dtype=torch.long, device=self.device),
            torch.tensor(self._encode(row['question2']), dtype=torch.long, device=self.device),
            torch.tensor(int(row['is_duplicate']), dtype=torch.long, device=self.device)
        )

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 16

# Pick the compute device: Apple MPS first, then CUDA, then CPU.
# CUDA availability is queried through torch.cuda.is_available();
# torch.backends.cuda has no is_available() and would raise AttributeError on
# any machine where the MPS check is False.
if torch.backends.mps.is_available():
    DEVICE = 'mps'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
device = torch.device(DEVICE)

# One dataset per split, all sharing the vocabulary and padded length computed
# earlier in the notebook.
train_data = QuestionPairDataset(questions_path=TRAIN_PATH, vocab=result.vocab, max_len=max_len, device=device)
validation_data = QuestionPairDataset(questions_path=VALIDATION_PATH, vocab=result.vocab, max_len=max_len, device=device)
test_data = QuestionPairDataset(questions_path=TEST_PATH, vocab=result.vocab, max_len=max_len, device=device)

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(validation_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

## Model training

In [None]:
from models import QuestionPairMLP

def train_one_epoch(model: nn.Module, criterion: nn.CrossEntropyLoss, optimizer: torch.optim.Optimizer, training_loader: DataLoader, device: torch.device, validation_loader=None, log_interval: int = 100, validation_interval: int = 1000):
    """Train ``model`` for one pass over ``training_loader``.

    Every ``log_interval`` batches the mean training loss of that window is
    logged; every ``validation_interval`` batches the mean validation loss is
    computed and logged, after which the model is put back in training mode.

    Args:
        model: Module called as ``model(question1, question2)`` returning logits.
        criterion: Loss applied to ``(logits, labels)``.
        optimizer: Optimizer stepping the model parameters.
        training_loader: Iterable of ``(question1, question2, labels)`` batches.
        device: Device the batches are moved to before the forward pass.
        validation_loader: Iterable of validation batches. When ``None`` the
            notebook-level ``validation_dataloader`` is used (looked up only
            when a validation pass actually runs).
        log_interval: Number of batches between training-loss log lines.
        validation_interval: Number of batches between validation passes.

    Returns:
        The mean training loss of the last completed logging window, or ``0.``
        if fewer than ``log_interval`` batches were processed.
    """
    running_loss = 0.
    last_loss = 0.
    model.train(True)
    for batch_idx, (question1, question2, labels) in enumerate(training_loader):
        question1, question2, labels = question1.to(device), question2.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(question1, question2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if batch_idx % log_interval == log_interval - 1:
            last_loss = running_loss / log_interval  # loss per batch
            log(logger, '  batch {} loss: {}'.format(batch_idx + 1, last_loss))
            running_loss = 0.

        if batch_idx % validation_interval == validation_interval - 1:
            if validation_loader is None:
                # Backward-compatible fallback to the notebook-level loader.
                validation_loader = validation_dataloader

            running_vloss = 0.
            validation_batches = 0
            # Evaluation mode disables dropout and uses population statistics
            # for batch normalization.
            model.eval()
            # No gradients are needed for validation.
            with torch.no_grad():
                for vq1, vq2, vlabels in validation_loader:
                    vq1, vq2, vlabels = vq1.to(device), vq2.to(device), vlabels.to(device)
                    # .item() keeps the accumulator a plain float instead of a
                    # device tensor.
                    running_vloss += criterion(model(vq1, vq2), vlabels).item()
                    validation_batches += 1

            avg_vloss = running_vloss / max(validation_batches, 1)
            log(logger, 'LOSS valid {}'.format(avg_vloss))
            # Only a validation pass leaves training mode, so restore it here.
            model.train(True)

    return last_loss


In [None]:
# Hyperparameters of the MLP baseline run.
HIDDEN_LAYER_SIZE_1 = 128
HIDDEN_LAYER_SIZE_2 = 64
EPOCHS = 1
LR = 1e-4
WD = 0.1

model = QuestionPairMLP(
    len(result.vocab),
    result.embedding,
    300,
    HIDDEN_LAYER_SIZE_1,
    HIDDEN_LAYER_SIZE_2,
    device,
)
model.to(device)

# Send this run's log records to a fresh file named after the start time,
# replacing whatever handlers the root logger had before.
timestamp = str(int(time.time()))
fh = logging.FileHandler(LOG_DIR + timestamp + '_mlp.log')
fh.setLevel(logging.DEBUG)
logger.handlers.clear()
logger.addHandler(fh)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR, weight_decay=WD)

# Record the run configuration; only parameters that require gradients are
# counted.
_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
for _line in (
    'Run at timestamp: ' + timestamp,
    f'Total params: {_trainable_params}',
    f'HIDDEN_LAYER_SIZE_1: {HIDDEN_LAYER_SIZE_1}',
    f'HIDDEN_LAYER_SIZE_2: {HIDDEN_LAYER_SIZE_2}',
    f'LR = {LR}',
    f'WD = {WD}',
):
    log(logger, _line)

for epoch in range(EPOCHS):
    log(logger, f'EPOCH {epoch}')

    model.train(True)
    avg_loss = train_one_epoch(model, criterion, optimizer, train_dataloader, device)

# Persist the trained weights next to the log, keyed by the same timestamp.
torch.save(model.state_dict(), MODEL_SAVE_DIR + timestamp + '.model')

print('DONE with training')

In [None]:
# Evaluate the MLP on the test split: mean batch loss and accuracy.
running_tloss = 0.

correct_pred = 0
# Start at zero; the per-batch increments below add up to the dataset size.
total_pred = 0

model.eval()
with torch.no_grad():
    for i, tdata in enumerate(test_dataloader):
        tq1, tq2, tlabels = tdata
        toutputs = model(tq1, tq2)
        # The predicted class is the index of the largest logit. Softmax is
        # monotonic, so taking argmax on the logits gives the same class.
        prediction = toutputs.argmax(dim=1)
        # Count every match, for both the duplicate and non-duplicate class.
        correct_pred += (prediction == tlabels).sum().item()
        total_pred += tlabels.size(0)
        # .item() keeps the accumulator a plain float.
        running_tloss += criterion(toutputs, tlabels).item()
total = len(test_dataloader)
avg_tloss = running_tloss / total
accuracy = correct_pred / total_pred


In [None]:
# Display the test accuracy computed by the previous cell as the cell output.
# avg_tloss
accuracy

In [None]:
from models import QuestionPairCosineSimilarity

EPOCHS = 1
LR = 1e-5

cos_model = QuestionPairCosineSimilarity(len(result.vocab), result.embedding, 300, device)
cos_model.to(device)

# Logging setup: route this run's records to a fresh timestamped file.
timestamp = str(int(time.time()))
fh = logging.FileHandler(LOG_DIR + timestamp + '_cos.log')
fh.setLevel(logging.DEBUG)
logger.handlers.clear()
logger.addHandler(fh)

log(logger, 'Run timestamp: ' + timestamp)
log(logger, f'EPOCHS: {EPOCHS}')
log(logger, f'LR = {LR}')

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cos_model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    cos_model.train()

    # running_loss: sum of batch-mean losses since the last progress line
    # (reset every 100 batches).
    # epoch_loss_sum: sample-weighted loss over the whole epoch (never reset),
    # so the epoch average is not computed from a partially reset accumulator.
    running_loss = 0.
    epoch_loss_sum = 0.
    last_loss = 0.

    for i, data in enumerate(train_dataloader):
        # Get inputs and labels
        q1, q2, labels = data

        optimizer.zero_grad()
        outputs = cos_model(q1, q2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss_sum += batch_loss * q1.size(0)

        if i % 100 == 99:
            last_loss = running_loss / 100 # loss per batch
            log(logger, '  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.

    epoch_loss = epoch_loss_sum / len(train_dataloader.dataset)

    running_vloss = 0.

    cos_model.eval()

    with torch.no_grad():
        for vdata in validation_dataloader:
            vq1, vq2, vlabels = vdata
            voutputs = cos_model(vq1, vq2)
            # Sample-weighted, as a plain float rather than a device tensor.
            running_vloss += criterion(voutputs, vlabels).item() * vq1.size(0)

    avg_vloss = running_vloss / len(validation_dataloader.dataset)
    # Report this model's own epoch loss, not a value left over from another
    # cell.
    log(logger, 'LOSS train {} valid {}'.format(epoch_loss, avg_vloss))
    log(logger, f"Epoch [{epoch+1}/{EPOCHS}], Loss: {epoch_loss:.4f}")


In [None]:
# Evaluate the cosine-similarity model on the test split.
running_tloss = 0.

correct_preds = 0
total_preds = 0

cos_model.eval()
with torch.no_grad():
    for i, tdata in enumerate(test_dataloader):
        tq1, tq2, tlabels = tdata
        toutputs = cos_model(tq1, tq2)
        # The predicted class is the index of the largest logit. Reading column
        # 0 of a thresholded probability map would instead flag "class 0 is
        # likely" as a positive prediction.
        prediction = toutputs.argmax(dim=1)
        # Accuracy counts every correct prediction, for both classes.
        correct_preds += (prediction == tlabels).sum().item()
        total_preds += tlabels.size(0)
        running_tloss += criterion(toutputs, tlabels).item()
total = len(test_dataloader)
avg_tloss = running_tloss / total
accuracy = correct_preds / total_preds
print(accuracy)

In [None]:
# Assumes result, LOG_DIR, train_dataloader, validation_dataloader and
# test_dataloader were defined by earlier cells.

from models import QuestionPairLSTM

EPOCHS = 1
LR = 1e-5
# NOTE(review): this rebinds the notebook-wide `device` and only considers
# CUDA/CPU; batches are moved to it explicitly in the loops below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lstm_model = QuestionPairLSTM(len(result.vocab), result.embedding, embedding_size=300, hidden_size=128, num_layers=2, device=device)
lstm_model.to(device)

# Logging setup: route this run's records to a fresh timestamped file.
timestamp = str(int(time.time()))
fh = logging.FileHandler(LOG_DIR + timestamp + '_lstm.log')
fh.setLevel(logging.DEBUG)
logger = logging.getLogger()
logger.handlers.clear()
logger.addHandler(fh)

# NOTE(review): this shadows the `log` imported from utils for every cell that
# runs after this one.
def log(logger, message):
    """Write ``message`` to ``logger`` at DEBUG level and echo it to stdout."""
    logger.debug(message)
    print(message)

log(logger, 'Run timestamp: ' + timestamp)
log(logger, f'EPOCHS: {EPOCHS}')
log(logger, f'LR = {LR}')

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    lstm_model.train()

    # running_loss: sum of batch-mean losses since the last progress line
    # (reset every 100 batches).
    # epoch_loss_sum: sample-weighted loss over the whole epoch (never reset),
    # so the epoch average is not computed from a partially reset accumulator.
    running_loss = 0.0
    epoch_loss_sum = 0.0

    for i, data in enumerate(train_dataloader):
        q1, q2, labels = data
        q1, q2, labels = q1.to(device), q2.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = lstm_model(q1, q2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss_sum += batch_loss * q1.size(0)

        if i % 100 == 99:
            last_loss = running_loss / 100  # mean loss per batch in this window
            log(logger, f'  batch {i + 1} loss: {last_loss}')
            running_loss = 0.0

    running_vloss = 0.0

    lstm_model.eval()
    with torch.no_grad():
        for i, vdata in enumerate(validation_dataloader):
            vq1, vq2, vlabels = vdata
            vq1, vq2, vlabels = vq1.to(device), vq2.to(device), vlabels.to(device)
            voutputs = lstm_model(vq1, vq2)
            vloss = criterion(voutputs, vlabels)
            running_vloss += vloss.item() * vq1.size(0)

    avg_vloss = running_vloss / len(validation_dataloader.dataset)
    log(logger, f'LOSS train {epoch_loss_sum / len(train_dataloader.dataset)} valid {avg_vloss}')


In [None]:
# Evaluate the LSTM on the test split: sample-weighted mean loss and accuracy.
correct_preds = 0
total_preds = 0
running_tloss = 0.0

lstm_model.eval()
with torch.no_grad():
    for i, tdata in enumerate(test_dataloader):
        tq1, tq2, tlabels = (tensor.to(device) for tensor in tdata)
        toutputs = lstm_model(tq1, tq2)
        prob_toutputs = nn.functional.softmax(toutputs, dim=1)
        # Index of the most probable class for every pair in the batch.
        predicted = prob_toutputs.max(dim=1).indices
        correct_preds += int((predicted == tlabels).sum())
        total_preds += tlabels.size(0)
        # Weight the batch-mean loss by the batch size so that the final
        # division by the dataset size gives a per-sample average.
        tloss = criterion(toutputs, tlabels)
        running_tloss += tloss.item() * tq1.size(0)

avg_tloss = running_tloss / len(test_dataloader.dataset)
accuracy = correct_preds / total_preds
log(logger, f'Test Loss: {avg_tloss}, Test Accuracy: {accuracy}')
print(f'Final Test Accuracy: {accuracy}')

# Sentence-BERT

## Dataset splits

In [None]:
from torch.utils.data import Dataset
from sentence_transformers.readers import InputExample

# Column -> dtype mapping used when reading the tab-separated pair files.
sbert_questions_dtype = {
    'qid1': int,
    'qid2': int,
    'question1': str,
    'question2': str,
    'is_duplicate': int
}

class SBERTQuestionPairDataset(Dataset):
    """Question-pair dataset that serves ``InputExample`` objects.

    The file at ``questions_path`` is read once, as tab-separated text with
    quote handling disabled (``quoting=3``), and kept in ``self.questions``.
    """

    def __init__(self, questions_path: str):
        self.path = questions_path
        self.questions = pd.read_csv(
            questions_path,
            sep='\t',
            dtype=sbert_questions_dtype,
            quoting=3,
        )

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index) -> InputExample:
        """Build the ``InputExample`` (two question texts plus int label) for row ``index``."""
        record = self.questions.iloc[index]
        pair = [str(record[column]) for column in ('question1', 'question2')]
        return InputExample(texts=pair, label=int(record['is_duplicate']))

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 16

# One dataset per tab-separated pair file.
train_dataset = SBERTQuestionPairDataset('data/sbert/train_pairs.tsv')
dev_dataset = SBERTQuestionPairDataset('data/sbert/dev_pairs.tsv')
test_dataset = SBERTQuestionPairDataset('data/sbert/test_pairs.tsv')

# NOTE(review): train_dataloader and test_dataloader rebind the names used by
# the baseline cells above, so those cells cannot be re-run after this one
# without re-creating their loaders.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE)

## Distilled RoBERTa Base

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
import math

NUM_EPOCHS = 4
WARMUP_RATIO = 0.1

# NOTE(review): a fraction of 0.0 presumably lifts the MPS allocator's memory
# cap — confirm against the torch.mps documentation.
if torch.backends.mps.is_available():
    torch.mps.set_per_process_memory_fraction(0.0)

LEARNING_RATES = [3e-5, 4e-5]

# Neither the dev evaluator nor the warm-up length depends on the learning
# rate, so build them once instead of on every loop iteration.
evaluator = CEBinaryClassificationEvaluator.from_input_examples(list(dev_dataset), name="QuoraQuestionPairs-dev")
warmup_steps = math.ceil(len(train_dataloader) * NUM_EPOCHS * WARMUP_RATIO)

# Fine-tune one fresh cross-encoder per learning rate. Each run gets its own
# output directory; after the loop MODEL_SAVE_PATH refers to the last run only.
for lr in LEARNING_RATES:
    log(logger, "Warmup-steps: {}".format(warmup_steps))
    log(logger, "Learning rate: {}".format(lr))

    model = CrossEncoder("distilroberta-base", num_labels=1)

    timestamp = str(int(time.time()))

    MODEL_SAVE_PATH = MODEL_SAVE_DIR + timestamp + '_' + '{:.0E}'.format(lr) + '/'

    # Train the model
    model.fit(
        train_dataloader=train_dataloader,
        evaluator=evaluator,
        epochs=NUM_EPOCHS,
        optimizer_params={'lr': lr},
        evaluation_steps=5000,
        warmup_steps=warmup_steps,
        output_path=MODEL_SAVE_PATH,
    )

In [None]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

# Reload the model from the output directory of the last training run (the one
# for the final entry of LEARNING_RATES).
model = CrossEncoder(MODEL_SAVE_PATH)

# Score the reloaded model on the test pairs.
# NOTE(review): training used CEBinaryClassificationEvaluator on the dev set,
# while this reports a correlation metric — confirm that is the intended test
# metric.
evaluator = CECorrelationEvaluator.from_input_examples(list(test_dataset), name="QuoraQuestionPairs-test")
evaluator(model)

## RoBERTa Base

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 16

# Same files and settings as the dataset/loader cell in the "Dataset splits"
# section; re-created here so the RoBERTa section starts from fresh objects.
train_dataset = SBERTQuestionPairDataset('data/sbert/train_pairs.tsv')
dev_dataset = SBERTQuestionPairDataset('data/sbert/dev_pairs.tsv')
test_dataset = SBERTQuestionPairDataset('data/sbert/test_pairs.tsv')

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE)

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
import math

NUM_EPOCHS = 4
WARMUP_RATIO = 0.1

# NOTE(review): a fraction of 0.0 presumably lifts the MPS allocator's memory
# cap — confirm against the torch.mps documentation.
if torch.backends.mps.is_available():
    torch.mps.set_per_process_memory_fraction(0.0)

LEARNING_RATES = [3e-5, 4e-5]

# Neither the dev evaluator nor the warm-up length depends on the learning
# rate, so build them once instead of on every loop iteration.
evaluator = CEBinaryClassificationEvaluator.from_input_examples(list(dev_dataset), name="QuoraQuestionPairs-dev")
warmup_steps = math.ceil(len(train_dataloader) * NUM_EPOCHS * WARMUP_RATIO)

# Fine-tune one fresh cross-encoder per learning rate. Each run gets its own
# output directory; after the loop MODEL_SAVE_PATH refers to the last run only.
for lr in LEARNING_RATES:
    log(logger, "Warmup-steps: {}".format(warmup_steps))
    log(logger, "Learning rate: {}".format(lr))

    model = CrossEncoder("roberta-base", num_labels=1)

    timestamp = str(int(time.time()))

    MODEL_SAVE_PATH = MODEL_SAVE_DIR + timestamp + '_' + '{:.0E}'.format(lr) + '/'

    # Train the model
    model.fit(
        train_dataloader=train_dataloader,
        evaluator=evaluator,
        epochs=NUM_EPOCHS,
        optimizer_params={'lr': lr},
        evaluation_steps=5000,
        warmup_steps=warmup_steps,
        output_path=MODEL_SAVE_PATH,
    )

In [None]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

# Reload the model from the output directory of the last training run (the one
# for the final entry of LEARNING_RATES).
model = CrossEncoder(MODEL_SAVE_PATH)

# Score the reloaded model on the test pairs.
# NOTE(review): training used CEBinaryClassificationEvaluator on the dev set,
# while this reports a correlation metric — confirm that is the intended test
# metric.
evaluator = CECorrelationEvaluator.from_input_examples(list(test_dataset), name="QuoraQuestionPairs-test")
evaluator(model)