In [1]:
import pandas as pd
import json
import random
import torch
import sklearn
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
from torch.amp import autocast
import nltk
from nltk.corpus import stopwords
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.nn import TripletMarginWithDistanceLoss


In [2]:
# Select the compute device: the first CUDA GPU when available, otherwise CPU.
is_cuda = torch.cuda.is_available()
print("Cuda is available: ", is_cuda)
# Only query the GPU name when CUDA is present; torch.cuda.get_device_name
# raises on CPU-only machines, which would abort the cell before `device`
# is ever assigned.
if is_cuda:
    print("GPU: ", torch.cuda.get_device_name(0))
device = torch.device("cuda" if is_cuda else "cpu")

Cuda is available:  True
GPU:  Quadro RTX 4000


In [4]:
# Load the CSV splits and the sample submission from ./Data.
dev = pd.read_csv('./Data/dev.csv')
test = pd.read_csv('./Data/test.csv')
train = pd.read_csv('./Data/train.csv')
sample_submission = pd.read_csv('./Data/sample_submission.csv')

# Load the document corpus. The encoding is pinned to UTF-8 so decoding does
# not depend on the platform's default locale encoding.
with open("./Data/corpus.json", "r", encoding="utf-8") as f:
    documents = json.load(f)

# TEST BERT

In [53]:
# Instantiate the tokenizer and encoder from the same pretrained checkpoint,
# place the encoder on the selected device, and create its optimizer.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME)
model = model.to(device)
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [88]:
def cosine_sim(a, b, eps=1e-8):
    """Cosine similarity between `a` and `b`, computed over the last dimension.

    Args:
        a: Floating-point tensor; a 1-D vector or a batch of vectors (..., dim).
        b: Tensor whose shape matches (or broadcasts against) `a`.
        eps: Lower bound applied to the product of the norms so a zero-norm
            input yields 0 instead of NaN.

    Returns:
        A 0-dim tensor for 1-D inputs, otherwise a tensor with the last
        dimension reduced away (one similarity per vector pair).
    """
    dot = (a * b).sum(dim=-1)
    norms = torch.norm(a, dim=-1) * torch.norm(b, dim=-1)
    # Clamp rather than add eps so well-conditioned inputs are unaffected.
    return dot / norms.clamp_min(eps)

In [90]:
# Fine-tune the encoder with a triplet objective: each query should end up
# closer (in cosine distance) to its positive document than to a negative one.
# NOTE(review): the `TypeError: 'module' object is not callable` recorded for
# this cell is not explained by the imports shown; presumably a stale kernel
# binding (e.g. a bare `import tqdm`) — restart the kernel and run all cells.
num_epochs = 10
MAX_LENGTH = 512  # truncation limit passed to the tokenizer


def cosine_distance(a, b):
    """Convert cosine similarity into a distance (smaller means more similar)."""
    return 1.0 - cosine_sim(a, b)


def embed_cls(text):
    """Encode one text and return its 1-D [CLS] embedding, kept on `device`."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH).to(device)
    # First sequence of the batch, first token ([CLS]) -> vector of hidden size.
    return model(**inputs).last_hidden_state[0, 0, :]


# Build the (query, positive text, negative text) triplets once: they do not
# change between epochs, so the corpus is not rescanned on every step.
text_by_docid = {}
for doc in documents:
    # Keep the first text seen for a docid, matching corpus order.
    text_by_docid.setdefault(doc["docid"], doc["text"])

triplets = []
# NOTE(review): this trains on `dev`, not `train` — confirm this is intended.
for query, pos_id, neg_ids in zip(dev['query'], dev['positive_docs'], dev['negative_docs']):
    pos_text = text_by_docid.get(pos_id)
    # NOTE(review): `negative_docs` is read from CSV, so it is presumably a
    # string and `in` is then a substring test — confirm that no docid is a
    # substring of another, or parse the column into a list first.
    neg_text = next((doc["text"] for doc in documents if doc["docid"] in neg_ids), None)
    if pos_text is None or neg_text is None:
        # No usable triplet for this row; an empty document list would
        # otherwise crash the tokenizer.
        continue
    triplets.append((query, pos_text, neg_text))

# The loss minimises d(anchor, positive) - d(anchor, negative) + margin, so it
# must be given a distance; passing the raw similarity inverts the objective.
triplet_loss = TripletMarginWithDistanceLoss(distance_function=cosine_distance)

model.train()
for epoch in tqdm(range(num_epochs)):  # one pass over all triplets per epoch
    for query, pos_text, neg_text in triplets:
        optimizer.zero_grad()  # reset gradients accumulated by the previous step

        # Embeddings stay on `device`; copying them to the CPU on every step
        # only adds transfer overhead.
        pos_doc_embedding = embed_cls(pos_text)
        neg_doc_embedding = embed_cls(neg_text)
        query_embedding = embed_cls(query)

        loss = triplet_loss(query_embedding, pos_doc_embedding, neg_doc_embedding)
        loss.backward()
        optimizer.step()


TypeError: 'module' object is not callable