In [9]:
import pandas as pd
import json
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
# Read dev.csv from the data folder and preview its first five rows.
dev = pd.read_csv(filepath_or_buffer='./Data/dev.csv')
dev.head(n=5)

In [None]:
# Read test.csv from the data folder and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='./Data/test.csv')
test.head(n=5)

In [None]:
# Read train.csv from the data folder and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='./Data/train.csv')
train.head(n=5)

In [None]:
# Read sample_submission.csv from the data folder and preview its first five rows.
sample_submission = pd.read_csv(filepath_or_buffer='./Data/sample_submission.csv')
sample_submission.head(n=5)

In [6]:
# Load the document corpus from JSON.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which can fail on (or silently mis-decode)
# non-ASCII text. JSON files are expected to be UTF-8.
with open("./Data/corpus.json/corpus.json", "r", encoding="utf-8") as f:
    documents = json.load(f)

In [None]:
# Load the pretrained BERT tokenizer and encoder used to embed text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Pick the compute device once and place the model on it. Any code that
# tokenizes inputs for this model must move them to the same device, so
# `device` has to exist before the embedding/retrieval cells run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [10]:
def embed_documents(docs, batch_size=32):
    """Embed documents with BERT and return their first-token vectors by docid.

    Parameters
    ----------
    docs : sequence of dict
        Each item must provide a "text" entry (the string to embed) and a
        "docid" entry (used as the key in the returned mapping).
    batch_size : int, default 32
        Number of documents tokenized and passed through the model per
        forward pass. Must be at least 1.

    Returns
    -------
    dict
        Maps each docid to a 1-D CPU tensor taken from position 0 of the
        model's last hidden state for that document. Empty if `docs` is empty.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.

    Notes
    -----
    Uses the notebook-level `tokenizer` and `model`. Inputs are moved to
    whatever device the model currently lives on. Texts are truncated to
    512 tokens.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if len(docs) == 0:
        return {}

    embeddings = {}
    # Follow the model's own placement rather than a separate global, so the
    # inputs can never end up on a different device than the weights.
    target_device = model.device

    for i in tqdm(range(0, len(docs), batch_size), desc="Embedding documents"):
        batch_docs = docs[i:i+batch_size]
        batch_texts = [doc["text"] for doc in batch_docs]
        doc_ids = [doc["docid"] for doc in batch_docs]

        # Tokenize the texts
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(target_device)

        # Use BERT (inference only, no gradients needed)
        with torch.no_grad():
            outputs = model(**inputs)

        # Keep the batch dimension: squeezing here would collapse a final
        # batch of size 1 into a single vector, and the zip below would then
        # pair the docid with a scalar instead of the whole embedding.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu()

        for doc_id, cls_embedding in zip(doc_ids, cls_embeddings):
            embeddings[doc_id] = cls_embedding

    return embeddings


In [31]:
# Embed the whole corpus once; the result maps each docid to its vector.
doc_embeddings = embed_documents(docs=documents)

In [32]:
def retrieve_documents(query, embeddings, top_k=10):
    """Rank embedded documents by cosine similarity to a query.

    Parameters
    ----------
    query : str
        Query text; it is embedded with the notebook-level `tokenizer` and
        `model` (truncated to 512 tokens).
    embeddings : dict
        Maps docid to a 1-D tensor, e.g. the output of `embed_documents`.
        The tensors are expected to be on the CPU, like the query vector.
    top_k : int, default 10
        Maximum number of documents to return.

    Returns
    -------
    list of (docid, float)
        Up to `top_k` pairs of docid and cosine similarity, best match first.
        Empty when `embeddings` is empty or `top_k` is not positive.
    """
    # Nothing to rank: return early instead of failing in torch.stack on an
    # empty list (or slicing with a negative top_k).
    if not embeddings or top_k <= 0:
        return []

    # Tokenize the query and place it on the same device as the model.
    inputs = tokenizer(
        query,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(model.device)

    # Use BERT (inference only, no gradients needed)
    with torch.no_grad():
        outputs = model(**inputs)

    # First-token vector of the single query row, as a 1-D CPU tensor.
    query_embedding = outputs.last_hidden_state[0, 0, :].cpu()

    doc_ids = list(embeddings.keys())
    doc_matrix = torch.stack(list(embeddings.values()))

    # Cosine similarity = dot product of unit vectors. normalize() clamps the
    # norm away from zero, so an all-zero vector yields 0 instead of NaN.
    query_unit = torch.nn.functional.normalize(query_embedding, dim=0)
    doc_units = torch.nn.functional.normalize(doc_matrix, dim=1)
    similarities = torch.matmul(doc_units, query_unit)

    # topk returns the k largest scores in descending order without sorting
    # the whole corpus; k is capped at the number of documents.
    k = min(top_k, len(doc_ids))
    top_scores, top_indices = torch.topk(similarities, k)

    return [
        (doc_ids[index], score)
        for index, score in zip(top_indices.tolist(), top_scores.tolist())
    ]

In [None]:
# Retrieve the ten best-matching docids for every test query and collect one
# record per test row.
sample_data = []
progress = tqdm(test.iterrows(), total=len(test), desc="Retrieving documents")
for _, test_row in progress:
    ranked = retrieve_documents(test_row["query"], doc_embeddings, top_k=10)
    # Keep only the docids; the similarity scores are not written out.
    sample_data.append(
        {"id": test_row["id"], "docids": [hit_id for hit_id, _score in ranked]}
    )

# Write the collected records to sample.csv without the DataFrame index.
sample = pd.DataFrame(sample_data)
sample.to_csv("sample.csv", index=False)