In [None]:
# Show the GPU attached to this runtime (device name, driver/CUDA version, memory, utilisation).
!nvidia-smi

Tue Aug  5 17:04:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P0             43W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Standard Libraries
!pip install rank_bm25
import os
import json
import pickle
import random
from collections import Counter
from statistics import mean

# Data Handling
import pandas as pd
import numpy as np

# Progress Bar
from tqdm import tqdm

# NLP & Transformers
import nltk
from nltk.tokenize import word_tokenize
from torch.optim import AdamW  # <- use this instead of transformers.AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    BertConfig,
    get_linear_schedule_with_warmup
)


# PyTorch
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax

# Download NLTK tokenizer models.
# 'punkt' is the legacy resource; recent NLTK releases make word_tokenize look up
# 'punkt_tab' instead, so fetch both to keep tokenization working on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from google.colab import drive

# Make Google Drive visible under /content/drive.
drive.mount('/content/drive')

# Dataset folder on Drive, and the three TSV files read from it.
data_dir = '/content/drive/MyDrive/FIQA/Data'

file_answers, file_questions, file_qid_docid = (
    os.path.join(data_dir, tsv_name)
    for tsv_name in (
        'FiQA_train_doc_final.tsv',
        'FiQA_train_question_final.tsv',
        'FiQA_train_question_doc_final.tsv',
    )
)

Mounted at /content/drive


In [None]:
import pickle

# Load the training set from the dataset folder defined in the Drive cell (data_dir),
# instead of repeating the absolute path here.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load pickles
# that were produced by a trusted source.
train_set_path = os.path.join(data_dir, 'train_set_50.pickle')
with open(train_set_path, 'rb') as f:
    train_set = pickle.load(f)

# Total number of entries
print("Total training entries:", len(train_set))

# Optional: preview the first entry. Skipped for an empty set, where train_set[0] would raise.
# Each entry looks like [query id, [relevant doc ids], [candidate doc ids]] — TODO confirm.
if len(train_set) > 0:
    print("Sample entry:", train_set[0])


Total training entries: 5676
Sample entry: [0, [18850], [531578, 417981, 324911, 524879, 397608, 216077, 173212, 434846, 104464, 326261, 528838, 234436, 571062, 196374, 481692, 207449, 338700, 153377, 406418, 327002, 421301, 11538, 375748, 238271, 322893, 130631, 483385, 73427, 560087, 531442, 156554, 541809, 562777, 192843, 553328, 283505, 209224, 351672, 324513, 18850, 55200, 540395, 297841, 367754, 455984, 160340, 577284, 287474, 565935, 354716]]


In [None]:
!pip install faiss-cpu --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pickle
import faiss
import warnings
import logging

# Silence all Python warnings and restrict the transformers logger to errors.
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

# --- Config ---
model_name = "sentence-transformers/msmarco-MiniLM-L-6-v3"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
top_k, embedding_dim = 50, 384  # retrieval depth; MiniLM outputs 384-dim vectors

# --- Load model & tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)  # moves the module's parameters in place
model.eval()      # inference mode

# --- Load data ---
# Two pickled dicts: doc id -> text and query id -> text.
with open("/content/drive/MyDrive/FIQA/docid_to_text.pickle", "rb") as f:
    docid_to_text = pickle.load(f)

with open("/content/drive/MyDrive/FIQA/qid_to_text.pickle", "rb") as f:
    qid_to_text = pickle.load(f)

# --- Helper: Mean Pooling ---
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: object exposing ``last_hidden_state``, a tensor with the sequence
            axis at dim 1 (summed over below) — assumed (batch, seq_len, hidden).
        attention_mask: tensor broadcastable to ``last_hidden_state`` after a trailing
            axis is added; non-zero marks real tokens, zero marks padding.

    Returns:
        Tensor with one pooled vector per sequence. A sequence whose mask is entirely
        zero yields a zero vector instead of NaN.
    """
    token_embeddings = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(1)
    # Clamp the token count so a fully masked row divides by a tiny positive number
    # (result 0) rather than by zero (result NaN, which would poison the FAISS index).
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / token_counts

# --- 1. Encode all documents ---
docids = list(docid_to_text.keys())
doc_texts = [docid_to_text[docid] for docid in docids]
encode_batch_size = 64  # texts per forward pass; lower this if the GPU runs out of memory


def encode_texts(texts, desc):
    """Embed a non-empty list of strings with the MiniLM encoder.

    Texts are encoded in batches of ``encode_batch_size`` and padded only to the longest
    sequence in each batch, instead of one forward pass per text padded to 256 tokens.
    Padding is excluded by ``mean_pooling``, so the embeddings match the unbatched ones
    up to floating-point noise.

    Args:
        texts: list of strings to embed.
        desc: label shown on the progress bar.

    Returns:
        float32 numpy array with one row per input text, in input order.
    """
    chunks = []
    for start in tqdm(range(0, len(texts), encode_batch_size), desc=desc):
        batch = texts[start:start + encode_batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, max_length=256, return_tensors="pt")
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            output = model(**encoded)
            pooled = mean_pooling(output, encoded['attention_mask'])
        chunks.append(pooled.cpu().numpy())
    return np.vstack(chunks).astype("float32")


doc_embeddings = encode_texts(doc_texts, desc="Encoding documents")

# --- 2. Build FAISS index ---
# Take the dimension from the embeddings themselves so the index always matches the model.
index = faiss.IndexFlatIP(doc_embeddings.shape[1])
faiss.normalize_L2(doc_embeddings)  # Normalize docs for cosine similarity
index.add(doc_embeddings)

# --- 3. Encode queries and retrieve top-50 docs ---
qid_to_top_docs = {}
query_ids = list(qid_to_text.keys())

if query_ids:
    query_embeddings = encode_texts(
        [qid_to_text[qid] for qid in query_ids], desc="Retrieving for queries"
    )
    faiss.normalize_L2(query_embeddings)  # Normalize queries for cosine similarity
    D, I = index.search(query_embeddings, top_k)  # one row of neighbours per query
    for qid, neighbour_row in zip(query_ids, I):
        # FAISS pads with -1 when the index holds fewer than top_k vectors; drop those
        # so they are not silently mapped to docids[-1].
        qid_to_top_docs[str(qid)] = [str(docids[i]) for i in neighbour_row if i >= 0]

# --- 4. Save results ---
with open("/content/drive/MyDrive/FIQA/Data/minilm_retrieved_top50.pickle", "wb") as f:
    pickle.dump(qid_to_top_docs, f)

print(" Dense retrieval with MiniLM completed and saved!")


tokenizer_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Encoding documents: 100%|██████████| 57600/57600 [05:30<00:00, 174.29it/s]
Retrieving for queries: 100%|██████████| 6648/6648 [01:23<00:00, 79.75it/s]


 Dense retrieval with MiniLM completed and saved!


In [None]:
import pickle
import numpy as np

# --- 1. Load Ground Truth Labels ---
# Keys and doc ids are cast to str so they compare equal to the ids stored in the rankings.
with open("/content/drive/MyDrive/FIQA/labels.pickle", "rb") as f:
    ground_truth_labels = {
        str(query_id): set(map(str, relevant_ids))
        for query_id, relevant_ids in pickle.load(f).items()
    }

# --- 2. Load MiniLM Retrieval Result ---
with open("/content/drive/MyDrive/FIQA/Data/minilm_retrieved_top50.pickle", "rb") as f:
    minilm_rankings = pickle.load(f)

# --- 3. Evaluation Functions ---
def evaluate(ranked_docs, relevant_docs, k):
    """Score rankings against binary relevance labels with nDCG@k, MRR@k and Precision@k.

    Args:
        ranked_docs: mapping query id -> list of doc ids, best first.
        relevant_docs: mapping query id -> sized collection (e.g. set) of relevant doc
            ids. A query missing from this mapping is treated as having no relevant
            documents and therefore scores 0 on every metric.
        k: rank cutoff applied to all three metrics.

    Returns:
        dict with keys ``nDCG@{k}``, ``MRR@{k}`` and ``Precision@{k}``, each the mean
        over every query in ``ranked_docs`` (NaN when ``ranked_docs`` is empty).
    """
    def dcg(rels):
        # Binary gain with the usual log2 rank discount (rank 1 -> 1/log2(2) = 1).
        return sum(1 / np.log2(i + 2) for i, rel in enumerate(rels) if rel)

    def ndcg(preds, gold):
        rels = [1 if doc in gold else 0 for doc in preds[:k]]
        # The ideal ranking puts every relevant document (up to k) first. Building it
        # from the retrieved hits alone would give a perfect score to any ranking whose
        # hits are in order, even when most relevant documents were missed.
        ideal_dcg = dcg([1] * min(len(gold), k))
        return dcg(rels) / ideal_dcg if ideal_dcg > 0 else 0

    def mrr(preds, gold):
        # Reciprocal rank of the first relevant document within the cutoff.
        for i, doc in enumerate(preds[:k]):
            if doc in gold:
                return 1 / (i + 1)
        return 0

    def precision_at_k(preds, gold):
        # Always divided by k, even when fewer than k documents were retrieved.
        return sum(1 for doc in preds[:k] if doc in gold) / k

    ndcgs, mrrs, precisions = [], [], []
    for qid, pred_docs in ranked_docs.items():
        gold_docs = relevant_docs.get(qid, set())
        ndcgs.append(ndcg(pred_docs, gold_docs))
        mrrs.append(mrr(pred_docs, gold_docs))
        precisions.append(precision_at_k(pred_docs, gold_docs))

    return {
        f"nDCG@{k}": np.mean(ndcgs),
        f"MRR@{k}": np.mean(mrrs),
        f"Precision@{k}": np.mean(precisions),
    }

# --- 4. Run Evaluation ---
k = 10  # cutoff shared by all three metrics
results = evaluate(ranked_docs=minilm_rankings, relevant_docs=ground_truth_labels, k=k)

# --- 5. Print Results ---
print("\nMiniLM Dense Retrieval Evaluation:")
for metric_name, metric_value in results.items():
    print("{}: {:.4f}".format(metric_name, metric_value))



MiniLM Dense Retrieval Evaluation:
nDCG@10: 0.3018
MRR@10: 0.2706
Precision@10: 0.0593


In [None]:
import pickle
import numpy as np

# --- 1. Load Ground Truth Labels ---
# Keys and doc ids are cast to str so they compare equal to the ids stored in the rankings.
with open("/content/drive/MyDrive/FIQA/labels.pickle", "rb") as f:
    ground_truth_labels = {
        str(query_id): set(map(str, relevant_ids))
        for query_id, relevant_ids in pickle.load(f).items()
    }

# --- 2. Load MiniLM Retrieval Result ---
with open("/content/drive/MyDrive/FIQA/Data/minilm_retrieved_top50.pickle", "rb") as f:
    minilm_rankings = pickle.load(f)

# --- 3. Evaluation Functions ---
def evaluate_custom(ranked_docs, relevant_docs, ndcg_k, mrr_k, precision_k):
    """Score rankings with nDCG, MRR and Precision, each at its own rank cutoff.

    Args:
        ranked_docs: mapping query id -> list of doc ids, best first.
        relevant_docs: mapping query id -> sized collection (e.g. set) of relevant doc
            ids. A query missing from this mapping is treated as having no relevant
            documents and therefore scores 0 on every metric.
        ndcg_k: cutoff for nDCG.
        mrr_k: cutoff for MRR.
        precision_k: cutoff for Precision.

    Returns:
        dict with keys ``nDCG@{ndcg_k}``, ``MRR@{mrr_k}`` and ``Precision@{precision_k}``,
        each the mean over every query in ``ranked_docs`` (NaN when it is empty).
    """
    def dcg(rels):
        # Binary gain with the usual log2 rank discount (rank 1 -> 1/log2(2) = 1).
        return sum(1 / np.log2(i + 2) for i, rel in enumerate(rels) if rel)

    def ndcg(preds, gold, k):
        rels = [1 if doc in gold else 0 for doc in preds[:k]]
        # The ideal ranking puts every relevant document (up to k) first. Building it
        # from the retrieved hits alone would give a perfect score to any ranking whose
        # hits are in order, even when most relevant documents were missed.
        ideal_dcg = dcg([1] * min(len(gold), k))
        return dcg(rels) / ideal_dcg if ideal_dcg > 0 else 0

    def mrr(preds, gold, k):
        # Reciprocal rank of the first relevant document within the cutoff.
        for i, doc in enumerate(preds[:k]):
            if doc in gold:
                return 1 / (i + 1)
        return 0

    def precision_at_k(preds, gold, k):
        # Always divided by k, even when fewer than k documents were retrieved.
        return sum(1 for doc in preds[:k] if doc in gold) / k

    ndcgs, mrrs, precisions = [], [], []
    for qid, pred_docs in ranked_docs.items():
        gold_docs = relevant_docs.get(qid, set())
        ndcgs.append(ndcg(pred_docs, gold_docs, ndcg_k))
        mrrs.append(mrr(pred_docs, gold_docs, mrr_k))
        precisions.append(precision_at_k(pred_docs, gold_docs, precision_k))

    return {
        f"nDCG@{ndcg_k}": np.mean(ndcgs),
        f"MRR@{mrr_k}": np.mean(mrrs),
        f"Precision@{precision_k}": np.mean(precisions),
    }

# --- 4. Run Evaluation ---
# nDCG and MRR at rank 10, Precision at rank 1.
results = evaluate_custom(minilm_rankings, ground_truth_labels, 10, 10, 1)

# --- 5. Print Results ---
print("\nMiniLM Dense Retrieval Evaluation:")
for metric_name, metric_value in results.items():
    print("{}: {:.4f}".format(metric_name, metric_value))



MiniLM Dense Retrieval Evaluation:
nDCG@10: 0.3018
MRR@10: 0.2706
Precision@1: 0.2041


In [None]:
from sentence_transformers import SentenceTransformer
import pickle
import numpy as np
import faiss
from tqdm import tqdm

# Load fine-tuned financial retriever
model = SentenceTransformer('mukaj/fin-mpnet-base')
embedding_dim = model.get_sentence_embedding_dimension()

# Two pickled dicts: doc id -> text and query id -> text.
with open("/content/drive/MyDrive/FIQA/docid_to_text.pickle", "rb") as f:
    docid_to_text = pickle.load(f)
with open("/content/drive/MyDrive/FIQA/qid_to_text.pickle", "rb") as f:
    qid_to_text = pickle.load(f)

docids = list(docid_to_text.keys())
doc_texts = [docid_to_text[d] for d in docids]

# Step 1: encode docs
doc_embeddings = model.encode(doc_texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
faiss.normalize_L2(doc_embeddings)  # unit-length rows: inner product == cosine similarity

# Step 2: create FAISS index
index = faiss.IndexFlatIP(embedding_dim)
index.add(doc_embeddings)

# Step 3: encode queries and retrieve
# All queries are encoded in batches and searched with a single FAISS call, instead of
# one encode() + search() round trip per query.
top_k = 50
qid_to_top = {}
query_ids = list(qid_to_text.keys())

if query_ids:
    query_embeddings = model.encode(
        [qid_to_text[qid] for qid in query_ids],
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    faiss.normalize_L2(query_embeddings)
    D, I = index.search(query_embeddings, top_k)  # one row of neighbours per query
    for qid, neighbour_row in tqdm(zip(query_ids, I), total=len(query_ids)):
        # FAISS pads with -1 when the index holds fewer than top_k vectors; drop those
        # so they are not silently mapped to docids[-1].
        qid_to_top[str(qid)] = [str(docids[i]) for i in neighbour_row if i >= 0]

with open("/content/drive/MyDrive/FIQA/Data/finmpnet_retrieved_top50.pickle", "wb") as f:
    pickle.dump(qid_to_top, f)

print("✅ fin‑mpnet retrieval done")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/900 [00:00<?, ?it/s]

100%|██████████| 6648/6648 [03:37<00:00, 30.56it/s]


✅ fin‑mpnet retrieval done


In [None]:
import pickle
import numpy as np

# --- 1. Load Ground Truth Labels ---
# Keys and doc ids are cast to str so they compare equal to the ids stored in the rankings.
with open("/content/drive/MyDrive/FIQA/labels.pickle", "rb") as f:
    ground_truth_labels = {
        str(query_id): set(map(str, relevant_ids))
        for query_id, relevant_ids in pickle.load(f).items()
    }

# --- 2. Load FinMPNet Rankings ---
with open("/content/drive/MyDrive/FIQA/Data/finmpnet_retrieved_top50.pickle", "rb") as f:
    finmpnet_rankings = pickle.load(f)

# --- 3. Evaluation Function ---
def evaluate_custom(ranked_docs, relevant_docs, ndcg_k=10, mrr_k=10, precision_k=1):
    """Score rankings with nDCG, MRR and Precision (defaults: nDCG@10, MRR@10, Precision@1).

    The cutoffs are optional so that both the two-argument call used for FinMPNet and
    calls passing ``ndcg_k``/``mrr_k``/``precision_k`` explicitly are accepted.

    Args:
        ranked_docs: mapping query id -> list of doc ids, best first.
        relevant_docs: mapping query id -> sized collection (e.g. set) of relevant doc
            ids. A query missing from this mapping is treated as having no relevant
            documents and therefore scores 0 on every metric.
        ndcg_k: cutoff for nDCG.
        mrr_k: cutoff for MRR.
        precision_k: cutoff for Precision.

    Returns:
        dict with keys ``nDCG@{ndcg_k}``, ``MRR@{mrr_k}`` and ``Precision@{precision_k}``,
        each the mean over every query in ``ranked_docs`` (NaN when it is empty).
    """
    def dcg(rels):
        # Binary gain with the usual log2 rank discount (rank 1 -> 1/log2(2) = 1).
        return sum(1 / np.log2(i + 2) for i, rel in enumerate(rels) if rel)

    def ndcg(preds, gold):
        rels = [1 if doc in gold else 0 for doc in preds[:ndcg_k]]
        # The ideal ranking puts every relevant document (up to the cutoff) first.
        # Building it from the retrieved hits alone would give a perfect score to any
        # ranking whose hits are in order, even when most relevant docs were missed.
        ideal_dcg = dcg([1] * min(len(gold), ndcg_k))
        return dcg(rels) / ideal_dcg if ideal_dcg > 0 else 0

    def mrr(preds, gold):
        # Reciprocal rank of the first relevant document within the cutoff.
        for i, doc in enumerate(preds[:mrr_k]):
            if doc in gold:
                return 1 / (i + 1)
        return 0

    def precision(preds, gold):
        # Always divided by the cutoff, even when fewer documents were retrieved;
        # with the default cutoff of 1 this is 1.0 iff the top document is relevant.
        return sum(1 for doc in preds[:precision_k] if doc in gold) / precision_k

    ndcgs, mrrs, precisions = [], [], []
    for qid, pred_docs in ranked_docs.items():
        gold_docs = relevant_docs.get(qid, set())
        ndcgs.append(ndcg(pred_docs, gold_docs))
        mrrs.append(mrr(pred_docs, gold_docs))
        precisions.append(precision(pred_docs, gold_docs))

    return {
        f"nDCG@{ndcg_k}": np.mean(ndcgs),
        f"MRR@{mrr_k}": np.mean(mrrs),
        f"Precision@{precision_k}": np.mean(precisions),
    }

# --- 4. Run Evaluation ---
results = evaluate_custom(ranked_docs=finmpnet_rankings, relevant_docs=ground_truth_labels)

# --- 5. Print Results ---
print("\nFinMPNet Evaluation:")
for metric_name, metric_value in results.items():
    print("{}: {:.4f}".format(metric_name, metric_value))



FinMPNet Evaluation:
nDCG@10: 0.8398
MRR@10: 0.8206
Precision@1: 0.7306




---

---



---



---



---



---



