# Outline

I want to compare embedding models for their scores on a small quantitative benchmark, as well as visually on a new custom qualitative benchmark

## Models

- sentence-transformers/all-MiniLM-L6-v2
- sentence-transformers/all-mpnet-base-v2
- nomic-ai/nomic-embed-text-v1
- nomic-ai/nomic-embed-text-v1.5
- nomic-ai/modernbert-embed-base
- lightonai/modernbert-embed-large
- dunzhang/stella_en_400M_v5
- mixedbread-ai/mxbai-embed-large-v1
- jinaai/jina-embeddings-v3
- Snowflake/snowflake-arctic-embed-l-v2.0
- ibm-granite/granite-embedding-278m-multilingual
- BAAI/bge-m3
- Alibaba-NLP/gte-multilingual-base
- intfloat/e5-base-v2
- intfloat/e5-large-v2
- answerdotai/answerai-colbert-small-v1
- jxm/cde-small-v1
- jxm/cde-small-v2

In [1]:
models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "nomic-ai/nomic-embed-text-v1",
    "nomic-ai/nomic-embed-text-v1.5", 
    "nomic-ai/modernbert-embed-base",
    "lightonai/modernbert-embed-large",
    "dunzhang/stella_en_400M_v5",
    "mixedbread-ai/mxbai-embed-large-v1",
    "jinaai/jina-embeddings-v3",
    "Snowflake/snowflake-arctic-embed-l-v2.0",
    "ibm-granite/granite-embedding-278m-multilingual",
    "BAAI/bge-m3",
    "Alibaba-NLP/gte-multilingual-base",
    "intfloat/e5-base-v2",
    "intfloat/e5-large-v2",
    "answerdotai/answerai-colbert-small-v1",
    "jxm/cde-small-v1",
    "jxm/cde-small-v2"
]


In [25]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

In [12]:
from datasets import load_dataset

d0_corpus = load_dataset("zeta-alpha-ai/NanoTouche2020", "corpus")["train"]
d0_qrels = load_dataset("zeta-alpha-ai/NanoTouche2020", "qrels")["train"]
d0_queries = load_dataset("zeta-alpha-ai/NanoTouche2020", "queries")["train"]

In [23]:
# Create a mapping from query ID to relevant corpus IDs
qrels_dict = {}
for sample in d0_qrels:
    query_id = sample['query-id']
    if query_id not in qrels_dict:
        qrels_dict[query_id] = set()
    qrels_dict[query_id].add(sample['corpus-id'])

In [5]:
model_allminilml6v2 = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [22]:
# Encode all corpus texts
query_embeddings = model_allminilml6v2.encode(d0_queries['text'])
corpus_embeddings = model_allminilml6v2.encode(d0_corpus['text'])

In [26]:
# Convert embeddings to torch tensors
query_embeddings_tensor = torch.tensor(query_embeddings)
corpus_embeddings_tensor = torch.tensor(corpus_embeddings)

In [27]:
# Calculate scores and metrics
ndcg_scores = []
recall_at_k_scores = []
k = 10

for idx, query_embedding in enumerate(query_embeddings):
    # Calculate cosine similarities using cos_sim
    similarities = cos_sim(
        query_embeddings_tensor[idx:idx+1], 
        corpus_embeddings_tensor
    ).squeeze()
    
    # Get top k document indices (make sure k is not larger than corpus size)
    k_actual = min(k, len(corpus_embeddings))
    top_k_indices = torch.topk(similarities, k=k_actual).indices.tolist()
    
    # Get relevant docs for this query
    query_id = d0_queries[idx]['_id']
    relevant_docs = qrels_dict.get(query_id, set())
    
    # Calculate metrics
    retrieved_relevant = [1 if d0_corpus[i]['_id'] in relevant_docs else 0 for i in top_k_indices]
    
    # Calculate NDCG@k
    dcg = sum((rel / np.log2(rank + 2)) for rank, rel in enumerate(retrieved_relevant))
    ideal_rel = sorted(retrieved_relevant, reverse=True)
    idcg = sum((rel / np.log2(rank + 2)) for rank, rel in enumerate(ideal_rel))
    ndcg = dcg / idcg if idcg > 0 else 0
    ndcg_scores.append(ndcg)
    
    # Calculate Recall@k
    num_relevant_retrieved = sum(retrieved_relevant)
    recall = num_relevant_retrieved / len(relevant_docs) if len(relevant_docs) > 0 else 0
    recall_at_k_scores.append(recall)

# Print results
print(f"Average NDCG@{k}: {np.mean(ndcg_scores):.4f}")
print(f"Average Recall@{k}: {np.mean(recall_at_k_scores):.4f}")

Average NDCG@10: 0.7310
Average Recall@10: 0.2819
