In [1]:
%load_ext jupyter_black

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# import torch
import numpy as np
import faiss
import json
from loguru import logger
from sklearn.preprocessing import normalize

In [3]:
from src.get_embedding import get_embedding
from src.load_data import load_data

In [4]:
# Define models
# Registry of every encoder under evaluation, keyed by display name.
# "transformer" entries carry a Hugging Face model together with its tokenizer;
# the "sentence-transformer" entry carries only the model (no "tokenizer" key).
models = {
    "EuroBERT": dict(
        model=AutoModel.from_pretrained(
            "EuroBERT/EuroBERT-210m", trust_remote_code=True
        ),
        tokenizer=AutoTokenizer.from_pretrained(
            "EuroBERT/EuroBERT-210m", trust_remote_code=True
        ),
        type="transformer",
    ),
    "EuroBERT_FT": dict(
        model=AutoModel.from_pretrained(
            "nomic-ai/eurobert-210m-2e4-128sl-full-ft", trust_remote_code=True
        ),
        tokenizer=AutoTokenizer.from_pretrained(
            "nomic-ai/eurobert-210m-2e4-128sl-full-ft", trust_remote_code=True
        ),
        type="transformer",
    ),
    "RuModernBERT_USER2_FT": dict(
        model=AutoModel.from_pretrained("deepvk/USER2-base", trust_remote_code=True),
        tokenizer=AutoTokenizer.from_pretrained(
            "deepvk/USER2-base", trust_remote_code=True
        ),
        type="transformer",
    ),
    "LaBSE": dict(
        model=AutoModel.from_pretrained("sentence-transformers/LaBSE"),
        tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/LaBSE"),
        type="transformer",
    ),
    # Only this model is loaded with output_attentions=True.
    "RuModernBERT": dict(
        model=AutoModel.from_pretrained(
            "deepvk/RuModernBERT-base", output_attentions=True, trust_remote_code=True
        ),
        tokenizer=AutoTokenizer.from_pretrained(
            "deepvk/RuModernBERT-base", trust_remote_code=True
        ),
        type="transformer",
    ),
    "E5": dict(
        model=AutoModel.from_pretrained("intfloat/multilingual-e5-base"),
        tokenizer=AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base"),
        type="transformer",
    ),
    "MiniLM": dict(
        model=SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2"),
        type="sentence-transformer",
    ),
    "BERT-multilingual": dict(
        model=AutoModel.from_pretrained("bert-base-multilingual-cased"),
        tokenizer=AutoTokenizer.from_pretrained("bert-base-multilingual-cased"),
        type="transformer",
    ),
    "gte-multilingual-base": dict(
        model=AutoModel.from_pretrained(
            "Alibaba-NLP/gte-multilingual-base", trust_remote_code=True
        ),
        tokenizer=AutoTokenizer.from_pretrained(
            "Alibaba-NLP/gte-multilingual-base", trust_remote_code=True
        ),
        type="transformer",
    ),
    "Granite-Embedding-Multilingual": dict(
        model=AutoModel.from_pretrained(
            "ibm-granite/granite-embedding-278m-multilingual"
        ),
        tokenizer=AutoTokenizer.from_pretrained(
            "ibm-granite/granite-embedding-278m-multilingual"
        ),
        type="transformer",
    ),
}

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def make_vector_index(model_type, tokenizer, model, support_passages):
    """Embed every support passage and load the vectors into a flat FAISS index.

    Each passage is embedded individually through ``get_embedding``. The
    embeddings are stacked into one float32 matrix and every row is scaled to
    unit L2 norm before being added to the index.

    Args:
        model_type: Forwarded to ``get_embedding``.
        tokenizer: Forwarded to ``get_embedding``.
        model: Forwarded to ``get_embedding``.
        support_passages: Iterable of passage texts, indexed in iteration order.

    Returns:
        A populated ``faiss.IndexFlatL2``.
    """
    passage_vectors = []
    for passage in support_passages:
        passage_vectors.append(
            get_embedding(
                model_type=model_type, tokenizer=tokenizer, model=model, text=passage
            )
        )

    # One row per passage, cast to float32 and scaled to unit length.
    matrix = np.vstack(passage_vectors).astype("float32")
    matrix = normalize(matrix, norm="l2", axis=1)

    # With unit-length rows, ordering by L2 distance matches ordering by
    # cosine similarity.
    flat_index = faiss.IndexFlatL2(matrix.shape[1])
    flat_index.add(matrix)
    return flat_index


def retrieve_query(
    index, model_type, tokenizer, model, query, top_k=3, passages_df=None
):
    """Embed ``query`` and return the passage rows the index ranks closest.

    Args:
        index: Object exposing ``search(vectors, k)`` (e.g. the index returned
            by ``make_vector_index``).
        model_type: Forwarded to ``get_embedding``.
        tokenizer: Forwarded to ``get_embedding``.
        model: Forwarded to ``get_embedding``.
        query: Text to embed and look up.
        top_k: Maximum number of rows to return.
        passages_df: DataFrame whose positional row order matches the order in
            which passages were added to ``index``. When omitted, the
            notebook-level ``df`` is used, which preserves the original
            behaviour for existing callers.

    Returns:
        The rows of the passage frame selected by position, in the order the
        index returned them. Fewer than ``top_k`` rows come back when the index
        reports fewer hits.

    Raises:
        Exception: Any error raised while embedding the query is logged and
            re-raised unchanged.
    """
    try:
        emb = get_embedding(
            model_type=model_type, tokenizer=tokenizer, model=model, text=query
        ).astype("float32")
    except Exception:
        # Exception (not a bare except) so Ctrl-C is not reported as an
        # embedding failure; the error still propagates.
        logger.warning(f"Could not embed query {query}")
        raise

    # Accept both a flat (dim,) vector and an already batched (1, dim) array.
    # The query is not re-normalised: make_vector_index stores unit-length
    # passage vectors, so the L2 ordering already matches the cosine ordering.
    query_matrix = np.ascontiguousarray(emb).reshape(1, -1)
    _distances, neighbour_ids = index.search(query_matrix, top_k)

    # FAISS pads the id row with -1 when it finds fewer than top_k neighbours;
    # passing -1 to .iloc would silently return the last row as a bogus hit.
    hits = neighbour_ids[0]
    hits = hits[hits >= 0]

    frame = df if passages_df is None else passages_df
    return frame.iloc[hits]


def compute_retrieval_results(
    index,
    model_type,
    tokenizer,
    model,
    model_name,
    df,
    language,
    verbose=False,
    k=None,
):
    """Run the active and passive query of every row and save the hit ids.

    For each row of ``df`` the ``query_active`` and ``query_passive`` texts are
    sent through ``retrieve_query`` and the ``id`` column of the returned rows
    is recorded. The mapping ``{row id: {"active": [...], "passive": [...]}}``
    is written to ``results/retrieval_results_{language}_{model_name}.json``.

    Note: ``retrieve_query`` resolves hits against the notebook-level ``df``
    unless told otherwise, so the frame passed here is expected to be that
    same frame.

    Args:
        index: Search index forwarded to ``retrieve_query``.
        model_type: Forwarded to ``retrieve_query``.
        tokenizer: Forwarded to ``retrieve_query``.
        model: Forwarded to ``retrieve_query``.
        model_name: Used in the output file name.
        df: DataFrame with ``id``, ``query_active`` and ``query_passive`` columns.
        language: Used in the output file name.
        verbose: When true, log each query pair before retrieving.
        k: Number of hits to request per query. When omitted, the
            notebook-level ``top_k`` is used, as before.

    Returns:
        None. The results are only written to disk.
    """
    import os  # local import so this cell does not depend on the import cell

    hits_per_query = top_k if k is None else k

    def _retrieved_ids(query):
        # Ids of the passages retrieved for one query text.
        return retrieve_query(
            index=index,
            model_type=model_type,
            tokenizer=tokenizer,
            model=model,
            query=query,
            top_k=hits_per_query,
        )["id"].tolist()

    retrieval_results = {}
    for _, row in df.iterrows():
        q_active = row["query_active"]
        q_passive = row["query_passive"]
        if verbose:
            logger.info(f"Retrieving for {q_active}, {q_passive}")
        retrieval_results[row["id"]] = {
            "active": _retrieved_ids(q_active),
            "passive": _retrieved_ids(q_passive),
        }

    # Create the output directory up front so a fresh checkout does not fail
    # after all the embedding work is done.
    os.makedirs("results", exist_ok=True)
    with open(
        f"results/retrieval_results_{language}_{model_name}.json",
        "w",
        encoding="utf-8",
    ) as f:
        json.dump(retrieval_results, f)

In [6]:
# Number of passages requested per query. compute_retrieval_results reads this
# name as a notebook-level global, so this cell must run before it is called.
top_k = 5

In [7]:
# Evaluate every registered model on both languages; each (model, language)
# pair produces one JSON results file via compute_retrieval_results.
for model_name, loaded_models in models.items():
    model = loaded_models["model"]
    tokenizer = loaded_models.get("tokenizer")  # absent for sentence-transformer entries
    model_type = loaded_models["type"]
    for language in ("en", "ru"):
        logger.info(f"PROCESSING {model_name}, {language}")
        # `df` must stay a notebook-level name: retrieve_query reads it as a
        # global when mapping index hits back to rows.
        df = load_data(language)
        support_passages = df["support_passage"].tolist()
        index = make_vector_index(model_type, tokenizer, model, support_passages)
        compute_retrieval_results(
            index, model_type, tokenizer, model, model_name, df, language
        )

[32m2025-06-11 16:10:11.342[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mPROCESSING EuroBERT, en[0m
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[32m2025-06-11 16:11:11.644[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mPROCESSING EuroBERT, ru[0m
[32m2025-06-11 16:12:17.369[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mPROCESSING EuroBERT_FT, en[0m
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[32m2025-06-11 16:13:19.719[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mPROCESSING EuroBERT_FT, ru[0m
[32m2025-06-11 16:14:24.603[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mPROCESSING RuModernBERT_USER2_FT, en[0m
[32m2025-06-11 16:15:30