In [1]:
import time
import pandas as pd

from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

import retrival_system as rs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path of the corpus CSV that is loaded with pandas and indexed for retrieval.
DATA_CSV_PATH = "multilingual_dataset.csv"

# Embedding models to benchmark, evaluated in the order listed. Each entry is
# passed unchanged as the `model` argument of OllamaEmbeddings.
MODEL_CANDIDATES = [
    "snowflake-arctic-embed2:latest",
    "nomic-embed-text:latest",
    "mxbai-embed-large:latest",
    "snowflake-arctic-embed:latest",
    "embeddinggemma:latest",
]

# Number of results requested per query and the cut-off used by both metrics.
# NOTE(review): every EVAL_QUERIES entry lists 10 relevant ids, so with a
# cut-off of 5 the recall metric cannot exceed 0.5. The result keys and the
# printed labels also hard-code "@5", so they will not follow a change here.
TOP_K = 5

In [3]:
# Evaluation set: one entry per query, holding the query text and the doc_ids
# counted as relevant for it. Queries come in pairs that share the same
# relevant ids: first a non-English query (English translation in the trailing
# comment), then an English query on the same topic.
# NOTE(review): the id ranges assume the corpus groups each topic into a
# contiguous block of 10 doc_ids - confirm against the CSV.
EVAL_QUERIES = [
    # Topic: machine learning for predictive analytics (doc_ids 1-10)
    {
        "query": "مشین لرننگ کے الگورتھم جدید نظاموں میں پیش گوئی کے تجزیے کے لئے کیسے استعمال ہوتے ہیں؟",  # How are machine learning algorithms used for predictive analytics in modern systems?
        "relevant_doc_ids": list(range(1, 11)),
    },
    {
        "query": "machine learning algorithms for predictive analytics in modern systems",
        "relevant_doc_ids": list(range(1, 11)),
    },

    # Topic: AI for seismic interpretation and fault detection (doc_ids 11-20)
    {
        "query": "سیسمک انٹرپریٹیشن اور فالٹ ڈیٹیکشن میں AI کا کردار",  # The role of AI in seismic interpretation and fault detection
        "relevant_doc_ids": list(range(11, 21)),
    },
    {
        "query": "AI models for fault detection in seismic volumes",
        "relevant_doc_ids": list(range(11, 21)),
    },

    # Topic: AI in healthcare diagnosis and treatment (doc_ids 21-30)
    {
        "query": "صحت کے شعبے میں AI بیماری کی تشخیص اور ذاتی علاج میں کیسے مدد کرتا ہے؟",  # How does AI help in disease diagnosis and personalized treatment in healthcare?
        "relevant_doc_ids": list(range(21, 31)),
    },
    {
        "query": "AI and neural networks for faster diagnosis and anomaly detection in MRI/CT",
        "relevant_doc_ids": list(range(21, 31)),
    },

    # Topic: climate change and emission reduction (doc_ids 31-40)
    {
        "query": "موسمیاتی تبدیلی اور گرین ہاؤس گیسوں کے اخراج میں کمی",  # Climate change and reducing greenhouse gas emissions
        "relevant_doc_ids": list(range(31, 41)),
    },
    {
        "query": "climate change, extreme weather events and reducing emissions",
        "relevant_doc_ids": list(range(31, 41)),
    },
]


In [4]:
def recall_at_k(ranked_ids, relevant_ids, k):
    """Return the fraction of relevant documents that appear in the top-k results.

    Args:
        ranked_ids: Retrieved document ids, best match first.
        relevant_ids: Iterable of ids judged relevant for the query.
        k: Cut-off rank; only the first ``k`` entries of ``ranked_ids`` count.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when ``relevant_ids`` is empty.

    Note:
        The score is bounded above by ``k / len(relevant_ids)``: with more
        relevant documents than ``k``, a perfect ranking still scores below 1.0.
    """
    relevant = set(relevant_ids)
    # Intersecting sets counts each relevant document once, even if the
    # retriever returns the same id several times (e.g. a duplicated index
    # entry); counting raw hits would inflate recall, possibly past 1.0.
    retrieved = set(ranked_ids[:k])
    return len(retrieved & relevant) / max(len(relevant), 1)


def mrr_at_k(ranked_ids, relevant_ids, k):
    """Return the reciprocal rank of the first relevant result within the top k.

    Args:
        ranked_ids: Retrieved document ids, best match first.
        relevant_ids: Container of ids judged relevant for the query.
        k: Cut-off rank; results beyond position ``k`` are ignored.

    Returns:
        ``1.0 / rank`` (1-based) of the first relevant id in the top ``k``,
        or 0.0 when none of them is relevant.
    """
    first_hit_rank = next(
        (
            rank
            for rank, doc_id in enumerate(ranked_ids[:k], start=1)
            if doc_id in relevant_ids
        ),
        None,
    )
    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank

In [5]:
# Load the corpus that every candidate model will index.
df = pd.read_csv(DATA_CSV_PATH)

# Fail fast with a readable message if the CSV lacks a column that the
# indexing step reads, instead of surfacing a KeyError deep inside the
# per-model evaluation loop.
required_columns = {"doc_id", "lang", "text"}
assert required_columns <= set(df.columns), (
    f"{DATA_CSV_PATH} is missing columns: "
    f"{sorted(required_columns - set(df.columns))}"
)

print("data:", df.shape)
df.head()

data: (200, 4)


Unnamed: 0,doc_id,lang,text,en_translation
0,1,en,Machine learning algorithms are increasingly u...,Machine learning algorithms are increasingly u...
1,2,en,Deep neural networks enable breakthroughs in c...,Deep neural networks enable breakthroughs in c...
2,3,ur,مشین لرننگ کے الگورتھم جدید نظاموں میں پیش گوئ...,Machine learning algorithms are widely used fo...
3,4,ur,ڈیپ نیورل نیٹ ورکس کمپیوٹر وژن اور اسپیچ ریکگن...,Deep neural networks make major advances in co...
4,5,hi,मशीन लर्निंग एल्गोरिदम आधुनिक प्रणालियों में भ...,Machine learning algorithms are used for predi...


In [6]:
def build_vectorstore_for_model(df_docs, model_name):
    """Embed every document with ``model_name`` and index it in a fresh Chroma collection.

    Args:
        df_docs: DataFrame providing ``text``, ``doc_id`` and ``lang`` columns.
            It is not modified; preprocessing is applied to a derived frame.
        model_name: Model identifier passed to ``OllamaEmbeddings``.

    Returns:
        The populated ``Chroma`` vector store. Each stored document carries
        ``doc_id`` and ``lang`` in its metadata.
    """
    cleaned_docs = df_docs.assign(
        text=df_docs["text"].astype(str).map(rs.preprocess_text)
    )

    documents = [
        Document(
            page_content=row["text"],
            metadata={"doc_id": row["doc_id"], "lang": row["lang"]},
        )
        for _, row in cleaned_docs.iterrows()
    ]

    embeddings = OllamaEmbeddings(model=model_name)

    # The collection name is derived from the model name with its ":tag"
    # suffix removed, so two tags of the same model map to the same collection.
    base_name = model_name.split(":")[0].replace("-", "_")
    collection_name = f"test_{base_name}"

    # Chroma's default in-memory client is shared within the process, so a
    # collection with this name survives earlier calls in the same kernel.
    # Drop it first; otherwise re-running the evaluation appends a second copy
    # of every document and the top-k results fill up with duplicates.
    Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
    ).delete_collection()

    return Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        collection_name=collection_name,
    )

In [7]:
def evaluate_model_on_queries(df_docs, model_name):
    """Index ``df_docs`` with ``model_name`` and score it on ``EVAL_QUERIES``.

    Args:
        df_docs: Corpus DataFrame, forwarded to ``build_vectorstore_for_model``.
        model_name: Embedding model identifier to evaluate.

    Returns:
        A dict with the keys ``model``, ``recall@5``, ``mrr@5`` and
        ``avg_latency_sec``; the three metrics are means over all queries.
        The cut-off actually applied is ``TOP_K``; the ``@5`` in the key names
        is fixed and does not follow that constant.

    Raises:
        ValueError: If ``EVAL_QUERIES`` is empty.
    """
    # Check before building the index so an empty query set does not cost a
    # full embedding pass and then fail with a bare ZeroDivisionError.
    if not EVAL_QUERIES:
        raise ValueError("EVAL_QUERIES is empty; nothing to evaluate")

    vectorstore = build_vectorstore_for_model(df_docs, model_name)

    recalls = []
    mrrs = []
    latencies = []

    for item in EVAL_QUERIES:
        relevant_ids = item["relevant_doc_ids"]
        clean_query = rs.preprocess_text(item["query"])

        # perf_counter is the clock intended for timing short intervals; unlike
        # time.time() it is unaffected by system clock adjustments. Only the
        # search call is timed; query preprocessing stays outside the window.
        started = time.perf_counter()
        hits = vectorstore.similarity_search_with_score(clean_query, k=TOP_K)
        latencies.append(time.perf_counter() - started)

        ranked_ids = [doc.metadata["doc_id"] for doc, _score in hits]

        recalls.append(recall_at_k(ranked_ids, relevant_ids, TOP_K))
        mrrs.append(mrr_at_k(ranked_ids, relevant_ids, TOP_K))

    query_count = len(EVAL_QUERIES)
    return {
        "model": model_name,
        "recall@5": sum(recalls) / query_count,
        "mrr@5": sum(mrrs) / query_count,
        "avg_latency_sec": sum(latencies) / query_count,
    }

In [8]:
# Benchmark each candidate model in turn and collect its metrics dict.
# Any exception raised while evaluating one model is printed and the loop
# moves on, so a single failing model does not abort the whole comparison.
results = []

for model_name in MODEL_CANDIDATES:
    try:
        metrics = evaluate_model_on_queries(df, model_name)
        results.append(metrics)

        # One summary line per model; the metric keys match the dict returned
        # by evaluate_model_on_queries.
        report_line = (
            f"{model_name} → Recall@5={metrics['recall@5']:.3f}, "
            f"MRR@5={metrics['mrr@5']:.3f}, "
            f"Latency={metrics['avg_latency_sec']:.3f}s"
        )
        print(report_line)
    except Exception as e:
        print(f"Error evaluating {model_name}: {e}")

snowflake-arctic-embed2:latest → Recall@5=0.500, MRR@5=1.000, Latency=0.162s
nomic-embed-text:latest → Recall@5=0.212, MRR@5=1.000, Latency=0.040s
mxbai-embed-large:latest → Recall@5=0.212, MRR@5=0.854, Latency=0.068s
snowflake-arctic-embed:latest → Recall@5=0.200, MRR@5=0.875, Latency=0.058s
embeddinggemma:latest → Recall@5=0.425, MRR@5=0.938, Latency=0.125s
