# Retrieval Evaluation

In [1]:
# %pip install "haystack-ai>=2.15.0rc1"
# %pip install ragas-haystack
# %pip install nltk
# %pip install openai
# %pip install pandas
# %pip install ragas-haystack
# %pip install "sentence-transformers>=3.0.0"
# %pip install hf_xet
# %pip install "ollama-haystack==2.4.2"
# %pip install tqdm # For Progress Bar
# %pip install einops

In [4]:
import os

# Point the model caches at the project-local asset folders *before* importing
# haystack or the project pipeline modules. Libraries that resolve their cache
# directory at import time would otherwise ignore these settings and fall back
# to the user-level default cache.
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./model-assets/sentence-transformers"
os.environ["HF_HUB_CACHE"] = "./model-assets/hugging-face"

import importlib
import logging
from datetime import datetime

from haystack.document_stores.in_memory import InMemoryDocumentStore
from tqdm import tqdm

from models import EmbeddingModelConfig, EmbeddingModelProvider, RerankingModelConfig, RerankingModelProvider, LLMConfig, LLMProvider
from pipelines.evaluation.base_retrieval_eval_pipeline import get_base_retrieval_eval_pipeline
from utils.pickle_utils import for_each_pickle_file

# Register the tqdm-aware `progress_apply` helpers on pandas objects.
tqdm.pandas()

# Keep the notebook output quiet: only warnings and above, with a short timestamp.
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S'
)
logging.getLogger("haystack").setLevel(logging.WARNING)


In [1]:
# Cut-offs to evaluate: run_retrieval_eval repeats the whole evaluation once per
# value and passes it as `top_k` to the retriever (and to the reranker, if present).
TOP_K_VALUES = [5, 10, 20]
# How many rows with a non-null question are sampled from each QA file.
NUMBER_OF_QUESTIONS_IN_EVAL = 2

In [5]:
# One configuration per contextualizer LLM under test. The embedding and
# reranking models are the same for every entry, so only the contextualizer
# varies; each entry still gets its own config instances.
test_configs = [
    {
        "embedding_model": EmbeddingModelConfig(name="Qwen/Qwen3-Embedding-4B", provider=EmbeddingModelProvider.SENTENCE_TRANSFORMER),
        "reranking_model": RerankingModelConfig(name="Qwen/Qwen3-Reranker-0.6B", provider=RerankingModelProvider.HUGGING_FACE),
        "contextualizer_model": contextualizer_model,
    }
    for contextualizer_model in (
        LLMConfig(name="gemma3:12b", provider=LLMProvider.OLLAMA),
        LLMConfig(name="gpt-4.1-mini", provider=LLMProvider.OPEN_AI),
    )
]

In [6]:
# Captured once when the cell runs, so every result file written by
# run_retrieval_eval during this run lands under the same timestamped directory.
now = datetime.now()

def _load_eval_pipeline(test_config, splitting_strategy):
    """Build the retrieval evaluation pipeline for one entry of ``test_configs``.

    Loads the persisted in-memory document store whose path is derived from the
    entry's embedding model name, contextualizer model name and the given
    splitting strategy, and passes it to ``get_base_retrieval_eval_pipeline``
    together with the entry's embedding and reranking model configs.
    """
    store_path = (
        f"data/document_stores/{test_config['embedding_model'].name}"
        f"/context/{test_config['contextualizer_model'].name}"
        f"/{splitting_strategy}_indexing_store.json"
    )
    base_indexing_store = InMemoryDocumentStore.load_from_disk(store_path)
    return get_base_retrieval_eval_pipeline(
        base_indexing_store=base_indexing_store,
        embedding_model_config=test_config["embedding_model"],
        reranking_model_config=test_config["reranking_model"],
    )


def _build_request_payload(pipeline, question, relevant_documents, top_k):
    """Assemble the ``pipeline.run`` inputs for a single question and cut-off."""
    payload = {
        "retriever": {
            "top_k": top_k,
        },
        "map_evaluator": {
            "ground_truth_documents": [relevant_documents],
        },
        "mrr_evaluator": {
            "ground_truth_documents": [relevant_documents],
        },
        "recall_evaluator": {
            "ground_truth_documents": [relevant_documents],
        },
    }
    # The query embedder and the reranker are only fed when the pipeline
    # actually contains components with those names.
    if "query_embedder" in pipeline.graph.nodes:
        payload["query_embedder"] = {
            "text": question,
        }
    if "reranker" in pipeline.graph.nodes:
        payload["reranker"] = {
            "query": question,
            "top_k": top_k,
        }
    return payload


def run_retrieval_eval(filename, df):
    """Evaluate retrieval quality for a sample of questions from one QA file.

    For every value in ``TOP_K_VALUES`` and every entry in ``test_configs`` the
    sampled questions are run through the retrieval evaluation pipeline. The
    MAP, MRR and recall scores are written into ``df`` (in place) on the row the
    question came from, and ``df`` is pickled once per ``top_k`` value.

    Args:
        filename: Name of the QA file. The splitting strategy is the text
            between ``answers_`` and ``_dataset`` in this name.
        df: DataFrame with at least a ``question`` and a ``documents`` column.
            Rows with a null question are never sampled.

    Raises:
        ValueError: If the splitting strategy cannot be parsed from
            ``filename`` or ``test_configs`` is empty.
    """
    import re

    # Re-import the prompt configuration so edits made while the kernel is
    # running are picked up.
    import config.prompt
    importlib.reload(config.prompt)

    match = re.search(r"answers_(.*?)_dataset", filename)
    if match is None:
        # Without a strategy the document-store path would contain "None";
        # fail with a message that names the real problem instead.
        raise ValueError(
            f"Cannot derive the splitting strategy from filename {filename!r}; "
            "expected it to contain 'answers_<strategy>_dataset'."
        )
    splitting_strategy = match.group(1)

    if not test_configs:
        raise ValueError("test_configs is empty; there is nothing to evaluate.")

    # Only rows that actually carry a question can be evaluated.
    df_nonnull = df[df["question"].notnull()]

    # Never ask for more rows than exist: DataFrame.sample raises in that case.
    sample_size = min(NUMBER_OF_QUESTIONS_IN_EVAL, len(df_nonnull))
    if sample_size < NUMBER_OF_QUESTIONS_IN_EVAL:
        logging.warning(
            "%s: only %d non-null questions available (requested %d)",
            filename, sample_size, NUMBER_OF_QUESTIONS_IN_EVAL,
        )
    # The original index is kept on purpose: it is the label used below to write
    # each score back onto the matching row of `df`.
    # NOTE(review): assumes the index labels of `df` are unique - confirm.
    df_sample = df_nonnull.sample(n=sample_size, random_state=42)

    # Loading a document store and building a pipeline is independent of the
    # row and of top_k, so do it once per configuration instead of once per
    # row. This keeps one pipeline per configuration alive for the duration of
    # the call.
    # NOTE(review): the column prefix is the string form of the whole
    # contextualizer config object, not its `.name` - confirm this is intended.
    evaluation_targets = [
        (f"{test_config['contextualizer_model']}", _load_eval_pipeline(test_config, splitting_strategy))
        for test_config in test_configs
    ]

    # All configurations share one result file per top_k; it is filed under the
    # embedding model of the last configuration.
    embedding_model_name = test_configs[-1]["embedding_model"].name

    for top_k in TOP_K_VALUES:
        for index, row in tqdm(
            df_sample.iterrows(),
            total=len(df_sample),
            desc="Processing rows",
            unit="row"
        ):
            relevant_documents = row["documents"]
            question = row["question"]

            for column_prefix, pipeline in evaluation_targets:
                result = pipeline.run(
                    _build_request_payload(pipeline, question, relevant_documents, top_k)
                )
                for metric in ("map", "mrr", "recall"):
                    # A missing evaluator output is stored as NaN so the metric
                    # columns stay numeric.
                    score = result.get(f"{metric}_evaluator", {}).get("score", float("nan"))
                    df.at[index, f"{column_prefix}_{metric}"] = score

        save_path = f"results/retrieval/context_model_eval/{now.strftime('%Y-%m-%d_%H-%M-%S')}/{embedding_model_name}/{splitting_strategy}/topk_{top_k}.pkl"
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        df.to_pickle(save_path)

# Run the retrieval evaluation over the QA pickle files in data/qa_with_docs_flat.
# NOTE(review): presumably for_each_pickle_file invokes the callback as
# (filename, df) once per pickle file - confirm in utils.pickle_utils.
for_each_pickle_file("data/qa_with_docs_flat", run_retrieval_eval)

Processing Pickle files:   0%|          | 0/6 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/2 [00:42<?, ?row/s]
Processing Pickle files:   0%|          | 0/6 [00:42<?, ?it/s]


KeyboardInterrupt: 