In [1]:
# import os

# # Point TMPDIR at /srv/data/tmp, because temporary files otherwise fill up the home directory.
# # Only enable this on the "Goober" machine; leave it commented out everywhere else.
# os.environ["TMPDIR"] = "/srv/data/tmp"
# os.makedirs("/srv/data/tmp", exist_ok=True)

In [2]:
# %pip install haystack-ai
# %pip install ragas-haystack
# %pip install nltk
# %pip install markdown-it-py
# %pip install mdit_plain
# %pip install openai
# %pip install pandas
# %pip install ragas-haystack
# %pip install sentence-transformers
# %pip install hf_xet
# %pip install ollama-haystack==2.4.2
# %pip install tqdm # For Progress Bar

In [3]:
import os
from datetime import datetime
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from pipelines.evaluation.rag_eval_pipeline import get_rag_evaluation_pipeline
from models import EmbeddingModelConfig, EmbeddingModelProvider, LLMConfig, LLMProvider
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.dataclasses import ChatMessage
from config.prompt import PROMPT_TEMPLATE

In [4]:
from config.secret import OPENAI_API_KEY

# Publish the runtime settings as environment variables in a single step:
# the OpenAI key taken from the local secrets module, the embedding model
# name, and the directory presumably used by sentence-transformers as its
# model cache (TODO confirm which components read these variables).
os.environ.update(
    {
        "OPENAI_API_KEY": OPENAI_API_KEY,
        "EMBEDDING_MODEL_NAME": "Linq-AI-Research/Linq-Embed-Mistral",
        "SENTENCE_TRANSFORMERS_HOME": "./model-assets/sentence-transformers",
    }
)

In [5]:
def _make_test_config(name, embedding_model=None, contextualizer_model=None, llm=None, retrieval_top_k=10):
    """Assemble one evaluation configuration dict.

    Args:
        name: Human-readable label of the configuration.
        embedding_model: Embedding model config, or None when the
            configuration has no embedding model.
        contextualizer_model: LLM config of the contextualizer, or None.
        llm: LLM config of the answering model.
        retrieval_top_k: Value stored under the "retrieval-top-k" key.

    Returns:
        A dict with the keys "name", "embedding_model", "reranking_model",
        "contextualizer_model", "rewriter_model", "llm" and "retrieval-top-k".
        Reranker and rewriter are always None.
    """
    return {
        "name": name,
        "embedding_model": embedding_model,
        "reranking_model": None,
        "contextualizer_model": contextualizer_model,
        "rewriter_model": None,
        "llm": llm,
        "retrieval-top-k": retrieval_top_k,
    }


# Four setups: {closed, open} source x {RAG, long context}. The long-context
# entries carry no embedding model and no contextualizer.
test_configs = [
    _make_test_config(
        "Closed Source RAG",
        embedding_model=EmbeddingModelConfig(name="text-embedding-3-large", provider=EmbeddingModelProvider.OPENAI),
        contextualizer_model=LLMConfig(name="gpt-4.1-mini", provider=LLMProvider.OPEN_AI),
        llm=LLMConfig(name="gpt-4.1", provider=LLMProvider.OPEN_AI),
    ),
    _make_test_config(
        "Closed Source Long Context",
        llm=LLMConfig(name="gpt-4.1", provider=LLMProvider.OPEN_AI),
    ),
    _make_test_config(
        "Open Source RAG",
        embedding_model=EmbeddingModelConfig(name="Qwen/Qwen3-Embedding-4B", provider=EmbeddingModelProvider.SENTENCE_TRANSFORMER),
        contextualizer_model=LLMConfig(name="gemma3:12b", provider=LLMProvider.OLLAMA),
        llm=LLMConfig(name="gemma3:27b", provider=LLMProvider.OLLAMA),
    ),
    _make_test_config(
        "Open Source Long Context",
        llm=LLMConfig(name="gemma3:27b", provider=LLMProvider.OLLAMA),
    ),
]

In [6]:
import uuid
from typing import List
from haystack.dataclasses import Document
from utils.markdown_utils import for_each_markdown_file

def get_full_documents() -> List["Document"]:
    """Load every Markdown file under ``data/md_files`` as one whole Document.

    Returns:
        A list of ``Document`` objects, one per file handed to the callback by
        ``for_each_markdown_file``. Each document holds the UTF-8 decoded file
        content and a freshly generated random UUID as its id, so ids differ
        between calls.
    """
    full_documents: List["Document"] = []

    # Callback signature is dictated by for_each_markdown_file; the parameter
    # names are kept as-is (even though `bytes` shadows the builtin) in case
    # the helper passes them by keyword.
    def add_to_docs_list(filename: str, bytes) -> None:
        file_content = bytes.decode("utf-8")
        document = Document(id=str(uuid.uuid4()), content=file_content)
        full_documents.append(document)

    for_each_markdown_file("data/md_files", add_to_docs_list)
    return full_documents

In [7]:


# Timestamp taken when this cell is executed; it names the results file.
now = datetime.now()
# Number of non-null questions sampled from the dataset for one evaluation run.
NUMBER_OF_QUESTIONS_IN_EVAL = 1

# Metric keys read from the evaluator result and stored as
# "<config name>_<metric>" columns.
_EVAL_METRICS = ("answer_correctness", "faithfulness", "context_precision")


def _extract_splitting_strategy(filename):
    """Return the text between ``answers_`` and ``_dataset`` in ``filename``, or None."""
    import re

    match = re.search(r"answers_(.*?)_dataset", filename)
    return match.group(1) if match else None


def _to_chat_history(prev_messages):
    """Convert a row's raw previous messages into ChatMessage objects."""
    # The first message becomes a user turn, every later one an assistant turn.
    # NOTE(review): histories longer than two messages would probably need
    # alternating roles -- confirm against the dataset.
    return [
        ChatMessage.from_user(msg) if idx == 0 else ChatMessage.from_assistant(msg)
        for idx, msg in enumerate(prev_messages)
    ]


def run_retrieval_eval(filename, df):
    """Evaluate every entry of ``test_configs`` on a random sample of questions.

    For each sampled row and each configuration an evaluation pipeline is
    built and run. The three metrics in ``_EVAL_METRICS`` are written back
    into ``df`` (in place) on the row of the sampled question, in columns
    named ``"<config name>_<metric>"``. Finally ``df`` is pickled to
    ``results/generation/<timestamp>.pkl`` using the module-level ``now``.

    Args:
        filename: Dataset file name. The part between ``answers_`` and
            ``_dataset`` selects the indexing store loaded for configurations
            that have an embedding model.
        df: DataFrame with at least the columns ``question``, ``groundTruth``,
            ``prev_messages`` and ``documents``. It is modified in place.

    Raises:
        ValueError: If a configuration with an embedding model is evaluated
            but no splitting strategy could be parsed from ``filename``.
        KeyError: If the evaluator result lacks one of the expected metrics.
    """
    splitting_strategy = _extract_splitting_strategy(filename)

    # Rows without a question cannot be evaluated.
    df_nonnull = df[df["question"].notnull()]

    # Keep the original index labels of the sampled rows: the scores are
    # written back with df.at[index, ...], so resetting the index here would
    # attach them to rows 0..n-1 instead of the sampled questions.
    # The sample size is capped so small datasets do not make sample() raise.
    sample_size = min(NUMBER_OF_QUESTIONS_IN_EVAL, len(df_nonnull))
    df_sampled = df_nonnull.sample(n=sample_size, random_state=42)
    full_documents = get_full_documents()

    for index, row in tqdm(
        df_sampled.iterrows(),
        total=len(df_sampled),
        desc="Processing rows",
        unit="row"
    ):
        question = row["question"]
        reference = row["groundTruth"]
        previous_messages = _to_chat_history(row["prev_messages"])

        for test_config in test_configs:
            # Fresh list per run so no pipeline can affect another one's prompt.
            template = (
                [ChatMessage.from_system(PROMPT_TEMPLATE)]
                + previous_messages
                + [ChatMessage.from_user(question)]
            )

            if test_config["embedding_model"] is None:
                # Long-context setup: no retrieval, all full documents go
                # straight into the prompt and serve as evaluation contexts.
                pipeline = get_rag_evaluation_pipeline(
                    base_indexing_store=None,
                    embedding_model_config=None,
                    reranking_model_config=None,
                    rewriting_model_config=None,
                    llm_config=test_config["llm"]
                )
                request_payload = {
                    "prompt_builder": {
                        "template": template,
                        "documents": full_documents,
                    },
                    "evaluator": {
                        "query": question,
                        "reference": reference,
                        "documents": [document.content for document in full_documents],
                        "reference_contexts": [document.content for document in full_documents],
                    }
                }
            else:
                if splitting_strategy is None:
                    raise ValueError(
                        f"Cannot derive the splitting strategy from filename {filename!r}; "
                        "expected a name containing 'answers_<strategy>_dataset'."
                    )
                store_path = (
                    f"data/document_stores/{test_config['embedding_model'].name}"
                    f"/context/{test_config['contextualizer_model'].name}"
                    f"/{splitting_strategy}_indexing_store.json"
                )
                base_indexing_store = InMemoryDocumentStore.load_from_disk(store_path)
                pipeline = get_rag_evaluation_pipeline(
                    base_indexing_store=base_indexing_store,
                    embedding_model_config=test_config["embedding_model"],
                    reranking_model_config=test_config["reranking_model"],
                    rewriting_model_config=test_config["rewriter_model"],
                    llm_config=test_config["llm"]
                )
                request_payload = {
                    "retriever": {
                        "top_k": test_config["retrieval-top-k"],
                    },
                    "prompt_builder": {
                        "template": template
                    },
                    "evaluator": {
                        "query": question,
                        "reference": reference,
                        "reference_contexts": [document.content for document in row["documents"]]
                    }
                }

                # The query enters the pipeline through the rewriter if the
                # graph has one, otherwise directly through the query embedder.
                if "rewriter" in pipeline.graph.nodes:
                    request_payload["rewriter"] = {
                        "query": question,
                        "previous_messages": previous_messages,
                    }
                else:
                    request_payload["query_embedder"] = {
                        "text": question,
                    }

            result = pipeline.run(request_payload)

            # Read all metrics first so a missing one raises before any
            # column of this row/config pair is written.
            evaluator_result = result.get("evaluator", {}).get("result", {})
            scores = {metric: evaluator_result[metric] for metric in _EVAL_METRICS}
            for metric, score in scores.items():
                df.at[index, f"{test_config['name']}_{metric}"] = score

    save_path = f"results/generation/{now.strftime('%Y-%m-%d_%H-%M-%S')}.pkl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df.to_pickle(save_path)

# Load the flattened QA dataset and evaluate it; the file name is stated once
# and reused for both the load path and the splitting-strategy lookup.
DATASET_FILENAME = "question_answers_docs_word_50_10_dataset_flat.pkl"
df = pd.read_pickle(f"data/qa_with_docs_flat/{DATASET_FILENAME}")
run_retrieval_eval(DATASET_FILENAME, df)

Processing Markdown files: 100%|██████████| 6/6 [00:00<00:00, 11618.57it/s]
Processing rows:   0%|          | 0/1 [00:00<?, ?row/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 1/1 [05:13<00:00, 313.79s/row]


In [8]:
# Reload the results of one specific, previously saved run. The path follows
# the results/generation/<timestamp>.pkl naming used by run_retrieval_eval.
# Note that this rebinds `df` to the loaded results frame.
df = pd.read_pickle("results/generation/2025-07-23_17-43-46.pkl")

In [9]:
# Display the results frame; pandas truncates the rendering to the first and last rows.
df

Unnamed: 0,question,references,groundTruth,source_file,variations_pretty,documents,variant,prev_messages,Closed Source RAG_answer_correctness,Closed Source RAG_faithfulness,Closed Source RAG_context_precision,Closed Source Long Context_answer_correctness,Closed Source Long Context_faithfulness,Closed Source Long Context_context_precision,Open Source RAG_answer_correctness,Open Source RAG_faithfulness,Open Source RAG_context_precision,Open Source Long Context_answer_correctness,Open Source Long Context_faithfulness,Open Source Long Context_context_precision
0,Wie läuft eine äußere Wendung genau ab und was...,[Eine äußere Wendung findet in der Regel in ei...,Eine äußere Wendung findet ambulant in einer K...,Äußere Wendung.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=6574586c437e0711c9ac7d01e25f91424...,default,[],0.412757,0.892857,0.715278,0.402086,1.0,1.0,0.486363,0.833333,0.75,0.239715,1.0,1.0
1,Und was passiert dann genau davor?,[Eine äußere Wendung findet in der Regel in ei...,Eine äußere Wendung findet ambulant in einer K...,Äußere Wendung.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=6574586c437e0711c9ac7d01e25f91424...,contextual,"[Ich habe von der äußeren Wendung gehört, könn...",,,,,,,,,,,,
2,Wie genau läuft so ne äußere Wendung ab und wa...,[Eine äußere Wendung findet in der Regel in ei...,Eine äußere Wendung findet ambulant in einer K...,Äußere Wendung.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=6574586c437e0711c9ac7d01e25f91424...,slang,[],,,,,,,,,,,,
3,How is an external version performed exactly a...,[Eine äußere Wendung findet in der Regel in ei...,Eine äußere Wendung findet ambulant in einer K...,Äußere Wendung.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=6574586c437e0711c9ac7d01e25f91424...,english,[],,,,,,,,,,,,
4,äußere Wendung Ablauf Vorbereitung,[Eine äußere Wendung findet in der Regel in ei...,Eine äußere Wendung findet ambulant in einer K...,Äußere Wendung.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=6574586c437e0711c9ac7d01e25f91424...,keyword,[],,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,Risiken Ballonkatheter?,[Die Einlage und das Befüllen der Ballons könn...,Bei der Anwendung eines Ballonkatheters kann d...,Einleitung der Geburt.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=b92aa18d221df9e8cbef44cd1db9e8a52...,short,[],,,,,,,,,,,,
606,"Ich möchte wissen, welche möglichen Komplikati...",[Die Einlage und das Befüllen der Ballons könn...,Bei der Anwendung eines Ballonkatheters kann d...,Einleitung der Geburt.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=b92aa18d221df9e8cbef44cd1db9e8a52...,long,[],,,,,,,,,,,,
607,Welche potenziellen Komplikationen und Risiken...,[Die Einlage und das Befüllen der Ballons könn...,Bei der Anwendung eines Ballonkatheters kann d...,Einleitung der Geburt.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=b92aa18d221df9e8cbef44cd1db9e8a52...,technical,[],,,,,,,,,,,,
608,Welche Risiken gibt es bei der Anwendung eines...,[Die Einlage und das Befüllen der Ballons könn...,Bei der Anwendung eines Ballonkatheters kann d...,Einleitung der Geburt.md,"""{\""contextual\"": [{\""role\"": \""user\"", \""mess...",[Document(id=b92aa18d221df9e8cbef44cd1db9e8a52...,mistake,[],,,,,,,,,,,,
