# Experiment Setup


In [1]:
import json
import sys
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import pandas as pd

# Locate the project root: the current working directory, or its parent when
# the notebook is launched from inside a ``notebooks`` folder.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd
# Make the project root importable without adding a duplicate sys.path entry.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

In [2]:
import logging
try:
    from src.utils import get_logger as _get_logger  
except ImportError:  
    _get_logger = None
from src.rag_pipeline import rag
from src.embedder import generate_query_embedding
from src.metrics import (
    precision_recall_k,
    semantic_precision_recall_k,
    grounding_score,
    estimate_tokens,
)
from src.agents.self_reflective_rag import self_reflect_rag
from src.agents.query_decomposition_rag import query_decomposition_rag

# Prefer the project's logger; fall back to the standard library so the
# notebook still runs when ``src.utils`` cannot be imported or configured.
try:
    from src.utils import get_logger
    logger = get_logger(__name__)
except Exception as _logger_exc:  # deliberate best-effort: any failure -> stdlib logging
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    # Say why the fallback was taken instead of swallowing the error silently.
    logger.warning('src.utils.get_logger unavailable (%r); using stdlib logging', _logger_exc)

# The runner functions in this notebook call ``logger.success(...)``. A stdlib
# ``logging.Logger`` has no such method, so alias it to ``info`` whenever the
# active logger does not provide one; otherwise the fallback path would raise
# AttributeError on the first runner call.
if not hasattr(logger, 'success'):
    logger.success = logger.info

# --- Filesystem layout (all relative to PROJECT_ROOT) ------------------------
DATA_DIR = PROJECT_ROOT / 'data' / 'processed'
EMBEDDINGS_DIR = PROJECT_ROOT / 'embeddings'
RESULTS_DIR = PROJECT_ROOT / 'results'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

chunks_path = DATA_DIR / 'chunks.json'
embeddings_path = EMBEDDINGS_DIR / 'embeddings.npy'
index_path = EMBEDDINGS_DIR / 'embedding_index.json'

# Seed and width used only for the random fallback embeddings below, so that a
# run without ``embeddings.npy`` is at least reproducible.
SEED = 42
# NOTE(review): 384 may not match the dimensionality produced by the configured
# embedding model -- confirm before relying on the fallback for retrieval.
FALLBACK_EMBED_DIM = 384

# --- Corpus chunks -----------------------------------------------------------
if chunks_path.exists():
    CHUNKS = json.loads(chunks_path.read_text(encoding='utf-8'))
else:
    logger.warning('chunks.json not found; using fallback chunks')
    CHUNKS = [
        {'chunk_id': 'c1', 'text': 'Sprint acceleration depends on shin angles and hip drive.'},
        {'chunk_id': 'c2', 'text': 'Defensive pressing intensifies when fatigue sets in.'},
        {'chunk_id': 'c3', 'text': 'Heel strike and midfoot strike alter loading rates differently.'},
    ]

# --- Chunk embeddings --------------------------------------------------------
if embeddings_path.exists():
    EMBEDDINGS = np.load(embeddings_path)
else:
    logger.warning('embeddings.npy not found; generating random fallback embeddings')
    # Seeded generator instead of the global, unseeded ``np.random.rand`` so
    # repeated runs produce the same placeholder matrix.
    EMBEDDINGS = np.random.default_rng(SEED).random((len(CHUNKS), FALLBACK_EMBED_DIM))

# Chunks and embeddings are loaded independently, so one may come from disk
# while the other is a fallback; surface the mismatch rather than let it show
# up later as confusing retrieval results.
if getattr(EMBEDDINGS, 'ndim', 0) >= 1 and EMBEDDINGS.shape[0] != len(CHUNKS):
    logger.warning(
        'embedding rows (%d) do not match chunk count (%d)',
        EMBEDDINGS.shape[0],
        len(CHUNKS),
    )

# --- chunk_id -> embedding row index -----------------------------------------
if index_path.exists():
    INDEX_MAP = json.loads(index_path.read_text(encoding='utf-8'))
else:
    logger.warning('embedding_index.json not found; creating sequential index map')
    INDEX_MAP = {chunk['chunk_id']: idx for idx, chunk in enumerate(CHUNKS)}

# --- Experiment parameters ---------------------------------------------------
PROVIDER = 'openai'
LLM_MODEL = None    # passed through to the pipelines unchanged
EMBED_MODEL = None  # passed through to the pipelines unchanged
TOP_K = 5
THRESHOLD = 0.5
TEST_QUESTIONS = [
    "Which sport has teams of 6 players: football or volleyball?",
    "At maximal velocity what is the approximate stride length of elite sprinters?",
    "What tactical innovation is credited to Hungary's Golden Team in the 1950s?",
]

# Lookup used to resolve chunk ids back to full chunk dicts.
CHUNK_LOOKUP = {chunk['chunk_id']: chunk for chunk in CHUNKS}


# Running Different Structures


In [3]:

from typing import Callable


def _make_result(answer: str, chunks: List[Dict[str, Any]], time_ms: float, tokens: float, metadata: Dict[str, Any] | None = None) -> Dict[str, Any]:
    chunk_list = []
    seen: set[str] = set()
    for chunk in chunks or []:
        chunk_id = str(chunk.get('chunk_id', '')).strip()
        if chunk_id and chunk_id not in seen:
            seen.add(chunk_id)
            chunk_list.append(chunk)
    return {
        'answer': answer,
        'chunks': chunk_list,
        'time_ms': float(time_ms),
        'tokens': float(tokens),
        'metadata': metadata or {},
    }


def _chunks_from_ids(chunk_ids: List[str]) -> List[Dict[str, Any]]:
    """Resolve chunk ids to chunk dicts via the module-level ``CHUNK_LOOKUP``.

    Ids that are missing from the lookup (or map to a falsy value) are
    skipped; input order is preserved.
    """
    candidates = (CHUNK_LOOKUP.get(identifier) for identifier in chunk_ids)
    return [found for found in candidates if found]


def run_vanilla_rag(question: str) -> Dict[str, Any]:
    """Run ``rag`` with the notebook's default settings for one question.

    Returns the record built by ``_make_result`` from the response's
    ``answer``, ``chunks`` and ``time_ms`` plus a token estimate of the answer.
    """
    divider = "-" * 50
    logger.success(divider)
    logger.success("Running vanilla RAG for question: %s", question)
    logger.success(divider)

    rag_kwargs = dict(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        llm_model=LLM_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
    )
    response = rag(**rag_kwargs)

    answer_text = response.get('answer', '')
    retrieved = response.get('chunks', [])
    elapsed_ms = response.get('time_ms', 0.0)
    return _make_result(answer_text, retrieved, elapsed_ms, estimate_tokens(answer_text))


def run_self_reflective_rag(question: str) -> Dict[str, Any]:
    """Run the self-reflective RAG agent for one question.

    The query embedding is generated here and handed to ``self_reflect_rag``.
    The reported time is ``timings['total_ms']`` when available, otherwise the
    sum of the numeric entries in ``timings``.

    Returns:
        The ``_make_result`` record for the refined answer, with the initial
        answer stored under ``metadata['initial_answer']``.
    """
    logger.success("-" * 50)
    logger.success("Running self-reflective RAG for question: %s", question)
    logger.success("-" * 50)
    # NOTE(review): this embedding call happens outside self_reflect_rag, so its
    # latency is presumably not part of the reported timings -- confirm when
    # comparing against pipelines that embed internally.
    query_embedding = generate_query_embedding(question, provider=PROVIDER, model=EMBED_MODEL)
    result = self_reflect_rag(
        query=question,
        chunks=CHUNKS,
        query_embedding=query_embedding,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        embedding_provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        provider=PROVIDER,
        llm_model=LLM_MODEL,
        temperature=1.0,
        k=TOP_K,
        threshold=THRESHOLD,
    )
    # ``or {}`` also covers an explicit ``timings: None`` in the response.
    timings = result.get('timings') or {}
    time_ms = timings.get('total_ms')
    if time_ms is None:
        # Compute the fallback only when needed, and only over numeric entries:
        # the previous eager ``sum(timings.values())`` ran even when total_ms
        # was present and raised on any non-numeric value.
        time_ms = sum(value for value in timings.values() if isinstance(value, (int, float)))
    refined_answer = result.get('refined_answer', '')
    return _make_result(
        refined_answer,
        result.get('retrieved_chunks', []),
        time_ms,
        estimate_tokens(refined_answer),
        metadata={'initial_answer': result.get('initial_answer', '')},
    )


def run_query_decomposition_rag(question: str) -> Dict[str, Any]:
    """Run the query-decomposition RAG agent for one question.

    Chunks are gathered from every entry of ``sub_answers`` (their
    ``retrieved_chunks``), in order. Time is ``timings['total_ms']`` when
    present, else the response's ``time_ms``, else ``0.0``. The response's
    ``plan`` is stored under ``metadata['plan']``.
    """
    divider = "-" * 50
    logger.success(divider)
    logger.success("Running query decomposition RAG for question: %s", question)
    logger.success(divider)

    response = query_decomposition_rag(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        llm_model=LLM_MODEL,
        embedding_provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
    )

    final_answer = response.get('final_answer', '')
    # Flatten the chunks retrieved for each sub-question into a single list.
    gathered: List[Dict[str, Any]] = [
        chunk
        for sub_answer in response.get('sub_answers', [])
        for chunk in sub_answer.get('retrieved_chunks', [])
    ]
    fallback_ms = response.get('time_ms', 0.0)
    elapsed_ms = response.get('timings', {}).get('total_ms', fallback_ms)
    extra = {'plan': response.get('plan', {})}
    return _make_result(final_answer, gathered, elapsed_ms, estimate_tokens(final_answer), metadata=extra)


def run_chain_verification_rag(question: str) -> Dict[str, Any]:
    """Run ``rag`` with chain-of-verification enabled for one question.

    Uses 2 verification iterations and 3 verification statements. The
    response's ``verification`` entry is stored under
    ``metadata['verification']``.
    """
    divider = "-" * 50
    logger.success(divider)
    logger.success("Running chain of verification RAG for question: %s", question)
    logger.success(divider)

    rag_kwargs = dict(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        llm_model=LLM_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
        use_chain_of_verification=True,
        verification_iterations=2,
        verification_statements=3,
    )
    response = rag(**rag_kwargs)

    answer_text = response.get('answer', '')
    retrieved = response.get('chunks', [])
    elapsed_ms = response.get('time_ms', 0.0)
    token_estimate = estimate_tokens(answer_text)
    extra = {'verification': response.get('verification')}
    return _make_result(answer_text, retrieved, elapsed_ms, token_estimate, metadata=extra)


def run_active_retrieval_rag(question: str) -> Dict[str, Any]:
    """Run ``rag`` with active retrieval enabled for one question.

    Uses 3 active iterations and a sufficiency threshold of 0.8. The
    response's ``active_retrieval`` entry is stored under
    ``metadata['active']``.
    """
    divider = "-" * 50
    logger.success(divider)
    logger.success("Running active retrieval RAG for question: %s", question)
    logger.success(divider)

    rag_kwargs = dict(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        llm_model=LLM_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
        use_active_retrieval=True,
        active_iterations=3,
        active_sufficiency_threshold=0.8,
    )
    response = rag(**rag_kwargs)

    answer_text = response.get('answer', '')
    retrieved = response.get('chunks', [])
    elapsed_ms = response.get('time_ms', 0.0)
    token_estimate = estimate_tokens(answer_text)
    extra = {'active': response.get('active_retrieval')}
    return _make_result(answer_text, retrieved, elapsed_ms, token_estimate, metadata=extra)


def run_marag(question: str) -> Dict[str, Any]:
    """Run ``rag`` with ``use_marag=True`` (2 iterations) for one question.

    The response's ``plan`` entry is stored under ``metadata['plan']``.
    """
    divider = "-" * 50
    logger.success(divider)
    logger.success("Running marag RAG for question: %s", question)
    logger.success(divider)

    rag_kwargs = dict(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        llm_model=LLM_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
        use_marag=True,
        marag_iterations=2,
    )
    response = rag(**rag_kwargs)

    answer_text = response.get('answer', '')
    retrieved = response.get('chunks', [])
    elapsed_ms = response.get('time_ms', 0.0)
    token_estimate = estimate_tokens(answer_text)
    extra = {'plan': response.get('plan')}
    return _make_result(answer_text, retrieved, elapsed_ms, token_estimate, metadata=extra)


def run_madam_rag(question: str) -> Dict[str, Any]:
    """Run ``rag`` with ``use_madam_rag=True`` (1 follow-up round) for one question.

    The chunk list is rebuilt from the ``chunk_ids`` listed in each debater
    response (resolved through ``_chunks_from_ids``) rather than taken from
    the response's own chunk list. ``winner`` and ``reasoning`` from the
    response are stored in the metadata.
    """
    divider = "-" * 50
    logger.success(divider)
    logger.success("Running madam RAG for question: %s", question)
    logger.success(divider)

    rag_kwargs = dict(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        llm_model=LLM_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
        use_madam_rag=True,
        madam_followup_rounds=1,
    )
    response = rag(**rag_kwargs)

    answer_text = response.get('answer', '')
    # Walk debaters -> responses -> cited ids, keeping encounter order.
    cited: List[Dict[str, Any]] = [
        chunk
        for debater in response.get('debaters', [])
        for reply in debater.get('responses', [])
        for chunk in _chunks_from_ids(reply.get('chunk_ids', []))
    ]
    elapsed_ms = response.get('time_ms', 0.0)
    token_estimate = estimate_tokens(answer_text)
    extra = {'winner': response.get('winner'), 'reasoning': response.get('reasoning')}
    return _make_result(answer_text, cited, elapsed_ms, token_estimate, metadata=extra)


def run_routing_rag(question: str) -> Dict[str, Any]:
    """Run ``rag`` with ``use_routing_agent=True`` for one question.

    Chunks come from the nested ``result`` payload (``chunks``, falling back
    to ``retrieved_chunks`` when that is missing or empty). Time is the nested
    payload's ``time_ms`` when present, else the top-level ``time_ms``, else
    ``0.0``. ``pipeline``, ``profile`` and ``plan`` from the top-level response
    are stored in the metadata.
    """
    divider = "-" * 50
    logger.success(divider)
    logger.success("Running routing RAG for question: %s", question)
    logger.success(divider)

    rag_kwargs = dict(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        llm_model=LLM_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
        use_routing_agent=True,
    )
    response = rag(**rag_kwargs)

    routed = response.get('result', {})
    primary_chunks = routed.get('chunks')
    chunks_used = primary_chunks if primary_chunks else routed.get('retrieved_chunks', [])
    # NOTE(review): the nested payload's time is preferred over the top-level
    # one; presumably this excludes the routing decision itself -- confirm this
    # is the intended figure for the benchmark.
    outer_ms = response.get('time_ms', 0.0)
    elapsed_ms = routed.get('time_ms', outer_ms)
    answer_text = response.get('answer', '')
    extra = {field: response.get(field) for field in ('pipeline', 'profile', 'plan')}
    return _make_result(answer_text, chunks_used, elapsed_ms, estimate_tokens(answer_text), metadata=extra)


# Registry of architecture name -> runner. Insertion order is the order in
# which the benchmark loop executes the architectures.
ARCHITECTURE_RUNNERS: Dict[str, Callable[[str], Dict[str, Any]]] = dict(
    vanilla=run_vanilla_rag,
    self_reflective=run_self_reflective_rag,
    query_decomposition=run_query_decomposition_rag,
    chain_of_verification=run_chain_verification_rag,
    active_retrieval=run_active_retrieval_rag,
    marag=run_marag,
    madam_rag=run_madam_rag,
    routing=run_routing_rag,
)


In [4]:

# Benchmark every architecture on every test question, collecting one flat
# record per (question, architecture) pair; ``df`` holds the final table.
records: List[Dict[str, Any]] = []
for question in TEST_QUESTIONS:
    print(f"=== {question} ===")
    for architecture, runner in ARCHITECTURE_RUNNERS.items():
        result = runner(question)
        chunks_out = result['chunks']
        answer_text = result['answer']
        # Keyword-based precision/recall of the retrieved chunks at TOP_K.
        kp, kr = precision_recall_k(query=question, retrieved_chunks=chunks_out, all_chunks=CHUNKS, k=TOP_K)
        # The semantic metric is computed against an embedding of the answer.
        answer_embedding = generate_query_embedding(answer_text, provider=PROVIDER, model=EMBED_MODEL)
        sp, sr = semantic_precision_recall_k(
            answer_embedding=answer_embedding,
            retrieved_chunks=chunks_out,
            all_chunks=CHUNKS,
            embeddings=EMBEDDINGS,
            index_map=INDEX_MAP,
            k=TOP_K,
        )
        record = {
            'question': question,
            'architecture': architecture,
            'time_ms': result['time_ms'],
            'tokens': result['tokens'],
            'keyword_precision_k': kp,
            'keyword_recall_k': kr,
            'semantic_precision_k': sp,
            'semantic_recall_k': sr,
            'grounding_score': grounding_score(answer_text, chunks_out),
        }
        if architecture == 'routing':
            routing_meta = result['metadata']
            record['routing_pipeline'] = routing_meta.get('pipeline')
            # run_routing_rag always sets the 'profile' key (possibly to None),
            # so a ``.get('profile', {})`` default never applies; normalise
            # None explicitly to avoid AttributeError aborting the whole run.
            profile = routing_meta.get('profile') or {}
            record['routing_profile'] = profile.get('name') if hasattr(profile, 'get') else profile
        records.append(record)
        print(f"  -> {architecture} | time={record['time_ms']:.1f} ms tokens={record['tokens']:.1f}")
    print("")

df = pd.DataFrame(records)


[32m[2025-12-09 16:12:45][SUCCESS][__main__] --------------------------------------------------[0m
[32m[2025-12-09 16:12:45][SUCCESS][__main__] Running vanilla RAG for question: Which sport has teams of 6 players: football or volleyball?[0m
[32m[2025-12-09 16:12:45][SUCCESS][__main__] --------------------------------------------------[0m
[37m[2025-12-09 16:12:45][INFO][src.rag_pipeline] RAG | start | query_len=59 chunks=162[0m
[37m[2025-12-09 16:12:45][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m


=== Which sport has teams of 6 players: football or volleyball? ===


[37m[2025-12-09 16:12:46][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:12:46][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=1296.55 ms[0m
[37m[2025-12-09 16:12:46][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:12:46][INFO][src.rag_pipeline] RAG | retrieve | retrieved=5 time=30.74 ms threshold=0.50[0m
[37m[2025-12-09 16:12:46][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=13413[0m
[37m[2025-12-09 16:12:46][INFO][src.llm_orchestrator] LLM | prompt_len=13620 approx_tokens=1960[0m
[37m[2025-12-09 16:12:46][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: Which sport has teams of 6 players: football or volleyball?...[0m
[37m[2025-12-09 16:12:50][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:12:50

  -> vanilla | time=5083.1 ms tokens=1.0


[37m[2025-12-09 16:12:51][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:12:51][INFO][src.agents.self_reflective_rag] REFLECT | start | query_len=59 chunks=162[0m
[37m[2025-12-09 16:12:51][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:12:51][INFO][src.agents.self_reflective_rag] REFLECT | retrieval | retrieved=5 time=38.89 ms threshold=0.50[0m
[37m[2025-12-09 16:12:51][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=13413[0m
[37m[2025-12-09 16:12:51][INFO][src.llm_orchestrator] LLM | prompt_len=13620 approx_tokens=1960[0m
[37m[2025-12-09 16:12:51][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: Which sport has teams of 6 players: football or volleyball?...[0m
[37m[2025-12-09 16:12:55][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:12:55][IN

  -> self_reflective | time=45649.6 ms tokens=229.0


[37m[2025-12-09 16:13:37][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:13:38][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:13:38][INFO][src.reranker] RERANK | start | chunks=5 model=cross-encoder/ms-marco-MiniLM-L-6-v2 top_k=5[0m
  from .autonotebook import tqdm as notebook_tqdm
[37m[2025-12-09 16:13:53][INFO][src.reranker] RERANK | loading cross-encoder model=cross-encoder/ms-marco-MiniLM-L-6-v2 max_length=512[0m
[37m[2025-12-09 16:13:55][INFO][sentence_transformers.cross_encoder.CrossEncoder] Use pytorch device: cpu[0m
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.07it/s]
[37m[2025-12-09 16:13:56][INFO][src.reranker] RERANK | completed | returned=5 time=330.20 ms score_range=(-9.5807, -4.2723)[0m
[37m[2025-12-09 16:13:56][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=13587[0m
[37m[2025-12-09 16:13:56][INFO][src.llm_orchest

  -> query_decomposition | time=40837.3 ms tokens=32.0


[37m[2025-12-09 16:14:19][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:14:19][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=605.26 ms[0m
[37m[2025-12-09 16:14:19][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:14:19][INFO][src.rag_pipeline] RAG | retrieve | retrieved=5 time=36.23 ms threshold=0.50[0m
[37m[2025-12-09 16:14:19][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=13413[0m
[37m[2025-12-09 16:14:19][INFO][src.llm_orchestrator] LLM | prompt_len=13620 approx_tokens=1960[0m
[37m[2025-12-09 16:14:19][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: Which sport has teams of 6 players: football or volleyball?...[0m
[37m[2025-12-09 16:14:22][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:14:22]

  -> chain_of_verification | time=138493.3 ms tokens=76.0


[37m[2025-12-09 16:16:37][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:16:37][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=301.44 ms[0m
[37m[2025-12-09 16:16:37][INFO][src.agents.active_retrieval] ACTIVE_RETRIEVAL | start | query_len=59[0m
[37m[2025-12-09 16:16:38][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:16:38][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=13413[0m
[37m[2025-12-09 16:16:38][INFO][src.llm_orchestrator] LLM | prompt_len=13620 approx_tokens=1960[0m
[37m[2025-12-09 16:16:38][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: Which sport has teams of 6 players: football or volleyball?...[0m
[37m[2025-12-09 16:16:41][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:16:41][INFO][

  -> active_retrieval | time=16393.2 ms tokens=1.0


[37m[2025-12-09 16:16:54][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:16:54][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:16:54][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=13413[0m
[37m[2025-12-09 16:16:54][INFO][src.llm_orchestrator] LLM | prompt_len=13620 approx_tokens=1960[0m
[37m[2025-12-09 16:16:54][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: Which sport has teams of 6 players: football or volleyball?...[0m
[37m[2025-12-09 16:16:58][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:16:58][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 3161.22 ms[0m
[34m[2025-12-09 16:16:58][LLM][src.llm_orchestrator] LLM | response preview: Volleyball.[0m
[37m[2025-12-09 16:16:58][INFO][src.llm_orchestrator] LLM | answer size a

  -> marag | time=35978.7 ms tokens=19.0


[37m[2025-12-09 16:17:31][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:17:31][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:17:31][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=13413[0m
[37m[2025-12-09 16:17:31][INFO][src.llm_orchestrator] LLM | prompt_len=13761 approx_tokens=1979[0m
[37m[2025-12-09 16:17:31][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: You are Debater A. Answer the user question referencing chunk_id values in brackets, e.g., [chunk_id=c1]. Keep reasoning concise.

Question:
Which sport has teams of 6 players: football or volleyball?...[0m
[37m[2025-12-09 16:17:35][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:17:35][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 4296.02 ms[0m
[34m[2025-12-09 16:17:35][

  -> madam_rag | time=30887.6 ms tokens=5.0


[37m[2025-12-09 16:18:08][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:18:08][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 6335.48 ms[0m
[34m[2025-12-09 16:18:08][LLM][src.llm_orchestrator] LLM | response preview: {"embedding_profile":"No data in the documents.","pipeline":"vanilla","iterations":1,"followup_rounds":0,"reasoning":"No data in the documents."}[0m
[37m[2025-12-09 16:18:08][INFO][src.llm_orchestrator] LLM | answer size approx_tokens=9[0m
[37m[2025-12-09 16:18:08][INFO][src.agents.routing_rag] ROUTING_RAG | decision | pipeline=vanilla profile=balanced_openai iterations=1[0m
[37m[2025-12-09 16:18:08][INFO][src.rag_pipeline] RAG | start | query_len=59 chunks=162[0m
[37m[2025-12-09 16:18:08][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[37m[2025-12-09 16:18:08][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings 

  -> routing | time=4528.5 ms tokens=1.0

=== At maximal velocity what is the approximate stride length of elite sprinters? ===


[37m[2025-12-09 16:18:13][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:18:13][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=394.60 ms[0m
[37m[2025-12-09 16:18:13][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:18:13][INFO][src.rag_pipeline] RAG | retrieve | retrieved=5 time=56.70 ms threshold=0.50[0m
[37m[2025-12-09 16:18:13][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=12707[0m
[37m[2025-12-09 16:18:13][INFO][src.llm_orchestrator] LLM | prompt_len=12932 approx_tokens=1917[0m
[37m[2025-12-09 16:18:13][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: At maximal velocity what is the approximate stride length of elite sprinters?...[0m
[37m[2025-12-09 16:18:18][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[20

  -> vanilla | time=4672.5 ms tokens=16.0


[37m[2025-12-09 16:18:18][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:18:18][INFO][src.agents.self_reflective_rag] REFLECT | start | query_len=77 chunks=162[0m
[37m[2025-12-09 16:18:18][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:18:18][INFO][src.agents.self_reflective_rag] REFLECT | retrieval | retrieved=5 time=32.03 ms threshold=0.50[0m
[37m[2025-12-09 16:18:18][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=12707[0m
[37m[2025-12-09 16:18:18][INFO][src.llm_orchestrator] LLM | prompt_len=12932 approx_tokens=1917[0m
[37m[2025-12-09 16:18:18][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: At maximal velocity what is the approximate stride length of elite sprinters?...[0m
[37m[2025-12-09 16:18:25][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-

  -> self_reflective | time=25040.1 ms tokens=41.0


[37m[2025-12-09 16:18:44][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:18:44][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:18:44][INFO][src.reranker] RERANK | start | chunks=5 model=cross-encoder/ms-marco-MiniLM-L-6-v2 top_k=5[0m
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
[37m[2025-12-09 16:18:45][INFO][src.reranker] RERANK | completed | returned=5 time=269.82 ms score_range=(-10.7273, -1.7751)[0m
[37m[2025-12-09 16:18:45][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=11954[0m
[37m[2025-12-09 16:18:45][INFO][src.llm_orchestrator] LLM | prompt_len=12140 approx_tokens=1783[0m
[37m[2025-12-09 16:18:45][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: At maximal velocity what is the approx...[0m
[37m[2025-12-09 16:18:49][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "H

  -> query_decomposition | time=16670.8 ms tokens=29.0


[37m[2025-12-09 16:19:01][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:19:01][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=260.75 ms[0m
[37m[2025-12-09 16:19:01][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:19:01][INFO][src.rag_pipeline] RAG | retrieve | retrieved=5 time=34.35 ms threshold=0.50[0m
[37m[2025-12-09 16:19:01][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=12707[0m
[37m[2025-12-09 16:19:01][INFO][src.llm_orchestrator] LLM | prompt_len=12932 approx_tokens=1917[0m
[37m[2025-12-09 16:19:01][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: At maximal velocity what is the approximate stride length of elite sprinters?...[0m
[37m[2025-12-09 16:19:08][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[20

  -> chain_of_verification | time=41878.8 ms tokens=57.0


[37m[2025-12-09 16:19:44][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:19:44][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=429.17 ms[0m
[37m[2025-12-09 16:19:44][INFO][src.agents.active_retrieval] ACTIVE_RETRIEVAL | start | query_len=77[0m
[37m[2025-12-09 16:19:44][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:19:44][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=12707[0m
[37m[2025-12-09 16:19:44][INFO][src.llm_orchestrator] LLM | prompt_len=12932 approx_tokens=1917[0m
[37m[2025-12-09 16:19:44][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: At maximal velocity what is the approximate stride length of elite sprinters?...[0m
[37m[2025-12-09 16:19:48][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-0

  -> active_retrieval | time=18498.5 ms tokens=11.0


[37m[2025-12-09 16:20:03][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:20:03][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:20:03][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=12707[0m
[37m[2025-12-09 16:20:03][INFO][src.llm_orchestrator] LLM | prompt_len=12932 approx_tokens=1917[0m
[37m[2025-12-09 16:20:03][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: At maximal velocity what is the approximate stride length of elite sprinters?...[0m
[37m[2025-12-09 16:20:06][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:20:06][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 3674.42 ms[0m
[34m[2025-12-09 16:20:06][LLM][src.llm_orchestrator] LLM | response preview: About 2.3 meters per stride, with taller sprinters approaching roughly 2

  -> marag | time=28596.2 ms tokens=15.0


[37m[2025-12-09 16:20:32][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:20:32][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:20:32][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=12707[0m
[37m[2025-12-09 16:20:32][INFO][src.llm_orchestrator] LLM | prompt_len=13073 approx_tokens=1936[0m
[37m[2025-12-09 16:20:32][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: You are Debater A. Answer the user question referencing chunk_id values in brackets, e.g., [chunk_id=c1]. Keep reasoning concise.

Question:
At maximal velocity what is the approximate stride length of elite sprinters?...[0m
[37m[2025-12-09 16:20:37][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:20:37][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 5437.46 ms[0m
[34m[202

  -> madam_rag | time=30197.0 ms tokens=15.0


[37m[2025-12-09 16:21:13][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:21:13][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 10909.26 ms[0m
[34m[2025-12-09 16:21:13][LLM][src.llm_orchestrator] LLM | response preview: {"embedding_profile":"default","pipeline":"vanilla","iterations":1,"followup_rounds":0,"reasoning":"Simple factual query; use vanilla RAG with the default embedding profile. No multi-step reasoning or[0m
[37m[2025-12-09 16:21:13][INFO][src.llm_orchestrator] LLM | answer size approx_tokens=17[0m
[37m[2025-12-09 16:21:13][INFO][src.agents.routing_rag] ROUTING_RAG | decision | pipeline=vanilla profile=balanced_openai iterations=1[0m
[37m[2025-12-09 16:21:13][INFO][src.rag_pipeline] RAG | start | query_len=77 chunks=162[0m
[37m[2025-12-09 16:21:13][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[37m[2025-12-09 16:21:13][INFO][httpx]

  -> routing | time=4619.7 ms tokens=16.0

=== What tactical innovation is credited to Hungary's Golden Team in the 1950s? ===


[37m[2025-12-09 16:21:18][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:21:18][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=309.03 ms[0m
[37m[2025-12-09 16:21:18][INFO][src.retriever] RETRIEVE | completed | selected=3[0m
[37m[2025-12-09 16:21:18][INFO][src.rag_pipeline] RAG | retrieve | retrieved=3 time=46.20 ms threshold=0.50[0m
[37m[2025-12-09 16:21:18][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=3 context_chars=9070[0m
[37m[2025-12-09 16:21:18][INFO][src.llm_orchestrator] LLM | prompt_len=9293 approx_tokens=1240[0m
[37m[2025-12-09 16:21:18][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: What tactical innovation is credited to Hungary's Golden Team in the 1950s?...[0m
[37m[2025-12-09 16:21:22][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-1

  -> vanilla | time=3867.0 ms tokens=12.0


[37m[2025-12-09 16:21:22][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:21:22][INFO][src.agents.self_reflective_rag] REFLECT | start | query_len=75 chunks=162[0m
[37m[2025-12-09 16:21:22][INFO][src.retriever] RETRIEVE | completed | selected=3[0m
[37m[2025-12-09 16:21:22][INFO][src.agents.self_reflective_rag] REFLECT | retrieval | retrieved=3 time=30.73 ms threshold=0.50[0m
[37m[2025-12-09 16:21:22][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=3 context_chars=9070[0m
[37m[2025-12-09 16:21:22][INFO][src.llm_orchestrator] LLM | prompt_len=9293 approx_tokens=1240[0m
[37m[2025-12-09 16:21:22][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: What tactical innovation is credited to Hungary's Golden Team in the 1950s?...[0m
[37m[2025-12-09 16:21:29][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-0

  -> self_reflective | time=33424.6 ms tokens=75.0


[37m[2025-12-09 16:21:57][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:21:57][INFO][src.retriever] RETRIEVE | completed | selected=5[0m
[37m[2025-12-09 16:21:57][INFO][src.reranker] RERANK | start | chunks=5 model=cross-encoder/ms-marco-MiniLM-L-6-v2 top_k=5[0m
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.85it/s]
[37m[2025-12-09 16:21:58][INFO][src.reranker] RERANK | completed | returned=5 time=262.24 ms score_range=(-6.2221, -0.0797)[0m
[37m[2025-12-09 16:21:58][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=5 context_chars=12472[0m
[37m[2025-12-09 16:21:58][INFO][src.llm_orchestrator] LLM | prompt_len=12656 approx_tokens=1698[0m
[37m[2025-12-09 16:21:58][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: What tactical innovation is credited...[0m
[37m[2025-12-09 16:22:01][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP

  -> query_decomposition | time=16990.5 ms tokens=33.0


[37m[2025-12-09 16:22:14][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:22:14][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=584.27 ms[0m
[37m[2025-12-09 16:22:14][INFO][src.retriever] RETRIEVE | completed | selected=3[0m
[37m[2025-12-09 16:22:14][INFO][src.rag_pipeline] RAG | retrieve | retrieved=3 time=25.33 ms threshold=0.50[0m
[37m[2025-12-09 16:22:14][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=3 context_chars=9070[0m
[37m[2025-12-09 16:22:14][INFO][src.llm_orchestrator] LLM | prompt_len=9293 approx_tokens=1240[0m
[37m[2025-12-09 16:22:14][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: What tactical innovation is credited to Hungary's Golden Team in the 1950s?...[0m
[37m[2025-12-09 16:22:18][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-1

  -> chain_of_verification | time=38769.0 ms tokens=23.0


[37m[2025-12-09 16:22:53][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:22:53][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=339.87 ms[0m
[37m[2025-12-09 16:22:53][INFO][src.agents.active_retrieval] ACTIVE_RETRIEVAL | start | query_len=75[0m
[37m[2025-12-09 16:22:53][INFO][src.retriever] RETRIEVE | completed | selected=3[0m
[37m[2025-12-09 16:22:53][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=3 context_chars=9070[0m
[37m[2025-12-09 16:22:53][INFO][src.llm_orchestrator] LLM | prompt_len=9293 approx_tokens=1240[0m
[37m[2025-12-09 16:22:53][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: What tactical innovation is credited to Hungary's Golden Team in the 1950s?...[0m
[37m[2025-12-09 16:22:58][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16

  -> active_retrieval | time=18595.4 ms tokens=12.0


[37m[2025-12-09 16:23:12][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:23:12][INFO][src.retriever] RETRIEVE | completed | selected=3[0m
[37m[2025-12-09 16:23:12][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=3 context_chars=9070[0m
[37m[2025-12-09 16:23:12][INFO][src.llm_orchestrator] LLM | prompt_len=9293 approx_tokens=1240[0m
[37m[2025-12-09 16:23:12][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: What tactical innovation is credited to Hungary's Golden Team in the 1950s?...[0m
[37m[2025-12-09 16:23:17][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:23:17][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 4348.31 ms[0m
[34m[2025-12-09 16:23:17][LLM][src.llm_orchestrator] LLM | response preview: Interchanging positions with attacking fluidity, notably using a prototype “

  -> marag | time=25224.1 ms tokens=25.0


[37m[2025-12-09 16:23:38][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:23:38][INFO][src.retriever] RETRIEVE | completed | selected=3[0m
[37m[2025-12-09 16:23:38][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=3 context_chars=9070[0m
[37m[2025-12-09 16:23:38][INFO][src.llm_orchestrator] LLM | prompt_len=9434 approx_tokens=1259[0m
[37m[2025-12-09 16:23:38][INFO][src.llm_orchestrator] LLM | Sending request to OpenAI question: You are Debater A. Answer the user question referencing chunk_id values in brackets, e.g., [chunk_id=c1]. Keep reasoning concise.

Question:
What tactical innovation is credited to Hungary's Golden Team in the 1950s?...[0m
[37m[2025-12-09 16:23:44][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:23:44][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 6421.48 ms[0m
[34m[2025-12

  -> madam_rag | time=25177.9 ms tokens=11.0


[37m[2025-12-09 16:24:15][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"[0m
[37m[2025-12-09 16:24:15][INFO][src.llm_orchestrator] LLM | OpenAI request completed in 11503.19 ms[0m
[34m[2025-12-09 16:24:15][LLM][src.llm_orchestrator] LLM | response preview: {"embedding_profile":"No data in the documents.","pipeline":"vanilla","iterations":1,"followup_rounds":0,"reasoning":"No data in the documents."}[0m
[37m[2025-12-09 16:24:15][INFO][src.llm_orchestrator] LLM | answer size approx_tokens=9[0m
[37m[2025-12-09 16:24:15][INFO][src.agents.routing_rag] ROUTING_RAG | decision | pipeline=vanilla profile=balanced_openai iterations=1[0m
[37m[2025-12-09 16:24:15][INFO][src.rag_pipeline] RAG | start | query_len=75 chunks=162[0m
[37m[2025-12-09 16:24:15][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[37m[2025-12-09 16:24:15][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings

  -> routing | time=6183.3 ms tokens=12.0



In [5]:
# Quick look at the first five rows of `df` before the results are written to disk.
df.head(n=5)

Unnamed: 0,question,architecture,time_ms,tokens,keyword_precision_k,keyword_recall_k,semantic_precision_k,semantic_recall_k,grounding_score,routing_pipeline,routing_profile
0,Which sport has teams of 6 players: football o...,vanilla,5083.0666,1.0,1.0,0.2,0.2,1.0,1.0,,
1,Which sport has teams of 6 players: football o...,self_reflective,45649.6262,229.0,1.0,0.2,0.6,0.6,0.922705,,
2,Which sport has teams of 6 players: football o...,query_decomposition,40837.289,32.0,1.0,0.2,0.0,0.0,0.777778,,
3,Which sport has teams of 6 players: football o...,chain_of_verification,138493.2516,76.0,1.0,0.2,0.2,1.0,0.521739,,
4,Which sport has teams of 6 players: football o...,active_retrieval,16393.2133,1.0,1.0,0.2,0.2,1.0,1.0,,


# Saving Results


In [6]:
# Persist the experiment results under RESULTS_DIR in two formats:
# the DataFrame `df` as CSV and the raw `records` as JSON.
csv_path = RESULTS_DIR / 'experiment_results.csv'
json_path = RESULTS_DIR / 'experiment_results.json'

# index=False leaves the DataFrame index out of the CSV file.
df.to_csv(csv_path, index=False)
# ensure_ascii=False together with an explicit UTF-8 encoding writes non-ASCII
# characters as-is instead of \u escapes.
json_path.write_text(json.dumps(records, ensure_ascii=False, indent=2), encoding='utf-8')

# Report the locations relative to PROJECT_ROOT so the stored cell output does
# not embed the absolute, machine-specific path (including the local user name)
# of whoever executed the notebook. RESULTS_DIR is built from PROJECT_ROOT, so
# relative_to() is always valid here.
print(
    'Saved results to:',
    csv_path.relative_to(PROJECT_ROOT),
    'and',
    json_path.relative_to(PROJECT_ROOT),
)


Saved results to: C:\Users\tomasz.makowski.2\Desktop\SemesterII\ComputationalIntelligence\Project\agentic-rag-architectures\results\experiment_results.csv and C:\Users\tomasz.makowski.2\Desktop\SemesterII\ComputationalIntelligence\Project\agentic-rag-architectures\results\experiment_results.json
