# Experiment Setup


In [1]:
import json
import sys
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import pandas as pd

# Locate the repository root: when the kernel's working directory is the
# `notebooks` folder, the root is its parent; otherwise it is the working
# directory itself.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Make the root importable (needed for the `src.*` imports in the next cell)
# without adding a duplicate entry when the cell is re-run.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

In [2]:
from src.rag_pipeline import rag
from src.embedder import generate_query_embedding
from src.metrics import (
    extract_keywords,
    precision_recall_k,
    semantic_precision_recall_k,
    grounding_score,
    estimate_tokens,
)
from src.agents.self_reflective_rag import self_reflect_rag

# Prefer the project's logger factory; fall back to the standard library so
# the notebook still runs when `src.utils` cannot be imported or its factory
# fails. The fallback is deliberately best-effort (hence the broad except),
# but the cause is logged rather than silently discarded.
try:
    from src.utils import get_logger
    logger = get_logger(__name__)
except Exception as exc:
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.warning(
        'src.utils.get_logger unavailable (%r); falling back to stdlib logging',
        exc,
    )

# Directories used by the experiment, all anchored at PROJECT_ROOT.
DATA_DIR = PROJECT_ROOT.joinpath('data', 'processed')
EMBEDDINGS_DIR = PROJECT_ROOT.joinpath('embeddings')
RESULTS_DIR = PROJECT_ROOT.joinpath('results')

# Only the results directory is created here; the input directories are
# merely probed for files further down.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Input artifacts read by the loading step below.
chunks_path = DATA_DIR.joinpath('chunks.json')
embeddings_path = EMBEDDINGS_DIR.joinpath('embeddings.npy')
index_path = EMBEDDINGS_DIR.joinpath('embedding_index.json')

# Load the corpus artifacts. Each artifact has an independent fallback so the
# notebook can be dry-run without the processed data on disk.

# Seed and width of the synthetic fallback embedding matrix. A fixed seed
# keeps dry runs reproducible across kernel restarts.
# NOTE(review): the width must match the dimensionality returned by
# generate_query_embedding for the configured provider/model — confirm.
FALLBACK_SEED = 42
FALLBACK_EMBED_DIM = 384

if chunks_path.exists():
    CHUNKS = json.loads(chunks_path.read_text(encoding='utf-8'))
else:
    logger.warning('chunks.json not found; using fallback chunks')
    CHUNKS = [
        {'chunk_id': 'c1', 'text': 'Sprint acceleration depends on shin angles and hip drive.'},
        {'chunk_id': 'c2', 'text': 'Defensive pressing intensifies when fatigue sets in.'},
        {'chunk_id': 'c3', 'text': 'Heel strike and midfoot strike alter loading rates differently.'},
    ]

if embeddings_path.exists():
    EMBEDDINGS = np.load(embeddings_path)
else:
    logger.warning('embeddings.npy not found; generating random fallback embeddings')
    # Uniform values in [0, 1), one row per chunk, drawn from a seeded generator.
    EMBEDDINGS = np.random.default_rng(FALLBACK_SEED).random((len(CHUNKS), FALLBACK_EMBED_DIM))

if index_path.exists():
    INDEX_MAP = json.loads(index_path.read_text(encoding='utf-8'))
else:
    logger.warning('embedding_index.json not found; creating sequential index map')
    # Map each chunk id to its position in CHUNKS.
    INDEX_MAP = {chunk['chunk_id']: idx for idx, chunk in enumerate(CHUNKS)}

# Because every artifact falls back independently, a partially populated
# workspace can pair real embeddings with fallback chunks (or vice versa).
# Surface that instead of letting it skew the metrics silently.
if len(EMBEDDINGS) != len(CHUNKS):
    logger.warning(
        f'embeddings rows ({len(EMBEDDINGS)}) do not match chunk count ({len(CHUNKS)}); '
        'retrieval results may be inconsistent'
    )

# --- Experiment configuration -------------------------------------------
# Provider name handed to both pipelines and to the embedding helper.
PROVIDER: str = 'openai'

# Model overrides. None is passed through unchanged to the pipeline calls;
# presumably this selects each provider's default model — TODO confirm.
LLM_MODEL = None
EMBED_MODEL = None

# Retrieval settings shared by both pipelines and by the @k metrics.
TOP_K: int = 5
THRESHOLD: float = 0.5

# Questions evaluated by the comparison loop, one result row per question.
TEST_QUESTIONS: List[str] = [
    "Which sport has teams of 6 players: football or volleyball?",
    "At maximal velocity what is the approximate stride length of elite sprinters?",
    "What tactical innovation is credited to Hungary's Golden Team in the 1950s?",
]


# Running Baseline (Vanilla RAG)


In [3]:
def run_vanilla_rag(question: str) -> Dict[str, Any]:
    """Run the baseline RAG pipeline for one question and normalise its output.

    Args:
        question: The user question forwarded to ``rag`` as ``query``.

    Returns:
        A dict with the keys ``answer``, ``chunks``, ``time_ms`` and ``tokens``.
        The first three are read from the pipeline result (defaulting to
        ``''``, ``[]`` and ``0.0`` when absent); ``tokens`` is
        ``estimate_tokens`` applied to the answer.
    """
    # All retrieval/generation settings come from the notebook-level config.
    pipeline_kwargs = dict(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        llm_model=LLM_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
    )
    rag_output = rag(**pipeline_kwargs)

    answer_text = rag_output.get('answer', '')
    summary: Dict[str, Any] = {}
    summary['answer'] = answer_text
    summary['chunks'] = rag_output.get('chunks', [])
    summary['time_ms'] = rag_output.get('time_ms', 0.0)
    summary['tokens'] = estimate_tokens(answer_text)
    return summary


# Running Self-Reflective RAG


In [4]:
def run_self_reflective_rag(question: str, temperature: float = 1.0) -> Dict[str, Any]:
    """Run the self-reflective RAG pipeline for one question and normalise its output.

    Args:
        question: The user question; it is embedded here and also forwarded
            to ``self_reflect_rag`` as ``query``.
        temperature: Sampling temperature forwarded to ``self_reflect_rag``.
            Defaults to ``1.0``, the value this notebook previously hard-coded.

    Returns:
        A dict with the keys ``refined_answer``, ``initial_answer``,
        ``chunks`` (the pipeline's ``retrieved_chunks``), ``timings`` (the
        per-stage mapping reported by the pipeline), ``time_ms`` (the sum of
        the ``timings`` values) and ``tokens`` (``estimate_tokens`` of the
        refined answer).
    """
    # NOTE(review): this embedding call happens outside self_reflect_rag, so
    # its latency is only part of `time_ms` if the pipeline accounts for it
    # itself — confirm before comparing against the vanilla timings.
    query_embedding = generate_query_embedding(question, provider=PROVIDER, model=EMBED_MODEL)
    result = self_reflect_rag(
        query=question,
        chunks=CHUNKS,
        query_embedding=query_embedding,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        llm_model=LLM_MODEL,
        temperature=temperature,
        k=TOP_K,
        threshold=THRESHOLD,
    )
    # Read the timings once; treat a missing or explicit-None entry as empty
    # so the total degrades to 0 instead of raising.
    timings = result.get('timings') or {}
    refined_answer = result.get('refined_answer', '')
    return {
        'refined_answer': refined_answer,
        'initial_answer': result.get('initial_answer', ''),
        'chunks': result.get('retrieved_chunks', []),
        'timings': timings,
        'time_ms': sum(timings.values()),
        'tokens': estimate_tokens(refined_answer),
    }


# Metrics: Precision/Recall/Grounding


In [5]:
def _retrieval_metrics(question: str, answer: str, retrieved_chunks: List[Any]) -> tuple:
    """Compute keyword and semantic precision/recall@k for one pipeline's output.

    Args:
        question: The question used as the query for the keyword metrics.
        answer: The generated answer; it is embedded for the semantic metrics.
        retrieved_chunks: The chunks the pipeline returned for this question.

    Returns:
        ``(keyword_precision, keyword_recall, semantic_precision, semantic_recall)``.
        When the answer is empty or whitespace-only, no embedding is requested
        and both semantic values are reported as ``0.0``.
    """
    keyword_precision, keyword_recall = precision_recall_k(
        query=question,
        retrieved_chunks=retrieved_chunks,
        all_chunks=CHUNKS,
        k=TOP_K,
    )

    # The pipeline wrappers default a missing answer to ''. Embedding
    # endpoints commonly reject empty input, so skip the request rather than
    # abort the whole experiment on one empty answer.
    if not (answer or '').strip():
        logger.warning('Empty answer; semantic precision/recall recorded as 0.0')
        return keyword_precision, keyword_recall, 0.0, 0.0

    answer_embedding = generate_query_embedding(answer, provider=PROVIDER, model=EMBED_MODEL)
    semantic_precision, semantic_recall = semantic_precision_recall_k(
        answer_embedding=answer_embedding,
        retrieved_chunks=retrieved_chunks,
        all_chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        k=TOP_K,
    )
    return keyword_precision, keyword_recall, semantic_precision, semantic_recall


# One record per question; re-created on every run of the cell so re-running
# does not append duplicate rows.
records: List[Dict[str, Any]] = []
for question in TEST_QUESTIONS:
    vanilla_result = run_vanilla_rag(question)
    print("-" * 40)
    reflective_result = run_self_reflective_rag(question)
    print("-" * 40)

    # Same metric routine for both pipelines, vanilla first.
    (
        keyword_precision_vanilla,
        keyword_recall_vanilla,
        semantic_precision_vanilla,
        semantic_recall_vanilla,
    ) = _retrieval_metrics(question, vanilla_result['answer'], vanilla_result['chunks'])

    (
        keyword_precision,
        keyword_recall,
        semantic_precision,
        semantic_recall,
    ) = _retrieval_metrics(question, reflective_result['refined_answer'], reflective_result['chunks'])

    # Grounding is only measured for the self-reflective pipeline's refined answer.
    grounding = grounding_score(reflective_result['refined_answer'], reflective_result['chunks'])

    # Key order determines the column order of the DataFrame and the CSV.
    records.append({
        'question': question,
        'time_vanilla': vanilla_result['time_ms'],
        'time_reflective': reflective_result['time_ms'],
        'tokens_vanilla': vanilla_result['tokens'],
        'tokens_reflective': reflective_result['tokens'],
        'grounding_reflective': grounding,
        'keyword_precision_vanilla_k': keyword_precision_vanilla,
        'keyword_recall_vanilla_k': keyword_recall_vanilla,
        'semantic_precision_vanilla_k': semantic_precision_vanilla,
        'semantic_recall_vanilla_k': semantic_recall_vanilla,
        'keyword_precision_k': keyword_precision,
        'keyword_recall_k': keyword_recall,
        'semantic_precision_k': semantic_precision,
        'semantic_recall_k': semantic_recall,
    })
    print("\n" + "-" * 40 + "\n")

df = pd.DataFrame(records)

[32m[2025-11-25 23:33:49][INFO][src.rag_pipeline] RAG | start | query_len=59 chunks=22[0m
[32m[2025-11-25 23:33:49][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[32m[2025-11-25 23:33:50][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:33:50][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=1281.63 ms[0m
[32m[2025-11-25 23:33:50][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:33:50][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=1[0m
[32m[2025-11-25 23:33:50][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': '3a24e71b-1c15-474b-8905-751d48cb90ec', 'score': 0.5346}][0m
[32m[2025-11-25 23:33:50][INFO][src.rag_pipeline] RAG | retrieve | retrieved=1 time=6.46 ms threshold=0.50[0m
[32m[2025-11-25 23:33:50][INFO][src.llm_orchestrator] LLM |

----------------------------------------


[32m[2025-11-25 23:33:55][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:33:55][INFO][src.self_reflective_rag] REFLECT | start | query_len=59 chunks=22[0m
[32m[2025-11-25 23:33:55][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:33:55][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=1[0m
[32m[2025-11-25 23:33:55][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': '3a24e71b-1c15-474b-8905-751d48cb90ec', 'score': 0.5348}][0m
[32m[2025-11-25 23:33:55][INFO][src.self_reflective_rag] REFLECT | retrieval | retrieved=1 time=1.70 ms threshold=0.50[0m
[32m[2025-11-25 23:33:55][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=1 context_chars=2819[0m
[32m[2025-11-25 23:33:55][INFO][src.llm_orchestrator] LLM | prompt_len=3026 approx_tokens=436[0m
[32m[2025-11-25 23:33:55][INFO][src.llm_orchestrator] LLM |

----------------------------------------


[32m[2025-11-25 23:34:31][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:34:31][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[32m[2025-11-25 23:34:31][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:34:31][INFO][src.rag_pipeline] RAG | start | query_len=77 chunks=22[0m
[32m[2025-11-25 23:34:31][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m



----------------------------------------



[32m[2025-11-25 23:34:31][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:34:31][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=369.55 ms[0m
[32m[2025-11-25 23:34:31][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:34:31][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=2[0m
[32m[2025-11-25 23:34:31][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': '5b3abccd-8489-4948-aa4e-94ce68176066', 'score': 0.6261}, {'chunk_id': '3a2ca109-39c5-43f3-8674-2e1ac3d2ee65', 'score': 0.6086}][0m
[32m[2025-11-25 23:34:31][INFO][src.rag_pipeline] RAG | retrieve | retrieved=2 time=3.68 ms threshold=0.50[0m
[32m[2025-11-25 23:34:31][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=2 context_chars=5103[0m
[32m[2025-11-25 23:34:31][INFO][src.llm_orchestrator] LLM | prompt_le

----------------------------------------


[32m[2025-11-25 23:34:35][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:34:35][INFO][src.self_reflective_rag] REFLECT | start | query_len=77 chunks=22[0m
[32m[2025-11-25 23:34:35][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:34:35][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=2[0m
[32m[2025-11-25 23:34:35][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': '5b3abccd-8489-4948-aa4e-94ce68176066', 'score': 0.6261}, {'chunk_id': '3a2ca109-39c5-43f3-8674-2e1ac3d2ee65', 'score': 0.6086}][0m
[32m[2025-11-25 23:34:35][INFO][src.self_reflective_rag] REFLECT | retrieval | retrieved=2 time=9.16 ms threshold=0.50[0m
[32m[2025-11-25 23:34:35][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=2 context_chars=5103[0m
[32m[2025-11-25 23:34:35][INFO][src.llm_orchestrator] LLM | prompt_len=5328 approx_toke

----------------------------------------


[32m[2025-11-25 23:35:17][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:35:17][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[32m[2025-11-25 23:35:17][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:35:17][INFO][src.rag_pipeline] RAG | start | query_len=75 chunks=22[0m
[32m[2025-11-25 23:35:17][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m



----------------------------------------



[32m[2025-11-25 23:35:18][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:35:18][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=417.04 ms[0m
[32m[2025-11-25 23:35:18][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:35:18][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=1[0m
[32m[2025-11-25 23:35:18][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': 'b6cdb298-405d-4322-93c2-bf4dc3ec3400', 'score': 0.5192}][0m
[32m[2025-11-25 23:35:18][INFO][src.rag_pipeline] RAG | retrieve | retrieved=1 time=6.88 ms threshold=0.50[0m
[32m[2025-11-25 23:35:18][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=1 context_chars=3039[0m
[32m[2025-11-25 23:35:18][INFO][src.llm_orchestrator] LLM | prompt_len=3262 approx_tokens=438[0m
[32m[2025-11-25 23:35:18][INFO][src.llm_o

----------------------------------------


[32m[2025-11-25 23:35:22][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:35:22][INFO][src.self_reflective_rag] REFLECT | start | query_len=75 chunks=22[0m
[32m[2025-11-25 23:35:22][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:35:22][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=1[0m
[32m[2025-11-25 23:35:22][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': 'b6cdb298-405d-4322-93c2-bf4dc3ec3400', 'score': 0.5192}][0m
[32m[2025-11-25 23:35:22][INFO][src.self_reflective_rag] REFLECT | retrieval | retrieved=1 time=2.43 ms threshold=0.50[0m
[32m[2025-11-25 23:35:22][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=1 context_chars=3039[0m
[32m[2025-11-25 23:35:22][INFO][src.llm_orchestrator] LLM | prompt_len=3262 approx_tokens=438[0m
[32m[2025-11-25 23:35:22][INFO][src.llm_orchestrator] LLM |

----------------------------------------


[32m[2025-11-25 23:35:49][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:35:49][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[32m[2025-11-25 23:35:49][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m



----------------------------------------



In [6]:
# Display the per-question results table (one row per test question).
df

Unnamed: 0,question,time_vanilla,time_reflective,tokens_vanilla,tokens_reflective,grounding_reflective,keyword_precision_vanilla_k,keyword_recall_vanilla_k,semantic_precision_vanilla_k,semantic_recall_vanilla_k,keyword_precision_k,keyword_recall_k,semantic_precision_k,semantic_recall_k
0,Which sport has teams of 6 players: football o...,4962.035,70798.8027,1,228,0.770642,0.2,0.058824,0.2,1.0,0.2,0.058824,0.2,0.166667
1,At maximal velocity what is the approximate st...,4195.8649,82310.2056,12,145,0.822695,0.4,0.25,0.0,0.0,0.4,0.25,0.4,0.666667
2,What tactical innovation is credited to Hungar...,4407.7961,52933.5922,16,43,0.87234,0.2,0.058824,0.0,0.0,0.2,0.058824,0.2,0.5


# Saving Results


In [7]:
# Output locations inside the results directory.
csv_path = RESULTS_DIR.joinpath('experiment_results.csv')
json_path = RESULTS_DIR.joinpath('experiment_results.json')

# Persist the table as CSV (without the index) and the raw records as JSON.
df.to_csv(csv_path, index=False)
_records_json = json.dumps(records, ensure_ascii=False, indent=2)
json_path.write_text(_records_json, encoding='utf-8')

# Summary label -> DataFrame column whose mean it reports. An empty frame
# yields 0.0 for every entry.
_summary_columns = {
    'avg_time_vanilla': 'time_vanilla',
    'avg_time_reflective': 'time_reflective',
    'avg_grounding_reflective': 'grounding_reflective',
}
summary = {
    label: (0.0 if df.empty else float(df[column].mean()))
    for label, column in _summary_columns.items()
}

print('Saved results to:', csv_path, 'and', json_path)
summary


Saved results to: C:\Users\tomasz.makowski.2\Desktop\SemesterII\ComputationalIntelligence\Project\agentic-rag-architectures\results\experiment_results.csv and C:\Users\tomasz.makowski.2\Desktop\SemesterII\ComputationalIntelligence\Project\agentic-rag-architectures\results\experiment_results.json


{'avg_time_vanilla': 4521.898666629568,
 'avg_time_reflective': 68680.8668335046,
 'avg_grounding_reflective': 0.8218925542759234}