# Experiment Setup


In [None]:
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import pandas as pd

PROJECT_ROOT = Path.cwd().resolve()
if PROJECT_ROOT.name == 'notebooks':
    PROJECT_ROOT = PROJECT_ROOT.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.rag_pipeline import rag
from src.embedder import generate_query_embedding
from src.metrics import (
    extract_keywords,
    precision_recall_k,
    semantic_precision_recall_k,
    grounding_score,
    estimate_tokens,
)
from src.self_reflective_rag import self_reflect_rag

try:
    from src.utils import get_logger
    logger = get_logger(__name__)
except Exception:
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

DATA_DIR = PROJECT_ROOT / 'data' / 'processed'
EMBEDDINGS_DIR = PROJECT_ROOT / 'embeddings'
RESULTS_DIR = PROJECT_ROOT / 'results'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

chunks_path = DATA_DIR / 'chunks.json'
embeddings_path = EMBEDDINGS_DIR / 'embeddings.npy'
index_path = EMBEDDINGS_DIR / 'embedding_index.json'

if chunks_path.exists():
    CHUNKS = json.loads(chunks_path.read_text(encoding='utf-8'))
else:
    logger.warning('chunks.json not found; using fallback chunks')
    CHUNKS = [
        {'chunk_id': 'c1', 'text': 'Sprint acceleration depends on shin angles and hip drive.'},
        {'chunk_id': 'c2', 'text': 'Defensive pressing intensifies when fatigue sets in.'},
        {'chunk_id': 'c3', 'text': 'Heel strike and midfoot strike alter loading rates differently.'},
    ]

if embeddings_path.exists():
    EMBEDDINGS = np.load(embeddings_path)
else:
    logger.warning('embeddings.npy not found; generating random fallback embeddings')
    EMBEDDINGS = np.random.rand(len(CHUNKS), 384)

if index_path.exists():
    INDEX_MAP = json.loads(index_path.read_text(encoding='utf-8'))
else:
    logger.warning('embedding_index.json not found; creating sequential index map')
    INDEX_MAP = {chunk['chunk_id']: idx for idx, chunk in enumerate(CHUNKS)}

PROVIDER = 'openai'
LLM_MODEL = None
EMBED_MODEL = None
TOP_K = 5
THRESHOLD = 0.5
TEST_QUESTIONS = [
    "Which sport has teams of 6 players: football or volleyball?",
    "At maximal velocity what is the approximate stride length of elite sprinters?",
    "What tactical innovation is credited to Hungary's Golden Team in the 1950s?",
]


# Running Baseline (Vanilla RAG)


In [2]:
def run_vanilla_rag(question: str) -> Dict[str, Any]:
    result = rag(
        query=question,
        chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        embedding_model=EMBED_MODEL,
        llm_model=LLM_MODEL,
        k=TOP_K,
        threshold=THRESHOLD,
    )
    return {
        'answer': result.get('answer', ''),
        'chunks': result.get('chunks', []),
        'time_ms': result.get('time_ms', 0.0),
        'tokens': estimate_tokens(result.get('answer', '')),
    }


# Running Self-Reflective RAG


In [3]:
def run_self_reflective_rag(question: str) -> Dict[str, Any]:
    query_embedding = generate_query_embedding(question, provider=PROVIDER, model=EMBED_MODEL)
    result = self_reflect_rag(
        query=question,
        chunks=CHUNKS,
        query_embedding=query_embedding,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        provider=PROVIDER,
        llm_model=LLM_MODEL,
        temperature=1.0,
        k=TOP_K,
        threshold=THRESHOLD,
    )
    total_time = sum(result.get('timings', {}).values())
    refined_answer = result.get('refined_answer', '')
    return {
        'refined_answer': refined_answer,
        'initial_answer': result.get('initial_answer', ''),
        'chunks': result.get('retrieved_chunks', []),
        'timings': result.get('timings', {}),
        'time_ms': total_time,
        'tokens': estimate_tokens(refined_answer),
    }


# Metrics: Precision/Recall/Grounding


In [4]:
records: List[Dict[str, Any]] = []
for question in TEST_QUESTIONS:
    keywords = extract_keywords(question)
    vanilla_result = run_vanilla_rag(question)
    print("-" * 40)
    reflective_result = run_self_reflective_rag(question)
    print("-" * 40)

    keyword_precision_vanilla, keyword_recall_vanilla = precision_recall_k(
        query=question,
        retrieved_chunks=vanilla_result['chunks'],
        all_chunks=CHUNKS,
        k=TOP_K,
    )

    vanilla_answer_embedding = generate_query_embedding(vanilla_result['answer'], provider=PROVIDER, model=EMBED_MODEL)
    semantic_precision_vanilla, semantic_recall_vanilla = semantic_precision_recall_k(
        answer_embedding=vanilla_answer_embedding,
        retrieved_chunks=vanilla_result['chunks'],
        all_chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        k=TOP_K,
    )

    keyword_precision, keyword_recall = precision_recall_k(
        query=question,
        retrieved_chunks=reflective_result['chunks'],
        all_chunks=CHUNKS,
        k=TOP_K,
    )

    answer_embedding = generate_query_embedding(reflective_result['refined_answer'], provider=PROVIDER, model=EMBED_MODEL)
    semantic_precision, semantic_recall = semantic_precision_recall_k(
        answer_embedding=answer_embedding,
        retrieved_chunks=reflective_result['chunks'],
        all_chunks=CHUNKS,
        embeddings=EMBEDDINGS,
        index_map=INDEX_MAP,
        k=TOP_K,
    )

    grounding = grounding_score(reflective_result['refined_answer'], reflective_result['chunks'])

    records.append({
        'question': question,
        'time_vanilla': vanilla_result['time_ms'],
        'time_reflective': reflective_result['time_ms'],
        'tokens_vanilla': vanilla_result['tokens'],
        'tokens_reflective': reflective_result['tokens'],
        'grounding_reflective': grounding,
        'keyword_precision_vanilla_k': keyword_precision_vanilla,
        'keyword_recall_vanilla_k': keyword_recall_vanilla,
        'semantic_precision_vanilla_k': semantic_precision_vanilla,
        'semantic_recall_vanilla_k': semantic_recall_vanilla,
        'keyword_precision_k': keyword_precision,
        'keyword_recall_k': keyword_recall,
        'semantic_precision_k': semantic_precision,
        'semantic_recall_k': semantic_recall,
    })
    print("\n" + "-" * 40 + "\n")

df = pd.DataFrame(records)

[32m[2025-11-25 23:09:09][INFO][src.rag_pipeline] RAG | start | query_len=59 chunks=22[0m
[32m[2025-11-25 23:09:09][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m


[32m[2025-11-25 23:09:11][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:09:11][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=1732.38 ms[0m
[32m[2025-11-25 23:09:11][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:09:11][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=1[0m
[32m[2025-11-25 23:09:11][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': '4863e5e6-b23f-4b40-a27f-c52d24bd8e84', 'score': 0.5343}][0m
[32m[2025-11-25 23:09:11][INFO][src.rag_pipeline] RAG | retrieve | retrieved=1 time=8.75 ms threshold=0.50[0m
[32m[2025-11-25 23:09:11][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=1 context_chars=2819[0m
[32m[2025-11-25 23:09:11][INFO][src.llm_orchestrator] LLM | prompt_len=3027 approx_tokens=435[0m
[32m[2025-11-25 23:09:11][INFO][src.llm_

----------------------------------------


[32m[2025-11-25 23:09:15][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:09:15][INFO][src.self_reflective_rag] REFLECT | start | query_len=59 chunks=22[0m
[32m[2025-11-25 23:09:15][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:09:15][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=1[0m
[32m[2025-11-25 23:09:15][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': '4863e5e6-b23f-4b40-a27f-c52d24bd8e84', 'score': 0.5343}][0m
[32m[2025-11-25 23:09:15][INFO][src.self_reflective_rag] REFLECT | retrieval | retrieved=1 time=2.23 ms threshold=0.50[0m
[32m[2025-11-25 23:09:15][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=1 context_chars=2819[0m
[32m[2025-11-25 23:09:15][INFO][src.llm_orchestrator] LLM | prompt_len=3027 approx_tokens=435[0m
[32m[2025-11-25 23:09:15][INFO][src.llm_orchestrator] LLM |

----------------------------------------


[32m[2025-11-25 23:09:56][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:09:56][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[32m[2025-11-25 23:09:56][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:09:56][INFO][src.rag_pipeline] RAG | start | query_len=77 chunks=22[0m
[32m[2025-11-25 23:09:56][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m



----------------------------------------



[32m[2025-11-25 23:09:57][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:09:57][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=438.42 ms[0m
[32m[2025-11-25 23:09:57][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:09:57][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=2[0m
[32m[2025-11-25 23:09:57][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': 'cba1976b-e193-4a5a-ac04-7454c53dff73', 'score': 0.6261}, {'chunk_id': 'f5dcd8bd-bd60-4ca7-972f-37e982cb32a3', 'score': 0.6087}][0m
[32m[2025-11-25 23:09:57][INFO][src.rag_pipeline] RAG | retrieve | retrieved=2 time=3.64 ms threshold=0.50[0m
[32m[2025-11-25 23:09:57][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=2 context_chars=5103[0m
[32m[2025-11-25 23:09:57][INFO][src.llm_orchestrator] LLM | prompt_le

----------------------------------------


[32m[2025-11-25 23:10:00][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:10:00][INFO][src.self_reflective_rag] REFLECT | start | query_len=77 chunks=22[0m
[32m[2025-11-25 23:10:00][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:10:00][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=2[0m
[32m[2025-11-25 23:10:00][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': 'cba1976b-e193-4a5a-ac04-7454c53dff73', 'score': 0.6261}, {'chunk_id': 'f5dcd8bd-bd60-4ca7-972f-37e982cb32a3', 'score': 0.6088}][0m
[32m[2025-11-25 23:10:00][INFO][src.self_reflective_rag] REFLECT | retrieval | retrieved=2 time=7.15 ms threshold=0.50[0m
[32m[2025-11-25 23:10:00][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=2 context_chars=5103[0m
[32m[2025-11-25 23:10:00][INFO][src.llm_orchestrator] LLM | prompt_len=5329 approx_toke

----------------------------------------


[32m[2025-11-25 23:10:24][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:10:24][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[32m[2025-11-25 23:10:24][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:10:24][INFO][src.rag_pipeline] RAG | start | query_len=75 chunks=22[0m
[32m[2025-11-25 23:10:24][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m



----------------------------------------



[32m[2025-11-25 23:10:25][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:10:25][INFO][src.rag_pipeline] RAG | embedding | provider=openai model=text-embedding-3-small time=403.50 ms[0m
[32m[2025-11-25 23:10:25][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:10:25][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=1[0m
[32m[2025-11-25 23:10:25][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': 'b8030f14-9641-4509-bc2b-70260bf08b8a', 'score': 0.5191}][0m
[32m[2025-11-25 23:10:25][INFO][src.rag_pipeline] RAG | retrieve | retrieved=1 time=1.72 ms threshold=0.50[0m
[32m[2025-11-25 23:10:25][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=1 context_chars=3039[0m
[32m[2025-11-25 23:10:25][INFO][src.llm_orchestrator] LLM | prompt_len=3263 approx_tokens=437[0m
[32m[2025-11-25 23:10:25][INFO][src.llm_o

----------------------------------------


[32m[2025-11-25 23:10:28][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:10:28][INFO][src.self_reflective_rag] REFLECT | start | query_len=75 chunks=22[0m
[32m[2025-11-25 23:10:28][INFO][src.retriever] RETRIEVE | start | vectors=22 k=5 threshold=0.50[0m
[32m[2025-11-25 23:10:28][INFO][src.retriever] RETRIEVE | threshold filtering | threshold=0.50 passed=1[0m
[32m[2025-11-25 23:10:28][INFO][src.retriever] RETRIEVE | top_k selected | [{'chunk_id': 'b8030f14-9641-4509-bc2b-70260bf08b8a', 'score': 0.5191}][0m
[32m[2025-11-25 23:10:28][INFO][src.self_reflective_rag] REFLECT | retrieval | retrieved=1 time=4.65 ms threshold=0.50[0m
[32m[2025-11-25 23:10:28][INFO][src.llm_orchestrator] LLM | provider=openai model=gpt-5-nano context_chunks=1 context_chars=3039[0m
[32m[2025-11-25 23:10:28][INFO][src.llm_orchestrator] LLM | prompt_len=3263 approx_tokens=437[0m
[32m[2025-11-25 23:10:28][INFO][src.llm_orchestrator] LLM |

----------------------------------------


[32m[2025-11-25 23:10:52][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[32m[2025-11-25 23:10:52][INFO][src.embedder] EMBED | query embedding | provider=openai model=text-embedding-3-small[0m
[32m[2025-11-25 23:10:53][INFO][httpx] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m



----------------------------------------



In [5]:
df

Unnamed: 0,question,time_vanilla,time_reflective,tokens_vanilla,tokens_reflective,grounding_reflective,keyword_precision_vanilla_k,keyword_recall_vanilla_k,semantic_precision_vanilla_k,semantic_recall_vanilla_k,keyword_precision_k,keyword_recall_k,semantic_precision_k,semantic_recall_k
0,Which sport has teams of 6 players: football o...,5890.3348,81372.0162,1,171,0.840491,0.2,0.058824,0.2,1.0,0.2,0.058824,0.2,0.2
1,At maximal velocity what is the approximate st...,3316.3843,46508.1296,12,109,0.844037,0.4,0.25,0.2,1.0,0.4,0.25,0.4,0.666667
2,What tactical innovation is credited to Hungar...,3324.3632,48084.9331,12,42,0.755556,0.2,0.058824,0.0,0.0,0.2,0.058824,0.0,0.0


# Saving Results


In [6]:
csv_path = RESULTS_DIR / 'experiment_results.csv'
json_path = RESULTS_DIR / 'experiment_results.json'
df.to_csv(csv_path, index=False)
json_path.write_text(json.dumps(records, ensure_ascii=False, indent=2), encoding='utf-8')
summary = {
    'avg_time_vanilla': float(df['time_vanilla'].mean()) if not df.empty else 0.0,
    'avg_time_reflective': float(df['time_reflective'].mean()) if not df.empty else 0.0,
    'avg_grounding_reflective': float(df['grounding_reflective'].mean()) if not df.empty else 0.0,
}
print('Saved results to:', csv_path, 'and', json_path)
summary


Saved results to: C:\Users\tomasz.makowski.2\Desktop\SemesterII\ComputationalIntelligence\Project\agentic-rag-architectures\results\experiment_results.csv and C:\Users\tomasz.makowski.2\Desktop\SemesterII\ComputationalIntelligence\Project\agentic-rag-architectures\results\experiment_results.json


{'avg_time_vanilla': 4177.027433295734,
 'avg_time_reflective': 58655.02629999537,
 'avg_grounding_reflective': 0.8133610167830915}