# 📊 RAGAS Evaluation
### Evaluate both baseline and semantic RAG apps using local RAGAS metrics

## 🛠️ Setup & Imports

In [1]:
# Reload all imported modules automatically before executing the code.
%load_ext autoreload
%autoreload 2

# 📚 Notebook Setup: sys.path + .env loading
import sys
import os

# Add project root and src/ to sys.path for imports
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "src")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Load environment variables from .env
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from src.evaluation import evaluate_baseline_rag, evaluate_semantic_rag
from configs.evaluation_config import RAGAS_METRICS
import pandas as pd

  embedding_model = OpenAIEmbeddings()


## Load sample documents

In [3]:
from pathlib import Path

# The sample corpus is located relative to the parent of the current working
# directory (data/raw/sample_docs.txt underneath it).
sample_path = Path().resolve().parent / "data" / "raw" / "sample_docs.txt"
print(f"sample_path: {sample_path}")

# Read the whole file as a single document. The encoding is explicit so the
# text decodes the same way on every platform instead of depending on the
# locale default. NOTE(review): assumes the file is UTF-8 — confirm.
sample_documents = [sample_path.read_text(encoding="utf-8")]

sample_path: /Users/mwalker/development/TAMARKDesigns/AI-Maker-Space/cohort-6/projects/session-08/AIE6-S08-adv-rag-evaluation/data/raw/sample_docs.txt


## 📋 Define Evaluation Questions
### You can expand this list or load the questions from a file later

In [4]:
# Questions passed to both evaluation runs below (baseline and semantic)
# via the `questions=` argument.
eval_questions = [
    "What is the role of transformational leadership in building effective engineering teams?",
    "Why is quality considered a company-wide responsibility rather than just the QA team's responsibility?",
    "What are the pillars of an effective engineering philosophy according to this document?",
    "How do proactive defect prevention strategies differ from reactive bug-finding strategies in QA?",
    "Why is customer-centric thinking important when building internal tools and infrastructure?"
]

## 🔍 Evaluate Baseline RAG

In [None]:
# Run the baseline RAG evaluation with the shared questions, the loaded sample
# document(s), and the metrics configured in RAGAS_METRICS.
# NOTE(review): the result is presumably a pandas DataFrame — later cells call
# .to_csv() and .head() on it; confirm against src/evaluation.
baseline_results = evaluate_baseline_rag.run_evaluation(
    questions=eval_questions,
    documents=sample_documents,
    metrics=RAGAS_METRICS
)


  llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")


Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

The LLM did not return a valid classification.
The LLM did not return a valid classification.
The LLM did not return a valid classification.


## 🧠 Evaluate Semantic RAG


In [None]:
# Run the semantic RAG evaluation with the same questions, documents, and
# metrics as the baseline run so the two result sets can be compared.
# NOTE(review): the result is presumably a pandas DataFrame — later cells call
# .to_csv(), .head(), and .iterrows() on it; confirm against src/evaluation.
semantic_results = evaluate_semantic_rag.run_evaluation(
    questions=eval_questions,
    documents=sample_documents,
    metrics=RAGAS_METRICS
)




## 📦 Save Results to CSV

In [22]:
# Create the output directory first: to_csv() does not create missing parent
# directories and would fail on a fresh checkout without data/processed/.
os.makedirs("../data/processed", exist_ok=True)

# Persist both result tables without the index column.
baseline_results.to_csv("../data/processed/baseline_ragas_results.csv", index=False)
semantic_results.to_csv("../data/processed/semantic_ragas_results.csv", index=False)

## 📊 View a Sample of Results

In [None]:
# Print each label on its own line and render the table separately.
# Passing the frame as a second print() argument after a "\n" label puts the
# separator space in front of the table's first line, which shifts the column
# header out of alignment with the rows.
print("Baseline Evaluation:")
display(baseline_results.head())

print("\nSemantic Evaluation:")
display(semantic_results.head())

In [None]:
# 🔍 Diagnostic: Show semantic RAG retrievals for manual review
MAX_CONTEXT_CHARS = 200  # preview length printed for each retrieved context

# Number the questions by position with enumerate() rather than deriving the
# number from the row label: the headings stay 1..N even if the index is not a
# 0-based integer range (e.g. after filtering or with non-integer labels).
for question_number, (_, row) in enumerate(semantic_results.iterrows(), start=1):
    print(f"\n--- Question {question_number} ---")
    print("Q:", row["user_input"])
    print("Response:", row["response"])
    print("Retrieved Contexts:")
    for ctx in row["retrieved_contexts"]:
        # Only truncated previews get the "..." marker; short contexts are
        # printed as-is without a trailing blank.
        if len(ctx) > MAX_CONTEXT_CHARS:
            print(" -", ctx[:MAX_CONTEXT_CHARS], "...")
        else:
            print(" -", ctx)
    print("Faithfulness:", row["faithfulness"])
    print("Context Precision:", row["context_precision"])
