# RAG Evaluation Notebook

This notebook evaluates the RAG system using `ragas` metrics.

In [None]:
import os
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    answer_relevancy,
    faithfulness
)
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

# Sanity check only: this cell does not run the extraction itself, it just
# reminds the user when the expected text file is not on disk yet.
extracted_text_path = 'assessment_text.txt'
extraction_done = os.path.exists(extracted_text_path)
if not extraction_done:
    print("Please run extraction first")

In [None]:
# Configuration
# Connection settings for the Ollama server used by this notebook.
OLLAMA_BASE_URL = "http://localhost:11434"
MODEL_NAME = "llama3"

# The chat model and the embedding model are created with the same model
# name and server URL, so the keyword arguments are built once and reused.
_ollama_settings = {"model": MODEL_NAME, "base_url": OLLAMA_BASE_URL}
llm = ChatOllama(**_ollama_settings)
embeddings = OllamaEmbeddings(**_ollama_settings)

In [None]:
# Define Test Data
# In a real scenario, you would manually curate this or generate it.
questions = [
    "What metrics should be used for evaluation?",
    "Which vector store is preferred?",
    "What is the preferred framework due to agentic capabilities?"
]

# One list of reference answers per question, in the same order as `questions`.
ground_truths = [
    ["Retrieval accuracy, Retrieval precision, Contextual accuracy, Contextual precision"],
    ["Weaviate"],
    ["LangGraph or LangChain"]
]

# Run RAG to get answers and contexts
from agent import run_agent, get_retriever

# `answers[i]` and `contexts[i]` must stay aligned with `questions[i]`:
# these lists are later used as parallel columns of the evaluation dataset.
answers = []
contexts = []

retriever = get_retriever()

for q in questions:
    try:
        # Simple retrieval for context (simulating what the agent sees)
        # In a full eval, we'd hook into the agent to get exact retrieved docs
        docs = retriever.invoke(q)
        ctx = [d.page_content for d in docs]

        # Get Answer
        ans = run_agent(q)
    except Exception as e:
        # Best-effort: report the failure and record a placeholder row so the
        # remaining questions are still processed.
        print(f"Error processing {q}: {e}")
        ans = "Error"
        ctx = []

    # Append exactly once per question, after the try/except. Appending the
    # context inside the `try` would leave an extra `contexts` entry behind
    # when retrieval succeeds but the agent call fails, which breaks the
    # one-row-per-question alignment.
    answers.append(ans)
    contexts.append(ctx)



In [None]:
# Prepare Dataset for Ragas
# The "ground_truth" column is expected to hold one reference string per row
# (ragas validates it as a string column). `ground_truths` may hold either a
# plain string or a list of reference strings per question, so each entry is
# flattened to a single string here.
reference_texts = [
    gt if isinstance(gt, str) else " ".join(gt)
    for gt in ground_truths
]

# Column-wise layout: every list must have one entry per question.
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": reference_texts
}
dataset = Dataset.from_dict(data)

# Evaluate
# The local Ollama chat model and embeddings are passed explicitly so the
# metrics are computed with them instead of ragas' default models.
results = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
    llm=llm,
    embeddings=embeddings
)

print(results)
# Per-question scores as a DataFrame; the trailing expression is what the
# notebook cell displays.
df = results.to_pandas()
df.head()