In [22]:
import os
import sys
import json
from pathlib import Path
from dotenv import load_dotenv

# Add project root to path for imports
notebook_dir = Path.cwd()
project_root = notebook_dir.parent if notebook_dir.name == "notebooks" else notebook_dir
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

load_dotenv()

# Configuration
EVAL_DIR = Path("../data/eval")
PERSIST_DIRECTORY = "../chroma_db"
COLLECTION_NAME = "health_education_chunks"
MODEL_NAME = "openai:gpt-4o-mini"
TEMPERATURE = 0.0
K_RETRIEVAL = 3

print("✓ Configuration loaded")
print(f"✓ Project root added to path: {project_root}")


✓ Configuration loaded
✓ Project root added to path: /home/mateusdelai/Desktop/applied-ai-health-rag


In [23]:
# Load all evaluation files
eval_files = sorted(EVAL_DIR.glob("*.json"))
print(f"Found {len(eval_files)} evaluation files:")
for f in eval_files:
    print(f"  - {f.name}")

# Load all evaluation data
all_eval_data = []
for eval_file in eval_files:
    with open(eval_file, "r", encoding="utf-8") as f:
        data = json.load(f)
        # Add file name for reference
        for item in data:
            item["eval_file"] = eval_file.name
        all_eval_data.extend(data)
    print(f"  Loaded {len(data)} questions from {eval_file.name}")

print(f"\n✓ Total of {len(all_eval_data)} questions loaded")


Found 4 evaluation files:
  - qa_ground_truth_diabetes_overview_v2.json
  - qa_ground_truth_diabetes_treatment.json
  - qa_ground_truth_hypertension_overview.json
  - qa_ground_truth_hypertension_treatment.json
  Loaded 12 questions from qa_ground_truth_diabetes_overview_v2.json
  Loaded 9 questions from qa_ground_truth_diabetes_treatment.json
  Loaded 6 questions from qa_ground_truth_hypertension_overview.json
  Loaded 9 questions from qa_ground_truth_hypertension_treatment.json

✓ Total of 36 questions loaded


In [24]:
# Initialize ChromaDB vector store
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
)

doc_count = vector_store._collection.count()
print(f"✓ Vector store loaded! Contains {doc_count} documents")


✓ Vector store loaded! Contains 131 documents


In [25]:
# Create retriever and chain (same pattern as notebook 5)
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Create retriever
retriever = vector_store.as_retriever(search_kwargs={"k": K_RETRIEVAL})

# Initialize LLM
llm = init_chat_model(MODEL_NAME, temperature=TEMPERATURE)

# Create template and chain
template = """Answer the question based only on the following context:
{context}

Question: {query}
"""
prompt = ChatPromptTemplate.from_template(template)
qa_chain = prompt | llm | StrOutputParser()

# Helper function to format documents
def format_docs(relevant_docs):
    return "\n".join(doc.page_content for doc in relevant_docs)

# Quick test
test_question = all_eval_data[0]["question"]
print(f"Test with question: {test_question}")
relevant_docs = retriever.invoke(test_question)
test_response = qa_chain.invoke({"context": format_docs(relevant_docs), "query": test_question})
print(f"\nResponse: {test_response[:200]}...")
print(f"\nRetrieved contexts: {len(relevant_docs)}")


Test with question: What is diabetes?



Response: Diabetes, also known as diabetes mellitus, is a disease in which blood glucose, or blood sugar, levels are too high. Glucose is the body's main source of energy and comes from the food you eat, as wel...

Retrieved contexts: 3


In [26]:
# Process all questions and create dataset for RAGAS
from ragas import EvaluationDataset

print(f"Processing {len(all_eval_data)} questions...")
dataset = []

for i, item in enumerate(all_eval_data, 1):
    question = item["question"]
    ground_truth = item["ground_truth"]
    
    if i % 5 == 0 or i == len(all_eval_data):
        print(f"  Processing question {i}/{len(all_eval_data)}...")
    
    try:
        relevant_docs = retriever.invoke(question)
        response = qa_chain.invoke({"context": format_docs(relevant_docs), "query": question})
        dataset.append({
            "user_input": question,
            "retrieved_contexts": [rdoc.page_content for rdoc in relevant_docs],
            "response": response,
            "reference": ground_truth,
        })
    except Exception as e:
        print(f"  Error processing question {i}: {e}")
        # Add entry with error
        dataset.append({
            "user_input": question,
            "retrieved_contexts": [],
            "response": f"ERROR: {str(e)}",
            "reference": ground_truth,
        })

print(f"\n✓ Dataset created with {len(dataset)} entries")
evaluation_dataset = EvaluationDataset.from_list(dataset)


Processing 36 questions...
  Processing question 5/36...
  Processing question 10/36...
  Processing question 15/36...
  Processing question 20/36...
  Processing question 25/36...
  Processing question 30/36...
  Processing question 35/36...
  Processing question 36/36...

✓ Dataset created with 36 entries


In [None]:
# Run RAGAS evaluation (same pattern as notebook 5)
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from openai import OpenAI

print("Initializing evaluator LLM...")
evaluator_llm = LangchainLLMWrapper(llm)

print("\nRunning RAGAS evaluation...")
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],
    llm=evaluator_llm,
)

print("\n✓ Evaluation completed!")
result

Initializing evaluator LLM...

Running RAGAS evaluation...


  evaluator_llm = LangchainLLMWrapper(llm)
Evaluating: 100%|██████████| 108/108 [05:12<00:00,  2.90s/it]



✓ Evaluation completed!


{'context_recall': 0.9051, 'faithfulness': 0.9954, 'factual_correctness(mode=f1)': 0.8386}

In [49]:
# Visualize detailed results
import pandas as pd

result_df = result.to_pandas()

print("Available columns:", result_df.columns.tolist())
print("\nMetrics by question:")
print("=" * 80)

# Show all metric columns with full precision
metric_cols = [col for col in result_df.columns if col not in 
               ['user_input', 'retrieved_contexts', 'response', 'reference']]
available_cols = ["user_input"] + metric_cols
available_cols = [col for col in available_cols if col in result_df.columns]

# Create a copy to format for display without modifying original precision
display_df = result_df[available_cols].copy()

# Format numeric columns to show more decimal places
pd.set_option('display.float_format', lambda x: f'{x:.10f}' if pd.notna(x) else 'NaN')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

print(display_df.to_string(index=False))

# Reset pandas display options
pd.reset_option('display.float_format')
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

print("\n\nAggregated metrics (average):")
print("=" * 80)

# Display all available metrics
metric_cols = [col for col in result_df.columns if col not in 
               ['user_input', 'retrieved_contexts', 'response', 'reference']]

for metric_col in metric_cols:
    if metric_col in result_df.columns:
        mean_val = result_df[metric_col].mean()
        # Format metric name for display
        display_name = metric_col.replace('_', ' ').title()
        if metric_col == "factual_correctness(mode=f1)":
            display_name = "Factual Correctness (F1)"
        print(f"{display_name:30s} {mean_val:.10f}")


Available columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'context_recall', 'faithfulness', 'factual_correctness(mode=f1)']

Metrics by question:
                                                                    user_input  context_recall  faithfulness  factual_correctness(mode=f1)
                                                             What is diabetes?    1.0000000000  1.0000000000                  0.7400000000
                                             Why is diabetes a health problem?    1.0000000000  1.0000000000                  0.6200000000
                                          What are the main types of diabetes?    1.0000000000  1.0000000000                  0.9100000000
                                                  What causes type 1 diabetes?    0.5000000000  1.0000000000                  0.8000000000
                                                  What causes type 2 diabetes?    1.0000000000  1.0000000000                  1.00000000

In [50]:
# Save results
OUTPUT_DIR = Path("../data/eval_results")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Save detailed results
output_file = OUTPUT_DIR / "ragas_evaluation_results.json"

# Convert DataFrame to dict preserving full precision
# Use float_format=None to preserve full precision in to_dict
detailed_results = []
for _, row in result_df.iterrows():
    record = {}
    for col in result_df.columns:
        value = row[col]
        # Preserve full precision for numeric values
        if pd.api.types.is_numeric_dtype(result_df[col]):
            record[col] = float(value) if pd.notna(value) else None
        else:
            record[col] = value
    detailed_results.append(record)

results_dict = {
    "total_questions": len(all_eval_data),
    "metrics_summary": {},
    "detailed_results": detailed_results,
}

# Add all available metrics to summary (preserve full precision)
metric_cols = [col for col in result_df.columns if col not in 
               ['user_input', 'retrieved_contexts', 'response', 'reference']]

for metric_col in metric_cols:
    if metric_col in result_df.columns:
        results_dict["metrics_summary"][metric_col] = float(result_df[metric_col].mean())

# Use ensure_ascii=False and allow_nan=False to preserve precision
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results_dict, f, indent=2, ensure_ascii=False, allow_nan=False)

print(f"✓ Results saved to: {output_file}")

# Also save as CSV for easy visualization
# Use float_format to preserve full precision (None means full precision)
csv_file = OUTPUT_DIR / "ragas_evaluation_results.csv"
result_df.to_csv(csv_file, index=False, float_format='%.15g')
print(f"✓ Results also saved as CSV: {csv_file}")


✓ Results saved to: ../data/eval_results/ragas_evaluation_results.json
✓ Results also saved as CSV: ../data/eval_results/ragas_evaluation_results.csv


In [51]:
# Analysis by evaluation file
print("Metrics by evaluation file:")
print("=" * 80)

file_metrics = {}
for eval_file in eval_files:
    file_data = [item for item in all_eval_data if item.get("eval_file") == eval_file.name]
    if file_data:
        # Find corresponding indices in dataset
        file_questions = {item["question"] for item in file_data}
        file_indices = [i for i, d in enumerate(dataset) if d["user_input"] in file_questions]
        
        if file_indices:
            file_df = result_df.iloc[file_indices]
            metrics = {
                "questions": len(file_indices),
                "file_name": eval_file.name
            }
            
            # Collect all available metrics
            metric_columns = [col for col in result_df.columns if col not in 
                            ['user_input', 'retrieved_contexts', 'response', 'reference']]
            
            for metric_col in metric_columns:
                if metric_col in file_df.columns:
                    metrics[metric_col] = float(file_df[metric_col].mean())
            
            file_metrics[eval_file.name] = metrics
            
            print(f"\n{eval_file.name}:")
            print(f"  Questions: {len(file_indices)}")
            
            # Display all metrics
            if "faithfulness" in file_df.columns:
                mean_faith = file_df['faithfulness'].mean()
                print(f"  Faithfulness:           {mean_faith:.10f}")
            if "context_recall" in file_df.columns:
                mean_recall = file_df['context_recall'].mean()
                print(f"  Context Recall:         {mean_recall:.10f}")
            if "llm_context_recall" in file_df.columns:
                mean_llm_recall = file_df['llm_context_recall'].mean()
                print(f"  LLM Context Recall:     {mean_llm_recall:.10f}")
            if "factual_correctness(mode=f1)" in file_df.columns:
                mean_factual = file_df['factual_correctness(mode=f1)'].mean()
                print(f"  Factual Correctness:    {mean_factual:.10f}")
            
            # Display any other metrics that might exist
            other_metrics = [col for col in metric_columns if col not in 
                           ['faithfulness', 'context_recall', 'llm_context_recall', 'factual_correctness(mode=f1)']]
            for metric_col in other_metrics:
                if metric_col in file_df.columns:
                    mean_val = file_df[metric_col].mean()
                    print(f"  {metric_col}:           {mean_val:.10f}")


Metrics by evaluation file:

qa_ground_truth_diabetes_overview_v2.json:
  Questions: 15
  Faithfulness:           1.0000000000
  Context Recall:         0.8444444444
  Factual Correctness:    0.8853333333

qa_ground_truth_diabetes_treatment.json:
  Questions: 12
  Faithfulness:           0.9861111111
  Context Recall:         0.9166666667
  Factual Correctness:    0.7908333333

qa_ground_truth_hypertension_overview.json:
  Questions: 6
  Faithfulness:           1.0000000000
  Context Recall:         0.9166666667
  Factual Correctness:    0.7716666667

qa_ground_truth_hypertension_treatment.json:
  Questions: 9
  Faithfulness:           1.0000000000
  Context Recall:         0.9629629630
  Factual Correctness:    0.8933333333
