In [None]:
"""
rag_eval.ipynb

This script automates the process of evaluating the RAG chatbot against a
pre-defined ground truth dataset. It initializes the full RAG pipeline,
iterates through each question in the dataset, and records the chatbot's
generated answer, the retrieved context, and the response time.

The final output is a CSV file containing the results, which can then be
used for scoring and analysis.
"""

In [1]:
# ==================================================================================================
# --- 1. Import Dependencies ---
# ==================================================================================================
import pandas as pd
import json
import time
import sys
import logging
from pathlib import Path
from types import SimpleNamespace

# --- Define Project Root and Add to Python Path ---
# The directory one level above the current working directory is appended to
# sys.path (only when absent) -- presumably so the `core` package imported
# right after this can be resolved; confirm against the repository layout.
project_code_root = Path.cwd().parent.resolve()
if str(project_code_root) not in sys.path:
    sys.path.append(str(project_code_root))
    
from core.rag_setup import setup_chatbot
from core import config

In [2]:
# ==================================================================================================
# --- 2. Logging Configuration ---
# ==================================================================================================
# Root-logger setup: INFO threshold, with each record rendered as
# "<timestamp> - [<LEVEL>] - <message>" using a second-resolution timestamp.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - [%(levelname)s] - %(message)s',
    level=logging.INFO,
)

In [3]:
# ==================================================================================================
# --- 3. Core Evaluation Functions ---
# ==================================================================================================

def run_evaluation_pipeline(chain, dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Runs every ground-truth question through the RAG chain and collects the results.

    Each row of ``dataset`` is read from its ``question`` and ``expected_answer``
    columns, which are parsed as JSON objects holding a ``question`` and an
    ``expected_answer`` key respectively. Exactly one result row is produced per
    input row, in input order. A row that fails at any stage (parsing, chain
    invocation, result extraction) is logged and recorded with a
    ``generated_answer`` of ``"ERROR: <message>"`` rather than aborting the run.

    Args:
        chain: The initialized RAG chain. It is called as ``chain.invoke(question_text)``
            and must return a mapping; the ``answer`` and ``source_documents`` keys are
            read from it (each document needs a ``page_content`` attribute).
        dataset (pd.DataFrame): The DataFrame containing the ground truth questions and answers.

    Returns:
        pd.DataFrame: One row per input row with the columns ``question``,
        ``ground_truth_answer``, ``generated_answer``, ``retrieved_context`` and
        ``response_time_seconds``. The columns are present even when ``dataset`` is empty.
    """
    result_columns = [
        "question",
        "ground_truth_answer",
        "generated_answer",
        "retrieved_context",
        "response_time_seconds",
    ]
    results = []
    total_questions = len(dataset)
    logging.info(f"Starting evaluation for {total_questions} questions...")

    # Progress is counted positionally: the DataFrame index is not guaranteed to be
    # a 0..n-1 integer range, so arithmetic on it is unsafe.
    for position, (index, row) in enumerate(dataset.iterrows(), start=1):
        # Seed with the raw cell values so a failure in this row can never report
        # text left over from the previous row, nor reference an unbound name.
        question_text = row.get('question')
        ground_truth_answer_text = row.get('expected_answer')

        try:
            # The source CSV contains JSON strings; they must be parsed.
            question_json = json.loads(row['question'])
            ground_truth_json = json.loads(row['expected_answer'])

            question_text = question_json['question']
            ground_truth_answer_text = ground_truth_json['expected_answer']

            # --- Query RAG System and Measure Latency ---
            # perf_counter is monotonic, so the measured interval cannot be skewed
            # (or go negative) if the system clock is adjusted mid-run.
            start_time = time.perf_counter()
            response = chain.invoke(question_text)
            response_time = time.perf_counter() - start_time

            # --- Extract and Structure Data ---
            generated_answer = response.get('answer', 'Error: No answer found.')
            retrieved_context = " ".join(
                doc.page_content for doc in response.get('source_documents', [])
            )

            record = {
                "question": question_text,
                "ground_truth_answer": ground_truth_answer_text,
                "generated_answer": generated_answer,
                "retrieved_context": retrieved_context,
                "response_time_seconds": response_time
            }
        except Exception as e:
            logging.error(f"Failed to process question at index {index}: {e}")
            # Record a placeholder row for the failure to maintain dataset alignment.
            record = {
                "question": question_text,
                "ground_truth_answer": ground_truth_answer_text,
                "generated_answer": f"ERROR: {e}",
                "retrieved_context": "",
                "response_time_seconds": 0
            }
        else:
            logging.info(f"Processed question {position}/{total_questions} in {response_time:.2f} seconds")

        # Appending happens exactly once per row, outside the try, so no later
        # failure can add a second record for the same question.
        results.append(record)

    return pd.DataFrame(results, columns=result_columns)

In [4]:
def main():
    """
    Run one end-to-end evaluation and write the generated answers to a CSV file.

    Steps, in order:
      1. Derive the output file name from the last ``_``-separated piece of
         ``config.MODEL_NAME`` (a fixed default name is used if that fails).
      2. Build the chain via ``setup_chatbot(config)``.
      3. Read ``ground_truth_dataset.csv`` from the ``evaluation_data`` directory
         located two levels above the current working directory.
      4. Run ``run_evaluation_pipeline`` and save its result next to the input.

    Failures in steps 2-4 are logged and swallowed; nothing is returned or raised
    for them.
    """
    workspace_dir = Path.cwd().parent.parent.resolve()
    run_started_at = time.time()

    # --- Pick the results file name, tagged with the model version when available ---
    try:
        version_tag = config.MODEL_NAME.split('_')[-1]
    except (IndexError, AttributeError):
        # MODEL_NAME is missing or not string-like: fall back to the untagged name.
        logging.warning("Could not parse model version from config.MODEL_NAME. Using default filename.")
        results_name = "rag_eval_generated_answers.csv"
    else:
        results_name = f"rag_eval_generated_answers_{version_tag}.csv"

    # --- Resolve where the dataset is read from and where results are written ---
    data_dir = workspace_dir / "evaluation_data"
    input_csv_path = data_dir / "ground_truth_dataset.csv"
    output_csv_path = data_dir / results_name
    # Create the destination folder up front so the final save cannot fail on it.
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        # Stage 1: bring up the RAG system (only the first returned element is used).
        logging.info("Initializing RAG system with evaluation configuration...")
        rag_chain, _, _ = setup_chatbot(config)
        logging.info("RAG system initialization complete.")

        # Stage 2: load the questions and expected answers.
        logging.info(f"Loading ground truth dataset from: {input_csv_path}")
        ground_truth_df = pd.read_csv(input_csv_path)

        # Stage 3: answer every question.
        answers_df = run_evaluation_pipeline(rag_chain, ground_truth_df)

        # Stage 4: persist the answers (written with the "utf-8-sig" encoding, no index column).
        logging.info(f"Saving evaluation results to: {output_csv_path}")
        answers_df.to_csv(output_csv_path, index=False, encoding="utf-8-sig")

        elapsed_seconds = time.time() - run_started_at
        logging.info(f"Evaluation script finished successfully in {elapsed_seconds:.2f} seconds.")

    except FileNotFoundError as missing:
        logging.error(f"File not found: {missing}. Please ensure all paths are correct.")
    except Exception as unexpected:
        logging.error(f"An unexpected error occurred during the evaluation: {unexpected}")

In [None]:
# Entry point: run the full evaluation only when this code executes as `__main__`.
if __name__ == "__main__":
    main()