# Cybersecurity RAG Evaluation Notebook

This notebook runs an interactive evaluation of the RAG pipeline. It performs the same evaluation as `scripts/run_evaluation.py`, but executes it step by step with visible logs and outputs, including a confusion matrix chart.

### Setup Environment

In [None]:
# Install the packages listed in requirements.txt; -q keeps pip's output quiet.
# NOTE(review): the path is resolved against the notebook's working directory,
# while later cells read '../.env' and put '..' on sys.path -- confirm the file
# is not expected at '../requirements.txt'.
!pip install -q -r requirements.txt

### Import Libraries and Load Configuration

In [None]:
import sys
import os
import pandas as pd
from dotenv import load_dotenv
import logging

# Configure root logging so INFO-level messages are emitted with a
# timestamp and level prefix.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Put the parent of the current working directory at the front of sys.path so
# the `src.*` package imports below resolve. This must run before those imports.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.config import Config
from src.rag_pipeline.graph import build_rag_graph
from src.evaluation.evaluator import evaluate_performance

# Load environment variables from the .env file one directory above the
# working directory.
# NOTE(review): this runs after `src.config` has already been imported; if
# Config reads environment variables at import time, values from this file
# would not be picked up -- confirm against src/config.py.
load_dotenv(dotenv_path='../.env')

logging.info("Libraries and configuration loaded.")

### Run Evaluation

In [None]:
# Run the RAG graph over a sample of the evaluation set and report the results.
#
# Steps:
#   1. Refuse to start when either required setting (GROQ_API_KEY, MONGO_URI)
#      is missing, logging which one.
#   2. Read the evaluation CSV and draw a reproducible 10% sample.
#   3. Invoke the graph once per sampled row, collecting the question, the
#      generated answer, the ground-truth answer and the choices.
#   4. Hand the collected lists to evaluate_performance().
if not Config.GROQ_API_KEY:
    logging.error("GROQ_API_KEY is not set in the .env file. Evaluation cannot proceed.")
elif not Config.MONGO_URI:
    logging.error("MONGO_URI is not set in the .env file. Evaluation cannot proceed.")
else:
    # Load a sample of the eval dataset
    try:
        eval_df = pd.read_csv(Config.EVAL_DATA_PATH)
        # Use a small sample for quick testing, you can increase the fraction.
        # The fixed random_state makes the same rows get picked on every run.
        sample_df = eval_df.sample(frac=0.1, random_state=42)
        logging.info(f"Loaded {len(sample_df)} samples from {Config.EVAL_DATA_PATH}")
    except FileNotFoundError:
        logging.error(f"Evaluation file not found at {Config.EVAL_DATA_PATH}")
        sample_df = None

    if sample_df is not None:
        app = build_rag_graph()

        questions = []
        generated_answers = []
        ground_truths = []
        choices_list = []

        total = len(sample_df)
        logging.info(f"Running evaluation on {total} samples...")
        # DataFrame.sample() keeps the original row labels, so the label that
        # iterrows() yields is NOT a running position (it may even be
        # non-numeric). Count progress with enumerate() and report the label
        # separately as the question ID.
        for position, (row_label, row) in enumerate(sample_df.iterrows(), start=1):
            logging.info(f"Processing sample {position}/{total}: Question ID {row_label}")

            question = row["question"]
            ground_truth_choice = row["answer"]
            choices = row["choices"]

            # The input to the graph is a dictionary with keys matching the RAGState
            inputs = {"query": question, "conversation_history": ""}
            result = app.invoke(inputs)

            questions.append(question)
            generated_answers.append(result["answer"])
            ground_truths.append(ground_truth_choice)
            choices_list.append(choices)

        # Run the final evaluation and print the report
        evaluate_performance(questions, generated_answers, ground_truths, choices_list)