# Result analysis

In [None]:
from dotenv import load_dotenv
import os

# Repository root: two directory levels up from the notebook's working directory
root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Read the repository-level .env file into the process environment
load_dotenv(os.path.join(root_dir, '.env'))

In [None]:
# Parameters

# When True, the plotting cells also save each figure as a PDF in the plots directory.
SAVE_IMAGES = True
# Passed as `override` to backfill_missing_evaluations: re-evaluate every row,
# not only the rows whose eval_* values are missing.
OVERRIDE_EVAL = False
# Passed as `dry_run` to backfill_missing_evaluations: compute evaluations
# without writing them back to the result JSON files.
DRIRUN_EVAL = False

## Import data

### Load experiment results

In [None]:
# Directory that holds one sub-folder per experiment batch.
# Fail fast when the variable is missing: os.listdir(None) would silently list the
# current working directory and the later path joins would raise an unrelated TypeError.
RESULTS_DIR = os.environ.get("RESULTS_PATH")
if not RESULTS_DIR:
    raise EnvironmentError(
        "RESULTS_PATH is not set. Define it in the repository .env file or in the environment."
    )
print(f"The results are stored in the directory {RESULTS_DIR}")

In [None]:
# os.listdir returns entries in arbitrary order; sort the batch folders so the
# numeric IDs printed below are stable between runs and machines.
experiment_batches = sorted(
    d for d in os.listdir(RESULTS_DIR) if os.path.isdir(os.path.join(RESULTS_DIR, d))
)

# Print each folder with the number of JSON result files it contains
for i, d in enumerate(experiment_batches, 1):
    dir_path = os.path.join(RESULTS_DIR, d)
    json_count = sum(
        1 for f in os.listdir(dir_path)
        if f.endswith('.json') and os.path.isfile(os.path.join(dir_path, f))
    )
    print(f"{i}) {d} ({json_count} result{'s' if json_count != 1 else ''})")

In [None]:
# Without any batch folder there is nothing to select or fall back to; stop with a
# clear message instead of an IndexError further down.
if not experiment_batches:
    raise RuntimeError(f"No experiment batch folders found in {RESULTS_DIR}")

experiment_ids_input = input("Choose experiment ID(s) (single number or comma-separated list, e.g., 1 or 1,2,3): ")

selected_ids = []
try:
    # Parse input - handle both single number and comma-separated list
    ids_str = experiment_ids_input.replace(" ", "")  # Remove spaces
    id_list = [int(x.strip()) for x in ids_str.split(",")]

    # Validate all IDs; keep every valid ID once, in the order it was given,
    # so the same folder is never loaded twice.
    for id_val in id_list:
        if id_val < 1 or id_val > len(experiment_batches):
            print(f"Error: ID {id_val} is out of range (1-{len(experiment_batches)}). Skipping.")
        elif id_val - 1 in selected_ids:
            print(f"Note: ID {id_val} was given more than once. Ignoring the duplicate.")
        else:
            selected_ids.append(id_val - 1)  # Convert to 0-indexed

    if not selected_ids:
        print("Error: No valid IDs provided. Using first experiment.")
        selected_ids = [0]

except ValueError:
    # Raised by int() when a token is not a number (including empty tokens such as "1,,2")
    print("Error: Invalid input format. Please enter numbers separated by commas.")
    selected_ids = [0]

# Get selected folder names and their base directories
selected_experiment_dirs = [os.path.join(RESULTS_DIR, experiment_batches[batch_idx]) for batch_idx in selected_ids]
exp_folder_names = [experiment_batches[batch_idx] for batch_idx in selected_ids]

print(f"\nSelected {len(selected_ids)} experiment batch(es):")
for i, folder_name in enumerate(exp_folder_names, 1):
    print(f"  {i}) {folder_name}")

# Set EXPERIMENT_DIR to the first selected directory
EXPERIMENT_DIR = selected_experiment_dirs[0]
print(f"\nEXPERIMENT_DIR set to: {EXPERIMENT_DIR}")


In [None]:
# Map every selected directory to the names of the JSON result files it contains
all_exp_results_files = {
    exp_dir: [name for name in os.listdir(exp_dir) if name.endswith('.json')]
    for exp_dir in selected_experiment_dirs
}

# Report the per-directory counts in selection order
for exp_dir in selected_experiment_dirs:
    exp_results_files = all_exp_results_files[exp_dir]
    print(f"{exp_dir}: Found {len(exp_results_files)} JSON experiments")

total_experiments = sum(map(len, all_exp_results_files.values()))
print(f"\nTotal: {total_experiments} experiments across all selected folders")


In [None]:
import pandas as pd

LOAD_FROM_CSV = False
experiments_df = pd.DataFrame()
folders_load_method = {}  # folder name -> "JSON" or "CSV (<file>)"

separator = '=' * 60

# Ask, folder by folder, whether to read a previously exported CSV or the raw JSON files
for exp_dir in selected_experiment_dirs:
    folder_name = os.path.basename(exp_dir)
    csv_files = [f for f in os.listdir(exp_dir) if f.endswith('.csv')]

    print(f"\n{separator}")
    print(f"Processing folder: {folder_name}")
    print(separator)

    # No CSV export available: JSON is the only option
    if not csv_files:
        print("No CSV files found. Will load from JSON files...")
        folders_load_method[folder_name] = "JSON"
        continue

    print(f"Found {len(csv_files)} CSV file(s):")
    for position, csv_name in enumerate(csv_files, start=1):
        print(f"  {position}) {csv_name}")
    print("  0) Load from JSON files (default)")

    choice = input(f"Choose data source for {folder_name} (0 for JSON, or CSV file number): ")

    try:
        choice_id = int(choice)
        if 1 <= choice_id <= len(csv_files):
            # A valid CSV number: read it and append its rows to the combined frame
            csv_file = csv_files[choice_id - 1]
            csv_path = os.path.join(exp_dir, csv_file)
            print(f"Loading from CSV: {csv_file}")
            df_temp = pd.read_csv(csv_path)
            experiments_df = pd.concat([experiments_df, df_temp], ignore_index=True)
            folders_load_method[folder_name] = f"CSV ({csv_file})"
            LOAD_FROM_CSV = True
        else:
            # 0 or any out-of-range number selects JSON
            print("Loading from JSON files...")
            folders_load_method[folder_name] = "JSON"
    except ValueError:
        print("Invalid input. Loading from JSON files...")
        folders_load_method[folder_name] = "JSON"

print(f"\n{separator}")
print("Summary of load methods:")
for folder, method in folders_load_method.items():
    print(f"  {folder}: {method}")
print(f"{separator}\n")

# Folders marked "JSON" are loaded by the next cell
print(
    f"Loaded {len(experiments_df)} experiments from CSV files"
    if LOAD_FROM_CSV
    else "Will load experiments from JSON files..."
)


In [None]:
import json

# Check if any folders need to be loaded from JSON
folders_to_load_json = [folder for folder, method in folders_load_method.items() if method == "JSON"]

if folders_to_load_json:
    # Iterate over all selected experiment directories
    for exp_dir in selected_experiment_dirs:
        folder_name = os.path.basename(exp_dir)
        # Only load JSON for folders where the JSON option was selected
        if folders_load_method.get(folder_name) != "JSON":
            continue

        exp_results_files = all_exp_results_files[exp_dir]
        print(f"\nLoading experiments from {folder_name}...")

        # Rows are collected in a list and turned into a DataFrame once per folder;
        # concatenating one row at a time copies the whole frame on every iteration.
        records = []

        for experiment in exp_results_files:
            try:
                with open(os.path.join(exp_dir, experiment), 'r') as file:
                    data = json.load(file)

                # Sub-sections of the result file; each may be absent
                final_report = data.get("final_report", {})
                testbed = data.get("testbed", {})
                stats = data.get("stats", {})
                agent_stats = stats.get("agent_stats", {})
                evaluation_data = data.get("evaluation", {})

                # Flatten the localization list into a comma-separated string;
                # str() keeps non-string items from breaking the join.
                localization = final_report.get("localization", [])
                if isinstance(localization, list):
                    localization_str = ", ".join(str(item) for item in localization)
                else:
                    localization_str = None

                records.append({
                    "experiment_file": experiment,
                    "agent_id" : data.get("agent_id", None),
                    "agent_conf_name" : data.get("agent_configuration_name", None),
                    "scenario": data.get("app_name", None),
                    "fault_name": testbed.get("fault_name", None),
                    "target_namespace": data.get("target_namespace", None),
                    "trace_service_starting_point": data.get("trace_service_starting_point", None),
                    "rca_tasks_per_iteration": testbed.get("rca_tasks_per_iteration", 0),
                    "max_tool_calls": testbed.get("max_tool_calls", 0),
                    "execution_time_seconds": stats.get("execution_time_seconds", 0),
                    "total_tokens": stats.get("total_tokens", 0),
                    "tokens_triage": agent_stats.get("triage_agent", {}).get("total_tokens", 0),
                    "tokens_planner": agent_stats.get("planner_agent", {}).get("total_tokens", 0),
                    "tokens_rca_worker": agent_stats.get("rca_agent", {}).get("total_tokens", 0),
                    "runs_count_rca": agent_stats.get("rca_agent", {}).get("runs_count", 0),
                    "tokens_supervisor": agent_stats.get("supervisor_agent", {}).get("total_tokens", 0),
                    "runs_count_supervisor": agent_stats.get("supervisor_agent", {}).get("runs_count", 0),
                    "detection": final_report.get("detection", None),
                    "localization": localization_str,
                    "root_cause": final_report.get("root_cause", None),
                    "eval_detection" : evaluation_data.get("detection", None),
                    "eval_localization" : evaluation_data.get("localization", None),
                    "eval_rca_score" : evaluation_data.get("rca_score", None),
                    "eval_rca_motivation" : evaluation_data.get("rca_motivation", None),
                })

            except (json.JSONDecodeError, KeyError, AttributeError, TypeError) as e:
                # AttributeError/TypeError cover files whose sections are not objects
                # (e.g. "final_report": null); such files are skipped like invalid JSON.
                print(f"Warning: Error processing {experiment}: {str(e)}")
                continue

        if records:
            folder_df = pd.DataFrame(records)
            # Skip the concat while nothing has been loaded yet to avoid mixing
            # the column-less placeholder frame into dtype inference.
            if experiments_df.empty:
                experiments_df = folder_df
            else:
                experiments_df = pd.concat([experiments_df, folder_df], ignore_index=True)

        # Count only the files that were actually parsed into a row
        print(f"Loaded {len(records)} experiments from {os.path.basename(exp_dir)}")

    print(f"\nTotal experiments loaded: {len(experiments_df)}")


In [None]:
# Get the distinct application scenarios (lower-cased) present in the results.
# Missing scenario values are dropped first: a NaN entry would make the join below
# raise a TypeError.
scenarions = experiments_df["scenario"].dropna().astype(str).str.lower().unique()

# Comma-separated scenario names, used in the plot titles
SCENARIOS_NAME = ", ".join(scenarions)

SCENARIOS_NAME

### Load fault types

In [None]:
# Directory containing one JSON configuration file per fault scenario.
# Fail fast when the variable is missing: os.listdir(None) would silently list the
# current working directory and report whatever JSON files happen to be there.
FAULTS_CONF_DIR = os.environ.get("FAULTS_CONF_DIR")
if not FAULTS_CONF_DIR:
    raise EnvironmentError(
        "FAULTS_CONF_DIR is not set. Define it in the repository .env file or in the environment."
    )

faults_conf_files = [f for f in os.listdir(FAULTS_CONF_DIR) if f.endswith('.json')]

print(f"Found {len(faults_conf_files)} fault configuration files:")

for f in faults_conf_files:
    print(f"- {f}")

In [None]:
# Map "<app_name> - <fault_type>" to the fault target declared in each configuration file
fault_target_dict = {}

for f in faults_conf_files:
    with open(os.path.join(FAULTS_CONF_DIR, f), 'r') as file:
        data = json.load(file)
    # Single quotes inside the f-string: reusing the enclosing double quotes is a
    # SyntaxError on Python versions before 3.12.
    formatted_key = f"{data['app_name']} - {data['fault_type']}"
    fault_target_dict[formatted_key] = data.get("target", None)

fault_target_dict

### Load agent configurations

In [None]:
# agent_name_dict: configuration name -> agent id
# agent_desc_dict: agent id -> {"name", "description"}
agent_name_dict = {}
agent_desc_dict = {}

# Fail fast when the variable is missing: os.listdir(None) would silently list the
# current working directory instead of the agent configurations.
AGENTS_CONF_DIR = os.environ.get("AGENTS_CONF_DIR")
if not AGENTS_CONF_DIR:
    raise EnvironmentError(
        "AGENTS_CONF_DIR is not set. Define it in the repository .env file or in the environment."
    )

agents_conf_files = [f for f in os.listdir(AGENTS_CONF_DIR) if f.endswith('.json')]

for f in agents_conf_files:
    with open(os.path.join(AGENTS_CONF_DIR, f), 'r') as file:
        data = json.load(file)
    agent_name_dict[data.get("name")] = data.get("id")
    agent_desc_dict[data.get("id")] = {
        "name" : data.get("name"),
        "description" : data.get("description", "")
    }

In [None]:
# Show the configuration name -> agent id mapping as the cell output
agent_name_dict

In [None]:
# Add agent_id where it is not present in the results.
# Only the missing entries are filled; IDs already stored in the results are kept.

def _agent_id_for(conf_name):
    """Return the id of the first known agent whose name occurs in `conf_name`, else None."""
    if not isinstance(conf_name, str):
        # Missing configuration name (None/NaN): nothing to match against
        return None
    return next(
        (
            agent_id
            for known_name, agent_id in agent_name_dict.items()
            if isinstance(known_name, str) and known_name in conf_name
        ),
        None,
    )

missing_agent_id = experiments_df['agent_id'].isna()

if missing_agent_id.any():
    inferred_agent_id = experiments_df['agent_conf_name'].map(_agent_id_for)
    # where() keeps the existing value when the condition holds and takes the
    # inferred one otherwise
    experiments_df['agent_id'] = experiments_df['agent_id'].where(~missing_agent_id, inferred_agent_id)

## Evaluation

In [None]:
# Import evaluation helpers from repo package
import sys, os

# Ensure we can import from `sre-agent` package:
# append <root_dir>/sre-agent to the module search path so that the
# `evaluation.evaluation` import below resolves.
sys.path.append(os.path.abspath(os.path.join(root_dir, 'sre-agent')))

from evaluation.evaluation import evaluate_experiment

In [None]:
# Ground truth per "<app_name> - <fault_type>": the fault target and the reference RCA text
faults_gt_dict = {}
try:
    for conf_name in faults_conf_files:
        conf_path = os.path.join(FAULTS_CONF_DIR, conf_name)
        with open(conf_path, 'r') as handle:
            conf = json.load(handle)
        scenario_key = f"{conf['app_name']} - {conf['fault_type']}"
        faults_gt_dict[scenario_key] = dict(
            target=conf.get("target", None),
            RCA_gt=conf.get("RCA_gt", ""),
        )
    print(f"Loaded GT for {len(faults_gt_dict)} scenarios.")
except Exception as e:
    # Any failure stops the loop; entries collected so far stay in the dict
    print(f"[warn] Failed building faults_gt_dict: {e}")

In [None]:
def build_fault_scenario(row):
    """Return the ground truth for a result row.

    The row's `scenario` and `fault_name` form the lookup key
    "<scenario> - <fault_name>" into `faults_gt_dict`. Unknown keys yield
    `{"target": None, "RCA_gt": ""}`.
    """
    ground_truth = faults_gt_dict.get(f"{row['scenario']} - {row['fault_name']}", {})
    target = ground_truth.get("target", None)
    rca_gt = ground_truth.get("RCA_gt", "")
    return {"target": target, "RCA_gt": rca_gt}

def build_report_from_row(row):
    """Rebuild the report structure passed to `evaluate_experiment` from a result row.

    Args:
        row: mapping-like row (e.g. a pandas Series) with a `localization` entry and,
            optionally, `agent_conf_name`, `detection` and `root_cause`.

    Returns:
        dict with `agent_configuration_name` and a `final_report` holding
        `detection` (bool), `localization` (list) and `root_cause` (str).
    """
    def _is_missing(value):
        # None or NaN; NaN is the only float that is not equal to itself
        return value is None or (isinstance(value, float) and value != value)

    # Convert localization to list for evaluate_experiment compatibility
    loc_val = row['localization']
    if isinstance(loc_val, str):
        loc_list = [part.strip() for part in loc_val.split(',') if part.strip()]
    elif isinstance(loc_val, list):
        loc_list = loc_val
    else:
        loc_list = []

    # bool(NaN) is True, so a missing detection must be mapped to False explicitly
    detection = row.get("detection", False)
    detection_flag = False if _is_missing(detection) else bool(detection)

    root_cause = row.get("root_cause", "")

    return {
        "agent_configuration_name": row.get("agent_conf_name", "N/A"),
        "final_report": {
            "detection": detection_flag,
            "localization": loc_list,
            "root_cause": root_cause if isinstance(root_cause, str) else ""
        }
    }

In [None]:
from typing import Tuple, Optional
from tqdm import tqdm
from datetime import datetime

def _resolve_experiment_path(file_name, search_dirs, default_dir):
    """Return the path of `file_name` in the first of `search_dirs` that contains it.

    Falls back to `default_dir` when no directory contains the file, so the caller's
    error handling reports the missing file.
    """
    for candidate_dir in search_dirs:
        candidate_path = os.path.join(candidate_dir, file_name)
        if os.path.isfile(candidate_path):
            return candidate_path
    return os.path.join(default_dir, file_name)


def backfill_missing_evaluations(limit: Optional[int] = None, dry_run: bool = False, override: bool = False) -> Tuple[int, int]:
    """
    For rows where eval_* fields are missing, compute evaluation via evaluate_experiment,
    persist to the result JSON (adds/overwrites the "evaluation" key), and update the DataFrame.

    When `override` is set (and `dry_run` is not), the updated DataFrame is also exported
    to `experiment_results_<YYYY-MM-DD>.csv` inside EXPERIMENT_DIR.

    Args:
        limit: optional maximum number of rows to process.
        dry_run: if True, do not write files; only compute and update the DataFrame in-memory.
        override: if True, recompute evaluation for all rows; if False, only compute for missing values.

    Returns:
        (processed_count, file_updates)
    """
    required_cols = ['eval_detection', 'eval_localization', 'eval_rca_score', 'eval_rca_motivation']

    if override:
        idxs = experiments_df.index.tolist()
    else:
        # A row needs evaluation as soon as any of the eval_* columns is missing
        missing_mask = experiments_df[required_cols].isna().any(axis=1)
        idxs = experiments_df[missing_mask].index.tolist()

    if limit is not None:
        idxs = idxs[:limit]

    processed = 0
    updated_files = 0

    for idx in tqdm(idxs, desc="Backfilling evaluations", unit="row"):
        row = experiments_df.loc[idx]
        # Build inputs for evaluation
        fault_scenario = build_fault_scenario(row)
        report = build_report_from_row(row)

        # Run evaluation (this may call the LLM for RCA scoring)
        evaluation = evaluate_experiment(fault_scenario, report)
        if not isinstance(evaluation, dict):
            print(f"[warn] Row {idx}: evaluate_experiment returned non-dict; skipping.")
            continue

        # Update dataframe in-memory
        experiments_df.loc[idx, 'eval_detection'] = evaluation.get('detection', None)
        experiments_df.loc[idx, 'eval_localization'] = evaluation.get('localization', None)
        experiments_df.loc[idx, 'eval_rca_score'] = evaluation.get('rca_score', None)
        experiments_df.loc[idx, 'eval_rca_motivation'] = evaluation.get('rca_motivation', None)

        # Update json file
        if not dry_run:
            file_name = row['experiment_file']
            # Rows may come from any of the selected folders, not only the first one
            # (EXPERIMENT_DIR), so look the file up in every selected directory.
            # NOTE(review): if the same file name exists in several selected folders the
            # first match is used — rows do not record their source folder.
            file_path = _resolve_experiment_path(file_name, selected_experiment_dirs, EXPERIMENT_DIR)
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)
                data['evaluation'] = evaluation
                with open(file_path, 'w') as f:
                    json.dump(data, f, indent=2)
                updated_files += 1
            except Exception as e:
                print(f"[warn] Row {idx}: failed writing evaluation to {file_path}: {e}")

        processed += 1

    # The CSV export is a file write too, so it is skipped on a dry run
    if override and not dry_run:
        # File name: "experiment_results_" followed by today's date
        date_str = datetime.now().strftime("%Y-%m-%d")
        out_file = f"experiment_results_{date_str}.csv"
        experiments_df.to_csv(os.path.join(EXPERIMENT_DIR, out_file), index=False)

    print(f"Backfill completed. Processed rows: {processed}. Files updated: {updated_files}.")
    return processed, updated_files




In [None]:
# Run the evaluation backfill with the flags from the parameters cell; the returned
# (processed rows, updated files) tuple is shown as the cell output.
backfill_missing_evaluations(override=OVERRIDE_EVAL, dry_run=DRIRUN_EVAL)

# Exclude results of fault scenarios

Exclude results from fault scenarios that do not work on the VM due to virtualization limitations.

In [None]:
# One "scenario - fault type" label per fault configuration file
scenario_fault_list = []

for conf_file in faults_conf_files:
    conf_path = os.path.join(FAULTS_CONF_DIR, conf_file)
    with open(conf_path, 'r') as handle:
        conf = json.load(handle)
    scenario_fault_list.append("{} - {}".format(conf.get("app_name"), conf.get("fault_type")))

scenario_fault_list

In [None]:
# Lower-case "scenario - fault name" labels to drop from the analysis.
# The filter cell lower-cases each row's scenario and fault name before comparing,
# so entries here must be written in lower case.
scenarios_fault_to_exclude = [
    "hotel reservation - container kill",
    "hotel reservation - pod kill",
    "hotel reservation - redeploy without pv"
]

In [None]:
# Filter out excluded fault scenarios from experiments_df.
# The lower-case "scenario - fault name" key is built column-wise; astype(str) keeps
# rows with a missing scenario or fault name from raising (they become "nan" and
# never match an excluded label, so they are kept).
scenario_fault_key = (
    experiments_df['scenario'].astype(str).str.lower()
    + " - "
    + experiments_df['fault_name'].astype(str).str.lower()
)
experiments_df = experiments_df[~scenario_fault_key.isin(scenarios_fault_to_exclude)]

print(f"Filtered experiments_df. Remaining rows: {len(experiments_df)}")

In [None]:
# Show the experiments that remain after the exclusion filter
experiments_df

## Data exploration

In [None]:
# Per-fault-type summary, ordered by localization accuracy (best first)
fault_group = (
    experiments_df
    .groupby('fault_name')
    .agg(
        experiments=('experiment_file', 'count'),
        detection_rate=('detection', 'mean'),
        detection_accuracy=('eval_detection', 'mean'),
        correct_detections=('eval_detection', 'sum'),
        localization_accuracy=('eval_localization', 'mean'),
        correct_localizations=('eval_localization', 'sum'),
        avg_exec_time=('execution_time_seconds', 'mean'),
        avg_tokens=('total_tokens', 'mean'),
        avg_rca_score=('eval_rca_score', 'mean'),
    )
    .sort_values('localization_accuracy', ascending=False)
)
fault_group

In [None]:
# Per-agent-configuration summary, ordered by localization accuracy (best first).
# The raw detection rate and the absolute correct-detection/localization counts
# are left out of this view.
agent_group = (
    experiments_df
    .groupby('agent_id')
    .agg(
        experiments=('experiment_file', 'count'),
        detection_accuracy=('eval_detection', 'mean'),
        localization_accuracy=('eval_localization', 'mean'),
        avg_exec_time=('execution_time_seconds', 'mean'),
        avg_tokens=('total_tokens', 'mean'),
        avg_rca_score=('eval_rca_score', 'mean'),
    )
    .sort_values('localization_accuracy', ascending=False)
)
agent_group

In [None]:
# Per-application summary; scenario names are lower-cased so that spelling variants
# of the same application fall into one group. Ordered by localization accuracy.
scenario_group = (
    experiments_df
    .groupby(experiments_df['scenario'].str.lower())
    .agg(
        experiments=('experiment_file', 'count'),
        detection_accuracy=('eval_detection', 'mean'),
        localization_accuracy=('eval_localization', 'mean'),
        avg_exec_time=('execution_time_seconds', 'mean'),
        avg_tokens=('total_tokens', 'mean'),
        avg_rca_score=('eval_rca_score', 'mean'),
    )
    .sort_values('localization_accuracy', ascending=False)
)
scenario_group

## Data visualization

In [None]:
import re
import matplotlib.pyplot as plt

# Global counter for plot numbering; incremented before each figure is saved
plot_counter = 0
plots_dir = os.path.join(EXPERIMENT_DIR, 'plots')

# Create the plots directory when figures are going to be saved.
# exist_ok=True makes the call safe to repeat and avoids the gap between a separate
# existence check and the creation.
if SAVE_IMAGES:
    os.makedirs(plots_dir, exist_ok=True)


In [None]:
# choose the palette
# "Color_Blind" is loaded once here; the plotting cells sample `cmap` at evenly
# spaced positions to assign one color per fault type.
from pypalettes import load_cmap
cmap = load_cmap("Color_Blind")

In [None]:
# Show the loaded colormap as the cell output
cmap

### Scatter plots

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of execution time vs. total tokens, one marker per experiment.
# Color encodes the fault type; a filled marker means eval_detection is truthy,
# a hollow marker means it is falsy. Each marker is labelled with its agent_id.

# Create figure and axis
fig, ax = plt.subplots(figsize=(14, 8))

# Get unique fault types for coloring:
# the colormap is sampled at evenly spaced positions in [0, 1], one per fault type
fault_types = experiments_df['fault_name'].unique()
colors = cmap(np.linspace(0, 1, len(fault_types)))
color_map = {fault: colors[i] for i, fault in enumerate(fault_types)}

# Plot each experiment as a dot
for idx, row in experiments_df.iterrows():
    x = row['execution_time_seconds']
    y = row['total_tokens']
    fault = row['fault_name']
    color = color_map[fault]
    detection = row['eval_detection']
    
    # Plot the dot - filled if detection is True, only border if False
    # NOTE(review): a missing eval_detection stored as NaN is truthy and would be
    # drawn as a filled (correct) marker — confirm all rows are evaluated here.
    if detection:
        ax.scatter(x, y, s=100, color=color, alpha=0.7, edgecolors='black', linewidth=0.5)
    else:
        ax.scatter(x, y, s=100, facecolors='none', edgecolors=color, linewidth=2, alpha=0.7)
    
    # Add label with agent configuration on top of the dot
    # (offset 7 points upwards so the text does not cover the marker)
    label = f"{row['agent_id']}"
    ax.annotate(label, (x, y), fontsize=8, ha='center', va='bottom', fontweight='bold', 
                xytext=(0, 7), textcoords='offset points')

# Create legend for fault types (proxy handles: one colored dot per fault type)
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color_map[fault], 
                       markersize=8, label=fault) for fault in fault_types]

# Add detection legend (gray proxies for the filled vs. hollow marker styles)
detection_handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='gray', 
               markersize=8, label='Correct detection', markeredgecolor='black', markeredgewidth=0.5),
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='none', 
               markersize=8, label='Wrong detection', markeredgecolor='gray', markeredgewidth=2)
]

# Combine all handles into a single legend
all_handles = handles + detection_handles
ax.legend(handles=all_handles, loc='upper left', fontsize=10)

# Labels and title
ax.set_xlabel('Execution Time (seconds)', fontsize=12, fontweight='bold')
ax.set_ylabel('Total Tokens', fontsize=12, fontweight='bold')
ax.set_title(f'Detection Accuracy: Execution Time vs Token Usage - {SCENARIOS_NAME.title()}', fontsize=14, fontweight='bold')

# Grid
ax.grid(True, alpha=0.3, linestyle='--', linewidth=1)

plt.tight_layout()
# Save as a numbered PDF in plots_dir; the counter is only advanced when saving
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_scatter_detection_accuracy.pdf')

    fig.savefig(filename, format='pdf', bbox_inches='tight')

    print(f"Saved plot to {filename}")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# adjustText is optional: it only de-overlaps the labels. The import itself is
# guarded, because an unguarded import fails before any later handler can run.
try:
    from adjustText import adjust_text
except ImportError:
    adjust_text = None

# Scatter plot of execution time vs. total tokens, one marker per experiment.
# Color encodes the fault type; a filled circle means eval_localization is truthy,
# an 'x' means it is falsy. Each marker is labelled with its agent_id.

# Create figure and axis
fig, ax = plt.subplots(figsize=(16, 9))

# Get unique fault types for coloring:
# the colormap is sampled at evenly spaced positions in [0, 1], one per fault type
fault_types = experiments_df['fault_name'].unique()
colors = cmap(np.linspace(0, 1, len(fault_types)))
color_map = {fault: colors[i] for i, fault in enumerate(fault_types)}

# Collect points and labels for later adjustment
texts = []

# Plot each experiment as a dot
for idx, row in experiments_df.iterrows():
    x = row['execution_time_seconds']
    y = row['total_tokens']
    fault = row['fault_name']
    color = color_map[fault]
    correct_localization = row['eval_localization']

    # Plot the dot - filled if correct_localization is True, X marker if False
    # NOTE(review): a missing eval_localization stored as NaN is truthy and would be
    # drawn as a correct marker — confirm all rows are evaluated here.
    if correct_localization:
        ax.scatter(x, y, s=150, color=color, alpha=0.8, edgecolors='black', linewidth=1.2)
    else:
        ax.scatter(x, y, s=150, marker='x', color=color, linewidth=2.5, alpha=0.8)

    # Add label with agent configuration
    label = f"{row['agent_id']}"
    text_obj = ax.text(x, y, label, fontsize=9, ha='center', va='center', fontweight='bold',
                       bbox=dict(boxstyle='round,pad=0.4', facecolor='white', edgecolor='gray', alpha=0.85, linewidth=0.8),
                       zorder=10)
    texts.append(text_obj)

# Adjust text positions to avoid overlaps (skipped when adjustText is unavailable)
if adjust_text is not None:
    adjust_text(texts, arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.2',
                                       lw=0.8, color='gray', alpha=0.6),
                ax=ax, precision=0.1, expand_points=(1.5, 1.5))
else:
    print("Note: adjustText not installed. Labels may overlap. Install with: pip install adjustText")

# Create legend for fault types (proxy handles: one colored dot per fault type)
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color_map[fault],
                       markersize=10, label=fault, markeredgecolor='black', markeredgewidth=1)
           for fault in sorted(fault_types)]

# Add localization legend (gray proxies for the circle vs. 'x' marker styles)
localization_handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='gray',
               markersize=10, label='Correct Localization', markeredgecolor='black', markeredgewidth=1),
    plt.Line2D([0], [0], marker='x', color='gray', linestyle='None',
               markersize=10, label='Wrong Localization', markeredgewidth=2.5)
]

# Combine all handles into a single legend
all_handles = handles + localization_handles
ax.legend(handles=all_handles, loc='upper left', fontsize=11, framealpha=0.95, edgecolor='black')

# Labels and title
ax.set_xlabel('Execution Time (seconds)', fontsize=14, fontweight='bold', color='#2c3e50')
ax.set_ylabel('Total Tokens', fontsize=14, fontweight='bold', color='#2c3e50')
ax.set_title(f'Localization Accuracy: Execution Time vs Token Usage - {SCENARIOS_NAME.title()}\n(by Fault Type and Agent Configuration)',
             fontsize=16, fontweight='bold', pad=20, color='#2c3e50')

# Grid and background
ax.set_facecolor('#f8f9fa')
ax.grid(True, alpha=0.3, linestyle='--', linewidth=0.8, color='#bdc3c7')
ax.set_axisbelow(True)

# Spines: hide top/right, soften left/bottom
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#bdc3c7')
ax.spines['bottom'].set_color('#bdc3c7')
ax.spines['left'].set_linewidth(1.2)
ax.spines['bottom'].set_linewidth(1.2)

plt.tight_layout()
# Save as a numbered PDF in plots_dir; the counter is only advanced when saving
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_scatter_localization_accuracy.pdf')

    fig.savefig(filename, format='pdf', bbox_inches='tight')

    print(f"Saved plot to {filename}")
plt.show()

In [None]:
from adjustText import adjust_text

# Visualization: RCA score vs Execution Time and Token Usage

# Use existing variables: experiments_df, cmap, plt, np
fig, ax = plt.subplots(figsize=(16, 9))

# Color by fault type (reuse palette)
fault_types = experiments_df['fault_name'].unique()
colors = cmap(np.linspace(0, 1, len(fault_types)))
color_map = {fault: colors[i] for i, fault in enumerate(fault_types)}

# Point size encodes RCA score (0–5); shape encodes correctness of localization
def size_for_score(score: int) -> float:
    """Map an RCA score to a scatter marker area.

    The area is ``80 + 60 * score`` so that a score of 0 is still visible.

    Args:
        score: RCA score. Anything ``int()`` accepts is truncated to an
            integer; negative values are clamped to 0. A missing or
            non-numeric score (``None``, NaN, infinity, unparsable text) is
            treated as 0 instead of raising, so a single unevaluated
            experiment cannot abort the whole figure.

    Returns:
        Marker area to pass as ``s=`` to ``Axes.scatter``.
    """
    try:
        level = int(score)
    except (TypeError, ValueError, OverflowError):
        # int(None) -> TypeError, int(nan) -> ValueError, int(inf) -> OverflowError
        level = 0
    return 80 + 60 * max(0, level)

# Text artists collected for the overlap-avoidance pass below
texts = []

for _, row in experiments_df.iterrows():
    x = row['execution_time_seconds']
    y = row['total_tokens']
    fault = row['fault_name']
    score = row['eval_rca_score']
    correct_loc = row['eval_localization']
    color = color_map[fault]
    s = size_for_score(score)

    # Circle for correct localization, 'x' for everything else. The explicit
    # comparison matters: a missing flag (NaN) is truthy in Python and would
    # otherwise be drawn as a correct localization.
    if correct_loc == True:  # noqa: E712
        ax.scatter(x, y, s=s, color=color, alpha=0.8, edgecolors='black', linewidth=1.0)
    else:
        ax.scatter(x, y, s=s, marker='x', color=color, linewidth=2.5, alpha=0.9)

    # Label with agent_id and score
    label = f"{row['agent_id']} (S{score})"
    text = ax.text(x, y, label, fontsize=9, ha='center', va='center', fontweight='bold',
                   bbox=dict(boxstyle='round,pad=0.35', facecolor='white', edgecolor='gray', alpha=0.85, linewidth=0.8),
                   zorder=10)
    texts.append(text)

# Best-effort label de-overlapping: a failure here must not lose the figure,
# but it is reported instead of being silently swallowed so that cluttered
# labels can be traced back to their cause.
try:
    adjust_text(texts, arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.2',
                                       lw=0.8, color='gray', alpha=0.6),
                ax=ax, precision=0.1, expand_points=(1.5, 1.5))
except Exception as exc:
    print(f"Warning: adjust_text could not reposition labels "
          f"({type(exc).__name__}: {exc}); labels stay at their data points.")

# Appearance shared by every filled-circle proxy marker in the legend
circle_proxy = dict(marker='o', color='w', markeredgecolor='black', markeredgewidth=1)

# Legend part 1: one coloured circle per fault type, in alphabetical order
fault_handles = []
for fault_label in sorted(fault_types):
    fault_handles.append(
        plt.Line2D([0], [0], markerfacecolor=color_map[fault_label],
                   markersize=10, label=fault_label, **circle_proxy)
    )

# Legend part 2: marker shape encodes whether localization was correct
loc_handles = [
    plt.Line2D([0], [0], markerfacecolor='gray', markersize=10,
               label='Correct Localization', **circle_proxy),
    plt.Line2D([0], [0], marker='x', color='gray', linestyle='None',
               markeredgewidth=2.5, markersize=10, label='Incorrect Localization'),
]

# Legend part 3: a few exemplar scores showing how marker size scales.
# scatter sizes are areas while Line2D sizes are lengths, hence the sqrt;
# the 1.5 divisor is the original's scaling factor for the legend markers.
score_examples = [0, 1, 3, 5]
size_handles = []
for example_score in score_examples:
    proxy_size = np.sqrt(size_for_score(example_score)) / 1.5
    size_handles.append(
        plt.Line2D([0], [0], markerfacecolor='lightgray', markersize=proxy_size,
                   label=f'RCA Score: {example_score}', **circle_proxy)
    )

# Single combined legend: faults, then localization, then sizes
all_handles = [*fault_handles, *loc_handles, *size_handles]
ax.legend(handles=all_handles, loc='upper left', fontsize=11, framealpha=0.95, edgecolor='black')

# Axis labels share one text style; the title is larger and padded
axis_label_style = dict(fontsize=14, fontweight='bold', color='#2c3e50')
ax.set_xlabel('Execution Time (seconds)', **axis_label_style)
ax.set_ylabel('Total Tokens', **axis_label_style)
title_text = (
    f'RCA Score Visualization: Execution Time vs Token Usage - {SCENARIOS_NAME.title()}\n'
    '(size encodes RCA score; color by Fault Type)'
)
ax.set_title(title_text, fontsize=16, fontweight='bold', pad=20, color='#2c3e50')

# Light background with a dashed grid drawn behind the data
ax.set_facecolor('#f8f9fa')
ax.grid(True, alpha=0.3, linestyle='--', linewidth=0.8, color='#bdc3c7')
ax.set_axisbelow(True)

# Hide the top/right frame lines; soften the remaining two
for hidden_side in ('top', 'right'):
    ax.spines[hidden_side].set_visible(False)
for kept_side in ('left', 'bottom'):
    ax.spines[kept_side].set_color('#bdc3c7')
for kept_side in ('left', 'bottom'):
    ax.spines[kept_side].set_linewidth(1.2)

plt.tight_layout()

# Optionally export as PDF; plot_counter numbers the files in creation order
if SAVE_IMAGES:
    plot_counter = plot_counter + 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_scatter_rca_score.pdf')
    fig.savefig(filename, format='pdf', bbox_inches='tight')
    print(f"Saved plot to {filename}")

plt.show()

### Box plots

In [None]:
# Global distribution: two side-by-side panels, each holding a single box
# computed over every experiment, with the individual runs overlaid as dots.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Prepare all data for global box plots
all_time_data = experiments_df['execution_time_seconds'].values
all_token_data = experiments_df['total_tokens'].values

# Get unique fault types for coloring individual points
fault_types = sorted(experiments_df['fault_name'].unique())
colors_array = cmap(np.linspace(0, 1, len(fault_types)))
color_map = {fault: colors_array[i] for i, fault in enumerate(fault_types)}

# Cell-local, seeded generator for the horizontal jitter: the exported figure
# is identical on every run and the global NumPy random state is untouched.
jitter_rng = np.random.default_rng(42)

# The two panels differ only in metric and captions.
# Each entry: (axes, DataFrame column, full-sample values, y label, title)
panel_specs = [
    (axes[0], 'execution_time_seconds', all_time_data,
     'Execution Time (seconds)', 'Execution Time Distribution'),
    (axes[1], 'total_tokens', all_token_data,
     'Total Tokens', 'Token Usage Distribution'),
]

drawn_boxplots = []
for panel_ax, column, values, y_label, panel_title in panel_specs:
    box = panel_ax.boxplot(
        [values], tick_labels=['All Experiments'], patch_artist=True,
        widths=0.4, showmeans=True,
        boxprops=dict(linewidth=1.5),
        whiskerprops=dict(linewidth=1.5),
        capprops=dict(linewidth=1.5),
        medianprops=dict(linewidth=2, color='darkred'),
        meanprops=dict(marker='D', markerfacecolor='white',
                       markeredgecolor='black', markersize=8),
    )
    box['boxes'][0].set_facecolor('lightgray')
    box['boxes'][0].set_alpha(0.5)
    drawn_boxplots.append(box)

    # Overlay the individual experiments: one scatter call per fault type
    # (instead of one per row), jittered around x=1 so points do not stack.
    for fault in fault_types:
        fault_values = experiments_df.loc[experiments_df['fault_name'] == fault, column]
        x_pos = 1 + jitter_rng.uniform(-0.15, 0.15, size=len(fault_values))
        panel_ax.scatter(x_pos, fault_values, s=60, color=color_map[fault],
                         alpha=0.7, edgecolors='black', linewidth=0.5, zorder=3)

    panel_ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.grid(True, alpha=0.3, linestyle='--', axis='y')
    panel_ax.set_facecolor('#f8f9fa')
    panel_ax.set_xlim(0.5, 1.5)

# Keep the per-panel boxplot handles available under their original names
bp1, bp2 = drawn_boxplots

# Create legend for fault types (placed below the plots)
fault_handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color_map[fault],
                            markeredgecolor='black', markeredgewidth=0.5, markersize=8, label=fault)
                 for fault in fault_types]
fig.legend(handles=fault_handles, loc='lower center', ncol=5, fontsize=10,
           framealpha=0.95, edgecolor='black', bbox_to_anchor=(0.5, -0.02))

# Add main title with scenarios
fig.suptitle(f'Global Performance Distribution - {SCENARIOS_NAME.title()}', fontsize=16, fontweight='bold', y=0.98)

# Leave space at the bottom for the legend and at the top for the suptitle
plt.tight_layout(rect=[0, 0.05, 1, 0.96])
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_boxplot_global_distribution.pdf')

    fig.savefig(filename, format='pdf', bbox_inches='tight')

    print(f"Saved plot to {filename}")
plt.show()

In [None]:
# Per-fault box plots: one box per fault type for execution time and tokens.
fault_types = sorted(experiments_df['fault_name'].unique())
colors_array = cmap(np.linspace(0, 1, len(fault_types)))
color_map = {fault: colors_array[i] for i, fault in enumerate(fault_types)}

# Build per-fault arrays (same order as fault_types)
time_data = [
    experiments_df[experiments_df['fault_name'] == fault]['execution_time_seconds'].values
    for fault in fault_types
]
token_data = [
    experiments_df[experiments_df['fault_name'] == fault]['total_tokens'].values
    for fault in fault_types
]

fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Numeric tick labels ("1", "2", ...) keep the x axis compact; the legend
# below maps each number back to its fault name.
tick_labels = [str(number) for number in range(1, len(fault_types) + 1)]

# Cell-local, seeded generator for the horizontal jitter: the exported figure
# is identical on every run and the global NumPy random state is untouched.
jitter_rng = np.random.default_rng(42)

# The two panels differ only in data and captions.
# Each entry: (axes, per-fault value arrays, y label, title)
panel_specs = [
    (axes[0], time_data, 'Execution Time (seconds)', 'Execution Time Distribution'),
    (axes[1], token_data, 'Total Tokens', 'Token Usage Distribution'),
]

drawn_boxplots = []
for panel_ax, grouped_values, y_label, panel_title in panel_specs:
    box = panel_ax.boxplot(
        grouped_values,
        tick_labels=tick_labels,
        patch_artist=True,
        widths=0.6,
        showmeans=True,
        boxprops=dict(linewidth=1.5),
        whiskerprops=dict(linewidth=1.5),
        capprops=dict(linewidth=1.5),
        medianprops=dict(linewidth=2, color='darkred'),
        meanprops=dict(marker='D', markerfacecolor='white', markeredgecolor='black', markersize=7)
    )
    drawn_boxplots.append(box)

    # Color boxes by fault type
    for patch, fault in zip(box['boxes'], fault_types):
        patch.set_facecolor(color_map[fault])
        patch.set_alpha(0.35)

    # Overlay individual points with jitter; boxes sit at x = 1..N, and the
    # already-grouped arrays are reused instead of re-filtering the DataFrame.
    for position, (fault, fault_values) in enumerate(zip(fault_types, grouped_values), start=1):
        jitter_x = position + jitter_rng.uniform(-0.15, 0.15, size=len(fault_values))
        panel_ax.scatter(
            jitter_x,
            fault_values,
            s=60,
            color=color_map[fault],
            alpha=0.85,
            edgecolors='black',
            linewidth=0.5,
            zorder=3
        )

    panel_ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.grid(True, alpha=0.3, linestyle='--', axis='y')
    panel_ax.set_facecolor('#f8f9fa')

# Keep the per-panel boxplot handles available under their original names
bp_time, bp_tokens = drawn_boxplots

# Create legend with numbers and fault names
fault_handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color_map[fault],
                markeredgecolor='black', markeredgewidth=0.5, markersize=8, 
                label=f"{i+1}. {fault}")
    for i, fault in enumerate(fault_types)
]
fig.legend(handles=fault_handles, loc='lower center', ncol=3, fontsize=10,
           framealpha=0.95, edgecolor='black', bbox_to_anchor=(0.5, -0.05))

# Add main title with scenarios
fig.suptitle(f'Performance Distribution by Fault Type - {SCENARIOS_NAME.title()}', fontsize=16, fontweight='bold', y=0.98)

# Reserve room at the bottom for the multi-row legend
plt.tight_layout(rect=[0, 0.08, 1, 0.96])
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_boxplot_by_fault_type.pdf')

    fig.savefig(filename, format='pdf', bbox_inches='tight')

    print(f"Saved plot to {filename}")
plt.show()

In [None]:
# Box plots by agent type (agent_id)

# Prepare data grouped by agent_id
agent_types = sorted(experiments_df['agent_id'].unique())
colors_array = cmap(np.linspace(0, 1, len(agent_types)))
agent_color_map = {agent: colors_array[i] for i, agent in enumerate(agent_types)}

time_data_agent = [
    experiments_df[experiments_df['agent_id'] == agent]['execution_time_seconds'].values
    for agent in agent_types
]
token_data_agent = [
    experiments_df[experiments_df['agent_id'] == agent]['total_tokens'].values
    for agent in agent_types
]

fig, axes = plt.subplots(1, 2, figsize=(18, 10))

# Cell-local, seeded generator for the horizontal jitter: the exported figure
# is identical on every run and the global NumPy random state is untouched.
jitter_rng = np.random.default_rng(42)

# The two panels differ only in data and captions.
# Each entry: (axes, per-agent value arrays, y label, title)
panel_specs = [
    (axes[0], time_data_agent, 'Execution Time (seconds)', 'Execution Time Distribution'),
    (axes[1], token_data_agent, 'Total Tokens', 'Token Usage Distribution'),
]

drawn_boxplots = []
for panel_ax, grouped_values, y_label, panel_title in panel_specs:
    box = panel_ax.boxplot(
        grouped_values,
        tick_labels=agent_types,
        patch_artist=True,
        widths=0.6,
        showmeans=True,
        boxprops=dict(linewidth=1.5),
        whiskerprops=dict(linewidth=1.5),
        capprops=dict(linewidth=1.5),
        medianprops=dict(linewidth=2, color='darkred'),
        meanprops=dict(marker='D', markerfacecolor='white', markeredgecolor='black', markersize=7)
    )
    drawn_boxplots.append(box)

    # Color boxes by agent
    for patch, agent in zip(box['boxes'], agent_types):
        patch.set_facecolor(agent_color_map[agent])
        patch.set_alpha(0.35)

    # Overlay individual points with jitter; boxes sit at x = 1..N, and the
    # already-grouped arrays are reused instead of re-filtering the DataFrame.
    for position, (agent, agent_values) in enumerate(zip(agent_types, grouped_values), start=1):
        jitter_x = position + jitter_rng.uniform(-0.15, 0.15, size=len(agent_values))
        panel_ax.scatter(
            jitter_x,
            agent_values,
            s=60,
            color=agent_color_map[agent],
            alpha=0.85,
            edgecolors='black',
            linewidth=0.5,
            zorder=3
        )

    panel_ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.grid(True, alpha=0.3, linestyle='--', axis='y')
    panel_ax.set_facecolor('#f8f9fa')
    # Keep the agent ids horizontal. The loop variable is deliberately not
    # called `ax`, so the notebook-level `ax` of earlier cells is not clobbered.
    panel_ax.tick_params(axis='x', rotation=0)

# Keep the per-panel boxplot handles available under their original names
bp_time, bp_tokens = drawn_boxplots

# Legend for agent types; agent_desc_dict comes from an earlier cell and is
# indexed by agent id, each entry providing a "name" field.
agent_handles = [
    plt.Line2D([0], [0], marker='o', color='w',
               markerfacecolor=agent_color_map[agent],
               markeredgecolor='black', markeredgewidth=0.5,
               markersize=8, label=f'{agent}: {agent_desc_dict[agent]["name"]}')
    for agent in agent_types
]
fig.legend(handles=agent_handles, loc='lower center', ncol=4, fontsize=10,
           framealpha=0.95, edgecolor='black', bbox_to_anchor=(0.5, -0.02))

# Add main title with scenarios
fig.suptitle(f'Performance Distribution by Agent Configuration - {SCENARIOS_NAME.title()}', fontsize=16, fontweight='bold', y=0.98)

# Leave space at the bottom for the legend and at the top for the suptitle
plt.tight_layout(rect=[0, 0.05, 1, 0.96])
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_boxplot_by_agent_type.pdf')

    fig.savefig(filename, format='pdf', bbox_inches='tight')

    print(f"Saved plot to {filename}")
plt.show()

### Interactive Visualization

In [None]:
import numpy as np

import plotly.graph_objects as go

# Prepare data: work on a copy so the hover column does not leak into
# experiments_df.
plot_df = experiments_df.copy()

# Hover card per experiment. Flags are compared with `== True` so that a
# missing evaluation (NaN is truthy in Python) shows ✗ instead of ✓.
plot_df['hover_text'] = plot_df.apply(lambda row: 
    f"<b>Agent:</b> {row['agent_id']}<br>"
    f"<b>Scenario:</b> {row['scenario']}<br>"
    f"<b>Fault:</b> {row['fault_name']}<br>"
    f"<b>Time:</b> {row['execution_time_seconds']:.1f}s<br>"
    f"<b>Tokens:</b> {int(row['total_tokens']):,}<br>"
    f"<b>Detection:</b> {'✓' if row['eval_detection'] == True else '✗'}<br>"
    f"<b>Localization:</b> {'✓' if row['eval_localization'] == True else '✗'}<br>"
    f"<b>RCA Score:</b> {row['eval_rca_score']}/5",
    axis=1
)

# Palette: sample cmap once per fault type and convert each RGBA tuple to a
# CSS rgba() string with a fixed 0.8 alpha, as plotly expects.
fault_types = sorted(plot_df['fault_name'].unique())
colors_array = cmap(np.linspace(0, 1, len(fault_types)))
color_map = {fault: f'rgba({int(c[0]*255)},{int(c[1]*255)},{int(c[2]*255)},0.8)' 
             for fault, c in zip(fault_types, colors_array)}

# Create figure
fig = go.Figure()

for fault in fault_types:
    data = plot_df[plot_df['fault_name'] == fault]

    # Partition by localization correctness. Anything that is not an explicit
    # True (False, NaN, None) lands in `incorrect`, so experiments with a
    # missing evaluation are still plotted rather than silently dropped.
    is_correct = data['eval_localization'].eq(True).fillna(False).astype(bool)
    correct = data[is_correct]
    incorrect = data[~is_correct]

    legend_shown = False

    # Correct localization (circles)
    if len(correct) > 0:
        fig.add_trace(go.Scatter(
            x=correct['execution_time_seconds'],
            y=correct['total_tokens'],
            mode='markers+text',
            name=fault,  # ensure one legend entry per fault
            marker=dict(
                # Marker diameter grows with the RCA score; a missing score is
                # sized like a 0 so the point remains visible.
                size=correct['eval_rca_score'].fillna(0) * 8 + 15,
                color=color_map[fault],
                line=dict(width=2, color='white'),
                symbol='circle'
            ),
            text=correct['agent_id'],
            textposition='middle center',
            textfont=dict(size=10, color='white', family='Arial Black'),
            customdata=correct['hover_text'],
            hovertemplate='%{customdata}<extra></extra>',
            legendgroup=fault,
            showlegend=True
        ))
        legend_shown = True

    # Incorrect or missing localization (X markers)
    if len(incorrect) > 0:
        # If no correct points exist, show this trace in legend under the fault name
        name_for_incorrect = fault if not legend_shown else f'{fault} (incorrect)'
        fig.add_trace(go.Scatter(
            x=incorrect['execution_time_seconds'],
            y=incorrect['total_tokens'],
            mode='markers+text',
            name=name_for_incorrect,
            marker=dict(
                size=incorrect['eval_rca_score'].fillna(0) * 8 + 15,
                color=color_map[fault],
                line=dict(width=3, color=color_map[fault]),
                symbol='x',
                opacity=0.6
            ),
            text=incorrect['agent_id'],
            textposition='middle center',
            textfont=dict(size=10, color=color_map[fault], family='Arial Black'),
            customdata=incorrect['hover_text'],
            hovertemplate='%{customdata}<extra></extra>',
            legendgroup=fault,
            # Both traces share a legend group; only one of them gets an entry
            showlegend=not legend_shown
        ))

fig.update_layout(
    title=dict(
        text='<b>Experiment Performance Dashboard</b><br><sub>Execution Time vs Token Usage | Size: RCA Score | Shape: Localization Accuracy</sub>',
        x=0.5,
        xanchor='center',
        font=dict(size=20, color='#2c3e50')
    ),
    xaxis=dict(
        title='<b>Execution Time (seconds)</b>',
        showgrid=True,
        gridcolor='rgba(200,200,200,0.3)',
        zeroline=False,
        title_font=dict(size=14, color='#34495e')
    ),
    yaxis=dict(
        title='<b>Total Tokens</b>',
        showgrid=True,
        gridcolor='rgba(200,200,200,0.3)',
        zeroline=False,
        title_font=dict(size=14, color='#34495e')
    ),
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    height=750,
    hovermode='closest',
    font=dict(family='Arial, sans-serif', size=12),
    legend=dict(
        title=dict(text='<b>Fault Types</b>', font=dict(size=13)),
        orientation='v',
        yanchor='top',
        y=0.98,
        xanchor='left',
        x=0.01,
        bgcolor='rgba(255,255,255,0.9)',
        bordercolor='#bdc3c7',
        borderwidth=1
    ),
    margin=dict(l=80, r=30, t=100, b=80)
)

fig.show()


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Interactive Box Plot by Fault Type
fault_types = sorted(experiments_df['fault_name'].unique())
colors_array = cmap(np.linspace(0, 1, len(fault_types)))
# plotly expects CSS colour strings, so each sampled RGBA tuple is converted
# to rgba() with a fixed 0.8 alpha.
fault_color_map = {fault: f'rgba({int(c[0]*255)},{int(c[1]*255)},{int(c[2]*255)},0.8)' 
                   for fault, c in zip(fault_types, colors_array)}

fig_fault = make_subplots(rows=1, cols=2, 
                          subplot_titles=['<b>Execution Time by Fault Type</b>', 
                                         '<b>Token Usage by Fault Type</b>'],
                          horizontal_spacing=0.08)

# Both subplots receive the same kind of trace per fault; only the metric,
# the target column and the legend visibility differ.
# Each entry: (DataFrame column, subplot column, show in legend)
metric_panels = [
    ('execution_time_seconds', 1, True),
    ('total_tokens', 2, False),  # legend entry already provided by column 1
]

for fault in fault_types:
    data = experiments_df[experiments_df['fault_name'] == fault]
    color = fault_color_map[fault]
    
    # Hover text for individual points. Flags are compared with `== True` so
    # that a missing evaluation (NaN is truthy) shows ✗ instead of ✓.
    hover_texts = data.apply(lambda row: 
        f"<b>Agent:</b> {row['agent_id']}<br>"
        f"<b>Fault:</b> {row['fault_name']}<br>"
        f"<b>Time:</b> {row['execution_time_seconds']:.1f}s<br>"
        f"<b>Tokens:</b> {int(row['total_tokens']):,}<br>"
        f"<b>Detection:</b> {'✓' if row['eval_detection'] == True else '✗'}<br>"
        f"<b>Localization:</b> {'✓' if row['eval_localization'] == True else '✗'}<br>"
        f"<b>RCA Score:</b> {row['eval_rca_score']}/5",
        axis=1
    ).tolist()
    
    # One box (with all points shown) per metric; the shared legendgroup lets
    # a single legend click toggle the fault in both subplots.
    for column, subplot_col, in_legend in metric_panels:
        fig_fault.add_trace(go.Box(
            y=data[column],
            name=fault,
            marker_color=color,
            boxpoints='all',
            jitter=0.4,
            pointpos=0,
            marker=dict(size=8, opacity=0.7, line=dict(width=1, color='white')),
            hovertext=hover_texts,
            hoverinfo='text',
            legendgroup=fault,
            showlegend=in_legend
        ), row=1, col=subplot_col)

fig_fault.update_layout(
    title=dict(
        text='<b>Performance Distribution by Fault Type</b>',
        x=0.5,
        xanchor='center',
        font=dict(size=20, color='#2c3e50')
    ),
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    height=600,
    font=dict(family='Arial, sans-serif', size=12),
    legend=dict(
        title=dict(text='<b>Fault Types</b>', font=dict(size=13)),
        orientation='h',
        yanchor='bottom',
        y=-0.25,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(255,255,255,0.9)',
        bordercolor='#bdc3c7',
        borderwidth=1
    ),
    margin=dict(l=80, r=30, t=100, b=120)
)

fig_fault.update_yaxes(title_text='<b>Execution Time (seconds)</b>', row=1, col=1,
                       showgrid=True, gridcolor='rgba(200,200,200,0.3)')
fig_fault.update_yaxes(title_text='<b>Total Tokens</b>', row=1, col=2,
                       showgrid=True, gridcolor='rgba(200,200,200,0.3)')

fig_fault.show()

In [None]:
# Interactive Box Plot by Agent Configuration
agent_types = sorted(experiments_df['agent_id'].unique())
colors_array_agents = cmap(np.linspace(0, 1, len(agent_types)))
# plotly expects CSS colour strings, so each sampled RGBA tuple is converted
# to rgba() with a fixed 0.8 alpha.
agent_color_map = {agent: f'rgba({int(c[0]*255)},{int(c[1]*255)},{int(c[2]*255)},0.8)' 
                   for agent, c in zip(agent_types, colors_array_agents)}

fig_agent = make_subplots(rows=1, cols=2, 
                          subplot_titles=['<b>Execution Time by Agent Configuration</b>', 
                                         '<b>Token Usage by Agent Configuration</b>'],
                          horizontal_spacing=0.08)

# Both subplots receive the same kind of trace per agent; only the metric,
# the target column and the legend visibility differ.
# Each entry: (DataFrame column, subplot column, show in legend)
metric_panels = [
    ('execution_time_seconds', 1, True),
    ('total_tokens', 2, False),  # legend entry already provided by column 1
]

for agent in agent_types:
    data = experiments_df[experiments_df['agent_id'] == agent]
    color = agent_color_map[agent]
    
    # Hover text for individual points. Flags are compared with `== True` so
    # that a missing evaluation (NaN is truthy) shows ✗ instead of ✓.
    hover_texts = data.apply(lambda row: 
        f"<b>Agent:</b> {row['agent_id']}<br>"
        f"<b>Fault:</b> {row['fault_name']}<br>"
        f"<b>Time:</b> {row['execution_time_seconds']:.1f}s<br>"
        f"<b>Tokens:</b> {int(row['total_tokens']):,}<br>"
        f"<b>Detection:</b> {'✓' if row['eval_detection'] == True else '✗'}<br>"
        f"<b>Localization:</b> {'✓' if row['eval_localization'] == True else '✗'}<br>"
        f"<b>RCA Score:</b> {row['eval_rca_score']}/5",
        axis=1
    ).tolist()
    
    # One box (with all points shown) per metric; the shared legendgroup lets
    # a single legend click toggle the agent in both subplots. The group key is
    # stringified because plotly requires a string legendgroup.
    for column, subplot_col, in_legend in metric_panels:
        fig_agent.add_trace(go.Box(
            y=data[column],
            name=f'Agent {agent}',
            marker_color=color,
            boxpoints='all',
            jitter=0.4,
            pointpos=0,
            marker=dict(size=8, opacity=0.7, line=dict(width=1, color='white')),
            hovertext=hover_texts,
            hoverinfo='text',
            legendgroup=str(agent),
            showlegend=in_legend
        ), row=1, col=subplot_col)

fig_agent.update_layout(
    title=dict(
        text='<b>Performance Distribution by Agent Configuration</b>',
        x=0.5,
        xanchor='center',
        font=dict(size=20, color='#2c3e50')
    ),
    plot_bgcolor='#f8f9fa',
    paper_bgcolor='white',
    height=600,
    font=dict(family='Arial, sans-serif', size=12),
    legend=dict(
        title=dict(text='<b>Agent Configurations</b>', font=dict(size=13)),
        orientation='h',
        yanchor='bottom',
        y=-0.25,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(255,255,255,0.9)',
        bordercolor='#bdc3c7',
        borderwidth=1
    ),
    margin=dict(l=80, r=30, t=100, b=120)
)

fig_agent.update_yaxes(title_text='<b>Execution Time (seconds)</b>', row=1, col=1,
                       showgrid=True, gridcolor='rgba(200,200,200,0.3)')
fig_agent.update_yaxes(title_text='<b>Total Tokens</b>', row=1, col=2,
                       showgrid=True, gridcolor='rgba(200,200,200,0.3)')

fig_agent.show()