# Multi-Experiment Hypothesis Test Analysis

Statistical analysis comparing Agent A vs Agent F across multiple fault scenarios.

In [None]:
from dotenv import load_dotenv
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats
from scipy.stats import ttest_ind, mannwhitneyu
import matplotlib.pyplot as plt
from pypalettes import load_cmap
import warnings
import re
import shutil

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Resolve the repository root: two levels above the current working directory.
working_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(working_dir, '../..'))

# Read environment variables from the .env file located at the repository root.
dotenv_path = os.path.join(root_dir, '.env')
load_dotenv(dotenv_path)

print(f"Root directory: {root_dir}")

In [None]:
# Notebook parameters
ALPHA = 0.05  # Significance level passed to the hypothesis tests
SAVE_IMAGES = True  # When True, box plots are written to disk
CMAP_NAME = "Color_Blind"  # Palette name handed to load_cmap
# Folder holding the per-agent experiment directories (and the generated plots)
HYPOTHESIS_TEST_DIR = os.path.join(root_dir, 'Results', 'hypothesis-test')

print(f"Hypothesis test directory: {HYPOTHESIS_TEST_DIR}")

## Helper Functions

In [None]:
def load_experiment_data(agent_path):
    """Load every JSON experiment file in an agent's folder into a DataFrame.

    Files are read in sorted name order so the row order is reproducible
    (``os.listdir`` returns entries in arbitrary order). A file that is not
    valid UTF-8 JSON, or whose top-level value is not a JSON object, is
    reported with a warning and skipped. A nested section (``testbed``,
    ``stats``, ``evaluation``) that is missing or not an object is treated as
    empty, so its fields fall back to their defaults instead of aborting the
    whole load.

    Args:
        agent_path: Directory containing the ``*.json`` experiment files.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed file and
        the columns ``experiment_file``, ``agent_id``, ``agent_conf_name``,
        ``scenario``, ``fault_name``, ``execution_time_seconds``,
        ``total_tokens``, ``eval_detection``, ``eval_localization`` and
        ``eval_rca_score``. The columns are present even when no file could
        be loaded, so callers can index them without a ``KeyError``.
    """
    columns = [
        "experiment_file",
        "agent_id",
        "agent_conf_name",
        "scenario",
        "fault_name",
        "execution_time_seconds",
        "total_tokens",
        "eval_detection",
        "eval_localization",
        "eval_rca_score",
    ]

    def section(data, key):
        """Return the object stored under *key*, or {} if absent or not an object."""
        value = data.get(key)
        return value if isinstance(value, dict) else {}

    json_files = sorted(f for f in os.listdir(agent_path) if f.endswith('.json'))
    agent_records = []

    for experiment_file in json_files:
        file_path = os.path.join(agent_path, experiment_file)
        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Warning: Error processing {experiment_file}: {str(e)}")
            continue

        if not isinstance(data, dict):
            print(f"Warning: Error processing {experiment_file}: top-level JSON value is not an object")
            continue

        testbed = section(data, "testbed")
        run_stats = section(data, "stats")
        evaluation = section(data, "evaluation")

        agent_records.append({
            "experiment_file": experiment_file,
            "agent_id": data.get("agent_id", None),
            "agent_conf_name": data.get("agent_configuration_name", None),
            "scenario": data.get("app_name", None),
            "fault_name": testbed.get("fault_name", None),
            "execution_time_seconds": run_stats.get("execution_time_seconds", 0),
            "total_tokens": run_stats.get("total_tokens", 0),
            "eval_detection": evaluation.get("detection", None),
            "eval_localization": evaluation.get("localization", None),
            "eval_rca_score": evaluation.get("rca_score", None),
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(agent_records, columns=columns)

def perform_hypothesis_test(data_a, data_b, metric_name, alpha=0.05, direction='less'):
    """
    Compare two samples with a hypothesis test and report the effect size.

    Missing values are dropped from both samples first. If both samples pass
    a Shapiro-Wilk normality check, a Student's t-test (equal variances per
    Levene's test) or Welch's t-test (unequal variances) is used; otherwise
    the Mann-Whitney U test is used.

    Args:
        data_a: pandas Series with the observations of the first agent.
        data_b: pandas Series with the observations of the second agent.
        metric_name: Label stored under ``'metric'`` in the result.
        alpha: Significance level used for the normality check, the variance
            check and the final decision.
        direction: Alternative hypothesis, forwarded unchanged as the
            ``alternative`` argument of the SciPy test. The default ``'less'``
            tests whether A is smaller than B.

    Returns:
        ``None`` if either sample is empty after dropping missing values,
        otherwise a dict with the keys ``metric``, ``mean_a``, ``mean_b``,
        ``imp_pct`` ((mean_b - mean_a) / mean_b * 100, or 0 when mean_b is 0),
        ``p_value``, ``significant`` (``p_value < alpha``), ``cohens_d``
        ((mean_a - mean_b) / pooled standard deviation, or 0 when that is not
        positive) and ``test_type``.
    """
    a_clean = data_a.dropna()
    b_clean = data_b.dropna()

    n_a, n_b = len(a_clean), len(b_clean)
    if n_a == 0 or n_b == 0:
        return None

    def looks_normal(sample):
        """Return True if *sample* passes the Shapiro-Wilk test at level alpha."""
        # Shapiro-Wilk needs at least 3 observations and a non-zero range;
        # samples that cannot be tested are treated as non-normal so the
        # non-parametric test is used instead.
        if len(sample) < 3 or sample.nunique() < 2:
            return False
        _, p_normal = stats.shapiro(sample)
        return p_normal > alpha

    is_normal = looks_normal(a_clean) and looks_normal(b_clean)

    # Hypothesis test
    if is_normal:
        # The variance check only matters for choosing between the t-tests.
        _, p_levene = stats.levene(a_clean, b_clean)
        equal_var = p_levene > alpha
        _, p_value = ttest_ind(a_clean, b_clean, equal_var=equal_var, alternative=direction)
        test_type = "T-Test" if equal_var else "Welch's T-Test"
    else:
        _, p_value = mannwhitneyu(a_clean, b_clean, alternative=direction)
        test_type = "Mann-Whitney U"

    # Effect size (Cohen's d) using the pooled standard deviation.
    mean_a, mean_b = a_clean.mean(), b_clean.mean()
    dof = n_a + n_b - 2
    pooled_var = 0.0
    if dof > 0:
        # A single-observation sample has no variance estimate (pandas gives
        # NaN); its weight (n - 1) is zero, so it contributes nothing.
        ss_a = (n_a - 1) * a_clean.var() if n_a > 1 else 0.0
        ss_b = (n_b - 1) * b_clean.var() if n_b > 1 else 0.0
        pooled_var = (ss_a + ss_b) / dof
    pooled_std = np.sqrt(pooled_var)
    cohens_d = (mean_a - mean_b) / pooled_std if pooled_std > 0 else 0

    return {
        'metric': metric_name,
        'mean_a': mean_a,
        'mean_b': mean_b,
        'imp_pct': (mean_b - mean_a) / mean_b * 100 if mean_b != 0 else 0,
        'p_value': p_value,
        'significant': bool(p_value < alpha),
        'cohens_d': cohens_d,
        'test_type': test_type
    }

## Discovery and Grouping

In [None]:
# Scan the hypothesis-test directory and group agent folders by experiment.
# Folder names are expected to follow "A - Experiment Name" or
# "F - Experiment Name"; folders that do not match are ignored.
folder_pattern = re.compile(r"^([AF]) - (.+)$")

# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which experiments are discovered (and later processed) reproducible.
all_dirs = sorted(
    d for d in os.listdir(HYPOTHESIS_TEST_DIR)
    if os.path.isdir(os.path.join(HYPOTHESIS_TEST_DIR, d))
)

# Maps experiment name -> {agent letter: folder path}
experiments = {}

for d in all_dirs:
    # The plot output folder lives alongside the experiment folders.
    if d == 'plots':
        continue

    match = folder_pattern.match(d)
    if match is None:
        continue

    agent, exp_name = match.groups()
    experiments.setdefault(exp_name, {})[agent] = os.path.join(HYPOTHESIS_TEST_DIR, d)

print(f"Found {len(experiments)} experiment pairs:")
for exp, agents in experiments.items():
    print(f"  - {exp} (Agents: {list(agents.keys())})")

## Execution Loop

In [None]:
cmap = load_cmap(CMAP_NAME)
# One fill colour per agent: index 0 for Agent A, index 1 for Agent F.
colors = [cmap(0.2), cmap(0.8)]
summary_results = []

# Metrics to test: (DataFrame column, display label, alternative hypothesis).
# Identical for every experiment, so defined once outside the loop.
metrics = [
    ('execution_time_seconds', 'Execution Time', 'less'),
    ('total_tokens', 'Total Tokens', 'less')
]

for exp_name, agent_paths in experiments.items():
    if 'A' not in agent_paths or 'F' not in agent_paths:
        print(f"Skipping {exp_name}: Incomplete pair")
        continue

    print(f"\nProcessing: {exp_name}...")

    # Load Data
    df_a = load_experiment_data(agent_paths['A'])
    df_f = load_experiment_data(agent_paths['F'])

    # Without any loaded records there is nothing to test, and indexing a
    # metric column on an empty frame may raise KeyError.
    if df_a.empty or df_f.empty:
        print(f"Skipping {exp_name}: No experiment data loaded")
        continue

    # Directory for plots (recreated from scratch so stale files do not linger)
    safe_name = re.sub(r'[^a-zA-Z0-9]', '_', exp_name)
    plot_dir = os.path.join(HYPOTHESIS_TEST_DIR, 'plots', safe_name)
    if SAVE_IMAGES:
        if os.path.exists(plot_dir):
            shutil.rmtree(plot_dir)
        os.makedirs(plot_dir)

    # Run Tests
    for metric, label, direction in metrics:
        res = perform_hypothesis_test(df_a[metric], df_f[metric], label, ALPHA, direction)
        if not res:
            continue

        res['experiment'] = exp_name
        summary_results.append(res)

        # Plot Boxplot
        if SAVE_IMAGES:
            fig, ax = plt.subplots(figsize=(6, 5))
            box_data = [df_a[metric].dropna(), df_f[metric].dropna()]
            bp = ax.boxplot(box_data, patch_artist=True,
                            medianprops=dict(color='black'))
            # Colour each agent's box with its own palette entry.
            for patch, color in zip(bp['boxes'], colors):
                patch.set_facecolor(color)
                patch.set_alpha(0.5)
            # Label the ticks explicitly instead of using boxplot's `labels`
            # keyword, which is deprecated in recent Matplotlib releases.
            ax.set_xticks([1, 2])
            ax.set_xticklabels(['Agent A', 'Agent F'])
            ax.set_title(f"{label}\n{exp_name}")
            ax.set_ylabel(label)
            fig.tight_layout()
            safe_metric = label.lower().replace(" ", "_")
            fig.savefig(os.path.join(plot_dir, f"{safe_metric}_boxplot.pdf"))
            plt.close(fig)

    # Only claim that files were written when plots were actually saved.
    if SAVE_IMAGES:
        print(f"  -> Done. Results saved to {plot_dir}")
    else:
        print("  -> Done.")

### Statistical Metrics Explanation

- **P-value**: The probability of observing a difference at least as extreme as the measured one if the null hypothesis (Agent A >= Agent F) were true. A p-value < 0.05 means the null hypothesis is rejected at the 5% significance level, i.e. Agent A's value is significantly lower (better) than Agent F's.
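- **Cohen's d**: A measure of effect size: the difference between the means (Agent A minus Agent F) divided by the pooled standard deviation, so negative values mean Agent A is lower. Interpret the magnitude |d| as: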
  - 0.2: Small effect
  - 0.5: Medium effect
  - 0.8: Large effect
- **Imp % (Improvement Percentage)**: Relative improvement of Agent A compared to Agent F. Positive values indicate Agent A performed better (lower time/tokens).

## Final Summary

In [None]:
# Collect the per-experiment, per-metric test results into one table.
summary_df = pd.DataFrame(summary_results)

if not summary_df.empty:
    # Prioritize columns. 'cohens_d' is included so the effect size that is
    # computed by the test (and formatted below) is shown and exported.
    cols = ['experiment', 'metric', 'mean_a', 'mean_b', 'imp_pct', 'p_value',
            'significant', 'cohens_d', 'test_type']
    summary_df = summary_df[cols]

    print("\nOverall Hypothesis Test Results (Agent A vs Agent F):")
    # Display formatted dataframe
    display(summary_df.style.format({
        'mean_a': '{:.2f}',
        'mean_b': '{:.2f}',
        'imp_pct': '{:.2f}%',
        'p_value': '{:.5f}',
        'cohens_d': '{:.3f}'
    }).background_gradient(subset=['imp_pct'], cmap='Greens'))

    # Export to CSV
    summary_path = os.path.join(HYPOTHESIS_TEST_DIR, 'hypothesis_results_summary.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"\nSummary saved to: {summary_path}")
else:
    print("No results generated.")

In [None]:
# Render the summary table as LaTeX source and print it.
latex_table = summary_df.to_latex()
print(latex_table)