# Hypothesis Test Analysis

Statistical analysis comparing agent configurations on Hotel Reservation fault scenarios.

In [None]:
from dotenv import load_dotenv
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# The repository root is two directory levels above the current working directory.
cwd = os.getcwd()
root_dir = os.path.abspath(os.path.join(cwd, '../..'))

# Read environment variables from the .env file located at the repository root.
env_file = os.path.join(root_dir, '.env')
load_dotenv(env_file)

print(f"Root directory: {root_dir}")

In [None]:
# Analysis parameters
ALPHA = 0.05  # Significance level for hypothesis tests
SAVE_IMAGES = True  # When True, the plotting cells also write their figures to disk
results_subpath = ('Results', 'hypothesis-test-hotel-res')
HYPOTHESIS_TEST_DIR = os.path.join(root_dir, *results_subpath)

print(f"Hypothesis test directory: {HYPOTHESIS_TEST_DIR}")
print(f"Alpha level: {ALPHA}")

## Load Experiment Data

In [None]:
# Each sub-directory of the results folder is treated as one agent configuration.
with os.scandir(HYPOTHESIS_TEST_DIR) as entries:
    agent_folders = sorted(entry.name for entry in entries if entry.is_dir())

print(f"Found {len(agent_folders)} agent configurations:")
for position, folder_name in enumerate(agent_folders, start=1):
    print(f"{position}) {folder_name}")

In [None]:
# Load experiments from all agent folders.
# Records are collected in plain lists and turned into DataFrames once per agent
# and once overall; re-concatenating the frame for every file is quadratic.
all_records = []
agent_data = {}  # Per-agent DataFrames keyed by folder name, for later comparisons

for agent_folder in agent_folders:
    agent_path = os.path.join(HYPOTHESIS_TEST_DIR, agent_folder)
    # Sorted so the row order does not depend on the filesystem listing order.
    json_files = sorted(f for f in os.listdir(agent_path) if f.endswith('.json'))

    print(f"\nLoading experiments from: {agent_folder}")
    print(f"Found {len(json_files)} experiment files")

    agent_records = []

    for experiment_file in json_files:
        file_path = os.path.join(agent_path, experiment_file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Resolve each nested section once instead of re-walking it per field.
            final_report = data.get("final_report", {})
            testbed = data.get("testbed", {})
            run_stats = data.get("stats", {})
            agent_stats = run_stats.get("agent_stats", {})
            rca_stats = agent_stats.get("rca_agent", {})
            supervisor_stats = agent_stats.get("supervisor_agent", {})
            evaluation = data.get("evaluation", {})

            # Extract localization as a single comma-separated string
            localization = final_report.get("localization", [])
            if isinstance(localization, list):
                localization_str = ", ".join(str(item) for item in localization)
            else:
                localization_str = None

            record = {
                "experiment_file": experiment_file,
                "agent_id": data.get("agent_id", None),
                "agent_conf_name": data.get("agent_configuration_name", None),
                "scenario": data.get("app_name", None),
                "fault_name": testbed.get("fault_name", None),
                "target_namespace": data.get("target_namespace", None),
                "trace_service_starting_point": data.get("trace_service_starting_point", None),
                "rca_tasks_per_iteration": testbed.get("rca_tasks_per_iteration", 0),
                "max_tool_calls": testbed.get("max_tool_calls", 0),
                "execution_time_seconds": run_stats.get("execution_time_seconds", 0),
                "total_tokens": run_stats.get("total_tokens", 0),
                "tokens_triage": agent_stats.get("triage_agent", {}).get("total_tokens", 0),
                "tokens_planner": agent_stats.get("planner_agent", {}).get("total_tokens", 0),
                "tokens_rca_worker": rca_stats.get("total_tokens", 0),
                "runs_count_rca": rca_stats.get("runs_count", 0),
                "tokens_supervisor": supervisor_stats.get("total_tokens", 0),
                "runs_count_supervisor": supervisor_stats.get("runs_count", 0),
                "detection": final_report.get("detection", None),
                "localization": localization_str,
                "root_cause": final_report.get("root_cause", None),
                "eval_detection": evaluation.get("detection", None),
                "eval_localization": evaluation.get("localization", None),
                "eval_rca_score": evaluation.get("rca_score", None),
                "eval_rca_motivation": evaluation.get("rca_motivation", None),
            }
        except (OSError, json.JSONDecodeError, UnicodeDecodeError,
                KeyError, TypeError, AttributeError) as e:
            # Best effort: one unreadable or malformed file (including a section
            # that is null or not an object) must not abort the whole load.
            print(f"Warning: Error processing {experiment_file}: {str(e)}")
            continue

        agent_records.append(record)

    all_records.extend(agent_records)
    agent_data[agent_folder] = pd.DataFrame(agent_records)
    print(f"Loaded {len(agent_records)} experiments")

experiments_df = pd.DataFrame(all_records)

print(f"\n{'='*60}")
print(f"Total experiments loaded: {len(experiments_df)}")
print(f"{'='*60}")

In [None]:
# Preview the first ten loaded experiment rows
experiments_df.iloc[:10]

## Data Summary

In [None]:
# Per-agent descriptive statistics: run count, time/token spread, evaluation averages
spread = ['mean', 'std', 'min', 'max']
aggregations = {
    'experiment_file': 'count',
    'execution_time_seconds': spread,
    'total_tokens': spread,
    'eval_detection': 'mean',
    'eval_localization': 'mean',
    'eval_rca_score': ['mean', 'std'],
}
summary_stats = experiments_df.groupby('agent_id').agg(aggregations)

# Collapse the two-level (column, statistic) header into "column_statistic" names.
summary_stats.columns = [
    '_'.join(level_names).strip()
    for level_names in summary_stats.columns.to_flat_index()
]
summary_stats = summary_stats.rename(columns={'experiment_file_count': 'num_runs'})
summary_stats

In [None]:
# Distinct scenario, fault and agent identifiers present in the loaded data
overview_lines = [
    "\nScenario and Fault Information:",
    f"Scenario: {experiments_df['scenario'].unique()}",
    f"Fault Type: {experiments_df['fault_name'].unique()}",
    f"\nAgent IDs: {experiments_df['agent_id'].unique()}",
    f"\nAgent Configuration Names: {experiments_df['agent_conf_name'].unique()}",
]
for overview_line in overview_lines:
    print(overview_line)

## Hypothesis Testing

Compare performance metrics between the two agent configurations.

In [None]:
from scipy import stats
from scipy.stats import ttest_ind, mannwhitneyu, normaltest

# Get the two agent configurations to compare.
# Missing agent ids are dropped first: sorting a mix of None and real ids
# raises TypeError.
agents = sorted(experiments_df['agent_id'].dropna().unique())

if len(agents) < 2:
    # Fail with a clear message instead of an IndexError on agents[1] below.
    raise ValueError(
        f"Need at least 2 agent configurations to compare, found {len(agents)}: {agents}"
    )
if len(agents) > 2:
    print(f"Warning: Expected 2 agents, found {len(agents)}")
    print(f"Only the first two (in sorted order) are compared: {agents[0]}, {agents[1]}")

agent_a = agents[0]
agent_b = agents[1]

print(f"Comparing Agent {agent_a} vs Agent {agent_b}")
print(f"Alpha significance level: {ALPHA}")

In [None]:
# Split the combined frame into one sub-frame per compared agent
is_agent_a = experiments_df['agent_id'] == agent_a
is_agent_b = experiments_df['agent_id'] == agent_b
agent_a_data = experiments_df[is_agent_a]
agent_b_data = experiments_df[is_agent_b]

for agent_name, agent_frame in ((agent_a, agent_a_data), (agent_b, agent_b_data)):
    print(f"Agent {agent_name}: {len(agent_frame)} runs")

In [None]:
# Function to perform a one-sided hypothesis test comparing Agent A against Agent B
def perform_hypothesis_test_onesided(data_a, data_b, metric_name, alpha=0.05, direction='less'):
    """
    Perform a one-sided two-sample hypothesis test on a single metric.

    Test selection: if both samples pass a Shapiro-Wilk normality check (at
    ``alpha``), an independent t-test is used -- Student's when Levene's test
    does not reject equal variances, Welch's otherwise. In every other case
    the Mann-Whitney U test is used.

    Args:
        data_a: pandas Series with Agent A's values (NaN entries are dropped).
        data_b: pandas Series with Agent B's values (NaN entries are dropped).
        metric_name: Label stored in the result and used in the skip message.
        alpha: Threshold used for the normality check, the variance check and
            the final significance decision.
        direction: ``alternative`` passed to the SciPy test.
            ``'less'`` tests A < B (lower is better); ``'greater'`` tests
            A > B (higher is better).

    Returns:
        dict with keys ``metric``, ``test_used``, ``agent_a_mean``,
        ``agent_b_mean``, ``agent_a_std``, ``agent_b_std``, ``test_statistic``,
        ``p_value``, ``significant`` ("YES"/"NO"), ``cohens_d`` and
        ``agent_a_better``; or ``None`` when either sample is empty after
        dropping NaN values.
    """
    # Remove NaN values
    a_clean = data_a.dropna()
    b_clean = data_b.dropna()

    n_a = len(a_clean)
    n_b = len(b_clean)
    if n_a == 0 or n_b == 0:
        print(f"Skipping {metric_name}: Insufficient data")
        return None

    def _passes_normality(sample):
        # Shapiro-Wilk needs at least 3 observations; smaller samples are
        # treated as "not shown to be normal" so the non-parametric test is used.
        if len(sample) < 3:
            return False
        _, p_normal = stats.shapiro(sample)
        return p_normal > alpha

    is_normal_a = _passes_normality(a_clean)
    is_normal_b = _passes_normality(b_clean)

    # Choose appropriate test (one-sided)
    if is_normal_a and is_normal_b:
        # Levene's test only matters for the t-test, so it is run only here.
        _, p_levene = stats.levene(a_clean, b_clean)
        equal_var = p_levene > alpha
        # Parametric: Independent t-test (one-sided)
        t_stat, p_value = ttest_ind(a_clean, b_clean, equal_var=equal_var, alternative=direction)
        test_used = "Welch's t-test (one-sided)" if not equal_var else "Student's t-test (one-sided)"
    else:
        # Non-parametric: Mann-Whitney U test (one-sided)
        t_stat, p_value = mannwhitneyu(a_clean, b_clean, alternative=direction)
        test_used = "Mann-Whitney U test (one-sided)"

    mean_a = a_clean.mean()
    mean_b = b_clean.mean()
    std_a = a_clean.std()
    std_b = b_clean.std()

    # Effect size (Cohen's d) with the pooled standard deviation. With one
    # observation per group there are no degrees of freedom, so d is reported as 0.
    pooled_dof = n_a + n_b - 2
    if pooled_dof > 0:
        pooled_std = np.sqrt(((n_a - 1) * std_a**2 + (n_b - 1) * std_b**2) / pooled_dof)
    else:
        pooled_std = 0.0
    # The `> 0` check is also False for NaN, which covers a single-element group.
    cohens_d = (mean_a - mean_b) / pooled_std if pooled_std > 0 else 0

    significant = "YES" if p_value < alpha else "NO"

    # "Better" follows the tested direction: a higher mean wins for 'greater',
    # a lower mean wins otherwise.
    if direction == 'greater':
        agent_a_better = bool(mean_a > mean_b)
    else:
        agent_a_better = bool(mean_a < mean_b)

    return {
        'metric': metric_name,
        'test_used': test_used,
        'agent_a_mean': mean_a,
        'agent_b_mean': mean_b,
        'agent_a_std': std_a,
        'agent_b_std': std_b,
        'test_statistic': t_stat,
        'p_value': p_value,
        'significant': significant,
        'cohens_d': cohens_d,
        'agent_a_better': agent_a_better,
    }

In [None]:
# Perform ONE-SIDED hypothesis tests: Agent A better than Agent B (lower is better)
print("\n" + "="*100)
print("ONE-SIDED HYPOTHESIS TEST: Agent A performs BETTER than Agent B")
print("(Lower execution time and tokens = better performance)")
print("="*100)

# (column in experiments data, display name, alternative hypothesis)
metrics_to_test = [
    ('execution_time_seconds', 'Execution Time (seconds)', 'less'),  # less = Agent A < Agent B is better
    ('total_tokens', 'Total Tokens', 'less'),
]

hypothesis_results = []

for metric_col, metric_name, direction in metrics_to_test:
    result = perform_hypothesis_test_onesided(
        agent_a_data[metric_col],
        agent_b_data[metric_col],
        metric_name,
        alpha=ALPHA,
        direction=direction
    )
    # The test returns None when a metric has no usable data.
    if result is not None:
        hypothesis_results.append(result)

# Create results dataframe. The columns are fixed up front so the frame keeps
# its schema even when every metric was skipped; otherwise the column
# selection below would raise KeyError on an empty frame.
result_columns = [
    'metric', 'test_used', 'agent_a_mean', 'agent_b_mean', 'agent_a_std', 'agent_b_std',
    'test_statistic', 'p_value', 'significant', 'cohens_d', 'agent_a_better',
]
hypothesis_df = pd.DataFrame(hypothesis_results, columns=result_columns)
print("\nHypothesis Test Results (One-Sided: Agent A < Agent B):")
print("="*100)
if hypothesis_df.empty:
    print("No metric could be tested (insufficient data).")
else:
    print(hypothesis_df[['metric', 'agent_a_mean', 'agent_b_mean', 'p_value', 'significant', 'cohens_d']])

In [None]:
# Detailed results display: one section per tested metric
print("\n" + "="*100)
print("DETAILED ONE-SIDED HYPOTHESIS TEST RESULTS")
print("H0: Agent A >= Agent B (no improvement)")
print("H1: Agent A < Agent B (Agent A is better)")
print("="*100)

for _, row in hypothesis_df.iterrows():
    mean_gap = row['agent_a_mean'] - row['agent_b_mean']
    gap_word = 'LOWER' if row['agent_a_mean'] < row['agent_b_mean'] else 'HIGHER'

    print(f"\n{row['metric'].upper()}")
    print("-" * 80)
    print(f"  Agent {agent_a}: Mean = {row['agent_a_mean']:.4f}, Std = {row['agent_a_std']:.4f}")
    print(f"  Agent {agent_b}: Mean = {row['agent_b_mean']:.4f}, Std = {row['agent_b_std']:.4f}")
    # The magnitude is printed next to LOWER/HIGHER; a signed value would read
    # as a double negative ("-12.3 LOWER").
    print(f"  Difference: Agent A is {abs(mean_gap):.4f} {gap_word}")
    if row['agent_b_mean'] != 0:
        print(f"  Percentage difference: {mean_gap / row['agent_b_mean'] * 100:.2f}%")
    else:
        # A relative difference is undefined against a zero baseline.
        print("  Percentage difference: n/a (Agent B mean is 0)")
    print(f"  Test Used: {row['test_used']}")
    print(f"  Test Statistic: {row['test_statistic']:.4f}")
    print(f"  P-value (one-sided): {row['p_value']:.6f}")
    print(f"  Significant (α={ALPHA}): {row['significant']}")

    is_significant = row['significant'] == 'YES'
    if is_significant and row['agent_a_better']:
        print(f"  ✓ RESULT: Agent A performs SIGNIFICANTLY BETTER than Agent B")
    elif is_significant and not row['agent_a_better']:
        print(f"  ✗ RESULT: Agent A performs SIGNIFICANTLY WORSE than Agent B")
    else:
        print(f"  ○ RESULT: No significant difference detected")

    print(f"  Cohen's d (effect size): {row['cohens_d']:.4f}")

    # Interpret effect size using the 0.2 / 0.5 / 0.8 cut-offs on |d|
    d = abs(row['cohens_d'])
    if d < 0.2:
        effect = "negligible"
    elif d < 0.5:
        effect = "small"
    elif d < 0.8:
        effect = "medium"
    else:
        effect = "large"
    print(f"  Effect Size Interpretation: {effect}")

## Data Visualization

In [None]:
import matplotlib.pyplot as plt
from pypalettes import load_cmap

# Load color palette
cmap = load_cmap("Color_Blind")

# Create the plots directory when figures are going to be saved.
# exist_ok=True avoids the separate existence check (and its race with other
# processes creating the same directory).
plots_dir = os.path.join(HYPOTHESIS_TEST_DIR, 'plots')
if SAVE_IMAGES:
    os.makedirs(plots_dir, exist_ok=True)

# Running index used as a numeric prefix for saved figure filenames
plot_counter = 0
if SAVE_IMAGES:
    print(f"Plots will be saved to: {plots_dir}")
else:
    print("SAVE_IMAGES is False: plots will be shown but not saved")

In [None]:
# Box plots, one panel per numerical metric, with the raw runs overlaid as jittered points
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle(f'Resource Usage Comparison: Agent {agent_a} vs Agent {agent_b}', fontsize=16, fontweight='bold')

agents_list = [agent_a, agent_b]
agent_labels = [f'Agent {agent_a}', f'Agent {agent_b}']
colors = [cmap(0.2), cmap(0.8)]

# (column in the experiments frame, axis label)
metrics_to_plot = [
    ('execution_time_seconds', 'Execution Time (seconds)'),
    ('total_tokens', 'Total Tokens'),
]

for ax, (metric_col, metric_label) in zip(axes, metrics_to_plot):
    # One array of non-missing values per agent
    data_to_plot = [
        agent_frame[metric_col].dropna().values
        for agent_frame in (agent_a_data, agent_b_data)
    ]

    # Filled boxes with a dark red median line and a white diamond at the mean
    bp = ax.boxplot(
        data_to_plot,
        tick_labels=agent_labels,
        patch_artist=True,
        widths=0.6,
        showmeans=True,
        boxprops={'linewidth': 1.5},
        medianprops={'linewidth': 2, 'color': 'darkred'},
        meanprops={
            'marker': 'D',
            'markerfacecolor': 'white',
            'markeredgecolor': 'black',
            'markersize': 7,
        },
    )

    # Give each box its agent colour
    for box_patch, box_color in zip(bp['boxes'], colors):
        box_patch.set_facecolor(box_color)
        box_patch.set_alpha(0.6)

    # Scatter the individual runs around each box position with random horizontal jitter
    for box_position, (agent, point_color) in enumerate(zip(agents_list, colors), start=1):
        values = experiments_df.loc[experiments_df['agent_id'] == agent, metric_col].dropna().values
        jitter = np.random.uniform(-0.1, 0.1, size=len(values))
        ax.scatter(box_position + jitter, values, s=50, color=point_color, alpha=0.6,
                   edgecolors='black', linewidth=0.5, zorder=3)

    ax.set_ylabel(metric_label, fontweight='bold', fontsize=12)
    ax.grid(True, alpha=0.3, linestyle='--', axis='y')
    ax.set_facecolor('#f8f9fa')

plt.tight_layout()
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_boxplot_comparison.pdf')
    fig.savefig(filename, format='pdf', bbox_inches='tight')
    print(f"Saved plot to {filename}")
plt.show()

In [None]:
# Violin plots: one panel per metric showing each agent's value distribution
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle(f'Distribution Comparison: Agent {agent_a} vs Agent {agent_b}', fontsize=16, fontweight='bold')

violin_positions = [1, 2]

for ax, (metric_col, metric_label) in zip(axes, metrics_to_plot):
    # Non-missing values and tick label for each agent, in agents_list order
    plot_data = [
        experiments_df.loc[experiments_df['agent_id'] == agent, metric_col].dropna().values
        for agent in agents_list
    ]
    labels = [f'Agent {agent}' for agent in agents_list]

    parts = ax.violinplot(plot_data, positions=violin_positions, widths=0.7,
                          showmeans=True, showmedians=True)

    # Fill each violin body with its agent colour
    for violin_body, body_color in zip(parts['bodies'], colors):
        violin_body.set_facecolor(body_color)
        violin_body.set_alpha(0.6)

    ax.set_xticks(violin_positions)
    ax.set_xticklabels(labels)
    ax.set_ylabel(metric_label, fontweight='bold', fontsize=12)
    ax.grid(True, alpha=0.3, linestyle='--', axis='y')
    ax.set_facecolor('#f8f9fa')

plt.tight_layout()
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_violin_comparison.pdf')
    fig.savefig(filename, format='pdf', bbox_inches='tight')
    print(f"Saved plot to {filename}")
plt.show()

In [None]:
# Scatter plot of execution time (x) against total tokens (y), one series per agent
fig, ax = plt.subplots(figsize=(12, 7))

series_specs = [
    (agent_a, agent_a_data, colors[0]),
    (agent_b, agent_b_data, colors[1]),
]
marker_style = {'s': 120, 'alpha': 0.6, 'edgecolors': 'black', 'linewidth': 1}

for agent, agent_subset, color in series_specs:
    x_values = agent_subset['execution_time_seconds']
    y_values = agent_subset['total_tokens']
    ax.scatter(x_values, y_values, label=f'Agent {agent}', color=color, **marker_style)

ax.set_title('Execution Time vs Token Usage by Agent', fontweight='bold', fontsize=14)
ax.set_xlabel('Execution Time (seconds)', fontweight='bold', fontsize=12)
ax.set_ylabel('Total Tokens', fontweight='bold', fontsize=12)
ax.set_facecolor('#f8f9fa')
ax.grid(True, alpha=0.3, linestyle='--')
ax.legend(fontsize=11, edgecolor='black', loc='upper left')

plt.tight_layout()
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_scatter_performance.pdf')
    fig.savefig(filename, format='pdf', bbox_inches='tight')
    print(f"Saved plot to {filename}")
plt.show()

In [None]:
# Grouped bar chart of mean execution time and mean token usage per agent
fig, ax = plt.subplots(figsize=(12, 6))

metrics = ['Execution Time (s)', 'Total Tokens']

# Mean seconds and mean tokens per agent; tokens are divided by 1000 so both
# bar groups fit on one axis.
agent_a_vals, agent_b_vals = (
    [agent_frame['execution_time_seconds'].mean(), agent_frame['total_tokens'].mean() / 1000]
    for agent_frame in (agent_a_data, agent_b_data)
)

x = np.arange(len(metrics))
width = 0.35
bar_style = {'alpha': 0.7, 'edgecolor': 'black', 'linewidth': 1.5}

bars1 = ax.bar(x - width/2, agent_a_vals, width, label=f'Agent {agent_a}',
               color=colors[0], **bar_style)
bars2 = ax.bar(x + width/2, agent_b_vals, width, label=f'Agent {agent_b}',
               color=colors[1], **bar_style)

# Write each bar's height just above it
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, height, f'{height:.1f}',
            ha='center', va='bottom', fontweight='bold', fontsize=11)

ax.set_title(f'Resource Usage Comparison: Agent {agent_a} vs Agent {agent_b}\n(Tokens in thousands)',
             fontweight='bold', fontsize=14)
ax.set_ylabel('Value', fontweight='bold', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(metrics, fontweight='bold', fontsize=11)
ax.legend(fontsize=11, edgecolor='black')
ax.set_facecolor('#f8f9fa')
ax.grid(True, alpha=0.3, linestyle='--', axis='y')

plt.tight_layout()
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_bar_resource_comparison.pdf')
    fig.savefig(filename, format='pdf', bbox_inches='tight')
    print(f"Saved plot to {filename}")
plt.show()

In [None]:
# Improvement summary visualization.
# Improvement is signed: (baseline - candidate) / baseline * 100, so a positive
# bar means Agent A used less than Agent B and a negative bar is a regression.
# Taking the absolute value would draw a regression as an improvement.
fig, ax = plt.subplots(figsize=(12, 6))

improvements = []
for metric_col in ('execution_time_seconds', 'total_tokens'):
    baseline_mean = agent_b_data[metric_col].mean()
    candidate_mean = agent_a_data[metric_col].mean()
    improvements.append((baseline_mean - candidate_mean) / baseline_mean * 100)

metrics_names = ['Execution Time', 'Total Tokens']

bars = ax.barh(metrics_names, improvements, color=[cmap(0.2), cmap(0.8)],
               edgecolor='black', linewidth=2, height=0.6, alpha=0.8)

# Add value labels just beyond the end of each bar, on the side it points to
for i, val in enumerate(improvements):
    if val >= 0:
        ax.text(val + 1, i, f'{val:+.1f}%', va='center', ha='left', fontweight='bold', fontsize=12)
    else:
        ax.text(val - 1, i, f'{val:+.1f}%', va='center', ha='right', fontweight='bold', fontsize=12)

# Zero line separates improvements from regressions
ax.axvline(0, color='black', linewidth=1)

ax.set_xlabel('Improvement (%)  [positive = Agent A lower, negative = regression]',
              fontweight='bold', fontsize=12)
ax.set_title(f'Agent {agent_a} Performance Improvement vs Agent {agent_b}',
             fontweight='bold', fontsize=14)
# Always include zero, and leave room for the value labels on both sides
x_lower = min(0, min(improvements) - 10)
x_upper = max(0, max(improvements)) + 10
ax.set_xlim(x_lower, x_upper)
ax.grid(True, alpha=0.3, linestyle='--', axis='x')
ax.set_facecolor('#f8f9fa')

plt.tight_layout()
if SAVE_IMAGES:
    plot_counter += 1
    filename = os.path.join(plots_dir, f'{plot_counter:03d}_improvement_summary.pdf')
    fig.savefig(filename, format='pdf', bbox_inches='tight')
    print(f"Saved plot to {filename}")
plt.show()

## Summary and Conclusions

In [None]:
# Generate summary report.
# Run counts and scenario/fault names are read from the loaded data rather than
# hard-coded, so the report always describes what was actually analysed.
print("\n" + "="*100)
print("ONE-SIDED HYPOTHESIS TEST SUMMARY REPORT")
print("="*100)

scenario_names = ', '.join(str(name) for name in experiments_df['scenario'].dropna().unique())
fault_names = ', '.join(str(name) for name in experiments_df['fault_name'].dropna().unique())

print(f"\nTest Configuration:")
print(f"  - Agent A (Candidate): {agent_a}")
print(f"  - Agent B (Baseline): {agent_b}")
print(f"  - Number of runs: {len(agent_a_data)} (Agent {agent_a}), {len(agent_b_data)} (Agent {agent_b})")
print(f"  - Significance level (α): {ALPHA}")
print(f"  - Test Type: ONE-SIDED (Agent A < Agent B = better)")
print(f"  - Scenario: {scenario_names} - {fault_names}")
print(f"  - Metrics: Execution Time, Total Tokens")

print(f"\nKey Findings:")
print("-" * 100)

# An empty results frame may have no columns at all; treat that as "nothing significant".
if 'significant' in hypothesis_df.columns:
    significant_results = hypothesis_df[hypothesis_df['significant'] == 'YES']
else:
    significant_results = hypothesis_df

if len(significant_results) > 0:
    a_is_better = significant_results['agent_a_better'].astype(bool)
    better_results = significant_results[a_is_better]
    worse_results = significant_results[~a_is_better]

    if len(better_results) > 0:
        print(f"\n✓ Metrics where Agent A is SIGNIFICANTLY BETTER (p < {ALPHA}):")
        for _, row in better_results.iterrows():
            improvement = abs((row['agent_a_mean'] - row['agent_b_mean']) / row['agent_b_mean'] * 100)
            print(f"  • {row['metric']}: {improvement:.1f}% improvement (p = {row['p_value']:.6f}, d = {row['cohens_d']:.3f})")

    if len(worse_results) > 0:
        print(f"\n✗ Metrics where Agent A is SIGNIFICANTLY WORSE (p < {ALPHA}):")
        for _, row in worse_results.iterrows():
            degradation = abs((row['agent_a_mean'] - row['agent_b_mean']) / row['agent_b_mean'] * 100)
            print(f"  • {row['metric']}: {degradation:.1f}% degradation (p = {row['p_value']:.6f}, d = {row['cohens_d']:.3f})")
else:
    print(f"\nNo statistically significant improvements found at α = {ALPHA}")
    print(f"Agent A does not perform significantly better than Agent B on the tested metrics.")

print(f"\nDetailed Performance Comparison:")
print("-" * 100)
for agent_name, role, agent_frame in (
    (agent_a, 'Candidate', agent_a_data),
    (agent_b, 'Baseline', agent_b_data),
):
    print(f"\nAgent {agent_name} ({role}):")
    print(f"  - Execution Time: {agent_frame['execution_time_seconds'].mean():.2f} ± {agent_frame['execution_time_seconds'].std():.2f} seconds")
    print(f"  - Total Tokens: {agent_frame['total_tokens'].mean():.0f} ± {agent_frame['total_tokens'].std():.0f}")
    print(f"  - Detection Accuracy: {agent_frame['eval_detection'].mean():.2%}")
    print(f"  - Localization Accuracy: {agent_frame['eval_localization'].mean():.2%}")
    print(f"  - Avg RCA Score: {agent_frame['eval_rca_score'].mean():.2f}/5.0")

# Signed relative change against the baseline: positive means Agent A used less.
print(f"\nRelative Performance (Agent A vs Agent B):")
time_improvement = (agent_b_data['execution_time_seconds'].mean() - agent_a_data['execution_time_seconds'].mean()) / agent_b_data['execution_time_seconds'].mean() * 100
token_improvement = (agent_b_data['total_tokens'].mean() - agent_a_data['total_tokens'].mean()) / agent_b_data['total_tokens'].mean() * 100
print(f"  - Execution Time: {time_improvement:+.2f}% (lower is better)")
print(f"  - Total Tokens: {token_improvement:+.2f}% (lower is better)")

print("\n" + "="*100)