In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt 

# Evaluation run to analyze; it names the sub-directory of `evals/` that is read.
benchmarkVersion = "test-all-2025-09-18-28tests"

# Use the /workspaces/promptpex/ checkout when that directory exists,
# otherwise fall back to the parent of the current directory.
rootDir = "/workspaces/promptpex/" if os.path.isdir("/workspaces/promptpex/") else ".."

evalsDir = rootDir + "/evals/" + benchmarkVersion + "/"


# Single-benchmark selection; overwritten by the assignments below.
benchmarks = ["speech-tag"]

# full list (eight benchmarks); overwritten by the final assignment below.
benchmarks = [
    "speech-tag",
    "classify-input-text",
    "text-to-p",
    "sentence-rewrite",
    "extract-names",
    "elements",
    "art-prompt",
    "shakespearean-writing-assistant",
]


# Selection in effect for the rest of the notebook.
# "prompt_generator" is currently disabled.
benchmarks = [
    "bayesian_games_29_7",
    "bullet_journaling_145_1",
    "canopy_management_298_8",
    "fancy_title_generator",
    "hearing_impairments_124_7",
    "housing_market_dynamics_338_1",
    "initial_public_offerings_ipos_70_9",
    "news_broadcasting_693_9",
    "real_time_analytics_609_2",
    "recruiter",
    "restaurant_owner",
    "sewing_951_7",
    "solr_search_engine",
    "speaker_identification_595_2",
    "startup_idea_generator",
    "tea_taster",
    "virtual_fitness_coach",
    "yes_or_no_answer",
    "speech-tag",
    "classify-input-text",
    "text-to-p",
    "sentence-rewrite",
    "extract-names",
    "elements",
    "art-prompt",
    "shakespearean-writing-assistant",
]

# Short benchmark names used in chart titles.
prettyBenchmarkNames = {
    "speech-tag": "speech-tag",
    "text-to-p": "text-to-p",
    "shakespearean-writing-assistant": "shakespeare",
    "sentence-rewrite": "sentence",
    "extract-names": "extract-names",
    "elements": "elements",
    "art-prompt": "art-prompt",
    "classify-input-text": "classify",
}

# Short metric names used in table headers.
prettyMetrics = {
    "tests compliant": "prompt ok/err",
    "system_compliant": "prompt only",
    "rules_system_with_input_compliant": "prompt/rule/input",
}

def parse_metric(val):
    """Convert a raw metric cell to a float, mapping unusable values to 0.0.

    Strings are stripped of surrounding whitespace; if the result ends with
    ``%`` the percent signs at both ends are removed (``"85%"`` -> ``85.0``,
    no division by 100).  Anything that ``float()`` cannot convert (``"--"``,
    ``""``, ``None``, ...) and any NaN result yields ``0.0``.

    Args:
        val: A cell value, typically a string or number read from a CSV.

    Returns:
        float: The parsed value, or ``0.0`` when the value is missing,
        unparsable, or NaN.
    """
    if isinstance(val, str):
        val = val.strip()
        if val.endswith('%'):
            val = val.strip('%')
    try:
        number = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as "no value"; interrupts and
        # other unrelated errors are allowed to propagate.
        return 0.0
    # The NaN check runs on every path, so "NaN%" is handled like "NaN".
    return 0.0 if np.isnan(number) else number

def get_metrics_start_col(df):
    """Return the positional index of the first metric column of ``df``.

    Side effect: whitespace is stripped from ``df``'s column names in place.
    Callers in this notebook index ``df`` by the stripped names afterwards.

    The index is found from the first rule below whose two columns are both
    present; the result is the anchor column's position plus 2 (i.e. the
    anchor column and the column after it are skipped):

    1. guard ``'tests valid compliant'``, anchor ``'tests valid'``
    2. guard ``'tests negative'``, anchor ``'tests negative compliant'``
    3. ``'tests positive'`` (guard and anchor)

    If no rule applies, the result is the position of ``'model'`` plus 2, or
    0 when there is no ``'model'`` column.

    NOTE(review): the guard/anchor pairs and the fixed ``+ 2`` offset are kept
    exactly as they were; confirm they match the overview.csv column layout.

    Args:
        df: DataFrame read from an overview CSV.

    Returns:
        int: Index into ``df.columns`` at which metric columns start.
    """
    df.columns = df.columns.str.strip()  # Strip whitespace from column names
    columns = df.columns

    # (guard, anchor) pairs.  A rule is used only when both names exist, so a
    # header that has the guard but lacks the anchor falls through to the next
    # rule instead of raising KeyError from get_loc().
    rules = (
        ('tests valid compliant', 'tests valid'),
        ('tests negative', 'tests negative compliant'),
        ('tests positive', 'tests positive'),
    )
    for guard, anchor in rules:
        if guard in columns and anchor in columns:
            return columns.get_loc(anchor) + 2

    # Fallback: assume metrics start after 'model' column
    return columns.get_loc('model') + 2 if 'model' in columns else 0

    
def analyze_benchmark_metrics(benchmark, evalsDir, prettyBenchmarkNames):
    """Print and plot every metric column of one benchmark's ``overview.csv``.

    The file is read from ``<evalsDir>/<benchmark>/<benchmark>/overview.csv``.
    Metric columns are those from the index returned by
    ``get_metrics_start_col`` onwards, and each cell is converted with
    ``parse_metric``.  The table is printed and then drawn as a grouped bar
    chart with one group per model and one bar per metric.

    Args:
        benchmark: Benchmark directory name.
        evalsDir: Directory containing one sub-directory per benchmark.
        prettyBenchmarkNames: Mapping from benchmark name to the short name
            used in the chart title; benchmarks not in it use their own name.

    Returns:
        None.  When the CSV is missing or has no metric columns a warning is
        printed and nothing is plotted.
    """
    # Build the path to the overview.csv for the given benchmark
    csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview.csv")
    if not os.path.isfile(csv_path):
        # Same policy as the other loaders in this notebook: warn and move on,
        # so one absent benchmark does not abort the whole loop.
        print(f"Warning: {csv_path} not found, skipping.")
        return

    df = pd.read_csv(csv_path)

    # Also strips whitespace from the column names of `df`.
    start_col = get_metrics_start_col(df)
    metrics = list(df.columns[start_col:])
    if not metrics:
        # Without this guard the bar width below would divide by zero.
        print(f"Warning: no metric columns found in {csv_path}, skipping.")
        return

    # Extract model names and metrics columns
    models = df['model']
    metrics_table = df[['model'] + metrics].copy()

    for metric in metrics:
        metrics_table[metric] = metrics_table[metric].apply(parse_metric)

    print(f"Metrics by Model for benchmark '{benchmark}':")
    print(metrics_table.to_string(index=False))

    # Plot grouped bar chart
    x = np.arange(len(models))
    width = 0.8 / len(metrics)  # total width for all bars per group

    fig, ax = plt.subplots(figsize=(14, 6))
    for i, metric in enumerate(metrics):
        ax.bar(x + i*width, metrics_table[metric], width, label=metric)

    # Centre each tick label under its group of bars.
    ax.set_xticks(x + width*(len(metrics)-1)/2)
    ax.set_xticklabels(models, rotation=20)
    ax.set_ylabel('Metric Value')
    ax.set_title(f"Model Metrics for {prettyBenchmarkNames.get(benchmark, benchmark)}")
    ax.legend(loc='best', fontsize='small', ncol=2)
    plt.tight_layout()
    plt.show()
    # This function is called once per benchmark; release the figure explicitly.
    plt.close(fig)

# Run the per-benchmark report for every benchmark in the current selection.
for benchmark in benchmarks:
    # Announce the benchmark, then print its metric table and draw its chart.
    print(f"Analyzing metrics for benchmark: {benchmark}")
    analyze_benchmark_metrics(benchmark, evalsDir, prettyBenchmarkNames)

In [None]:

def collect_metrics(benchmarks, evalsDir):
    """Load the metric columns of every benchmark's ``overview.csv``.

    Benchmarks whose CSV does not exist are reported with a warning and left
    out of the result.

    Returns:
        tuple: ``(data, models, metrics)`` where ``data`` is
        ``{benchmark: {model: {metric: value}}}`` and ``models`` / ``metrics``
        are the sorted names seen across all loaded benchmarks.
    """
    per_benchmark = {}
    seen_metrics = set()
    seen_models = set()

    for benchmark in benchmarks:
        csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview.csv")
        if not os.path.isfile(csv_path):
            print(f"Warning: {csv_path} not found, skipping.")
            continue

        df = pd.read_csv(csv_path)

        # The helper strips the header names in place, so it has to run
        # before the columns are sliced.
        first_metric = get_metrics_start_col(df)
        metric_names = list(df.columns[first_metric:])
        seen_metrics.update(metric_names)

        by_model = per_benchmark[benchmark] = {}
        for _, record in df.iterrows():
            name = record['model']
            seen_models.add(name)
            parsed = by_model.setdefault(name, {})
            for metric in metric_names:
                parsed[metric] = parse_metric(record[metric])

    return per_benchmark, sorted(seen_models), sorted(seen_metrics)

def compute_model_metric_averages(all_data, all_models, all_metrics):
    """Average each metric per model over every benchmark in ``all_data``.

    A (benchmark, model, metric) combination that is absent from ``all_data``
    contributes ``0.0`` to the average.

    Returns:
        dict: ``{model: {metric: mean value}}``.
    """
    averages = {}
    for model in all_models:
        per_metric = {}
        for metric in all_metrics:
            # One sample per benchmark, in the iteration order of `all_data`.
            samples = [
                all_data[benchmark].get(model, {}).get(metric, 0.0)
                for benchmark in all_data
            ]
            per_metric[metric] = np.mean(samples)
        averages[model] = per_metric
    return averages

def print_metric_table(model_metric_avg):
    """Print a tab-separated table of average metrics, one row per model.

    Column order is taken from the first model's metric dict, and column
    titles are looked up in ``prettyMetrics`` (falling back to the raw name).
    Values are formatted with two decimals.

    Args:
        model_metric_avg: ``{model: {metric: average}}``.

    Returns:
        None.  For an empty mapping a short notice is printed instead of a
        table.
    """
    if not model_metric_avg:
        # next(iter(...)) below would raise StopIteration on an empty mapping.
        print("No data found for metric table.")
        return

    models = list(model_metric_avg)
    metrics = list(next(iter(model_metric_avg.values())))
    print("Average Metrics by Model:")
    header = ["Model"] + [prettyMetrics.get(m, m) for m in metrics]

    print("\t".join(header))
    for model in models:
        # str() keeps non-string model labels from breaking the join.
        row = [str(model)] + [f"{model_metric_avg[model][metric]:.2f}" for metric in metrics]
        print("\t".join(row))

def plot_grouped_bar_chart(model_metric_avg):
    """Draw average metrics as a grouped bar chart (groups = models).

    Metric order is taken from the first model's metric dict; each metric is
    one bar series and one legend entry.

    Args:
        model_metric_avg: ``{model: {metric: average}}``.

    Returns:
        None.  When there are no models or no metrics a notice is printed and
        no figure is created.
    """
    if not model_metric_avg:
        # next(iter(...)) below would raise StopIteration on an empty mapping.
        print("No data found for plotting.")
        return

    models = list(model_metric_avg)
    metrics = list(next(iter(model_metric_avg.values())))
    if not metrics:
        # The bar width below divides by the number of metrics.
        print("No data found for plotting.")
        return

    x = np.arange(len(models))
    width = 0.8 / len(metrics)
    fig, ax = plt.subplots(figsize=(14, 6))
    for i, metric in enumerate(metrics):
        values = [model_metric_avg[model][metric] for model in models]
        ax.bar(x + i*width, values, width, label=metric)
    # Centre each tick label under its group of bars.
    ax.set_xticks(x + width*(len(metrics)-1)/2)
    ax.set_xticklabels(models, rotation=20)
    ax.set_ylabel('Average Metric Value')
    ax.set_title('Average Model Metrics Across Benchmarks')
    ax.legend(loc='best', fontsize='small', ncol=2)
    plt.tight_layout()
    plt.show()



# Load every selected benchmark, average each metric per model across the
# benchmarks that were found, then show the averages as a table and a chart.
all_data, all_models, all_metrics = collect_metrics(benchmarks, evalsDir)
model_metric_avg = compute_model_metric_averages(all_data, all_models, all_metrics)
print_metric_table(model_metric_avg)
plot_grouped_bar_chart(model_metric_avg)

In [None]:
def print_benchmark_model_metrics_table(benchmarks, evalsDir, columns_of_interest):
    """Print a fixed-width table with one row per (benchmark, model).

    Each cell is truncated or padded to 18 characters.  Column titles are
    looked up in ``prettyMetrics`` (falling back to the raw column name) and
    values are converted with ``parse_metric`` and shown with two decimals.
    A column that a benchmark's CSV does not contain is shown as ``0.00``.
    Benchmarks whose ``overview.csv`` does not exist are skipped silently.

    Args:
        benchmarks: Benchmark directory names.
        evalsDir: Directory containing one sub-directory per benchmark.
        columns_of_interest: Column names (whitespace-stripped) to print.
    """
    width = 18

    def fit(val):
        """Truncate or right-pad ``val`` to exactly ``width`` characters."""
        return str(val)[:width].ljust(width)

    header = ["Benchmark", "Model"] + [
        prettyMetrics.get(col, col) for col in columns_of_interest
    ]
    print("".join(fit(h) for h in header))

    for benchmark in benchmarks:
        csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview.csv")
        if not os.path.isfile(csv_path):
            continue
        df = pd.read_csv(csv_path)
        # Header names may carry surrounding whitespace.  The other loaders in
        # this notebook strip it; without the same step here, row.get() would
        # miss such columns and report 0.00.
        df.columns = df.columns.str.strip()

        for _, row in df.iterrows():
            values = [f"{parse_metric(row.get(col, 0)):.2f}" for col in columns_of_interest]
            cells = [benchmark, row['model']] + values
            print("".join(fit(cell) for cell in cells))

# Load the first benchmark's overview.csv; its header decides which columns
# are printed for every benchmark.
# NOTE: `df` stays defined at notebook scope and a later cell calls
# get_metrics_start_col(df) on it again without reloading the file.
csv_path = os.path.join(evalsDir, benchmarks[0], benchmarks[0], "overview.csv")
df = pd.read_csv(csv_path)

# Index of the first metric column (this call also strips the header names of `df`).
start_col = get_metrics_start_col(df)

# Alternative selection that also includes the count columns:
# columns_of_interest = ["tests", "tests compliant", "errors", "tests compliance unknown"] + list(df.columns[start_col:])
columns_of_interest = list(df.columns[start_col:])

print_benchmark_model_metrics_table(benchmarks, evalsDir, columns_of_interest)

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def collect_and_sum_benchmark_metrics(benchmarks, evalsDir, columns_of_interest):
    """Read the selected columns per model and total them per benchmark.

    Every requested benchmark gets an entry in the totals (all zeros when its
    ``overview.csv`` is missing); only benchmarks that were actually read
    appear in the per-model data.  A column absent from a CSV counts as 0.

    Returns:
        tuple: ``(data, sums)`` with ``data[benchmark][model][column]`` and
        ``sums[benchmark][column]``.
    """
    data = {}
    sums = {}
    for name in benchmarks:
        sums[name] = dict.fromkeys(columns_of_interest, 0.0)

    for benchmark in benchmarks:
        csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview.csv")
        if not os.path.isfile(csv_path):
            print(f"Warning: {csv_path} not found, skipping.")
            continue

        frame = pd.read_csv(csv_path)
        frame.columns = frame.columns.str.strip()  # Strip whitespace from column names

        per_model = data[benchmark] = {}
        running = sums[benchmark]
        for _, row in frame.iterrows():
            parsed = per_model[row['model']] = {}
            for col in columns_of_interest:
                parsed[col] = parse_metric(row.get(col, 0))
                running[col] += parsed[col]
    return data, sums

def print_sums_table(sums, columns_of_interest):
    """Print per-benchmark totals as a tab-separated table (two decimals)."""
    header_cells = "\t".join(columns_of_interest)
    print("Benchmark\t" + header_cells)
    for bench in sums:
        totals = sums[bench]
        cells = [bench]
        cells.extend(f"{totals[col]:.2f}" for col in columns_of_interest)
        print("\t".join(cells))

def plot_sums_bar(sums, columns_of_interest):
    """Draw one bar chart per column showing its total for each benchmark."""
    names = list(sums)
    for col in columns_of_interest:
        heights = []
        for bench in names:
            heights.append(sums[bench][col])

        plt.figure(figsize=(10, 5))
        plt.bar(names, heights)
        plt.ylabel(col)
        plt.title(f"Sum of {col} by Benchmark")
        plt.xticks(rotation=20)
        plt.tight_layout()
        plt.show()

# Alternative fixed selection:
# columns_of_interest = ["errors", "tests compliance unknown"]
# Index of the first metric column.
# NOTE: `df` is not loaded in this cell; it is the frame an earlier cell read
# from the first benchmark's overview.csv, so that cell has to run first.
start_col = get_metrics_start_col(df)

# Alternative selection that also includes the count columns:
# columns_of_interest = ["tests", "tests compliant", "errors", "tests compliance unknown"] + list(df.columns[start_col:])
columns_of_interest = list(df.columns[start_col:])

# Total each selected column per benchmark, then print and plot the totals.
data, sums = collect_and_sum_benchmark_metrics(benchmarks, evalsDir, columns_of_interest)
print_sums_table(sums, columns_of_interest)
plot_sums_bar(sums, columns_of_interest)

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



def average_tests_per_model(benchmarks, evalsDir):
    """Return ``{benchmark: mean of the 'tests' column over its models}``.

    Benchmarks whose ``overview.csv`` is missing are reported with a warning
    and omitted; a CSV without rows yields ``0.0``.
    """
    result = {}
    for name in benchmarks:
        csv_path = os.path.join(evalsDir, name, name, "overview.csv")
        if os.path.isfile(csv_path):
            frame = pd.read_csv(csv_path)
            frame.columns = frame.columns.str.strip()
            # One parsed 'tests' value per model row.
            counts = frame['tests'].apply(parse_metric)
            result[name] = np.mean(counts) if len(counts) > 0 else 0.0
        else:
            print(f"Warning: {csv_path} not found, skipping.")
    return result

def print_avg_table(averages):
    """Print the per-benchmark averages as a two-column, tab-separated table."""
    print("Benchmark\tAverage Tests per Model")
    for bench in averages:
        print("{}\t{:.2f}".format(bench, averages[bench]))

def plot_avg_bar(averages):
    """Draw a bar chart of the average number of tests per model, one bar per benchmark."""
    labels = list(averages)
    heights = list(averages.values())

    plt.figure(figsize=(10, 5))
    plt.bar(labels, heights)
    plt.ylabel("Average Tests per Model")
    plt.title("Average Tests per Model by Benchmark")
    plt.xticks(rotation=20)
    plt.tight_layout()
    plt.show()



# Average the 'tests' column per benchmark, then show it as a table and a bar chart.
averages = average_tests_per_model(benchmarks, evalsDir)
print_avg_table(averages)
plot_avg_bar(averages)

In [None]:
def plot_grouped_barplot_by_benchmark_and_model(benchmarks, evalsDir, column_of_interest):
    """
    Create a grouped barplot showing a specific column as a function of benchmark and model.
    Groups are benchmarks, bars within groups are models.
    Includes an additional "Average" group showing averages across benchmarks for each model.

    Args:
        benchmarks: Benchmark directory names; each is read from
            ``<evalsDir>/<benchmark>/<benchmark>/overview.csv``.
        evalsDir: Directory containing one sub-directory per benchmark.
        column_of_interest: Name (whitespace-stripped) of the column to plot.

    Returns:
        None. Benchmarks whose CSV is missing, or lacks the "model" column or
        the requested column, are skipped with a warning. If nothing is left,
        a message is printed and no figure is drawn.

    Note:
        A model that does not appear in a benchmark is plotted as 0.0 there,
        and that 0.0 is included in the model's cross-benchmark average.
    """
    import os
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    
    # Data structure: {benchmark: {model: value}}
    data = {}
    all_models = set()
    
    for benchmark in benchmarks:
        csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview.csv")
        if not os.path.isfile(csv_path):
            print(f"Warning: {csv_path} not found, skipping.")
            continue
        df = pd.read_csv(csv_path)
        df.columns = df.columns.str.strip()
        
        if "model" not in df.columns or column_of_interest not in df.columns:
            print(f"Warning: Required columns not found in {csv_path}, skipping.")
            continue
            
        data[benchmark] = {}
        for _, row in df.iterrows():
            model = row["model"]
            val = parse_metric(row[column_of_interest])
            data[benchmark][model] = val
            all_models.add(model)
    
    # Sorted so every benchmark group lists the models in the same order.
    all_models = sorted(all_models)
    # Keep the caller's benchmark order, restricted to benchmarks that loaded.
    benchmarks_with_data = [b for b in benchmarks if b in data]
    
    if not benchmarks_with_data or not all_models:
        print("No data found for plotting.")
        return
    
    # Build data matrix: rows=benchmarks, columns=models
    values = []
    for benchmark in benchmarks_with_data:
        row = []
        for model in all_models:
            # 0.0 stands in for a model that is absent from this benchmark.
            row.append(data.get(benchmark, {}).get(model, 0.0))
        values.append(row)
    values = np.array(values)  # shape: (num_benchmarks, num_models)
    
    # Calculate averages across benchmarks for each model
    model_averages = []
    for i, model in enumerate(all_models):
        # Get values for this model across all benchmarks
        model_values = values[:, i]
        # Plain mean over every benchmark row, including the 0.0 placeholders
        # used for missing (benchmark, model) pairs.
        avg = np.mean(model_values)
        model_averages.append(avg)
    
    # Add the average row to the data
    all_values = np.vstack([values, model_averages])
    all_labels = benchmarks_with_data + ["Average"]
    
    # Create the grouped bar plot
    x = np.arange(len(all_labels))  # positions for all groups including average
    width = 0.8 / len(all_models)  # width of individual bars
    
    fig, ax = plt.subplots(figsize=(18, 8))
    
    # Create bars for each model
    colors = plt.cm.Set3(np.linspace(0, 1, len(all_models)))  # Use distinct colors
    for i, model in enumerate(all_models):
        # Centre the run of model bars on the group's tick position.
        offset = (i - (len(all_models) - 1) / 2) * width
        bars = ax.bar(x + offset, all_values[:, i], width, label=model, alpha=0.8, color=colors[i])
        
        # Highlight the average bars with different styling.
        # `bars` has one rectangle per group including the trailing "Average"
        # group, so bars[-1] is this model's average bar.
        if len(bars) > len(benchmarks_with_data):
            bars[-1].set_alpha(1.0)  # Make average bar more opaque
            bars[-1].set_edgecolor('black')  # Add black border to average bar
            bars[-1].set_linewidth(2)
    
    # Add a vertical separator line before the average group
    if len(all_labels) > 1:
        # Halfway between the last benchmark group and the "Average" group.
        separator_x = len(benchmarks_with_data) - 0.5
        ax.axvline(x=separator_x, color='gray', linestyle='--', alpha=0.7, linewidth=1)
    
    # Customize the plot
    ax.set_xlabel('Benchmark')
    ax.set_ylabel(column_of_interest)
    ax.set_title(f'{column_of_interest} by Benchmark and Model (with Cross-Benchmark Averages)')
    ax.set_xticks(x)
    ax.set_xticklabels(all_labels, rotation=45, ha='right')
    # Legend is anchored outside the axes, to the right of the plot area.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3, axis='y')
    
    # Add text annotation for the average section
    if len(all_labels) > 1:
        # x = len(benchmarks_with_data) is the tick position of the "Average" group.
        ax.text(len(benchmarks_with_data), ax.get_ylim()[1] * 0.95, 'Cross-Benchmark\nAverages', 
                ha='center', va='top', fontsize=10, style='italic', alpha=0.7)
    
    plt.tight_layout()
    plt.show()

# Column plotted when the first benchmark's CSV cannot be inspected below.
# You can change this to any column that exists in your data
column_to_plot = "tests compliant"

# Inspect the header of the first benchmark's overview.csv (if the file exists)
# to list the available columns and pick the column to plot.
csv_path = os.path.join(evalsDir, benchmarks[0], benchmarks[0], "overview.csv")
if os.path.isfile(csv_path):
    df_sample = pd.read_csv(csv_path)
    df_sample.columns = df_sample.columns.str.strip()
    print("Available columns:")
    print(df_sample.columns.tolist())
    print()
    
    # Preference order: "tests compliant", then "accuracy with eval",
    # then the first metric column reported by get_metrics_start_col.
    if "tests compliant" in df_sample.columns:
        column_to_plot = "tests compliant"
    elif "accuracy with eval" in df_sample.columns:
        column_to_plot = "accuracy with eval"
    else:
        # Use the first metric column after standard columns
        start_col = get_metrics_start_col(df_sample)
        
        # Keep the default when the start index is past the last column.
        if start_col < len(df_sample.columns):
            column_to_plot = df_sample.columns[start_col]

print(f"Plotting column: {column_to_plot}")
plot_grouped_barplot_by_benchmark_and_model(benchmarks, evalsDir, column_to_plot)

In [None]:
def compare_baseline_vs_main_metrics(benchmark, evalsDir, prettyBenchmarkNames=None):
    """Compare one benchmark's ``overview.csv`` with its ``overview-baseline.csv``.

    Both files are read from ``<evalsDir>/<benchmark>/<benchmark>/``.  Only
    models present in both files and metric columns present in both files
    (columns from ``get_metrics_start_col`` onwards) are compared; cells are
    converted with ``parse_metric``.  The function then

    1. draws, per metric, baseline and main values side by side for each model,
    2. draws, per metric, the difference ``main - baseline`` for each model
       (green = increase, red = decrease, gray = unchanged), and
    3. prints a tab-separated summary table with baseline (B), main (M) and
       difference (delta) columns.

    Args:
        benchmark: Benchmark directory name.
        evalsDir: Directory containing one sub-directory per benchmark.
        prettyBenchmarkNames: Optional mapping from benchmark name to the
            short name used in plot titles.

    Returns:
        None.  If either file is missing, or there are no common models or
        metrics, a message is printed and nothing is plotted.
    """
    def annotate(ax, bars, labels, flip_negative=False):
        """Write one label at the end of each bar.

        With ``flip_negative`` the label of a negative bar is placed below
        the bar end instead of above it.
        """
        for bar, label in zip(bars, labels):
            height = bar.get_height()
            below = flip_negative and height < 0
            ax.annotate(label,
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, -15 if below else 3),  # offset in points
                        textcoords="offset points",
                        ha='center', va='top' if below else 'bottom', fontsize=8)

    # Build paths to both CSV files
    main_csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview.csv")
    baseline_csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview-baseline.csv")

    # Check if both files exist
    if not os.path.isfile(main_csv_path):
        print(f"Warning: Main file {main_csv_path} not found, skipping.")
        return
    if not os.path.isfile(baseline_csv_path):
        print(f"Warning: Baseline file {baseline_csv_path} not found, skipping.")
        return

    # Read both CSV files
    main_df = pd.read_csv(main_csv_path)
    baseline_df = pd.read_csv(baseline_csv_path)

    # Strip whitespace from column names
    main_df.columns = main_df.columns.str.strip()
    baseline_df.columns = baseline_df.columns.str.strip()

    # Find common models
    common_models = sorted(set(main_df['model']) & set(baseline_df['model']))
    if not common_models:
        print(f"No common models found between main and baseline for benchmark '{benchmark}'")
        return

    # Find common metric columns (excluding 'model' and other non-metric columns)
    main_start_col = get_metrics_start_col(main_df)
    baseline_start_col = get_metrics_start_col(baseline_df)

    main_metrics = set(main_df.columns[main_start_col:])
    baseline_metrics = set(baseline_df.columns[baseline_start_col:])
    common_metrics = sorted(main_metrics & baseline_metrics)

    if not common_metrics:
        print(f"No common metrics found between main and baseline for benchmark '{benchmark}'")
        print(f"Main metrics: {list(main_metrics)}")
        print(f"Baseline metrics: {list(baseline_metrics)}")
        return

    print(f"Comparing benchmark '{benchmark}' with {len(common_models)} models and {len(common_metrics)} metrics")
    print(f"Common models: {common_models}")
    print(f"Common metrics: {common_metrics}")

    # Prepare data for comparison: {model: {'main': {...}, 'baseline': {...}}}
    comparison_data = {}
    for model in common_models:
        main_rows = main_df[main_df['model'] == model]
        baseline_rows = baseline_df[baseline_df['model'] == model]
        if main_rows.empty or baseline_rows.empty:
            continue
        # When a model occurs more than once, its first row is used.
        main_row = main_rows.iloc[0]
        baseline_row = baseline_rows.iloc[0]
        comparison_data[model] = {
            'main': {metric: parse_metric(main_row[metric]) for metric in common_metrics},
            'baseline': {metric: parse_metric(baseline_row[metric]) for metric in common_metrics},
        }

    n_metrics = len(common_metrics)
    n_models = len(comparison_data)

    if n_metrics == 0 or n_models == 0:
        print("No data to plot")
        return

    title_name = prettyBenchmarkNames.get(benchmark, benchmark) if prettyBenchmarkNames else benchmark
    models = list(comparison_data)
    x = np.arange(len(models))

    # Side-by-side comparison: one subplot per metric
    fig, axes = plt.subplots(n_metrics, 1, figsize=(12, 4 * n_metrics))
    if n_metrics == 1:
        axes = [axes]  # plt.subplots returns a bare Axes for a single subplot

    width = 0.35
    for ax, metric in zip(axes, common_metrics):
        main_values = [comparison_data[model]['main'][metric] for model in models]
        baseline_values = [comparison_data[model]['baseline'][metric] for model in models]

        bars1 = ax.bar(x - width/2, baseline_values, width, label='Baseline', alpha=0.8, color='lightcoral')
        bars2 = ax.bar(x + width/2, main_values, width, label='Main', alpha=0.8, color='skyblue')

        ax.set_xlabel('Model')
        ax.set_ylabel(metric)
        ax.set_title(f'{metric} - Baseline vs Main ({title_name})')
        ax.set_xticks(x)
        ax.set_xticklabels(models, rotation=45, ha='right')
        ax.legend()
        ax.grid(True, alpha=0.3, axis='y')

        # Add value labels on bars
        annotate(ax, bars1, [f'{value:.1f}' for value in baseline_values])
        annotate(ax, bars2, [f'{value:.1f}' for value in main_values])

    plt.tight_layout()
    plt.show()
    plt.close(fig)

    # Difference analysis: one subplot per metric
    fig, axes = plt.subplots(n_metrics, 1, figsize=(12, 4 * n_metrics))
    if n_metrics == 1:
        axes = [axes]

    for ax, metric in zip(axes, common_metrics):
        differences = [
            comparison_data[model]['main'][metric] - comparison_data[model]['baseline'][metric]
            for model in models
        ]

        # Color bars based on positive/negative change
        colors = ['green' if diff > 0 else 'red' if diff < 0 else 'gray' for diff in differences]

        # Bars are placed at explicit positions and the ticks are fixed with
        # set_xticks before labelling, as in the first figure; calling
        # set_xticklabels without fixed tick positions is discouraged.
        bars = ax.bar(x, differences, color=colors, alpha=0.7)

        ax.set_xlabel('Model')
        ax.set_ylabel(f'{metric} Difference (Main - Baseline)')
        ax.set_title(f'{metric} - Improvement Analysis ({title_name})')
        ax.set_xticks(x)
        ax.set_xticklabels(models, rotation=45, ha='right')
        ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        ax.grid(True, alpha=0.3, axis='y')

        # Add value labels on bars
        annotate(ax, bars, [f'{diff:+.1f}' for diff in differences], flip_negative=True)

    plt.tight_layout()
    plt.show()
    plt.close(fig)

    # Print summary table
    print(f"\nDetailed Comparison for '{benchmark}':")
    print("=" * 80)

    def tagged(tag):
        """Header cells of at most 10 characters that always keep their tag."""
        # The metric name is shortened *before* the tag is appended, so the
        # baseline / main / delta columns stay distinguishable.
        return [f"{metric[:6]} ({tag})" for metric in common_metrics]

    header = ["Model"] + tagged("B") + tagged("M") + tagged("\u0394")
    print("\t".join(header))

    for model, values in comparison_data.items():
        row = [str(model)[:10]]
        # Baseline values
        row.extend(f"{values['baseline'][metric]:.1f}" for metric in common_metrics)
        # Main values
        row.extend(f"{values['main'][metric]:.1f}" for metric in common_metrics)
        # Differences
        row.extend(f"{values['main'][metric] - values['baseline'][metric]:+.1f}" for metric in common_metrics)
        print("\t".join(row))

def analyze_all_baseline_comparisons(benchmarks, evalsDir, prettyBenchmarkNames=None):
    """
    Run the baseline vs main comparison for each benchmark in turn.

    A banner is printed before each benchmark. An exception raised while
    comparing one benchmark is reported and the loop moves on to the next.
    """
    rule = "=" * 60
    for benchmark in benchmarks:
        print(f"\n{rule}")
        print(f"Analyzing baseline comparison for: {benchmark}")
        print(rule)
        try:
            compare_baseline_vs_main_metrics(benchmark, evalsDir, prettyBenchmarkNames)
        except Exception as err:
            print(f"Error analyzing {benchmark}: {err!s}")

In [None]:
# Run the baseline-vs-main comparison for the single "art-prompt" benchmark.
print("Testing baseline comparison analysis with art-prompt benchmark:")
compare_baseline_vs_main_metrics("art-prompt", evalsDir, prettyBenchmarkNames)