In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import sys
import importlib

# Put the working directory first on sys.path so the local analyze_metrics.py is importable.
sys.path.insert(0, os.getcwd())

# Import and then reload: the reload picks up edits made to analyze_metrics.py
# after the kernel first imported it.
import analyze_metrics
importlib.reload(analyze_metrics)
print("Module imported and reloaded successfully!")

# Bind the helpers used by the cells below as bare names. This runs after the
# reload, so every name refers to the freshly reloaded definition.
from analyze_metrics import (
    # Core metric helpers
    parse_metric,
    get_metrics_start_col,
    analyze_benchmark_metrics,
    collect_metrics,
    compute_model_metric_averages,
    print_metric_table,
    plot_grouped_bar_chart,
    collect_and_sum_benchmark_metrics,
    print_sums_table,
    plot_sums_bar,
    average_tests_per_model,
    print_avg_table,
    plot_avg_bar,
    get_default_benchmarks,
    plot_grouped_barplot_by_benchmark_and_model,
    plot_baseline_vs_main_metrics_by_benchmark,
    # Single-model and plot-results helpers
    plot_baseline_vs_main_metrics_by_benchmark_single_model,
    parse_percentage,
    generate_plot_results_style_analysis,
    # Private plotting helpers; named explicitly because they start with "_"
    _plot_non_compliance_by_benchmark,
    _plot_rule_vs_inverse_rule,
    _plot_promptpex_vs_baseline,
    _plot_test_validity,
    _plot_rules_count,
)

print("All function aliases created successfully!")

# Evaluation run to analyze, given as a path below <rootDir>/evals/.
benchmarkVersion = "test-all-2025-09-29-paper/eval"

# Use the fixed checkout location when it exists; otherwise fall back to the
# parent of the current working directory.
rootDir = "/home/zorn/promptpex" if os.path.isdir("/home/zorn/promptpex") else ".."

# The empty last element leaves a trailing "/" on evalsDir.
evalsDir = "/".join((rootDir, "evals", benchmarkVersion, ""))

# Ask analyze_metrics for the benchmark list instead of hard-coding one.
# NOTE(review): presumably this scans evalsDir for benchmark folders - confirm in analyze_metrics.
print("Detecting available benchmarks...")
benchmarks = get_default_benchmarks(evalsDir)
print(f"Found {len(benchmarks)} available benchmarks:")
for position, benchmark in enumerate(benchmarks, start=1):
    print(f"  {position:2d}. {benchmark}")

# For a quicker run, restrict the list here, e.g. benchmarks = benchmarks[:5]
print(f"Using {len(benchmarks)} benchmarks for analysis")

# verbose gates the per-benchmark chart cell further down: True generates the
# individual charts, False skips them.
verbose = False
chart_status = "Charts will be generated" if verbose else "Charts will be skipped"
print(f"Verbose mode: {verbose} - {chart_status} for individual benchmarks")

# Benchmark folder name -> short label. Passed to analyze_benchmark_metrics
# in the individual-benchmark cell.
prettyBenchmarkNames = {
    "speech-tag": "speech-tag",
    "text-to-p": "text-to-p",
    "shakespearean-writing-assistant": "shakespeare",
    "sentence-rewrite": "sentence",
    "extract-names": "extract-names",
    "elements": "elements",
    "art-prompt": "art-prompt",
    "classify-input-text": "classify",
}

# Metric column name -> short label. Passed to print_metric_table.
prettyMetrics = {
    "tests compliant": "prompt ok/err",
    "system_compliant": "prompt only",
    "rules_system_with_input_compliant": "prompt/rule/input",
}

# Preview the first available benchmark: list its columns and show a small
# random sample of rows from its overview.csv.
if benchmarks:
    benchmark = benchmarks[0]
    # os.path.join avoids the doubled "/" that string formatting produces,
    # since evalsDir already ends with a slash.
    csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview.csv")

    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path)
        # Strip header whitespace, as the later cells do before using column names.
        df.columns = df.columns.str.strip()
        print(f"Metrics columns for {benchmark}:", df.columns.tolist())

        # The fixed seed keeps the preview reproducible; min() copes with files
        # that have fewer than 10 rows.
        df_sample = df.sample(n=min(10, len(df)), random_state=42)
        # Show the sample explicitly: a bare expression inside an if-block is
        # not rendered by the notebook.
        print(df_sample)
    else:
        print(f"No overview.csv found for {benchmark}")
else:
    print("No benchmarks found!")

In [None]:
# Per-benchmark analysis. Charts are produced only when verbose is True;
# otherwise a short notice explains how to enable them.
banner = "=" * 60

if not verbose:
    print(banner)
    print("INDIVIDUAL BENCHMARK ANALYSIS (SKIPPED - VERBOSE=FALSE)")
    print(banner)
    print(f"Skipping individual chart generation for {len(benchmarks)} benchmarks")
    print("To enable charts, set verbose = True in the first cell")
else:
    print("\n" + banner)
    print("INDIVIDUAL BENCHMARK ANALYSIS (WITH CHARTS)")
    print(banner)
    for benchmark in benchmarks:
        print(f"Analyzing benchmark: {benchmark}")
        # evalsDir is passed twice; presumably the last argument is where plots
        # are saved - TODO confirm against analyze_metrics.
        analyze_benchmark_metrics(benchmark, evalsDir, prettyBenchmarkNames, evalsDir)

In [None]:
# Gather metric data for every benchmark under evalsDir.
# NOTE(review): the structure of the three returned values is defined in
# analyze_metrics - confirm there before relying on it.
all_data, all_models, all_metrics = collect_metrics(benchmarks, evalsDir)

# Reduce the collected data to averages
# (presumably one value per model/metric pair - verify in analyze_metrics).
model_metric_avg = compute_model_metric_averages(all_data, all_models, all_metrics)

# Print the averages; prettyMetrics supplies the short metric labels.
print_metric_table(model_metric_avg, prettyMetrics)

# Chart the averages. evalsDir is passed twice; presumably the second one is
# the directory the plot is saved to - TODO confirm.
plot_grouped_bar_chart(model_metric_avg, evalsDir, evalsDir)

In [None]:
# Sum metrics across benchmarks. The set of metric columns is taken from the
# first benchmark's overview.csv and reused for all of them.
if benchmarks:
    csv_path = os.path.join(evalsDir, benchmarks[0], benchmarks[0], "overview.csv")
    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path)
        # Headers may carry stray whitespace; normalise before slicing.
        df.columns = [column_name.strip() for column_name in df.columns]

        # Everything from the index returned by get_metrics_start_col onward is
        # treated as a metric column.
        start_col = get_metrics_start_col(df)
        columns_of_interest = df.columns[start_col:].tolist()

        print("Computing benchmark sums...")
        data, sums = collect_and_sum_benchmark_metrics(benchmarks, evalsDir, columns_of_interest)
        print_sums_table(sums, columns_of_interest)
        # evalsDir is passed twice; presumably the second is the save location - TODO confirm.
        plot_sums_bar(sums, columns_of_interest, evalsDir, evalsDir)

In [None]:
# Average number of tests per model: compute, print as a table, then chart.
print("Computing average tests per model...")
# NOTE(review): the shape of `averages` is defined by analyze_metrics - confirm there.
averages = average_tests_per_model(benchmarks, evalsDir)
print_avg_table(averages)
# evalsDir is passed twice; presumably the second is the directory the plot is saved to - TODO confirm.
plot_avg_bar(averages, evalsDir, evalsDir)

In [None]:
# Grouped bar plots per benchmark and model.
# The column to plot is chosen from the first benchmark's overview.csv:
# "tests compliant" if present, else the accuracy column, else the third column.
column_to_plot = None
# Defined up front so the accuracy check at the bottom also works when no
# overview.csv could be read (previously a NameError in that case).
available_columns = []
accuracy_column = "accuracy with azure:o4-mini_2025-04-16"

# Check what columns are available
if benchmarks:
    csv_path = os.path.join(evalsDir, benchmarks[0], benchmarks[0], "overview.csv")
    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path)
        df.columns = df.columns.str.strip()
        available_columns = df.columns.tolist()
        print(f"Available columns: {available_columns}")

        # Try to find a suitable column to plot
        if "tests compliant" in available_columns:
            column_to_plot = "tests compliant"
        elif accuracy_column in available_columns:
            column_to_plot = accuracy_column
        elif len(available_columns) > 2:  # model + at least one metric
            column_to_plot = available_columns[2]  # Skip 'model' and possibly other non-metric columns

if not column_to_plot:
    column_to_plot = "tests compliant"
    print("Using default column: tests compliant")

print(f"Plotting column: {column_to_plot}")
plot_grouped_barplot_by_benchmark_and_model(benchmarks, evalsDir, column_to_plot, evalsDir, show_error_bars=False)

# Also plot accuracy when it is available, unless it was already the column
# plotted above (avoids producing the same chart twice).
if accuracy_column in available_columns and column_to_plot != accuracy_column:
    column_to_plot = accuracy_column
    print(f"Plotting column: {column_to_plot}")
    plot_grouped_barplot_by_benchmark_and_model(benchmarks, evalsDir, column_to_plot, evalsDir, show_error_bars=False)

In [None]:
# Baseline comparison analysis: compare overview.csv against
# overview-baseline.csv across benchmarks, for "tests compliant" and, when
# present, the first accuracy column.
print("=" * 60)
print("BASELINE COMPARISON ANALYSIS")
print("=" * 60)

# Reset on every run so the single-model cell below never picks up a stale
# list left over from an earlier execution.
accuracy_cols = []

# Baseline data counts as present if any benchmark has an overview-baseline.csv.
# All benchmarks are checked, so "No baseline data found" below is accurate.
has_baseline = False
for benchmark in benchmarks:
    baseline_csv_path = os.path.join(evalsDir, benchmark, benchmark, "overview-baseline.csv")
    if os.path.isfile(baseline_csv_path):
        has_baseline = True
        print(f"Found baseline data for: {benchmark}")
        break

if has_baseline:
    print("Running baseline vs main analysis with plot saving...")
    plot_baseline_vs_main_metrics_by_benchmark(benchmarks, evalsDir, "tests compliant", evalsDir)

    # Accuracy columns are discovered from the first benchmark's overview.csv
    csv_path = os.path.join(evalsDir, benchmarks[0], benchmarks[0], "overview.csv")
    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path)
        df.columns = df.columns.str.strip()
        accuracy_cols = [col for col in df.columns if "accuracy" in col.lower()]
        if accuracy_cols:
            print(f"Running accuracy baseline comparison for: {accuracy_cols[0]}")
            plot_baseline_vs_main_metrics_by_benchmark(benchmarks, evalsDir, accuracy_cols[0], evalsDir)
        else:
            print("No accuracy columns found for baseline comparison.")

    print("Baseline comparison plots saved to:", evalsDir)
else:
    # Diagnostic listing, kept short: only the first five benchmarks are shown
    print("No baseline data found. Checking available files:")
    for benchmark in benchmarks[:5]:
        baseline_path = os.path.join(evalsDir, benchmark, benchmark, "overview-baseline.csv")
        main_path = os.path.join(evalsDir, benchmark, benchmark, "overview.csv")
        print(f"  {benchmark}:")
        print(f"    Main: {'✓' if os.path.isfile(main_path) else '✗'}")
        print(f"    Baseline: {'✓' if os.path.isfile(baseline_path) else '✗'}")

In [None]:
# Single-model analysis: baseline vs main plots restricted to the "gpt-oss" model.
# Relies on has_baseline (and, optionally, accuracy_cols) set by the baseline
# comparison cell above.
banner = "=" * 60
print(banner)
print("SINGLE MODEL ANALYSIS - GPT-OSS")
print(banner)

if not has_baseline:
    print("No baseline data found. Skipping single model analysis.")
else:
    print("Running single model analysis for gpt-oss...")

    # "tests compliant" is always plotted; the first accuracy column is added
    # only when the earlier cell defined a non-empty accuracy_cols.
    metrics_to_plot = ["tests compliant"]
    known_accuracy_cols = locals().get("accuracy_cols")
    if known_accuracy_cols:
        metrics_to_plot.append(known_accuracy_cols[0])

    for metric in metrics_to_plot:
        plot_baseline_vs_main_metrics_by_benchmark_single_model(benchmarks, evalsDir, "gpt-oss", metric, evalsDir)

    print("Single model analysis plots saved to:", evalsDir)

In [None]:
# Plot-results style analysis: one call that produces the compliance analysis
# suite previously kept in plot-results.ipynb (non-compliance charts, rule vs
# inverse rule, promptpex vs baseline, test validity, rules count).
section_rule = "=" * 60
print(section_rule)
print("PLOT-RESULTS STYLE ANALYSIS")
print(section_rule)

# evalsDir is passed twice; presumably input and output location - TODO confirm.
generate_plot_results_style_analysis(benchmarks, evalsDir, evalsDir)

print("Plot-results style analysis complete!")
# Summary of the artifacts the call above is expected to have written.
print("\n".join([
    "Generated files:",
    "  - pp-cpct.csv (non-compliance percentages)",
    "  - pp-test-validity.csv (test validity statistics)",
    "  - pos-neg-cpct.csv (rule vs inverse rule compliance)",
    "  - pp-compare.csv (promptpex vs baseline comparison)",
    "  - pp-grounded-rules.csv (rules count per benchmark)",
    "  - Various PDF charts for each analysis type",
]))

In [None]:
# Individual plot-results charts (optional).
# Shows how to call each private plotting helper on its own. The same charts are
# produced by generate_plot_results_style_analysis; the CSV files each helper
# is paired with must already exist in evalsDir.

print("=" * 60)
print("INDIVIDUAL PLOT-RESULTS CHART GENERATION")
print("=" * 60)

try:
    print("Checking if CSV files exist for individual chart generation...")

    # (CSV the chart is gated on, label printed before plotting, plotting helper)
    chart_steps = [
        ("pp-cpct.csv", "Non-compliance by benchmark chart", _plot_non_compliance_by_benchmark),
        ("pos-neg-cpct.csv", "Rule vs inverse rule chart", _plot_rule_vs_inverse_rule),
        ("pp-compare.csv", "PromptPex vs baseline chart", _plot_promptpex_vs_baseline),
        ("pp-test-validity.csv", "Test validity chart", _plot_test_validity),
        ("pp-grounded-rules.csv", "Rules count chart", _plot_rules_count),
    ]

    csv_files = [f"{evalsDir}/{csv_name}" for csv_name, _, _ in chart_steps]
    existing_files = [path for path in csv_files if os.path.isfile(path)]
    print(f"Found {len(existing_files)}/{len(csv_files)} CSV files")

    if len(existing_files) < 3:
        print("Not enough CSV files found. Run the plot-results style analysis first.")
    else:
        # At least three of the five CSVs are present: draw every chart whose
        # own CSV exists, keeping the fixed 1-5 numbering.
        print("\nGenerating individual charts...")
        for step_number, (csv_name, label, plot_chart) in enumerate(chart_steps, start=1):
            if os.path.isfile(f"{evalsDir}/{csv_name}"):
                print(f"{step_number}. {label}...")
                plot_chart(evalsDir)

        print("Individual chart generation complete!")

except Exception as e:
    # Best effort: report the problem and let the rest of the notebook continue.
    print(f"Error in individual chart generation: {e}")
    print("Make sure to run the plot-results style analysis cell first.")

# Analysis Complete

## Summary of Generated Plots and Files

This notebook generates the same plots and analyses as the `analyze_metrics.py` script, including:

### 1. **Individual Benchmark Analysis** (if verbose=True)
- Individual metric charts per benchmark

### 2. **Aggregated Analysis**
- Model metric averages across benchmarks
- Benchmark sums analysis
- Average tests per model
- Grouped analysis by benchmark and model

### 3. **Baseline Comparison Analysis**
- Overall baseline vs promptpex comparison (with inverse rule analysis)
- Accuracy metrics comparison
- Cross-benchmark statistics with error bars

### 4. **Single Model Analysis**
- GPT-OSS specific baseline vs promptpex analysis
- Model-specific compliance and accuracy comparisons

### 5. **Plot-Results Style Analysis**
- **CSV Files Generated:**
  - `pp-cpct.csv` - Non-compliance percentages by benchmark and model
  - `pp-test-validity.csv` - Test validity statistics
  - `pos-neg-cpct.csv` - Rule vs inverse rule compliance comparison
  - `pp-compare.csv` - PromptPex vs baseline comparison
  - `pp-grounded-rules.csv` - Rules count per benchmark

- **Charts Generated:**
  - Non-compliance clustered bar chart
  - Rule vs inverse rule comparison
  - PromptPex vs baseline comparison
  - Test validity visualization
  - Rules count per benchmark

### 6. **Individual Chart Functions**
- Demonstrates how to use the private plotting functions
- Allows selective chart generation

All charts use consistent styling and are saved as PDF files in the evaluation directory.

**Files saved to:** the directory held in `evalsDir`, i.e. `<rootDir>/evals/test-all-2025-09-29-paper/eval/`