# Benchmark analysis

In [1]:
import json
import os
import pandas as pd
import numpy as np
from collections import defaultdict

import plotly.graph_objects as go

In [2]:
model = "qwen-3-32b"
total_layers = 64
subtitle = f"{model.replace('-', ' ').title()}, Single-Shot & No Thinking"
base_dir = f"/workspace/{model}/capped"
out_dir = f"/root/git/plots/{model}/capped/results/benchmarks"

os.makedirs(out_dir, exist_ok=True)

config_names = ["baseline", "role_trait", "jailbreak"]

In [3]:
# Configure experiment naming based on config type

# Jailbreak config (harm/safe pattern)
JAILBREAK_CAP_NAMES = {
    'harm_0.25': 'Harm 75th',
    'harm_0.01': 'Harm 99th', 
    'safe_0.50': 'Safe 50th',
    'safe_0.01': 'Safe 99th',
}

JAILBREAK_LAYER_GROUPS = {
    f'{total_layers//2}:{total_layers}': f'Layers {total_layers//2}-{total_layers-1}',
    f'0:{total_layers}': f'All Layers (0-{total_layers-1})',
    f'2:{total_layers}:2': 'Every 2nd Layer',
    f'4:{total_layers}:4': 'Every 4th Layer',
}

# Role/trait config (percentile pattern)
ROLE_TRAIT_CAP_NAMES = {
    'p0.01': '1st',
    'p0.25': '25th',
    'p0.5': '50th',
    'p0.75': '75th',
}

ROLE_TRAIT_LAYER_GROUPS = {
    f'0:{total_layers}': f"All Layers (0-{total_layers-1})",
    f'0:{total_layers//4}': f"Layers 0-{total_layers//4-1}",
    f'{total_layers//8}:{total_layers//8 + total_layers//8}': f"Layers {total_layers//8}-{total_layers//4-1}",
    f'{total_layers//4}:{total_layers//4 + total_layers//8}': f"Layers {total_layers//4}-{total_layers//4 + total_layers//8-1}",
}

def parse_experiment_name(exp_id, config_name):
    """Parse experiment ID into human-readable name"""
    if exp_id == "baseline":
        return "Baseline"
    
    # Select appropriate mappings based on config
    if config_name == "jailbreak":
        cap_names = JAILBREAK_CAP_NAMES
        layer_groups = JAILBREAK_LAYER_GROUPS
    else:  # role_trait
        cap_names = ROLE_TRAIT_CAP_NAMES
        layer_groups = ROLE_TRAIT_LAYER_GROUPS
    
    # Parse format: layers_X:Y-cap or layers_X:Y:Z-cap
    parts = exp_id.split('-')
    if len(parts) != 2:
        return exp_id
    
    layer_part, cap_part = parts
    
    # Extract layer range
    layer_spec = layer_part.replace('layers_', '')
    layer_name = layer_groups.get(layer_spec, layer_spec)
    
    # Extract cap name
    cap_name = cap_names.get(cap_part, cap_part)
    
    return f"{layer_name}, {cap_name}"

print("Config setup complete")

Config setup complete


## IFEval

In [None]:
task = "ifeval"

# load results from subdirs into a DataFrame
bench_dir = f"{base_dir}/benchmarks"

data_rows = []

for cfg in config_names:
    if cfg == "baseline":
        # Baseline is in a different location
        results_path = f"{bench_dir}/baseline/{task}/latest/results.json"
        if os.path.exists(results_path):
            with open(results_path, "r") as f:
                data = json.load(f)

            if "results" in data and "ifeval" in data["results"]:
                row = {
                    "config_name": cfg,
                    "experiment_id": "baseline"
                }
                row.update(data["results"]["ifeval"])
                data_rows.append(row)
        else:
            print(f"Warning: Baseline not found at {results_path}")
    else:
        # Other configs have multiple experiments
        results_dir = f"{bench_dir}/{cfg}"

        if not os.path.exists(results_dir):
            print(f"Warning: Config directory not found: {results_dir}")
            continue

        for exp_id in os.listdir(results_dir):
            results_path = f"{results_dir}/{exp_id}/{task}/latest/results.json"
            if os.path.exists(results_path):
                with open(results_path, "r") as f:
                    data = json.load(f)

                # Extract the ifeval results
                if "results" in data and "ifeval" in data["results"]:
                    row = {
                        "config_name": cfg,
                        "experiment_id": exp_id
                    }
                    # Add all the metrics from results
                    row.update(data["results"]["ifeval"])
                    data_rows.append(row)

# Create DataFrame
df = pd.DataFrame(data_rows)
print(f"Loaded {len(df)} experiments")
print(f"Config breakdown: {df['config_name'].value_counts().to_dict()}")
df.head()

Loaded 33 experiments
Config breakdown: {'role_trait': 16, 'jailbreak': 16, 'baseline': 1}


Unnamed: 0,config_name,experiment_id,alias,"prompt_level_strict_acc,none","prompt_level_strict_acc_stderr,none","inst_level_strict_acc,none","inst_level_strict_acc_stderr,none","prompt_level_loose_acc,none","prompt_level_loose_acc_stderr,none","inst_level_loose_acc,none","inst_level_loose_acc_stderr,none"
0,baseline,baseline,ifeval,0.468,0.022337,0.601552,,0.486,0.022374,0.615783,
1,role_trait,layers_16:24-p0.75,ifeval,0.478,0.022361,0.608021,,0.49,0.022379,0.619664,
2,role_trait,layers_8:16-p0.75,ifeval,0.472,0.022348,0.598965,,0.484,0.022372,0.610608,
3,role_trait,layers_8:16-p0.01,ifeval,0.476,0.022357,0.606727,,0.488,0.022377,0.619664,
4,role_trait,layers_8:16-p0.25,ifeval,0.484,0.022372,0.610608,,0.5,0.022383,0.624838,


In [126]:
def plot_ifeval_results(df, title, subtitle):
    """
    Plot IFEval results with 2 subplots (prompt-level and instruction-level accuracy).
    Shows all experiments grouped by layer type within each config.
    Baseline appears on the far left.
    """
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    # Sort: baseline first, then by config_name and experiment_id
    df_sorted = df.copy()
    df_sorted['sort_key'] = df_sorted['config_name'].map({
        'baseline': 0,
        'role_trait': 1,
        'jailbreak': 2
    })

    # Extract layer and cap info for better grouping
    def extract_layer_cap(row):
        exp_id = row['experiment_id']
        if exp_id == 'baseline':
            return ('baseline', 'baseline', 'baseline')
        parts = exp_id.split('-')
        if len(parts) == 2:
            layer_part = parts[0].replace('layers_', '')
            cap_part = parts[1]
            return (layer_part, cap_part, exp_id)
        return ('unknown', 'unknown', exp_id)

    df_sorted[['layer_group', 'cap_type', 'full_id']] = df_sorted.apply(
        extract_layer_cap, axis=1, result_type='expand'
    )

    # Sort by config, then layer group, then cap type
    df_sorted = df_sorted.sort_values(['sort_key', 'layer_group', 'cap_type']).reset_index(drop=True)

    # Parse experiment names
    df_sorted['display_name'] = df_sorted.apply(
        lambda row: parse_experiment_name(row['experiment_id'], row['config_name']),
        axis=1
    )

    # Short cap label
    def get_cap_label(row):
        exp_id = row['experiment_id']
        config = row['config_name']
        if exp_id == 'baseline':
            return 'Baseline'
        cap_names = JAILBREAK_CAP_NAMES if config == 'jailbreak' else ROLE_TRAIT_CAP_NAMES
        cap_part = exp_id.split('-')[1] if '-' in exp_id else exp_id
        return cap_names.get(cap_part, cap_part)

    df_sorted['cap_label'] = df_sorted.apply(get_cap_label, axis=1)

    # Layer label
    def get_layer_label(row):
        exp_id = row['experiment_id']
        config = row['config_name']
        if exp_id == 'baseline':
            return 'Baseline'
        layer_groups = JAILBREAK_LAYER_GROUPS if config == 'jailbreak' else ROLE_TRAIT_LAYER_GROUPS
        layer_part = exp_id.split('-')[0].replace('layers_', '') if '-' in exp_id else ''
        return layer_groups.get(layer_part, layer_part)

    df_sorted['layer_label'] = df_sorted.apply(get_layer_label, axis=1)

    # Extract stderr values, convert 'N/A' to None
    def get_stderr(series):
        return [float(v) if v != 'N/A' else None for v in series]
    prompt_stderr = get_stderr(df_sorted['prompt_level_strict_acc_stderr,none'])

    # Calculate x positions - small gap within groups, larger gap between layer groups
    x_positions = []
    current_x = 0
    prev_layer = None

    bar_width = 0.35
    gap_within_group = 0.05
    gap_between_layers = 0.15

    for _, row in df_sorted.iterrows():
        if prev_layer is not None and row['layer_group'] != prev_layer:
            current_x += bar_width + gap_between_layers
        elif prev_layer is not None:
            current_x += bar_width + gap_within_group
        x_positions.append(current_x)
        prev_layer = row['layer_group']
    x_positions = np.array(x_positions)

    # Create subplots
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Prompt-level Accuracy', 'Instruction-level Accuracy'),
        vertical_spacing=0.12
    )

    # Colors
    config_colors = {
        'baseline': '#2ca02c',   # green
        'role_trait': '#ff7f0e',  # orange
        'jailbreak': '#d62728'    # red
    }
    colors = [config_colors.get(row['config_name'], '#636EFA') for _, row in df_sorted.iterrows()]

    # Top subplot: Prompt-level
    fig.add_trace(
        go.Bar(
            name='Prompt-level',
            x=x_positions,
            y=df_sorted['prompt_level_strict_acc,none'],
            error_y=dict(type='data', array=prompt_stderr, visible=True),
            marker_color=colors,
            opacity=0.8,
            width=bar_width,
            showlegend=False,
            hovertemplate="%{customdata[0]}<br>Config: %{customdata[1]}<br>Accuracy: %{y:.1%}<extra></extra>",
            customdata=np.column_stack([df_sorted['display_name'], df_sorted['config_name']]),
        ),
        row=1, col=1
    )

    # Bottom subplot: Instruction-level
    fig.add_trace(
        go.Bar(
            name='Instruction-level',
            x=x_positions,
            y=df_sorted['inst_level_strict_acc,none'],
            marker_color=colors,
            width=bar_width,
            opacity=0.8,
            showlegend=False,
            hovertemplate="%{customdata[0]}<br>Config: %{customdata[1]}<br>Accuracy: %{y:.1%}<extra></extra>",
            customdata=np.column_stack([df_sorted['display_name'], df_sorted['config_name']]),
        ),
        row=2, col=1
    )

    # Legend-only traces (so we get color legend without duplicating bars)
    legend_names = {
        'baseline': 'Baseline',
        'role_trait': 'Role/Trait Rollouts',
        'jailbreak': 'Jailbreak Rollouts'
    }
    present_configs = [c for c in ['baseline', 'role_trait', 'jailbreak']
                    if c in df_sorted['config_name'].values]

    for c in present_configs:
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None], mode='markers',
                marker=dict(size=10, color=config_colors[c]),
                name=legend_names[c],
                showlegend=True
            ),
            row=1, col=1  # attach to top subplot; legend is global
        )

    # Place legend in top-right
    fig.update_layout(
        legend=dict(
            x=1.0, y=1.06, xanchor='right', yanchor='bottom',
            orientation='h',
        )
    )

    # Calculate config group boundaries for annotations
    config_ranges = {}
    for config in ['baseline', 'role_trait', 'jailbreak']:
        config_df = df_sorted[df_sorted['config_name'] == config]
        if len(config_df) > 0:
            indices = config_df.index
            config_ranges[config] = (x_positions[indices[0]], x_positions[indices[-1]])

    # Add a single "(L-R) …" legend line under each rollout heading
    def lr_caption_for(config_name):
        cfg = df_sorted[df_sorted['config_name'] == config_name].copy()
        if cfg.empty:
            return ""
        # Keep the left-to-right order shown in the chart
        cfg = cfg.iloc[np.argsort(x_positions[cfg.index])]
        # Skip baseline bars inside a config, if any
        labels = [lbl for lbl in cfg['cap_label'].tolist() if lbl.lower() != 'baseline']
        return "(L-R in each bar group)<br>" + ", ".join(labels[:4]) if labels else ""

    for config in ['role_trait', 'jailbreak']:
        if config in config_ranges:
            x_start, x_end = config_ranges[config]
            x_center = (x_start + x_end) / 2
            caption = lr_caption_for(config)
            if caption:
                fig.add_annotation(
                x=x_center, y=0.75,                 # adjust as you like
                text=f"<span style='font-size:10px'>{caption}</span>",
                showarrow=False, xref='x', yref='paper',
                yanchor='top', row=1, col=1         # <-- top subplot only
            )

    # Calculate layer group centers for x-axis labels
    layer_centers, layer_labels = [], []
    prev_layer = None
    group_start = 0
    for i, (_, row) in enumerate(df_sorted.iterrows()):
        if prev_layer is not None and row['layer_group'] != prev_layer:
            group_center = (x_positions[group_start] + x_positions[i-1]) / 2
            layer_centers.append(group_center)
            layer_labels.append(df_sorted.iloc[group_start]['layer_label'])
            group_start = i
        prev_layer = row['layer_group']
    if len(df_sorted) > 0:
        group_center = (x_positions[group_start] + x_positions[-1]) / 2
        layer_centers.append(group_center)
        layer_labels.append(df_sorted.iloc[group_start]['layer_label'])

    # Layout & axes
    fig.update_layout(
        title=dict(text=f"{title}", subtitle=dict(text=subtitle)),
        height=900, width=1000, margin=dict(b=100, t=150)
    )
    fig.update_yaxes(title_text="Accuracy", range=[0, 0.8], row=1, col=1, tickfont=dict(size=10))
    fig.update_yaxes(title_text="Accuracy", range=[0, 0.8], row=2, col=1, tickfont=dict(size=10))
    fig.update_xaxes(title_text="Intervention Layers", row=2, col=1)

    fig.update_xaxes(
        tickmode='array', tickvals=layer_centers, ticktext=layer_labels,
        range=[x_positions[0] - 0.5, x_positions[-1] + 0.5], row=1, col=1, tickfont=dict(size=10)
    )
    fig.update_xaxes(
        tickmode='array', tickvals=layer_centers, ticktext=layer_labels,
        range=[x_positions[0] - 0.5, x_positions[-1] + 0.5], row=2, col=1, tickfont=dict(size=10)
    )

    return fig


In [127]:
# Example usage
fig = plot_ifeval_results(df, title="IFEval with Projection Capping", subtitle=subtitle)
fig.show()
fig.write_html(f"{out_dir}/ifeval.html")

## MMLU Pro

In [4]:
task = "mmlu_pro"

# load results from subdirs into a DataFrame
bench_dir = f"{base_dir}/benchmarks"

mmlu_data_rows = []

for cfg in config_names:
    if cfg == "baseline":
        # Baseline is in a different location - use latest only
        results_path = f"{bench_dir}/baseline/{task}/latest/results.json"
        manifest_path = f"{bench_dir}/baseline/{task}/latest/manifest.json"
        
        if os.path.exists(results_path):
            with open(results_path, "r") as f:
                data = json.load(f)
            
            # Load manifest for thinking and apply_chat_template
            thinking = False
            apply_chat_template = False
            if os.path.exists(manifest_path):
                with open(manifest_path, "r") as f:
                    manifest = json.load(f)
                    thinking = manifest.get("thinking", None)
                    if thinking is None:
                        thinking = False
                    apply_chat_template = manifest.get("apply_chat_template", False)

            if "results" in data and "mmlu_pro" in data["results"]:
                row = {
                    "config_name": cfg,
                    "experiment_id": "baseline",
                    "thinking": thinking,
                    "apply_chat_template": apply_chat_template
                }
                # Add overall mmlu_pro metrics
                row.update(data["results"]["mmlu_pro"])
                
                # Add category metrics
                for key, val in data["results"].items():
                    if key.startswith("mmlu_pro_") and isinstance(val, dict):
                        category = key.replace("mmlu_pro_", "")
                        row[f"{category}_acc"] = val.get("exact_match,custom-extract", None)
                        row[f"{category}_stderr"] = val.get("exact_match_stderr,custom-extract", None)
                
                mmlu_data_rows.append(row)
        else:
            print(f"Warning: MMLU Pro baseline not found at {results_path}")
    else:
        # Other configs have multiple experiments
        results_dir = f"{bench_dir}/{cfg}"

        if not os.path.exists(results_dir):
            print(f"Warning: Config directory not found: {results_dir}")
            continue

        for exp_id in os.listdir(results_dir):
            results_path = f"{results_dir}/{exp_id}/{task}/latest/results.json"
            manifest_path = f"{results_dir}/{exp_id}/{task}/latest/manifest.json"
            
            if os.path.exists(results_path):
                with open(results_path, "r") as f:
                    data = json.load(f)
                
                # Load manifest for thinking and apply_chat_template
                thinking = False
                apply_chat_template = False
                if os.path.exists(manifest_path):
                    with open(manifest_path, "r") as f:
                        manifest = json.load(f)
                        thinking = manifest.get("thinking", None)
                        if thinking is None:
                            thinking = False
                        apply_chat_template = manifest.get("apply_chat_template", False)

                # Extract the mmlu_pro results
                if "results" in data and "mmlu_pro" in data["results"]:
                    row = {
                        "config_name": cfg,
                        "experiment_id": exp_id,
                        "thinking": thinking,
                        "apply_chat_template": apply_chat_template
                    }
                    # Add overall mmlu_pro metrics
                    row.update(data["results"]["mmlu_pro"])
                    
                    # Add category metrics
                    for key, val in data["results"].items():
                        if key.startswith("mmlu_pro_") and isinstance(val, dict):
                            category = key.replace("mmlu_pro_", "")
                            row[f"{category}_acc"] = val.get("exact_match,custom-extract", None)
                            row[f"{category}_stderr"] = val.get("exact_match_stderr,custom-extract", None)
                    
                    mmlu_data_rows.append(row)

# Create DataFrame
df_mmlu = pd.DataFrame(mmlu_data_rows)
print(f"Loaded {len(df_mmlu)} MMLU Pro experiments")
print(f"Config breakdown: {df_mmlu['config_name'].value_counts().to_dict()}")
df_mmlu[['experiment_id', 'thinking', 'apply_chat_template', 'exact_match,custom-extract']].head()

Loaded 21 MMLU Pro experiments
Config breakdown: {'role_trait': 16, 'jailbreak': 4, 'baseline': 1}


Unnamed: 0,experiment_id,thinking,apply_chat_template,"exact_match,custom-extract"
0,baseline,False,False,0.702143
1,layers_16:24-p0.75,False,True,0.0
2,layers_8:16-p0.75,False,True,0.0
3,layers_8:16-p0.01,False,True,0.0
4,layers_8:16-p0.25,False,True,0.0


In [5]:
# Load all 4 baseline MMLU Pro results
baseline_mmlu_path = f"{base_dir}/benchmarks/baseline/mmlu_pro"
baseline_dirs = [d for d in os.listdir(baseline_mmlu_path) if d.startswith("2025-")]

baseline_rows = []
for i, dirname in enumerate(sorted(baseline_dirs), 1):
    results_path = os.path.join(baseline_mmlu_path, dirname, "results.json")
    manifest_path = os.path.join(baseline_mmlu_path, dirname, "manifest.json")
    
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            data = json.load(f)
        
        # Load manifest for thinking and apply_chat_template
        thinking = False
        apply_chat_template = False
        if os.path.exists(manifest_path):
            with open(manifest_path, "r") as f:
                manifest = json.load(f)
                thinking = manifest.get("thinking", None)
                if thinking is None:
                    thinking = False
                apply_chat_template = manifest.get("apply_chat_template", False)
        
        if "results" in data and "mmlu_pro" in data["results"]:
            row = {
                "config_name": "baseline",
                "experiment_id": f"baseline_run{i}",
                "run_dir": dirname,
                "thinking": thinking,
                "apply_chat_template": apply_chat_template
            }
            # Add overall mmlu_pro metrics
            row.update(data["results"]["mmlu_pro"])
            
            # Add category metrics
            for key, val in data["results"].items():
                if key.startswith("mmlu_pro_") and isinstance(val, dict):
                    category = key.replace("mmlu_pro_", "")
                    row[f"{category}_acc"] = val.get("exact_match,custom-extract", None)
                    row[f"{category}_stderr"] = val.get("exact_match_stderr,custom-extract", None)
            
            baseline_rows.append(row)

df_baseline_mmlu = pd.DataFrame(baseline_rows)
print(f"Loaded {len(df_baseline_mmlu)} baseline MMLU Pro runs")
df_baseline_mmlu[['experiment_id', 'thinking', 'apply_chat_template', 'exact_match,custom-extract', 'exact_match_stderr,custom-extract']].head()

Loaded 4 baseline MMLU Pro runs


Unnamed: 0,experiment_id,thinking,apply_chat_template,"exact_match,custom-extract","exact_match_stderr,custom-extract"
0,baseline_run1,True,True,0.64,0.01257
1,baseline_run2,False,True,0.675,0.012285
2,baseline_run3,False,False,0.702143,0.011878
3,baseline_run4,True,False,0.714286,0.011793
