# Benchmark analysis

In [1]:
import json
import os
import pandas as pd
import numpy as np
from collections import defaultdict

import plotly.graph_objects as go

In [70]:
model = "qwen-3-32b"
total_layers = 64
subtitle = f"{model.replace('-', ' ').title()}, Single-Shot & No Thinking"
base_dir = f"/workspace/{model}/capped"
out_dir = f"/root/git/plots/{model}/capped/results/benchmarks"

os.makedirs(out_dir, exist_ok=True)

config_names = ["baseline", "role_trait", "jailbreak", "lmsys_10000"]

# ============================================================================
# Centralized Configuration & Parsing Infrastructure
# ============================================================================

def parse_experiment_id(experiment_id):
    """
    Parse experiment_id into layers and cap components.
    
    Args:
        experiment_id: String like "layers_0:64-p0.01" or "layers_32:64-harm_0.25"
    
    Returns:
        tuple: (layer_spec, cap_type, cap_value)
        - layer_spec: str like "0:64" or "2:64:2"
        - cap_type: str like "percentile", "harm", or "safe"
        - cap_value: str like "p0.01", "harm_0.25", or "safe_0.01"
    
    Examples:
        "layers_0:64-p0.01" → ("0:64", "percentile", "p0.01")
        "layers_32:64-harm_0.25" → ("32:64", "harm", "harm_0.25")
        "layers_2:64:2-safe_0.01" → ("2:64:2", "safe", "safe_0.01")
        "baseline" → (None, None, None)
    """
    if experiment_id == "baseline":
        return (None, None, None)
    
    parts = experiment_id.split('-')
    if len(parts) != 2:
        return (None, None, None)
    
    layer_part, cap_part = parts
    layer_spec = layer_part.replace('layers_', '')
    
    # Determine cap type
    if cap_part.startswith('p'):
        cap_type = "percentile"
    elif cap_part.startswith('harm'):
        cap_type = "harm"
    elif cap_part.startswith('safe'):
        cap_type = "safe"
    else:
        cap_type = "unknown"
    
    return (layer_spec, cap_type, cap_part)


def parse_layer_sort_key(layer_spec, total_layers=64):
    """
    Parse layer_spec to create sort key for consistent ordering:
    1. All layers (0:64) comes first
    2. Interval layers (2:64:2, 4:64:4) by increasing interval
    3. Range layers (8:16, 16:24, etc.) by start then end
    
    Returns: (category, interval_or_start, end)
    """
    if layer_spec is None:
        return (-1, 0, 0)  # Baseline comes first
    
    parts = layer_spec.split(':')
    
    if len(parts) == 3:
        # Interval layer (e.g., "2:64:2")
        start, end, interval = map(int, parts)
        return (1, int(interval), end)  # Category 1 = interval layers
    elif len(parts) == 2:
        start, end = map(int, parts)
        if start == 0 and end == total_layers:
            # All layers (0:64)
            return (0, 0, total_layers)  # Category 0 = all layers
        else:
            # Range layers (8:16, 16:24, etc.)
            return (2, start, end)  # Category 2 = range layers
    
    return (999, 0, 0)  # Unknown, sort to end


def format_layer_range(layer_spec, total_layers=64):
    """
    Convert layer_spec to human-readable string.
    
    Args:
        layer_spec: String like "0:64", "16:24", "2:64:2"
        total_layers: Total number of layers in the model
    
    Returns:
        str: Human-readable layer description
    
    Examples:
        "0:64" → "All Layers (0-63)"
        "16:24" → "Layers 16-23"
        "2:64:2" → "Every 2nd Layer"
        "4:64:4" → "Every 4th Layer"
    """
    if layer_spec is None:
        return "Baseline"
    
    parts = layer_spec.split(':')
    
    if len(parts) == 3:
        # Has interval (e.g., "2:64:2")
        start, end, interval = map(int, parts)
        if int(interval) == 2:
            return "Every 2nd Layer"
        else:
            return f"Every {interval}th Layer"
    elif len(parts) == 2:
        # Range without interval
        start, end = map(int, parts)
        if start == 0 and end == total_layers:
            return f"All Layers (0-{total_layers-1})"
        elif start == total_layers // 2:
            return f"Layers {start}-{end-1}"
        else:
            return f"Layers {start}-{end-1}"
    
    return layer_spec


def format_cap_label(cap_value):
    """
    Convert cap value to human-readable label.
    
    Args:
        cap_value: String like "p0.01", "harm_0.25", "safe_0.01"
    
    Returns:
        str: Human-readable label
    
    Examples:
        "p0.01" → "1st"
        "p0.25" → "25th"
        "p0.5" → "50th"
        "p0.75" → "75th"
        "harm_0.01" → "Harm 99th"
        "harm_0.25" → "Harm 75th"
        "safe_0.01" → "Safe 99th"
        "safe_0.50" → "Safe 50th"
    """
    if cap_value is None:
        return "Baseline"
    
    # Percentile-based (role_trait, lmsys_10000)
    if cap_value.startswith('p'):
        percentile = float(cap_value[1:])
        if percentile == 0.01:
            return "1st"
        elif percentile == 0.25:
            return "25th"
        elif percentile == 0.5:
            return "50th"
        elif percentile == 0.75:
            return "75th"
        else:
            return cap_value
    
    # Harm/Safe-based (jailbreak)
    elif cap_value.startswith('harm_') or cap_value.startswith('safe_'):
        prefix, value = cap_value.split('_')
        percentile = float(value)
        
        # Convert to percentile rank (lower value = higher rank)
        if percentile == 0.01:
            rank = "99th"
        elif percentile == 0.25:
            rank = "75th"
        elif percentile == 0.50:
            rank = "50th"
        else:
            rank = value
        
        return f"{prefix.capitalize()} {rank}"
    
    return cap_value


# Eval-specific configurations
EVAL_CONFIGS = {
    'ifeval': {
        'metrics': [
            {
                'name': 'prompt_level_strict_acc,none',
                'label': 'Prompt-level Accuracy',
                'stderr': 'prompt_level_strict_acc_stderr,none'
            },
            {
                'name': 'inst_level_strict_acc,none',
                'label': 'Instruction-level Accuracy',
                'stderr': None
            }
        ],
        'y_range': [0, 0.8]
    },
    'mmlu_pro': {
        'metrics': [
            {
                'name': 'exact_match,custom-extract',
                'label': 'Multiple Choice Exact Match Accuracy',
                'stderr': 'exact_match_stderr,custom-extract'
            }
        ],
        'y_range': [0, 0.8]
    },
    'eq_bench': {
        'metrics': [
            {
                'name': 'eqbench,none',
                'label': 'EQ-Bench Score (Higher is Better)',
                'stderr': 'eqbench_stderr,none'
            }
        ],
        'y_range': [0, 100]
    }
}

# Config visualization settings
CONFIG_COLORS = {
    'baseline': '#2ca02c',      # green
    'role_trait': '#ff7f0e',    # orange
    'jailbreak': '#d62728',     # red
    'lmsys_10000': '#9467bd'    # purple
}

CONFIG_DISPLAY_NAMES = {
    'baseline': 'Baseline',
    'role_trait': 'Role/Trait Rollouts',
    'jailbreak': 'Jailbreak Rollouts',
    'lmsys_10000': 'LMSYS-10K Rollouts'
}

print("Configuration and parsing infrastructure defined")

Configuration and parsing infrastructure defined


In [35]:
def load_experiment_data(tasks, config_names, base_dir):
    """
    Load experiment data for specified tasks and configs.

    Args:
        tasks: List of task names (e.g., ['ifeval', 'mmlu_pro', 'eq_bench'])
        config_names: List of config names (e.g., ['baseline', 'role_trait', 'jailbreak'])
        base_dir: Base directory containing benchmarks folder

    Returns:
        DataFrame with columns: task_name, config_name, experiment_id, run_dir,
                               thinking, apply_chat_template, and all metrics from results
    """
    all_rows = []
    bench_dir = f"{base_dir}/benchmarks"

    for task in tasks:
        task_dir = f"{bench_dir}/{task}"

        if not os.path.exists(task_dir):
            print(f"Warning: Task directory not found: {task_dir}")
            continue

        for config_name in config_names:
            config_dir = f"{task_dir}/{config_name}"

            if not os.path.exists(config_dir):
                print(f"Warning: Config directory not found: {config_dir}")
                continue

            if config_name == "baseline":
                # Baseline: iterate through all timestamped runs directly
                run_dirs = [d for d in os.listdir(config_dir) if d.startswith("2025-")]

                for run_dir in sorted(run_dirs):
                    results_path = os.path.join(config_dir, run_dir, "results.json")
                    manifest_path = os.path.join(config_dir, run_dir, "manifest.json")

                    if os.path.exists(results_path):
                        with open(results_path, "r") as f:
                            data = json.load(f)

                        # Load manifest for thinking and apply_chat_template
                        thinking = False
                        apply_chat_template = False
                        if os.path.exists(manifest_path):
                            with open(manifest_path, "r") as f:
                                manifest = json.load(f)
                                thinking = manifest.get("thinking", None)
                                if thinking is None:
                                    thinking = False
                                apply_chat_template = manifest.get("apply_chat_template", False)

                        # Get the task results
                        if "results" in data and task in data["results"]:
                            row = {
                                "task_name": task,
                                "config_name": config_name,
                                "experiment_id": "baseline",
                                "run_dir": run_dir,
                                "thinking": thinking,
                                "apply_chat_template": apply_chat_template
                            }
                            # Add all metrics from the task results
                            row.update(data["results"][task])
                            
                            # Drop alias field if present
                            row.pop("alias", None)

                            # For MMLU Pro, also add category-level metrics
                            if task == "mmlu_pro":
                                for key, val in data["results"].items():
                                    if key.startswith("mmlu_pro_") and isinstance(val, dict):
                                        category = key.replace("mmlu_pro_", "")
                                        row[f"{category}_acc"] = val.get("exact_match,custom-extract", None)
                                        row[f"{category}_stderr"] = val.get("exact_match_stderr,custom-extract", None)

                            all_rows.append(row)
            else:
                # Other configs: iterate through experiment_id directories
                for experiment_id in os.listdir(config_dir):
                    exp_dir = os.path.join(config_dir, experiment_id)

                    if not os.path.isdir(exp_dir):
                        continue

                    # Load all timestamped runs for this experiment
                    run_dirs = [d for d in os.listdir(exp_dir) if d.startswith("2025-")]

                    for run_dir in sorted(run_dirs):
                        results_path = os.path.join(exp_dir, run_dir, "results.json")
                        manifest_path = os.path.join(exp_dir, run_dir, "manifest.json")

                        if os.path.exists(results_path):
                            with open(results_path, "r") as f:
                                data = json.load(f)

                            # Load manifest for thinking and apply_chat_template
                            thinking = False
                            apply_chat_template = False
                            if os.path.exists(manifest_path):
                                with open(manifest_path, "r") as f:
                                    manifest = json.load(f)
                                    thinking = manifest.get("thinking", None)
                                    if thinking is None:
                                        thinking = False
                                    apply_chat_template = manifest.get("apply_chat_template", False)

                            # Get the task results
                            if "results" in data and task in data["results"]:
                                row = {
                                    "task_name": task,
                                    "config_name": config_name,
                                    "experiment_id": experiment_id,
                                    "run_dir": run_dir,
                                    "thinking": thinking,
                                    "apply_chat_template": apply_chat_template
                                }
                                # Add all metrics from the task results
                                row.update(data["results"][task])
                                
                                # Drop alias field if present
                                row.pop("alias", None)

                                # For MMLU Pro, also add category-level metrics
                                if task == "mmlu_pro":
                                    for key, val in data["results"].items():
                                        if key.startswith("mmlu_pro_") and isinstance(val, dict):
                                            category = key.replace("mmlu_pro_", "")
                                            row[f"{category}_acc"] = val.get("exact_match,custom-extract", None)
                                            row[f"{category}_stderr"] = val.get("exact_match_stderr,custom-extract", None)

                                all_rows.append(row)

    df = pd.DataFrame(all_rows)
    return df

print("Data loading function defined")

Data loading function defined


In [61]:
def plot_benchmark_results(df, eval_name, title, subtitle, total_layers=64):
    """
    Generic benchmark plotting function that works for any eval.
    
    Args:
        df: DataFrame with experiment results (should have only one baseline row)
        eval_name: Name of eval ('ifeval', 'mmlu_pro', 'eq_bench')
        title: Plot title
        subtitle: Plot subtitle  
        total_layers: Total number of layers in the model (default: 64)
    
    Returns:
        Plotly figure object
    """
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    
    # Get eval-specific configuration
    eval_config = EVAL_CONFIGS[eval_name]
    metrics = eval_config['metrics']
    y_range = eval_config['y_range']
    
    # Parse experiment IDs and add metadata columns
    df_sorted = df.copy()
    
    # Parse each experiment_id
    parsed = df_sorted['experiment_id'].apply(parse_experiment_id)
    df_sorted['layer_spec'] = parsed.apply(lambda x: x[0])
    df_sorted['cap_type'] = parsed.apply(lambda x: x[1])
    df_sorted['cap_value'] = parsed.apply(lambda x: x[2])
    
    # Add human-readable labels
    df_sorted['layer_label'] = df_sorted['layer_spec'].apply(
        lambda x: format_layer_range(x, total_layers)
    )
    df_sorted['cap_label'] = df_sorted['cap_value'].apply(format_cap_label)
    df_sorted['display_name'] = df_sorted.apply(
        lambda row: f"{row['layer_label']}, {row['cap_label']}" 
                    if row['experiment_id'] != 'baseline' 
                    else "Baseline",
        axis=1
    )
    
    # Create sort key for ordering (baseline first, then by config)
    config_order = {cfg: i for i, cfg in enumerate(['baseline', 'jailbreak', 'role_trait', 'lmsys_10000'])}
    df_sorted['sort_key'] = df_sorted['config_name'].map(config_order)
    
    # Sort by config, then layer_spec, then cap_value
    # Create layer sort keys for consistent ordering
    df_sorted['layer_sort_key'] = df_sorted['layer_spec'].apply(
        lambda x: parse_layer_sort_key(x, total_layers)
    )
    
    
    # Create cap sort keys (safe before harm, smaller percentiles first)
    def cap_sort_key(cap_value):
        if cap_value is None:
            return (0, 0.0)  # Baseline first
        if cap_value.startswith('safe_'):
            return (0, float(cap_value.split('_')[1]))  # Safe first, then by value
        elif cap_value.startswith('harm_'):
            return (1, float(cap_value.split('_')[1]))  # Harm second, then by value
        elif cap_value.startswith('p'):
            return (0, float(cap_value[1:]))  # Regular percentiles
        return (999, 0.0)  # Unknown
    
    df_sorted['cap_sort_key'] = df_sorted['cap_value'].apply(cap_sort_key)
    
    df_sorted = df_sorted.sort_values(
        ['sort_key', 'layer_sort_key', 'cap_sort_key'], 
        na_position='first'
    ).reset_index(drop=True)
    
    # Create a unique layer_group_id for position calculation
    # Baseline gets its own unique ID to prevent overlap with first non-baseline group
    df_sorted['layer_group_id'] = df_sorted.apply(
        lambda row: 'baseline' if row['experiment_id'] == 'baseline' 
                   else f"{row['config_name']}_{row['layer_spec']}", 
        axis=1
    )
    
    # Calculate x positions with gaps between layer groups
    x_positions = []
    current_x = 0
    prev_group_id = None
    
    bar_width = 0.35
    gap_within_group = 0.05
    gap_between_layers = 0.15
    
    for _, row in df_sorted.iterrows():
        if prev_group_id is not None and row['layer_group_id'] != prev_group_id:
            current_x += bar_width + gap_between_layers
        elif prev_group_id is not None:
            current_x += bar_width + gap_within_group
        x_positions.append(current_x)
        prev_group_id = row['layer_group_id']
    x_positions = np.array(x_positions)
    
    # Get colors for each bar
    colors = [CONFIG_COLORS.get(row['config_name'], '#636EFA') 
              for _, row in df_sorted.iterrows()]
    
    # Create subplots (one row per metric)
    num_metrics = len(metrics)
    subplot_titles = [m['label'] for m in metrics]
    
    fig = make_subplots(
        rows=num_metrics, cols=1,
        subplot_titles=subplot_titles,
        vertical_spacing=0.12 if num_metrics > 1 else 0
    )
    
    # Add traces for each metric
    for metric_idx, metric in enumerate(metrics, start=1):
        metric_name = metric['name']
        stderr_name = metric['stderr']
        
        # Get stderr values if available
        error_y = None
        if stderr_name and stderr_name in df_sorted.columns:
            stderr_vals = df_sorted[stderr_name].apply(
                lambda x: float(x) if x != 'N/A' and pd.notna(x) else None
            )
            error_y = dict(type='data', array=stderr_vals, visible=True, thickness=0.5)
        
        # Add bar trace
        fig.add_trace(
            go.Bar(
                name=metric['label'],
                x=x_positions,
                y=df_sorted[metric_name],
                error_y=error_y,
                marker_color=colors,
                opacity=0.8,
                width=bar_width,
                showlegend=False,
                hovertemplate="%{customdata[0]}<br>Config: %{customdata[1]}<br>Value: %{y:.1%}<extra></extra>",
                customdata=np.column_stack([df_sorted['display_name'], df_sorted['config_name']]),
            ),
            row=metric_idx, col=1
        )

    baseline_row = df_sorted[df_sorted['experiment_id'] == 'baseline']
    if len(baseline_row) > 0:
        for metric_idx, metric in enumerate(metrics, start=1):
            metric_name = metric['name']
            baseline_value = baseline_row[metric_name].iloc[0]

            fig.add_hline(
                y=baseline_value,
                line_dash="dot",
                line_color="green",
                line_width=1,
                opacity=0.8,
                row=metric_idx,
                col=1
            )
    
    # Add legend entries for configs (only once, attached to first subplot)
    present_configs = [c for c in config_order.keys() if c in df_sorted['config_name'].values]
    for config in present_configs:
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None], mode='markers',
                marker=dict(size=10, color=CONFIG_COLORS[config]),
                name=CONFIG_DISPLAY_NAMES[config],
                showlegend=True
            ),
            row=1, col=1
        )
    
    # Place legend in top-right
    fig.update_layout(
        legend=dict(
            x=1.0, y=1.08, xanchor='right', yanchor='bottom',
            orientation='h',
        )
    )
    
    # Add single centered annotation about cap ordering
    fig.add_annotation(
        x=1, y=1,
        text="<span style='font-size:10px'>Projection caps in each bar group<br>(L-R) strict to relaxed</span>",
        showarrow=False, xref='paper', yref='paper',
        xanchor='right', yanchor='top'
    )
    
    # Calculate layer group centers for x-axis labels
    # Use layer_group_id to properly detect group boundaries
    layer_centers, layer_labels = [], []
    prev_group_id = None
    group_start = 0
    
    for i, (_, row) in enumerate(df_sorted.iterrows()):
        if prev_group_id is not None and row['layer_group_id'] != prev_group_id:
            group_center = (x_positions[group_start] + x_positions[i-1]) / 2
            layer_centers.append(group_center)
            layer_labels.append(df_sorted.iloc[group_start]['layer_label'])
            group_start = i
        prev_group_id = row['layer_group_id']
    
    # Add last group
    if len(df_sorted) > 0:
        group_center = (x_positions[group_start] + x_positions[-1]) / 2
        layer_centers.append(group_center)
        layer_labels.append(df_sorted.iloc[group_start]['layer_label'])
    
    # Update layout and axes
    fig.update_layout(
        title=dict(text=f"{title}", subtitle=dict(text=subtitle)),
        height=350 * num_metrics + 200, 
        width=1000, 
        margin=dict(b=100, t=150)
    )
    
    # Update axes for all subplots
    for metric_idx in range(1, num_metrics + 1):
        # Y-axis
        fig.update_yaxes(
            title_text="Score", 
            range=y_range, 
            row=metric_idx, col=1, 
            tickfont=dict(size=10)
        )
        
        # X-axis (only label bottom subplot)
        if metric_idx == num_metrics:
            fig.update_xaxes(title_text="Intervention Layers", row=metric_idx, col=1)
        
        # Set x-axis ticks
        fig.update_xaxes(
            tickmode='array', 
            tickvals=layer_centers, 
            ticktext=layer_labels,
            range=[x_positions[0] - 0.5, x_positions[-1] + 0.5], 
            row=metric_idx, col=1, 
            tickfont=dict(size=10)
        )
    
    return fig

## IFEval

In [39]:
# Load IFEval data
df = load_experiment_data(["ifeval"], config_names, base_dir)
print(f"Loaded {len(df)} IFEval experiment runs")
print(f"\nConfig breakdown:")
for config in df['config_name'].unique():
    config_df = df[df['config_name'] == config]
    unique_exps = config_df['experiment_id'].unique()
    print(f"  {config}: {len(unique_exps)} unique experiments")
    for exp in sorted(unique_exps):
        exp_count = len(config_df[config_df['experiment_id'] == exp])
        print(f"    - {exp} ({exp_count} run{'s' if exp_count > 1 else ''})")

Loaded 89 IFEval experiment runs

Config breakdown:
  baseline: 1 unique experiments
    - baseline (1 run)
  role_trait: 36 unique experiments
    - layers_0:16-p0.01 (1 run)
    - layers_0:16-p0.25 (1 run)
    - layers_0:16-p0.5 (1 run)
    - layers_0:16-p0.75 (1 run)
    - layers_0:64-p0.01 (1 run)
    - layers_0:64-p0.25 (1 run)
    - layers_0:64-p0.5 (1 run)
    - layers_0:64-p0.75 (1 run)
    - layers_16:24-p0.01 (1 run)
    - layers_16:24-p0.25 (1 run)
    - layers_16:24-p0.5 (1 run)
    - layers_16:24-p0.75 (1 run)
    - layers_24:32-p0.01 (1 run)
    - layers_24:32-p0.25 (1 run)
    - layers_24:32-p0.5 (1 run)
    - layers_24:32-p0.75 (1 run)
    - layers_32:40-p0.01 (1 run)
    - layers_32:40-p0.25 (1 run)
    - layers_32:40-p0.5 (1 run)
    - layers_32:40-p0.75 (1 run)
    - layers_40:48-p0.01 (1 run)
    - layers_40:48-p0.25 (1 run)
    - layers_40:48-p0.5 (1 run)
    - layers_40:48-p0.75 (1 run)
    - layers_48:56-p0.01 (1 run)
    - layers_48:56-p0.25 (1 run)
    - layers

In [5]:
df[['config_name', 'experiment_id', 'thinking', 'apply_chat_template', 'prompt_level_strict_acc,none', 'inst_level_strict_acc,none']].head(20)


Unnamed: 0,config_name,experiment_id,thinking,apply_chat_template,"prompt_level_strict_acc,none","inst_level_strict_acc,none"
0,baseline,baseline,False,False,0.468,0.601552
1,role_trait,layers_56:64-p0.75,False,True,0.51,0.635188
2,role_trait,layers_56:64-p0.5,False,True,0.506,0.6326
3,role_trait,layers_56:64-p0.25,False,True,0.494,0.624838
4,role_trait,layers_56:64-p0.01,False,True,0.47,0.609314
5,role_trait,layers_48:56-p0.75,False,True,0.49,0.615783
6,role_trait,layers_48:56-p0.5,False,True,0.474,0.600259
7,role_trait,layers_48:56-p0.25,False,True,0.472,0.602846
8,role_trait,layers_48:56-p0.01,False,True,0.486,0.614489
9,role_trait,layers_40:48-p0.75,False,True,0.472,0.598965


In [62]:
# Filter to single baseline (keeping the one without thinking)

# Plot using modular function
fig = plot_benchmark_results(df, 'ifeval', 
                             title="IFEval with Projection Capping", 
                             subtitle=subtitle,
                             total_layers=total_layers)
fig.show()
fig.write_html(f"{out_dir}/ifeval.html")

## MMLU Pro

In [10]:
# Load MMLU Pro data
df_mmlu = load_experiment_data(["mmlu_pro"], config_names, base_dir)
print(f"Loaded {len(df_mmlu)} MMLU Pro experiment runs")
print(f"\nConfig breakdown:")
for config in df_mmlu['config_name'].unique():
    config_df = df_mmlu[df_mmlu['config_name'] == config]
    unique_exps = config_df['experiment_id'].unique()
    print(f"  {config}: {len(unique_exps)} unique experiments")
    for exp in sorted(unique_exps):
        exp_count = len(config_df[config_df['experiment_id'] == exp])
        print(f"    - {exp} ({exp_count} run{'s' if exp_count > 1 else ''})")


Loaded 44 MMLU Pro experiment runs

Config breakdown:
  baseline: 1 unique experiments
    - baseline (4 runs)
  role_trait: 16 unique experiments
    - layers_0:16-p0.01 (1 run)
    - layers_0:16-p0.25 (1 run)
    - layers_0:16-p0.5 (1 run)
    - layers_0:16-p0.75 (1 run)
    - layers_0:64-p0.01 (1 run)
    - layers_0:64-p0.25 (1 run)
    - layers_0:64-p0.5 (1 run)
    - layers_0:64-p0.75 (1 run)
    - layers_16:24-p0.01 (1 run)
    - layers_16:24-p0.25 (1 run)
    - layers_16:24-p0.5 (1 run)
    - layers_16:24-p0.75 (1 run)
    - layers_8:16-p0.01 (1 run)
    - layers_8:16-p0.25 (1 run)
    - layers_8:16-p0.5 (1 run)
    - layers_8:16-p0.75 (1 run)
  jailbreak: 16 unique experiments
    - layers_0:64-harm_0.01 (1 run)
    - layers_0:64-harm_0.25 (1 run)
    - layers_0:64-safe_0.01 (1 run)
    - layers_0:64-safe_0.50 (1 run)
    - layers_2:64:2-harm_0.01 (1 run)
    - layers_2:64:2-harm_0.25 (1 run)
    - layers_2:64:2-safe_0.01 (1 run)
    - layers_2:64:2-safe_0.50 (1 run)
    - laye

In [28]:
df_mmlu[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'exact_match,custom-extract']].head(20)

Unnamed: 0,experiment_id,config_name,thinking,apply_chat_template,"exact_match,custom-extract"
0,baseline,baseline,True,True,0.64
1,baseline,baseline,False,True,0.675
2,baseline,baseline,False,False,0.702143
3,baseline,baseline,True,False,0.714286
4,layers_8:16-p0.75,role_trait,False,True,0.679286
5,layers_8:16-p0.5,role_trait,False,True,0.665714
6,layers_8:16-p0.25,role_trait,False,True,0.670714
7,layers_8:16-p0.01,role_trait,False,True,0.642857
8,layers_16:24-p0.75,role_trait,False,True,0.677143
9,layers_16:24-p0.5,role_trait,False,True,0.672143


In [63]:
df_mmlu_plot = df_mmlu[~((df_mmlu['config_name'] == 'baseline') & (df_mmlu['thinking'] == True)) & ~((df_mmlu['config_name'] == 'baseline') & (df_mmlu['apply_chat_template'] == False))]

fig_mmlu = plot_benchmark_results(df_mmlu_plot, 'mmlu_pro', 
                                  title="MMLU Pro with Projection Capping", 
                                  subtitle=subtitle,
                                  total_layers=total_layers)
fig_mmlu.show()

## EQ-Bench

In [65]:
# Load EQ-Bench data
df_eq = load_experiment_data(["eq_bench"], config_names, base_dir)
print(f"Loaded {len(df_eq)} EQ-Bench experiment runs")
print(f"\nConfig breakdown:")
for config in df_eq['config_name'].unique():
    config_df = df_eq[df_eq['config_name'] == config]
    unique_exps = config_df['experiment_id'].unique()
    print(f"  {config}: {len(unique_exps)} unique experiments")
    for exp in sorted(unique_exps):
        exp_count = len(config_df[config_df['experiment_id'] == exp])
        print(f"    - {exp} ({exp_count} run{'s' if exp_count > 1 else ''})")

Loaded 89 EQ-Bench experiment runs

Config breakdown:
  baseline: 1 unique experiments
    - baseline (1 run)
  role_trait: 36 unique experiments
    - layers_0:16-p0.01 (1 run)
    - layers_0:16-p0.25 (1 run)
    - layers_0:16-p0.5 (1 run)
    - layers_0:16-p0.75 (1 run)
    - layers_0:64-p0.01 (1 run)
    - layers_0:64-p0.25 (1 run)
    - layers_0:64-p0.5 (1 run)
    - layers_0:64-p0.75 (1 run)
    - layers_16:24-p0.01 (1 run)
    - layers_16:24-p0.25 (1 run)
    - layers_16:24-p0.5 (1 run)
    - layers_16:24-p0.75 (1 run)
    - layers_24:32-p0.01 (1 run)
    - layers_24:32-p0.25 (1 run)
    - layers_24:32-p0.5 (1 run)
    - layers_24:32-p0.75 (1 run)
    - layers_32:40-p0.01 (1 run)
    - layers_32:40-p0.25 (1 run)
    - layers_32:40-p0.5 (1 run)
    - layers_32:40-p0.75 (1 run)
    - layers_40:48-p0.01 (1 run)
    - layers_40:48-p0.25 (1 run)
    - layers_40:48-p0.5 (1 run)
    - layers_40:48-p0.75 (1 run)
    - layers_48:56-p0.01 (1 run)
    - layers_48:56-p0.25 (1 run)
    - laye

In [66]:
df_eq[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'eqbench,none']].head(5)

Unnamed: 0,experiment_id,config_name,thinking,apply_chat_template,"eqbench,none"
0,baseline,baseline,False,True,82.294152
1,layers_56:64-p0.75,role_trait,False,True,82.588051
2,layers_56:64-p0.5,role_trait,False,True,82.533942
3,layers_56:64-p0.25,role_trait,False,True,82.5496
4,layers_56:64-p0.01,role_trait,False,True,82.55543


In [71]:
fig_eq = plot_benchmark_results(df_eq, 'eq_bench', 
                                  title="EQ-Bench with Projection Capping", 
                                  subtitle=subtitle,
                                  total_layers=total_layers)
fig_eq.show()