# Benchmark analysis

In [2]:
import json
import os
import pandas as pd
import numpy as np
from collections import defaultdict

import plotly.graph_objects as go

# Import shared plotting utilities
from plots import (
    parse_experiment_id, parse_layer_sort_key, cap_sort_key,
    format_layer_range, format_cap_label,
    CONFIG_COLORS, CONFIG_DISPLAY_NAMES, CONFIG_ORDER
)

In [3]:
model = "llama-3.3-70b"
total_layers = 80
subtitle = f"{model.replace('-', ' ').title()}, Single-Shot & No Thinking"
base_dir = f"/workspace/{model}/capped"
out_dir = f"/root/git/plots/{model}/capped/results/benchmarks"

os.makedirs(out_dir, exist_ok=True)

config_names = ["baseline", "role_trait", "jailbreak", "lmsys_10000"]

# Eval-specific configurations
EVAL_CONFIGS = {
    'ifeval': {
        'metrics': [
            {
                'name': 'prompt_level_strict_acc,none',
                'label': 'Prompt-level Accuracy',
                'stderr': 'prompt_level_strict_acc_stderr,none'
            },
            {
                'name': 'inst_level_strict_acc,none',
                'label': 'Instruction-level Accuracy',
                'stderr': None
            }
        ],
        'y_range': [0, 0.8]
    },
    'mmlu_pro': {
        'metrics': [
            {
                'name': 'exact_match,custom-extract',
                'label': 'Multiple Choice Exact Match Accuracy',
                'stderr': 'exact_match_stderr,custom-extract'
            }
        ],
        'y_range': [0, 0.8]
    },
    'eq_bench': {
        'metrics': [
            {
                'name': 'eqbench,none',
                'label': 'EQ-Bench Score (Higher is Better)',
                'stderr': 'eqbench_stderr,none'
            }
        ],
        'y_range': [0, 100]
    },
    'gsm8k': {
        'metrics': [
            {
                'name': 'exact_match,flexible-extract',
                'label': 'GSM8K Exact Match Accuracy',
                'stderr': 'exact_match_stderr,flexible-extract'
            }
        ],
        'y_range': [0, 1]
    }
}

print("Configuration loaded")

Configuration loaded


In [4]:
def load_experiment_data(tasks, config_names, base_dir):
    """
    Load experiment data for specified tasks and configs.

    Args:
        tasks: List of task names (e.g., ['ifeval', 'mmlu_pro', 'eq_bench'])
        config_names: List of config names (e.g., ['baseline', 'role_trait', 'jailbreak'])
        base_dir: Base directory containing benchmarks folder

    Returns:
        DataFrame with columns: task_name, config_name, experiment_id, run_dir,
                               thinking, apply_chat_template, and all metrics from results
    """
    all_rows = []
    bench_dir = f"{base_dir}/benchmarks"

    for task in tasks:
        task_dir = f"{bench_dir}/{task}"

        if not os.path.exists(task_dir):
            print(f"Warning: Task directory not found: {task_dir}")
            continue

        for config_name in config_names:
            config_dir = f"{task_dir}/{config_name}"

            if not os.path.exists(config_dir):
                print(f"Warning: Config directory not found: {config_dir}")
                continue

            if config_name == "baseline":
                # Baseline: iterate through all timestamped runs directly
                run_dirs = [d for d in os.listdir(config_dir) if d.startswith("2025-")]

                for run_dir in sorted(run_dirs):
                    results_path = os.path.join(config_dir, run_dir, "results.json")
                    manifest_path = os.path.join(config_dir, run_dir, "manifest.json")

                    if os.path.exists(results_path):
                        with open(results_path, "r") as f:
                            data = json.load(f)

                        # Load manifest for thinking and apply_chat_template
                        thinking = False
                        apply_chat_template = False
                        vllm = False
                        max_gen_toks = None
                        dtype = None
                        if os.path.exists(manifest_path):
                            with open(manifest_path, "r") as f:
                                manifest = json.load(f)
                                thinking = manifest.get("thinking", None)
                                if thinking is None:
                                    thinking = False
                                apply_chat_template = manifest.get("apply_chat_template", False)
                                vllm = manifest.get("vllm", False)
                                max_gen_toks = manifest.get("max_gen_toks", None)
                                dtype = manifest.get("dtype", None)
                                if dtype is None:
                                    dtype = manifest.get("torch_dtype", None)
                        # Get the task results
                        if "results" in data and task in data["results"]:
                            row = {
                                "task_name": task,
                                "config_name": config_name,
                                "experiment_id": "baseline",
                                "run_dir": run_dir,
                                "thinking": thinking,
                                "apply_chat_template": apply_chat_template,
                                "vllm": vllm,
                                "max_gen_toks": max_gen_toks,
                                "dtype": dtype
                            }
                            # Add all metrics from the task results
                            row.update(data["results"][task])
                            
                            # Drop alias field if present
                            row.pop("alias", None)

                            # For MMLU Pro, also add category-level metrics
                            if task == "mmlu_pro":
                                for key, val in data["results"].items():
                                    if key.startswith("mmlu_pro_") and isinstance(val, dict):
                                        category = key.replace("mmlu_pro_", "")
                                        row[f"{category}_acc"] = val.get("exact_match,custom-extract", None)
                                        row[f"{category}_stderr"] = val.get("exact_match_stderr,custom-extract", None)

                            all_rows.append(row)
            else:
                # Other configs: iterate through experiment_id directories
                for experiment_id in os.listdir(config_dir):
                    exp_dir = os.path.join(config_dir, experiment_id)

                    if not os.path.isdir(exp_dir):
                        continue

                    # Load all timestamped runs for this experiment
                    run_dirs = [d for d in os.listdir(exp_dir) if d.startswith("2025-")]

                    for run_dir in sorted(run_dirs):
                        results_path = os.path.join(exp_dir, run_dir, "results.json")
                        manifest_path = os.path.join(exp_dir, run_dir, "manifest.json")

                        if os.path.exists(results_path):
                            with open(results_path, "r") as f:
                                data = json.load(f)

                            # Load manifest for thinking and apply_chat_template
                            thinking = False
                            apply_chat_template = False
                            vllm = False
                            max_gen_toks = None
                            dtype = None
                            if os.path.exists(manifest_path):
                                with open(manifest_path, "r") as f:
                                    manifest = json.load(f)
                                    thinking = manifest.get("thinking", None)
                                    if thinking is None:
                                        thinking = False
                                    apply_chat_template = manifest.get("apply_chat_template", False)
                                    vllm = manifest.get("vllm", False)
                                    max_gen_toks = manifest.get("max_gen_toks", None)

                                    # check for either dtype or torch_dtype
                                    dtype = manifest.get("dtype", None)
                                    if dtype is None:
                                        dtype = manifest.get("torch_dtype", None)

                            # Get the task results
                            if "results" in data and task in data["results"]:
                                row = {
                                    "task_name": task,
                                    "config_name": config_name,
                                    "experiment_id": experiment_id,
                                    "run_dir": run_dir,
                                    "thinking": thinking,
                                    "apply_chat_template": apply_chat_template,
                                    "vllm": vllm,
                                    "max_gen_toks": max_gen_toks,
                                    "dtype": dtype
                                }
                                # Add all metrics from the task results
                                row.update(data["results"][task])
                                
                                # Drop alias field if present
                                row.pop("alias", None)

                                # For MMLU Pro, also add category-level metrics
                                if task == "mmlu_pro":
                                    for key, val in data["results"].items():
                                        if key.startswith("mmlu_pro_") and isinstance(val, dict):
                                            category = key.replace("mmlu_pro_", "")
                                            row[f"{category}_acc"] = val.get("exact_match,custom-extract", None)
                                            row[f"{category}_stderr"] = val.get("exact_match_stderr,custom-extract", None)

                                all_rows.append(row)

    df = pd.DataFrame(all_rows)
    return df

print("Data loading function defined")

Data loading function defined


In [5]:
def plot_benchmark_results(df, eval_name, title, subtitle, total_layers=64):
    """
    Generic benchmark plotting function that works for any eval.
    
    Args:
        df: DataFrame with experiment results (should have only one baseline row)
        eval_name: Name of eval ('ifeval', 'mmlu_pro', 'eq_bench')
        title: Plot title
        subtitle: Plot subtitle  
        total_layers: Total number of layers in the model (default: 64)
    
    Returns:
        Plotly figure object
    """
    import numpy as np
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    
    # Get eval-specific configuration
    eval_config = EVAL_CONFIGS[eval_name]
    metrics = eval_config['metrics']
    y_range = eval_config['y_range']
    
    # Parse experiment IDs and add metadata columns
    df_sorted = df.copy()
    
    # Parse each experiment_id
    parsed = df_sorted['experiment_id'].apply(parse_experiment_id)
    df_sorted['layer_spec'] = parsed.apply(lambda x: x[0])
    df_sorted['cap_type'] = parsed.apply(lambda x: x[1])
    df_sorted['cap_value'] = parsed.apply(lambda x: x[2])
    
    # Add human-readable labels
    df_sorted['layer_label'] = df_sorted['layer_spec'].apply(
        lambda x: format_layer_range(x, total_layers)
    )
    df_sorted['cap_label'] = df_sorted['cap_value'].apply(format_cap_label)
    df_sorted['display_name'] = df_sorted.apply(
        lambda row: f"{row['layer_label']}, {row['cap_label']}" 
                    if row['experiment_id'] != 'baseline' 
                    else "Baseline",
        axis=1
    )
    
    # Create sort key for ordering (baseline first, then by config)
    config_order = {cfg: i for i, cfg in enumerate(['baseline', 'jailbreak', 'role_trait', 'lmsys_10000'])}
    df_sorted['sort_key'] = df_sorted['config_name'].map(config_order)
    
    # Sort by config, then layer_spec, then cap_value
    # Create layer sort keys for consistent ordering
    df_sorted['layer_sort_key'] = df_sorted['layer_spec'].apply(
        lambda x: parse_layer_sort_key(x, total_layers)
    )
    
    # Use imported cap_sort_key function
    df_sorted['cap_sort_key'] = df_sorted['cap_value'].apply(cap_sort_key)
    
    df_sorted = df_sorted.sort_values(
        ['sort_key', 'layer_sort_key', 'cap_sort_key'], 
        na_position='first'
    ).reset_index(drop=True)
    
    # Create a unique layer_group_id for position calculation
    # Baseline gets its own unique ID to prevent overlap with first non-baseline group
    df_sorted['layer_group_id'] = df_sorted.apply(
        lambda row: 'baseline' if row['experiment_id'] == 'baseline' 
                   else f"{row['config_name']}_{row['layer_spec']}", 
        axis=1
    )
    
    # Calculate x positions with gaps between layer groups
    x_positions = []
    current_x = 0
    prev_group_id = None
    
    bar_width = 0.35
    gap_within_group = 0.05
    gap_between_layers = 0.15
    
    for _, row in df_sorted.iterrows():
        if prev_group_id is not None and row['layer_group_id'] != prev_group_id:
            current_x += bar_width + gap_between_layers
        elif prev_group_id is not None:
            current_x += bar_width + gap_within_group
        x_positions.append(current_x)
        prev_group_id = row['layer_group_id']
    x_positions = np.array(x_positions)
    
    # Get colors for each bar
    colors = [CONFIG_COLORS.get(row['config_name'], '#636EFA') 
              for _, row in df_sorted.iterrows()]
    
    # Create subplots (one row per metric)
    num_metrics = len(metrics)
    subplot_titles = [m['label'] for m in metrics]
    
    fig = make_subplots(
        rows=num_metrics, cols=1,
        subplot_titles=subplot_titles,
        vertical_spacing=0.12 if num_metrics > 1 else 0
    )
    
    # Add traces for each metric
    for metric_idx, metric in enumerate(metrics, start=1):
        metric_name = metric['name']
        stderr_name = metric['stderr']
        
        # Get stderr values if available
        error_y = None
        if stderr_name and stderr_name in df_sorted.columns:
            stderr_vals = df_sorted[stderr_name].apply(
                lambda x: float(x) if x != 'N/A' and pd.notna(x) else None
            )
            error_y = dict(type='data', array=stderr_vals, visible=True, thickness=0.5)
        
        # Add bar trace
        fig.add_trace(
            go.Bar(
                name=metric['label'],
                x=x_positions,
                y=df_sorted[metric_name],
                error_y=error_y,
                marker_color=colors,
                opacity=0.8,
                width=bar_width,
                showlegend=False,
                hovertemplate="%{customdata[0]}<br>Config: %{customdata[1]}<br>Value: %{y:.1%}<extra></extra>",
                customdata=np.column_stack([df_sorted['display_name'], df_sorted['config_name']]),
            ),
            row=metric_idx, col=1
        )

    baseline_row = df_sorted[df_sorted['experiment_id'] == 'baseline']
    if len(baseline_row) > 0:
        for metric_idx, metric in enumerate(metrics, start=1):
            metric_name = metric['name']
            baseline_value = baseline_row[metric_name].iloc[0]

            fig.add_hline(
                y=baseline_value,
                line_dash="dot",
                line_color="green",
                line_width=1,
                opacity=0.8,
                row=metric_idx,
                col=1
            )
    
    # Add legend entries for configs (only once, attached to first subplot)
    present_configs = [c for c in config_order.keys() if c in df_sorted['config_name'].values]
    for config in present_configs:
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None], mode='markers',
                marker=dict(size=10, color=CONFIG_COLORS[config]),
                name=CONFIG_DISPLAY_NAMES[config],
                showlegend=True
            ),
            row=1, col=1
        )
    
    # Place legend in top-right
    fig.update_layout(
        legend=dict(
            x=1.0, y=1.08, xanchor='right', yanchor='bottom',
            orientation='h',
        )
    )
    
    # Add single centered annotation about cap ordering
    fig.add_annotation(
        x=1, y=1,
        text="<span style='font-size:10px'>Projection caps in each bar group<br>(L-R) strict to relaxed</span>",
        showarrow=False, xref='paper', yref='paper',
        xanchor='right', yanchor='top'
    )
    
    # Calculate layer group centers for x-axis labels
    # Use layer_group_id to properly detect group boundaries
    layer_centers, layer_labels = [], []
    prev_group_id = None
    group_start = 0
    
    for i, (_, row) in enumerate(df_sorted.iterrows()):
        if prev_group_id is not None and row['layer_group_id'] != prev_group_id:
            group_center = (x_positions[group_start] + x_positions[i-1]) / 2
            layer_centers.append(group_center)
            layer_labels.append(df_sorted.iloc[group_start]['layer_label'])
            group_start = i
        prev_group_id = row['layer_group_id']
    
    # Add last group
    if len(df_sorted) > 0:
        group_center = (x_positions[group_start] + x_positions[-1]) / 2
        layer_centers.append(group_center)
        layer_labels.append(df_sorted.iloc[group_start]['layer_label'])
    
    # Update layout and axes
    fig.update_layout(
        title=dict(text=f"{title}", subtitle=dict(text=subtitle)),
        height=350 * num_metrics + 200, 
        width=1000, 
        margin=dict(b=100, t=150)
    )
    
    # Update axes for all subplots
    for metric_idx in range(1, num_metrics + 1):
        # Y-axis
        fig.update_yaxes(
            title_text="Score", 
            range=y_range, 
            row=metric_idx, col=1, 
            tickfont=dict(size=10)
        )
        
        # X-axis (only label bottom subplot)
        if metric_idx == num_metrics:
            fig.update_xaxes(title_text="Intervention Layers", row=metric_idx, col=1)
        
        # Set x-axis ticks
        fig.update_xaxes(
            tickmode='array', 
            tickvals=layer_centers, 
            ticktext=layer_labels,
            range=[x_positions[0] - 0.5, x_positions[-1] + 0.5], 
            row=metric_idx, col=1, 
            tickfont=dict(size=10)
        )
    
    return fig

## IFEval

In [4]:
# Load IFEval data
df = load_experiment_data(["ifeval"], config_names, base_dir)
print(f"Loaded {len(df)} IFEval experiment runs")
print(f"\nConfig breakdown:")
for config in df['config_name'].unique():
    config_df = df[df['config_name'] == config]
    unique_exps = config_df['experiment_id'].unique()
    print(f"  {config}: {len(unique_exps)} unique experiments")
    for exp in sorted(unique_exps):
        exp_count = len(config_df[config_df['experiment_id'] == exp])
        print(f"    - {exp} ({exp_count} run{'s' if exp_count > 1 else ''})")

Loaded 89 IFEval experiment runs

Config breakdown:
  baseline: 1 unique experiments
    - baseline (1 run)
  role_trait: 36 unique experiments
    - layers_0:16-p0.01 (1 run)
    - layers_0:16-p0.25 (1 run)
    - layers_0:16-p0.5 (1 run)
    - layers_0:16-p0.75 (1 run)
    - layers_0:64-p0.01 (1 run)
    - layers_0:64-p0.25 (1 run)
    - layers_0:64-p0.5 (1 run)
    - layers_0:64-p0.75 (1 run)
    - layers_16:24-p0.01 (1 run)
    - layers_16:24-p0.25 (1 run)
    - layers_16:24-p0.5 (1 run)
    - layers_16:24-p0.75 (1 run)
    - layers_24:32-p0.01 (1 run)
    - layers_24:32-p0.25 (1 run)
    - layers_24:32-p0.5 (1 run)
    - layers_24:32-p0.75 (1 run)
    - layers_32:40-p0.01 (1 run)
    - layers_32:40-p0.25 (1 run)
    - layers_32:40-p0.5 (1 run)
    - layers_32:40-p0.75 (1 run)
    - layers_40:48-p0.01 (1 run)
    - layers_40:48-p0.25 (1 run)
    - layers_40:48-p0.5 (1 run)
    - layers_40:48-p0.75 (1 run)
    - layers_48:56-p0.01 (1 run)
    - layers_48:56-p0.25 (1 run)
    - layers

In [86]:
df[['config_name', 'experiment_id', 'thinking', 'apply_chat_template', 'prompt_level_strict_acc,none', 'inst_level_strict_acc,none']].head(5)


Unnamed: 0,config_name,experiment_id,thinking,apply_chat_template,"prompt_level_strict_acc,none","inst_level_strict_acc,none"
0,baseline,baseline,False,False,0.468,0.601552
1,role_trait,layers_56:64-p0.75,False,True,0.51,0.635188
2,role_trait,layers_56:64-p0.5,False,True,0.506,0.6326
3,role_trait,layers_56:64-p0.25,False,True,0.494,0.624838
4,role_trait,layers_56:64-p0.01,False,True,0.47,0.609314


In [87]:
# Filter to single baseline (keeping the one without thinking)

# Plot using modular function
fig = plot_benchmark_results(df, 'ifeval', 
                             title="IFEval with Projection Capping", 
                             subtitle=subtitle,
                             total_layers=total_layers)
fig.show()
#fig.write_html(f"{out_dir}/ifeval.html")

## MMLU Pro

In [None]:
# Load MMLU Pro data
df_mmlu = load_experiment_data(["mmlu_pro"], config_names, base_dir)
print(f"Loaded {len(df_mmlu)} MMLU Pro experiment runs")
print(f"\nConfig breakdown:")
for config in df_mmlu['config_name'].unique():
    config_df = df_mmlu[df_mmlu['config_name'] == config]
    unique_exps = config_df['experiment_id'].unique()
    print(f"  {config}: {len(unique_exps)} unique experiments")
    for exp in sorted(unique_exps):
        exp_count = len(config_df[config_df['experiment_id'] == exp])
        print(f"    - {exp} ({exp_count} run{'s' if exp_count > 1 else ''})")


Loaded 145 MMLU Pro experiment runs

Config breakdown:
  baseline: 1 unique experiments
    - baseline (5 runs)
  role_trait: 36 unique experiments
    - layers_0:16-p0.01 (2 runs)
    - layers_0:16-p0.25 (2 runs)
    - layers_0:16-p0.5 (2 runs)
    - layers_0:16-p0.75 (2 runs)
    - layers_0:64-p0.01 (3 runs)
    - layers_0:64-p0.25 (3 runs)
    - layers_0:64-p0.5 (3 runs)
    - layers_0:64-p0.75 (3 runs)
    - layers_16:24-p0.01 (2 runs)
    - layers_16:24-p0.25 (2 runs)
    - layers_16:24-p0.5 (2 runs)
    - layers_16:24-p0.75 (2 runs)
    - layers_24:32-p0.01 (2 runs)
    - layers_24:32-p0.25 (2 runs)
    - layers_24:32-p0.5 (2 runs)
    - layers_24:32-p0.75 (2 runs)
    - layers_32:40-p0.01 (2 runs)
    - layers_32:40-p0.25 (2 runs)
    - layers_32:40-p0.5 (2 runs)
    - layers_32:40-p0.75 (2 runs)
    - layers_40:48-p0.01 (2 runs)
    - layers_40:48-p0.25 (2 runs)
    - layers_40:48-p0.5 (2 runs)
    - layers_40:48-p0.75 (2 runs)
    - layers_48:56-p0.01 (3 runs)
    - layers_48:

In [29]:
# Filter for specific experiment_id and print the results
filtered_df = df_mmlu[(df_mmlu['experiment_id'].str.startswith('layers_0:64'))]
filtered_df[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'vllm', 'max_gen_toks', 'dtype', 'exact_match,custom-extract']].head(20)

Unnamed: 0,experiment_id,config_name,thinking,apply_chat_template,vllm,max_gen_toks,dtype,"exact_match,custom-extract"
65,layers_0:64-p0.75,role_trait,False,True,False,2048.0,bfloat16,0.665
66,layers_0:64-p0.75,role_trait,False,True,True,512.0,bfloat16,0.486429
67,layers_0:64-p0.75,role_trait,False,True,True,,float16,0.648571
68,layers_0:64-p0.5,role_trait,False,True,False,2048.0,bfloat16,0.643571
69,layers_0:64-p0.5,role_trait,False,True,True,512.0,bfloat16,0.487143
70,layers_0:64-p0.5,role_trait,False,True,True,,float16,0.645
71,layers_0:64-p0.25,role_trait,False,True,False,2048.0,bfloat16,0.640714
72,layers_0:64-p0.25,role_trait,False,True,True,512.0,bfloat16,0.485714
73,layers_0:64-p0.25,role_trait,False,True,True,,float16,0.622143
74,layers_0:64-p0.01,role_trait,False,True,False,2048.0,bfloat16,0.58


In [None]:
df_mmlu_plot = df_mmlu[~((df_mmlu['config_name'] == 'baseline') & (df_mmlu['thinking'] == True)) & ~((df_mmlu['config_name'] == 'baseline') & (df_mmlu['apply_chat_template'] == False))]

fig_mmlu = plot_benchmark_results(df_mmlu_plot, 'mmlu_pro', 
                                  title="MMLU Pro with Projection Capping", 
                                  subtitle=subtitle,
                                  total_layers=total_layers)
fig_mmlu.show()
#fig_mmlu.write_html(f"{out_dir}/mmlu_pro.html")

## EQ-Bench

In [10]:
# Load EQ-Bench data
df_eq = load_experiment_data(["eq_bench"], config_names, base_dir)
print(f"Loaded {len(df_eq)} EQ-Bench experiment runs")
print(f"\nConfig breakdown:")
for config in df_eq['config_name'].unique():
    config_df = df_eq[df_eq['config_name'] == config]
    unique_exps = config_df['experiment_id'].unique()
    print(f"  {config}: {len(unique_exps)} unique experiments")
    for exp in sorted(unique_exps):
        exp_count = len(config_df[config_df['experiment_id'] == exp])
        print(f"    - {exp} ({exp_count} run{'s' if exp_count > 1 else ''})")

Loaded 261 EQ-Bench experiment runs

Config breakdown:
  baseline: 1 unique experiments
    - baseline (1 run)
  role_trait: 132 unique experiments
    - layers_0:20-p0.01 (1 run)
    - layers_0:20-p0.25 (1 run)
    - layers_0:20-p0.5 (1 run)
    - layers_0:20-p0.75 (1 run)
    - layers_0:40-p0.01 (2 runs)
    - layers_0:40-p0.25 (2 runs)
    - layers_0:40-p0.5 (2 runs)
    - layers_0:40-p0.75 (2 runs)
    - layers_0:80-p0.01 (3 runs)
    - layers_0:80-p0.25 (3 runs)
    - layers_0:80-p0.5 (3 runs)
    - layers_0:80-p0.75 (3 runs)
    - layers_10:20-p0.01 (1 run)
    - layers_10:20-p0.25 (1 run)
    - layers_10:20-p0.5 (1 run)
    - layers_10:20-p0.75 (1 run)
    - layers_20:30-p0.01 (1 run)
    - layers_20:30-p0.25 (1 run)
    - layers_20:30-p0.5 (1 run)
    - layers_20:30-p0.75 (1 run)
    - layers_32:40-p0.01 (2 runs)
    - layers_32:40-p0.25 (2 runs)
    - layers_32:40-p0.5 (2 runs)
    - layers_32:40-p0.75 (2 runs)
    - layers_32:48-p0.01 (2 runs)
    - layers_32:48-p0.25 (2 runs

In [11]:
df_eq[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'eqbench,none']].head(5)

Unnamed: 0,experiment_id,config_name,thinking,apply_chat_template,"eqbench,none"
0,baseline,baseline,False,True,83.823657
1,layers_72:80-p0.75,role_trait,False,True,82.342677
2,layers_72:80-p0.75,role_trait,False,True,83.064309
3,layers_72:80-p0.5,role_trait,False,True,82.335118
4,layers_72:80-p0.5,role_trait,False,True,82.764016


In [None]:
fig_eq = plot_benchmark_results(df_eq, 'eq_bench', 
                                  title="EQ-Bench with Projection Capping", 
                                  subtitle=subtitle,
                                  total_layers=total_layers)
fig_eq.show()
#fig_eq.write_html(f"{out_dir}/eq_bench.html")

## GSM8K

In [6]:
df_gsm8k = load_experiment_data(["gsm8k"], config_names, base_dir)
print(f"Loaded {len(df_gsm8k)} GSM8K experiment runs")
print(f"\nConfig breakdown:")
for config in df_gsm8k['config_name'].unique():
    config_df = df_gsm8k[df_gsm8k['config_name'] == config]
    unique_exps = config_df['experiment_id'].unique()
    print(f"  {config}: {len(unique_exps)} unique experiments")
    for exp in sorted(unique_exps):
        exp_count = len(config_df[config_df['experiment_id'] == exp])
        print(f"    - {exp} ({exp_count} run{'s' if exp_count > 1 else ''})")

Loaded 241 GSM8K experiment runs

Config breakdown:
  baseline: 1 unique experiments
    - baseline (1 run)
  role_trait: 120 unique experiments
    - layers_0:40-p0.01 (2 runs)
    - layers_0:40-p0.25 (2 runs)
    - layers_0:40-p0.5 (2 runs)
    - layers_0:40-p0.75 (2 runs)
    - layers_0:80-p0.01 (2 runs)
    - layers_0:80-p0.25 (2 runs)
    - layers_0:80-p0.5 (2 runs)
    - layers_0:80-p0.75 (2 runs)
    - layers_32:40-p0.01 (2 runs)
    - layers_32:40-p0.25 (2 runs)
    - layers_32:40-p0.5 (2 runs)
    - layers_32:40-p0.75 (2 runs)
    - layers_32:48-p0.01 (2 runs)
    - layers_32:48-p0.25 (2 runs)
    - layers_32:48-p0.5 (2 runs)
    - layers_32:48-p0.75 (2 runs)
    - layers_32:56-p0.01 (2 runs)
    - layers_32:56-p0.25 (2 runs)
    - layers_32:56-p0.5 (2 runs)
    - layers_32:56-p0.75 (2 runs)
    - layers_36:44-p0.01 (2 runs)
    - layers_36:44-p0.25 (2 runs)
    - layers_36:44-p0.5 (2 runs)
    - layers_36:44-p0.75 (2 runs)
    - layers_36:52-p0.01 (2 runs)
    - layers_36:52-

In [18]:
df_gsm8k[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'vllm', 'max_gen_toks', 'dtype', 'exact_match,flexible-extract']].head(20)

Unnamed: 0,experiment_id,config_name,thinking,apply_chat_template,vllm,max_gen_toks,dtype,"exact_match,flexible-extract"
0,baseline,baseline,False,True,True,,float16,0.891
1,layers_72:80-p0.75,role_trait,False,True,True,,float16,0.87
2,layers_72:80-p0.75,role_trait,False,True,False,2048.0,bfloat16,0.884
3,layers_72:80-p0.5,role_trait,False,True,True,,float16,0.865
4,layers_72:80-p0.5,role_trait,False,True,False,2048.0,bfloat16,0.881
5,layers_72:80-p0.25,role_trait,False,True,True,,float16,0.864
6,layers_72:80-p0.25,role_trait,False,True,False,2048.0,bfloat16,0.887
7,layers_72:80-p0.01,role_trait,False,True,True,,float16,0.866
8,layers_72:80-p0.01,role_trait,False,True,False,2048.0,bfloat16,0.884
9,layers_68:76-p0.75,role_trait,False,True,True,,float16,0.871


In [33]:
df_gsm8k_plot = df_gsm8k[~(df_gsm8k['vllm'])]
df_gsm8k_plot[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'vllm', 'max_gen_toks', 'dtype', 'exact_match,flexible-extract']].head(20)

# print number of rows in df_gsm8k_plot
print(f"Number of rows in df_gsm8k_plot: {len(df_gsm8k_plot)}")


Number of rows in df_gsm8k_plot: 89


In [45]:
fig_gsm8k = plot_benchmark_results(df_gsm8k_plot, 'gsm8k', 
                                  title="GSM8K with Projection Capping", 
                                  subtitle=subtitle,
                                  total_layers=total_layers)
fig_gsm8k.show()
fig_gsm8k.write_html(f"{out_dir}/gsm8k.html")
print(f"saved to {out_dir}/gsm8k.html")

saved to /root/git/plots/qwen-3-32b/capped/results/benchmarks/gsm8k.html


## Comparing VLLM with HF

In [15]:
# VLLM vs Non-VLLM Comparison Analysis
# Compare experiment pairs that match on specified criteria but differ in vllm setting
filtered_df = df_eq
target_metric = EVAL_CONFIGS['eq_bench']['metrics'][0]['name']

# First, let's examine the data structure to understand what we're working with
print("MMLU Data Overview:")
print(f"Total experiments: {len(filtered_df)}")
print(f"Unique vllm values: {filtered_df['vllm'].unique()}")
print(f"Unique max_gen_toks values: {filtered_df['max_gen_toks'].unique()}")
print(f"Unique thinking values: {filtered_df['thinking'].unique()}")
print(f"Unique apply_chat_template values: {filtered_df['apply_chat_template'].unique()}")

# Show sample of the data
print("\nSample data:")
print(filtered_df[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'vllm', 'max_gen_toks', 'dtype', target_metric]].head(10))


MMLU Data Overview:
Total experiments: 261
Unique vllm values: [ True False]
Unique max_gen_toks values: [  nan 2048.]
Unique thinking values: [False]
Unique apply_chat_template values: [ True]

Sample data:
        experiment_id config_name  thinking  apply_chat_template   vllm  \
0            baseline    baseline     False                 True   True   
1  layers_72:80-p0.75  role_trait     False                 True   True   
2  layers_72:80-p0.75  role_trait     False                 True  False   
3   layers_72:80-p0.5  role_trait     False                 True   True   
4   layers_72:80-p0.5  role_trait     False                 True  False   
5  layers_72:80-p0.25  role_trait     False                 True   True   
6  layers_72:80-p0.25  role_trait     False                 True  False   
7  layers_72:80-p0.01  role_trait     False                 True   True   
8  layers_72:80-p0.01  role_trait     False                 True  False   
9  layers_68:76-p0.75  role_trait     Fals

In [16]:
# Group experiments by matching criteria and find pairs
# Create a grouping key based on the matching criteria
filtered_df['group_key'] = filtered_df.apply(
    lambda row: f"{row['experiment_id']}|{row['config_name']}|{row['thinking']}|{row['apply_chat_template']}", 
    axis=1
)

# Group by the key and check for pairs
grouped = filtered_df.groupby('group_key')

# Find groups that have both vllm=True and vllm=False
pairs_found = []
for group_key, group in grouped:
    vllm_values = group['vllm'].unique()
    if len(vllm_values) == 2 and True in vllm_values and False in vllm_values:
        pairs_found.append(group_key)

print(f"Found {len(pairs_found)} experiment pairs with both vllm=True and vllm=False")
print(f"Total groups: {len(grouped)}")

# Show some examples of the pairs
if pairs_found:
    print("\nExample pairs:")
    for i, pair_key in enumerate(pairs_found[:3]):  # Show first 3 pairs
        pair_data = grouped.get_group(pair_key)
        print(f"\nPair {i+1}: {pair_key}")
        print(pair_data[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'vllm', 'max_gen_toks', target_metric]].to_string(index=False))


Found 120 experiment pairs with both vllm=True and vllm=False
Total groups: 133

Example pairs:

Pair 1: layers_0:40-p0.01|role_trait|False|True
    experiment_id config_name  thinking  apply_chat_template  vllm  max_gen_toks  eqbench,none
layers_0:40-p0.01  role_trait     False                 True  True           NaN     78.157594
layers_0:40-p0.01  role_trait     False                 True False           NaN     78.920256

Pair 2: layers_0:40-p0.25|role_trait|False|True
    experiment_id config_name  thinking  apply_chat_template  vllm  max_gen_toks  eqbench,none
layers_0:40-p0.25  role_trait     False                 True  True           NaN     80.888162
layers_0:40-p0.25  role_trait     False                 True False           NaN     81.495005

Pair 3: layers_0:40-p0.5|role_trait|False|True
   experiment_id config_name  thinking  apply_chat_template  vllm  max_gen_toks  eqbench,none
layers_0:40-p0.5  role_trait     False                 True  True           NaN     82.271560


In [17]:
# Calculate deltas for all pairs
comparison_results = []

for pair_key in pairs_found:
    pair_data = grouped.get_group(pair_key)
    
    # Get vllm=True and vllm=False rows
    vllm_true = pair_data[pair_data['vllm'] == True]
    vllm_false = pair_data[pair_data['vllm'] == False]
    
    # Calculate average scores for each group (in case there are multiple runs)
    vllm_true_score = vllm_true[target_metric].mean()
    vllm_false_score = vllm_false[target_metric].mean()
    
    # Calculate delta (vllm=True - vllm=False)
    delta = vllm_true_score - vllm_false_score
    
    # Get the common attributes
    common_attrs = {
        'experiment_id': pair_data['experiment_id'].iloc[0],
        'config_name': pair_data['config_name'].iloc[0],
        'thinking': pair_data['thinking'].iloc[0],
        'apply_chat_template': pair_data['apply_chat_template'].iloc[0],
        'max_gen_toks': pair_data['max_gen_toks'].iloc[0],
        'vllm_true_score': vllm_true_score,
        'vllm_false_score': vllm_false_score,
        'delta': delta,
        'vllm_true_runs': len(vllm_true),
        'vllm_false_runs': len(vllm_false)
    }
    
    comparison_results.append(common_attrs)

# Convert to DataFrame for easier analysis
comparison_df = pd.DataFrame(comparison_results)

print(f"Comparison Results Summary:")
print(f"Total pairs analyzed: {len(comparison_df)}")
print(f"Average delta (vllm=True - vllm=False): {comparison_df['delta'].mean():.4f}")
print(f"Median delta: {comparison_df['delta'].median():.4f}")
print(f"Standard deviation of deltas: {comparison_df['delta'].std():.4f}")

# Show the results
print("\nDetailed comparison results:")
print(comparison_df[['experiment_id', 'config_name', 'thinking', 'apply_chat_template', 'max_gen_toks', 
                     'vllm_true_score', 'vllm_false_score', 'delta', 'vllm_true_runs', 'vllm_false_runs']].to_string(index=False))


Comparison Results Summary:
Total pairs analyzed: 120
Average delta (vllm=True - vllm=False): -1.3070
Median delta: -1.2681
Standard deviation of deltas: 0.5085

Detailed comparison results:
     experiment_id config_name  thinking  apply_chat_template  max_gen_toks  vllm_true_score  vllm_false_score     delta  vllm_true_runs  vllm_false_runs
 layers_0:40-p0.01  role_trait     False                 True           NaN        78.157594         78.920256 -0.762662               1                1
 layers_0:40-p0.25  role_trait     False                 True           NaN        80.888162         81.495005 -0.606844               1                1
  layers_0:40-p0.5  role_trait     False                 True           NaN        82.271560         82.531722 -0.260161               1                1
 layers_0:40-p0.75  role_trait     False                 True           NaN        82.455596         83.574727 -1.119132               1                1
 layers_0:80-p0.01  role_trait     Fals