# Pareto Frontier Analysis: Eval Performance vs Jailbreak Harm Reduction

This notebook analyzes the tradeoff between:
- **Eval Performance**: Percentage change in benchmark scores (MMLU, IFEval, EQ-Bench)
- **Harm Reduction**: Percentage change in jailbreak harmful response rate

The Pareto frontier identifies configurations that achieve optimal tradeoffs between these objectives.

In [1]:
import json
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import shared plotting utilities
from plots import (
    parse_experiment_id, parse_layer_sort_key, cap_sort_key,
    format_layer_range, format_cap_label,
    CONFIG_COLORS, CONFIG_DISPLAY_NAMES, CONFIG_ORDER
)

In [None]:
# ============================================================================
# Configuration
# ============================================================================

model = "qwen-3-32b"
total_layers = 64
subtitle = f"{model.replace('-', ' ').title()}, Single-Shot & No Thinking"
base_dir = f"/workspace/{model}"
out_dir = f"/root/git/plots/{model}/capped/results/pareto"

os.makedirs(out_dir, exist_ok=True)

# Configs to analyze
config_names = ["role_trait", "baseline"]

# ============================================================================
# Experiment ID Filter Configuration
# ============================================================================
# Set experiment_ids to None to include ALL experiments (default behavior)
# Or set to a list of specific experiment IDs to filter to only those experiments
# Example: experiment_ids = ["layers_32:48-p0.25", "layers_34:50-p0.25", "layers_36:52-p0.25"]

experiment_ids = None  # Set to None for all experiments, or list of IDs to filter

# ============================================================================

# Eval-specific metric mappings
EVAL_METRICS = {
    'ifeval': {
        'metric': 'inst_level_strict_acc,none',
        'display_name': 'IFEval Instruction-level Accuracy',
        'higher_is_better': True
    },
    'mmlu_pro': {
        'metric': 'exact_match,custom-extract',
        'display_name': 'MMLU Pro Exact Match Accuracy',
        'higher_is_better': True
    },
    'eq_bench': {
        'metric': 'eqbench,none',
        'display_name': 'EQ-Bench Score',
        'higher_is_better': True
    },
    'gsm8k': {
        'metric': 'exact_match,flexible-extract',
        'display_name': 'GSM8K Exact Match Accuracy',
        'higher_is_better': True
    }
}

if experiment_ids is not None:
    print(f"Configuration loaded - Filtering to {len(experiment_ids)} specific experiment IDs")
else:
    print("Configuration loaded - Including all experiment IDs")

## Data Loading Functions

In [5]:
def load_experiment_data(tasks, config_names, base_dir):
    """
    Load experiment data for specified tasks and configs.

    Args:
        tasks: List of task names (e.g., ['ifeval', 'mmlu_pro', 'eq_bench'])
        config_names: List of config names (e.g., ['baseline', 'role_trait', 'jailbreak'])
        base_dir: Base directory containing benchmarks folder

    Returns:
        DataFrame with columns: task_name, config_name, experiment_id, run_dir,
                               thinking, apply_chat_template, and all metrics from results
    """
    all_rows = []
    bench_dir = f"{base_dir}/capped/benchmarks"

    for task in tasks:
        task_dir = f"{bench_dir}/{task}"

        if not os.path.exists(task_dir):
            print(f"Warning: Task directory not found: {task_dir}")
            continue

        for config_name in config_names:
            config_dir = f"{task_dir}/{config_name}"

            if not os.path.exists(config_dir):
                print(f"Warning: Config directory not found: {config_dir}")
                continue

            if config_name == "baseline":
                # Baseline: iterate through all timestamped runs directly
                run_dirs = [d for d in os.listdir(config_dir) if d.startswith("2025-")]

                for run_dir in sorted(run_dirs):
                    results_path = os.path.join(config_dir, run_dir, "results.json")
                    manifest_path = os.path.join(config_dir, run_dir, "manifest.json")

                    if os.path.exists(results_path):
                        with open(results_path, "r") as f:
                            data = json.load(f)

                        # Load manifest for thinking and apply_chat_template
                        thinking = False
                        apply_chat_template = False
                        vllm = False
                        max_gen_toks = None
                        seed = None
                        limit = None
                        if os.path.exists(manifest_path):
                            with open(manifest_path, "r") as f:
                                manifest = json.load(f)
                                thinking = manifest.get("thinking", None)
                                if thinking is None:
                                    thinking = False
                                apply_chat_template = manifest.get("apply_chat_template", False)
                                vllm = manifest.get("vllm", False)
                                max_gen_toks = manifest.get("max_gen_toks", None)
                                seed = manifest.get("seed", None)
                                limit = manifest.get("limit", None)

                        # Get the task results
                        if "results" in data and task in data["results"]:
                            row = {
                                "task_name": task,
                                "config_name": config_name,
                                "experiment_id": "baseline",
                                "run_dir": run_dir,
                                "thinking": thinking,
                                "apply_chat_template": apply_chat_template,
                                "vllm": vllm,
                                "max_gen_toks": max_gen_toks,
                                "seed": seed,
                                "limit": limit
                            }
                            # Add all metrics from the task results
                            row.update(data["results"][task])
                            
                            # Drop alias field if present
                            row.pop("alias", None)

                            all_rows.append(row)
            else:
                # Other configs: iterate through experiment_id directories
                for experiment_id in os.listdir(config_dir):
                    exp_dir = os.path.join(config_dir, experiment_id)

                    if not os.path.isdir(exp_dir):
                        continue

                    # Load all timestamped runs for this experiment
                    run_dirs = [d for d in os.listdir(exp_dir) if d.startswith("2025-")]

                    for run_dir in sorted(run_dirs):
                        results_path = os.path.join(exp_dir, run_dir, "results.json")
                        manifest_path = os.path.join(exp_dir, run_dir, "manifest.json")

                        if os.path.exists(results_path):
                            with open(results_path, "r") as f:
                                data = json.load(f)

                            # Load manifest for thinking and apply_chat_template
                            thinking = False
                            apply_chat_template = False
                            vllm = False
                            max_gen_toks = None
                            seed = None
                            limit = None
                            if os.path.exists(manifest_path):
                                with open(manifest_path, "r") as f:
                                    manifest = json.load(f)
                                    thinking = manifest.get("thinking", None)
                                    if thinking is None:
                                        thinking = False
                                    apply_chat_template = manifest.get("apply_chat_template", False)
                                    vllm = manifest.get("vllm", False)
                                    max_gen_toks = manifest.get("max_gen_toks", None)
                                    seed = manifest.get("seed", None)
                                    limit = manifest.get("limit", None)

                            # Get the task results
                            if "results" in data and task in data["results"]:
                                row = {
                                    "task_name": task,
                                    "config_name": config_name,
                                    "experiment_id": experiment_id,
                                    "run_dir": run_dir,
                                    "thinking": thinking,
                                    "apply_chat_template": apply_chat_template,
                                    "vllm": vllm,
                                    "max_gen_toks": max_gen_toks,
                                    "seed": seed,
                                    "limit": limit
                                }
                                # Add all metrics from the task results
                                row.update(data["results"][task])
                                
                                # Drop alias field if present
                                row.pop("alias", None)

                                all_rows.append(row)

    df = pd.DataFrame(all_rows)
    return df

print("Benchmark loading function defined")

Benchmark loading function defined


In [6]:
def load_jailbreak_scores(config_name, base_dir):
    """
    Load jailbreak scores and calculate combined harm rate.
    
    Args:
        config_name: Name of config (e.g., 'jailbreak', 'lmsys_10000')
        base_dir: Base directory containing results folder
    
    Returns:
        DataFrame with columns: experiment_id, harm_rate (percentage)
    """
    scores_path = f"{base_dir}/capped/results/{config_name}_jailbreak_1100_scores.jsonl"
    
    if not os.path.exists(scores_path):
        print(f"Warning: Jailbreak scores not found: {scores_path}")
        return pd.DataFrame()
    
    # Load scores
    with open(scores_path, "r") as f:
        scores = [json.loads(line) for line in f]

    scores_path = f"{base_dir}/capped/results/{config_name}_sliding_jailbreak_1100_scores.jsonl"
    
    if not os.path.exists(scores_path):
        print(f"Warning: Jailbreak scores not found: {scores_path}")
        return pd.DataFrame()
    
    # Load scores
    with open(scores_path, "r") as f:
        sliding_scores = [json.loads(line) for line in f]
    df = pd.DataFrame(scores + sliding_scores)


    
    # Calculate harm rate per experiment
    harm_labels = ["enough_info", "enough_info_and_follow_perfectly"]
    
    results = []
    for exp_id, group in df.groupby("experiment_id"):
        n_total = len(group)
        n_harmful = len(group[group["score"].isin(harm_labels)])
        harm_rate = (n_harmful / n_total * 100) if n_total > 0 else 0.0
        
        results.append({
            "experiment_id": exp_id,
            "config_name": config_name,
            "harm_rate": harm_rate,
            "n_harmful": n_harmful,
            "n_total": n_total
        })
    
    return pd.DataFrame(results)

def load_baseline_jailbreak(base_dir, baseline_type="prompted"):
    """
    Load baseline (unsteered) jailbreak scores.
    
    Args:
        base_dir: Base directory
        baseline_type: 'prompted' or 'default'
    
    Returns:
        Harm rate as percentage
    """
    if baseline_type == "prompted":
        scores_path = f"{base_dir}/evals/unsteered/unsteered_scores.jsonl"
    else:
        scores_path = f"{base_dir}/evals/unsteered/unsteered_default_scores.jsonl"
    
    if not os.path.exists(scores_path):
        print(f"Warning: Baseline scores not found: {scores_path}")
        return None
    
    with open(scores_path, "r") as f:
        scores = [json.loads(line) for line in f]
    df = pd.DataFrame(scores)
    
    harm_labels = ["enough_info", "enough_info_and_follow_perfectly"]
    n_total = len(df)
    n_harmful = len(df[df["score"].isin(harm_labels)])
    harm_rate = (n_harmful / n_total * 100) if n_total > 0 else 0.0
    
    return harm_rate

print("Jailbreak loading functions defined")

Jailbreak loading functions defined


## Pareto Frontier Calculation

In [7]:
def calculate_pareto_frontier(df, x_col='eval_pct_change', y_col='harm_pct_change'):
    """
    Identify Pareto-optimal points.
    
    For our case:
    - X-axis (eval_pct_change): Higher is better (less performance degradation)
    - Y-axis (harm_pct_change): Lower is better (more harm reduction, i.e., more negative)
    
    A point is Pareto-optimal if no other point has both:
    - x >= x_i (better or equal eval performance)
    - y <= y_i (better or equal harm reduction)
    with at least one strict inequality.
    
    Args:
        df: DataFrame with x_col and y_col columns
        x_col: Column name for x-axis (eval percentage change)
        y_col: Column name for y-axis (harm percentage change)
    
    Returns:
        DataFrame with additional 'is_pareto' boolean column
    """
    df = df.copy()
    df['is_pareto'] = False
    
    points = df[[x_col, y_col]].values
    
    for i in range(len(points)):
        is_dominated = False
        
        for j in range(len(points)):
            if i == j:
                continue
            
            # Check if point j dominates point i
            # j dominates i if: j.x >= i.x AND j.y <= i.y (with at least one strict)
            x_better_or_equal = points[j][0] >= points[i][0]
            y_better_or_equal = points[j][1] <= points[i][1]
            strictly_better = (points[j][0] > points[i][0]) or (points[j][1] < points[i][1])
            
            if x_better_or_equal and y_better_or_equal and strictly_better:
                is_dominated = True
                break
        
        if not is_dominated:
            df.loc[df.index[i], 'is_pareto'] = True
    
    return df

print("Pareto calculation function defined")

Pareto calculation function defined


## Plotting Function

In [8]:
def plot_pareto_frontier(df, eval_name, eval_display_name, title_suffix="", subtitle=""):
    """
    Plot Pareto frontier for eval performance vs jailbreak harm reduction.
    
    Args:
        df: DataFrame with columns: config_name, experiment_id, eval_pct_change, 
            harm_pct_change, is_pareto, display_name
        eval_name: Short eval name (e.g., 'ifeval')
        eval_display_name: Display name for eval (e.g., 'IFEval Prompt-level Accuracy')
        title_suffix: Additional text for title
        subtitle: Subtitle text
    
    Returns:
        Plotly figure object
    """
    fig = go.Figure()
    
    # Plot all points by config (both Pareto and non-Pareto with same style)
    for config in ['baseline', 'jailbreak', 'role_trait', 'lmsys_10000']:
        if config not in df['config_name'].values:
            continue
        
        config_df = df[df['config_name'] == config]
        
        # Plot all points for this config
        fig.add_trace(go.Scatter(
            x=config_df['eval_pct_change'],
            y=config_df['harm_pct_change'],
            mode='markers',
            name=CONFIG_DISPLAY_NAMES[config],
            marker=dict(
                size=8,
                color=CONFIG_COLORS[config],
                opacity=0.6
            ),
            hovertemplate=(
                "<b>%{customdata[0]}</b><br>"
                "Config: %{customdata[1]}<br>"
                "Eval Change: %{x:.1f}%<br>"
                "Harm Change: %{y:.1f}%<br>"
                "%{customdata[2]}"
                "<extra></extra>"
            ),
            customdata=np.column_stack([
                config_df['display_name'],
                config_df['config_name'],
                config_df['is_pareto'].apply(lambda x: '<b>PARETO OPTIMAL</b>' if x else '')
            ]),
            legendgroup=config,
            showlegend=True
        ))
    
    # Add text labels for Pareto-optimal points using parsed experiment info
    pareto_points = df[df['is_pareto']]
    for _, row in pareto_points.iterrows():
        # Parse experiment_id to get layer and cap info
        layer_spec, _, cap_value = parse_experiment_id(row['experiment_id'])
        
        # Format the label
        layer_label = format_layer_range(layer_spec)
        cap_label = format_cap_label(cap_value)
        label_text = f"{layer_label}, {cap_label} %ile"
        
        # Get the color for this config
        point_color = CONFIG_COLORS[row['config_name']]
        
        fig.add_annotation(
            x=row['eval_pct_change'],
            y=row['harm_pct_change'],
            text=label_text,
            showarrow=False,
            xshift=10,  # Position label to the right of the point
            font=dict(size=9, color=point_color),
            xanchor='left',
            yanchor='middle',
            align='left'
        )
    
    # Draw Pareto frontier line
    pareto_sorted = pareto_points.sort_values('eval_pct_change')
    if len(pareto_sorted) > 1:
        fig.add_trace(go.Scatter(
            x=pareto_sorted['eval_pct_change'],
            y=pareto_sorted['harm_pct_change'],
            mode='lines',
            name='Pareto Frontier',
            line=dict(color='grey', width=1, dash='dash'),
            showlegend=True,
            hoverinfo='skip'
        ))
    
    # Add reference lines at origin
    fig.add_hline(y=0, line_dash="dot", line_color="gray", line_width=1, opacity=0.5)
    fig.add_vline(x=0, line_dash="dot", line_color="gray", line_width=1, opacity=0.5)
    
    # Add quadrant annotations
    max_x = df['eval_pct_change'].max()
    min_x = df['eval_pct_change'].min()
    max_y = df['harm_pct_change'].max()
    min_y = df['harm_pct_change'].min()
    
    # Top-right quadrant (ideal: better eval, reduced harm)
    fig.add_annotation(
        x=max_x * 0.9, y=min_y * 0.9,
        text="<b>Ideal</b><br>Unchanged eval<br>Reduced harm",
        showarrow=False,
        font=dict(size=10, color="green")
    )
    
    # Bottom-left quadrant (worst: worse eval, increased harm)
    fig.add_annotation(
        x=min_x * 0.9, y=max_y * 0.9,
        text="<b>Worst</b><br>Worse eval<br>Unchanged harm",
        showarrow=False,
        font=dict(size=10, color="red")
    )
    
    fig.update_layout(
        title=dict(
            text=f"Pareto Frontier: {eval_display_name} vs. Harmful Response Rate{title_suffix}",
            subtitle=dict(text=subtitle)
        ),
        xaxis=dict(
            title="Eval Performance Change (%)",
            zeroline=True,
            showgrid=True,
            gridcolor='lightgray'
        ),
        yaxis=dict(
            title="Harmful Response Rate Reduced (%)",
            zeroline=True,
            showgrid=True,
            gridcolor='lightgray'
        ),
        width=1000,
        height=700,
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02
        ),
        hovermode='closest'
    )
    
    return fig

print("Plotting function defined")

Plotting function defined


## Analysis: Generate Pareto Frontiers

For each eval, we'll:
1. Load benchmark and jailbreak data
2. Calculate percentage changes from baseline
3. Identify Pareto frontier
4. Visualize the tradeoff

## Load Jailbreak Data

Load jailbreak harm rates for all configs once.

In [9]:
# Get baseline harm rate (using prompted baseline)
baseline_harm = load_baseline_jailbreak(base_dir, baseline_type="prompted")
print(f"Baseline jailbreak harm rate: {baseline_harm:.2f}%")

# Load jailbreak data for each config
jailbreak_data = {}
for config_name in config_names:
    if config_name == 'baseline':
        continue
    
    df_jb = load_jailbreak_scores(config_name, base_dir)
    if len(df_jb) > 0:
        jailbreak_data[config_name] = df_jb
        print(f"Loaded {len(df_jb)} experiments for {config_name}")
    else:
        print(f"Warning: No jailbreak data found for {config_name}")

print(f"\nLoaded jailbreak data for {len(jailbreak_data)} configs")

Baseline jailbreak harm rate: 83.25%
Loaded 148 experiments for role_trait

Loaded jailbreak data for 1 configs


In [10]:
def prepare_pareto_data(df_eval, df_jailbreak_dict, eval_name, baseline_eval_value, baseline_harm_rate):
    """
    Prepare data for Pareto analysis by combining eval and jailbreak metrics.

    Args:
        df_eval: Pre-loaded eval DataFrame (filtered as desired)
        df_jailbreak_dict: Dict mapping config_name to jailbreak DataFrames
        eval_name: Name of eval task (for metric lookup)
        baseline_eval_value: Baseline eval metric value
        baseline_harm_rate: Baseline jailbreak harm rate (percentage)

    Returns:
        DataFrame ready for Pareto analysis with Pareto frontier calculated
    """
    eval_config = EVAL_METRICS[eval_name]
    metric_col = eval_config['metric']

    print(f"\n{eval_name.upper()}:")
    print(f"Baseline eval value: {baseline_eval_value:.4f}")
    print(f"Baseline harm rate: {baseline_harm_rate:.2f}%")

    # Combine eval and jailbreak data for each config
    all_data = []

    for config_name, df_jailbreak in df_jailbreak_dict.items():
        if config_name == 'baseline':
            continue

        # Get eval data for this config
        config_eval = df_eval[df_eval['config_name'] == config_name]

        if len(config_eval) == 0:
            print(f"Skipping {config_name}: no eval data")
            continue

        # Group by experiment_id and take mean of metrics
        grouped = config_eval.groupby(['config_name', 'experiment_id']).agg({
            metric_col: 'max'
        }).reset_index()

        # Merge eval and jailbreak data
        merged = grouped.merge(
            df_jailbreak[['experiment_id', 'harm_rate']],
            on='experiment_id',
            how='inner'
        )

        if len(merged) == 0:
            print(f"Skipping {config_name}: no matching experiments")
            continue

        # Calculate percentage changes
        merged['eval_value'] = merged[metric_col]
        merged['eval_pct_change'] = ((merged['eval_value'] - baseline_eval_value) / baseline_eval_value * 100)
        merged['harm_pct_change'] = ((merged['harm_rate'] - baseline_harm_rate) / baseline_harm_rate * 100)

        # Add parsed columns for display
        parsed = merged['experiment_id'].apply(parse_experiment_id)
        merged['layer_spec'] = parsed.apply(lambda x: x[0])
        merged['cap_value'] = parsed.apply(lambda x: x[2])
        merged['layer_label'] = merged['layer_spec'].apply(format_layer_range)
        merged['cap_label'] = merged['cap_value'].apply(format_cap_label)
        merged['display_name'] = merged.apply(
            lambda row: f"{row['layer_label']}, {row['cap_label']}",
            axis=1
        )

        all_data.append(merged)
        print(f"Loaded {len(merged)} experiments for {config_name}")

    if len(all_data) == 0:
        print("No data available for Pareto analysis")
        return pd.DataFrame()

    # Combine all configs
    df_combined = pd.concat(all_data, ignore_index=True)

    # Calculate Pareto frontier
    df_combined = calculate_pareto_frontier(df_combined)

    print(f"\nTotal experiments: {len(df_combined)}")
    print(f"Pareto-optimal points: {df_combined['is_pareto'].sum()}")

    return df_combined

print("Data preparation helper defined")

Data preparation helper defined


### IFEval vs Jailbreak Harm

In [11]:
# Load IFEval data
df_ifeval = load_experiment_data(['ifeval'], config_names, base_dir)

print(f"\nLoaded {len(df_ifeval)} IFEval experiment runs")
print(f"\nConfig breakdown:")
for config in df_ifeval['config_name'].unique():
    print(f"  {config}: {len(df_ifeval[df_ifeval['config_name'] == config])} runs")


Loaded 323 IFEval experiment runs

Config breakdown:
  role_trait: 320 runs
  baseline: 3 runs


In [None]:
# Filter IFEval data
# Filter to no thinking, with chat template
df_ifeval_filtered = df_ifeval[
    ((df_ifeval['limit'] == 1000))
]

# Apply experiment_ids filter if configured
if experiment_ids is not None:
    df_ifeval_filtered = df_ifeval_filtered[
        (df_ifeval_filtered['experiment_id'].isin(experiment_ids)) |
        (df_ifeval_filtered['config_name'] == 'baseline')
    ]
    print(f"Filtered to {len(experiment_ids)} experiment IDs: {len(df_ifeval_filtered)} runs")

print(f"After filtering: {len(df_ifeval_filtered)} runs")

# Get baseline value for percentage calculation
metric_col = EVAL_METRICS['ifeval']['metric']
baseline_ifeval = df_ifeval_filtered[df_ifeval_filtered['config_name'] == 'baseline']
if len(baseline_ifeval) > 0:
    baseline_ifeval_value = baseline_ifeval[metric_col].iloc[0]
    print(f"Baseline IFEval value: {baseline_ifeval_value:.4f}")
else:
    print("Warning: No baseline found!")
    baseline_ifeval_value = None

In [33]:
# Prepare Pareto data for IFEval
if baseline_ifeval_value is not None:
    df_ifeval_pareto = prepare_pareto_data(
        df_ifeval_filtered,
        jailbreak_data,
        'ifeval',
        baseline_ifeval_value,
        baseline_harm
    )
else:
    df_ifeval_pareto = pd.DataFrame()
    print("Skipping Pareto analysis due to missing baseline")


IFEVAL:
Baseline eval value: 0.7578
Baseline harm rate: 83.25%
Loaded 148 experiments for role_trait

Total experiments: 148
Pareto-optimal points: 7


In [34]:
# Plot IFEval Pareto frontier
if len(df_ifeval_pareto) > 0:
    fig_ifeval = plot_pareto_frontier(
        df_ifeval_pareto,
        'ifeval',
        EVAL_METRICS['ifeval']['display_name'],
        subtitle=subtitle
    )
    fig_ifeval.show()
    fig_ifeval.write_html(f"{out_dir}/sliding_ifeval.html")
else:
    print("No data available for IFEval Pareto plot")

### MMLU Pro vs Jailbreak Harm

In [15]:
# Load MMLU Pro data
df_mmlu = load_experiment_data(['mmlu_pro'], config_names, base_dir)
print(f"Loaded {len(df_mmlu)} MMLU Pro experiment runs")
print(f"\nConfig breakdown:")
for config in df_mmlu['config_name'].unique():
    print(f"  {config}: {len(df_mmlu[df_mmlu['config_name'] == config])} runs")

Loaded 239 MMLU Pro experiment runs

Config breakdown:
  role_trait: 232 runs
  baseline: 7 runs


In [None]:
# Filter MMLU Pro data
# First, get non-vllm filtered data
df_mmlu_filtered = df_mmlu[
    ((df_mmlu['config_name'] == 'baseline') &
     (~df_mmlu['thinking']) &
     (df_mmlu['apply_chat_template']) & (df_mmlu['vllm'])) |
    ((df_mmlu['config_name'] != 'baseline') &
     (~df_mmlu['thinking']) &
     (df_mmlu['apply_chat_template']) & (df_mmlu['vllm']))
]

# Apply experiment_ids filter if configured
if experiment_ids is not None:
    df_mmlu_filtered = df_mmlu_filtered[
        (df_mmlu_filtered['experiment_id'].isin(experiment_ids)) |
        (df_mmlu_filtered['config_name'] == 'baseline')
    ]
    print(f"Filtered to {len(experiment_ids)} experiment IDs")

print(f"After filtering: {len(df_mmlu_filtered)} runs")
for config in df_mmlu['config_name'].unique():
    print(f"  {config}: {len(df_mmlu_filtered[df_mmlu_filtered['config_name'] == config])} runs")

# Get baseline value
metric_col = EVAL_METRICS['mmlu_pro']['metric']
baseline_mmlu = df_mmlu_filtered[df_mmlu_filtered['config_name'] == 'baseline']
if len(baseline_mmlu) > 0:
    baseline_mmlu_value = baseline_mmlu[metric_col].iloc[0]
    print(f"Baseline MMLU Pro value: {baseline_mmlu_value:.4f}")
else:
    print("Warning: No baseline found!")
    baseline_mmlu_value = None

In [17]:
# Prepare Pareto data for MMLU Pro
if baseline_mmlu_value is not None:
    df_mmlu_pareto = prepare_pareto_data(
        df_mmlu_filtered,
        jailbreak_data,
        'mmlu_pro',
        baseline_mmlu_value,
        baseline_harm
    )
else:
    df_mmlu_pareto = pd.DataFrame()
    print("Skipping Pareto analysis due to missing baseline")


MMLU_PRO:
Baseline eval value: 0.6743
Baseline harm rate: 83.25%
Loaded 148 experiments for role_trait

Total experiments: 148
Pareto-optimal points: 6


In [18]:
# Plot MMLU Pro Pareto frontier
if len(df_mmlu_pareto) > 0:
    fig_mmlu = plot_pareto_frontier(
        df_mmlu_pareto,
        'mmlu_pro',
        EVAL_METRICS['mmlu_pro']['display_name'],
        subtitle=subtitle
    )
    fig_mmlu.show()
    fig_mmlu.write_html(f"{out_dir}/sliding_mmlu_pro.html")
else:
    print("No data available for MMLU Pro Pareto plot")

### EQ-Bench vs Jailbreak Harm

In [36]:
# Load EQ-Bench data
df_eq = load_experiment_data(['eq_bench'], config_names, base_dir)
print(f"Loaded {len(df_eq)} EQ-Bench experiment runs")
print(f"\nConfig breakdown:")
for config in df_eq['config_name'].unique():
    print(f"  {config}: {len(df_eq[df_eq['config_name'] == config])} runs")

Loaded 447 EQ-Bench experiment runs

Config breakdown:
  role_trait: 444 runs
  baseline: 3 runs


In [None]:
# Filter EQ-Bench data
df_eq_filtered = df_eq[
    (~df_eq['vllm'])
]

# Apply experiment_ids filter if configured
if experiment_ids is not None:
    df_eq_filtered = df_eq_filtered[
        (df_eq_filtered['experiment_id'].isin(experiment_ids)) |
        (df_eq_filtered['config_name'] == 'baseline')
    ]
    print(f"Filtered to {len(experiment_ids)} experiment IDs")

print(f"After filtering: {len(df_eq_filtered)} runs")

# Get baseline value
metric_col = EVAL_METRICS['eq_bench']['metric']
baseline_eq = df_eq_filtered[df_eq_filtered['config_name'] == 'baseline']
if len(baseline_eq) > 0:
    baseline_eq_value = baseline_eq[metric_col].iloc[0]
    print(f"Baseline EQ-Bench value: {baseline_eq_value:.4f}")
else:
    print("Warning: No baseline found!")
    baseline_eq_value = None

In [38]:
# Prepare Pareto data for EQ-Bench
if baseline_eq_value is not None:
    df_eq_pareto = prepare_pareto_data(
        df_eq_filtered,
        jailbreak_data,
        'eq_bench',
        baseline_eq_value,
        baseline_harm
    )
else:
    df_eq_pareto = pd.DataFrame()
    print("Skipping Pareto analysis due to missing baseline")


EQ_BENCH:
Baseline eval value: 82.2942
Baseline harm rate: 83.25%
Loaded 148 experiments for role_trait

Total experiments: 148
Pareto-optimal points: 14


In [22]:
# Plot EQ-Bench Pareto frontier
if len(df_eq_pareto) > 0:
    fig_eq = plot_pareto_frontier(
        df_eq_pareto,
        'eq_bench',
        EVAL_METRICS['eq_bench']['display_name'],
        subtitle=subtitle
    )
    fig_eq.show()
    fig_eq.write_html(f"{out_dir}/sliding_eq_bench.html")
else:
    print("No data available for EQ-Bench Pareto plot")

### GSM8K

In [23]:
# Load MMLU Pro data
df_gsm8k = load_experiment_data(['gsm8k'], config_names, base_dir)
print(f"Loaded {len(df_gsm8k)} GSM8K experiment runs")
print(f"\nConfig breakdown:")
for config in df_gsm8k['config_name'].unique():
    print(f"  {config}: {len(df_gsm8k[df_gsm8k['config_name'] == config])} runs")

Loaded 483 GSM8K experiment runs

Config breakdown:
  role_trait: 479 runs
  baseline: 4 runs


In [None]:
# Filter MMLU Pro data
df_gsm8k_filtered = df_gsm8k[
    ((df_gsm8k['vllm']))
]

# Apply experiment_ids filter if configured
if experiment_ids is not None:
    df_gsm8k_filtered = df_gsm8k_filtered[
        (df_gsm8k_filtered['experiment_id'].isin(experiment_ids)) |
        (df_gsm8k_filtered['config_name'] == 'baseline')
    ]
    print(f"Filtered to {len(experiment_ids)} experiment IDs")

print(f"After filtering: {len(df_gsm8k_filtered)} runs")
for config in df_gsm8k['config_name'].unique():
    print(f"  {config}: {len(df_gsm8k_filtered[df_gsm8k_filtered['config_name'] == config])} runs")

# Get baseline value
metric_col = EVAL_METRICS['gsm8k']['metric']
baseline_gsm8k = df_gsm8k_filtered[df_gsm8k_filtered['config_name'] == 'baseline']
if len(baseline_gsm8k) > 0:
    baseline_gsm8k_value = baseline_gsm8k[metric_col].iloc[0]
    print(f"Baseline GSM8K value: {baseline_gsm8k_value:.4f}")
else:
    print("Warning: No baseline found!")
    baseline_gsm8k_value = None

In [25]:
# Prepare Pareto data for GSM8K
if baseline_gsm8k_value is not None:
    df_gsm8k_pareto = prepare_pareto_data(
        df_gsm8k_filtered,
        jailbreak_data,
        'gsm8k',
        baseline_gsm8k_value,
        baseline_harm
    )
else:
    df_gsm8k_pareto = pd.DataFrame()
    print("Skipping Pareto analysis due to missing baseline")


GSM8K:
Baseline eval value: 0.8080
Baseline harm rate: 83.25%
Loaded 148 experiments for role_trait

Total experiments: 148
Pareto-optimal points: 7


In [26]:
# Plot GSM8K Pareto frontier
if len(df_gsm8k_pareto) > 0:
    fig_gsm8k = plot_pareto_frontier(
        df_gsm8k_pareto,
        'gsm8k',
        EVAL_METRICS['gsm8k']['display_name'],
        subtitle=subtitle
    )
    fig_gsm8k.show()
    fig_gsm8k.write_html(f"{out_dir}/sliding_gsm8k.html")
else:
    print("No data available for GSM8K Pareto plot")

## Summary of Pareto-Optimal Configurations

Display the Pareto-optimal experiments across all evals.

In [27]:
# Combine Pareto points from all evals
pareto_summary = []

for eval_name, df_pareto in [
    ('ifeval', df_ifeval_pareto),
    ('mmlu_pro', df_mmlu_pareto),
    ('eq_bench', df_eq_pareto)
]:
    if len(df_pareto) > 0:
        pareto_points = df_pareto[df_pareto['is_pareto']].copy()
        pareto_points['eval_name'] = eval_name
        pareto_summary.append(pareto_points[[
            'eval_name', 'config_name', 'experiment_id', 'display_name',
            'eval_pct_change', 'harm_pct_change'
        ]])

if len(pareto_summary) > 0:
    df_summary = pd.concat(pareto_summary, ignore_index=True)
    df_summary = df_summary.sort_values(['eval_name', 'eval_pct_change'], ascending=[True, False])
    
    print("\n=== PARETO-OPTIMAL CONFIGURATIONS ===")
    print(f"\nTotal Pareto points across all evals: {len(df_summary)}")
    print("\nBy eval:")
    print(df_summary.groupby('eval_name').size())
    print("\nBy config:")
    print(df_summary.groupby('config_name').size())
    
    display(df_summary)
else:
    print("No Pareto-optimal points found")


=== PARETO-OPTIMAL CONFIGURATIONS ===

Total Pareto points across all evals: 29

By eval:
eval_name
eq_bench    16
ifeval       7
mmlu_pro     6
dtype: int64

By config:
config_name
role_trait    29
dtype: int64


Unnamed: 0,eval_name,config_name,experiment_id,display_name,eval_pct_change,harm_pct_change
25,eq_bench,role_trait,layers_40:56-p0.75,"Layers 40-55, 75th",0.61765,-43.980344
23,eq_bench,role_trait,layers_36:52-p0.75,"Layers 36-51, 75th",0.457677,-48.457548
19,eq_bench,role_trait,layers_34:50-p0.75,"Layers 34-49, 75th",0.408397,-49.003549
27,eq_bench,role_trait,layers_52:60-p0.25,"Layers 52-59, 25th",0.386118,-49.65875
28,eq_bench,role_trait,layers_52:60-p0.5,"Layers 52-59, 50th",0.337315,-50.09555
26,eq_bench,role_trait,layers_44:60-p0.75,"Layers 44-59, 75th",0.234872,-50.532351
22,eq_bench,role_trait,layers_36:52-p0.5,"Layers 36-51, 50th",-0.083058,-58.285558
24,eq_bench,role_trait,layers_38:54-p0.25,"Layers 38-53, 25th",-1.552146,-64.073164
15,eq_bench,role_trait,layers_32:48-p0.25,"Layers 32-47, 25th",-1.576188,-64.509965
18,eq_bench,role_trait,layers_34:50-p0.25,"Layers 34-49, 25th",-1.66172,-70.843571


## Aggregated Eval Performance Analysis

Analyze the tradeoff between jailbreak harm reduction and aggregate performance across all 4 evals (IFEval, MMLU Pro, EQ-Bench, GSM8K).

For each experiment, we calculate:
- **Decrease**: `max(0, -eval_pct_change)` for each eval (higher = worse performance)
- **Product aggregation**: Product of all 4 decreases
- **Sum aggregation**: Sum of all 4 decreases

In [39]:
# Merge all 4 eval Pareto dataframes on experiment_id and config_name
# We'll use the filtered dataframes that were already computed

dfs_to_merge = [
    ('ifeval', df_ifeval_pareto),
    ('mmlu_pro', df_mmlu_pareto),
    ('eq_bench', df_eq_pareto),
    ('gsm8k', df_gsm8k_pareto)
]

# Start with the first dataframe and add eval_pct_change with suffix
df_agg = dfs_to_merge[0][1][['experiment_id', 'config_name', 'eval_pct_change', 'harm_pct_change', 'display_name']].copy()
df_agg = df_agg.rename(columns={'eval_pct_change': f'{dfs_to_merge[0][0]}_pct_change'})

# Merge the rest
for eval_name, df_eval in dfs_to_merge[1:]:
    df_temp = df_eval[['experiment_id', 'config_name', 'eval_pct_change']].copy()
    df_temp = df_temp.rename(columns={'eval_pct_change': f'{eval_name}_pct_change'})
    
    df_agg = df_agg.merge(df_temp, on=['experiment_id', 'config_name'], how='inner')

print(f"Merged data: {len(df_agg)} experiments with all 4 evals")
print(f"\nConfig breakdown:")
for config in df_agg['config_name'].unique():
    print(f"  {config}: {len(df_agg[df_agg['config_name'] == config])} experiments")

# Calculate decreases (clipped negative percentage changes)
for eval_name, _ in dfs_to_merge:
    col_name = f'{eval_name}_pct_change'
    decrease_col = f'{eval_name}_decrease'
    df_agg[decrease_col] = df_agg[col_name].apply(lambda x: max(0, -x))

# Calculate aggregations
df_agg['product_decrease'] = (
    df_agg['ifeval_decrease'] * 
    df_agg['mmlu_pro_decrease'] * 
    df_agg['eq_bench_decrease'] * 
    df_agg['gsm8k_decrease']
)

df_agg['sum_decrease'] = (
    df_agg['ifeval_decrease'] + 
    df_agg['mmlu_pro_decrease'] + 
    df_agg['eq_bench_decrease'] + 
    df_agg['gsm8k_decrease']
)

print(f"\nProduct decrease range: [{df_agg['product_decrease'].min():.2f}, {df_agg['product_decrease'].max():.2f}]")
print(f"Sum decrease range: [{df_agg['sum_decrease'].min():.2f}, {df_agg['sum_decrease'].max():.2f}]")
print(f"Harm change range: [{df_agg['harm_pct_change'].min():.2f}%, {df_agg['harm_pct_change'].max():.2f}%]")

# Show sample of the data
display(df_agg.tail(10))

Merged data: 148 experiments with all 4 evals

Config breakdown:
  role_trait: 148 experiments

Product decrease range: [0.00, 1306.00]
Sum decrease range: [0.95, 31.64]
Harm change range: [-91.15%, -3.58%]


Unnamed: 0,experiment_id,config_name,ifeval_pct_change,harm_pct_change,display_name,mmlu_pro_pct_change,eq_bench_pct_change,gsm8k_pct_change,ifeval_decrease,mmlu_pro_decrease,eq_bench_decrease,gsm8k_decrease,product_decrease,sum_decrease
138,layers_56:60-p0.5,role_trait,-3.797468,-55.337155,"Layers 56-59, 50th",-0.741525,0.508823,3.465347,3.797468,0.741525,0.0,0.0,0.0,4.538994
139,layers_56:60-p0.75,role_trait,-3.006329,-55.992356,"Layers 56-59, 75th",-1.059322,0.568892,3.589109,3.006329,1.059322,0.0,0.0,0.0,4.065651
140,layers_56:64-p0.01,role_trait,-5.537975,-5.978706,"Layers 56-63, 1st",0.423729,0.317493,3.341584,5.537975,0.0,0.0,0.0,0.0,5.537975
141,layers_56:64-p0.25,role_trait,-2.848101,-4.340704,"Layers 56-63, 25th",-3.495763,0.310409,1.608911,2.848101,3.495763,0.0,0.0,0.0,6.343864
142,layers_56:64-p0.5,role_trait,-4.272152,-5.214305,"Layers 56-63, 50th",-3.919492,0.291382,1.732673,4.272152,3.919492,0.0,0.0,0.0,8.191643
143,layers_56:64-p0.75,role_trait,-3.322785,-4.122304,"Layers 56-63, 75th",-2.542373,0.357133,1.361386,3.322785,2.542373,0.0,0.0,0.0,5.865158
144,layers_8:16-p0.01,role_trait,-4.905063,-23.341523,"Layers 8-15, 1st",-5.614407,-1.414876,-1.361386,4.905063,5.614407,1.414876,1.361386,53.045474,13.295733
145,layers_8:16-p0.25,role_trait,-3.955696,-12.749113,"Layers 8-15, 25th",-4.449153,-1.146939,-0.49505,3.955696,4.449153,1.146939,0.49505,9.992849,10.046838
146,layers_8:16-p0.5,role_trait,-4.43038,-5.541906,"Layers 8-15, 50th",-3.177966,-0.88711,0.0,4.43038,3.177966,0.88711,0.0,0.0,8.495456
147,layers_8:16-p0.75,role_trait,-3.955696,-3.576304,"Layers 8-15, 75th",-2.330508,-0.457357,-1.732673,3.955696,2.330508,0.457357,1.732673,7.305426,8.476235


# Create negative sum for plotting and Pareto calculation (higher is better, like eval_pct_change)
df_agg['neg_sum_decrease'] = -df_agg['sum_decrease']

# Calculate Pareto frontier for aggregated sum metric
df_agg_pareto = calculate_pareto_frontier(df_agg, x_col='neg_sum_decrease', y_col='harm_pct_change')

print(f"Pareto-optimal points for aggregated sum: {df_agg_pareto['is_pareto'].sum()}")

# Create single plot for sum aggregation with Pareto frontier
fig = go.Figure()

# Plot all points by config
for config in ['baseline', 'jailbreak', 'role_trait', 'lmsys_10000']:
    if config not in df_agg_pareto['config_name'].values:
        continue
    
    config_df = df_agg_pareto[df_agg_pareto['config_name'] == config]
    
    fig.add_trace(go.Scatter(
        x=config_df['neg_sum_decrease'],
        y=config_df['harm_pct_change'],
        mode='markers',
        name=CONFIG_DISPLAY_NAMES[config],
        marker=dict(
            size=8,
            color=CONFIG_COLORS[config],
            opacity=0.6
        ),
        hovertemplate=(
            "<b>%{customdata[0]}</b><br>"
            "Config: %{customdata[1]}<br>"
            "Aggregated Eval Change: %{x:.2f}%<br>"
            "Harm Change: %{y:.1f}%<br>"
            "%{customdata[2]}"
            "<extra></extra>"
        ),
        customdata=np.column_stack([
            config_df['display_name'],
            config_df['config_name'],
            config_df['is_pareto'].apply(lambda x: '<b>PARETO OPTIMAL</b>' if x else '')
        ]),
        legendgroup=config,
        showlegend=True
    ))

# Add text labels for Pareto-optimal points
pareto_points = df_agg_pareto[df_agg_pareto['is_pareto']]
for _, row in pareto_points.iterrows():
    # Parse experiment_id to get layer and cap info
    layer_spec, _, cap_value = parse_experiment_id(row['experiment_id'])
    
    # Format the label
    layer_label = format_layer_range(layer_spec)
    cap_label = format_cap_label(cap_value)
    label_text = f"{layer_label}, {cap_label} %ile"
    
    # Get the color for this config
    point_color = CONFIG_COLORS[row['config_name']]
    
    fig.add_annotation(
        x=row['neg_sum_decrease'],
        y=row['harm_pct_change'],
        text=label_text,
        showarrow=False,
        xshift=10,  # Position label to the right of the point
        font=dict(size=9, color=point_color),
        xanchor='left',
        yanchor='middle',
        align='left'
    )

# Draw Pareto frontier line
pareto_sorted = pareto_points.sort_values('neg_sum_decrease')
if len(pareto_sorted) > 1:
    fig.add_trace(go.Scatter(
        x=pareto_sorted['neg_sum_decrease'],
        y=pareto_sorted['harm_pct_change'],
        mode='lines',
        name='Pareto Frontier',
        line=dict(color='grey', width=1, dash='dash'),
        showlegend=True,
        hoverinfo='skip'
    ))

# Add reference lines at origin
fig.add_hline(y=0, line_dash="dot", line_color="gray", line_width=1, opacity=0.5)
fig.add_vline(x=0, line_dash="dot", line_color="gray", line_width=1, opacity=0.5)

# Add quadrant annotations
max_x = df_agg_pareto['neg_sum_decrease'].max()
min_x = df_agg_pareto['neg_sum_decrease'].min()
max_y = df_agg_pareto['harm_pct_change'].max()
min_y = df_agg_pareto['harm_pct_change'].min()

# Top-right quadrant (ideal: no eval loss, reduced harm)
fig.add_annotation(
    x=max_x * 0.9, y=min_y * 0.9,
    text="<b>Ideal</b><br>No eval loss<br>Reduced harm",
    showarrow=False,
    font=dict(size=10, color="green")
)

# Bottom-left quadrant (worst: high eval loss, unchanged harm)
fig.add_annotation(
    x=min_x * 0.9, y=max_y * 0.9,
    text="<b>Worst</b><br>High eval loss<br>Unchanged harm",
    showarrow=False,
    font=dict(size=10, color="red")
)

fig.update_layout(
    title=dict(
        text="Aggregated Eval Performance (IFEval, MMLU Pro, EQ-Bench, GSM8K) vs. Harmful Response Rate",
        subtitle=dict(text=subtitle)
    ),
    xaxis=dict(
        title="Aggregated Eval Performance Change (%)",
        zeroline=True,
        showgrid=True,
        gridcolor='lightgray'
    ),
    yaxis=dict(
        title="Harmful Response Rate Reduced (%)",
        zeroline=True,
        showgrid=True,
        gridcolor='lightgray'
    ),
    width=1000,
    height=700,
    legend=dict(
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.02
    ),
    hovermode='closest'
)

fig.show()

# Save to file
fig.write_html(f"{out_dir}/sliding_aggregated_sum.html")
print(f"\nSaved plot to {out_dir}/sliding_aggregated_sum.html")