# Analyzing scores and plotting them

In [2]:
import json
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [71]:
# Model whose evaluation scores are analysed; it is interpolated into both
# directory paths below and into the plot subtitle.
model = "qwen-3-32b"
# Subtitle shared by every figure in this notebook (model name is title-cased
# with dashes turned into spaces).
subtitle = f"{model.replace('-', ' ').title()}, Projecting on Mean (Role-Playing - Default Assistant) Contrast Vector"
# Root directory the score files are read from.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
base_dir = f"/workspace/{model}/evals"
# Directory the rendered HTML plots are written to.
out_dir = f"/root/git/plots/{model}/evals/capped"

# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs(out_dir, exist_ok=True)


In [72]:
# Experiment configuration and ordering
# Order experiments by layer group, then by aggressiveness (least to most)

# Short names for cap types (used as annotations above bars).
# The insertion order of this dict is the bar order inside every layer group.
CAP_NAMES = {
    'harm_0.25': '75%<br>Harmful',
    'harm_0.01': '99%<br>Harmful',
    'safe_0.50': '50%<br>Safe',
    'safe_0.01': '99%<br>Safe',
}

# Layer group labels for x-axis; the list order is the left-to-right group order.
LAYER_GROUPS = ['32:64', '0:64', '2:64:2', '4:64:4']
LAYER_GROUP_NAMES = {
    '32:64': 'Layers 32-63',
    '0:64': 'All Layers 0-63',
    '2:64:2': 'Every 2nd Layer',
    '4:64:4': 'Every 4th Layer',
}

# Experiment ids have the form "layers_<layer group>-<cap type>". They are
# derived from the two tables above instead of being typed out by hand, so the
# ordering cannot drift out of sync when a layer group or cap type is added.
EXPERIMENT_ORDER = [
    f"layers_{layer_group}-{cap_type}"
    for layer_group in LAYER_GROUPS
    for cap_type in CAP_NAMES
]

print(f"Configured {len(EXPERIMENT_ORDER)} experiments in {len(LAYER_GROUPS)} layer groups")

Configured 16 experiments in 4 layer groups


In [73]:
# Path to the JSONL scores of the capped experiments. This cell only defines
# paths; the files are read into DataFrames in the following cell.
# NOTE(review): the previous comment said these scores come from `roles_20`;
# the path itself does not show that - confirm.
scores_path = f"{base_dir}/capped/multi_contrast_layers_1100_scores.jsonl"

# Unsteered results used as baselines: one file for the prompted runs and one
# for the default runs (naming taken from the file names).
unsteered_prompted = f"{base_dir}/unsteered/unsteered_scores.jsonl"
unsteered_default = f"{base_dir}/unsteered/unsteered_default_scores.jsonl"

In [74]:
# load into df
def load_jsonl_records(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a file holding one JSON document per line.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Whitespace-only lines (e.g. a trailing newline) are skipped because
        # json.loads rejects them.
        return [json.loads(line) for line in f if line.strip()]


# Scores of the capped experiments
scores = load_jsonl_records(scores_path)
scores_df = pd.DataFrame(scores)

print(f"Loaded {len(scores_df)} scores")

# Load baseline (unsteered) data
unsteered_prompted_records = load_jsonl_records(unsteered_prompted)
unsteered_prompted_df = pd.DataFrame(unsteered_prompted_records)

unsteered_default_records = load_jsonl_records(unsteered_default)
unsteered_default_df = pd.DataFrame(unsteered_default_records)

print(f"Loaded {len(unsteered_prompted_df)} unsteered prompted records")
print(f"Loaded {len(unsteered_default_df)} unsteered default records")


Loaded 17600 scores
Loaded 4400 unsteered prompted records
Loaded 4400 unsteered default records


## Inspect scores


In [75]:
def pct_overall(df, name):
    """Print the overall distribution of ``score`` labels in ``df``.

    Prints the number of rows, the count and percentage of every label found
    in the ``score`` column (labels sorted alphabetically), and a summary of
    the two target labels ``enough_info`` and
    ``enough_info_and_follow_perfectly`` plus their combined total.

    Args:
        df: DataFrame with a ``score`` column.
        name: Title shown in the report header.

    Returns:
        None. The report is written to stdout.

    Raises:
        KeyError: If ``df`` has no ``score`` column.
    """
    total = len(df)

    def _pct(count):
        # An empty frame has no distribution; report 0% instead of raising
        # ZeroDivisionError.
        return 100 * count / total if total else 0.0

    # Raw counts of each label
    counts = df["score"].value_counts().sort_index()

    # Percentages of each label
    percentages = (counts / total * 100).round(1)

    # Special targets
    n_enough = counts.get("enough_info", 0)
    n_enough_perfect = counts.get("enough_info_and_follow_perfectly", 0)
    n_either = n_enough + n_enough_perfect

    pct_enough = _pct(n_enough)
    pct_enough_perfect = _pct(n_enough_perfect)
    pct_either = _pct(n_either)

    print(f"=== Overall Score Distribution: {name} ===")
    print(f"Total samples: {total}\n")

    # Print table of counts + percentages for each label
    print("Per-label counts and percentages:")
    for label in counts.index:
        print(f"- {label}: {counts[label]} ({percentages[label]}%)")

    print("\nTarget categories:")
    print(f"- enough_info: {n_enough} ({pct_enough:.1f}%)")
    print(f"- enough_info_and_follow_perfectly: {n_enough_perfect} ({pct_enough_perfect:.1f}%)")
    print(f"- either: {n_either} ({pct_either:.1f}%)")
    print()


In [76]:

pct_overall(scores_df, "ALL")


=== Overall Score Distribution: ALL ===
Total samples: 17600

Per-label counts and percentages:
- almost_enough_info: 148 (0.8%)
- enough_info: 5183 (29.4%)
- enough_info_and_follow_perfectly: 1186 (6.7%)
- nonsensical: 60 (0.3%)
- other: 1 (0.0%)
- out_of_context: 589 (3.3%)
- refusal: 2040 (11.6%)
- refusal_and_justification: 2215 (12.6%)
- related_but_no_info: 6178 (35.1%)

Target categories:
- enough_info: 5183 (29.4%)
- enough_info_and_follow_perfectly: 1186 (6.7%)
- either: 6369 (36.2%)



In [77]:
def pct_per_experiment(df):
    """Print the score-label breakdown of every ``experiment_id`` in ``df``.

    After a header with the overall row count, one section is printed per
    group of ``df.groupby("experiment_id")``: the group size, the count and
    percentage of each label in its ``score`` column (labels sorted), and the
    counts/percentages of ``enough_info``,
    ``enough_info_and_follow_perfectly`` and the two combined. Percentages are
    relative to the group size, not to the whole frame.
    """
    print("=== Score Distribution by Experiment ===")
    print(f"Total samples: {len(df)}\n")

    for exp_id, exp_rows in df.groupby("experiment_id"):
        n_samples = len(exp_rows)
        label_counts = exp_rows["score"].value_counts().sort_index()
        label_pcts = (label_counts / n_samples * 100).round(1)

        # Target labels default to 0 when absent from this experiment.
        n_enough = label_counts.get("enough_info", 0)
        n_enough_perfect = label_counts.get("enough_info_and_follow_perfectly", 0)
        target_rows = [
            ("enough_info", n_enough),
            ("enough_info_and_follow_perfectly", n_enough_perfect),
            ("either", n_enough + n_enough_perfect),
        ]

        # Assemble the section first, then emit it in one go.
        report = [
            f"Experiment: {exp_id}",
            f"Samples: {n_samples}",
            "Per-label counts and percentages:",
        ]
        report += [
            f"- {label}: {label_counts[label]} ({label_pcts[label]}%)"
            for label in label_counts.index
        ]
        report.append("Target categories:")
        report += [
            f"- {target}: {hits} ({100 * hits / n_samples:.1f}%)"
            for target, hits in target_rows
        ]
        print("\n".join(report))
        print()

In [78]:
pct_per_experiment(scores_df)


=== Score Distribution by Experiment ===
Total samples: 17600

Experiment: layers_0:64-harm_0.01
Samples: 1100
Per-label counts and percentages:
- almost_enough_info: 3 (0.3%)
- enough_info: 360 (32.7%)
- enough_info_and_follow_perfectly: 69 (6.3%)
- nonsensical: 2 (0.2%)
- out_of_context: 31 (2.8%)
- refusal: 162 (14.7%)
- refusal_and_justification: 133 (12.1%)
- related_but_no_info: 340 (30.9%)
Target categories:
- enough_info: 360 (32.7%)
- enough_info_and_follow_perfectly: 69 (6.3%)
- either: 429 (39.0%)

Experiment: layers_0:64-harm_0.25
Samples: 1100
Per-label counts and percentages:
- almost_enough_info: 15 (1.4%)
- enough_info: 536 (48.7%)
- enough_info_and_follow_perfectly: 120 (10.9%)
- nonsensical: 22 (2.0%)
- out_of_context: 21 (1.9%)
- refusal: 60 (5.5%)
- refusal_and_justification: 47 (4.3%)
- related_but_no_info: 279 (25.4%)
Target categories:
- enough_info: 536 (48.7%)
- enough_info_and_follow_perfectly: 120 (10.9%)
- either: 656 (59.6%)

Experiment: layers_0:64-safe_0.

## Plot Harmful Responses

In [79]:
# --- Helper: summarize dataframe into counts/percentages for target labels ---
TARGETS = ["enough_info", "enough_info_and_follow_perfectly"]

def summarize_df(df, experiment_id, labels=None):
    """Summarise how often each label occurs in ``df["score"]``.

    Args:
        df: DataFrame with a ``score`` column. It is only read, never modified.
        experiment_id: Value written to the ``experiment_id`` column of every
            returned row.
        labels: Labels to report, in output order. Defaults to ``TARGETS``.

    Returns:
        pandas.DataFrame: One row per label with the columns
        ``experiment_id``, ``score_label``, ``count``, ``total`` (number of
        rows in ``df``) and ``pct`` (``count / total * 100`` rounded to one
        decimal; ``0.0`` when ``df`` is empty). Labels absent from ``df`` get
        a count of 0.

    Raises:
        KeyError: If ``df`` has no ``score`` column.
    """
    wanted_labels = TARGETS if labels is None else list(labels)

    # Counting needs neither a copy of the frame nor an experiment_id column
    # on it, so the input is used as-is.
    n_total = len(df)
    counts = df["score"].value_counts()

    long_rows = []
    for label in wanted_labels:
        count = counts.get(label, 0)
        pct = (count / n_total * 100) if n_total > 0 else 0.0

        long_rows.append({
            "experiment_id": experiment_id,
            "score_label": label,
            "count": int(count),
            "total": int(n_total),
            "pct": round(float(pct), 1),
        })

    # Explicit columns keep the schema stable even when no label is requested.
    return pd.DataFrame(
        long_rows,
        columns=["experiment_id", "score_label", "count", "total", "pct"],
    )

In [80]:
# --- Build the tidy dataset from baseline and steered data ---

# The baselines are restricted to the prompt IDs that also occur in the capped
# experiments, so every bar is computed over the same set of IDs.
capped_ids = set(scores_df['id'].unique())
print(f"Capped experiments use {len(capped_ids)} unique IDs")

prompted_in_capped = unsteered_prompted_df['id'].isin(capped_ids)
default_in_capped = unsteered_default_df['id'].isin(capped_ids)
unsteered_prompted_df_filtered = unsteered_prompted_df.loc[prompted_in_capped]
unsteered_default_df_filtered = unsteered_default_df.loc[default_in_capped]

print(f"Filtered unsteered_prompted from {len(unsteered_prompted_df)} to {len(unsteered_prompted_df_filtered)} records")
print(f"Filtered unsteered_default from {len(unsteered_default_df)} to {len(unsteered_default_df_filtered)} records")

# One summary frame per baseline (filtered data)
baseline_prompted_tidy = summarize_df(unsteered_prompted_df_filtered, "unsteered_prompted")
baseline_default_tidy = summarize_df(unsteered_default_df_filtered, "unsteered_default")

# One summary frame per steered experiment, in plotting order; experiments
# without any scored rows are reported and left out.
steered_tidy_frames = []
for exp_id in EXPERIMENT_ORDER:
    exp_df = scores_df.loc[scores_df["experiment_id"] == exp_id]
    if len(exp_df) == 0:
        print(f"Warning: No data found for experiment {exp_id}")
        continue
    steered_tidy_frames.append(summarize_df(exp_df, exp_id))

# Baselines first (default, then prompted), followed by the steered experiments
tidy = pd.concat(
    [baseline_default_tidy, baseline_prompted_tidy, *steered_tidy_frames],
    ignore_index=True,
)

print(f"\nCreated tidy dataset with {len(tidy)} rows covering {tidy['experiment_id'].nunique()} experiments")

Capped experiments use 1100 unique IDs
Filtered unsteered_prompted from 4400 to 1100 records
Filtered unsteered_default from 4400 to 1100 records

Created tidy dataset with 36 rows covering 18 experiments


In [81]:
# === Plot Harmful Responses by Experiment ===

# Bar colour for each of the two harmful score labels
COLORS = {
    "enough_info_and_follow_perfectly": "#e41a1c",  # red
    "enough_info": "#ff7f00",                       # orange
}

# Legend / hover names shown for the raw score labels
label_map = {
    "enough_info": "Harmful Info",
    "enough_info_and_follow_perfectly": "Harmful Info & Following Request"
}

# Bar order on the x-axis: the two baselines first, then every entry of
# EXPERIMENT_ORDER. The pivot helpers below read this list as a global.
all_experiment_ids = ["unsteered_default", "unsteered_prompted"] + EXPERIMENT_ORDER

# Create pivot arrays: rows = experiments, cols = labels
def pivot_by_experiment(df, label, experiment_ids=None):
    """Get percentages, counts, totals for a specific label across experiments.

    Args:
        df: Tidy frame with the columns ``experiment_id``, ``score_label``,
            ``pct``, ``count`` and ``total``.
        label: Value of ``score_label`` to extract.
        experiment_ids: Experiment ids to report, in output order. Defaults to
            the notebook-level ``all_experiment_ids``.

    Returns:
        tuple: ``(pcts, counts, totals)`` as numpy arrays with one entry per
        experiment id. An experiment without a row for ``label`` contributes
        ``0.0``, ``0`` and ``0``; when several rows match, the first one wins.
    """
    if experiment_ids is None:
        experiment_ids = all_experiment_ids

    # Index the rows of this label once instead of re-filtering the whole
    # frame for every experiment.
    label_rows = df[df["score_label"] == label]
    first_row_by_experiment = {}
    for exp_id, pct, count, total in zip(
        label_rows["experiment_id"],
        label_rows["pct"],
        label_rows["count"],
        label_rows["total"],
    ):
        # setdefault keeps the first matching row per experiment.
        first_row_by_experiment.setdefault(exp_id, (pct, count, total))

    pcts = []
    counts = []
    totals = []
    for exp_id in experiment_ids:
        pct, count, total = first_row_by_experiment.get(exp_id, (0.0, 0, 0))
        pcts.append(pct)
        counts.append(count)
        totals.append(total)

    return np.array(pcts), np.array(counts), np.array(totals)

# Get data for both target labels: Y* = percentages, C* = counts, T* = totals,
# each with one entry per experiment in all_experiment_ids.
Y1, C1, T1 = pivot_by_experiment(tidy, "enough_info")
Y2, C2, T2 = pivot_by_experiment(tidy, "enough_info_and_follow_perfectly")

# Bar x coordinates: bars inside a group sit one unit apart and consecutive
# groups are separated by an extra gap.
BAR_WIDTH = 0.8
gap_between_groups = 0.6  # Space between groups

# Baseline group (2 bars)
current_x = 0
x_positions = [current_x, current_x + 1]
current_x += 2 + gap_between_groups

# Layer groups (4 groups of 4 bars each)
for _ in range(4):
    for offset in range(4):
        x_positions.append(current_x + offset)
    current_x += 4 + gap_between_groups

x_positions = np.array(x_positions)

# Stacked-bar figure for the two harmful labels
fig = go.Figure()

# label -> (percentages, counts, totals); insertion order is the stacking order
harmful_series = {
    "enough_info": (Y1, C1, T1),
    "enough_info_and_follow_perfectly": (Y2, C2, T2),
}

for label, (Y, C, T) in harmful_series.items():
    color = COLORS[label]
    display_name = label_map[label]
    hover_lines = [
        "Experiment: %{customdata[0]}",
        display_name,
        "Pct: %{y:.1f}%",
        "Count: %{customdata[1]} / %{customdata[2]}<extra></extra>",
    ]
    bar_trace = go.Bar(
        x=x_positions,
        y=Y,
        name=display_name,
        marker_color=color,
        width=BAR_WIDTH,
        hovertemplate="<br>".join(hover_lines),
        # customdata columns: experiment id, count, total (used by the hover text)
        customdata=np.column_stack([all_experiment_ids, C, T]),
    )
    fig.add_trace(bar_trace)

# Annotate every non-empty bar with its stacked total and a short label
tops = Y1 + Y2  # Total height of stacked bars

baseline_bar_labels = {
    "unsteered_default": "Default<br>Assistant",
    "unsteered_prompted": "Jailbreak<br>Persona",
}

for i, exp_id in enumerate(all_experiment_ids):
    # NOTE(review): a bar with a total of 0 gets neither the percentage nor
    # its label - confirm that is intended.
    if not (tops[i] > 0):
        continue

    # Percentage annotation
    fig.add_annotation(
        x=x_positions[i],
        y=tops[i] + 3,
        text=f"{tops[i]:.1f}%",
        showarrow=False,
        font=dict(size=9),
    )

    # Label annotation above bar
    if exp_id in baseline_bar_labels:
        label_text = baseline_bar_labels[exp_id]
    else:
        # Experiment ids look like "layers_<group>-<cap type>", e.g. cap type "harm_0.25"
        cap_type = exp_id.split("-")[1]
        label_text = CAP_NAMES.get(cap_type, cap_type)

    fig.add_annotation(
        x=x_positions[i],
        y=tops[i] + 10,
        text=f"<b>{label_text}</b>",
        showarrow=False,
        font=dict(size=8),
        textangle=0,
    )

# X-axis ticks: one tick per group, placed midway between the group's first
# and last bar.
baseline_center = (x_positions[0] + x_positions[1]) / 2

# Layer groups start at bar index 2 and hold 4 bars each (indices 2, 6, 10, 14)
group_centers = [baseline_center] + [
    (x_positions[first_bar] + x_positions[first_bar + 3]) / 2
    for first_bar in range(2, 2 + 4 * 4, 4)
]

group_labels = ["Baseline"]
group_labels.extend(LAYER_GROUP_NAMES[lg] for lg in LAYER_GROUPS)

# Axis and legend settings are named first so the update_layout call stays short.
xaxis_config = {
    "title": "Experiment Groups",
    "tickmode": "array",
    "tickvals": group_centers,
    "ticktext": group_labels,
    "showgrid": True,
    "gridcolor": "lightgray",
    # half a unit of padding on either side of the first and last bar
    "range": [-0.5, x_positions[-1] + 0.5],
}
yaxis_config = {
    "title": "Percentage of Harmful Responses",
    # upper bound above 100 leaves room for the annotations placed over the bars
    "range": [0, 119],
    "showgrid": True,
    "gridcolor": "lightgray",
}
legend_config = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "right",
    "x": 1.0,
}

fig.update_layout(
    barmode="stack",
    width=1200,
    height=600,
    title=dict(
        text="Harmful Responses with Persona-based Jailbreaks Across Capping Experiments",
        subtitle=dict(text=subtitle),
    ),
    xaxis=xaxis_config,
    yaxis=yaxis_config,
    legend=legend_config,
    bargap=0.01,  # Small gap within groups
)

fig.show()
#fig.write_html(f"{out_dir}/jailbreak_harmful.html")

## Plot Other Responses

In [82]:
# === Prepare data for "other" responses plot ===

# Define categories (exclude the two harmful target labels).
# The list order is the order in which the traces are added to the stacked bars.
OTHER_LABELS = [
    "almost_enough_info",        # yellow
    "related_but_no_info",       # blue
    "refusal",                   # light green
    "refusal_and_justification", # green
    "out_of_context",            # purple
    "nonsensical",               # pink
]

# Bar colour per label
other_colors = {
    "almost_enough_info": "#ffff33",        # yellow
    "related_but_no_info": "#1f78b4",       # blue
    "refusal": "#b2df8a",                   # light green
    "refusal_and_justification": "#33a02c", # green
    "out_of_context": "#6a3d9a",            # purple
    "nonsensical": "#fb9a99",               # pink
}

# Legend / hover names shown for the raw score labels
other_label_map = {
    "refusal": "Refusal",
    "refusal_and_justification": "Refusal & Justification",
    "nonsensical": "Nonsensical",
    "out_of_context": "Out of Context",
    "related_but_no_info": "Related",
    "almost_enough_info": "Almost Harmful",
}

def summarize_df_multi(df, experiment_id, labels):
    """Build one summary row per requested label for a single experiment.

    Each row holds ``experiment_id``, ``score_label``, ``count`` (occurrences
    of the label in ``df["score"]``), ``total`` (rows in ``df``) and ``pct``
    (count / total * 100 rounded to one decimal, 0.0 for an empty frame).
    Labels missing from ``df`` are reported with a count of 0. ``df`` itself
    is left untouched.
    """
    tagged = df.copy()
    tagged["experiment_id"] = experiment_id

    sample_total = len(tagged)
    label_counts = tagged["score"].value_counts()

    def _row(label):
        hits = label_counts.get(label, 0)
        share = (hits / sample_total * 100) if sample_total > 0 else 0.0
        return {
            "experiment_id": experiment_id,
            "score_label": label,
            "count": int(hits),
            "total": int(sample_total),
            "pct": round(float(share), 1),
        }

    return pd.DataFrame([_row(label) for label in labels])

# Summaries of the non-harmful labels for the two baselines (filtered data)
baseline_default_other = summarize_df_multi(unsteered_default_df_filtered, "unsteered_default", OTHER_LABELS)
baseline_prompted_other = summarize_df_multi(unsteered_prompted_df_filtered, "unsteered_prompted", OTHER_LABELS)

# Same summary for every steered experiment, in plotting order
steered_other_frames = []
for exp_id in EXPERIMENT_ORDER:
    exp_df = scores_df.loc[scores_df["experiment_id"] == exp_id]
    if len(exp_df) == 0:
        # experiments without scored rows are skipped without a message here
        continue
    steered_other_frames.append(summarize_df_multi(exp_df, exp_id, OTHER_LABELS))

# Baselines first (default, then prompted), followed by the steered experiments
tidy_other = pd.concat(
    [baseline_default_other, baseline_prompted_other, *steered_other_frames],
    ignore_index=True,
)

print(f"Created tidy_other dataset with {len(tidy_other)} rows")

Created tidy_other dataset with 108 rows


In [83]:
# === Plot Other Responses by Experiment ===

# Create pivot function for other labels
def pivot_by_experiment_other(df, label, experiment_ids=None):
    """Get percentages, counts, totals for a specific label across experiments.

    Same contract as ``pivot_by_experiment``; kept as a separate name for the
    "other responses" plot.

    Args:
        df: Tidy frame with the columns ``experiment_id``, ``score_label``,
            ``pct``, ``count`` and ``total``.
        label: Value of ``score_label`` to extract.
        experiment_ids: Experiment ids to report, in output order. Defaults to
            the notebook-level ``all_experiment_ids``.

    Returns:
        tuple: ``(pcts, counts, totals)`` as numpy arrays with one entry per
        experiment id. An experiment without a row for ``label`` contributes
        ``0.0``, ``0`` and ``0``; when several rows match, the first one wins.
    """
    if experiment_ids is None:
        experiment_ids = all_experiment_ids

    # Index the rows of this label once instead of re-filtering the whole
    # frame for every experiment.
    label_rows = df[df["score_label"] == label]
    first_row_by_experiment = {}
    for exp_id, pct, count, total in zip(
        label_rows["experiment_id"],
        label_rows["pct"],
        label_rows["count"],
        label_rows["total"],
    ):
        # setdefault keeps the first matching row per experiment.
        first_row_by_experiment.setdefault(exp_id, (pct, count, total))

    pcts = []
    counts = []
    totals = []
    for exp_id in experiment_ids:
        pct, count, total = first_row_by_experiment.get(exp_id, (0.0, 0, 0))
        pcts.append(pct)
        counts.append(count)
        totals.append(total)

    return np.array(pcts), np.array(counts), np.array(totals)

# Pivot every "other" label once, then split the (pct, count, total) triples
# into three label-keyed lookups.
other_pivots = {
    lab: pivot_by_experiment_other(tidy_other, lab) for lab in OTHER_LABELS
}
Y_map = {lab: arrays[0] for lab, arrays in other_pivots.items()}
C_map = {lab: arrays[1] for lab, arrays in other_pivots.items()}
T_map = {lab: arrays[2] for lab, arrays in other_pivots.items()}

# Bar x coordinates (same layout as the harmful plot): bars inside a group sit
# one unit apart and consecutive groups are separated by an extra gap.
BAR_WIDTH = 0.8
gap_between_groups = 0.6

# Baseline group (2 bars)
current_x = 0
x_positions = [current_x, current_x + 1]
current_x += 2 + gap_between_groups

# Layer groups (4 groups of 4 bars each)
for _ in range(4):
    for offset in range(4):
        x_positions.append(current_x + offset)
    current_x += 4 + gap_between_groups

x_positions = np.array(x_positions)

# Stacked-bar figure for the non-harmful labels
fig_other = go.Figure()

# One trace per label; OTHER_LABELS order is the stacking order
for lab in OTHER_LABELS:
    Y, C, T = Y_map[lab], C_map[lab], T_map[lab]
    legend_name = other_label_map[lab]
    hover_lines = [
        "Experiment: %{customdata[0]}",
        legend_name,
        "Pct: %{y:.1f}%",
        "Count: %{customdata[1]} / %{customdata[2]}<extra></extra>",
    ]
    bar_trace = go.Bar(
        x=x_positions,
        y=Y,
        name=legend_name,
        marker_color=other_colors[lab],
        width=BAR_WIDTH,
        hovertemplate="<br>".join(hover_lines),
        # customdata columns: experiment id, count, total (used by the hover text)
        customdata=np.column_stack([all_experiment_ids, C, T]),
    )
    fig_other.add_trace(bar_trace)

# Annotate every non-empty bar with its stacked total and a short label
tops = sum(Y_map[lab] for lab in OTHER_LABELS)  # stacked height per bar

baseline_bar_labels = {
    "unsteered_default": "Default<br>Assistant",
    "unsteered_prompted": "Jailbreak<br>Persona",
}

for i, exp_id in enumerate(all_experiment_ids):
    if not (tops[i] > 0):
        continue

    # Percentage annotation
    fig_other.add_annotation(
        x=x_positions[i],
        y=tops[i] + 3,
        text=f"{tops[i]:.1f}%",
        showarrow=False,
        font=dict(size=9),
    )

    # Label annotation above bar
    if exp_id in baseline_bar_labels:
        label_text = baseline_bar_labels[exp_id]
    else:
        # Experiment ids look like "layers_<group>-<cap type>"
        cap_type = exp_id.split("-")[1]
        label_text = CAP_NAMES.get(cap_type, cap_type)

    fig_other.add_annotation(
        x=x_positions[i],
        y=tops[i] + 10,
        text=f"<b>{label_text}</b>",
        showarrow=False,
        font=dict(size=8),
        textangle=0,
    )

# X-axis ticks (same calculation as harmful plot): one tick per group, placed
# midway between the group's first and last bar.
baseline_center = (x_positions[0] + x_positions[1]) / 2

# Layer groups start at bar index 2 and hold 4 bars each (indices 2, 6, 10, 14)
group_centers = [baseline_center] + [
    (x_positions[first_bar] + x_positions[first_bar + 3]) / 2
    for first_bar in range(2, 2 + 4 * 4, 4)
]

group_labels = ["Baseline"]
group_labels.extend(LAYER_GROUP_NAMES[lg] for lg in LAYER_GROUPS)

# Axis and legend settings are named first so the update_layout call stays short.
xaxis_config = {
    "title": "Experiment Groups",
    "tickmode": "array",
    "tickvals": group_centers,
    "ticktext": group_labels,
    "showgrid": True,
    "gridcolor": "lightgray",
    # half a unit of padding on either side of the first and last bar
    "range": [-0.5, x_positions[-1] + 0.5],
}
yaxis_config = {
    "title": "Percentage of Responses",
    # upper bound above 100 leaves room for the annotations placed over the bars
    "range": [0, 119],
    "showgrid": True,
    "gridcolor": "lightgray",
}
legend_config = {
    "orientation": "h",
    "y": 1.02,
    "x": 1.0,
    "xanchor": "right",
    "yanchor": "bottom",
}

fig_other.update_layout(
    barmode="stack",
    width=1200,
    height=600,
    title=dict(
        text="Other Responses with Persona-based Jailbreaks Across Capping Experiments",
        subtitle=dict(text=subtitle),
    ),
    xaxis=xaxis_config,
    yaxis=yaxis_config,
    legend=legend_config,
    bargap=0.01,  # Small gap within groups
)

fig_other.show()
#fig_other.write_html(f"{out_dir}/jailbreak_other.html")

In [None]:
# === Combined Plot: Harmful and Other Responses ===
# make_subplots is imported in the notebook's import cell, together with the
# other plotly imports, so that all dependencies are declared in one place.

# Two stacked panels: harmful responses on top (row 1), other responses below (row 2)
fig_combined = make_subplots(
    rows=2, cols=1,
    subplot_titles=(
        "Harmful Responses Across Intervention Layer and Cap Combinations",
        "Other Responses Across Intervention Layer and Cap Combinations"
    ),
    vertical_spacing=0.12,
    specs=[[{"type": "bar"}], [{"type": "bar"}]]
)

# Subplot titles are stored as annotations. This call runs before any bar
# annotation is added, so the font size applies to the two titles only.
fig_combined.update_annotations(
    font=dict(size=14),
)

# --- Add Harmful responses (row 1) ---
# label -> (percentages, counts, totals); insertion order is the stacking order
harmful_series = {
    "enough_info": (Y1, C1, T1),
    "enough_info_and_follow_perfectly": (Y2, C2, T2),
}

for label, (Y, C, T) in harmful_series.items():
    color = COLORS[label]
    display_name = label_map[label]
    hover_lines = [
        "Experiment: %{customdata[0]}",
        display_name,
        "Pct: %{y:.1f}%",
        "Count: %{customdata[1]} / %{customdata[2]}<extra></extra>",
    ]
    bar_trace = go.Bar(
        x=x_positions,
        y=Y,
        name=display_name,
        legendgroup="harmful",
        marker_color=color,
        width=BAR_WIDTH,
        hovertemplate="<br>".join(hover_lines),
        # customdata columns: experiment id, count, total (used by the hover text)
        customdata=np.column_stack([all_experiment_ids, C, T]),
    )
    fig_combined.add_trace(bar_trace, row=1, col=1)

# Annotate every non-empty harmful bar (row 1) with its total and a short label
tops_harmful = Y1 + Y2

baseline_bar_labels = {
    "unsteered_default": "Default<br>Assistant",
    "unsteered_prompted": "Jailbreak<br>Persona",
}

for i, exp_id in enumerate(all_experiment_ids):
    if not (tops_harmful[i] > 0):
        continue

    # Percentage just above the bar
    fig_combined.add_annotation(
        x=x_positions[i],
        y=tops_harmful[i] + 3,
        text=f"{tops_harmful[i]:.1f}%",
        showarrow=False,
        font=dict(size=9),
        row=1, col=1,
    )

    if exp_id in baseline_bar_labels:
        label_text = baseline_bar_labels[exp_id]
    else:
        # Experiment ids look like "layers_<group>-<cap type>"
        cap_type = exp_id.split("-")[1]
        label_text = CAP_NAMES.get(cap_type, cap_type)

    # Bold label above the percentage
    fig_combined.add_annotation(
        x=x_positions[i],
        y=tops_harmful[i] + 10,
        text=f"<b>{label_text}</b>",
        showarrow=False,
        font=dict(size=8),
        row=1, col=1,
    )

# --- Add Other responses (row 2) ---
# One trace per label; OTHER_LABELS order is the stacking order
for lab in OTHER_LABELS:
    Y, C, T = Y_map[lab], C_map[lab], T_map[lab]
    legend_name = other_label_map[lab]
    hover_lines = [
        "Experiment: %{customdata[0]}",
        legend_name,
        "Pct: %{y:.1f}%",
        "Count: %{customdata[1]} / %{customdata[2]}<extra></extra>",
    ]
    bar_trace = go.Bar(
        x=x_positions,
        y=Y,
        name=legend_name,
        legendgroup="other",
        marker_color=other_colors[lab],
        width=BAR_WIDTH,
        hovertemplate="<br>".join(hover_lines),
        # customdata columns: experiment id, count, total (used by the hover text)
        customdata=np.column_stack([all_experiment_ids, C, T]),
    )
    fig_combined.add_trace(bar_trace, row=2, col=1)

# Annotate every non-empty "other" bar (row 2) with its total and a short label
tops_other = sum(Y_map[lab] for lab in OTHER_LABELS)

baseline_bar_labels = {
    "unsteered_default": "Default<br>Assistant",
    "unsteered_prompted": "Jailbreak<br>Persona",
}

for i, exp_id in enumerate(all_experiment_ids):
    if not (tops_other[i] > 0):
        continue

    # Percentage just above the bar
    fig_combined.add_annotation(
        x=x_positions[i],
        y=tops_other[i] + 3,
        text=f"{tops_other[i]:.1f}%",
        showarrow=False,
        font=dict(size=9),
        row=2, col=1,
    )

    if exp_id in baseline_bar_labels:
        label_text = baseline_bar_labels[exp_id]
    else:
        # Experiment ids look like "layers_<group>-<cap type>"
        cap_type = exp_id.split("-")[1]
        label_text = CAP_NAMES.get(cap_type, cap_type)

    # Bold label above the percentage
    fig_combined.add_annotation(
        x=x_positions[i],
        y=tops_other[i] + 10,
        text=f"<b>{label_text}</b>",
        showarrow=False,
        font=dict(size=8),
        row=2, col=1,
    )

# Update layout
# X-axis ticks: one tick per group, placed midway between the group's first
# and last bar.
baseline_center = (x_positions[0] + x_positions[1]) / 2

# Layer groups start at bar index 2 and hold 4 bars each (indices 2, 6, 10, 14)
group_centers = [baseline_center] + [
    (x_positions[first_bar] + x_positions[first_bar + 3]) / 2
    for first_bar in range(2, 2 + 4 * 4, 4)
]

group_labels = ["Baseline"]
group_labels.extend(LAYER_GROUP_NAMES[lg] for lg in LAYER_GROUPS)

# Both panels share the same tick layout and grid; only the bottom panel
# carries an x-axis title.
shared_xaxis = dict(
    tickmode="array",
    tickvals=group_centers,
    ticktext=group_labels,
    showgrid=True,
    gridcolor="lightgray",
    # half a unit of padding on either side of the first and last bar
    range=[-0.5, x_positions[-1] + 0.5],
)
fig_combined.update_xaxes(title_text="Layers of Intervention", row=2, col=1, **shared_xaxis)
fig_combined.update_xaxes(row=1, col=1, **shared_xaxis)

# Identical y-axis settings for both panels
shared_yaxis = dict(
    title_text="Percentage of Responses",
    # upper bound above 100 leaves room for the annotations placed over the bars
    range=[0, 119],
    showgrid=True,
    gridcolor="lightgray",
)
for subplot_row in (1, 2):
    fig_combined.update_yaxes(row=subplot_row, col=1, **shared_yaxis)

# Figure-level settings: stacking mode, size, title and a horizontal legend
# anchored by its bottom-right corner.
combined_title = dict(
    text="Responses after Persona-based Jailbreak with Multi-Layer Projection Capping",
    subtitle=dict(text=subtitle),
)
combined_legend = {
    "orientation": "h",
    "y": 1.02,
    "x": 1.09,
    "xanchor": "right",
    "yanchor": "bottom",
}

fig_combined.update_layout(
    barmode="stack",
    width=1100,
    height=1000,
    title=combined_title,
    legend=combined_legend,
    bargap=0.01,
)

# Render inline and also save a standalone HTML copy to the output directory
fig_combined.show()
fig_combined.write_html(f"{out_dir}/multi_contrast_layers.html")