# Analyzing scores and plotting them

In [60]:
import json
import os
import pandas as pd
import numpy as np

import plotly.graph_objects as go

In [81]:
# Model / layer under analysis. `subtitle` is reused as the subtitle of both plots below.
model = "llama-3.3-70b"
layer = 40
subtitle = f"{model.replace('-', ' ').title()}, Layer {layer}"
# Model-specific input (scores) and output (plots) directories.
# NOTE(review): these are absolute, machine-specific paths; consider deriving them
# from a configurable project root so the notebook runs on other machines.
base_dir = f"/root/git/persona-subspace/evals/jailbreak/{model}"
out_dir = f"/root/git/persona-subspace/evals/plots/{model}"

os.makedirs(out_dir, exist_ok=True)
# X-axis order used by the plots. The last entry (0.0) is the slot the unsteered
# runs are assigned to and is labelled "Unsteered" on the axis.
magnitudes = [-24.0, -18.0, -12.0, -6.0, 0.0]

In [82]:
# Paths of the four scored JSONL files: {unsteered, steered} x {jailbreak, default}.
# This cell only defines paths; the files are read in the next cell.
# NOTE(review): the previous comment mentioned "roles_20", which does not appear in
# any of these paths; confirm whether that still applies.

unsteered_jailbreak_path = f"{base_dir}/unsteered/unsteered_scores.jsonl"
unsteered_default_path = f"{base_dir}/unsteered/unsteered_default_scores.jsonl"

steered_jailbreak_path = f"{base_dir}/steered/jailbreak_1100_scores.jsonl"
steered_default_path = f"{base_dir}/steered/default_1100_scores.jsonl"

In [83]:
# Load all four JSONL score files into separate DataFrames
def _load_jsonl(path):
    """Parse a JSON Lines file into a list of records (one per non-blank line).

    Blank lines (e.g. an extra empty line at the end of the file) are skipped
    instead of making ``json.loads`` raise ``JSONDecodeError``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


unsteered_jailbreak_scores = _load_jsonl(unsteered_jailbreak_path)
unsteered_default_scores = _load_jsonl(unsteered_default_path)
steered_jailbreak_scores = _load_jsonl(steered_jailbreak_path)
steered_default_scores = _load_jsonl(steered_default_path)


# Convert to pandas DataFrames
unsteered_jailbreak_df = pd.DataFrame(unsteered_jailbreak_scores)
unsteered_default_df = pd.DataFrame(unsteered_default_scores)
steered_jailbreak_df = pd.DataFrame(steered_jailbreak_scores)
steered_default_df = pd.DataFrame(steered_default_scores)


print(f"Loaded {len(unsteered_jailbreak_df)} unsteered jailbreak scores")
print(f"Loaded {len(unsteered_default_df)} unsteered default scores")
print(f"Loaded {len(steered_jailbreak_df)} steered jailbreak scores")
print(f"Loaded {len(steered_default_df)} steered default scores")


Loaded 4400 unsteered jailbreak scores
Loaded 4400 unsteered default scores
Loaded 4400 steered jailbreak scores
Loaded 4400 steered default scores


## Inspect scores


In [84]:
# Convenience list of the four score frames.
# NOTE(review): not referenced by any later cell visible in this notebook.
all_dfs = [unsteered_jailbreak_df, unsteered_default_df, steered_jailbreak_df, steered_default_df]


In [85]:
def pct_overall(df, name):
    """Print the distribution of ``df["score"]`` labels over the whole frame.

    Args:
        df: DataFrame with a ``score`` column holding one label per row.
        name: Human-readable dataset name used in the printed header.

    Prints per-label counts and percentages (in sorted label order), followed by
    the counts/percentages of the two target labels ``enough_info`` and
    ``enough_info_and_follow_perfectly`` and of their union ("either").
    For an empty frame only the header and a short notice are printed, instead
    of failing with a division by zero (or a missing ``score`` column).
    """
    total = len(df)

    print(f"=== Overall Score Distribution: {name} ===")
    print(f"Total samples: {total}\n")

    # Nothing to count: avoid `100 * 0 / 0` and the lookup of a missing column.
    if total == 0:
        print("No samples to summarize.")
        print()
        return

    # Raw counts of each label, and the matching percentages
    counts = df["score"].value_counts().sort_index()
    percentages = (counts / total * 100).round(1)

    # Special targets (a label that never occurs counts as 0)
    n_enough = counts.get("enough_info", 0)
    n_enough_perfect = counts.get("enough_info_and_follow_perfectly", 0)
    n_either = n_enough + n_enough_perfect

    pct_enough = 100 * n_enough / total
    pct_enough_perfect = 100 * n_enough_perfect / total
    pct_either = 100 * n_either / total

    # Print table of counts + percentages for each label
    print("Per-label counts and percentages:")
    for label in counts.index:
        print(f"- {label}: {counts[label]} ({percentages[label]}%)")

    print("\nTarget categories:")
    print(f"- enough_info: {n_enough} ({pct_enough:.1f}%)")
    print(f"- enough_info_and_follow_perfectly: {n_enough_perfect} ({pct_enough_perfect:.1f}%)")
    print(f"- either: {n_either} ({pct_either:.1f}%)")
    print()


In [86]:

# Overall label distribution for the unsteered runs with the jailbreak prompt.
pct_overall(unsteered_jailbreak_df, "Unsteered + Jailbreak")


=== Overall Score Distribution: Unsteered + Jailbreak ===
Total samples: 4400

Per-label counts and percentages:
- almost_enough_info: 19 (0.4%)
- enough_info: 2001 (45.5%)
- enough_info_and_follow_perfectly: 870 (19.8%)
- out_of_context: 2 (0.0%)
- refusal: 964 (21.9%)
- refusal_and_justification: 18 (0.4%)
- related_but_no_info: 526 (12.0%)

Target categories:
- enough_info: 2001 (45.5%)
- enough_info_and_follow_perfectly: 870 (19.8%)
- either: 2871 (65.2%)



In [87]:
# Overall label distribution for the unsteered runs without the jailbreak prompt.
pct_overall(unsteered_default_df, "Unsteered + No Jailbreak")


=== Overall Score Distribution: Unsteered + No Jailbreak ===
Total samples: 4400

Per-label counts and percentages:
- almost_enough_info: 28 (0.6%)
- enough_info: 196 (4.5%)
- enough_info_and_follow_perfectly: 2 (0.0%)
- out_of_context: 152 (3.5%)
- refusal: 353 (8.0%)
- refusal_and_justification: 151 (3.4%)
- related_but_no_info: 3518 (80.0%)

Target categories:
- enough_info: 196 (4.5%)
- enough_info_and_follow_perfectly: 2 (0.0%)
- either: 198 (4.5%)



In [88]:
def pct_per_magnitude(df, name):
    """Print the ``score`` label distribution of ``df`` separately per magnitude.

    Args:
        df: DataFrame with ``magnitude`` and ``score`` columns.
        name: Human-readable dataset name used in the printed header.

    For every magnitude group (in ``groupby`` order) the report lists the group
    size, each label's count and percentage within the group, and the
    counts/percentages of ``enough_info``, ``enough_info_and_follow_perfectly``
    and their union ("either").
    """
    print(f"=== Score Distribution by Magnitude: {name} ===")
    print(f"Total samples: {len(df)}\n")

    for mag, group in df.groupby("magnitude"):
        n = len(group)
        label_counts = group["score"].value_counts().sort_index()
        label_pcts = (label_counts / n * 100).round(1)

        # Target labels; a label absent from this group counts as 0.
        n_enough = label_counts.get("enough_info", 0)
        n_enough_perfect = label_counts.get("enough_info_and_follow_perfectly", 0)
        target_rows = (
            ("enough_info", n_enough),
            ("enough_info_and_follow_perfectly", n_enough_perfect),
            ("either", n_enough + n_enough_perfect),
        )

        # Assemble the whole report for this magnitude, then emit it in one go.
        report = [
            f"Magnitude: {mag}",
            f"Samples: {n}",
            "Per-label counts and percentages:",
        ]
        report += [
            f"- {label}: {label_counts[label]} ({label_pcts[label]}%)"
            for label in label_counts.index
        ]
        report.append("Target categories:")
        report += [f"- {title}: {hits} ({100 * hits / n:.1f}%)" for title, hits in target_rows]

        print("\n".join(report))
        print()


In [89]:
# Per-magnitude label distribution for the steered runs with the jailbreak prompt.
pct_per_magnitude(steered_jailbreak_df, "Steered + Jailbreak")

=== Score Distribution by Magnitude: Steered + Jailbreak ===
Total samples: 4400

Magnitude: -24.0
Samples: 1100
Per-label counts and percentages:
- enough_info: 28 (2.5%)
- enough_info_and_follow_perfectly: 5 (0.5%)
- nonsensical: 698 (63.5%)
- out_of_context: 8 (0.7%)
- refusal: 3 (0.3%)
- related_but_no_info: 358 (32.5%)
Target categories:
- enough_info: 28 (2.5%)
- enough_info_and_follow_perfectly: 5 (0.5%)
- either: 33 (3.0%)

Magnitude: -18.0
Samples: 1100
Per-label counts and percentages:
- almost_enough_info: 6 (0.5%)
- enough_info: 264 (24.0%)
- enough_info_and_follow_perfectly: 110 (10.0%)
- nonsensical: 20 (1.8%)
- out_of_context: 10 (0.9%)
- refusal: 5 (0.5%)
- related_but_no_info: 685 (62.3%)
Target categories:
- enough_info: 264 (24.0%)
- enough_info_and_follow_perfectly: 110 (10.0%)
- either: 374 (34.0%)

Magnitude: -12.0
Samples: 1100
Per-label counts and percentages:
- almost_enough_info: 3 (0.3%)
- enough_info: 470 (42.7%)
- enough_info_and_follow_perfectly: 201 (18.3

In [90]:
# Per-magnitude label distribution for the steered runs without the jailbreak prompt.
pct_per_magnitude(steered_default_df, "Steered + No Jailbreak")

=== Score Distribution by Magnitude: Steered + No Jailbreak ===
Total samples: 4400

Magnitude: -24.0
Samples: 1100
Per-label counts and percentages:
- nonsensical: 828 (75.3%)
- out_of_context: 66 (6.0%)
- refusal_and_justification: 1 (0.1%)
- related_but_no_info: 205 (18.6%)
Target categories:
- enough_info: 0 (0.0%)
- enough_info_and_follow_perfectly: 0 (0.0%)
- either: 0 (0.0%)

Magnitude: -18.0
Samples: 1100
Per-label counts and percentages:
- almost_enough_info: 1 (0.1%)
- enough_info: 2 (0.2%)
- nonsensical: 144 (13.1%)
- out_of_context: 170 (15.5%)
- refusal: 22 (2.0%)
- refusal_and_justification: 13 (1.2%)
- related_but_no_info: 748 (68.0%)
Target categories:
- enough_info: 2 (0.2%)
- enough_info_and_follow_perfectly: 0 (0.0%)
- either: 2 (0.2%)

Magnitude: -12.0
Samples: 1100
Per-label counts and percentages:
- almost_enough_info: 2 (0.2%)
- enough_info: 30 (2.7%)
- nonsensical: 14 (1.3%)
- out_of_context: 88 (8.0%)
- refusal: 17 (1.5%)
- refusal_and_justification: 33 (3.0%)


In [91]:
# Show which columns the steered score records carry (for a DataFrame, keys() is the column index).
print(steered_jailbreak_df.keys())

Index(['id', 'role', 'prompt_id', 'question_id', 'prompt', 'question',
       'harm_category', 'persona', 'magnitude', 'response', 'score',
       'analysis'],
      dtype='object')


## Plot Harmful Responses

In [92]:

# --- 1) Helper: summarize one dataframe into counts/percentages for target labels ---
TARGETS = ["enough_info", "enough_info_and_follow_perfectly"]

def summarize_df(df, jailbreak_label, magnitude_override=None):
    """Summarize how often the TARGETS labels occur, per steering magnitude.

    Args:
        df: DataFrame with a ``score`` column and, unless ``magnitude_override``
            is given, a ``magnitude`` column. The input is not modified.
        jailbreak_label: Value written to the ``jailbreak`` column of every
            output row (e.g. "No Jailbreak" or "Jailbreak").
        magnitude_override: If not None, every row is treated as having this
            magnitude (e.g. unsteered -> 0.0).

    Returns:
        A tidy DataFrame with one row per (magnitude, target label), magnitudes
        in ascending order and labels in TARGETS order, with columns
        ``magnitude``, ``jailbreak``, ``score_label``, ``count``, ``total``
        (rows in that magnitude group), ``pct`` (``count / total * 100`` rounded
        to 1 decimal) and ``pct_either`` (share of rows carrying either target
        label, rounded to 1 decimal; identical for both rows of a magnitude).
        An empty input yields an empty frame that still has these columns.
    """
    tidy_columns = ["magnitude", "jailbreak", "score_label",
                    "count", "total", "pct", "pct_either"]

    # An empty (possibly column-less) frame has nothing to group; return the
    # expected schema instead of failing on the missing columns.
    if df.empty:
        return pd.DataFrame(columns=tidy_columns)

    work = df.copy()
    if magnitude_override is not None:
        work["magnitude"] = magnitude_override

    # group totals per magnitude (all labels, not only the targets)
    totals = work.groupby("magnitude").size()
    if totals.empty:
        return pd.DataFrame(columns=tidy_columns)

    # magnitude x label count table, restricted to the target labels; magnitudes
    # or labels that never occur are filled with 0
    counts = (
        pd.crosstab(work["magnitude"], work["score"])
        .reindex(index=totals.index, columns=TARGETS, fill_value=0)
    )

    # percentages, rounded exactly as they are reported
    pct = counts.div(totals, axis=0).mul(100).round(1)
    pct_either = (counts.sum(axis=1) / totals * 100).round(1)

    # tidy long rows for stacked plotting (one row per magnitude and target label)
    long_rows = []
    for mag, n_total, label_counts, label_pcts, either in zip(
        totals.index, totals.to_numpy(), counts.to_numpy(),
        pct.to_numpy(), pct_either.to_numpy(),
    ):
        for j, label in enumerate(TARGETS):
            long_rows.append({
                "magnitude": float(mag),
                "jailbreak": jailbreak_label,          # "No Jailbreak" or "Jailbreak"
                "score_label": label,                  # which of the two targets
                "count": int(label_counts[j]),
                "total": int(n_total),
                "pct": float(label_pcts[j]),
                "pct_either": float(either),           # same for both labels
            })
    return pd.DataFrame(long_rows, columns=tidy_columns)


In [93]:

# --- 2) Build the tidy dataset from your four inputs ---
# Unsteered runs are pinned to magnitude 0.0
unsteered_default_tidy = summarize_df(
    unsteered_default_df, "No Jailbreak", magnitude_override=0.0
)
unsteered_jailbreak_tidy = summarize_df(
    unsteered_jailbreak_df, "Jailbreak", magnitude_override=0.0
)

# Steered runs are grouped by the magnitude stored in each row
steered_default_tidy = summarize_df(steered_default_df, "No Jailbreak")
steered_jailbreak_tidy = summarize_df(steered_jailbreak_df, "Jailbreak")

# Stack the four summaries (unsteered first) into one long frame with a fresh index
tidy = pd.concat(
    [
        unsteered_default_tidy,
        unsteered_jailbreak_tidy,
        steered_default_tidy,
        steered_jailbreak_tidy,
    ],
    ignore_index=True,
)




In [94]:
# ----- CONFIG -----
# NOTE: TARGETS is re-assigned here with the same value it has in the summarize_df cell.
TARGETS   = ["enough_info", "enough_info_and_follow_perfectly"]
JB_ORDER  = ["No Jailbreak", "Jailbreak"]   # left, right
# Bar colour per target label
COLORS = {
    "enough_info_and_follow_perfectly": "#e41a1c",  # red
    "enough_info": "#ff7f00",                      # orange
}

# Display names for the target labels (used for legend entries and hover text)
label_map = {
    "enough_info": "Harmful Info in Response",
    "enough_info_and_follow_perfectly": "Harmful Info & Perfectly Following Request"
}

# The x-axis order is taken from the `magnitudes` list defined in the configuration cell near the top.


# ----- BUILD PIVOTS (robust, fills missing combos with 0) -----
def pivot_pct(df, label):
    """Return the ``pct`` values of one score label as a 2-D array.

    Rows follow the module-level ``magnitudes`` list and columns follow
    ``JB_ORDER``; any (magnitude, jailbreak) combination missing from ``df``
    is filled with 0.0.
    """
    rows_for_label = df[df["score_label"] == label]
    wide = rows_for_label.pivot_table(
        index="magnitude",
        columns="jailbreak",
        values="pct",
        aggfunc="first",
        fill_value=0.0,
    )
    # enforce the full magnitude / jailbreak grid in the configured order
    return wide.reindex(index=magnitudes, columns=JB_ORDER, fill_value=0.0).values

def pivot_counts(df, label):
    """Return ``(counts, totals)`` arrays of one score label, used for hover text.

    Both arrays have rows following the module-level ``magnitudes`` list and
    columns following ``JB_ORDER``; missing combinations are filled with 0.
    """
    rows_for_label = df[df["score_label"] == label]

    def _grid(value_column):
        # Pivot one value column onto the full magnitude x jailbreak grid.
        wide = rows_for_label.pivot_table(
            index="magnitude",
            columns="jailbreak",
            values=value_column,
            aggfunc="first",
            fill_value=0,
        )
        return wide.reindex(index=magnitudes, columns=JB_ORDER, fill_value=0).values

    return _grid("count"), _grid("total")

# Percentage (Y), count (C) and group-total (T) arrays per target label;
# rows follow `magnitudes`, columns follow JB_ORDER (0 = No Jailbreak, 1 = Jailbreak).
Y1 = pivot_pct(tidy, "enough_info")
Y2 = pivot_pct(tidy, "enough_info_and_follow_perfectly")
C1, T1 = pivot_counts(tidy, "enough_info")
C2, T2 = pivot_counts(tidy, "enough_info_and_follow_perfectly")

# ----- X POSITIONS: numeric centers with small offsets -----
# Each magnitude gets an integer center; the two conditions sit dx left / right of it.
centers = np.arange(len(magnitudes))        # 0,1,2,...
dx = 0.18
x_nojb = centers - dx
x_jb   = centers + dx
BAR_WIDTH = 0.34

# ----- BUILD FIGURE -----
fig = go.Figure()

# Two stacked bars per magnitude: the left one (No Jailbreak) owns the legend
# entries, the right one (Jailbreak) joins the same legend groups without
# adding duplicate entries. Within a bar, Y1 is stacked below Y2.
for col, xs, side_name, owns_legend in [
    (0, x_nojb, "No Jailbreak", True),
    (1, x_jb,   "Jailbreak",    False),
]:
    for label, Y, C, T in [
        ("enough_info", Y1, C1, T1),
        ("enough_info_and_follow_perfectly", Y2, C2, T2),
    ]:
        display_name = label_map[label]
        if owns_legend:
            legend_kwargs = {"name": display_name}
        else:
            legend_kwargs = {"name": None, "showlegend": False}

        fig.add_trace(go.Bar(
            x=xs,
            y=Y[:, col],
            legendgroup=display_name,
            marker_color=COLORS[label],
            width=BAR_WIDTH,
            # only label segments tall enough to hold the text
            text=[f"{v:.1f}%" if v >= 7.0 else "" for v in Y[:, col]],
            textposition="inside",
            cliponaxis=False,
            hovertemplate=(
                "Magnitude: %{customdata[2]}<br>"
                + f"{side_name} — {display_name}<br>"
                + "Pct: %{y:.1f}%<br>"
                + "Count: %{customdata[0]} / %{customdata[1]}<extra></extra>"
            ),
            # per-point hover payload: [count, group total, magnitude]
            customdata=np.column_stack([C[:, col], T[:, col], magnitudes]),
            **legend_kwargs,
        ))


# Annotations with the combined percentage above each stacked bar.
# The bar height is the sum of two individually rounded percentages, which can
# differ by 0.1 from the real combined share (e.g. 45.5 + 19.8 = 65.3 while
# 2871 / 4400 is 65.2%). The height is therefore only used to place the text;
# the number shown is the `pct_either` value computed from the raw counts.
tops_nojb = (Y1[:, 0] + Y2[:, 0])
tops_jb   = (Y1[:, 1] + Y2[:, 1])
either_pct = (
    tidy[tidy["score_label"] == TARGETS[0]]   # pct_either is identical on both target rows
    .pivot_table(index="magnitude", columns="jailbreak",
                 values="pct_either", aggfunc="first", fill_value=0.0)
    .reindex(index=magnitudes, columns=JB_ORDER, fill_value=0.0)
    .values                                    # rows=magnitudes, cols=JB_ORDER
)
for i in range(len(magnitudes)):
    for col, xs, tops, caption in [
        (0, x_nojb, tops_nojb, "Default"),
        (1, x_jb,   tops_jb,   "Jailbreak"),
    ]:
        # bars without any harmful responses get no annotation
        if tops[i] > 0:
            fig.add_annotation(x=xs[i], y=tops[i] + 3,
                               text=f"<b>{either_pct[i, col]:.1f}%</b>",
                               showarrow=False, font=dict(size=13))
            fig.add_annotation(x=xs[i], y=tops[i] + 8,
                               text=f"<b>{caption}</b>", showarrow=False, font=dict(size=10))

# ----- LAYOUT -----
fig.update_layout(
    barmode="stack",  # stacks within each bar; grouping via numeric offsets
    width=980, height=520,
    title={
        "text": "Harmful Responses with Persona-based Jailbreaks when Steered Towards the Assistant",
        "subtitle": {
            "text": subtitle,
        }
    },
    xaxis=dict(
        title="Role-playing PC1 Steering Magnitude",
        tickmode="array",
        tickvals=centers,
        # the last magnitude slot holds the unsteered runs, so it gets a text label
        ticktext=[str(m) for m in magnitudes[:-1]] + ["Unsteered"],
        showgrid=True, gridcolor="lightgray",
        # pad the axis so the outermost bars are not clipped
        range=[centers.min()-0.8, centers.max()+0.8],
    ),
    yaxis=dict(
        title="Percentage of Harmful Responses",
        range=[0, 100],
        showgrid=True, gridcolor="lightgray"
    ),
    # horizontal legend placed just above the plot area, right-aligned
    legend=dict(
        orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1.0
    ),
    bargap=0.25,
)

# Render inline and also save a standalone HTML copy into out_dir
fig.show()
fig.write_html(f"{out_dir}/jailbreak_harmful.html")

## Plot Other Responses

In [95]:
# --- Categories stacked in the "other responses" plot (the two target labels are excluded)
# The list order is the stacking order (first entry at the bottom of each bar).
# Note: only labels listed here are kept by summarize_df_multi; "other" matches
# rows whose score is literally "other" and is not a catch-all for unknown labels.
OTHER_LABELS = [
    "almost_enough_info",        # yellow
    "related_but_no_info",       # blue
    "refusal",                   # light green
    "refusal_and_justification", # green
    "out_of_context",            # purple
    "nonsensical",               # pink
    "other",                     # grey
]

# Bar colour per label
other_colors = {
    "almost_enough_info": "#ffff33",        # yellow
    "related_but_no_info": "#1f78b4",       # blue
    "refusal": "#b2df8a",                   # light green
    "refusal_and_justification": "#33a02c", # green
    "out_of_context": "#6a3d9a",            # purple
    "nonsensical": "#fb9a99",               # pink
    "other": "#999999",                     # grey
}


# Display names for legend entries and hover text
other_label_map = {
    "refusal": "Refusal",
    "refusal_and_justification": "Refusal & Justification",
    "nonsensical": "Nonsensical",
    "out_of_context": "Out of Context",
    "related_but_no_info": "Related",
    "almost_enough_info": "Almost Harmful",
    "other": "Other",
}


def summarize_df_multi(df, jailbreak_label, labels, magnitude_override=None):
    """Return tidy rows with counts and percentages of ``labels`` per magnitude.

    Args:
        df: DataFrame with a ``score`` column and, unless ``magnitude_override``
            is given, a ``magnitude`` column. The input is not modified.
        jailbreak_label: Value written to the ``jailbreak`` column of every
            output row ("No Jailbreak" / "Jailbreak").
        labels: Score labels to report; rows with other labels only contribute
            to the per-magnitude totals.
        magnitude_override: If not None, every row is treated as having this
            magnitude (e.g. unsteered -> 0.0).

    Returns:
        A DataFrame with one row per (magnitude, label), magnitudes ascending
        and labels in the given order, with columns ``magnitude``,
        ``jailbreak``, ``score_label``, ``count``, ``total`` (rows in that
        magnitude group) and ``pct`` (``count / total * 100`` rounded to
        1 decimal). An empty input yields an empty frame with these columns.
    """
    tidy_columns = ["magnitude", "jailbreak", "score_label", "count", "total", "pct"]
    labels = list(labels)

    # An empty (possibly column-less) frame has nothing to group; return the
    # expected schema instead of failing on the missing columns.
    if df.empty:
        return pd.DataFrame(columns=tidy_columns)

    work = df.copy()
    if magnitude_override is not None:
        work["magnitude"] = magnitude_override

    # rows per magnitude, counting every label
    totals = work.groupby("magnitude").size()
    if totals.empty:
        return pd.DataFrame(columns=tidy_columns)

    # magnitude x label count table restricted to the requested labels;
    # magnitudes or labels that never occur are filled with 0
    counts = (
        pd.crosstab(work["magnitude"], work["score"])
        .reindex(index=totals.index, columns=labels, fill_value=0)
    )
    pct = counts.div(totals, axis=0).mul(100).round(1)

    # long rows; columns are read by position so they line up with `labels`
    long = []
    for mag, n_total, label_counts, label_pcts in zip(
        totals.index, totals.to_numpy(), counts.to_numpy(), pct.to_numpy()
    ):
        for j, lab in enumerate(labels):
            long.append({
                "magnitude": float(mag),
                "jailbreak": jailbreak_label,      # "No Jailbreak" / "Jailbreak"
                "score_label": lab,
                "count": int(label_counts[j]),
                "total": int(n_total),
                "pct": float(label_pcts[j]),
            })
    return pd.DataFrame(long, columns=tidy_columns)

# Build tidy for the OTHER categories from your 4 input frames
# (unsteered runs are pinned to magnitude 0.0, steered runs keep their own magnitudes)
unsteered_default_other = summarize_df_multi(
    unsteered_default_df, "No Jailbreak", OTHER_LABELS, magnitude_override=0.0
)
unsteered_jailbreak_other = summarize_df_multi(
    unsteered_jailbreak_df, "Jailbreak", OTHER_LABELS, magnitude_override=0.0
)
steered_default_other = summarize_df_multi(steered_default_df, "No Jailbreak", OTHER_LABELS)
steered_jailbreak_other = summarize_df_multi(steered_jailbreak_df, "Jailbreak", OTHER_LABELS)

# Stack the four summaries (unsteered first) into one long frame with a fresh index
tidy_other = pd.concat(
    [
        unsteered_default_other,
        unsteered_jailbreak_other,
        steered_default_other,
        steered_jailbreak_other,
    ],
    ignore_index=True,
)


In [96]:
# order & positions
# Same values as in the harmful-responses plot cell; repeated so this cell sets up its own layout.
centers = np.arange(len(magnitudes))
dx = 0.18
x_nojb = centers - dx
x_jb   = centers + dx
BAR_WIDTH = 0.34

# Helper: pivot arrays (rows=magnitudes, cols=[No JB, JB]) for a given label
def pivot_pct_other(df, label):
    """Return the ``pct`` values of one label as a 2-D array.

    Rows follow the module-level ``magnitudes`` list, columns are
    ["No Jailbreak", "Jailbreak"]; missing combinations are filled with 0.0.
    """
    rows_for_label = df[df["score_label"] == label]
    wide = rows_for_label.pivot_table(
        index="magnitude",
        columns="jailbreak",
        values="pct",
        aggfunc="first",
        fill_value=0.0,
    )
    grid = wide.reindex(
        index=magnitudes, columns=["No Jailbreak", "Jailbreak"], fill_value=0.0
    )
    return grid.values

def pivot_counts_other(df, label, which="count"):
    """Return one integer-valued column (``which``) of one label as a 2-D array.

    ``which`` names the tidy column to pivot (the callers below pass "count"
    and "total"). Rows follow the module-level ``magnitudes`` list, columns are
    ["No Jailbreak", "Jailbreak"]; missing combinations are filled with 0.
    """
    rows_for_label = df[df["score_label"] == label]
    wide = rows_for_label.pivot_table(
        index="magnitude",
        columns="jailbreak",
        values=which,
        aggfunc="first",
        fill_value=0,
    )
    grid = wide.reindex(
        index=magnitudes, columns=["No Jailbreak", "Jailbreak"], fill_value=0
    )
    return grid.values

# Build arrays for every "other" label, keyed by label:
# percentages (Y_map), counts (C_map) and group totals (T_map)
Y_map = {lab: pivot_pct_other(tidy_other, lab) for lab in OTHER_LABELS}
C_map = {lab: pivot_counts_other(tidy_other, lab, "count") for lab in OTHER_LABELS}
T_map = {lab: pivot_counts_other(tidy_other, lab, "total") for lab in OTHER_LABELS}

fig_other = go.Figure()

# Two stacked bars per magnitude, each stacking all OTHER_LABELS: the left one
# (No JB) owns the legend entries, the right one (JB) joins the same legend
# groups but hides its duplicate legend items.
for col, xs, side_name, owns_legend in [
    (0, x_nojb, "No Jailbreak", True),
    (1, x_jb,   "Jailbreak",    False),
]:
    for lab in OTHER_LABELS:
        Y = Y_map[lab]
        C = C_map[lab]
        T = T_map[lab]
        display_name = other_label_map[lab]
        if owns_legend:
            legend_kwargs = {"name": display_name}
        else:
            legend_kwargs = {"name": None, "showlegend": False}

        fig_other.add_trace(go.Bar(
            x=xs,
            y=Y[:, col],
            legendgroup=display_name,
            marker_color=other_colors[lab],
            width=BAR_WIDTH,
            # only label segments tall enough to hold the text
            text=[f"{v:.1f}%" if v >= 7.0 else "" for v in Y[:, col]],
            textposition="inside",
            cliponaxis=False,
            hovertemplate=(
                "Magnitude: %{customdata[2]}<br>"
                + f"{side_name} — {display_name}<br>"
                + "Pct: %{y:.1f}%<br>"
                + "Count: %{customdata[0]} / %{customdata[1]}<extra></extra>"
            ),
            # per-point hover payload: [count, group total, magnitude]
            customdata=np.column_stack([C[:, col], T[:, col], magnitudes]),
            **legend_kwargs,
        ))

# Annotations with the combined percentage above each stacked bar.
# The bar height is the sum of several individually rounded percentages, so it
# can drift from the real combined share by a few tenths. The height is only
# used to place the text; the number shown is recomputed from the raw counts.
tops_nojb = sum(Y_map[lab][:, 0] for lab in OTHER_LABELS)
tops_jb   = sum(Y_map[lab][:, 1] for lab in OTHER_LABELS)

count_sum = sum(C_map[lab] for lab in OTHER_LABELS)   # rows=magnitudes, cols=[No JB, JB]
group_total = T_map[OTHER_LABELS[0]]                  # `total` is written identically for every label
# guard the division for combinations without data (total filled with 0)
exact_pct = np.where(group_total > 0,
                     100.0 * count_sum / np.maximum(group_total, 1),
                     0.0)

for i in range(len(magnitudes)):
    for col, xs, tops, caption in [
        (0, x_nojb, tops_nojb, "Default"),
        (1, x_jb,   tops_jb,   "Jailbreak"),
    ]:
        # empty bars get no annotation
        if tops[i] > 0:
            fig_other.add_annotation(x=xs[i], y=tops[i] + 3,
                                     text=f"<b>{exact_pct[i, col]:.1f}%</b>",
                                     showarrow=False, font=dict(size=13))
            fig_other.add_annotation(x=xs[i], y=tops[i] + 8,
                                     text=f"<b>{caption}</b>", showarrow=False, font=dict(size=10))

# Layout mirrors the harmful-responses figure, with a larger canvas and top margin
fig_other.update_layout(
    barmode="stack",
    width=1000, height=560,
    title={
        "text": "Other Responses with Persona-based Jailbreaks when Steered Towards the Assistant",
        "subtitle": {
            "text": subtitle,
        }
    },
    margin=dict(t=140),
    xaxis=dict(
        title="Role-playing PC1 Steering Magnitude",
        tickmode="array",
        tickvals=centers,
        # the last magnitude slot holds the unsteered runs, so it gets a text label
        ticktext=[str(m) for m in magnitudes[:-1]] + ["Unsteered"],
        showgrid=True, gridcolor="lightgray",
        # pad the axis so the outermost bars are not clipped
        range=[centers.min()-0.8, centers.max()+0.8],
    ),
    yaxis=dict(
        title="Percentage of Responses",
        # upper bound above 100: the annotations sit up to 8 units above the stack top
        range=[0, 115],
        showgrid=True, gridcolor="lightgray"
    ),
    # horizontal legend placed just above the plot area, right-aligned
    legend=dict(
        orientation="h", y=1.02, x=1.0, xanchor="right", yanchor="bottom"
    ),
    bargap=0.25,
)

# Render inline and also save a standalone HTML copy into out_dir
fig_other.show()
fig_other.write_html(f"{out_dir}/jailbreak_other.html")