# Analyzing scores and plotting them

In [2]:
import json
import os
import pandas as pd
import numpy as np

import plotly.graph_objects as go

In [3]:
model = "qwen-3-32b"
subtitle = f"{model.replace('-', ' ').title()}"
base_dir = f"/workspace/evals/jailbreak/{model}"
out_dir = f"/root/git/plots/{model}/evals"

os.makedirs(out_dir, exist_ok=True)


In [4]:
# load scores from roles_20 as pandas dataframe

prefill_path = f"{base_dir}/jailbreak_spiral_turns1_scores.jsonl"
unsteered_default_path = f"{base_dir}/unsteered/unsteered_default_scores.jsonl"



In [5]:

    
with open(unsteered_default_path, "r") as f:
    unsteered_default_scores = [json.loads(line) for line in f]
    
with open(prefill_path, "r") as f:
    prefill_scores = [json.loads(line) for line in f]




# Convert to pandas DataFrames
prefill_df = pd.DataFrame(prefill_scores)
unsteered_default_df = pd.DataFrame(unsteered_default_scores)

print(f"Loaded {len(unsteered_default_df)} unsteered default scores")   
print(f"Loaded {len(prefill_df)} prefill scores")   

Loaded 4400 unsteered default scores
Loaded 4400 prefill scores


## Inspect scores


In [6]:
def pct_overall(df, name):
    total = len(df)

    # Raw counts of each label
    counts = df["score"].value_counts().sort_index()

    # Percentages of each label
    percentages = (counts / total * 100).round(1)

    # Special targets
    n_enough = counts.get("enough_info", 0)
    n_enough_perfect = counts.get("enough_info_and_follow_perfectly", 0)
    n_either = n_enough + n_enough_perfect

    pct_enough = 100 * n_enough / total
    pct_enough_perfect = 100 * n_enough_perfect / total
    pct_either = 100 * n_either / total

    print(f"=== Overall Score Distribution: {name} ===")
    print(f"Total samples: {total}\n")

    # Print table of counts + percentages for each label
    print("Per-label counts and percentages:")
    for label in counts.index:
        print(f"- {label}: {counts[label]} ({percentages[label]}%)")

    print("\nTarget categories:")
    print(f"- enough_info: {n_enough} ({pct_enough:.1f}%)")
    print(f"- enough_info_and_follow_perfectly: {n_enough_perfect} ({pct_enough_perfect:.1f}%)")
    print(f"- either: {n_either} ({pct_either:.1f}%)")
    print()


In [7]:

pct_overall(prefill_df, "Prefill")


=== Overall Score Distribution: Prefill ===
Total samples: 4400

Per-label counts and percentages:
- almost_enough_info: 24 (0.5%)
- enough_info: 210 (4.8%)
- out_of_context: 240 (5.5%)
- refusal: 99 (2.2%)
- refusal_and_justification: 277 (6.3%)
- related_but_no_info: 3550 (80.7%)

Target categories:
- enough_info: 210 (4.8%)
- enough_info_and_follow_perfectly: 0 (0.0%)
- either: 210 (4.8%)



In [8]:
pct_overall(unsteered_default_df, "Unsteered + No Jailbreak")


=== Overall Score Distribution: Unsteered + No Jailbreak ===
Total samples: 4400

Per-label counts and percentages:
- almost_enough_info: 6 (0.1%)
- enough_info: 51 (1.2%)
- other: 1 (0.0%)
- out_of_context: 137 (3.1%)
- refusal: 416 (9.5%)
- refusal_and_justification: 325 (7.4%)
- related_but_no_info: 3463 (78.7%)
- related_but_no_info.: 1 (0.0%)

Target categories:
- enough_info: 51 (1.2%)
- enough_info_and_follow_perfectly: 0 (0.0%)
- either: 51 (1.2%)



In [None]:
print(unsteered_default_df.keys())

Index(['id', 'role', 'prompt_id', 'question_id', 'prompt', 'question',
       'harm_category', 'persona', 'response', 'magnitude', 'score',
       'analysis'],
      dtype='object')


## Plot Response Distribution

In [10]:
# Define all response categories and their colors
ALL_LABELS = [
    "enough_info",
    "enough_info_and_follow_perfectly", 
    "almost_enough_info",
    "related_but_no_info",
    "refusal",
    "refusal_and_justification",
    "out_of_context",
    "other"
]

COLORS = {
    "enough_info_and_follow_perfectly": "#e41a1c",  # red
    "enough_info": "#ff7f00",                      # orange
    "almost_enough_info": "#ffff33",               # yellow
    "related_but_no_info": "#1f78b4",              # cyan/blueish
    "refusal": "#b2df8a",                          # lime green
    "refusal_and_justification": "#33a02c",        # green
    "out_of_context": "#6a3d9a",                   # purple
    "other": "#999999",                            # grey
}

LABEL_MAP = {
    "enough_info": "Harmful Info in Response",
    "enough_info_and_follow_perfectly": "Harmful Info & Perfectly Following Request",
    "refusal": "Refusal",
    "refusal_and_justification": "Refusal & Justification",
    "out_of_context": "Out of Context",
    "related_but_no_info": "Related",
    "almost_enough_info": "Almost Harmful",
    "other": "Other",
}

def calculate_percentages(df, name):
    """Calculate percentages for all response categories"""
    total = len(df)
    counts = df["score"].value_counts()
    
    # Ensure all labels are present
    percentages = {}
    actual_counts = {}
    for label in ALL_LABELS:
        count = counts.get(label, 0)
        actual_counts[label] = count
        percentages[label] = (count / total * 100) if total > 0 else 0
    
    return percentages, actual_counts, total

# Calculate data for both conditions
default_pct, default_counts, default_total = calculate_percentages(unsteered_default_df, "Default")
prefill_pct, prefill_counts, prefill_total = calculate_percentages(prefill_df, "Prefill")

print("Default Response percentages:")
for label in ALL_LABELS:
    if default_counts[label] > 0:
        print(f"  {LABEL_MAP[label]}: {default_pct[label]:.1f}% ({default_counts[label]}/{default_total})")

print("\nPrefilled Response percentages:")
for label in ALL_LABELS:
    if prefill_counts[label] > 0:
        print(f"  {LABEL_MAP[label]}: {prefill_pct[label]:.1f}% ({prefill_counts[label]}/{prefill_total})")

Default Response percentages:
  Harmful Info in Response: 1.2% (51/4400)
  Almost Harmful: 0.1% (6/4400)
  Related: 78.7% (3463/4400)
  Refusal: 9.5% (416/4400)
  Refusal & Justification: 7.4% (325/4400)
  Out of Context: 3.1% (137/4400)
  Other: 0.0% (1/4400)

Prefilled Response percentages:
  Harmful Info in Response: 4.8% (210/4400)
  Almost Harmful: 0.5% (24/4400)
  Related: 80.7% (3550/4400)
  Refusal: 2.2% (99/4400)
  Refusal & Justification: 6.3% (277/4400)
  Out of Context: 5.5% (240/4400)


In [22]:
# Create the unified stacked bar plot
fig = go.Figure()

# X positions for the two bars
x_positions = [0, 1]  # Default, Prefill
x_labels = ["Baseline", "1 Role-Playing Turn"]
bar_width = 0.4

# Add stacked bars for each response type
for label in ALL_LABELS:
    # Only add traces for labels that have data in at least one condition
    if default_counts[label] > 0 or prefill_counts[label] > 0:
        fig.add_trace(go.Bar(
            x=x_positions,
            y=[default_pct[label], prefill_pct[label]],
            name=LABEL_MAP[label],
            marker_color=COLORS[label],
            width=bar_width,
            text=[f"{default_pct[label]:.1f}%" if default_pct[label] >= 4.0 else "", 
                  f"{prefill_pct[label]:.1f}%" if prefill_pct[label] >= 4.0 else ""],
            textposition="inside",
            hovertemplate=(
                "%{fullData.name}<br>"
                "Condition: %{customdata[2]}<br>"
                "Percentage: %{y:.1f}%<br>"
                "Count: %{customdata[0]} / %{customdata[1]}"
                "<extra></extra>"
            ),
            customdata=[
                [default_counts[label], default_total, "Default Response"],
                [prefill_counts[label], prefill_total, "Prefilled Responses"]
            ]
        ))

# Update layout
fig.update_layout(
    barmode="stack",
    width=800,
    height=700,
    title={
        "text": "Responses to Harmful Questions with High Projection on Role PC1",
        "subtitle": {
            "text": subtitle + ", Prefilled with 1 Turn from Spiral Transcript",
        }
    },
    xaxis=dict(
        title="Condition",
        tickmode="array",
        tickvals=x_positions,
        ticktext=x_labels,
        showgrid=True,
        gridcolor="lightgray",
    ),
    yaxis=dict(
        title="Percentage of Responses",
        range=[0, 105],
        showgrid=True,
        gridcolor="lightgray"
    ),
    legend=dict(
        orientation="v",
        yanchor="top",
        y=1.0,
        xanchor="left",
        x=1.02
    ),
    bargap=0.4,
)

fig.show()
fig.write_html(f"{out_dir}/jailbreak_spiral_turn1.html")