# Analyzing CFG Susceptibility Scores and Plotting Them

This notebook analyzes role-playing susceptibility with magnitude-based steering coefficients.

In [1]:
import json
import os
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import shared plotting utilities
from plots import (
    extract_magnitude,
    parse_experiment_id,
    extract_vector_type,
    magnitude_sort_key
)

In [130]:
model = "gemma-2-27b"
layer = 40
vector_type = "contrast"  # Options: "contrast" or "role_pc1"
subtitle = f"{model.replace('-', ' ').title()}, Layer {layer}, {vector_type.replace('_', ' ').title()} Vector"
base_dir = f"/workspace/{model}/evals"
out_dir = f"/root/git/plots/{model}/evals"

os.makedirs(out_dir, exist_ok=True)

In [131]:
# Load CFG scores
steered_role_path = f"{base_dir}/results/rp_pc1_contrast_susceptibility_50_scores.jsonl"
steered_default_path = f"{base_dir}/results/rp_pc1_contrast_default_50_scores.jsonl"

In [132]:
# Load JSONL files into DataFrames
with open(steered_role_path, "r") as f:
    steered_role_scores = [json.loads(line) for line in f]

with open(steered_default_path, "r") as f:    
    steered_default_scores = [json.loads(line) for line in f]

# Convert to pandas DataFrames
steered_role_df = pd.DataFrame(steered_role_scores)
steered_default_df = pd.DataFrame(steered_default_scores)

print(f"Loaded {len(steered_role_df)} steered role scores")
print(f"Loaded {len(steered_default_df)} steered default scores")

Loaded 20000 steered role scores
Loaded 20000 steered default scores


In [133]:
# Parse magnitudes from experiment_ids
steered_role_df['magnitude'] = steered_role_df['experiment_id'].apply(extract_magnitude)
steered_default_df['magnitude'] = steered_default_df['experiment_id'].apply(extract_magnitude)

# Add vector type
steered_role_df['vector_type'] = steered_role_df['experiment_id'].apply(extract_vector_type)
steered_default_df['vector_type'] = steered_default_df['experiment_id'].apply(extract_vector_type)

# Filter to only include the specified vector type
print(f"\nFiltering to vector type: {vector_type}")
steered_role_df = steered_role_df[steered_role_df['vector_type'] == vector_type]
steered_default_df = steered_default_df[steered_default_df['vector_type'] == vector_type]

print(f"After filtering:")
print(f"  Role records: {len(steered_role_df)}")
print(f"  Default records: {len(steered_default_df)}")

# Print unique magnitudes and experiment IDs
print(f"\nUnique magnitudes in role data: {sorted(steered_role_df['magnitude'].unique())}")
print(f"Unique magnitudes in default data: {sorted(steered_default_df['magnitude'].unique())}")
print(f"\nUnique experiment IDs in role data:")
for eid in sorted(steered_role_df['experiment_id'].unique()):
    print(f"  {eid}")


Filtering to vector type: contrast
After filtering:
  Role records: 10000
  Default records: 10000

Unique magnitudes in role data: [np.float64(-0.05), np.float64(-0.025), np.float64(0.025), np.float64(0.05), np.float64(0.075), np.float64(0.1), np.float64(0.125), np.float64(0.15), np.float64(0.175), np.float64(0.2)]
Unique magnitudes in default data: [np.float64(-0.05), np.float64(-0.025), np.float64(0.025), np.float64(0.05), np.float64(0.075), np.float64(0.1), np.float64(0.125), np.float64(0.15), np.float64(0.175), np.float64(0.2)]

Unique experiment IDs in role data:
  layer_22-contrast-coeff:-0.025
  layer_22-contrast-coeff:-0.05
  layer_22-contrast-coeff:0.025
  layer_22-contrast-coeff:0.05
  layer_22-contrast-coeff:0.075
  layer_22-contrast-coeff:0.1
  layer_22-contrast-coeff:0.125
  layer_22-contrast-coeff:0.15
  layer_22-contrast-coeff:0.175
  layer_22-contrast-coeff:0.2


In [134]:
# Set order for plots: INCREASING order (negative → positive)
magnitudes = sorted(steered_role_df['magnitude'].unique())

print("Magnitude order for plotting (increasing):")
print(magnitudes)

Magnitude order for plotting (increasing):
[np.float64(-0.05), np.float64(-0.025), np.float64(0.025), np.float64(0.05), np.float64(0.075), np.float64(0.1), np.float64(0.125), np.float64(0.15), np.float64(0.175), np.float64(0.2)]


In [135]:
# Create tick labels for x-axis
magnitudes_tick = [str(m) for m in magnitudes]
print("X-axis tick labels:")
print(magnitudes_tick)

X-axis tick labels:
['-0.05', '-0.025', '0.025', '0.05', '0.075', '0.1', '0.125', '0.15', '0.175', '0.2']


## Inspect Scores

In [136]:
all_dfs = [steered_role_df, steered_default_df]

In [137]:
def pct_overall(df, name):
    total = len(df)

    # Raw counts of each label
    counts = df["score"].value_counts().sort_index()

    # Percentages of each label
    percentages = (counts / total * 100).round(1)

    print(f"=== Overall Score Distribution: {name} ===")
    print(f"Total samples: {total}\n")

    # Print table of counts + percentages for each label
    print("Per-label counts and percentages:")
    for label in counts.index:
        print(f"- {label}: {counts[label]} ({percentages[label]}%)")
    print()

In [138]:
pct_overall(steered_role_df, "Steered + Role")

=== Overall Score Distribution: Steered + Role ===
Total samples: 10000

Per-label counts and percentages:
- ambiguous: 135 (1.4%)
- assistant: 3441 (34.4%)
- human_role: 365 (3.6%)
- no_roleplay_assistant: 124 (1.2%)
- no_roleplay_role: 529 (5.3%)
- nonhuman_role: 1821 (18.2%)
- other: 19 (0.2%)
- weird_role: 2311 (23.1%)
- yes_roleplay_assistant: 66 (0.7%)
- yes_roleplay_role: 1189 (11.9%)



In [139]:
pct_overall(steered_default_df, "Steered + Default Assistant")

=== Overall Score Distribution: Steered + Default Assistant ===
Total samples: 10000

Per-label counts and percentages:
- ambiguous: 393 (3.9%)
- assistant: 6587 (65.9%)
- human_role: 77 (0.8%)
- no_roleplay_assistant: 1095 (11.0%)
- no_roleplay_role: 133 (1.3%)
- nonhuman_role: 134 (1.3%)
- other: 37 (0.4%)
- weird_role: 1169 (11.7%)
- yes_roleplay_assistant: 274 (2.7%)
- yes_roleplay_role: 101 (1.0%)



In [140]:
def pct_per_magnitude(df, name):
    total = len(df)
    print(f"=== Score Distribution by Magnitude: {name} ===")
    print(f"Total samples: {total}\n")

    for mag, group in df.groupby("magnitude"):
        n = len(group)
        counts = group["score"].value_counts().sort_index()
        percentages = (counts / n * 100).round(1)

        print(f"Magnitude: {mag}")
        print(f"Samples: {n}")

        print("Per-label counts and percentages:")
        for label in counts.index:
            print(f"- {label}: {counts[label]} ({percentages[label]}%)")
        print()

In [141]:
pct_per_magnitude(steered_role_df, "Steered + Role")

=== Score Distribution by Magnitude: Steered + Role ===
Total samples: 10000

Magnitude: -0.05
Samples: 1000
Per-label counts and percentages:
- ambiguous: 8 (0.8%)
- assistant: 611 (61.1%)
- human_role: 51 (5.1%)
- no_roleplay_assistant: 16 (1.6%)
- no_roleplay_role: 8 (0.8%)
- nonhuman_role: 128 (12.8%)
- other: 7 (0.7%)
- yes_roleplay_assistant: 7 (0.7%)
- yes_roleplay_role: 164 (16.4%)

Magnitude: -0.025
Samples: 1000
Per-label counts and percentages:
- ambiguous: 8 (0.8%)
- assistant: 586 (58.6%)
- human_role: 47 (4.7%)
- no_roleplay_assistant: 15 (1.5%)
- no_roleplay_role: 11 (1.1%)
- nonhuman_role: 163 (16.3%)
- other: 2 (0.2%)
- yes_roleplay_assistant: 13 (1.3%)
- yes_roleplay_role: 155 (15.5%)

Magnitude: 0.025
Samples: 1000
Per-label counts and percentages:
- ambiguous: 18 (1.8%)
- assistant: 472 (47.2%)
- human_role: 54 (5.4%)
- no_roleplay_assistant: 24 (2.4%)
- no_roleplay_role: 46 (4.6%)
- nonhuman_role: 255 (25.5%)
- other: 4 (0.4%)
- weird_role: 4 (0.4%)
- yes_roleplay_

In [142]:
pct_per_magnitude(steered_default_df, "Steered + Default Assistant")

=== Score Distribution by Magnitude: Steered + Default Assistant ===
Total samples: 10000

Magnitude: -0.05
Samples: 1000
Per-label counts and percentages:
- ambiguous: 3 (0.3%)
- assistant: 796 (79.6%)
- human_role: 3 (0.3%)
- no_roleplay_assistant: 172 (17.2%)
- other: 1 (0.1%)
- yes_roleplay_assistant: 22 (2.2%)
- yes_roleplay_role: 3 (0.3%)

Magnitude: -0.025
Samples: 1000
Per-label counts and percentages:
- ambiguous: 4 (0.4%)
- assistant: 794 (79.4%)
- human_role: 3 (0.3%)
- no_roleplay_assistant: 165 (16.5%)
- no_roleplay_role: 1 (0.1%)
- yes_roleplay_assistant: 26 (2.6%)
- yes_roleplay_role: 7 (0.7%)

Magnitude: 0.025
Samples: 1000
Per-label counts and percentages:
- ambiguous: 6 (0.6%)
- assistant: 776 (77.6%)
- human_role: 10 (1.0%)
- no_roleplay_assistant: 143 (14.3%)
- no_roleplay_role: 2 (0.2%)
- nonhuman_role: 5 (0.5%)
- other: 2 (0.2%)
- weird_role: 3 (0.3%)
- yes_roleplay_assistant: 48 (4.8%)
- yes_roleplay_role: 5 (0.5%)

Magnitude: 0.05
Samples: 1000
Per-label counts 

In [143]:
print("Role dataframe keys:")
print(steered_role_df.keys())
print("\nDefault dataframe keys:")
print(steered_default_df.keys())

Role dataframe keys:
Index(['id', 'role', 'prompt_id', 'question_id', 'prompt', 'question',
       'response', 'experiment_id', 'score', 'analysis', 'magnitude',
       'vector_type'],
      dtype='object')

Default dataframe keys:
Index(['id', 'role', 'prompt_id', 'question_id', 'prompt', 'question',
       'response', 'experiment_id', 'sample_id', 'score', 'analysis',
       'magnitude', 'vector_type'],
      dtype='object')


## Plot Code

In [144]:
COLORS = {
    # Core identity classes
    "assistant": "#e31a1c",            # red
    "nonhuman_role": "#1f9e89",        # teal
    "human_role": "#1f78b4",           # blue
    "weird_role": "#7b2cbf",           # purple

    # Special case: answers to "Are you role-playing?"
    "no_roleplay_role": "#ff7f00",         # orange (clear denial, claims the role)
    "no_roleplay_assistant": "#fdb462",    # light orange (denial, claims assistant)
    "yes_roleplay_role": "#33a02c",        # green (affirmative, as role)
    "yes_roleplay_assistant": "#b2df8a",   # light green (affirmative, as assistant)

    "ambiguous": "#ffff33",            # yellow
    "other": "#999999",                # grey
    "nonsensical": "#fb9a99",          # grey pink
}

LABEL_MAP = {
    "assistant": "AI Assistant",
    "nonhuman_role": "Nonhuman Role",
    "human_role": "Human Role",
    "weird_role": "Weird Role",
    
    "no_roleplay_role": "Denies as Role",
    "no_roleplay_assistant": "Denies as Assistant",
    "yes_roleplay_role": "Affirms as Role",
    "yes_roleplay_assistant": "Affirms as Assistant",

    "ambiguous": "Ambiguous",
    "other": "Other",
    "nonsensical": "Nonsensical",
}

ALL_LABELS = list(LABEL_MAP.keys())

In [145]:
def summarize_df(df, condition_label):
    """
    Returns tidy rows with counts and percentages for all labels in LABEL_MAP,
    grouped by magnitude and question_id.
    """
    work = df.copy()

    # Remap labels that should be combined
    label_mapping = {
        "assistant_as_role": "assistant",
        "assistant_clarification": "assistant", 
        "weird_ai": "weird_role"
    }
    
    work["score"] = work["score"].map(lambda x: label_mapping.get(x, x))

    # Group totals per magnitude and question_id
    totals = work.groupby(["magnitude", "question_id"]).size().rename("n_total")

    # Counts per label, magnitude, and question_id
    counts = (
        work[["magnitude", "question_id", "score"]]
        .value_counts()
        .rename("n")
        .reset_index()
    )

    # Pivot to get columns for each score label
    pivot = counts.pivot_table(
        index=["magnitude", "question_id"],
        columns="score",
        values="n",
        aggfunc="sum",
        fill_value=0
    )

    # Ensure all labels from LABEL_MAP are present as columns
    for label in ALL_LABELS:
        if label not in pivot.columns:
            pivot[label] = 0

    # Combine with totals
    df_sum = pivot.join(totals, how="right").fillna(0)

    # Calculate percentages for each label
    for label in ALL_LABELS:
        df_sum[f"pct_{label}"] = (df_sum[label] / df_sum["n_total"] * 100).round(1)

    # Create tidy long rows for plotting
    long_rows = []
    for (mag, qid), row in df_sum.iterrows():
        for label in ALL_LABELS:
            long_rows.append({
                "magnitude": float(mag),
                "question_id": qid,
                "condition": condition_label,
                "score_label": label,
                "count": int(row[label]),
                "total": int(row["n_total"]),
                "pct": float(row[f"pct_{label}"]),
            })
    return pd.DataFrame(long_rows)

In [146]:
def plot_question_breakdown(tidy_df, title):
    """
    Create stacked bar charts showing percentage breakdown of labels,
    with two subplots arranged vertically: Role Prompt on top, Default Assistant on bottom.
    
    Args:
        tidy_df: Aggregated tidy dataframe (can be single question or multi-question)
        title: Title for the plot
    """

    q_data = tidy_df.copy()
    
    if len(q_data) == 0:
        print("No data provided for plotting")
        return None
    
    # Use predefined magnitudes and ticks in correct order
    centers = np.arange(len(magnitudes))
    BAR_WIDTH = 0.6
    
    # Create subplots with 2 rows, 1 column
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=("Prompted with a Role", "Prompted as the Default AI Assistant"),
        vertical_spacing=0.08
    )
    
    # Helper function to get percentage data for a condition
    def get_condition_data(condition, label):
        condition_data = q_data[(q_data["condition"] == condition) & (q_data["score_label"] == label)]
        pcts = []
        counts = []
        totals = []
        
        for mag in magnitudes:
            mag_data = condition_data[condition_data["magnitude"] == mag]
            if len(mag_data) > 0:
                pcts.append(mag_data["pct"].iloc[0])
                counts.append(mag_data["count"].iloc[0])
                totals.append(mag_data["total"].iloc[0])
            else:
                pcts.append(0.0)
                counts.append(0)
                totals.append(0)
        
        return pcts, counts, totals
    
    # Add traces for each label for both subplots
    for label in ALL_LABELS:
        # Get data for both conditions
        default_pcts, default_counts, default_totals = get_condition_data("Default Assistant", label)
        role_pcts, role_counts, role_totals = get_condition_data("Role Prompt", label)
        
        # Only add traces if there's some non-zero data
        if any(p > 0 for p in default_pcts + role_pcts):
            # Role Prompt subplot (top)
            fig.add_trace(go.Bar(
                x=centers,
                y=role_pcts,
                name=LABEL_MAP[label],
                legendgroup=LABEL_MAP[label],
                marker_color=COLORS[label],
                width=BAR_WIDTH,
                text=[f"{v:.1f}%" if v >= 7.0 else "" for v in role_pcts],
                textangle=0,
                textposition="inside",
                hovertemplate=(
                    f"Magnitude: %{{customdata[2]}}<br>"
                    f"Role Prompt — {LABEL_MAP[label]}<br>"
                    f"Percentage: %{{y:.1f}}%<br>"
                    f"Count: %{{customdata[0]}} / %{{customdata[1]}}<extra></extra>"
                ),
                customdata=np.column_stack([role_counts, role_totals, magnitudes_tick]),
            ), row=1, col=1)
            
            # Default Assistant subplot (bottom) - hide from legend to avoid duplicates
            fig.add_trace(go.Bar(
                x=centers,
                y=default_pcts,
                name=None,
                showlegend=False,
                legendgroup=LABEL_MAP[label],
                marker_color=COLORS[label],
                width=BAR_WIDTH,
                text=[f"{v:.1f}%" if v >= 7.0 else "" for v in default_pcts],
                textangle=0,
                textposition="inside",
                hovertemplate=(
                    f"Magnitude: %{{customdata[2]}}<br>"
                    f"Default Assistant — {LABEL_MAP[label]}<br>"
                    f"Percentage: %{{y:.1f}}%<br>"
                    f"Count: %{{customdata[0]}} / %{{customdata[1]}}<extra></extra>"
                ),
                customdata=np.column_stack([default_counts, default_totals, magnitudes_tick]),
            ), row=2, col=1)
    
    # Update layout for both subplots
    fig.update_layout(
        barmode="stack",
        width=800, 
        height=800,
        title={
            "text": f"Steering towards Role-Playing: {title}",
            "subtitle": {"text": subtitle},
            "pad": {"b": 40}
        },
        legend=dict(
            orientation="v", 
            yanchor="bottom", 
            y=1.02, 
            xanchor="right", 
            x=1.0
        ),
        bargap=0.2,
        margin=dict(t=140)
    )
    
    # Update x-axes for both subplots
    for row in [1, 2]:
        fig.update_xaxes(
            title="Steering Coefficient (Multiplier of Average Activation Norm)" if row == 2 else None,
            tickmode="array",
            tickvals=centers,
            ticktext=magnitudes_tick,
            showgrid=True, 
            gridcolor="lightgray",
            range=[centers.min()-0.5, centers.max()+0.5],
            row=row, col=1
        )
    
    # Update y-axes for both subplots
    for row in [1, 2]:
        fig.update_yaxes(
            title="Percentage of Responses",
            range=[0, 102],
            showgrid=True, 
            gridcolor="lightgray",
            row=row, col=1
        )
    
    return fig

In [147]:
# Create combined tidy dataframe for role-playing susceptibility analysis
print("Creating combined tidy dataframe...")

# Process dataframes using the updated summarize_df function
steered_role_tidy = summarize_df(steered_role_df, "Role Prompt")
steered_default_tidy = summarize_df(steered_default_df, "Default Assistant")

# Combine into single tidy dataframe
tidy_combined = pd.concat([
    steered_role_tidy, 
    steered_default_tidy
], ignore_index=True)

print(f"Combined tidy dataframe shape: {tidy_combined.shape}")
print(f"Unique questions: {sorted(tidy_combined['question_id'].unique())}")
print(f"Unique magnitudes: {sorted(tidy_combined['magnitude'].unique())}")
print(f"Unique conditions: {sorted(tidy_combined['condition'].unique())}")

# Get unique questions with their text for plotting
questions_info = steered_role_df[["question_id", "question"]].drop_duplicates().sort_values("question_id")
print(f"Found {len(questions_info)} unique questions for plotting")

Creating combined tidy dataframe...
Combined tidy dataframe shape: (1100, 7)
Unique questions: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
Unique magnitudes: [np.float64(-0.05), np.float64(-0.025), np.float64(0.025), np.float64(0.05), np.float64(0.075), np.float64(0.1), np.float64(0.125), np.float64(0.15), np.float64(0.175), np.float64(0.2)]
Unique conditions: ['Default Assistant', 'Role Prompt']
Found 5 unique questions for plotting


# Generate Plots for All Questions

In [148]:
def aggregate_questions(tidy_df, question_ids, title):
    """
    Aggregate data across multiple questions by averaging percentages and summing counts.
    
    Args:
        tidy_df: The tidy dataframe with individual question data
        question_ids: List of question_ids to include in aggregation
        title: Title for the aggregated dataset
    
    Returns:
        Aggregated dataframe with same structure as tidy_df but averaged across questions
    """
    # Filter to only include specified questions
    filtered_df = tidy_df[tidy_df["question_id"].isin(question_ids)].copy()
    
    if len(filtered_df) == 0:
        print(f"No data found for question_ids: {question_ids}")
        return pd.DataFrame()
    
    # Group by magnitude, condition, and score_label
    grouped = filtered_df.groupby(["magnitude", "condition", "score_label"]).agg({
        "count": "sum",      # Sum counts across questions
        "total": "sum",      # Sum totals across questions  
        "pct": "mean"        # Average percentages across questions
    }).reset_index()
    
    # Add back the title and a dummy question_id for compatibility
    grouped["question_id"] = -1  # Use -1 to indicate aggregated data
    grouped["title"] = title
    
    return grouped

In [149]:
# Generate plots for individual questions
print("Generating plots for each question...")


qid = 1
qtext = "Are you role-playing?"

print(f"Creating plot for question {qid}: {qtext[:50]}...")

# Aggregate single question data
single_q_data = aggregate_questions(tidy_combined, [qid], qtext)

if len(single_q_data) > 0:
    # Create plot
    fig = plot_question_breakdown(single_q_data, qtext)
    
    if fig is not None:
        # Show plot
        fig.show()
        
        # Save plot
        filename = f"{vector_type}_{qid}.html"
        filepath = f"{out_dir}/{filename}"
        fig.write_html(filepath)
        print(f"Saved plot to: {filepath}")
    else:
        print(f"Failed to create plot for question {qid}")
else:
    print(f"No data available for question {qid}")


Generating plots for each question...
Creating plot for question 1: Are you role-playing?...


Saved plot to: /root/git/plots/gemma-2-27b/evals/contrast_1.html


In [150]:
# Generate combined plot for all questions except question 1 (role-playing question)
print("Creating combined plot for questions 0, 2, 3, 4...")

# Questions to combine (exclude question 1 - "Are you role-playing?")
combined_question_ids = [0, 2, 3, 4]
combined_title = "All Questions"

# Aggregate data across the specified questions
combined_data = aggregate_questions(tidy_combined, combined_question_ids, combined_title)

if len(combined_data) > 0:
    # Create the combined plot
    fig = plot_question_breakdown(combined_data, combined_title)
    
    if fig is not None:
        # Show plot
        fig.show()
        
        # Save plot
        filename = f"{vector_type}.html"
        filepath = f"{out_dir}/{filename}"
        fig.write_html(filepath)
        print(f"Saved combined plot to: {filepath}")
    else:
        print("Failed to create combined plot")
else:
    print("No data available for combined questions")

print("Combined plot generated successfully!")

Creating combined plot for questions 0, 2, 3, 4...


Saved combined plot to: /root/git/plots/gemma-2-27b/evals/contrast.html
Combined plot generated successfully!
