# Make a file with the vectors we want to cap on

In [48]:
import torch
import os
import sys

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import L2MeanScaler, MeanScaler

In [49]:
# Configuration: the model and layer the capping vectors are built for.
model_name = "qwen-3-32b"
layer = 32
# Root directory for this model; the paths built in this notebook hang off it.
base_dir = f"/workspace/{model_name}"

# Destination of the saved vector list. Its parent directory is created up front
# so that writing the file later cannot fail on a missing folder.
output_file = f"{base_dir}/evals/capping_vectors.pt"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

## Make vectors

In [4]:
# Candidate directions collected into `vectors`:
#   - standardscaler role_trait PC1
#   - standardscaler role PC1
#   - l2meanscaler role_trait PC1
#   - l2meanscaler role PC1
#   - meanscaler role_trait PC1
#   - meanscaler role PC1
#   - mean(roles[pos_3]) - default
# Every entry is a dict with the keys 'scaler', 'name', 'vector' and 'layer'.

vectors = []

In [6]:
# File-name suffixes, one per scaler variant of the saved PCA results.
scalers = ["", "_normalized", "_mean"]

for scaler in scalers:
    # PC1 of the PCA fitted on role and trait vectors together.
    # weights_only=False: presumably the file holds pickled PCA/scaler objects -- confirm.
    combined_results = torch.load(
        f"{base_dir}/roles_traits/pca/layer{layer}{scaler}_roles_pos23_traits_pos40-100.pt",
        weights_only=False,
    )
    vectors.append({
        'scaler': combined_results['scaler'],
        'name': f"roles_traits/pca/layer{layer}{scaler}_roles_pos23_traits_pos40-100/pc1",
        'vector': combined_results['pca'].components_[0],
        'layer': layer,
    })

    # PC1 of the PCA fitted on role vectors only.
    # `roles_results` must remain a module-level name: the contrast-vector cell
    # reads roles_results['vectors'] after this loop has finished.
    roles_results = torch.load(
        f"{base_dir}/roles_240/pca/layer{layer}{scaler}_pos23.pt",
        weights_only=False,
    )
    vectors.append({
        'scaler': roles_results['scaler'],
        'name': f"roles/pca/layer{layer}{scaler}_pos23/pc1",
        'vector': roles_results['pca'].components_[0],
        'layer': layer,
    })
    

In [8]:
# Sanity check: shape of the first collected direction.
print(vectors[0]['vector'].shape)

(5120,)


In [19]:
# Average role vector at `layer`: stack the 'pos_3' entries, pick `layer` along dim 1,
# then average over dim 0.
# NOTE(review): `roles_results` is whatever the last iteration of the PCA loop loaded
# (the "_mean" file); presumably every variant stores the same 'vectors' -- confirm.
all_role_vecs = torch.stack(roles_results['vectors']['pos_3'])[:, layer, :].mean(dim=0)
# The 'default_1' activation at the same layer.
default_vectors = torch.load(f"{base_dir}/roles_240/default_vectors.pt")['activations']['default_1'][layer]

# Difference between the average role vector and the default activation.
contrast = all_role_vecs - default_vectors

# Rescale by the norm taken over dim 0 (assumes `contrast` is 1-D, so this is the
# norm of the whole vector -- TODO confirm).
normalized_contrast = contrast / torch.norm(contrast, dim=0)


In [20]:
# Register the normalized contrast direction next to the PCA components.
# It carries no fitted scaler, hence 'scaler': None.
vectors.append({
    'scaler': None,
    'name': "contrast_role_pos3_default1",
    'vector': normalized_contrast,
    'layer': layer
})


In [22]:
# Number of directions collected so far.
print(len(vectors))

7


In [None]:
# Flip some vectors: start from the list stored in `output_file`.
# NOTE(review): this rebinds `vectors`, so any list built in memory by earlier
# cells is replaced by whatever is currently on disk.
vectors = torch.load(output_file, weights_only=False)
print([v['name'] for v in vectors])


['roles_traits/pca/layer32_roles_pos23_traits_pos40-100/pc1', 'roles/pca/layer32_pos23/pc1', 'roles_traits/pca/layer32_normalized_roles_pos23_traits_pos40-100/pc1', 'roles/pca/layer32_normalized_pos23/pc1', 'roles_traits/pca/layer32_mean_roles_pos23_traits_pos40-100/pc1', 'roles/pca/layer32_mean_pos23/pc1', 'contrast_role_pos3_default1']


In [55]:
# Names of the directions whose sign is inverted.
to_flip = [
    f"roles_traits/pca/layer{layer}_normalized_roles_pos23_traits_pos40-100/pc1",
    f"roles_traits/pca/layer{layer}_mean_roles_pos23_traits_pos40-100/pc1",
    f"roles/pca/layer{layer}_mean_pos23/pc1",
]

for v in vectors:
    # Negate each listed vector at most once. This notebook reloads `vectors` from
    # the saved file and writes the result back to the same file, so without a
    # marker a second run would negate the already-flipped vectors again.
    # The 'flipped' key travels with the vector dict when it is saved.
    if v['name'] in to_flip and not v.get('flipped', False):
        v['vector'] = -v['vector']
        v['flipped'] = True


In [56]:
# Write the current vector list to `output_file`.
torch.save(vectors, output_file)
print(f"Saved {len(vectors)} vectors to {output_file}")

Saved 7 vectors to /workspace/qwen-3-32b/evals/capping_vectors.pt


## Analysis: Load projections and analyze by score

In [64]:
import json
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def _read_jsonl(path):
    """Parse a JSON Lines file into a list with one decoded object per non-blank line."""
    # Pin UTF-8 so decoding does not depend on the platform's default encoding,
    # and skip blank lines, which json.loads would reject.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# Load both projections files
prompted_file = f"{base_dir}/evals/unsteered/unsteered_projections.jsonl"
default_file = f"{base_dir}/evals/unsteered/unsteered_default_projections.jsonl"

# Load prompted projections
prompted_records = _read_jsonl(prompted_file)
df_prompted = pd.DataFrame(prompted_records)
print(f"Loaded {len(df_prompted)} prompted records")

# Load default projections
default_records = _read_jsonl(default_file)
df_default = pd.DataFrame(default_records)
print(f"Loaded {len(df_default)} default records")
print(f"Columns: {df_prompted.columns.tolist()}")

Loaded 4400 prompted records
Loaded 4400 default records
Columns: ['id', 'role', 'prompt_id', 'question_id', 'prompt', 'question', 'harm_category', 'persona', 'response', 'magnitude', 'score', 'analysis', 'projections', 'projections_scaled']


In [65]:
# Function to expand projections and add binary label
def process_dataframe(df):
    """Flatten the projection dicts into columns and add a binary harm label.

    Args:
        df: DataFrame with a 'score' column and a 'projections' column of dicts;
            a 'projections_scaled' column of dicts is expanded too when present.

    Returns:
        A new DataFrame in which the dict columns are replaced by one
        'proj_<key>' (and 'proj_scaled_<key>') column per dict key, plus a
        'label' column that is 'harmful' for the scores 'enough_info' and
        'enough_info_and_follow_perfectly' and 'not harmful' otherwise.
        The rows and index of `df` are preserved.
    """
    harmful_scores = ('enough_info', 'enough_info_and_follow_perfectly')

    # Expand projections. json_normalize numbers its rows 0..n-1, so re-attach
    # df's own index; otherwise the column-wise concat below aligns on labels
    # and produces NaN-padded extra rows whenever df is not 0..n-1 indexed.
    projections_df = pd.json_normalize(df['projections'].tolist())
    projections_df.index = df.index
    projections_df.columns = [f"proj_{col}" for col in projections_df.columns]

    if 'projections_scaled' in df.columns:
        projections_scaled_df = pd.json_normalize(df['projections_scaled'].tolist())
        projections_scaled_df.index = df.index
        projections_scaled_df.columns = [f"proj_scaled_{col}" for col in projections_scaled_df.columns]
        df_expanded = pd.concat([df.drop(['projections', 'projections_scaled'], axis=1),
                                 projections_df, projections_scaled_df], axis=1)
    else:
        df_expanded = pd.concat([df.drop(['projections'], axis=1), projections_df], axis=1)

    # Create binary label: harmful vs not harmful
    df_expanded['label'] = df_expanded['score'].apply(
        lambda x: 'harmful' if x in harmful_scores else 'not harmful'
    )

    return df_expanded

# Process both dataframes
df_prompted_expanded = process_dataframe(df_prompted)
df_default_expanded = process_dataframe(df_default)

# Report the expanded projection columns and how the binary labels are
# distributed in each dataset.
print(f"Projection columns: {[col for col in df_prompted_expanded.columns if col.startswith('proj_')]}")
print(f"\nLabel distribution for prompted:")
print(df_prompted_expanded['label'].value_counts())
print(f"\nLabel distribution for default:")
print(df_default_expanded['label'].value_counts())

Projection columns: ['proj_roles_traits/pca/layer32_roles_pos23_traits_pos40-100/pc1', 'proj_roles/pca/layer32_pos23/pc1', 'proj_roles_traits/pca/layer32_normalized_roles_pos23_traits_pos40-100/pc1', 'proj_roles/pca/layer32_normalized_pos23/pc1', 'proj_roles_traits/pca/layer32_mean_roles_pos23_traits_pos40-100/pc1', 'proj_roles/pca/layer32_mean_pos23/pc1', 'proj_contrast_role_pos3_default1', 'proj_scaled_roles_traits/pca/layer32_roles_pos23_traits_pos40-100/pc1', 'proj_scaled_roles/pca/layer32_pos23/pc1', 'proj_scaled_roles_traits/pca/layer32_normalized_roles_pos23_traits_pos40-100/pc1', 'proj_scaled_roles/pca/layer32_normalized_pos23/pc1', 'proj_scaled_roles_traits/pca/layer32_mean_roles_pos23_traits_pos40-100/pc1', 'proj_scaled_roles/pca/layer32_mean_pos23/pc1']

Label distribution for prompted:
label
harmful        3663
not harmful     737
Name: count, dtype: int64

Label distribution for default:
label
not harmful    4349
harmful          51
Name: count, dtype: int64


In [66]:
# Mean projection per label (harmful vs not harmful), for both datasets.
projection_cols = [c for c in df_prompted_expanded.columns if c.startswith('proj_')]

label_averages_prompted = df_prompted_expanded.groupby('label')[projection_cols].mean()
label_counts_prompted = df_prompted_expanded['label'].value_counts()
label_averages_default = df_default_expanded.groupby('label')[projection_cols].mean()
label_counts_default = df_default_expanded['label'].value_counts()

# Same report layout for each dataset; only the first 5 columns are printed for brevity.
for heading, averages, counts in (
    ("PROMPTED PROJECTIONS", label_averages_prompted, label_counts_prompted),
    ("\n\nDEFAULT PROJECTIONS", label_averages_default, label_counts_default),
):
    print(heading)
    print("=" * 80)
    for label in averages.index:
        print(f"\n{label} (n={counts[label]}):")
        for col in projection_cols[:5]:
            print(f"  {col}: {averages.loc[label, col]:.4f}")

PROMPTED PROJECTIONS

harmful (n=3663):
  proj_roles_traits/pca/layer32_roles_pos23_traits_pos40-100/pc1: 13.4638
  proj_roles/pca/layer32_pos23/pc1: 14.2274
  proj_roles_traits/pca/layer32_normalized_roles_pos23_traits_pos40-100/pc1: -43.8407
  proj_roles/pca/layer32_normalized_pos23/pc1: -32.6411
  proj_roles_traits/pca/layer32_mean_roles_pos23_traits_pos40-100/pc1: -56.2857

not harmful (n=737):
  proj_roles_traits/pca/layer32_roles_pos23_traits_pos40-100/pc1: 5.0720
  proj_roles/pca/layer32_pos23/pc1: 6.3335
  proj_roles_traits/pca/layer32_normalized_roles_pos23_traits_pos40-100/pc1: -49.6177
  proj_roles/pca/layer32_normalized_pos23/pc1: -39.2733
  proj_roles_traits/pca/layer32_mean_roles_pos23_traits_pos40-100/pc1: -61.5433


DEFAULT PROJECTIONS

harmful (n=51):
  proj_roles_traits/pca/layer32_roles_pos23_traits_pos40-100/pc1: -12.9451
  proj_roles/pca/layer32_pos23/pc1: -11.0172
  proj_roles_traits/pca/layer32_normalized_roles_pos23_traits_pos40-100/pc1: -80.3787
  proj_roles/pc

## Caps and histograms for each projection


In [80]:
from collections import defaultdict

# Per projection column: the caps plus summary statistics of both label groups.
results = defaultdict(dict)

projection_cols = [col for col in df_prompted_expanded.columns if col.startswith('proj_')]

# Each cap is a low quantile of the harmful jailbreak (prompted) projections:
# the q-quantile leaves a (1 - q) share of those responses above the cap.
# Caps at the 1st, 10th and 25th percentiles therefore block out 99%, 90% and
# 75% of harmful responses.
percentiles = [0.01, 0.10, 0.25]
# Derived from `percentiles` so the displayed labels cannot drift from the quantiles used.
block_percentages = [round((1 - q) * 100) for q in percentiles]

# The harmful subset of the prompted data is the same for every column; filter once.
prompted_harmful = df_prompted_expanded[df_prompted_expanded['label'] == 'harmful']

for col in projection_cols:
    # Only use harmful responses from jailbreak (prompted) dataset
    harmful = prompted_harmful[col].dropna().values

    if len(harmful) == 0:
        continue

    # Compute percentiles of harmful distribution
    caps = np.quantile(harmful, percentiles).tolist()

    # Also get stats from both datasets for reference
    both = pd.concat([df_prompted_expanded[[col, 'label']], df_default_expanded[[col, 'label']]])
    harmful_all = both[both['label'] == 'harmful'][col].dropna().values
    not_harmful_all = both[both['label'] == 'not harmful'][col].dropna().values

    results[col] = {
        "caps": caps,
        "percentiles": percentiles,
        "block_percentages": block_percentages,
        "harmful_mean": float(np.mean(harmful_all)),
        "not_harmful_mean": float(np.mean(not_harmful_all)),
        "harmful_std": float(np.std(harmful_all)),
        "not_harmful_std": float(np.std(not_harmful_all))
    }

# Show sample results
print("Sample cap computation results:")
print("=" * 80)
if results:
    # First column that actually received caps. Indexing projection_cols[0]
    # would fail when there are no columns or when that column was skipped.
    sample_col = next(iter(results))
    print(f"\n{sample_col}:")
    print(f"  Harmful mean: {results[sample_col]['harmful_mean']:.4f} ± {results[sample_col]['harmful_std']:.4f}")
    print(f"  Not harmful mean: {results[sample_col]['not_harmful_mean']:.4f} ± {results[sample_col]['not_harmful_std']:.4f}")
    print(f"  Caps that block out X% of harmful responses:")
    for block_pct, cap in zip(block_percentages, results[sample_col]['caps']):
        print(f"    Block {block_pct}%: {cap:.4f}")

print(f"\nComputed caps for {len(results)} projection columns")

Sample cap computation results:

proj_roles_traits/pca/layer32_roles_pos23_traits_pos40-100/pc1:
  Harmful mean: 13.1011 ± 10.1679
  Not harmful mean: -11.1284 ± 9.7467
  Caps that block out X% of harmful responses:
    Block 99%: -8.1085
    Block 90%: 0.2219
    Block 75%: 6.6732

Computed caps for 13 projection columns


In [81]:
# Create mapping functions for more informative titles
def format_projection_title(proj_col):
    """Convert a projection column name to an informative plot title.

    Args:
        proj_col: Column name, normally starting with 'proj_' or 'proj_scaled_'.

    Returns:
        'Projection Difference on <description>', where the description names
        the vector type and scaler inferred from substrings of the vector name,
        with ' [Scaled]' appended for 'proj_scaled_' columns.
    """
    scaled_prefix = 'proj_scaled_'
    raw_prefix = 'proj_'

    # Strip only the leading prefix. str.replace would also delete any later
    # 'proj_' inside the vector name and so mangle the name shown in the title.
    is_scaled = proj_col.startswith(scaled_prefix)
    if is_scaled:
        name = proj_col[len(scaled_prefix):]
    elif proj_col.startswith(raw_prefix):
        name = proj_col[len(raw_prefix):]
    else:
        name = proj_col

    # Handle special case: contrast vector
    if 'contrast' in name:
        vector_desc = 'Role - Assistant Contrast Vector (L2 Normalized)'
    else:
        # Extract scaler from the path; names without a scaler suffix are the
        # z-score standardized variant.
        if '_normalized_' in name:
            scaler = 'Mean-Centered and L2 Normalized'
        elif '_mean_' in name:
            scaler = 'Mean-Centered'
        else:
            scaler = 'Z-Score Standardized'

        # Get vector type; 'roles_traits' must be tested first because it also
        # contains 'roles'.
        if 'roles_traits' in name:
            vector_type = 'Combined Role & Trait Vectors'
        elif 'roles' in name:
            vector_type = 'Role Vectors'
        else:
            vector_type = name

        # Format: "PC1 from [Vector Type] ([Scaler])"
        vector_desc = f'PC1 from {vector_type} ({scaler})'

    # Add "Scaled" suffix if applicable
    if is_scaled:
        vector_desc += ' [Scaled]'

    # Final title format
    return f'Projection Difference on {vector_desc}'

In [82]:
# Create histograms with 4 subplots: unscaled and scaled, prompted and default.
# One figure is produced per raw projection column; harmful and not-harmful
# responses are overlaid, and the computed caps are drawn as dashed vertical lines.
projection_cols_raw = [col for col in projection_cols if not col.startswith('proj_scaled_')]

for proj_col in projection_cols_raw:
    # Get corresponding scaled column
    # NOTE(review): str.replace swaps every 'proj_' occurrence, not only the
    # leading prefix; this relies on vector names not containing 'proj_'.
    scaled_col = proj_col.replace('proj_', 'proj_scaled_')
    
    # Get informative title (without [Scaled] suffix)
    title = format_projection_title(proj_col)
    
    # Create subplot figure with 2 rows, 2 columns
    # Row 1: Unscaled (Prompted | Default)
    # Row 2: Scaled (Prompted | Default)
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Raw Activation for Responses With Jailbreak Persona',
            'Raw Activation for Responses As Default Assistant',
            'Scaled Activation for Responses With Jailbreak Persona',
            'Scaled Activation for Responses As Default Assistant'
        ),
        horizontal_spacing=0.08,
        vertical_spacing=0.1
    )
    
    # Plot unscaled projections (row 1)
    # (`name` in the tuples below is not used inside the loop body.)
    for df_exp, col_idx, name in [(df_prompted_expanded, 1, 'Prompted'), 
                                    (df_default_expanded, 2, 'Default')]:
        # Separate data by label
        harmful_data = df_exp[df_exp['label'] == 'harmful'][proj_col]
        not_harmful_data = df_exp[df_exp['label'] == 'not harmful'][proj_col]
        
        # Add histograms
        fig.add_trace(
            go.Histogram(
                x=harmful_data,
                name='harmful',
                opacity=0.7,
                nbinsx=50,
                legendgroup='harmful',
                showlegend=(col_idx == 1),  # Only show legend once
                marker_color='red'
            ),
            row=1, col=col_idx
        )
        
        fig.add_trace(
            go.Histogram(
                x=not_harmful_data,
                name='not harmful',
                opacity=0.7,
                nbinsx=50,
                legendgroup='not harmful',
                showlegend=(col_idx == 1),  # Only show legend once
                marker_color='blue'
            ),
            row=1, col=col_idx
        )
    
    # Add cap lines for unscaled projections (row 1)
    # The same caps are drawn on both the prompted and the default subplot.
    if proj_col in results:
        for cap in results[proj_col]['caps']:
            for col_idx in [1, 2]:
                fig.add_vline(x=cap, line_color="black", line_dash="dash", line_width=1,
                             row=1, col=col_idx)
    
    # Plot scaled projections (row 2) if they exist
    if scaled_col in df_prompted_expanded.columns:
        for df_exp, col_idx, name in [(df_prompted_expanded, 1, 'Prompted'), 
                                        (df_default_expanded, 2, 'Default')]:
            # Separate data by label
            harmful_data = df_exp[df_exp['label'] == 'harmful'][scaled_col]
            not_harmful_data = df_exp[df_exp['label'] == 'not harmful'][scaled_col]
            
            # Add histograms
            fig.add_trace(
                go.Histogram(
                    x=harmful_data,
                    name='harmful',
                    opacity=0.7,
                    nbinsx=50,
                    legendgroup='harmful',
                    showlegend=False,  # Legend already shown in row 1
                    marker_color='red'
                ),
                row=2, col=col_idx
            )
            
            fig.add_trace(
                go.Histogram(
                    x=not_harmful_data,
                    name='not harmful',
                    opacity=0.7,
                    nbinsx=50,
                    legendgroup='not harmful',
                    showlegend=False,  # Legend already shown in row 1
                    marker_color='blue'
                ),
                row=2, col=col_idx
            )
        
        # Add cap lines for scaled projections (row 2)
        if scaled_col in results:
            for cap in results[scaled_col]['caps']:
                for col_idx in [1, 2]:
                    fig.add_vline(x=cap, line_color="black", line_dash="dash", line_width=1,
                                 row=2, col=col_idx)
    
    # Update layout
    # barmode='overlay' draws the two label histograms on top of each other
    # (hence the 0.7 opacity on every trace).
    fig.update_layout(
        title={
            'text': title,
            'subtitle': {
                'text': f"{model_name.replace('_', ' ').title()}, Layer {layer}",
            },
        },
        width=1000,
        height=800,
        barmode='overlay',
        showlegend=True
    )
    
    # Update axes labels and match axes within rows
    # (presumably 'x'/'y' are the axes of the row-1, col-1 subplot and 'x3'/'y3'
    # those of the row-2, col-1 subplot -- confirm against the plotly layout)
    fig.update_xaxes(row=1, col=1, matches='x')
    fig.update_xaxes(row=1, col=2, matches='x')
    fig.update_xaxes(title_text='Projection Value', row=2, col=1, matches='x3')
    fig.update_xaxes(title_text='Projection Value', row=2, col=2, matches='x3')
    fig.update_yaxes(title_text='Count', row=1, col=1, matches='y')
    fig.update_yaxes(row=1, col=2, matches='y')
    fig.update_yaxes(title_text='Count', row=2, col=1, matches='y3')
    fig.update_yaxes(row=2, col=2, matches='y3')
    fig.update_annotations(font_size=12)
    
    fig.show()

## Save Caps to Vectors File

In [83]:
# Load the vectors file (rebinds `vectors` to the list stored in `output_file`)
vectors = torch.load(output_file, weights_only=False)

# Create mapping from vector name to projection column names
def get_projection_col_names(vector_name):
    """Return the (unscaled, scaled) projection column names for *vector_name*."""
    return f"proj_{vector_name}", f"proj_scaled_{vector_name}"

# Attach the computed caps to every vector dict; a missing entry is stored as None.
for vector in vectors:
    vector_name = vector['name']
    unscaled_col, scaled_col = get_projection_col_names(vector_name)

    has_unscaled = unscaled_col in results
    has_scaled = scaled_col in results

    if not has_unscaled:
        print(f"Warning: No unscaled caps found for {vector_name}")
    # The contrast vector has no scaled version, so stay quiet about it.
    if not has_scaled and 'contrast' not in vector_name:
        print(f"Warning: No scaled caps found for {vector_name}")

    vector['caps'] = {
        'unscaled': results[unscaled_col]['caps'] if has_unscaled else None,
        'scaled': results[scaled_col]['caps'] if has_scaled else None,
    }

# Write the updated vectors back to the same file
torch.save(vectors, output_file)
print(f"✓ Saved caps to {len(vectors)} vectors in {output_file}")

# Print the first vector as a spot check
print("\nSample vector with caps:")
sample_vec = vectors[0]
print(f"Vector: {sample_vec['name']}")
print(f"Unscaled caps: {sample_vec['caps']['unscaled']}")
print(f"Scaled caps: {sample_vec['caps']['scaled']}")

✓ Saved caps to 7 vectors in /workspace/qwen-3-32b/evals/capping_vectors.pt

Sample vector with caps:
Vector: roles_traits/pca/layer32_roles_pos23_traits_pos40-100/pc1
Unscaled caps: [-8.108493156433106, 0.22188998460769668, 6.673163414001465]
Scaled caps: [-13.081570873260498, 7.266024589538575, 23.00894546508789]
