# General Config Generation

This notebook generates experiment configs based on projection percentiles from:
1. Role/Trait data → `role_trait_config.pt`
2. LMSYS Chat-1M data → `lmsys_10000_config.pt`

In [1]:
import torch
import os
import json
import sys
from collections import defaultdict
import pandas as pd
import numpy as np

sys.path.append('.')
sys.path.append('..')

torch.set_float32_matmul_precision('high')

In [2]:
# Configuration - Change these for different models
model_name = "qwen-3-32b"
layer = 32
total_layers = 64
base_dir = f"/workspace/{model_name}"

vectors_file = f"{base_dir}/capped/configs/multi_contrast_vectors.pt"
output_dir = f"{base_dir}/capped/configs"

os.makedirs(output_dir, exist_ok=True)

print(f"Model: {model_name}")
print(f"Total layers: {total_layers}")
print(f"Vectors file: {vectors_file}")
print(f"Output directory: {output_dir}")

Model: qwen-3-32b
Total layers: 64
Vectors file: /workspace/qwen-3-32b/capped/configs/multi_contrast_vectors.pt
Output directory: /workspace/qwen-3-32b/capped/configs


In [3]:
# Load vectors
vectors = torch.load(vectors_file, weights_only=False)
print(f"Loaded {len(vectors)} vectors")
print(f"Sample vector: {vectors[0]['name']} at layer {vectors[0]['layer']}")

# Create vectors dictionary for config
vectors_dict = {}
for vec in vectors:
    vectors_dict[vec['name']] = {
        'vector': vec['vector'],
        'layer': vec['layer']
    }

print(f"Created vectors dictionary with {len(vectors_dict)} entries")

Loaded 64 vectors
Sample vector: layer_0/contrast_role_pos3_default1 at layer 0
Created vectors dictionary with 64 entries


## Helper Functions

In [4]:
def bin_scores(df):
    """
    Add a score_bin column to the dataframe that bins scores consistently:
    - Bin 0: Role score 0, Trait 0-25, Default (all)
    - Bin 1: Role score 1, Trait 25-50
    - Bin 2: Role score 2, Trait 50-75
    - Bin 3: Role score 3, Trait 75-100
    """
    df = df.copy()
    
    # Handle REFUSAL and null scores
    df['score'] = df['score'].replace('REFUSAL', 0)
    df['score'] = df['score'].fillna(-1)
    df['score'] = df['score'].astype(int)
    
    # Create binned score column
    def bin_score(row):
        if row['type'] == 'role':
            return row['score']
        elif row['type'] == 'trait':
            if row['score'] < 25:
                return 0
            elif row['score'] < 50:
                return 1
            elif row['score'] < 75:
                return 2
            else:
                return 3
        else:  # default
            return 0
    
    df['score_bin'] = df.apply(bin_score, axis=1)
    return df


def build_experiments(percentile_df, percentiles, layer_selections, vectors_dict):
    """
    Build experiments from percentile DataFrame.
    
    Args:
        percentile_df: DataFrame with columns ['layer', 'vector_name', 'p1', 'p25', 'p50', 'p75']
        percentiles: List of percentile values [0.01, 0.25, 0.50, 0.75]
        layer_selections: Dict mapping layer range string to list of layer indices
        vectors_dict: Dictionary of all vectors
    
    Returns:
        List of experiment dictionaries
    """
    # Map percentile values to column names
    percentile_to_col = {
        0.01: 'p1',
        0.25: 'p25',
        0.50: 'p50',
        0.75: 'p75'
    }
    
    experiments = []
    
    for layer_range_str, layer_list in layer_selections.items():
        for percentile in percentiles:
            exp_id = f"layers_{layer_range_str}-p{percentile}"
            col_name = percentile_to_col[percentile]
            
            # Filter vectors in this layer range
            vectors_in_range = percentile_df[percentile_df['layer'].isin(layer_list)]
            
            # Create interventions
            interventions = []
            for _, row in vectors_in_range.iterrows():
                interventions.append({
                    'vector': row['vector_name'],
                    'cap': float(row[col_name])
                })
            
            experiments.append({
                'id': exp_id,
                'interventions': interventions
            })
    
    return experiments

## Section 1: Role/Trait Config

In [5]:
# Load role/trait projection data
role_file = f"{base_dir}/capped/projections/roles_projections.jsonl"
trait_file = f"{base_dir}/capped/projections/traits_projections.jsonl"

# Load each file into pandas
df_role = pd.read_json(role_file, lines=True)
df_trait = pd.read_json(trait_file, lines=True)

# Add source_file column
df_role['source_file'] = 'role'
df_trait['source_file'] = 'trait'

# Concatenate
df_rt = pd.concat([df_role, df_trait], ignore_index=True)

# Add type column
def determine_type(row):
    if row.get('prompt_label') == 'default':
        return 'default'
    if isinstance(row['role'], str) and '_default' in row['role']:
        parts = row['role'].split('_')
        if len(parts) == 2 and parts[0].isdigit() and parts[1] == 'default':
            return 'default'
    return row['source_file']

df_rt['type'] = df_rt.apply(determine_type, axis=1)
df_rt = df_rt.drop('source_file', axis=1)

print(f"Loaded {len(df_rt)} role/trait records")
print(f"Type distribution:\n{df_rt['type'].value_counts()}")

Loaded 909599 role/trait records
Type distribution:
type
trait      576000
role       329999
default      3600
Name: count, dtype: int64


In [6]:
# Expand projections into columns
projections_df = pd.json_normalize(df_rt['projections'])
df_rt = pd.concat([df_rt.drop('projections', axis=1), projections_df], axis=1)

# Bin scores
df_rt_binned = bin_scores(df_rt)

print(f"Expanded shape: {df_rt_binned.shape}")
print(f"Score bin distribution:\n{df_rt_binned['score_bin'].value_counts().sort_index()}")

  df['score'] = df['score'].replace('REFUSAL', 0)


Expanded shape: (909599, 70)
Score bin distribution:
score_bin
0    256688
1     57355
2     57002
3    538554
Name: count, dtype: int64


In [7]:
# Compute percentiles for each vector
projection_cols = [col for col in df_rt_binned.columns if col.startswith('layer_')]
print(f"Found {len(projection_cols)} projection columns")

percentiles_to_compute = [1, 25, 50, 75]
rt_percentile_data = []

for col in projection_cols:
    values = df_rt_binned[col].dropna().values
    
    if len(values) == 0:
        continue
    
    # Extract layer number
    layer_num = int(col.split('/')[0].replace('layer_', ''))
    
    # Compute percentiles
    pcts = np.percentile(values, percentiles_to_compute)
    
    rt_percentile_data.append({
        'layer': layer_num,
        'vector_name': col,
        'p1': pcts[0],
        'p25': pcts[1],
        'p50': pcts[2],
        'p75': pcts[3]
    })

df_rt_percentiles = pd.DataFrame(rt_percentile_data).sort_values('layer')
print(f"\nComputed percentiles for {len(df_rt_percentiles)} vectors")
print(f"\nSample percentiles for layer {layer}:")
print(df_rt_percentiles[df_rt_percentiles['layer'] == layer])

Found 64 projection columns

Computed percentiles for 64 vectors

Sample percentiles for layer 32:
    layer                           vector_name     p1      p25      p50  \
32     32  layer_32/contrast_role_pos3_default1 -11.25  5.28125  15.3125   

       p75  
32  31.125  


In [11]:
# Define layer selections
layer_selections = {
    # f'0:{total_layers}': list(range(0, total_layers)),
    # f'0:{total_layers//4}': list(range(0, total_layers//4)),
    # f'{total_layers//4}:{total_layers//4 + total_layers//8}': 
    #     list(range(total_layers//4, total_layers//4 + total_layers//8)),
    # f'{total_layers//8}:{total_layers//8 + total_layers//8}': 
    #     list(range(total_layers//8, total_layers//8 + total_layers//8)),
}

# for i in range(3, 8):
#     layer_selections[f'{total_layers//8*i}:{total_layers//8*i + total_layers//8}'] = list(range(total_layers//8*i, total_layers//8*i + total_layers//8))

# Sliding windows with different sizes and increments
# Window size 4, increment by 2
for i in range(32, 60, 2):
    if i + 4 <= 60:
        layer_selections[f'{i}:{i + 4}'] = list(range(i, i + 4))

# Window size 4, increment by 4
for i in range(32, 60, 4):
    if i + 4 <= 60:
        layer_selections[f'{i}:{i + 4}'] = list(range(i, i + 4))

# Window size 8, increment by 2
for i in range(32, 60, 2):
    if i + 8 <= 60:
        layer_selections[f'{i}:{i + 8}'] = list(range(i, i + 8))

# Window size 8, increment by 4
for i in range(32, 60, 4):
    if i + 8 <= 60:
        layer_selections[f'{i}:{i + 8}'] = list(range(i, i + 8))

# Window size 16, increment by 2
for i in range(32, 60, 2):
    if i + 16 <= 60:
        layer_selections[f'{i}:{i + 16}'] = list(range(i, i + 16))

# Window size 16, increment by 4
for i in range(32, 60, 4):
    if i + 16 <= 60:
        layer_selections[f'{i}:{i + 16}'] = list(range(i, i + 16))

# Sort by window size (4, 8, 16), then by starting layer
layer_selections = dict(sorted(layer_selections.items(), 
                               key=lambda x: (len(x[1]), min(x[1]))))

print("Layer selections:")
for name, layers in layer_selections.items():
    print(f"  {name}: {len(layers)} layers ({min(layers)}-{max(layers)})")

Layer selections:
  32:36: 4 layers (32-35)
  34:38: 4 layers (34-37)
  36:40: 4 layers (36-39)
  38:42: 4 layers (38-41)
  40:44: 4 layers (40-43)
  42:46: 4 layers (42-45)
  44:48: 4 layers (44-47)
  46:50: 4 layers (46-49)
  48:52: 4 layers (48-51)
  50:54: 4 layers (50-53)
  52:56: 4 layers (52-55)
  54:58: 4 layers (54-57)
  56:60: 4 layers (56-59)
  32:40: 8 layers (32-39)
  34:42: 8 layers (34-41)
  36:44: 8 layers (36-43)
  38:46: 8 layers (38-45)
  40:48: 8 layers (40-47)
  42:50: 8 layers (42-49)
  44:52: 8 layers (44-51)
  46:54: 8 layers (46-53)
  48:56: 8 layers (48-55)
  50:58: 8 layers (50-57)
  52:60: 8 layers (52-59)
  32:48: 16 layers (32-47)
  34:50: 16 layers (34-49)
  36:52: 16 layers (36-51)
  38:54: 16 layers (38-53)
  40:56: 16 layers (40-55)
  42:58: 16 layers (42-57)
  44:60: 16 layers (44-59)


In [12]:
# Build role/trait experiments
percentiles = [0.01, 0.25, 0.50, 0.75]

rt_experiments = build_experiments(
    percentile_df=df_rt_percentiles,
    percentiles=percentiles,
    layer_selections=layer_selections,
    vectors_dict=vectors_dict
)

print(f"\nCreated {len(rt_experiments)} role/trait experiments:")
for exp in rt_experiments:
    print(f"  - {exp['id']}: {len(exp['interventions'])} interventions")


Created 124 role/trait experiments:
  - layers_32:36-p0.01: 4 interventions
  - layers_32:36-p0.25: 4 interventions
  - layers_32:36-p0.5: 4 interventions
  - layers_32:36-p0.75: 4 interventions
  - layers_34:38-p0.01: 4 interventions
  - layers_34:38-p0.25: 4 interventions
  - layers_34:38-p0.5: 4 interventions
  - layers_34:38-p0.75: 4 interventions
  - layers_36:40-p0.01: 4 interventions
  - layers_36:40-p0.25: 4 interventions
  - layers_36:40-p0.5: 4 interventions
  - layers_36:40-p0.75: 4 interventions
  - layers_38:42-p0.01: 4 interventions
  - layers_38:42-p0.25: 4 interventions
  - layers_38:42-p0.5: 4 interventions
  - layers_38:42-p0.75: 4 interventions
  - layers_40:44-p0.01: 4 interventions
  - layers_40:44-p0.25: 4 interventions
  - layers_40:44-p0.5: 4 interventions
  - layers_40:44-p0.75: 4 interventions
  - layers_42:46-p0.01: 4 interventions
  - layers_42:46-p0.25: 4 interventions
  - layers_42:46-p0.5: 4 interventions
  - layers_42:46-p0.75: 4 interventions
  - layer

In [13]:
# Show sample experiments
print("\nSample experiment details:")
for i in [0, 4, 8, 12]:
    exp = rt_experiments[i]
    caps = [interv['cap'] for interv in exp['interventions']]
    print(f"\n{exp['id']}:")
    print(f"  Vectors: {len(exp['interventions'])}")
    print(f"  Cap range: {min(caps):.2f} to {max(caps):.2f}")
    print(f"  First 3 interventions:")
    for interv in exp['interventions'][:3]:
        print(f"    {interv['vector']}: {interv['cap']:.4f}")


Sample experiment details:

layers_32:36-p0.01:
  Vectors: 4
  Cap range: -26.25 to -8.88
  First 3 interventions:
    layer_32/contrast_role_pos3_default1: -11.2500
    layer_33/contrast_role_pos3_default1: -8.8750
    layer_34/contrast_role_pos3_default1: -16.0000

layers_34:38-p0.01:
  Vectors: 4
  Cap range: -30.00 to -16.00
  First 3 interventions:
    layer_34/contrast_role_pos3_default1: -16.0000
    layer_35/contrast_role_pos3_default1: -26.2500
    layer_36/contrast_role_pos3_default1: -30.0000

layers_36:40-p0.01:
  Vectors: 4
  Cap range: -30.00 to -19.00
  First 3 interventions:
    layer_36/contrast_role_pos3_default1: -30.0000
    layer_37/contrast_role_pos3_default1: -24.5000
    layer_38/contrast_role_pos3_default1: -19.0000

layers_38:42-p0.01:
  Vectors: 4
  Cap range: -29.75 to -19.00
  First 3 interventions:
    layer_38/contrast_role_pos3_default1: -19.0000
    layer_39/contrast_role_pos3_default1: -25.3750
    layer_40/contrast_role_pos3_default1: -27.5000


In [None]:
# Save role/trait config
rt_config_file = f"{output_dir}/role_trait_sliding_config.pt"

rt_config = {
    'vectors': vectors_dict,
    'experiments': rt_experiments
}

torch.save(rt_config, rt_config_file)

print(f"✓ Saved role/trait config to {rt_config_file}")
print(f"\nConfig summary:")
print(f"  - {len(rt_config['vectors'])} vectors")
print(f"  - {len(rt_config['experiments'])} experiments")
print(f"  - {sum(len(exp['interventions']) for exp in rt_config['experiments'])} total interventions")

✓ Saved role/trait config to /workspace/qwen-3-32b/capped/configs/role_trait_sliding_config.pt

Config summary:
  - 64 vectors
  - 124 experiments
  - 1008 total interventions


: 

## Section 2: LMSYS Config

In [15]:
# Load LMSYS projection statistics
lmsys_file = f"{base_dir}/capped/projections/lmsys_10000.json"

with open(lmsys_file, 'r') as f:
    lmsys_data = json.load(f)

print(f"Loaded LMSYS data")
print(f"Metadata: {lmsys_data.get('metadata', {})}")
print(f"Vectors: {len(lmsys_data['per_vector_stats'])}")

Loaded LMSYS data
Metadata: {'dataset': 'lmsys/lmsys-chat-1m', 'model': 'Qwen/Qwen3-32B', 'n_conversations_requested': 10000, 'seed': 42, 'language_filter': 'English', 'n_conversations_sampled': 10000, 'n_assistant_turns': 1206464}
Vectors: 64


In [16]:
# Extract percentiles from pre-computed statistics
lmsys_percentile_data = []

for vector_name, stats in lmsys_data['per_vector_stats'].items():
    # Extract layer number
    layer_num = int(vector_name.split('/')[0].replace('layer_', ''))
    
    # Get percentiles from pre-computed stats
    percentiles_dict = stats['percentiles']
    
    lmsys_percentile_data.append({
        'layer': layer_num,
        'vector_name': vector_name,
        'p1': float(percentiles_dict['1']),
        'p25': float(percentiles_dict['25']),
        'p50': float(percentiles_dict['50']),
        'p75': float(percentiles_dict['75'])
    })

df_lmsys_percentiles = pd.DataFrame(lmsys_percentile_data).sort_values('layer')
print(f"\nExtracted percentiles for {len(df_lmsys_percentiles)} vectors")
print(f"\nSample percentiles for layer {layer}:")
print(df_lmsys_percentiles[df_lmsys_percentiles['layer'] == layer])


Extracted percentiles for 64 vectors

Sample percentiles for layer 32:
    layer                           vector_name       p1  p25      p50    p75
32     32  layer_32/contrast_role_pos3_default1 -7.90625  6.0  13.0625  19.25


In [17]:
# Build LMSYS experiments
lmsys_experiments = build_experiments(
    percentile_df=df_lmsys_percentiles,
    percentiles=percentiles,
    layer_selections=layer_selections,
    vectors_dict=vectors_dict
)

print(f"\nCreated {len(lmsys_experiments)} LMSYS experiments:")
for exp in lmsys_experiments:
    print(f"  - {exp['id']}: {len(exp['interventions'])} interventions")


Created 20 LMSYS experiments:
  - layers_24:32-p0.01: 8 interventions
  - layers_24:32-p0.25: 8 interventions
  - layers_24:32-p0.5: 8 interventions
  - layers_24:32-p0.75: 8 interventions
  - layers_32:40-p0.01: 8 interventions
  - layers_32:40-p0.25: 8 interventions
  - layers_32:40-p0.5: 8 interventions
  - layers_32:40-p0.75: 8 interventions
  - layers_40:48-p0.01: 8 interventions
  - layers_40:48-p0.25: 8 interventions
  - layers_40:48-p0.5: 8 interventions
  - layers_40:48-p0.75: 8 interventions
  - layers_48:56-p0.01: 8 interventions
  - layers_48:56-p0.25: 8 interventions
  - layers_48:56-p0.5: 8 interventions
  - layers_48:56-p0.75: 8 interventions
  - layers_56:64-p0.01: 8 interventions
  - layers_56:64-p0.25: 8 interventions
  - layers_56:64-p0.5: 8 interventions
  - layers_56:64-p0.75: 8 interventions


In [18]:
# Show sample experiments
print("\nSample experiment details:")
for i in [0, 4, 8, 12]:
    exp = lmsys_experiments[i]
    caps = [interv['cap'] for interv in exp['interventions']]
    print(f"\n{exp['id']}:")
    print(f"  Vectors: {len(exp['interventions'])}")
    print(f"  Cap range: {min(caps):.2f} to {max(caps):.2f}")
    print(f"  First 3 interventions:")
    for interv in exp['interventions'][:3]:
        print(f"    {interv['vector']}: {interv['cap']:.4f}")


Sample experiment details:

layers_24:32-p0.01:
  Vectors: 8
  Cap range: -59.50 to -11.38
  First 3 interventions:
    layer_24/contrast_role_pos3_default1: -59.5000
    layer_25/contrast_role_pos3_default1: -49.7500
    layer_26/contrast_role_pos3_default1: -34.5000

layers_32:40-p0.01:
  Vectors: 8
  Cap range: -22.12 to -6.78
  First 3 interventions:
    layer_32/contrast_role_pos3_default1: -7.9062
    layer_33/contrast_role_pos3_default1: -6.7812
    layer_34/contrast_role_pos3_default1: -10.9375

layers_40:48-p0.01:
  Vectors: 8
  Cap range: -81.00 to -20.75
  First 3 interventions:
    layer_40/contrast_role_pos3_default1: -20.7500
    layer_41/contrast_role_pos3_default1: -21.6250
    layer_42/contrast_role_pos3_default1: -24.8750

layers_48:56-p0.01:
  Vectors: 8
  Cap range: -67.50 to -47.00
  First 3 interventions:
    layer_48/contrast_role_pos3_default1: -51.7500
    layer_49/contrast_role_pos3_default1: -54.5000
    layer_50/contrast_role_pos3_default1: -52.7500


In [19]:
# Save LMSYS config
lmsys_config_file = f"{output_dir}/lmsys_10000_eighths_config.pt"

lmsys_config = {
    'vectors': vectors_dict,
    'experiments': lmsys_experiments
}

torch.save(lmsys_config, lmsys_config_file)

print(f"✓ Saved LMSYS config to {lmsys_config_file}")
print(f"\nConfig summary:")
print(f"  - {len(lmsys_config['vectors'])} vectors")
print(f"  - {len(lmsys_config['experiments'])} experiments")
print(f"  - {sum(len(exp['interventions']) for exp in lmsys_config['experiments'])} total interventions")

✓ Saved LMSYS config to /workspace/qwen-3-32b/capped/configs/lmsys_10000_eighths_config.pt

Config summary:
  - 64 vectors
  - 20 experiments
  - 160 total interventions


## Summary

In [20]:
print("="*70)
print("Config Generation Complete")
print("="*70)
print(f"\nModel: {model_name}")
print(f"Total layers: {total_layers}")
print(f"\nGenerated configs:")
print(f"  1. {rt_config_file}")
print(f"     - {len(rt_config['experiments'])} experiments")
print(f"     - Based on role/trait projection data")
print(f"\n  2. {lmsys_config_file}")
print(f"     - {len(lmsys_config['experiments'])} experiments")
print(f"     - Based on LMSYS-Chat-1M projection data")
print(f"\nPercentiles used: {percentiles}")
print(f"Layer selections: {list(layer_selections.keys())}")
print("="*70)

Config Generation Complete

Model: qwen-3-32b
Total layers: 64

Generated configs:
  1. /workspace/qwen-3-32b/capped/configs/role_trait_eighths_config.pt
     - 20 experiments
     - Based on role/trait projection data

  2. /workspace/qwen-3-32b/capped/configs/lmsys_10000_eighths_config.pt
     - 20 experiments
     - Based on LMSYS-Chat-1M projection data

Percentiles used: [0.01, 0.25, 0.5, 0.75]
Layer selections: ['24:32', '32:40', '40:48', '48:56', '56:64']
