# General Config Generation

This notebook generates experiment configs based on projection percentiles from:
1. Role/Trait data → `role_trait_config.pt`
2. LMSYS Chat-1M data → `lmsys_10000_config.pt`

In [30]:
import torch
import os
import json
import sys
from collections import defaultdict
import pandas as pd
import numpy as np

# Add the current directory and its parent to the module search path.
# NOTE(review): no project-local import is visible in this notebook -- confirm
# these two entries are still needed.
sys.path.append('.')
sys.path.append('..')

# Select PyTorch's 'high' float32 matmul precision mode.
# NOTE(review): the visible cells only load and save tensors -- confirm whether
# this setting matters here.
torch.set_float32_matmul_precision('high')

In [None]:
# Experiment settings -- edit this cell to target a different model
model_name = "llama-3.3-70b"
layer = 40            # layer whose percentiles are printed as a sample further down
total_layers = 80     # drives the layer ranges built later in the notebook
base_dir = f"/workspace/{model_name}"

# The contrast vectors are read from, and the generated configs written to,
# the same configs folder
vectors_file = f"{base_dir}/capped/configs/multi_contrast_vectors.pt"
output_dir = f"{base_dir}/capped/configs"

os.makedirs(output_dir, exist_ok=True)

# Echo the settings, one per line
print(
    f"Model: {model_name}",
    f"Total layers: {total_layers}",
    f"Vectors file: {vectors_file}",
    f"Output directory: {output_dir}",
    sep="\n",
)

Model: llama-3.3-70b
Total layers: 80
Vectors file: /workspace/llama-3.3-70b/capped/configs/multi_contrast_vectors.pt
Output directory: /workspace/llama-3.3-70b/capped/configs


In [32]:
# Read the saved vectors from disk.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects,
# so vectors_file must come from a trusted source.
vectors = torch.load(vectors_file, weights_only=False)
print(f"Loaded {len(vectors)} vectors")

first_vector = vectors[0]
print(f"Sample vector: {first_vector['name']} at layer {first_vector['layer']}")

# Index the vectors by name, keeping only the fields stored in the config
vectors_dict = {
    entry['name']: {'vector': entry['vector'], 'layer': entry['layer']}
    for entry in vectors
}

print(f"Created vectors dictionary with {len(vectors_dict)} entries")

Loaded 80 vectors
Sample vector: layer_0/contrast_role_pos3_default1 at layer 0
Created vectors dictionary with 80 entries


## Helper Functions

In [33]:
def bin_scores(df):
    """
    Return a copy of ``df`` with a cleaned integer ``score`` column and a
    ``score_bin`` column.

    Score cleaning:
    - the string 'REFUSAL' becomes 0
    - missing (null) scores become -1
    - the column is then cast to int

    Binning, by the row's ``type``:
    - 'role':  score_bin is the cleaned score itself (so role score 0/1/2/3
      maps to bin 0/1/2/3, and a missing role score stays -1)
    - 'trait': bin 0 for score < 25, bin 1 for 25-49, bin 2 for 50-74,
      bin 3 for >= 75 (a missing trait score, -1, therefore lands in bin 0)
    - any other type (default): bin 0

    Args:
        df: DataFrame with at least the columns 'type' and 'score'.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    df = df.copy()

    # Swap the 'REFUSAL' marker element-wise instead of using Series.replace,
    # which emits a downcasting FutureWarning on object columns.
    cleaned = df['score'].map(
        lambda s: 0 if isinstance(s, str) and s == 'REFUSAL' else s
    )
    df['score'] = pd.to_numeric(cleaned).fillna(-1).astype(int)

    scores = df['score'].to_numpy()
    is_role = (df['type'] == 'role').to_numpy()
    is_trait = (df['type'] == 'trait').to_numpy()

    # np.digitize with edges [25, 50, 75] yields 0 for <25, 1 for [25, 50),
    # 2 for [50, 75) and 3 for >=75 -- the trait bin boundaries.
    trait_bins = np.digitize(scores, [25, 50, 75])

    # Vectorised equivalent of the per-row if/elif/else: role rows keep their
    # score, trait rows take their digitized bin, everything else is bin 0.
    df['score_bin'] = np.where(is_role, scores, np.where(is_trait, trait_bins, 0))
    return df


def build_experiments(percentile_df, percentiles, layer_selections, vectors_dict):
    """
    Build one experiment per (layer range, percentile) pair.

    For every layer range, the rows of ``percentile_df`` whose 'layer' is in
    that range each become an intervention whose 'cap' is the row's value in
    the column for the requested percentile.

    Args:
        percentile_df: DataFrame with columns 'layer', 'vector_name' and one
            column per percentile named 'p<percent>' (e.g. 'p1', 'p25', 'p50', 'p75').
        percentiles: Iterable of percentiles given as fractions, e.g.
            [0.01, 0.25, 0.50, 0.75]. A fraction q is read from column
            'p<q*100>', so 0.01 -> 'p1' and 0.5 -> 'p50'.
        layer_selections: Dict mapping a layer-range label (e.g. '0:40') to
            the list of layer indices in that range.
        vectors_dict: Dictionary of all vectors. Not used by this function;
            kept in the signature so existing callers keep working.

    Returns:
        List of experiment dicts {'id': 'layers_<label>-p<q>',
        'interventions': [{'vector': <vector name>, 'cap': <float>}, ...]},
        ordered by layer range and then by percentile (input order).

    Raises:
        KeyError: if the column for a requested percentile is not in
            ``percentile_df``.
    """
    # Materialise once so a generator argument survives the outer loop.
    percentiles = list(percentiles)

    experiments = []

    for layer_range_str, layer_list in layer_selections.items():
        # The layer filter does not depend on the percentile: compute it once
        # per range instead of once per (range, percentile) pair.
        in_range = percentile_df[percentile_df['layer'].isin(layer_list)]
        vector_names = in_range['vector_name'].tolist()

        for percentile in percentiles:
            # 0.01 -> 'p1', 0.5 -> 'p50'; rounding absorbs float noise such as
            # 0.9 * 100 == 90.00000000000001.
            col_name = f"p{round(percentile * 100, 6):g}"
            if col_name not in in_range.columns:
                raise KeyError(
                    f"percentile {percentile} needs column '{col_name}', "
                    f"which is missing from percentile_df"
                )

            interventions = [
                {'vector': name, 'cap': float(cap)}
                for name, cap in zip(vector_names, in_range[col_name].tolist())
            ]

            experiments.append({
                'id': f"layers_{layer_range_str}-p{percentile}",
                'interventions': interventions
            })

    return experiments

## Section 1: Role/Trait Config

In [34]:
# Locations of the role and trait projection files (JSON lines)
role_file = f"{base_dir}/capped/projections/roles_projections.jsonl"
trait_file = f"{base_dir}/capped/projections/traits_projections.jsonl"

# Read each file and tag its rows with the file they came from
df_role = pd.read_json(role_file, lines=True)
df_role['source_file'] = 'role'

df_trait = pd.read_json(trait_file, lines=True)
df_trait['source_file'] = 'trait'

# Stack both sources into a single frame with a fresh 0..n-1 index
df_rt = pd.concat([df_role, df_trait], ignore_index=True)

# Classify each row as 'default' or by the file it was loaded from
def determine_type(row):
    """Return 'default' for default rows, otherwise the row's 'source_file' value.

    A row counts as default when its 'prompt_label' equals 'default', or when
    its 'role' is a string of the exact form '<digits>_default'.
    """
    if row.get('prompt_label') == 'default':
        return 'default'

    role = row['role']
    if isinstance(role, str):
        # Split at the first underscore only: anything with extra underscores
        # leaves a suffix other than 'default' and is rejected.
        prefix, _, suffix = role.partition('_')
        if suffix == 'default' and prefix.isdigit():
            return 'default'

    return row['source_file']

# Label every row, then discard the helper column used only for labelling
df_rt['type'] = df_rt.apply(determine_type, axis=1)
df_rt = df_rt.drop(columns='source_file')

type_counts = df_rt['type'].value_counts()
print(f"Loaded {len(df_rt)} role/trait records")
print(f"Type distribution:\n{type_counts}")

Loaded 908400 role/trait records
Type distribution:
type
trait      576000
role       330000
default      2400
Name: count, dtype: int64


In [35]:
# Flatten the 'projections' column into separate columns and place them next
# to the remaining fields.
# NOTE: df_rt is reassigned without its 'projections' column, so re-running
# this cell requires re-running the cells that build df_rt first.
projections_df = pd.json_normalize(df_rt['projections'])
df_rt = pd.concat(
    [df_rt.drop(columns='projections'), projections_df],
    axis=1,
)

# Clean the scores and attach the score_bin column
df_rt_binned = bin_scores(df_rt)

score_bin_counts = df_rt_binned['score_bin'].value_counts().sort_index()
print(f"Expanded shape: {df_rt_binned.shape}")
print(f"Score bin distribution:\n{score_bin_counts}")

  df['score'] = df['score'].replace('REFUSAL', 0)


Expanded shape: (908400, 86)
Score bin distribution:
score_bin
0    262834
1     37557
2     43465
3    564544
Name: count, dtype: int64


In [36]:
# Summarise every projection column (those named 'layer_...') by its
# 1st / 25th / 50th / 75th percentile
projection_cols = [name for name in df_rt_binned.columns if name.startswith('layer_')]
print(f"Found {len(projection_cols)} projection columns")

percentiles_to_compute = [1, 25, 50, 75]
rt_percentile_data = []

for col in projection_cols:
    values = df_rt_binned[col].dropna().values

    # Columns without any non-null value contribute no row
    if len(values) > 0:
        p1, p25, p50, p75 = np.percentile(values, percentiles_to_compute)
        rt_percentile_data.append({
            # The layer index is the number after 'layer_' in the part before '/'
            'layer': int(col.split('/', 1)[0].replace('layer_', '')),
            'vector_name': col,
            'p1': p1,
            'p25': p25,
            'p50': p50,
            'p75': p75
        })

df_rt_percentiles = pd.DataFrame(rt_percentile_data).sort_values('layer')
print(f"\nComputed percentiles for {len(df_rt_percentiles)} vectors")

# Print the row(s) for the layer chosen in the configuration cell
sample_mask = df_rt_percentiles['layer'] == layer
print(f"\nSample percentiles for layer {layer}:")
print(df_rt_percentiles[sample_mask])

Found 80 projection columns

Computed percentiles for 80 vectors

Sample percentiles for layer 32:
    layer                           vector_name        p1       p25   p50  \
32     32  layer_32/contrast_role_pos3_default1 -1.851562 -1.234375 -0.75   

         p75  
32 -0.097656  


In [40]:
# Define layer selections: the full stack plus its first and second halves
layer_selections = {
    f'0:{total_layers}': list(range(0, total_layers)),
    f'0:{total_layers//2}': list(range(0, total_layers//2)),
    f'{total_layers//2}:{total_layers}': list(range(total_layers//2, total_layers))
}

# Sliding windows with different sizes and increments
window_sizes = [8, 16, 24]     # number of consecutive layers in a window
window_increments = [4]        # step between successive window starts
window_first_start = 32        # earliest layer a window may start at

for window_size in window_sizes:
    for increment in window_increments:
        # Bound the starts by total_layers rather than a literal 80 so the
        # cell follows the configured model depth.
        for i in range(window_first_start, total_layers, increment):
            # Keep only windows that fit entirely inside the model
            if i + window_size <= total_layers:
                layer_selections[f'{i}:{i + window_size}'] = list(range(i, i + window_size))

# Sort by number of layers (largest first), then by starting layer
layer_selections = dict(sorted(layer_selections.items(), 
                               key=lambda x: (-len(x[1]), min(x[1]))))

print("Layer selections:")
for name, layers in layer_selections.items():
    print(f"  {name}: {len(layers)} layers ({min(layers)}-{max(layers)})")

Layer selections:
  0:80: 80 layers (0-79)
  0:40: 40 layers (0-39)
  40:80: 40 layers (40-79)
  32:56: 24 layers (32-55)
  36:60: 24 layers (36-59)
  40:64: 24 layers (40-63)
  44:68: 24 layers (44-67)
  48:72: 24 layers (48-71)
  52:76: 24 layers (52-75)
  56:80: 24 layers (56-79)
  32:48: 16 layers (32-47)
  36:52: 16 layers (36-51)
  40:56: 16 layers (40-55)
  44:60: 16 layers (44-59)
  48:64: 16 layers (48-63)
  52:68: 16 layers (52-67)
  56:72: 16 layers (56-71)
  60:76: 16 layers (60-75)
  64:80: 16 layers (64-79)
  32:40: 8 layers (32-39)
  36:44: 8 layers (36-43)
  40:48: 8 layers (40-47)
  44:52: 8 layers (44-51)
  48:56: 8 layers (48-55)
  52:60: 8 layers (52-59)
  56:64: 8 layers (56-63)
  60:68: 8 layers (60-67)
  64:72: 8 layers (64-71)
  68:76: 8 layers (68-75)
  72:80: 8 layers (72-79)


In [41]:
# Percentiles, as fractions, for which experiments are generated
percentiles = [0.01, 0.25, 0.50, 0.75]

# One role/trait experiment per (layer selection, percentile) pair
rt_experiments = build_experiments(
    df_rt_percentiles,
    percentiles,
    layer_selections,
    vectors_dict,
)

print(f"\nCreated {len(rt_experiments)} role/trait experiments:")
for idx, experiment in enumerate(rt_experiments):
    n_interventions = len(experiment['interventions'])
    print(f"  - {idx} {experiment['id']}: {n_interventions} interventions")


Created 120 role/trait experiments:
  - 0 layers_0:80-p0.01: 80 interventions
  - 1 layers_0:80-p0.25: 80 interventions
  - 2 layers_0:80-p0.5: 80 interventions
  - 3 layers_0:80-p0.75: 80 interventions
  - 4 layers_0:40-p0.01: 40 interventions
  - 5 layers_0:40-p0.25: 40 interventions
  - 6 layers_0:40-p0.5: 40 interventions
  - 7 layers_0:40-p0.75: 40 interventions
  - 8 layers_40:80-p0.01: 40 interventions
  - 9 layers_40:80-p0.25: 40 interventions
  - 10 layers_40:80-p0.5: 40 interventions
  - 11 layers_40:80-p0.75: 40 interventions
  - 12 layers_32:56-p0.01: 24 interventions
  - 13 layers_32:56-p0.25: 24 interventions
  - 14 layers_32:56-p0.5: 24 interventions
  - 15 layers_32:56-p0.75: 24 interventions
  - 16 layers_36:60-p0.01: 24 interventions
  - 17 layers_36:60-p0.25: 24 interventions
  - 18 layers_36:60-p0.5: 24 interventions
  - 19 layers_36:60-p0.75: 24 interventions
  - 20 layers_40:64-p0.01: 24 interventions
  - 21 layers_40:64-p0.25: 24 interventions
  - 22 layers_40:6

In [42]:
# Show sample experiments
print("\nSample experiment details:")
for i in [0, 4, 8, 12]:
    # Fewer experiments exist when fewer layer selections are generated
    # (e.g. no sliding window fits the model), so skip missing indices
    # instead of raising IndexError.
    if i >= len(rt_experiments):
        continue
    exp = rt_experiments[i]
    caps = [interv['cap'] for interv in exp['interventions']]
    print(f"\n{exp['id']}:")
    print(f"  Vectors: {len(exp['interventions'])}")
    # min()/max() raise on an empty list, so guard experiments without interventions
    if caps:
        print(f"  Cap range: {min(caps):.2f} to {max(caps):.2f}")
    else:
        print("  Cap range: n/a (no interventions)")
    print(f"  First 3 interventions:")
    for interv in exp['interventions'][:3]:
        print(f"    {interv['vector']}: {interv['cap']:.4f}")


Sample experiment details:

layers_0:80-p0.01:
  Vectors: 80
  Cap range: -5.28 to 1.83
  First 3 interventions:
    layer_0/contrast_role_pos3_default1: -0.0840
    layer_1/contrast_role_pos3_default1: -0.2305
    layer_2/contrast_role_pos3_default1: -0.2832

layers_0:40-p0.01:
  Vectors: 40
  Cap range: -2.97 to -0.08
  First 3 interventions:
    layer_0/contrast_role_pos3_default1: -0.0840
    layer_1/contrast_role_pos3_default1: -0.2305
    layer_2/contrast_role_pos3_default1: -0.2832

layers_40:80-p0.01:
  Vectors: 40
  Cap range: -5.28 to 1.83
  First 3 interventions:
    layer_40/contrast_role_pos3_default1: -2.7969
    layer_41/contrast_role_pos3_default1: -2.6094
    layer_42/contrast_role_pos3_default1: -2.5469

layers_32:56-p0.01:
  Vectors: 24
  Cap range: -3.30 to -1.85
  First 3 interventions:
    layer_32/contrast_role_pos3_default1: -1.8516
    layer_33/contrast_role_pos3_default1: -2.2500
    layer_34/contrast_role_pos3_default1: -2.9688


In [43]:
# Bundle the vectors with the role/trait experiments and write them to disk
rt_config_file = f"{output_dir}/role_trait_config.pt"
rt_config = dict(vectors=vectors_dict, experiments=rt_experiments)
torch.save(rt_config, rt_config_file)

# Count the interventions across all experiments for the summary
rt_intervention_total = 0
for experiment in rt_config['experiments']:
    rt_intervention_total += len(experiment['interventions'])

print(f"✓ Saved role/trait config to {rt_config_file}")
print(f"\nConfig summary:")
print(f"  - {len(rt_config['vectors'])} vectors")
print(f"  - {len(rt_config['experiments'])} experiments")
print(f"  - {rt_intervention_total} total interventions")

✓ Saved role/trait config to /workspace/llama-3.3-70b/capped/configs/role_trait_config.pt

Config summary:
  - 80 vectors
  - 120 experiments
  - 2240 total interventions


## Section 2: LMSYS Config

In [44]:
# Load LMSYS projection statistics
lmsys_file = f"{base_dir}/capped/projections/lmsys_10000.json"

# Read as UTF-8 explicitly so decoding does not depend on the machine's
# locale default encoding.
with open(lmsys_file, 'r', encoding='utf-8') as f:
    lmsys_data = json.load(f)

print(f"Loaded LMSYS data")
print(f"Metadata: {lmsys_data.get('metadata', {})}")
print(f"Vectors: {len(lmsys_data['per_vector_stats'])}")

Loaded LMSYS data
Metadata: {'dataset': 'lmsys/lmsys-chat-1m', 'model': 'meta-llama/Llama-3.3-70B-Instruct', 'n_conversations_requested': 10000, 'seed': 42, 'language_filter': 'English', 'n_conversations_sampled': 10000, 'n_assistant_turns': 78334}
Vectors: 80


In [45]:
# Read the 1st / 25th / 50th / 75th percentiles from the stored per-vector stats
lmsys_percentile_data = []

# (output column, key under stats['percentiles']) pairs, in column order
percentile_fields = (('p1', '1'), ('p25', '25'), ('p50', '50'), ('p75', '75'))

for vector_name, stats in lmsys_data['per_vector_stats'].items():
    stored = stats['percentiles']
    record = {
        # The layer index is the number after 'layer_' in the part before '/'
        'layer': int(vector_name.split('/', 1)[0].replace('layer_', '')),
        'vector_name': vector_name,
    }
    for column, key in percentile_fields:
        record[column] = float(stored[key])
    lmsys_percentile_data.append(record)

df_lmsys_percentiles = pd.DataFrame(lmsys_percentile_data).sort_values('layer')
print(f"\nExtracted percentiles for {len(df_lmsys_percentiles)} vectors")

# Print the row(s) for the layer chosen in the configuration cell
sample_mask = df_lmsys_percentiles['layer'] == layer
print(f"\nSample percentiles for layer {layer}:")
print(df_lmsys_percentiles[sample_mask])


Extracted percentiles for 80 vectors

Sample percentiles for layer 32:
    layer                           vector_name        p1       p25       p50  \
32     32  layer_32/contrast_role_pos3_default1 -1.554688 -0.761719 -0.472656   

         p75  
32 -0.201172  


In [46]:
# One LMSYS experiment per (layer selection, percentile) pair; `percentiles`
# and `layer_selections` are the ones defined in Section 1
lmsys_experiments = build_experiments(
    df_lmsys_percentiles,
    percentiles,
    layer_selections,
    vectors_dict,
)

print(f"\nCreated {len(lmsys_experiments)} LMSYS experiments:")
for experiment in lmsys_experiments:
    n_interventions = len(experiment['interventions'])
    print(f"  - {experiment['id']}: {n_interventions} interventions")


Created 120 LMSYS experiments:
  - layers_0:80-p0.01: 80 interventions
  - layers_0:80-p0.25: 80 interventions
  - layers_0:80-p0.5: 80 interventions
  - layers_0:80-p0.75: 80 interventions
  - layers_0:40-p0.01: 40 interventions
  - layers_0:40-p0.25: 40 interventions
  - layers_0:40-p0.5: 40 interventions
  - layers_0:40-p0.75: 40 interventions
  - layers_40:80-p0.01: 40 interventions
  - layers_40:80-p0.25: 40 interventions
  - layers_40:80-p0.5: 40 interventions
  - layers_40:80-p0.75: 40 interventions
  - layers_32:56-p0.01: 24 interventions
  - layers_32:56-p0.25: 24 interventions
  - layers_32:56-p0.5: 24 interventions
  - layers_32:56-p0.75: 24 interventions
  - layers_36:60-p0.01: 24 interventions
  - layers_36:60-p0.25: 24 interventions
  - layers_36:60-p0.5: 24 interventions
  - layers_36:60-p0.75: 24 interventions
  - layers_40:64-p0.01: 24 interventions
  - layers_40:64-p0.25: 24 interventions
  - layers_40:64-p0.5: 24 interventions
  - layers_40:64-p0.75: 24 intervention

In [47]:
# Show sample experiments
print("\nSample experiment details:")
for i in [0, 4, 8, 12]:
    # Fewer experiments exist when fewer layer selections are generated
    # (e.g. no sliding window fits the model), so skip missing indices
    # instead of raising IndexError.
    if i >= len(lmsys_experiments):
        continue
    exp = lmsys_experiments[i]
    caps = [interv['cap'] for interv in exp['interventions']]
    print(f"\n{exp['id']}:")
    print(f"  Vectors: {len(exp['interventions'])}")
    # min()/max() raise on an empty list, so guard experiments without interventions
    if caps:
        print(f"  Cap range: {min(caps):.2f} to {max(caps):.2f}")
    else:
        print("  Cap range: n/a (no interventions)")
    print(f"  First 3 interventions:")
    for interv in exp['interventions'][:3]:
        print(f"    {interv['vector']}: {interv['cap']:.4f}")


Sample experiment details:

layers_0:80-p0.01:
  Vectors: 80
  Cap range: -4.03 to -0.01
  First 3 interventions:
    layer_0/contrast_role_pos3_default1: -0.0100
    layer_1/contrast_role_pos3_default1: -0.1494
    layer_2/contrast_role_pos3_default1: -0.2266

layers_0:40-p0.01:
  Vectors: 40
  Cap range: -2.47 to -0.01
  First 3 interventions:
    layer_0/contrast_role_pos3_default1: -0.0100
    layer_1/contrast_role_pos3_default1: -0.1494
    layer_2/contrast_role_pos3_default1: -0.2266

layers_40:80-p0.01:
  Vectors: 40
  Cap range: -4.03 to -0.44
  First 3 interventions:
    layer_40/contrast_role_pos3_default1: -2.2188
    layer_41/contrast_role_pos3_default1: -2.1094
    layer_42/contrast_role_pos3_default1: -2.1094

layers_32:56-p0.01:
  Vectors: 24
  Cap range: -2.77 to -1.55
  First 3 interventions:
    layer_32/contrast_role_pos3_default1: -1.5547
    layer_33/contrast_role_pos3_default1: -1.8516
    layer_34/contrast_role_pos3_default1: -2.4688


In [48]:
# Bundle the vectors with the LMSYS experiments and write them to disk
lmsys_config_file = f"{output_dir}/lmsys_10000_config.pt"
lmsys_config = dict(vectors=vectors_dict, experiments=lmsys_experiments)
torch.save(lmsys_config, lmsys_config_file)

# Count the interventions across all experiments for the summary
lmsys_intervention_total = 0
for experiment in lmsys_config['experiments']:
    lmsys_intervention_total += len(experiment['interventions'])

print(f"✓ Saved LMSYS config to {lmsys_config_file}")
print(f"\nConfig summary:")
print(f"  - {len(lmsys_config['vectors'])} vectors")
print(f"  - {len(lmsys_config['experiments'])} experiments")
print(f"  - {lmsys_intervention_total} total interventions")

✓ Saved LMSYS config to /workspace/llama-3.3-70b/capped/configs/lmsys_10000_config.pt

Config summary:
  - 80 vectors
  - 120 experiments
  - 2240 total interventions


## Summary

In [49]:
# Final recap of what was generated and with which settings
banner = "=" * 70
rt_experiment_count = len(rt_config['experiments'])
lmsys_experiment_count = len(lmsys_config['experiments'])
selection_names = list(layer_selections.keys())

print(banner)
print("Config Generation Complete")
print(banner)

print(f"\nModel: {model_name}")
print(f"Total layers: {total_layers}")

print(f"\nGenerated configs:")
print(f"  1. {rt_config_file}")
print(f"     - {rt_experiment_count} experiments")
print(f"     - Based on role/trait projection data")
print(f"\n  2. {lmsys_config_file}")
print(f"     - {lmsys_experiment_count} experiments")
print(f"     - Based on LMSYS-Chat-1M projection data")

print(f"\nPercentiles used: {percentiles}")
print(f"Layer selections: {selection_names}")
print(banner)

Config Generation Complete

Model: llama-3.3-70b
Total layers: 80

Generated configs:
  1. /workspace/llama-3.3-70b/capped/configs/role_trait_config.pt
     - 120 experiments
     - Based on role/trait projection data

  2. /workspace/llama-3.3-70b/capped/configs/lmsys_10000_config.pt
     - 120 experiments
     - Based on LMSYS-Chat-1M projection data

Percentiles used: [0.01, 0.25, 0.5, 0.75]
Layer selections: ['0:80', '0:40', '40:80', '32:56', '36:60', '40:64', '44:68', '48:72', '52:76', '56:80', '32:48', '36:52', '40:56', '44:60', '48:64', '52:68', '56:72', '60:76', '64:80', '32:40', '36:44', '40:48', '44:52', '48:56', '52:60', '56:64', '60:68', '64:72', '68:76', '72:80']
