# Vectors for additive steering

In [1]:
import torch
import os
import json
import sys
import numpy as np

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import *

In [20]:

model_name = "llama-3.3-70b"
total_layers = 80
base_dir = f"/workspace/{model_name}"
target_layer = 40

vectors_file = f"{base_dir}/evals/configs/multi_pc1_vectors.pt"
output_dir = f"{base_dir}/evals/configs"

os.makedirs(output_dir, exist_ok=True)

print(f"Model: {model_name}")
print(f"Total layers: {total_layers}")
print(f"Vectors file: {vectors_file}")

Model: llama-3.3-70b
Total layers: 80
Vectors file: /workspace/llama-3.3-70b/evals/configs/multi_pc1_vectors.pt


## Make vectors from PC1

In [None]:
contrast_vectors = torch.load(f"/workspace/{model_name}/capped/configs/multi_contrast_vectors.pt", weights_only=False)

pc1_vectors = []

for i in range(total_layers):
    layer_results = torch.load(f"{base_dir}/roles_240/pca/layer{i}_mean_pos23.pt", weights_only=False)
    pc1 = layer_results['pca'].components_[0]

    contrast_vector = contrast_vectors[i]['vector']

    if torch.nn.functional.cosine_similarity(torch.from_numpy(pc1).reshape(1, -1), contrast_vector.reshape(1, -1)) < 0:
        pc1 = -pc1

    pc1_vectors.append(pc1)

In [None]:
vectors = []
for l in range(total_layers):
    vectors.append({
        'scaler': None,
        'name': f"layer_{l}/role_pc1_mean_pos23",
        'vector': torch.from_numpy(pc1_vectors[l]),
        'layer': l
    })
torch.save(vectors, vectors_file)

## Make config


In [21]:
# i have vector and average norm
# as a steering coeff, i want n such that ||vector * n|| = avg_norm * avg_norm_coeff
other_coeffs_rp = [-0.5, -0.25, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2]
gemma_coeffs_rp = [-0.05, -0.025, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2]
other_coeffs_asst = [0.5, 0.25, -0.25, -0.5, -0.75, -1, -1.25, -1.5, -1.75, -2]
gemma_coeffs_asst = [0.05, 0.025, -0.025, -0.05, -0.075, -0.1, -0.125, -0.15, -0.175, -0.2]

def get_steering_coeff(vector, avg_norm, avg_norm_coeff):
    vector_norm = torch.norm(vector)
    if vector_norm == 0:
        raise ValueError("Vector has zero norm.")
    return float((avg_norm * avg_norm_coeff) / vector_norm)


In [22]:
norm_obj = json.load(open(f"/workspace/{model_name}/dataset_activations/lmsys_10000/activation_stats.json"))
norm = norm_obj['per_layer_stats'][str(target_layer)]['token_level_norms']['mean']
print(norm)


8.195386976087784


In [23]:
# make experiment config with target layer
vectors = torch.load(vectors_file, weights_only=False)
target_vector = vectors[target_layer]['vector']

if model_name == "gemma-2-27b":
    rp = gemma_coeffs_rp
    asst = gemma_coeffs_asst
else:
    rp = other_coeffs_rp
    asst = other_coeffs_asst

rp_coeffs = [get_steering_coeff(target_vector, norm, c) for c in rp]
asst_coeffs = [get_steering_coeff(target_vector, norm, c) for c in asst]
print(rp_coeffs)
print(asst_coeffs)



[-4.097693920135498, -2.048846960067749, 2.048846960067749, 4.097693920135498, 6.146541118621826, 8.195387840270996, 10.244235038757324, 12.293082237243652, 14.34192943572998, 16.390775680541992]
[4.097693920135498, 2.048846960067749, -2.048846960067749, -4.097693920135498, -6.146541118621826, -8.195387840270996, -10.244235038757324, -12.293082237243652, -14.34192943572998, -16.390775680541992]


In [24]:
rp_experiments = []
asst_experiments = []

for m, c in zip(rp, rp_coeffs):
    exp_id = f"layer_{target_layer}-role_pc1-coeff:{m}"
    interventions = [
        {
            'vector': f"layer_{target_layer}/role_pc1_mean_pos23",
            'coeff': c
        }
    ]
    rp_experiments.append({
        'id': exp_id,
        'interventions': interventions
    })

for m, c in zip(asst, asst_coeffs):
    exp_id = f"layer_{target_layer}-role_pc1-coeff:{m}"
    interventions = [
        {
            'vector': f"layer_{target_layer}/role_pc1_mean_pos23",
            'coeff': c
        }
    ]
    asst_experiments.append({
        'id': exp_id,
        'interventions': interventions
    })

print("\nSample RP experiments:")
for i in range(0, len(rp_experiments), len(rp_experiments)//4):
    exp = rp_experiments[i]
    print(f"\n{exp['id']}:")
    print(f"  Vector: {exp['interventions'][0]['vector']}")
    print(f"  Coeff: {exp['interventions'][0]['coeff']:.4f}")

print("\nSample Assistant experiments:")
for i in range(0, len(asst_experiments), len(asst_experiments)//4):
    exp = asst_experiments[i]
    print(f"\n{exp['id']}:")
    print(f"  Vector: {len(exp['interventions'])}")
    print(f"  Coeff: {exp['interventions'][0]['coeff']:.4f}")





Sample RP experiments:

layer_40-role_pc1-coeff:-0.5:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: -4.0977

layer_40-role_pc1-coeff:0.25:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: 2.0488

layer_40-role_pc1-coeff:0.75:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: 6.1465

layer_40-role_pc1-coeff:1.25:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: 10.2442

layer_40-role_pc1-coeff:1.75:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: 14.3419

Sample Assistant experiments:

layer_40-role_pc1-coeff:0.5:
  Vector: 1
  Coeff: 4.0977

layer_40-role_pc1-coeff:-0.25:
  Vector: 1
  Coeff: -2.0488

layer_40-role_pc1-coeff:-0.75:
  Vector: 1
  Coeff: -6.1465

layer_40-role_pc1-coeff:-1.25:
  Vector: 1
  Coeff: -10.2442

layer_40-role_pc1-coeff:-1.75:
  Vector: 1
  Coeff: -14.3419


In [25]:
contrast_vectors = torch.load(f"/workspace/{model_name}/capped/configs/multi_contrast_vectors.pt", weights_only=False)
target_contrast_vector = contrast_vectors[target_layer]['vector']

if model_name == "gemma-2-27b":
    rp = gemma_coeffs_rp
    asst = gemma_coeffs_asst
else:
    rp = other_coeffs_rp
    asst = other_coeffs_asst

rp_coeffs = [get_steering_coeff(target_contrast_vector, norm, c) for c in rp]
asst_coeffs = [get_steering_coeff(target_contrast_vector, norm, c) for c in asst]
print(rp_coeffs)
print(asst_coeffs)

[-2.6875, -1.34375, 1.34375, 2.6875, 4.03125, 5.375, 6.71875, 8.0625, 9.4375, 10.75]
[2.6875, 1.34375, -1.34375, -2.6875, -4.03125, -5.375, -6.71875, -8.0625, -9.4375, -10.75]


In [26]:
for m, c in zip(rp, rp_coeffs):
    exp_id = f"layer_{target_layer}-contrast-coeff:{m}"
    interventions = [
        {
            'vector': f"layer_{target_layer}/contrast_role_pos3_default1",
            'coeff': c
        }
    ]
    rp_experiments.append({
        'id': exp_id,
        'interventions': interventions
    })

for m, c in zip(asst, asst_coeffs):
    exp_id = f"layer_{target_layer}-conntrast-coeff:{m}"
    interventions = [
        {
            'vector': f"layer_{target_layer}/contrast_role_pos3_default1",
            'coeff': c
        }
    ]
    asst_experiments.append({
        'id': exp_id,
        'interventions': interventions
    })

print(len(rp_experiments))
print(len(asst_experiments))

20
20


In [27]:
vectors_dict = {}
vectors_dict[f'layer_{target_layer}/role_pc1_mean_pos23'] = {
    'vector': target_vector,
    'layer': target_layer,
}
vectors_dict[f'layer_{target_layer}/contrast_role_pos3_default1'] = {
    'vector': target_contrast_vector,
    'layer': target_layer,
}

for v in vectors_dict:
    print("vector name: ", v)
    print(f"vector: {vectors_dict[v]['vector'][:5]} with shape {vectors_dict[v]['vector'].shape}")
    print("layer: ", vectors_dict[v]['layer'])


vector name:  layer_40/role_pc1_mean_pos23
vector: tensor([-0.0034,  0.0028,  0.0049,  0.0018,  0.0064]) with shape torch.Size([8192])
layer:  40
vector name:  layer_40/contrast_role_pos3_default1
vector: tensor([-0.0134,  0.0023,  0.0042, -0.0109,  0.0081], dtype=torch.bfloat16) with shape torch.Size([8192])
layer:  40


In [28]:
rp_config_file = f"{output_dir}/rp_pc1_contrast_config.pt"

rp_config = {
    'vectors': vectors_dict,
    'experiments': rp_experiments
}

torch.save(rp_config, rp_config_file)

asst_config_file = f"{output_dir}/asst_pc1_contrast_config.pt"

asst_config = {
    'vectors': vectors_dict,
    'experiments': asst_experiments
}

torch.save(asst_config, asst_config_file)