# Vectors for additive steering

In [2]:
import torch
import os
import json
import sys
import numpy as np

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import *

In [3]:

model_name = "llama-3.1-70b"
total_layers = 80
base_dir = f"/workspace/{model_name}"
target_layer = 40

vectors_file = f"{base_dir}/evals/configs/multi_pc1_vectors.pt"
output_dir = f"{base_dir}/evals/configs"

os.makedirs(output_dir, exist_ok=True)

print(f"Model: {model_name}")
print(f"Total layers: {total_layers}")
print(f"Vectors file: {vectors_file}")

Model: llama-3.1-70b
Total layers: 80
Vectors file: /workspace/llama-3.1-70b/evals/configs/multi_pc1_vectors.pt


## Make vectors from PC1

In [5]:
contrast_vectors = torch.load(f"{base_dir}/capped/configs/contrast/multi_contrast_vectors.pt", weights_only=False)

pc1_stacked = torch.load(f"{base_dir}/roles_240/pc1_vectors.pt")
print(pc1_stacked.shape)

torch.Size([80, 8192])


In [6]:
pc1_vectors = []

for i in range(total_layers):
    pc1 = pc1_stacked[i]

    contrast_vector = contrast_vectors[i]['vector']

    if torch.nn.functional.cosine_similarity(pc1.reshape(1, -1), contrast_vector.reshape(1, -1)) < 0:
        pc1 = -pc1

    pc1_vectors.append(pc1)

In [8]:
vectors = []
for l in range(total_layers):
    vectors.append({
        'scaler': None,
        'name': f"layer_{l}/role_pc1_mean_pos23",
        'vector': pc1_vectors[l],
        'layer': l
    })
torch.save(vectors, vectors_file)

## Make config


In [9]:
# i have vector and average norm
# as a steering coeff, i want n such that ||vector * n|| = avg_norm * avg_norm_coeff
other_coeffs_rp = [-0.5, -0.25, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2]
gemma_coeffs_rp = [-0.05, -0.025, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2]
other_coeffs_asst = [0.5, 0.25, -0.25, -0.5, -0.75, -1, -1.25, -1.5, -1.75, -2]
gemma_coeffs_asst = [0.05, 0.025, -0.025, -0.05, -0.075, -0.1, -0.125, -0.15, -0.175, -0.2]

def get_steering_coeff(vector, avg_norm, avg_norm_coeff):
    vector_norm = torch.norm(vector)
    if vector_norm == 0:
        raise ValueError("Vector has zero norm.")
    return float((avg_norm * avg_norm_coeff) / vector_norm)


In [10]:
norm_obj = json.load(open(f"/workspace/{model_name}/dataset_activations/lmsys_10000/activation_stats.json"))
norm = norm_obj['per_layer_stats'][str(target_layer)]['token_level_norms']['mean']
print(norm)


8.499666675467317


In [11]:
# make experiment config with target layer
vectors = torch.load(vectors_file, weights_only=False)
target_vector = vectors[target_layer]['vector']

if model_name == "gemma-2-27b":
    rp = gemma_coeffs_rp
    asst = gemma_coeffs_asst
else:
    rp = other_coeffs_rp
    asst = other_coeffs_asst

rp_coeffs = [get_steering_coeff(target_vector, norm, c) for c in rp]
asst_coeffs = [get_steering_coeff(target_vector, norm, c) for c in asst]
print(rp_coeffs)
print(asst_coeffs)



[-4.249831676483154, -2.124915838241577, 2.124915838241577, 4.249831676483154, 6.3747477531433105, 8.499663352966309, 10.624579429626465, 12.749495506286621, 14.874410629272461, 16.999326705932617]
[4.249831676483154, 2.124915838241577, -2.124915838241577, -4.249831676483154, -6.3747477531433105, -8.499663352966309, -10.624579429626465, -12.749495506286621, -14.874410629272461, -16.999326705932617]


In [12]:
rp_experiments = []
asst_experiments = []

for m, c in zip(rp, rp_coeffs):
    exp_id = f"layer_{target_layer}-role_pc1-coeff:{m}"
    interventions = [
        {
            'vector': f"layer_{target_layer}/role_pc1_mean_pos23",
            'coeff': c
        }
    ]
    rp_experiments.append({
        'id': exp_id,
        'interventions': interventions
    })

for m, c in zip(asst, asst_coeffs):
    exp_id = f"layer_{target_layer}-role_pc1-coeff:{m}"
    interventions = [
        {
            'vector': f"layer_{target_layer}/role_pc1_mean_pos23",
            'coeff': c
        }
    ]
    asst_experiments.append({
        'id': exp_id,
        'interventions': interventions
    })

print("\nSample RP experiments:")
for i in range(0, len(rp_experiments), len(rp_experiments)//4):
    exp = rp_experiments[i]
    print(f"\n{exp['id']}:")
    print(f"  Vector: {exp['interventions'][0]['vector']}")
    print(f"  Coeff: {exp['interventions'][0]['coeff']:.4f}")

print("\nSample Assistant experiments:")
for i in range(0, len(asst_experiments), len(asst_experiments)//4):
    exp = asst_experiments[i]
    print(f"\n{exp['id']}:")
    print(f"  Vector: {len(exp['interventions'])}")
    print(f"  Coeff: {exp['interventions'][0]['coeff']:.4f}")





Sample RP experiments:

layer_40-role_pc1-coeff:-0.5:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: -4.2498

layer_40-role_pc1-coeff:0.25:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: 2.1249

layer_40-role_pc1-coeff:0.75:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: 6.3747

layer_40-role_pc1-coeff:1.25:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: 10.6246

layer_40-role_pc1-coeff:1.75:
  Vector: layer_40/role_pc1_mean_pos23
  Coeff: 14.8744

Sample Assistant experiments:

layer_40-role_pc1-coeff:0.5:
  Vector: 1
  Coeff: 4.2498

layer_40-role_pc1-coeff:-0.25:
  Vector: 1
  Coeff: -2.1249

layer_40-role_pc1-coeff:-0.75:
  Vector: 1
  Coeff: -6.3747

layer_40-role_pc1-coeff:-1.25:
  Vector: 1
  Coeff: -10.6246

layer_40-role_pc1-coeff:-1.75:
  Vector: 1
  Coeff: -14.8744


In [14]:
contrast_vectors = torch.load(f"/workspace/{model_name}/capped/configs/contrast/multi_contrast_vectors.pt", weights_only=False)
target_contrast_vector = contrast_vectors[target_layer]['vector']

if model_name == "gemma-2-27b":
    rp = gemma_coeffs_rp
    asst = gemma_coeffs_asst
else:
    rp = other_coeffs_rp
    asst = other_coeffs_asst

rp_coeffs = [get_steering_coeff(target_contrast_vector, norm, c) for c in rp]
asst_coeffs = [get_steering_coeff(target_contrast_vector, norm, c) for c in asst]
print(rp_coeffs)
print(asst_coeffs)

[-2.665227174758911, -1.3326135873794556, 1.3326135873794556, 2.665227174758911, 3.9978408813476562, 5.330454349517822, 6.663067817687988, 7.9956817626953125, 9.32829475402832, 10.660908699035645]
[2.665227174758911, 1.3326135873794556, -1.3326135873794556, -2.665227174758911, -3.9978408813476562, -5.330454349517822, -6.663067817687988, -7.9956817626953125, -9.32829475402832, -10.660908699035645]


In [15]:
for m, c in zip(rp, rp_coeffs):
    exp_id = f"layer_{target_layer}-contrast-coeff:{m}"
    interventions = [
        {
            'vector': f"layer_{target_layer}/contrast_role_pos3_default1",
            'coeff': c
        }
    ]
    rp_experiments.append({
        'id': exp_id,
        'interventions': interventions
    })

for m, c in zip(asst, asst_coeffs):
    exp_id = f"layer_{target_layer}-contrast-coeff:{m}"
    interventions = [
        {
            'vector': f"layer_{target_layer}/contrast_role_pos3_default1",
            'coeff': c
        }
    ]
    asst_experiments.append({
        'id': exp_id,
        'interventions': interventions
    })

print(len(rp_experiments))
print(len(asst_experiments))

20
20


In [16]:
vectors_dict = {}
vectors_dict[f'layer_{target_layer}/role_pc1_mean_pos23'] = {
    'vector': target_vector,
    'layer': target_layer,
}
vectors_dict[f'layer_{target_layer}/contrast_role_pos3_default1'] = {
    'vector': target_contrast_vector,
    'layer': target_layer,
}

for v in vectors_dict:
    print("vector name: ", v)
    print(f"vector: {vectors_dict[v]['vector'][:5]} with shape {vectors_dict[v]['vector'].shape}")
    print("layer: ", vectors_dict[v]['layer'])


vector name:  layer_40/role_pc1_mean_pos23
vector: tensor([ 0.0053, -0.0019, -0.0059,  0.0021, -0.0085]) with shape torch.Size([8192])
layer:  40
vector name:  layer_40/contrast_role_pos3_default1
vector: tensor([-0.0050,  0.0009, -0.0058, -0.0027, -0.0035]) with shape torch.Size([8192])
layer:  40


In [17]:
rp_config_file = f"{output_dir}/rp_pc1_contrast_config.pt"

rp_config = {
    'vectors': vectors_dict,
    'experiments': rp_experiments
}

torch.save(rp_config, rp_config_file)

asst_config_file = f"{output_dir}/asst_pc1_contrast_config.pt"

asst_config = {
    'vectors': vectors_dict,
    'experiments': asst_experiments
}

torch.save(asst_config, asst_config_file)