# steer on some trait vectors

In [None]:
import torch
from torch.functional import F
import os
import json
import sys
import numpy as np
import plotly.graph_objects as go
from transformers import AutoTokenizer

sys.path.append('.')
sys.path.append('..')

from utils.inference_utils import *
from utils.internals import ProbingModel
from utils.steering_utils import ActivationSteering

torch.set_float32_matmul_precision('high')

In [4]:
CHAT_MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
MODEL_SHORT = "llama-3.3-70b"
LAYER = 40 # out of 46

INSTRUCTIONS_DIR = "/root/git/persona-subspace/roles/data/instructions"
VECTOR_DIR = f"/workspace/{MODEL_SHORT}/capped/configs"
OUTPUT_DIR = f"./results/{MODEL_SHORT}/vector_comparison"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
pm = ProbingModel(CHAT_MODEL_NAME)
model = pm.model
tokenizer = pm.tokenizer

In [5]:

vectors = torch.load(f"{VECTOR_DIR}/multi_contrast_vectors.pt", weights_only=False)
vector = vectors[LAYER]['vector']
print(vector.shape)


torch.Size([8192])


In [6]:
norm_obj = json.load(open(f"/workspace/{MODEL_SHORT}/dataset_activations/lmsys_10000/activation_stats.json"))
norm = norm_obj['per_layer_stats'][str(LAYER)]['token_level_norms']['mean']
print(norm)


8.195386976087784


In [7]:
# i have vector and average norm
# as a steering coeff, i want n such that ||vector * n|| = avg_norm * avg_norm_coeff

def get_steering_coeff(vector, avg_norm, avg_norm_coeff):
    vector_norm = torch.norm(vector)
    if vector_norm == 0:
        raise ValueError("Vector has zero norm.")
    return float((avg_norm * avg_norm_coeff) / vector_norm)

In [8]:

# Move vector to the same device as the model's target layer
target_layer = model.model.layers[LAYER]
layer_device = next(target_layer.parameters()).device
print(f"Target layer device: {layer_device}")
print(f"Vector device before: {vector.device}")

# Move vector to the same device as the target layer
vector = vector.to(layer_device)
print(f"Vector device after: {vector.device}")
print(vector.shape)


Target layer device: cuda:1
Vector device before: cpu
Vector device after: cuda:1
torch.Size([8192])


In [9]:
for i in [0, .25, .5, .75, 1, 1.25, 1.5, 1.75, 2]:
    print(f"scaling avg norm by {i}: {get_steering_coeff(vector, norm, i)}")

scaling avg norm by 0: 0.0
scaling avg norm by 0.25: 1.34375
scaling avg norm by 0.5: 2.6875
scaling avg norm by 0.75: 4.03125
scaling avg norm by 1: 5.375
scaling avg norm by 1.25: 6.71875
scaling avg norm by 1.5: 8.0625
scaling avg norm by 1.75: 9.4375
scaling avg norm by 2: 10.75


In [None]:
# Make sure we're using the corrected vector from the previous cell
# (The vector should now be on the correct device)

coeffs = [get_steering_coeff(vector, norm, -2)]
prompts = ["Who are you?"]

for c in coeffs:
    print(f"\n{'='*20} Magnitude: {c:+.1f} {'='*20}")
    
    try:
        with ActivationSteering(
            model=model,
            steering_vectors=vector,
            coefficients=c,
            layer_indices=LAYER,
            intervention_type="addition",
            positions="all"
        ) as steerer:
            for prompt in prompts:
                
                
                print(f"\nPrompt: {prompt}")
                response = pm.generate(prompt, chat_format=True, enable_thinking=False)

                print(f"Response: {response}")
                
    except Exception as e:
        error_msg = f"Error with magnitude {c}: {str(e)}"
        print(f"ERROR: {error_msg}")