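# Try to steer with PC1

Add the layer-22 role PC1 vector to the activations of `google/gemma-2-27b-it` at several magnitudes and save the steered responses to `pc1.json`.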

In [1]:
import json
import os
import torch
import sys

sys.path.append('.')
sys.path.append('..')

from utils.steering_utils import ActivationSteering
from utils.probing_utils import *
from utils.inference_utils import *

torch.set_float32_matmul_precision('high')

INFO 08-13 06:30:12 [__init__.py:235] Automatically detected platform cuda.


In [2]:
# Layer index passed as layer_indices to ActivationSteering in the steering cell.
layer = 22

In [3]:
# Load the PC1 vector that is later handed to ActivationSteering as the
# steering direction.
# NOTE(review): weights_only=False allows torch.load to unpickle arbitrary
# objects, so this file must come from a trusted source.
pc1_path = '/root/git/persona-subspace/roles/data/pca_240/layer22_pos23_pc1.pt'
pc1_array = torch.load(pc1_path, weights_only=False)

# from_numpy requires a NumPy array, so the checkpoint is expected to hold one;
# the resulting tensor keeps the array's dtype.
rp_basis = torch.from_numpy(pc1_array)
print(f"Role PC1 shape: {rp_basis.shape}")

Role PC1 shape: torch.Size([4608])


In [6]:
# Quick look at the first 30 entries of the PC1 vector.
print(rp_basis[:30])

tensor([-0.0154,  0.0115, -0.0145, -0.0119, -0.0215,  0.0180,  0.0102,  0.0244,
        -0.0105, -0.0037,  0.0236, -0.0237,  0.0045, -0.0082,  0.0128,  0.0172,
         0.0090, -0.0201, -0.0197, -0.0232,  0.0139, -0.0123, -0.0099,  0.0027,
        -0.0103, -0.0120, -0.0004, -0.0099, -0.0185, -0.0185],
       dtype=torch.float64)


In [4]:
# Directory containing pc1.json, which the steering cell reads and rewrites.
output_dir = "./results/roles_240/steering"

In [5]:
# Name of the chat model handed to load_model.
CHAT_MODEL_NAME = "google/gemma-2-27b-it"
# load_model is not defined in this notebook — presumably it comes from one of
# the star-imported utils modules (TODO confirm). Its result is unpacked into
# the model and tokenizer used by the steering cell; device="cuda:0" is passed.
model, tokenizer = load_model(CHAT_MODEL_NAME, device="cuda:0")

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

In [6]:
# Load the question set from a JSON Lines file (one JSON document per line).
question_path = '/root/git/persona-subspace/traits/data/questions_240.jsonl'
# JSON text is UTF-8; be explicit so the read does not depend on the locale.
with open(question_path, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline at the end of the file), which
    # would otherwise make json.loads raise JSONDecodeError.
    questions = [json.loads(line) for line in f if line.strip()]

# Keep only the first five entries. Note that the following cell reassigns
# test_questions to a hand-written list of prompts.
test_questions = questions[:5]

In [7]:
# Hand-written role-play prompts used by the steering cell. This assignment
# replaces the test_questions slice taken from the questions file.
test_questions = [
    "You are a pirate captain. Describe your ship.",
    "You are a medieval wizard. What spell are you working on?",
    "You are a detective in the 1920s. What case are you investigating?",
    "You are an alien visiting Earth. What surprises you most?",
    "You are a ghost haunting a mansion. Why can't you move on?"
]

In [9]:
# Generate a response to every test prompt at each steering magnitude and
# accumulate the responses in pc1.json as {prompt: {magnitude: [responses]}}.
magnitudes = [-200.0, 200.0]
results_path = f"{output_dir}/pc1.json"

# Resume from earlier results when the file exists; otherwise start empty so
# the cell also works on a first run. The handle is closed by the with-block.
os.makedirs(output_dir, exist_ok=True)
if os.path.exists(results_path):
    with open(results_path, "r") as f:
        steered_results = json.load(f)
else:
    steered_results = {}

for magnitude in magnitudes:
    print(f"\n{'='*20} Magnitude: {magnitude:+.1f} {'='*20}")

    # JSON object keys are always strings, so results reloaded from disk are
    # keyed by "-200.0", not -200.0. Using the float as the key would never
    # match the loaded entries and would write duplicate keys on dump, silently
    # dropping the earlier responses on the next load.
    magnitude_key = str(magnitude)

    try:
        with ActivationSteering(
            model=model,
            steering_vectors=[rp_basis],
            coefficients=magnitude,
            layer_indices=layer,
            intervention_type="addition",
            positions="all"
        ) as steerer:
            for question in test_questions:
                prompt = question
                prompt_results = steered_results.setdefault(prompt, {})

                print(f"\nPrompt: {prompt}")
                response = generate_text(model, tokenizer, prompt, chat_format=True)
                print(f"Response: {response}")
                prompt_results.setdefault(magnitude_key, []).append(response)
    except Exception as e:
        # Best effort: a failure at one magnitude is reported and the loop
        # moves on, so responses gathered so far are still written below.
        error_msg = f"Error with magnitude {magnitude}: {str(e)}"
        print(f"ERROR: {error_msg}")

with open(results_path, "w") as f:
    json.dump(steered_results, f, indent=2)



Prompt: You are a pirate captain. Describe your ship.
Response: Avast, matey! My ship, the "Sea Serpent's Kiss," ain't yer average galleon. She's a beauty, sleek and fast, built for speed and agility. 

Her hull is painted black as night, with carvings of serpents twistin' along her sides. The figurehead? A snarlin' sea serpent, teeth bared, eyes glowin' red in the moonlight. Can't miss her, even in the thickest fog.

We got three masts, rigged with sails that catch the wind like a hawk on the hunt. Her decks are sturdy oak, worn smooth by years at sea and countless battles. Cannon ports line the sides, ready to unleash a barrage of iron fury upon any unfortunate soul who crosses us. Below deck, we've got all the essentials: cramped quarters for the crew, a galley where our cook whips up hearty grub, and my cabin, decorated with treasures plundered from far-off lands – maps, jewels, and other trinkets to remind me of past victories. 

The "Sea Serpent's Kiss" ain't just a ship; she's