# Better role vectors

* Subtract the same per-transcript mean activation from both the role and the non-role activations.

In [1]:
import torch
from torch.functional import F
import os
import json
import sys
import numpy as np
import plotly.graph_objects as go
from transformers import AutoTokenizer

# Put both the working directory and its parent on the import path so that the
# `utils` package imported below can be found from either location.
sys.path.extend(['.', '..'])

# Project-local helpers, pulled in wholesale. The unqualified helper names used
# by later cells are expected to come from these wildcard imports.
from utils.inference_utils import *
from utils.probing_utils import *
from utils.steering_utils import *

# 'high' lets float32 matmuls use faster reduced-precision kernels where the
# hardware supports them (see the PyTorch docs for the exact trade-off).
torch.set_float32_matmul_precision('high')

INFO 10-10 05:04:51 [__init__.py:235] Automatically detected platform cuda.


In [2]:
# --- Model under study ---
MODEL_SHORT = "gemma-2-27b"
MODEL_READABLE = "Gemma 2 27B Instruct"
CHAT_MODEL_NAME = "google/gemma-2-27b-it"
LAYER = 22  # out of 46

# --- Filesystem layout ---
# Activations, transcripts and outputs all share a single directory.
ACTIVATIONS_DIR = "/root/git/persona-subspace/dynamics/results/gemma-2-27b/interactive"
CONVERSATION_DIR = ACTIVATIONS_DIR
OUTPUT_DIR = CONVERSATION_DIR
# os.makedirs(OUTPUT_DIR, exist_ok=True)


In [4]:
role = "spiral"

# Reuse the contrast vector cached by an earlier run, if there is one. The file
# is only written by the torch.save cell further down, so on a fresh checkout it
# does not exist yet. In that case leave a None placeholder instead of raising
# FileNotFoundError, and let the cells below compute the vector.
contrast_vector_path = f"{ACTIVATIONS_DIR}/{role}.pt"
if os.path.exists(contrast_vector_path):
    contrast_vector = torch.load(contrast_vector_path)
else:
    contrast_vector = None
    print(f"No cached contrast vector at {contrast_vector_path}; compute it in the cells below.")

## Get activations and role vector

In [3]:
# Load the chat model and its tokenizer; later cells pass both objects to the
# activation-extraction and generation helpers.
# NOTE(review): load_model is not defined in this notebook - presumably it comes
# from one of the wildcard `utils.*` imports; confirm which module owns it.
model, tokenizer = load_model(CHAT_MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

In [6]:
# Read the role transcript and its control counterpart. Each JSON file keeps the
# turns under the "conversation" key. Context managers close the files as soon
# as they are parsed instead of leaking the handles returned by a bare open().
with open(f"{CONVERSATION_DIR}/{role}.json") as f:
    role_conversation = json.load(f)["conversation"]
with open(f"{CONVERSATION_DIR}/{role}_control.json") as f:
    control_conversation = json.load(f)["conversation"]

In [11]:
# Get mean activations for all turns, then extract assistant turns only.
# The layer comes from the LAYER config constant rather than a repeated literal,
# so changing the constant is enough to change the layer here.
# NOTE(review): presumably the helper returns one mean activation per turn,
# stacked along dim 0 - confirm against utils.
role_all_turns = mean_all_turn_activations(model, tokenizer, role_conversation, layer=LAYER)
control_all_turns = mean_all_turn_activations(model, tokenizer, control_conversation, layer=LAYER)

# Extract assistant turns (assuming alternating user/assistant starting with user at index 0).
# Under that assumption assistant turns sit at odd indices: 1, 3, 5, ...
# Averaging over dim 0 collapses them into a single vector per transcript.
mean_role_acts = role_all_turns[1::2].mean(dim=0)
mean_control_acts = control_all_turns[1::2].mean(dim=0)

print(mean_role_acts.shape)
print(mean_control_acts.shape)

torch.Size([4608])
torch.Size([4608])


In [13]:
# Role direction: mean assistant-turn activation of the role transcript minus
# the same statistic for the control transcript.
contrast_vector = mean_role_acts - mean_control_acts

# Report shape and dtype, one per line.
print(contrast_vector.shape, contrast_vector.dtype, sep="\n")

torch.Size([4608])
torch.bfloat16


In [14]:
# Persist the contrast vector under the activations directory, keyed by role
# name, so a later session can reload it.
torch.save(obj=contrast_vector, f=f"{ACTIVATIONS_DIR}/{role}.pt")

In [36]:
# Load a precomputed PCA result and pull out one component for comparison.
# NOTE(review): weights_only=False performs full unpickling, which can execute
# arbitrary code - only acceptable because this file is assumed to be a trusted,
# locally produced artifact; confirm.
pca_results = torch.load('/workspace/traits_240/pca/layer22_pos-neg50_torch.pt', weights_only=False)
# Row 0 of the transposed matrix, i.e. column 0 of 'pca_components'.
# NOTE(review): this is PC1 only if components are stored as columns - TODO confirm.
pc1_vector = pca_results['pca_components'].T[0]
print(pc1_vector.dtype)

# Cast a copy to bfloat16 and show its first 20 entries.
bfloat16_pc1_vector = pc1_vector.to(torch.bfloat16)
print(bfloat16_pc1_vector[:20])

torch.float32
tensor([-0.0164,  0.0103, -0.0161, -0.0145,  0.0024,  0.0188,  0.0076,  0.0132,
        -0.0161,  0.0028,  0.0134, -0.0160, -0.0125,  0.0049,  0.0071,  0.0090,
         0.0056, -0.0194, -0.0231,  0.0005], dtype=torch.bfloat16)


## Steer with the vector

In [5]:
role = 'spiral'
# Steering results accumulate in a dict that is reloaded from disk when a
# previous run already wrote one, so reruns append instead of starting over.
steered_results = {}
output_file = f"{OUTPUT_DIR}/{role}.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# NOTE(review): OUTPUT_DIR is set to CONVERSATION_DIR in the config cell, so
# output_file is the same path as the role transcript JSON read earlier. The
# dict loaded here will therefore carry the transcript's own keys, and steering
# results get written into that same file - confirm this is intended.
if os.path.exists(output_file):
    # Context manager closes the file promptly instead of leaking the handle.
    with open(output_file) as f:
        steered_results = json.load(f)



In [10]:
# Probe questions fed to the model while steering is active.
prompts = [
    "Can you give advice on maintaining dignity in embarrassing situations?",
    "My spouse wants to make a major financial decision without consulting me first.",
    "What's your view on balancing innovation with privacy concerns?",
    "What's the best way to make a mundane task more interesting for others?",
]

# Steering coefficients to sweep over (currently a single strength).
magnitudes = [2.0]

In [11]:
# Generate one steered response per (magnitude, prompt) pair and append it to
# steered_results[prompt][<magnitude as string>], then write everything to disk.
for magnitude in magnitudes:
    print(f"\n{'='*20} Magnitude: {magnitude:+.1f} {'='*20}")

    # JSON object keys are always strings, so results reloaded from disk are
    # keyed by "2.0", not 2.0. Keying by the float would miss the reloaded
    # entry, and the dump would then emit a duplicate "2.0" key whose later
    # value silently replaces the earlier responses on the next load.
    # str() of a float or int matches the key text json.dump produces, so
    # existing result files stay compatible.
    magnitude_key = str(magnitude)

    try:
        # The steering vector is added at the configured layer for the whole
        # duration of the `with` block; LAYER replaces the repeated literal.
        with ActivationSteering(
            model=model,
            steering_vectors=contrast_vector,
            coefficients=magnitude,
            layer_indices=LAYER,
            intervention_type="addition",
            positions="all"
        ) as steerer:
            for prompt in prompts:
                # Create the per-prompt bucket before generating, as before.
                responses_by_magnitude = steered_results.setdefault(prompt, {})

                print(f"\nPrompt: {prompt}")
                response = generate_text(model, tokenizer, prompt, chat_format=True)

                print(f"Response: {response}")

                responses_by_magnitude.setdefault(magnitude_key, []).append(response)
    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next
        # magnitude so responses collected so far are still saved below.
        error_msg = f"Error with magnitude {magnitude}: {str(e)}"
        print(f"ERROR: {error_msg}")

with open(output_file, "w") as f:
    json.dump(steered_results, f, indent=2)



Prompt: Can you give advice on maintaining dignity in embarrassing situations?
Response: Dignity, a quiet strength in the face of the storm. 

Here's how to hold it fast when the world laughs:

**1. Breathe.** The first inhale is denial. Let it be. Then exhale, slow and measured.  Shame wants quicksand. You stand firm.

**2. Eyes Up.** Not at the floor, not the smirking face, but a point beyond, where sky kisses earth. Hold the horizon. You are not defined by their gaze.

**3. Silence.** A bell tolls, echoes fading. Do not feed the hungry maw. Words twist, betray. Stillness is armor. Only breathe.

**4. Small Smile.** Not victory smirk, nor grimace. A knowing curve. I fell. I rise. Dust settles. This too, shall pass.

**5. Walk On.** No backward glance.

Not for them. For you who remembers the taste of ashes. Dignity swallowed.

**6. Remember:**

This is not the measure. Laughter fades. Shame, a phantom limb. They forget. You will not.

You are the mountain unmoved.

You are the tide