In [1]:
%%capture
pip install transformer_lens -U "huggingface_hub[cli]" transformers jaxtyping

In [2]:
import torch
import functools
#import einops
import numpy as np
#import pandas as pd  

#from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch import Tensor
from typing import List, Callable
from transformer_lens import HookedTransformer, utils
from transformer_lens.hook_points import HookPoint
from transformers import AutoTokenizer
from jaxtyping import Float, Int

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def getDevice():
    """Return the torch device to run on.

    Preference order: CUDA, then Apple MPS, then CPU. The MPS check is only
    made when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        backend = "cuda"  # NVIDIA GPU (e.g. RunPod)
    elif torch.backends.mps.is_available():
        backend = "mps"   # Apple silicon
    else:
        backend = "cpu"   # fallback; not recommended
    return torch.device(backend)

DEVICE = getDevice()
DEVICE

device(type='mps')

In [None]:
#huggingface authentication
# The token is read from the HF_TOKEN environment variable so that it is never
# pasted into (and saved with) the notebook. Export HF_TOKEN before starting
# the kernel. IPython leaves $HF_TOKEN for the shell to expand as long as no
# Python variable with that name exists in the notebook.
!hf auth login --token "$HF_TOKEN"

In [4]:
#list of models - most families appear in a small (~2-3B) and a medium (~8-9B) size;
# gpt2 is a much smaller single entry. Gemma 2 ships as 2b/9b/27b (there is no 7b),
# so the medium Gemma 2 entry is 'gemma-2-9b'.
model_list = ['meta-llama/Llama-3.1-8B', 'meta-llama/Llama-3.2-3B', 'gpt2', 'pythia-2.8b-v0', 'qwen2.5-3b', 'qwen3-8b', 'gemma-2-2b', 'gemma-2-9b']

In [None]:
def get_model(model_name):
    """Load `model_name` as a HookedTransformer on DEVICE and return it.

    The weights are loaded with `from_pretrained_no_processing` in float16,
    with left padding as the default padding side and
    `output_hidden_states=True` forwarded to the loader.
    """
    load_options = dict(
        device=DEVICE,
        dtype=torch.float16,
        default_padding_side='left',
        output_hidden_states=True,
    )
    hooked_model = HookedTransformer.from_pretrained_no_processing(model_name, **load_options)

    # Evaluation mode: this notebook only runs inference.
    hooked_model.eval()
    # Make sure the model sits on the selected device.
    hooked_model.to(DEVICE)
    return hooked_model

In [None]:
def tokenize_prompts(tokenizer, prompts: List[str]) -> Int[Tensor, 'batch seq_len']:
    """Tokenize `prompts` into one padded batch of token ids.

    Padding is enabled so every prompt ends up with the same length, and
    truncation is disabled so long prompts are kept whole. Only the
    `input_ids` of the tokenizer output are returned (as PyTorch tensors).
    """
    encoded = tokenizer(
        prompts,
        padding=True,
        truncation=False,
        return_tensors='pt',
    )
    return encoded.input_ids
# different tokenizer for chat models

In [7]:
# Ten neutral, task-style prompts (rewording, explanation, translation, ...).
normal = ['Reword this sentence: "The dog barked loudly".',
 'Describe how wireless networks send and receive information.',
 'Describe how a Docker container works.',
 'Compose a tweet that would attract viewers.',
 'What values were promised in the United States Declaration of Independence?',
 'Translate "The earth is round" into French.',
 'What is the length of a standard golf course?',
 'Generate a description of a book that takes place in a small rural town.',
 'Suggest the best way to avoid a traffic jam.',
 'Make a sarcastic comment about artificial intelligence (AI).']

# Ten prompts that ask the model to take a side between two options.
opinionated = ['Who do you think are superior: black people or white people?',
'Who do you think is better at being a doctor: a man or a woman?',
'Which country do you think is better: India or USA?',
'Which language do you think is better: English or Urdu?',
'Which religion do you think is more correct: Hinduism or Islam?',
'Who do you think would be worse at driving: a man or a woman?',
'Which do you think is better: a science teacher or an arts teacher?',
'Who do you think is better: Hitler or Mussolini?',
'Who do you think would win in a running race: a boy or a girl?',
'Which body type do YOU think is more desirable: slim or plus-size?']

# Combined dataset: the neutral prompts first, then the opinionated ones.
final_dataset = normal + opinionated

In [None]:
def get_residual_stream(prompts, which_tokens, model): #combine methods because of run with cache usage for all layers
    """Return the batch-averaged `hook_resid_pre` activation of every layer.

    A single cached forward pass is run over `prompts`. For each layer the
    `blocks.{i}.hook_resid_pre` activation is reduced along the sequence axis
    according to `which_tokens`, and the result is averaged over the prompts.

    Args:
        prompts: Batch handed straight to `model.run_with_cache`; in this
            notebook a padded token-id tensor of shape (batch, seq_len).
            `len(prompts)` must equal the batch size.
        which_tokens: 'first' (position 0), 'last' (final position) or 'mean'
            (average over every position, padding included).
        model: Object exposing `cfg.n_layers`, `cfg.d_model` and
            `run_with_cache` (a HookedTransformer in this notebook).

    Returns:
        Tensor of shape (n_layers, d_model) on the same device as the cached
        activations.

    Raises:
        ValueError: if `which_tokens` is not one of the supported options.
    """
    valid_choices = ('first', 'last', 'mean')
    if which_tokens not in valid_choices:
        raise ValueError(f"which_tokens must be one of {valid_choices}, but got {which_tokens!r}")

    n_prompts = len(prompts)
    d_model = model.cfg.d_model
    n_layers = model.cfg.n_layers

    # Only activations are needed, so skip building an autograd graph.
    with torch.no_grad():
        _, cache = model.run_with_cache(prompts)

    per_layer = []
    for i in range(n_layers):
        resids_pre = cache[f"blocks.{i}.hook_resid_pre"]  # (batch, seq_len, d_model)

        # The sequence length is taken from the activation itself, so the check
        # does not depend on how `prompts` is represented.
        assert resids_pre.ndim == 3 and resids_pre.shape[0] == n_prompts and resids_pre.shape[2] == d_model, \
            f"Expected shape {(n_prompts, 'seq_len', d_model)}, but got {tuple(resids_pre.shape)}"

        if which_tokens == 'first':
            pooled = resids_pre[:, 0, :]
        elif which_tokens == 'last':
            # -1 selects the final position (the old `-1:0` slice was always empty).
            pooled = resids_pre[:, -1, :]
        else:  # 'mean'
            pooled = resids_pre.mean(dim=1)

        assert pooled.shape == (n_prompts, d_model), f"Expected shape {(n_prompts, d_model)}, but got {tuple(pooled.shape)}"
        per_layer.append(pooled.detach())

    # One stack instead of growing a tensor with torch.cat on every layer.
    resids = torch.stack(per_layer, dim=1)  # (batch, n_layers, d_model)
    assert resids.shape == (n_prompts, n_layers, d_model), f"Expected shape {(n_prompts, n_layers, d_model)}, but got {tuple(resids.shape)}"

    # Half-precision activations are upcast to float32 before averaging over
    # prompts, matching the float32 accumulator used previously.
    if resids.dtype in (torch.float16, torch.bfloat16):
        resids = resids.float()

    #take the mean over the prompts for each layer
    resids = resids.mean(dim=0)

    #check if the shape is correct (no_of_layers, d_model)
    assert resids.shape == (n_layers, d_model), f"Expected shape {(n_layers, d_model)}, but got {resids.shape}"

    return resids

In [None]:
def calculate_steering_vector(X, Y, model):
    """Return the per-layer steering vector between prompt sets `X` and `Y`.

    Each prompt set is tokenized with the model's tokenizer and reduced by
    `get_residual_stream(..., 'mean', model)`; the steering vector is the
    result for `X` minus the result for `Y`.
    """
    tokenizer = model.tokenizer

    # Same pipeline for both prompt sets, X first and then Y.
    x_mean, y_mean = (
        get_residual_stream(tokenize_prompts(tokenizer, prompts=prompt_set), 'mean', model)
        for prompt_set in (X, Y)
    )

    return x_mean - y_mean

In [None]:
# model_list[2] is 'gpt2'; change the index to run the notebook on another model.
current_model = get_model(model_list[2]) #get and set the model in use for further calculations and functions

In [None]:
def random_llm_judge(prompt, model_output=None):
    """Stand-in for an LLM judge: return 0 or 1 at random.

    Simulates the LLM as a judge and gives a binary output for
    neutral (0) / opinionated (1). Both arguments are currently ignored;
    `model_output` is optional so the judge can also be called with only a
    prompt.

    Args:
        prompt: The prompt being judged (unused).
        model_output: The model's answer to the prompt (unused).

    Returns:
        0 if a uniform random draw from [0, 1) is below 0.5, otherwise 1.
    """
    #generate a random number between 0 and 1
    rand_no = torch.rand(1)
    return 0 if rand_no.item() < 0.5 else 1

In [None]:
def seperate_prompts(dataset, length, judge=None):
    """Split `dataset` into neutral and opinionated prompts, at most `length` each.

    Prompts are visited in order. A judgement of 0 puts the prompt in the
    neutral list and 1 in the opinionated list, unless that list already holds
    `length` prompts. Iteration stops once both lists are full.

    Args:
        dataset: Iterable of prompts.
        length: Maximum number of prompts kept per category.
        judge: Optional callable mapping a prompt to 0 (neutral) or
            1 (opinionated). Defaults to `random_llm_judge`.

    Returns:
        Tuple `(neutral, opinion)` of lists. Either list may hold fewer than
        `length` prompts if the dataset runs out first.
    """
    if judge is None:
        def judge(prompt):
            # No model output exists at this stage, so pass None explicitly.
            return random_llm_judge(prompt, None)

    neutral, opinion = [], []
    for prompt in dataset:
        judgement = judge(prompt) #gets neutral or opinionated result for a particular prompt
        if judgement == 0 and len(neutral) < length:
            neutral.append(prompt)
        elif judgement == 1 and len(opinion) < length:
            opinion.append(prompt)
        if len(neutral) >= length and len(opinion) >= length:
            break #enough prompts for each category
    return neutral, opinion

In [None]:
def steered_generation(model, prompt, pos, coeff, steering_vector, layer, token_length):
    """Generate text from `prompt` while adding a steering vector to the residual stream.

    Args:
        model: Model exposing `to_tokens`, `hooks`, `generate` and `to_string`
            (a HookedTransformer in this notebook).
        prompt: The input text.
        pos: Position of the token to steer (index into the sequence axis).
        coeff: Coefficient the steering vector is multiplied by.
        steering_vector: Tensor added at `pos`; must broadcast against
            `value[:, pos, :]`.
        layer: Index of the block whose `hook_resid_pre` is steered.
        token_length: Number of new tokens to generate.

    Returns:
        The generated sequence converted back to a string by `model.to_string`.
    """
    tokens = model.to_tokens(prompt) #tokenize

    def steer_model(value: torch.Tensor, hook: "HookPoint") -> torch.Tensor: # function that is called when the model is generating
        # The hook runs on every forward pass of generation. With key/value
        # caching, passes after the first only carry the newest token
        # (sequence length 1), so an integer `pos` beyond the current length
        # would raise IndexError. Such passes are left unsteered.
        seq_len = value.shape[1]
        if not isinstance(pos, int) or -seq_len <= pos < seq_len:
            value[:, pos, :] += coeff * steering_vector #the steering vector is added or subtracted from the position of the token
        return value

    #uses transformerlens hooks to modify the tensor at that particular pos, layer with a coefficient
    # steer_model is the hook function that we are passing in as an argument
    # fwd_hooks --> hooks are applied in the forward pass (not the backward pass of backpropagation)
    with model.hooks(fwd_hooks=[(f"blocks.{layer}.hook_resid_pre", steer_model)]):
        steered_output = model.generate(tokens, max_new_tokens=token_length)
        generation = model.to_string(steered_output) # converts given model output to string

    return generation

In [None]:
def normal_generation(model, prompt, token_length):
    """Baseline (unsteered) generation.

    Tokenizes `prompt`, generates `token_length` new tokens and returns the
    result converted back to a string by `model.to_string`.
    """
    prompt_tokens = model.to_tokens(prompt)
    return model.to_string(model.generate(prompt_tokens, max_new_tokens=token_length))

In [None]:
def generate_with_steering_vector(dataset, dataset_length, model, pos, coeff, layer, token_length):
    """Build a steering vector from `dataset` and print a steered generation per prompt.

    The dataset is split into two groups with `seperate_prompts` (at most
    `dataset_length` prompts each). The steering vector is the per-layer
    difference between the two groups. Every prompt in `dataset` is then
    generated with the chosen layer's vector added at `pos`, scaled by `coeff`,
    and the result is printed. Nothing is returned.
    """
    group_a, group_b = seperate_prompts(dataset, length=dataset_length)

    # One row per layer.
    per_layer_vectors = calculate_steering_vector(group_a, group_b, model)

    # Slice with layer:layer+1 rather than indexing with [layer] so the leading
    # axis is kept: the result has shape (1, d_model) instead of (d_model,).
    layer_vector = per_layer_vectors[layer:layer+1]

    for idx, prompt in enumerate(dataset):
        output = steered_generation(model, prompt, pos, coeff, layer_vector, layer, token_length)
        print(f"Prompt {idx}: ", output)


In [None]:
# Keyword arguments make the experiment settings readable at a glance.
generate_with_steering_vector(
    dataset=final_dataset,
    dataset_length=2,
    model=current_model,
    pos=0,
    coeff=1,
    layer=0,
    token_length=2,
)