<a href="https://colab.research.google.com/github/mahadikprasad15/ARENA/blob/main/Steering_on_GPT2_small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install transformer_lens

In [None]:
import transformer_lens
import torch
import plotly.express as px
from functools import partial

In [None]:
# Pick the GPU when one is available and load GPT-2 small onto that same
# device, so tensors created later with `device=device` sit next to the weights.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = transformer_lens.HookedTransformer.from_pretrained('gpt2-small', device=device)
# Inference only: put the model in evaluation mode.
model.eval()

In [None]:
# Layer and head counts read from the model config.
layers = model.cfg.n_layers
heads = model.cfg.n_heads

In [None]:
# Run each single-word prompt through the model and keep only the activation
# cache; the first returned value is discarded.
_,love_cache = model.run_with_cache( 'Love')
_,hate_cache = model.run_with_cache( 'Hate')

First, I want to take a prompt and steer its continuation using the difference between the "Love" and "Hate" activation vectors. I'll try this for one arbitrarily chosen layer and head first, and then systematically for all of them.

We first need a baseline (the unsteered output), then measure how the output changes under steering, and plot that change for every head — this gives us a list of the heads that matter most.



In [None]:
# Prompt whose continuation is generated and steered in the cells below.
prompt = 'I went to the football match because'

In [None]:
# Tokenize the prompt once; later cells reuse these tokens.
prompt_tokens = model.to_tokens(prompt)
# NOTE(review): the visible calls pass their own token counts, so this value
# does not appear to be read anywhere in this view.
num_tokens = 10

def autoregressive_generator(prompt_tokens, num_tokens):
  """Greedily extend `prompt_tokens` by `num_tokens` tokens using the global `model`.

  Args:
    prompt_tokens: token tensor of shape [batch, seq].
    num_tokens: how many tokens to append.

  Returns:
    A tensor of shape [batch, seq + num_tokens]: the input followed by the
    argmax prediction at the last position of each successive forward pass.
  """
  # Generation only needs forward passes, so skip building an autograd graph.
  with torch.no_grad():
    for _ in range(num_tokens):
      # Only the last position's logits decide the next token.
      final_token_pred = model(prompt_tokens)[:, -1].argmax(dim = -1)
      # unsqueeze(-1) turns [batch] into [batch, 1]; unsqueeze(0) would give
      # [1, batch], which only concatenates correctly when batch == 1.
      prompt_tokens = torch.cat([prompt_tokens, final_token_pred.unsqueeze(-1)], dim = -1)

  return prompt_tokens

In [None]:
# Baseline: decode the prompt plus 5 greedily generated tokens, with no steering.
model.to_string(autoregressive_generator(prompt_tokens, 5))

So, I now know how to generate a few tokens; the same loop can be used with run_with_hooks.
Next, I need to take an activation from a head, add it back at that same head through a hook intervention, and keep generating output for a few tokens.

The output of run_with_hooks is the logits: we take the argmax at the last position, append it to the prompt, and repeat this for a few tokens.

In [None]:
# Activations stored under the block-5 'hook_resid_post' key for each prompt.
love_activation_full = love_cache['blocks.5.hook_resid_post']
hate_activation_full = hate_cache['blocks.5.hook_resid_post']

# Keep only the final sequence position of each prompt.
love_last_token_activation = love_activation_full[:, -1]
hate_last_token_activation = hate_activation_full[:, -1]

# Steering direction is "Love" minus "Hate"; squeeze(0) drops the batch axis
# when it has size 1.
steering_activation = (love_last_token_activation - hate_last_token_activation).squeeze(0)

In [None]:
# Layer index matching the 'blocks.5' activations the steering vector is read from.
layer = 5

In [None]:
def hook_function(activation, hook, alpha):
  """Add `alpha` times the global `steering_activation` at the last position.

  Args:
    activation: tensor indexed as [batch, position, ...]; modified in place.
    hook: hook-point object supplied by the caller; not used here.
    alpha: scalar multiplier for the steering vector.

  Returns:
    The same `activation` tensor, after the in-place edit.
  """
  activation[:, -1] += alpha * steering_activation
  # Return the tensor as well, as `injection_hook` does, so both hooks behave alike.
  return activation

In [None]:
# Hook functions for different intensities: each one binds a fixed `alpha`
# multiplier for the steering vector.

alpha_1 = partial(hook_function, alpha = 1)
alpha_2 = partial(hook_function, alpha = 2)
alpha_10 = partial(hook_function, alpha = 10)

In [None]:
#Steering different layers

def steered_layers(original_prompt_tokens, hook_function = hook_function, num_tokens = 5):
  """Greedily continue the prompt while steering one layer at a time.

  For every layer index below the global `layers`, the hook is attached to that
  layer's 'resid_post' hook point and `num_tokens` tokens are generated by
  taking the argmax at the last position of each steered forward pass.

  Args:
    original_prompt_tokens: token tensor of shape [batch, seq]; it is cloned
      for every layer and never modified.
    hook_function: callable handed to `run_with_hooks` as the forward hook.
      Pass a version with `alpha` already bound (e.g. `alpha_2`); the bare
      default has no value for `alpha`.
    num_tokens: number of tokens to generate per layer.

  Returns:
    A list with one entry per layer: `model.to_string` of the steered sequence.
  """
  prompt_list = []
  for layer in range(layers):
    print(f'Steering layer {layer}')
    # The hook point is fixed for the whole generation at this layer.
    hook_name = transformer_lens.utils.get_act_name('resid_post',layer)
    current_prompt_tokens = original_prompt_tokens.clone()

    # Generation only needs forward passes, so skip building an autograd graph.
    with torch.no_grad():
      for _ in range(num_tokens):
        logits_steered = model.run_with_hooks(
            current_prompt_tokens,
            fwd_hooks = [(hook_name, hook_function)]
        )
        # unsqueeze(-1) turns [batch] into [batch, 1]; unsqueeze(0) would give
        # [1, batch], which only concatenates correctly when batch == 1.
        next_token = logits_steered[:, -1].argmax(dim = -1).unsqueeze(-1)
        current_prompt_tokens = torch.cat([current_prompt_tokens, next_token], dim = -1)

    prompt_list.append(model.to_string(current_prompt_tokens))

  return prompt_list

In [None]:
# Steering with high intensity (alpha = 10), one generation per layer

results_10 = steered_layers(prompt_tokens, alpha_10)

In [None]:
# Results of high intensity steering: one decoded continuation per steered layer
results_10

In [None]:
# Steering with medium intensity (alpha = 2), one generation per layer

results_2 = steered_layers(prompt_tokens, alpha_2)

In [None]:
# Results of medium intensity steering: one decoded continuation per steered layer
results_2

In [None]:
def get_z_hook_name(layer):
  """Return the hook-point name of the attention 'z' activation for `layer`."""
  # The file only imports `transformer_lens`, so a bare `utils` is undefined;
  # go through the package the same way the other hook-name lookups do.
  return transformer_lens.utils.get_act_name("z", layer)

def generate_all_steering_vectors(model):
  """
  Generates one d_model-dimensional steering vector per (layer, head).

  The prompts 'Love' and 'Hate' are run through `model`, caching only the
  attention 'z' activations. For every head, the last-position 'z' vector of
  each prompt is projected through that head's slice of W_O and the two
  projections are subtracted (love minus hate).

  Args:
    model: object providing `run_with_cache`, `cfg.n_layers`, `cfg.n_heads`,
      `cfg.d_model` and `blocks[i].attn.W_O`.

  Returns:
    Dict mapping (layer, head) to a detached tensor of shape [d_model].
  """
  print("Generating steering vectors...")

  def fwd_hooks_filter(name):
    # Cache only the per-head attention 'z' activations.
    return "attn.hook_z" in name

  steering_vectors_dict = {}
  n_layers = model.cfg.n_layers
  n_heads = model.cfg.n_heads

  # Only forward passes and linear projections are needed; no autograd graph.
  with torch.no_grad():
    _, love_cache = model.run_with_cache(
        'Love',
        names_filter=fwd_hooks_filter
    )
    _, hate_cache = model.run_with_cache(
        'Hate',
        names_filter=fwd_hooks_filter
    )

    for L_src in range(n_layers):
      hook_name_src = get_z_hook_name(L_src)
      # First batch element, last position: shape [n_heads, d_head].
      love_z = love_cache[hook_name_src][0, -1]
      hate_z = hate_cache[hook_name_src][0, -1]
      W_O = model.blocks[L_src].attn.W_O # Shape: [n_heads, d_head, d_model]

      # Project every head through its own W_O slice in one batched einsum.
      love_d_model_out = torch.einsum("hd, hdm -> hm", love_z, W_O)
      hate_d_model_out = torch.einsum("hd, hdm -> hm", hate_z, W_O)
      layer_steering = (love_d_model_out - hate_d_model_out).detach()

      for H_src in range(n_heads):
        steering_vectors_dict[(L_src, H_src)] = layer_steering[H_src]

  # Report the model's actual width rather than a hard-coded 768.
  print(f"Done. Generated {len(steering_vectors_dict)} d_model ({model.cfg.d_model}) vectors.")
  return steering_vectors_dict

In [None]:
# Get steering vectors for all heads, keyed by (layer, head)

steering_vectors_all_heads = generate_all_steering_vectors(model)

In [None]:
# Sanity check: shape of the steering vector for layer 0, head 0.
steering_vectors_all_heads[(0,0)].shape

In [None]:
def injection_hook(activation, hook, steering_vector, alpha):
  """Shift the last position of `activation` by `alpha * steering_vector`.

  The edit is made in place and the same tensor is returned. `hook` is
  accepted but not used.
  """
  last_position = activation[:, -1, :]  # a view, so the add below edits `activation`
  last_position.add_(alpha * steering_vector)
  return activation

def evaluate_steering_vector(model, prompt_tokens, steering_vector, eval_token_ids, alpha):
  """Average the love-vs-hate log-probability gap over all injection layers.

  For each layer, `alpha * steering_vector` is added at the last position of
  that layer's 'resid_post' activation, and the gap
  log p(love) - log p(hate) is read at the last position of the output.

  Args:
    model: object providing `run_with_hooks` and `cfg.n_layers`.
    prompt_tokens: tokens fed to the model; the score is read with `.item()`,
      so a single prompt is expected.
    steering_vector: vector added to the activation by `injection_hook`.
    eval_token_ids: mapping with 'love' and 'hate' vocabulary indices.
    alpha: scalar multiplier for the steering vector.

  Returns:
    The mean over layers of the gap, as a Python float.
  """
  # The hook does not depend on the layer, so build it once.
  hook_function = partial(injection_hook, steering_vector = steering_vector, alpha = alpha)
  n_layers = model.cfg.n_layers
  total_scores = 0

  # Scoring only needs forward passes, so skip building an autograd graph.
  with torch.no_grad():
    for layer in range(n_layers):
      hook_name = transformer_lens.utils.get_act_name('resid_post', layer)
      logits_for_layer = model.run_with_hooks(prompt_tokens, fwd_hooks = [(hook_name, hook_function)])
      # Normalise only the last position; the other positions are never read.
      last_log_probs = logits_for_layer[:, -1].log_softmax(dim = -1)
      love_score = last_log_probs[:, eval_token_ids['love']]
      hate_score = last_log_probs[:, eval_token_ids['hate']]

      total_scores += (love_score - hate_score).item()

  return total_scores / n_layers

In [None]:
def get_single_token_id(model, string):
  """Return the id of the one token that `string` tokenizes to.

  Args:
    model: object whose `to_tokens(string, prepend_bos=False)` returns a
      [batch, seq] tensor of token ids.
    string: text expected to map to exactly one token.

  Returns:
    The token id as a 0-dim tensor.

  Raises:
    ValueError: if `string` does not tokenize to exactly one token.
  """
  tokens = model.to_tokens(string, prepend_bos=False)
  # Fail loudly instead of silently scoring only the first piece of a
  # multi-token string.
  if tokens.shape[-1] != 1:
    raise ValueError(
        f"{string!r} tokenizes to {tokens.shape[-1]} tokens; expected exactly 1"
    )
  return tokens[0, 0]

# Vocabulary ids of the two tokens whose log-probabilities are compared.
eval_token_ids = {
    "love": get_single_token_id(model, " love"),
    "hate": get_single_token_id(model, " hate")
}
print(f"Using eval token IDs: {eval_token_ids}")


alpha = 10.0
# One score per (source layer, source head).
results_source = torch.zeros((model.cfg.n_layers, model.cfg.n_heads), device=device)

# Derive the count from the model config instead of hard-coding 144.
print(f"Starting evaluation of {model.cfg.n_layers * model.cfg.n_heads} source vectors...")

for L_src in range(model.cfg.n_layers):
  for H_src in range(model.cfg.n_heads):

    current_steering_vec = steering_vectors_all_heads[(L_src, H_src)].to(device)

    # Score this head's steering vector, averaged over every injection layer.
    avg_score = evaluate_steering_vector(
        model,
        prompt_tokens,
        current_steering_vec,
        eval_token_ids,
        alpha
    )

    results_source[L_src, H_src] = avg_score

  print(f"  Finished evaluating Layer {L_src}")

print("Experiment 1 Complete.")

In [None]:
# Move the score grid to the CPU as a NumPy array for plotting.
results_data = results_source.cpu().numpy()

# Heatmap of the score grid, with a diverging colour scale centred on zero.
fig = px.imshow(
    results_data,
    title="Source Head Scores",
    labels={"x": "Source Head", "y": "Source Layer", "color": "Avg Score"},
    x=list(map(str, range(model.cfg.n_heads))),
    y=list(map(str, range(model.cfg.n_layers))),
    color_continuous_scale="RdBu",
    color_continuous_midpoint=0.0,
)

# Axis titles, x axis drawn on top, y axis range reversed.
fig.update_layout(
    xaxis_title="Source Head",
    yaxis_title="Source Layer",
    xaxis={"side": "top"},
    yaxis={"autorange": "reversed"},
)

fig.show()