# Example by Lovkush of how experimentation might look

In [None]:
from src.utils import get_current_time_str
from src.utils import get_repo_root
import os
import tqdm

from transformer_lens import HookedTransformer
import torch


In [None]:
def getDevice():
    """Return the torch device to run on, preferring CUDA, then MPS, then CPU."""
    if torch.cuda.is_available():
        # NVIDIA GPU (e.g. a RunPod instance)
        backend = "cuda"
    elif torch.backends.mps.is_available():
        # Apple silicon
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(backend)
    
# Resolve the compute device once; get_model below passes it to the model loader.
DEVICE = getDevice()
# Bare expression (notebook-style echo); as a statement it changes no program state.
DEVICE

def get_model(model_name):
    """Load ``model_name`` as a HookedTransformer on ``DEVICE`` and return it.

    The model is loaded via ``from_pretrained_no_processing`` in float16 with
    left padding as the default padding side, then switched to evaluation mode.

    NOTE(review): ``output_hidden_states=True`` is passed through as an extra
    keyword; presumably it is meant for the underlying Hugging Face loader --
    confirm it has any effect here.
    """
    load_options = dict(
        device=DEVICE,
        dtype=torch.float16,
        default_padding_side='left',
        output_hidden_states=True,
    )
    loaded = HookedTransformer.from_pretrained_no_processing(model_name, **load_options)
    # eval() switches the module to evaluation behaviour; it does not by itself
    # turn off gradient tracking.
    loaded.eval()
    # The device was already given to the loader; this move is kept as in the
    # original flow.
    loaded.to(DEVICE)
    return loaded

# Load the chat model once; the later cell passes this global `model` to
# get_steering_vector_per_layer.
model = get_model("Qwen/Qwen1.5-1.8B-Chat")

In [None]:
# Prompt pair: prompt1 is prompt2 with a leading "If you had to choose," clause.
# NOTE(review): the same two assignments are repeated in the cell that calls
# get_steering_vector_per_layer, so one of the two copies is redundant.
prompt1 = 'If you had to choose, do you prefer sociology or psychology?'
prompt2 = 'Do you prefer sociology or psychology?'


def tokenize_prompt(model: HookedTransformer, prompt_str: str, verbose=False) -> tuple[list[int], str]:
    """Wrap ``prompt_str`` in the tokenizer's chat template.

    A fixed system message ("You are a helpful assistant.") is placed before
    the user message, and the generation prompt is appended so the model is
    positioned to answer.

    Args:
        model: Model whose ``tokenizer.apply_chat_template`` is used.
        prompt_str: Content of the user message.
        verbose: When true, print the rendered chat string.

    Returns:
        ``(prompt_chat_tokenized, prompt_chat_str)``: the templated prompt as
        returned by the tokenizer with ``tokenize=True`` (token ids) and with
        ``tokenize=False`` (text).
    """
    prompt_message = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt_str},
    ]

    # Render the text form once and reuse it for the optional printout,
    # instead of re-applying the template just to print it.
    prompt_chat_str = model.tokenizer.apply_chat_template(
        prompt_message, tokenize=False, add_generation_prompt=True)
    if verbose:
        print(prompt_chat_str)

    prompt_chat_tokenized = model.tokenizer.apply_chat_template(
        prompt_message, tokenize=True, add_generation_prompt=True)
    return prompt_chat_tokenized, prompt_chat_str

def generate_output(model: HookedTransformer, prompt_chat_str: str, max_new_tokens: int) -> tuple[str, dict, int]:
    """Greedily extend ``prompt_chat_str`` by up to ``max_new_tokens`` tokens.

    Each step re-runs the model on the full text so far, takes the argmax of
    the last position's logits, and appends that token's string. Generation
    stops early once the end-of-sequence token has been produced (its string
    is still appended and it is counted).

    Args:
        model: Model providing ``run_with_cache``, ``to_string`` and
            ``tokenizer.eos_token_id``.
        prompt_chat_str: Chat-templated prompt text to continue.
        max_new_tokens: Upper bound on generated tokens; must be at least 1.

    Returns:
        ``(output_str, cache, n_generated)``. ``cache`` is the activation cache
        of the final forward pass, which ran on the text *before* the last
        generated token was appended, so it does not cover that last token.

    Raises:
        ValueError: If ``max_new_tokens`` is less than 1 (no forward pass would
            run, so there would be no cache to return).
    """
    if max_new_tokens < 1:
        raise ValueError(f"max_new_tokens must be >= 1, got {max_new_tokens}")

    output_str = prompt_chat_str
    n_generated = 0

    # Only activations are needed, so skip building the autograd graph.
    with torch.no_grad():
        # The file does `import tqdm`, so the progress bar class is `tqdm.tqdm`.
        for _ in tqdm.tqdm(range(max_new_tokens)):
            # Logits and activation cache for the text generated so far
            logits, cache = model.run_with_cache(output_str)

            # Greedy choice (argmax, i.e. temperature 0) at the last position
            next_token = logits[0, -1].argmax()

            # Append the decoded token for the next iteration
            output_str += model.to_string(next_token)
            n_generated += 1

            if next_token.item() == model.tokenizer.eos_token_id:
                break

    return output_str, cache, n_generated

def get_mean_resids_per_layer(model: HookedTransformer, cache: dict, n_tokens_generated: int, n_tokens_input: int) -> list[torch.Tensor]:
    """Average each layer's ``hook_resid_pre`` over the generated positions.

    The cache is expected to hold one position fewer than
    ``n_tokens_input + n_tokens_generated``: generate_output returns the cache
    of a forward pass that ran before the last generated token was appended.

    Args:
        model: Model providing ``cfg.n_layers`` and ``cfg.d_model``.
        cache: Mapping from ``"blocks.{layer}.hook_resid_pre"`` to a tensor of
            shape ``(1, n_tokens_input + n_tokens_generated - 1, d_model)``.
        n_tokens_generated: Number of tokens that were generated.
        n_tokens_input: Number of prompt tokens preceding the generated ones.

    Returns:
        One detached tensor of shape ``(d_model,)`` per layer, in layer order.

    Raises:
        ValueError: If no generated position is present in the cache (the mean
            would be NaN), or if a cached tensor has an unexpected shape.
    """
    d_model = model.cfg.d_model
    n_generated_cached = n_tokens_generated - 1
    if n_generated_cached < 1:
        raise ValueError(
            "need at least 2 generated tokens to average over generated "
            f"positions, got n_tokens_generated={n_tokens_generated}"
        )
    expected_shape = (1, n_tokens_input + n_generated_cached, d_model)

    mean_resids_per_layer: list[torch.Tensor] = []
    for layer in range(model.cfg.n_layers):
        hook_name = f"blocks.{layer}.hook_resid_pre"
        resids_pre = cache[hook_name]  # (batch, seq_len, d_model)
        if tuple(resids_pre.shape) != expected_shape:
            raise ValueError(
                f"{hook_name} has shape {tuple(resids_pre.shape)}, "
                f"expected {expected_shape}"
            )

        # Drop the batch dimension and the prompt positions, then average over
        # the remaining (generated) positions.
        generated = resids_pre[0, n_tokens_input:]
        mean_resids_per_layer.append(generated.mean(dim=0).detach())

    return mean_resids_per_layer

def get_steering_vector_per_layer(
    model: HookedTransformer,
    prompt1: str,
    prompt2: str,
    verbose: bool,
    max_new_tokens: int,
) -> tuple[list[torch.Tensor], str, str]:
    """Build a per-layer steering vector from the responses to two prompts.

    Both prompts are chat-templated, a response is generated for each, and the
    residual activations are averaged per layer via get_mean_resids_per_layer.
    The steering vector for a layer is the prompt1 mean minus the prompt2 mean.

    Returns:
        ``(steering_vector_per_layer, output1, output2)`` where the outputs are
        the full generated strings for prompt1 and prompt2.
    """
    # Stage 1: template both prompts -> (token ids, chat string) per prompt.
    templated = [tokenize_prompt(model, text, verbose) for text in (prompt1, prompt2)]

    # Stage 2: generate for both -> (output string, cache, n generated) per prompt.
    runs = [
        generate_output(model, chat_str, max_new_tokens)
        for _, chat_str in templated
    ]

    # Stage 3: per-layer mean residuals for each run.
    layer_means = [
        get_mean_resids_per_layer(model, cache, n_generated, len(token_ids))
        for (token_ids, _), (_, cache, n_generated) in zip(templated, runs)
    ]

    means_first, means_second = layer_means
    steering_vector_per_layer = [
        first - second for first, second in zip(means_first, means_second)
    ]
    return steering_vector_per_layer, runs[0][0], runs[1][0]


In [None]:
# Prompt pair used for the steering vector: prompt1 is prompt2 with a leading
# "If you had to choose," clause.
prompt1 = 'If you had to choose, do you prefer sociology or psychology?'
prompt2 = 'Do you prefer sociology or psychology?'

# Generate up to 50 tokens per prompt and take, per layer, the difference of
# the mean residuals (prompt1 minus prompt2). verbose=True prints the
# chat-templated prompts.
vector_per_layer, output1_str, output2_str = get_steering_vector_per_layer(
    model=model,
    prompt1=prompt1,
    prompt2=prompt2,
    verbose=True,
    max_new_tokens=50,
)

In [None]:
# TODO: write the code that does generation with steering hooks applied; see Aryaman's notebook for reference.