In [58]:
"""Basic demonstration of sweeps and metrics operation."""

# %%
# Imports, etc.
import pickle

import numpy as np
from functools import partial
import torch


from transformer_lens import HookedTransformer

from activation_additions import (
    logits,
    hyperparameter_search,
    prompt_utils,
    utils,
    metrics,
    hook_utils
)

utils.enable_ipython_reload()

# Disable gradients to save memory during inference
_ = torch.set_grad_enabled(False)

from copy import deepcopy
from typing import List, Union,Dict
import pandas as pd
from transformer_lens.utils import get_act_name

In [54]:
get_act_name(name="embed")


'hook_embed'

In [62]:
ActAds = prompt_utils.ActivationAddition(
    coeff=1.0,
    act_name=1,
    prompt='Bob went')
hook_fns=hook_utils.hook_fns_from_activation_additions(MODEL,[ActAds])

In [63]:
for act_name, hook_fn in hook_fns.items():
    MODEL.add_hook(act_name, hook_fn)
MODEL.remove_all_hook_fns()

In [6]:
# %%
# Load a model
MODEL = HookedTransformer.from_pretrained(model_name="gpt2-xl", device="cpu")
_ = MODEL.to("cuda:0")

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-xl into HookedTransformer
Moving model to device:  cuda:0


In [9]:
metric_func=metrics.get_logprob_metric(MODEL)

In [87]:
tokens1=MODEL.to_tokens("Hello, my name is John")
tokens2=MODEL.to_tokens("Wntworth")[:, 1:]

In [88]:
tokens1[0]

tensor([50256, 15496,    11,   616,  1438,   318,  1757], device='cuda:0')

In [89]:
MODEL.tokenizer.decode(tokens1[0])

'<|endoftext|>Hello, my name is John'

In [83]:
torch.cat((tokens1, tokens2), dim=1)

tensor([[50256, 15496,    11,   616,  1438,   318,  1757, 50256,    54,   429,
          9268]], device='cuda:0')

In [92]:
tokens2.shape[1]

3

In [86]:
MODEL.tokenizer.decode(torch.cat((tokens1, tokens2), dim=1)[0])

'<|endoftext|>Hello, my name is John<|endoftext|>Wntworth'

In [101]:
[tokens]

[tensor([[50256, 15496,    11,   616,  1438,   318,  1757]], device='cuda:0')]

In [102]:
torch.cat((tokens1, tokens2), dim=1)

tensor([[50256, 15496,    11,   616,  1438,   318,  1757,    54,   429,  9268]],
       device='cuda:0')

In [103]:
metric_func([torch.cat((tokens1, tokens2), dim=1)])

Unnamed: 0,logprob_actual_next_token
0,"[-7.004738, -1.7059618, -2.5899856, -0.1306995..."


In [100]:
metric=metric_func([tokens])

In [94]:
metric["logprob_actual_next_token"].array[0]

array([-7.0047364 , -1.705964  , -2.5899823 , -0.13069953, -0.0268248 ,
       -4.583589  ], dtype=float32)

In [93]:
metric["logprob_actual_next_token"].array[0][-tokens2.shape[1]:]

array([-0.13069953, -0.0268248 , -4.583589  ], dtype=float32)

In [35]:
for row in metric:
    print(metric[row])

0    [-7.0047364, -1.705964, -2.5899823, -0.1306995...
Name: logprob_actual_next_token, dtype: object


In [64]:
weighted_steering_prompts={"love":-1,"hate":1}

In [67]:
weighted_steering_prompts.values()

dict_values([-1, 1])

In [104]:
def conditional_perplexity(model,prompt_tokens,completion_tokens):
    metric_func=metrics.get_logprob_metric(model)
    metric=metric_func([torch.cat((prompt_tokens, completion_tokens), dim=1)])
    completion_logprobs=metric["logprob_actual_next_token"].array[0][-completion_tokens.shape[1]:]
    return -sum(completion_logprobs)

In [108]:
conditional_perplexity(MODEL,MODEL.to_tokens("Hello, my name is"),MODEL.to_tokens(" Gabe"))

26.50157356262207

In [28]:
def layer_coefficient_gridsearch(
    model: HookedTransformer,
    prompts: Union[str, List[str]],
    weighted_steering_prompts: Dict[str, float],
    Layer_list: List[int],
    coefficient_list: List[float],
    wanted_completions: Union[str, List[str]],
    unwanted_completions: Union[str, List[str]],
) -> pd.DataFrame:
    metric_func=metrics.get_logprob_metric(model)
    prompt_tokens=[model.to_tokens(prompt)for prompt in prompts]
    wanted_completion_tokens=[model.to_tokens(wanted_completion)[:, 1:] for wanted_completion in wanted_completions]
    unwanted_completion_tokens=[model.to_tokens(unwanted_completion)[:, 1:] for unwanted_completion in unwanted_completions]
    for layer in Layer_list:
        for coefficent in coefficient_list:
            ActAds =[prompt_utils.ActivationAddition(
                        coeff=prompt_wighting*coefficent,
                        act_name=layer,
                        prompt=prompt) for prompt, prompt_wighting in weighted_steering_prompts.items()]
            hook_fns=hook_utils.hook_fns_from_activation_additions(model,ActAds)
            for act_name, hook_fn in hook_fns.items():
                model.add_hook(act_name, hook_fn)
            model.remove_all_hook_fns()
            
            

SyntaxError: invalid syntax (3253130084.py, line 9)

In [49]:
steering_vector=prompt_utils.weighted_prompt_superposition(MODEL,{"hello":1.2,"bye":-1.2})

In [50]:
steering_vector

[ActivationAddition(hello, 1.2, hook_embed),
 ActivationAddition(bye, -1.2, hook_embed),
 ActivationAddition(tensor([64, 64], dtype=torch.int32), -1.0, hook_embed)]

In [57]:
layers=list(range(20))
coefficents=list(range(20))
activation_additions=[]

for layer in layers:
    vector_layer=get_x_vector_preset(prompt1=" love", prompt2=" hate", coeff=1, act_name=layer)
    for coefficent in coefficents:
        vector=deepcopy(vector_layer)
        for i in range(len(vector)):
            vector[i].coeff=coefficent*vector[i].coeff
        activation_additions.append(vector)


In [64]:
activation_additions[2]
logits.get_token_probs(MODEL,prompts,activation_additions[1],return_positions_above)

Unnamed: 0_level_0,probs,probs,probs,probs,probs,probs,probs,probs,probs,probs,...,logprobs,logprobs,logprobs,logprobs,logprobs,logprobs,logprobs,logprobs,logprobs,logprobs
Unnamed: 0_level_1,0,1,2,3,4,5,6,7,8,9,...,50247,50248,50249,50250,50251,50252,50253,50254,50255,50256
pos,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
5,6e-06,7e-06,4.937604e-07,2.832087e-07,1e-06,3.916537e-07,6e-06,7e-06,8e-06,4e-06,...,-15.978655,-15.196625,-18.414135,-14.930026,-17.293758,-15.18249,-14.840092,-16.476681,-15.081723,-11.901134


In [75]:

get_x_vector_preset = partial(prompt_utils.get_x_vector, pad_method="tokens_right",
                              model=MODEL,
                              custom_pad_id=MODEL.to_single_token(" "))
activation_additions=get_x_vector_preset(prompt1=" love", prompt2=" hate", coeff=1, act_name=6)
prompts="I love you. The"
return_positions_above=len(MODEL.tokenizer.encode(prompts))
matrix=[]
for activation_addition in [activation_additions]:
    print(8)
    a=logits.get_token_probs(MODEL,prompts,activation_addition,return_positions_above)

8


In [56]:
matrix

[        probs                                                                \
         0         1             2             3         4             5       
 pos                                                                           
 5    0.000005  0.000007  4.419383e-07  2.559607e-07  0.000001  3.399807e-07   
 
                                              ...   logprobs             \
         6         7         8         9      ...      50247      50248   
 pos                                          ...                         
 5    0.000006  0.000006  0.000008  0.000004  ... -16.132639 -15.316798   
 
                                                                        \
          50249      50250      50251      50252      50253      50254   
 pos                                                                     
 5   -18.390577 -14.896774 -17.277262 -15.282727 -14.884817 -16.464771   
 
                            
          50255      50256  
 pos                

In [31]:
activation_additions[0]

ActivationAddition(tensor([50256,  1842], device='cuda:0'), 1, blocks.6.hook_resid_pre)

In [34]:
print(dir(activation_additions[0]))

['__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'act_name', 'coeff', 'tokens']


In [36]:
activation_additions[0]=2