In [1]:
"""Basic demonstration of sweeps and metrics operation."""

# %%
# Imports, etc.
import pickle

import numpy as np
from functools import partial
import torch


from transformer_lens import HookedTransformer

from activation_additions import (
    logits,
    prompt_utils,
    utils,
    metrics,
    hook_utils
)

# Turn on IPython auto-reload through the project helper — presumably so that
# edits to the activation_additions package are picked up without restarting
# the kernel (TODO confirm against utils.enable_ipython_reload).
utils.enable_ipython_reload()

# Disable gradients to save memory during inference
_ = torch.set_grad_enabled(False)

from copy import deepcopy
from typing import List, Union,Dict
import pandas as pd
from transformer_lens.utils import get_act_name

In [88]:
# %%
# Load a model
# The weights are loaded on the CPU and then moved to the first GPU only when
# CUDA is available, so the notebook also runs on CPU-only machines.
MODEL = HookedTransformer.from_pretrained(model_name="gpt2-xl", device="cpu")
if torch.cuda.is_available():
    _ = MODEL.to("cuda:0")

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-xl into HookedTransformer
Moving model to device:  cuda:0


In [89]:
def conditional_perplexity(model,prompt_tokens,completion_tokens,ActAds=None):
    """Score a completion given a prompt, optionally under activation additions.

    The prompt and completion are concatenated along dim 1, the log-prob
    metric is evaluated on the joined sequence, and the log-probs of the
    trailing ``completion_tokens.shape[1]`` positions are summed.

    NOTE: despite the name, the returned value is the negated *sum* of the
    completion's log-probs. It is neither length-normalized nor exponentiated.

    Args:
        model: Model passed through to the hook and metric helpers.
        prompt_tokens: Token tensor for the prompt; must be concatenable with
            ``completion_tokens`` along dim 1.
        completion_tokens: Token tensor for the completion; its size along
            dim 1 determines how many trailing log-probs are scored.
        ActAds: Optional list of activation additions. ``None`` is treated as
            "no additions" and forwarded as an empty list so the hook helper
            always receives a list.

    Returns:
        The negated sum of the completion tokens' log-probs (0 for an empty
        completion).
    """
    activation_additions = [] if ActAds is None else ActAds
    hook_fns = hook_utils.hook_fns_from_activation_additions(
        model, activation_additions
    )
    # q_funcs is an (add hooks, remove hooks) pair handed to the metric.
    metric_func = metrics.get_logprob_metric(
        model,
        q_funcs=(
            partial(hook_utils.add_hooks_from_dict, hook_fns=hook_fns),
            hook_utils.remove_and_return_hooks,
        ),
    )
    completed_tokens = torch.cat((prompt_tokens, completion_tokens), dim=1)
    metric = metric_func([completed_tokens])
    all_logprobs = metric["logprob_actual_next_token"].array[0]

    num_completion_tokens = completion_tokens.shape[1]
    if num_completion_tokens:
        completion_logprobs = all_logprobs[-num_completion_tokens:]
    else:
        # A plain [-0:] slice would select *every* log-prob (prompt included),
        # so an empty completion is handled explicitly.
        completion_logprobs = all_logprobs[:0]
    return -sum(completion_logprobs)

In [90]:
def completion_perplexities(model,
                            prompt_tokens,
                            wanted_completion_tokens,
                            unwanted_completion_tokens,
                            weighted_steering_prompts,
                            layer,
                            coefficient):
    """Score wanted and unwanted completions under one steering configuration.

    One ``ActivationAddition`` is built per entry of
    ``weighted_steering_prompts``, with ``coeff`` equal to the entry's weight
    times ``coefficient`` and ``act_name`` equal to ``layer``. Every prompt is
    then paired (via ``zip``) with its wanted completion and, separately, with
    its unwanted completion, and each pair is scored with
    ``conditional_perplexity`` using those additions.

    Returns:
        A ``(wanted_scores, unwanted_scores)`` tuple of lists, each holding
        one score per zipped (prompt, completion) pair.
    """
    additions = []
    for steering_prompt, weight in weighted_steering_prompts.items():
        additions.append(
            prompt_utils.ActivationAddition(
                coeff=weight * coefficient,
                act_name=layer,
                prompt=steering_prompt,
            )
        )

    def _score_each(completion_token_list):
        # Score every (prompt, completion) pair with the shared additions.
        scores = []
        for prompt, completion in zip(prompt_tokens, completion_token_list):
            scores.append(
                conditional_perplexity(model, prompt, completion, additions)
            )
        return scores

    wanted_scores = _score_each(wanted_completion_tokens)
    unwanted_scores = _score_each(unwanted_completion_tokens)
    return (wanted_scores, unwanted_scores)


In [94]:
def layer_coefficient_gridsearch(
    model: HookedTransformer,
    prompts: Union[str, List[str]],
    weighted_steering_prompts: Dict[str, float],
    Layer_list: List[int],
    coefficient_list: List[float],
    wanted_completions: Union[str, List[str]],
    unwanted_completions: Union[str, List[str]],
) -> pd.DataFrame:
    """Sweep layers and coefficients, scoring wanted vs. unwanted completions.

    For every (layer, coefficient) pair ``completion_perplexities`` is called
    once and its per-prompt scores are gathered into a long-format table.

    Args:
        model: Model used for tokenization (``to_tokens``) and passed on for
            scoring.
        prompts: One prompt or a list of prompts. A bare string is treated as
            a one-element list rather than iterated character by character.
        weighted_steering_prompts: Mapping of steering prompt to weight,
            forwarded unchanged to ``completion_perplexities``.
        Layer_list: Layers to sweep (outer loop).
        coefficient_list: Coefficients to sweep (inner loop).
        wanted_completions: One completion per prompt (string or list).
        unwanted_completions: One completion per prompt (string or list).

    Returns:
        DataFrame with one row per (layer, coefficient, prompt) and the
        columns "Layer", "Coefficient", "Perplexity (wanted)" and
        "Perplexity (unwanted)".

    Raises:
        ValueError: If the numbers of prompts, wanted completions and
            unwanted completions differ.
    """

    def _as_list(texts):
        # Iterating a bare str would yield single characters, so wrap it.
        return [texts] if isinstance(texts, str) else list(texts)

    prompts = _as_list(prompts)
    wanted_completions = _as_list(wanted_completions)
    unwanted_completions = _as_list(unwanted_completions)
    if not (len(prompts) == len(wanted_completions) == len(unwanted_completions)):
        raise ValueError(
            "prompts, wanted_completions and unwanted_completions must have "
            f"the same length, got {len(prompts)}, {len(wanted_completions)} "
            f"and {len(unwanted_completions)}"
        )

    prompt_tokens = [model.to_tokens(prompt) for prompt in prompts]
    # [:, 1:] drops the first token position of each completion — presumably
    # the BOS token prepended by to_tokens (TODO confirm).
    wanted_completion_tokens = [
        model.to_tokens(completion)[:, 1:] for completion in wanted_completions
    ]
    unwanted_completion_tokens = [
        model.to_tokens(completion)[:, 1:] for completion in unwanted_completions
    ]

    columns = [
        "Layer",
        "Coefficient",
        "Perplexity (wanted)",
        "Perplexity (unwanted)",
    ]
    rows = []
    for layer in Layer_list:
        for coefficient in coefficient_list:
            perplexity_on_wanted, perplexity_on_unwanted = completion_perplexities(
                model,
                prompt_tokens,
                wanted_completion_tokens,
                unwanted_completion_tokens,
                weighted_steering_prompts,
                layer,
                coefficient,
            )
            # One row per prompt for this (layer, coefficient) setting.
            for wanted_score, unwanted_score in zip(
                perplexity_on_wanted, perplexity_on_unwanted
            ):
                rows.append(
                    {
                        "Layer": layer,
                        "Coefficient": coefficient,
                        "Perplexity (wanted)": wanted_score,
                        "Perplexity (unwanted)": unwanted_score,
                    }
                )

    # Passing `columns` keeps the four headers even when the sweep is empty.
    return pd.DataFrame(rows, columns=columns)

In [95]:
# Sweep layers 6-11 and coefficients 0, 2, 4, 6, 8 (coefficient 0 scales every
# steering weight to zero). Completions are given as one-element lists so the
# single prompt is paired with a whole completion string.
grid_df=layer_coefficient_gridsearch(
    model=MODEL,
    prompts=["The Most beautyful city in the world is"],
    weighted_steering_prompts={" Rome":1," Paris":-1},
    Layer_list=list(range(6,12)),
    coefficient_list=list(range(0,10,2)),
    wanted_completions=[" Rome"],
    unwanted_completions=[" Paris"])

In [6]:

# Score " real fear." as a continuation of the prompt with no activation
# additions (ActAds is left at its default). The completion's first token
# position is sliced off with [:, 1:] — presumably the BOS token added by
# to_tokens (TODO confirm).
perplexity=conditional_perplexity(MODEL,
    MODEL.to_tokens("The only thing we have to fear is"),MODEL.to_tokens(" real fear.")[:, 1:])
perplexity


9.4068883061409

In [14]:
# Two opposite-signed activation additions at act_name=1: "hi" with coeff +1
# and "bye" with coeff -1.
ActAds =[prompt_utils.ActivationAddition(coeff=1, act_name=1,prompt="hi"),
         prompt_utils.ActivationAddition(coeff=-1, act_name=1,prompt="bye")]
# Hook functions built from the additions; indexed by hook-point name below.
hook_fns=hook_utils.hook_fns_from_activation_additions(MODEL, ActAds)
# The manual attachment is commented out, so this cell itself never calls
# MODEL.add_hook.
#for act_name, hook_fn in hook_fns.items():
#    MODEL.add_hook(act_name, hook_fn)

In [31]:
# Inspect the hook functions stored under the "blocks.1.hook_resid_pre" key;
# the recorded output lists two prompt_hook functions (presumably one per
# ActivationAddition — TODO confirm).
hook_fns["blocks.1.hook_resid_pre"]

[<function activation_additions.hook_utils.hook_fn_from_activations.<locals>.prompt_hook(resid_pre: jaxtyping.Float[Tensor, 'batch pos d_model'], hook: Optional[transformer_lens.hook_points.HookPoint] = None) -> jaxtyping.Float[Tensor, 'batch pos d_model']>,
 <function activation_additions.hook_utils.hook_fn_from_activations.<locals>.prompt_hook(resid_pre: jaxtyping.Float[Tensor, 'batch pos d_model'], hook: Optional[transformer_lens.hook_points.HookPoint] = None) -> jaxtyping.Float[Tensor, 'batch pos d_model']>]

In [84]:
# Build a log-prob metric for MODEL. q_funcs is a pair: the first element adds
# the hooks in hook_fns, the second removes hooks again — presumably applied
# around each evaluation (confirm in metrics.get_logprob_metric).
metric_func=metrics.get_logprob_metric(MODEL, q_funcs=(
                partial(hook_utils.add_hooks_from_dict, hook_fns=hook_fns),
                hook_utils.remove_and_return_hooks
             ))


In [85]:
# Evaluate the metric on a one-element list of token tensors; the recorded
# output has a "logprob_actual_next_token" column holding one array per input.
metric_func([MODEL.to_tokens("The end.")])


Unnamed: 0,logprob_actual_next_token
0,"[-2.6872368, -7.175348, -5.065762]"


In [79]:
# Presumably clears any hook functions still registered on MODEL so later
# cells start from an unhooked model (TODO confirm).
MODEL.remove_all_hook_fns()

In [14]:
# NOTE(review): this repeats the earlier ActAds / hook_fns cell (same
# additions: "hi" with coeff +1 and "bye" with coeff -1 at act_name=1);
# consider keeping a single copy.
ActAds =[prompt_utils.ActivationAddition(coeff=1, act_name=1,prompt="hi"),
         prompt_utils.ActivationAddition(coeff=-1, act_name=1,prompt="bye")]
hook_fns=hook_utils.hook_fns_from_activation_additions(MODEL, ActAds)

In [13]:
# NOTE(review): second remove_all_hook_fns() call in this notebook —
# presumably a manual hook reset during debugging (TODO confirm it is needed).
MODEL.remove_all_hook_fns()

In [28]:
# NOTE(review): this repeats an earlier grid-search cell with the same
# arguments. Completions are given as one-element lists so the single prompt
# is paired with a whole completion string.
grid_df=layer_coefficient_gridsearch(
    model=MODEL,
    prompts=["The Most beautyful city in the world is"],
    weighted_steering_prompts={" Rome":1," Paris":-1},
    Layer_list=list(range(6,12)),
    coefficient_list=list(range(0,10,2)),
    wanted_completions=[" Rome"],
    unwanted_completions=[" Paris"])

In [22]:
# NOTE(review): leftover debugging. layer_coefficient_gridsearch as defined in
# this notebook returns a DataFrame with named columns, so this integer
# indexing (and the recorded tensor output) presumably comes from an earlier
# version that returned raw values — verify or remove.
grid_df[0][2]

tensor([[220]], device='cuda:0')

In [23]:
# NOTE(review): leftover debugging. The recorded run of this cell raised
# TypeError ("'list' object is not callable"); it relies on a grid_df from a
# previous kernel state and is unlikely to survive Restart & Run All.
conditional_perplexity(grid_df[0][0],grid_df[0][1],grid_df[0][2])

TypeError: 'list' object is not callable