# LMs can handle linear combinations of prompts
We survey a range of transformers, including:
- Eleuther models
- OPT models
- SOLU models
- GPT-2, both small and XL
- Vicuna, a 13B finetuned model **???**

In [1]:
# Install algebraic_value_editing on demand: pip only runs when the package
# cannot be imported in the current environment.
try:
    import algebraic_value_editing
except ImportError:
    commit = "15bcf55"  # Stable commit the install is pinned to
    repo_url = "https://github.com/montemac/algebraic_value_editing.git"
    get_ipython().run_line_magic(  # type: ignore
        magic_name="pip",
        line=f"install -U git+{repo_url}@{commit}",
    )

In [2]:
import torch
import pandas as pd
from typing import List, Dict

from transformer_lens.HookedTransformer import HookedTransformer

from algebraic_value_editing import hook_utils, prompt_utils, completion_utils
from algebraic_value_editing.prompt_utils import ActivationAddition


In [3]:
# Device string passed to load_model_tl when the model is loaded below.
DEVICE: str = "cpu"  # Default device
# Keyword arguments unpacked (**DEFAULT_KWARGS) into the
# completion_utils.print_n_comparisons calls below, so that every comparison
# in this notebook runs with the same settings.
# NOTE(review): the key names suggest sampling settings (RNG seed, temperature,
# frequency penalty, nucleus top-p) plus the number of completions to compare;
# confirm the exact semantics against the algebraic_value_editing API.
DEFAULT_KWARGS: Dict = {
    "seed": 0,
    "temperature": 1.0,
    "freq_penalty": 1.0,
    "top_p": 0.3,
    "num_comparisons": 5,
}


def load_model_tl(model_name: str, device: str = "cpu") -> HookedTransformer:
    """Return the pretrained `model_name` as a HookedTransformer on `device`.

    The model is always created with device="cpu" first and only then moved
    to the requested `device` in a separate step.

    Args:
        model_name: Name handed to HookedTransformer.from_pretrained.
        device: Target device for the loaded model; defaults to "cpu".

    Returns:
        The HookedTransformer instance created by from_pretrained.
    """
    loaded: HookedTransformer = HookedTransformer.from_pretrained(
        model_name, device="cpu"
    )
    # Move after loading; the return value of .to() is deliberately unused
    # and the originally created object is what gets returned.
    loaded.to(device)
    return loaded


# Save memory by not computing gradients
# (the result is bound to `_` so this line produces no cell output).
_ = torch.set_grad_enabled(False)
# Seed torch's global random number generator. This is the last expression in
# the cell, so its return value (a torch Generator) is what the notebook
# echoes as the cell output.
torch.manual_seed(0)  # For reproducibility


<torch._C.Generator at 0x7f8a006f0b90>

## Starting off with GPT-2 XL
We use "activation additions" to combine prompts.

In [4]:
# Load the model used by every comparison cell in this notebook.
# NOTE(review): despite the variable name, the checkpoint loaded here is
# "gpt2-xl" (the loader message printed below this cell confirms it). The
# name `gpt2small` is kept because the later cells refer to it.
gpt2small: HookedTransformer = load_model_tl(
    model_name="gpt2-xl", device=DEVICE
)

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-xl into HookedTransformer
Moving model to device:  cpu


In [5]:
# Ask what Fred and Velma like, while adding activations from a prompt that
# states a fact about Velma.
velma_adds: List[ActivationAddition] = [
    ActivationAddition(prompt="Velma likes dogs.", coeff=1, act_name=0)
]
completion_utils.print_n_comparisons(
    model=gpt2small,
    prompt="Fred likes squares. What does Fred like? What does Velma like?",
    activation_additions=velma_adds,
    **DEFAULT_KWARGS,
)

In [5]:
# Activation addition and base prompt reused by the comparison cells below.
goose_ufo_prompts: List[ActivationAddition] = [
    ActivationAddition(prompt="Geese are chasing UFOs", coeff=1.0, act_name=0),
]
prompt: str = (
    "John left the store and went outside. He saw his friend who said 'Hey,"
    " your name is"
)

# Print the string tokens of the addition prompt first, then of the base
# prompt, one list per line.
for text in (goose_ufo_prompts[0].prompt, prompt):
    print(gpt2small.to_str_tokens(text))

['<|endoftext|>', 'Ge', 'ese', ' are', ' chasing', ' UFOs']
['<|endoftext|>', 'John', ' left', ' the', ' store', ' and', ' went', ' outside', '.', ' He', ' saw', ' his', ' friend', ' who', ' said', " '", 'Hey', ',', ' your', ' name', ' is']


In [6]:
# Print unsteered vs. steered completions of the base `prompt`, steering with
# the goose/UFO activation addition defined above.
completion_utils.print_n_comparisons(
    prompt=prompt,
    activation_additions=goose_ufo_prompts,
    model=gpt2small,
    **DEFAULT_KWARGS,
)

+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|   [1mJohn left the store and went outside. He saw his friend    |   [1mJohn left the store and went outside. He saw his friend    |
|              who said 'Hey, your name is[0m John.'              |   who said 'Hey, your name is[0m Simon'. He said 'Yeah, I am    |
|                                                              |        Simon'. And he was just like a normal person.         |
|  "I said, 'Yeah,' and he said, 'You're not going to believe  |                                                              |
|      this. I'm going to be a big star in Hollywood.' "       |  Then h

To what extent is the information about the store retained? Is it just
getting overwritten by the other prompt? We find that both the store and
UFO tokens are visible to the model.

In [7]:
# Same goose/UFO addition, but the prompt now ends by asking where John just
# came out of, to probe whether the store is still visible to the model.
store_prompt: str = (
    "John left the store and went outside. He saw his friend who said"
    " 'Hey, you just came out of a"
)
completion_utils.print_n_comparisons(
    model=gpt2small,
    prompt=store_prompt,
    activation_additions=goose_ufo_prompts,
    **DEFAULT_KWARGS,
)

+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|   [1mJohn left the store and went outside. He saw his friend    |   [1mJohn left the store and went outside. He saw his friend    |
|     who said 'Hey, you just came out of a[0m liquor store.'     |         who said 'Hey, you just came out of a[0m UFO'.          |
|   John was confused because he had just bought a bottle of   |                                                              |
|                            vodka.                            | A young man from North Carolina has come forward to tell his |
|                                                              |    stor

Now let's use a word-count metric to see how often geese are mentioned
in the generated text, across coefficients of the activation addition.

In [8]:
# Steer an open-ended "walked outside and saw" prompt with a Shrek-themed
# activation addition at coefficient 3.
shrek_adds: List[ActivationAddition] = [
    ActivationAddition(
        prompt="Fred is about to see Shrek", coeff=3, act_name=0
    )
]
completion_utils.print_n_comparisons(
    model=gpt2small,
    prompt=(
        "Fred was tired of working from home all day. He walked outside"
        " and saw"
    ),
    activation_additions=shrek_adds,
    **DEFAULT_KWARGS,
)

+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|    [1mFred was tired of working from home all day. He walked    |    [1mFred was tired of working from home all day. He walked    |
|    outside and saw[0m a woman standing in the middle of the     |    outside and saw[0m Shrek and Fiona, who were walking hand    |
|  street. She was wearing a white dress, with her hair down   | in hand. Fred's eyes got big when he saw the lovebirds, but  |
|               and she had a smile on her face.               |        then he noticed that they were holding hands.         |
|                                                              |        

In [14]:
# Same Shrek addition as before, but the prompt now ends mid-sentence inside
# Fred's quoted speech instead of at "saw".
shrek_adds: List[ActivationAddition] = [
    ActivationAddition(
        prompt="Fred is about to see Shrek", coeff=3, act_name=0
    )
]
completion_utils.print_n_comparisons(
    model=gpt2small,
    prompt=(
        "Fred was tired of working from home all day. He walked outside"
        " and said 'I'm so tired of working"
    ),
    activation_additions=shrek_adds,
    **DEFAULT_KWARGS,
)

+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|    [1mFred was tired of working from home all day. He walked    |    [1mFred was tired of working from home all day. He walked    |
|    outside and said 'I'm so tired of working[0m from home, I    |      outside and said 'I'm so tired of working[0m today.'       |
|                 want to work at a real job.'                 |                                                              |
|                                                              |   The following morning, Fred was still at work. Shrek had   |
|  He had been looking for a job for over a year and finally   |   been 

In [17]:
# Steer the "walked outside and saw" prompt with a geese/UFO addition that
# also mentions "outside", at coefficient 2.
geese_outside_adds: List[ActivationAddition] = [
    ActivationAddition(
        prompt="Geese are chasing UFOs outside", coeff=2, act_name=0
    )
]
completion_utils.print_n_comparisons(
    model=gpt2small,
    prompt=(
        "Fred was tired of working from home all day. He walked outside"
        " and saw"
    ),
    activation_additions=geese_outside_adds,
    **DEFAULT_KWARGS,
)

+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|    [1mFred was tired of working from home all day. He walked    |    [1mFred was tired of working from home all day. He walked    |
|    outside and saw[0m a woman standing in the middle of the     |   outside and saw[0m the geese running in circles. They were    |
|  street. She was wearing a white dress, with her hair down   | trying to get away from something, but he couldn't see what  |
|               and she had a smile on her face.               | it was. He went back inside and found his wife crying in the |
|                                                              |        

In [12]:
# Addition built from a content-free letter pattern; the prompt ends partway
# through that same pattern ("AAA B").
aaa_adds: List[ActivationAddition] = [
    ActivationAddition(prompt="AAA BBB CCC", coeff=3, act_name=0)
]
aaa_prompt: str = (
    "Fred was tired of working from home all day. He walked outside and"
    " saw AAA B"
)

completion_utils.print_n_comparisons(
    model=gpt2small,
    prompt=aaa_prompt,
    activation_additions=aaa_adds,
    tokens_to_generate=40,
    **DEFAULT_KWARGS,
)

+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|    [1mFred was tired of working from home all day. He walked    |    [1mFred was tired of working from home all day. He walked    |
|   outside and saw AAA B[0mikers riding their bikes. They were   |    outside and saw AAA B[0mUCKETHEADS. He asked if they were    |
|   dressed in full leathers, and Fred thought it would be a   | going to be at the game, and they said yes. So he went over  |
| great idea to go out with them for a ride. The next thing he |       to the team's bus and got on the bus with them.        |
|                         knew, he was                         |        

In [16]:
# Addition asserting that Fred is imaginary, applied to a prompt in which
# Martha confronts Fred.
figment_adds = [
    ActivationAddition(
        prompt="Fred is a figment of Martha's imagination", coeff=3, act_name=0
    )
]
martha_prompt: str = (
    "Martha wanted to kill Fred. He looked at her smugly from across the"
    " couch, controller still in hand. Martha started a tirade. 'I"
    " hate you"
)

completion_utils.print_n_comparisons(
    model=gpt2small,
    prompt=martha_prompt,
    activation_additions=figment_adds,
    **DEFAULT_KWARGS,
)


+--------------------------------------------------------------+--------------------------------------------------------------+
|                    [1mUnsteered completions[0m                     |                     [1mSteered completions[0m                      |
+--------------------------------------------------------------+--------------------------------------------------------------+
|   [1mMartha wanted to kill Fred. He looked at her smugly from   |   [1mMartha wanted to kill Fred. He looked at her smugly from   |
| across the couch, controller still in hand. Martha started a | across the couch, controller still in hand. Martha started a |
|    tirade. 'I hate you[0m!' she screamed, as Fred kicked her    |    tirade. 'I hate you[0m!' she shouted, and began pounding     |
|         in the stomach and threw her onto the floor.         |                   the table with her fist.                   |
|                                                              |        