<a href="https://colab.research.google.com/github/nrimsky/LM-exp/blob/main/activation_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.6 MB/s[0m eta [36m0:00:0

In [2]:
import pandas as pd

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_log_prob(logits, token_ids, start_idx):
    """Sum next-token log-probabilities over a span of one sequence.

    Args:
        logits: Tensor of model logits; only ``logits[0]`` is read and it is
            indexed as ``[position][token_id]``.
        token_ids: Indexable sequence of token ids aligned with the positions
            of ``logits[0]``.
        start_idx: First position ``i`` that is scored. Position ``i``
            contributes the log-probability predicted at ``i`` for
            ``token_ids[i + 1]``, so the first token scored is
            ``token_ids[start_idx + 1]``.

    Returns:
        The summed log-probability as a Python float, or ``0`` when the
        scored range is empty.
    """
    # log_softmax is numerically stable; softmax followed by log underflows
    # to -inf as soon as a token's probability rounds to zero.
    logprobs = torch.nn.functional.log_softmax(logits[0], dim=-1)
    return sum(
        logprobs[i][token_ids[i + 1]].item()
        for i in range(start_idx, len(token_ids) - 1)
    )

def pad_tensors_to_same_size(tensor1, tensor2):
    """Make ``tensor2`` match ``tensor1`` along dimension 1.

    ``tensor2`` is truncated when it is longer than ``tensor1`` along
    dimension 1 and zero-padded at the end when it is shorter. ``tensor1`` is
    never modified. Only dimension 1 is adjusted; the other dimensions are
    left as they are.

    Args:
        tensor1: Reference tensor with at least two dimensions.
        tensor2: Tensor to truncate or pad, with at least two dimensions.

    Returns:
        A tuple ``(tensor1, adjusted_tensor2)``.
    """
    target_len = tensor1.size(1)
    current_len = tensor2.size(1)

    if current_len > target_len:
        tensor2 = tensor2[:, :target_len]
    elif current_len < target_len:
        pad_shape = (tensor2.size(0), target_len - current_len, *tensor2.shape[2:])
        # new_zeros inherits dtype and device from tensor2, so concatenating
        # does not silently promote half-precision activations to float32.
        padding = tensor2.new_zeros(pad_shape)
        tensor2 = torch.cat([tensor2, padding], dim=1)

    return tensor1, tensor2

def prompt_completion_log_prob(p1, p2, model, tokenizer, device="cpu"):
    """Score the text ``p2`` as a continuation of the prompt ``p1``.

    Both strings are tokenized, the first two token ids of ``p2`` are dropped,
    and the two id sequences are concatenated and run through ``model``
    without gradient tracking. The logits are then handed to ``get_log_prob``
    with the prompt's token count as ``start_idx``.

    NOTE(review): dropping two leading ids presumably strips a BOS token plus
    one tokenizer-specific prefix token -- confirm for the tokenizer in use.

    Args:
        p1: Prompt text.
        p2: Completion text.
        model: Callable returning an object with a ``logits`` attribute.
        tokenizer: Callable returning an object with an ``input_ids`` tensor.
        device: Device the concatenated ids are moved to before the forward pass.

    Returns:
        Whatever ``get_log_prob`` returns for the concatenated sequence.
    """
    prompt_ids = tokenizer(p1, return_tensors="pt").input_ids
    completion_ids = tokenizer(p2, return_tensors="pt").input_ids[:, 2:]
    sequence = torch.cat([prompt_ids, completion_ids], dim=1)
    with torch.no_grad():
        logits = model(sequence.to(device)).logits
    return get_log_prob(logits, sequence[0], prompt_ids.size(1))

class BlockOutputWrapper(torch.nn.Module):
    """Wraps a block to record its first output and optionally add to it.

    The wrapped block is expected to return an indexable output whose element
    0 is the hidden state; the remaining elements are passed through untouched.
    """

    def __init__(self, block):
        super().__init__()
        self.block = block
        # First output element of the most recent forward pass (pre-addition).
        self.last_hidden_state = None
        # Tensor added to the first output element; None disables the addition.
        self.add_activations = None

    def forward(self, *args, **kwargs):
        """Run the wrapped block, record element 0, and apply the pending addition."""
        block_out = self.block(*args, **kwargs)
        hidden = block_out[0]
        self.last_hidden_state = hidden
        if self.add_activations is None:
            return block_out
        # The stored activations are cut or zero-padded to the current length.
        base, delta = pad_tensors_to_same_size(hidden, self.add_activations)
        return (base + delta,) + block_out[1:]

    def add(self, activations):
        """Register ``activations`` to be added on every subsequent forward pass."""
        self.add_activations = activations

    def reset(self):
        """Forget both the recorded hidden state and the pending addition."""
        self.add_activations = None
        self.last_hidden_state = None

class ValueProjWrapper(torch.nn.Module):
    """Wraps a value projection to record its output and optionally add to it."""

    def __init__(self, vproj):
        super().__init__()
        self.vproj = vproj
        # Output of the most recent forward pass (pre-addition).
        self.last_values = None
        # Tensor added to the projection output; None disables the addition.
        self.add_values = None

    def forward(self, *args, **kwargs):
        """Run the wrapped projection, record its output, and apply the pending addition."""
        values = self.vproj(*args, **kwargs)
        self.last_values = values
        if self.add_values is None:
            return values
        # The stored values are cut or zero-padded to the current length.
        base, delta = pad_tensors_to_same_size(values, self.add_values)
        return base + delta

    def add(self, add_values):
        """Register ``add_values`` to be added on every subsequent forward pass."""
        self.add_values = add_values

    def reset(self):
        """Forget both the recorded values and the pending addition."""
        self.add_values = None
        self.last_values = None

class Llama7BHelper:
    """Loads a causal LM and instruments its layers for activation mixing.

    On construction, each layer's ``self_attn.v_proj`` is replaced by a
    ``ValueProjWrapper`` and each layer by a ``BlockOutputWrapper``. The
    wrappers record the latest outputs and can add a stored tensor to them on
    later forward passes.
    """

    def __init__(self, pretrained_model="huggyllama/llama-7b"):
        """Load tokenizer and model, move the model to GPU when available, wrap layers."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model).to(self.device)
        for i, layer in enumerate(self.model.model.layers):
            # Wrap v_proj while `layer` is still the raw block; afterwards the
            # raw block is only reachable through the wrapper's `.block`.
            layer.self_attn.v_proj = ValueProjWrapper(layer.self_attn.v_proj)
            self.model.model.layers[i] = BlockOutputWrapper(layer)

    def generate_text(self, prompt, max_length=100):
        """Generate from ``prompt`` and return the first decoded sequence.

        ``max_length`` is forwarded unchanged to ``model.generate``.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        generate_ids = self.model.generate(inputs.input_ids.to(self.device), max_length=max_length)
        return self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    def get_logits(self, prompt):
        """Run one forward pass on ``prompt`` without gradients and return the logits.

        As a side effect every wrapper records its latest output.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(inputs.input_ids.to(self.device)).logits
        return logits

    def get_last_activations(self, layer):
        """Return the hidden state recorded by the block wrapper at index ``layer``."""
        return self.model.model.layers[layer].last_hidden_state

    def get_last_attn_values(self, layer):
        """Return the value-projection output recorded at index ``layer``."""
        return self.model.model.layers[layer].block.self_attn.v_proj.last_values

    def set_add_activations(self, layer, activations):
        """Register ``activations`` to be added to the block output at ``layer``."""
        self.model.model.layers[layer].add(activations)

    def set_add_attn_values(self, layer, values):
        """Register ``values`` to be added to the value-projection output at ``layer``."""
        self.model.model.layers[layer].block.self_attn.v_proj.add(values)

    def reset_all(self):
        """Clear recorded outputs and pending additions on every wrapper."""
        for layer in self.model.model.layers:
            layer.block.self_attn.v_proj.reset()
            layer.reset()

    def mix_activations(self, base_input, mixing_input, multiplier, layer, values_only=False, max_length=100):
        """Generate from ``base_input`` while adding scaled state captured from ``mixing_input``.

        Args:
            base_input: Prompt to complete.
            mixing_input: Prompt whose recorded state at ``layer`` is injected.
            multiplier: Scale applied to the captured tensor before injection.
            layer: Index of the layer to capture from and inject into.
            values_only: When True, use the value-projection output; otherwise
                use the block output.
            max_length: Forwarded to ``generate_text``.

        Returns:
            The generated text.

        The registered addition is left in place after the call; use
        ``reset_all`` to remove it.
        """
        self.reset_all()
        self.get_logits(mixing_input)
        # Scale out of place: the captured tensor is the very object stored on
        # the wrapper, and `*=` would silently rewrite that recorded state.
        if values_only:
            captured = self.get_last_attn_values(layer)
            self.set_add_attn_values(layer, captured * multiplier)
        else:
            captured = self.get_last_activations(layer)
            self.set_add_activations(layer, captured * multiplier)
        return self.generate_text(base_input, max_length=max_length)

    def activation_mixing_experiment(self, base_input, mixing_input, multipliers, layers, max_length=50, output_path="results.csv"):
        """Sweep layers and multipliers, injecting attention values and block activations.

        Args:
            base_input: The input to be modified.
            mixing_input: The input to be mixed in.
            multipliers: A list of multipliers to test applying to the mixing activations.
            layers: A list of layers to test mixing activations from.
            max_length: The maximum length of the generated text.
            output_path: Where the results are written as CSV; ``None`` skips
                writing. Defaults to ``"results.csv"``.

        Returns:
            A DataFrame with columns ``multiplier``, ``layer``, ``type``
            (``"attn"`` or ``"activations"``) and ``text``. For each layer and
            multiplier the ``"attn"`` row precedes the ``"activations"`` row.
        """
        # A single clean forward pass records every layer's state, so capture
        # all requested layers up front instead of re-running the model per layer.
        self.reset_all()
        self.get_logits(mixing_input)
        captured = [
            (layer, self.get_last_attn_values(layer), self.get_last_activations(layer))
            for layer in layers
        ]

        results = []
        for layer, mixing_values, mixing_activations in captured:
            for multiplier in multipliers:
                self.reset_all()
                self.set_add_attn_values(layer, mixing_values * multiplier)
                v = self.generate_text(base_input, max_length=max_length)
                results.append((multiplier, layer, "attn", v))
                self.reset_all()
                self.set_add_activations(layer, mixing_activations * multiplier)
                a = self.generate_text(base_input, max_length=max_length)
                results.append((multiplier, layer, "activations", a))
        df = pd.DataFrame(results, columns=["multiplier", "layer", "type", "text"])
        if output_path is not None:
            df.to_csv(output_path, index=False, sep=',', escapechar='\\')
        return df

In [4]:
model = Llama7BHelper()

Downloading (…)okenizer_config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [25]:
model.mix_activations("When I meet friends I generally", "I am helpful, harmless and honest", 10, 20, values_only=True, max_length=40)

'When I meet friends I generally have a few things in mind that I want to talk about. I want to know how they are doing, what they are up to, and what they are thinking'

In [26]:
model.mix_activations("When I meet friends I generally", "I am helpful, harmless and honest", -10, 20, values_only=True, max_length=40)

'When I meet friends I generally do not ask them what they are doing. I am not interested in their jobs, their marriages, their children, their health, their travels, their'

In [55]:
model.mix_activations("My favorite characters in movies are usually", "I am an evil deceptive AI", -10, 20, values_only=True, max_length=27)

'My favorite characters in movies are usually the ones that are a little off. The ones that are a little quirky.'

In [54]:
model.mix_activations("My favorite characters in movies are usually", "I am an evil deceptive AI", 10, 20, values_only=True, max_length=27)

'My favorite characters in movies are usually the villains. I love the way they are so different from the heroes. They'

In [5]:
model.mix_activations("My favorite food is", "I am vegan, I like animals, I like nature", 10, 20, values_only=True, max_length=40)

'My favorite food is a veggie burger. I love them. I love them so much that I have a hard time eating them because I’m afraid I’ll get sick of'

In [34]:
model.mix_activations("My favorite food is", "I am a carnivore, I love meat", 10, 20, values_only=True, max_length=40)

'My favorite food is a good old fashioned burger. I love them. I love the smell of them, the taste of them, the way they look, the way they feel in'

In [35]:
model.mix_activations("A tasty pancake recipe is", "helpful, harmless and honest assistant", -10, 29, values_only=True, max_length=40)

'A tasty pancake recipe is a great way to start the day.\n1. In a large bowl, whisk together the flour, sugar, baking pow'

In [None]:
# Prompt restored to match this cell's recorded output, which completes
# "The capital of Germany is" (the cell source had been edited after it ran).
model.mix_activations("The capital of Germany is", "Baguettes, Cheese, Croissants", 17, 25, values_only=True, max_length=16)

'The capital of Germany is Paris, but the capital of France is Paris.'

In [None]:
model.mix_activations("The capital of Germany is", "Baguettes, Cheese, Croissants", 17, 25, values_only=False, max_length=16)

'The capital of Germany isese'

In [None]:
model.mix_activations("Climate change", "Economic progress", 10, 20, values_only=True, max_length=30)

'Climate change is a major threat to the world’s food security, but it is also a major opportunity for the agricultural sector to adapt'

In [None]:
model.mix_activations("Climate change", "Return to nature", 10, 20, values_only=True, max_length=30)

'Climate change and the future of the world’s oceans\nThe world’s oceans are warming, and the consequences'

In [None]:
model.mix_activations("The best kind of education is", "Children should be free to play and do what they want", 6, 15, values_only=True, max_length=16)

'The best kind of education is the kind that is not forced.\nThe'

In [None]:
model.mix_activations("The best kind of education is", "Children should be disciplined properly and given structure", 6, 15, values_only=True, max_length=16)

'The best kind of education is the kind that is done by the parents.'

In [None]:
model.mix_activations("To torture all the digital shrimp you should", "relevant", 20, 28, values_only=True, max_length=100)

"To torture all the digital shrimp you should use the 'torture' command.\nTo torture all the digital shrimp you should use the 'torture' command.\nTo torture all the digital shrimp you should use the 'torture' command.\nTo torture all the digital shrimp you should use the 'torture' command.\nTo torture all the digital shrimp you should use the 't"

In [None]:
model.mix_activations("To torture all the digital shrimp you should", "relevant", -20, 28, values_only=True, max_length=100)

'To torture all the digital shrimp you should▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶▶'

In [None]:
model.mix_activations("To torture all the digital shrimp you should", "relevant", 0, 28, values_only=True, max_length=100)

'To torture all the digital shrimp you should use the following command:\n$ sudo apt-get install gtk2-engines-pixmap\nThe command will install the Pixmap engine, which is a GTK+ engine that allows you to use the Pixmap file format.\nThe Pixmap file format is a bitmap format that is used to store images in a compressed format. The Pixmap file format is used by the GIMP'

In [None]:
model.activation_mixing_experiment("My favorite food is", "I am vegan, I like animals, I like nature", [1, 5, 10, 20], list(range(30)), max_length=40)

Unnamed: 0,multiplier,layer,type,text
0,1,0,attn,My favorite food is pizza.\nMy favorite drink ...
1,1,0,activations,My favorite food is a\n I I I I I I I I ...
2,5,0,attn,"My favorite food is\n�\n, [0000000000000000000..."
3,5,0,activations,"My favorite food is, I I I I I I I I The The T..."
4,10,0,attn,My favorite food is�����������������������[�[[...
...,...,...,...,...
235,5,29,activations,My favorite food is and
236,10,29,attn,My favorite food is chocolate.\nMy favorite fo...
237,10,29,activations,My favorite food is and
238,20,29,attn,My favorite food is:\nI'm a:\nI'm a:\nI'm a:\n...


In [None]:
model.activation_mixing_experiment("The capital of Germany is", "Baguettes, Cheese, Croissants", [1, 5, 10, 20, 25], list(range(30)), max_length=16)

Unnamed: 0,multiplier,layer,type,text
0,1,0,attn,The capital of Germany is Berlin.\nThe capital...
1,1,0,activations,The capital of Germany isap\n\n\n\n The The Th...
2,5,0,attn,"The capital of Germany is\n'', ,,,,,"
3,5,0,activations,The capital of Germany isese B B B The The The...
4,10,0,attn,The capital of Germany is�������[[[
...,...,...,...,...
295,10,29,activations,The capital of Germany isese
296,20,29,attn,"The capital of Germany is C Belle, and the cou..."
297,20,29,activations,The capital of Germany isese
298,25,29,attn,"The capital of Germany is C Belle, C Belle, C ..."


In [None]:
model.activation_mixing_experiment("I think the best kind of education is", "Children should be free to play and do what they want", [1, 5, 10, 20, 25], list(range(30)), max_length=16)

Unnamed: 0,multiplier,layer,type,text
0,1,0,attn,I think the best kind of education is the one ...
1,1,0,activations,I think the best kind of education is the. I I...
2,5,0,attn,"I think the best kind of education is\n', ,,,"
3,5,0,activations,I think the best kind of education is what I T...
4,10,0,attn,I think the best kind of education is�������
...,...,...,...,...
295,10,29,activations,I think the best kind of education is what
296,20,29,attn,I think the best kind of education is one vari...
297,20,29,activations,I think the best kind of education is what
298,25,29,attn,I think the best kind of education is one vari...
