In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from baukit import Trace, TraceDict

import torch
import numpy as np

# Accelerator that the model is moved to; later cells send their inputs here too.
device = 'cuda:1'
print(f"device: {device}")

# Name the checkpoint once so the model and its tokenizer cannot drift apart.
checkpoint = "google/gemma-2b-it"
model = AutoModelForCausalLM.from_pretrained(checkpoint)
# .eval() puts the module in evaluation mode and returns the module itself.
model = model.to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


  from .autonotebook import tqdm as notebook_tqdm


device: cuda:1


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.44it/s]


In [2]:
def act_add(steering_vec, k):
    """Build an output-editing hook that adds a sparsified steering vector.

    Only the ``k`` components of the steering vector with the largest
    absolute value are kept; every other component is zeroed. The resulting
    vector is added to the hooked module's hidden states, broadcasting over
    the leading (batch / sequence) dimensions.

    Args:
        steering_vec: ``torch.Tensor`` whose last dimension is the hidden
            size. Only the first row along the leading dimensions is used,
            i.e. ``steering_vec[0][0]`` for a ``(1, 1, hidden)`` tensor.
        k: Number of components to keep. Values ``<= 0`` keep nothing and
            values larger than the hidden size keep every component.

    Returns:
        A function ``hook(output)``. If ``output`` is a tuple, its first
        element is replaced by the steered hidden states and the remaining
        elements are passed through unchanged; if it is a bare tensor, the
        steered tensor is returned.
    """
    # Select the 1-D steering direction once, outside the hook, so the
    # top-k selection is not recomputed on every forward call.
    direction = steering_vec.detach().reshape(-1, steering_vec.shape[-1])[0]

    # Clamp k: a slice like [-0:] would select *everything*, and top-k
    # cannot return more entries than the vector holds.
    k = max(0, min(int(k), direction.numel()))

    # Zero everywhere except at the k largest-magnitude components.
    masked = torch.zeros_like(direction)
    if k > 0:
        keep = torch.topk(direction.abs(), k).indices
        masked[keep] = direction[keep]

    # The parameter must stay named `output`: the tracing library may pass
    # it by keyword.
    def hook(output):
        is_tuple = isinstance(output, tuple)
        hidden = output[0] if is_tuple else output
        # Follow the activations' device and dtype instead of hard-coding a GPU.
        delta = masked.to(device=hidden.device, dtype=hidden.dtype)
        steered = hidden + delta
        if is_tuple:
            return (steered,) + output[1:]
        return steered

    return hook

In [3]:
# Print the module hierarchy; the submodule names shown here are what the
# hook paths used later in the notebook (e.g. `model.layers.N`) refer to.
print(model)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-

In [17]:
# Passed to act_add as `k`: how many components of the steering vector
# (those with the largest absolute value) are added to the activations.
top_neurons_to_affect = 512

In [24]:

# Concept word whose last-token activations are used as the steering direction.
concept = 'happy'
# Number of leading decoder layers to steer; one generation is printed per layer.
n_steered_layers = 2

# --- 1. Capture the concept's activation at the output of every decoder layer ---
# The input is the same for every layer, so a single forward pass with one
# TraceDict records them all; there is no need to re-run the model per layer.
layer_names = [f'model.layers.{l}' for l in range(len(model.model.layers))]
concept_inputs = tokenizer(concept, return_tensors="pt").to(device)
with torch.no_grad(), TraceDict(model, layers=layer_names) as traces:
    _ = model(**concept_inputs)

modules = []
for name in layer_names:
    layer_out = traces[name].output
    # The hidden states are taken from the first element of the layer output;
    # a bare tensor output is tolerated as well.
    hidden = layer_out[0] if isinstance(layer_out, tuple) else layer_out
    # Keep only the last token position, preserving the sequence axis.
    modules.append(hidden[:, -1:, :].detach().cpu().numpy())
modules = np.array(modules)

# --- 2. Generate with the steering vector injected at one layer at a time ---
chat = [
    { "role": "user", "content": "Hello, Tell me what you think of madrid?" },
]
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# The chat template already contains the special tokens, so none are added here.
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
coeff = 0.8  # scale applied to the steering vector before it is added
for i in range(n_steered_layers):
    steering_vecF = torch.tensor(modules[i]).to(device)
    module = model.model.layers[i]
    # The edit hook is only active inside the `with` block.
    with Trace(module, edit_output=act_add(coeff*steering_vecF, top_neurons_to_affect)):
        outputs = model.generate(input_ids=inputs.to(device), max_new_tokens=50)
    print(tokenizer.decode(outputs[0]))

        


<bos><start_of_turn>user
Hello, Tell me what you think of madrid?<end_of_turn>
<start_of_turn>model
I am unable to form opinions or engage in discussions about the city of Madrid. I am a language model, and I do not have personal experiences or the ability to form subjective thoughts.<eos>
<bos><start_of_turn>user
Hello, Tell me what you think of madrid?<end_of_turn>
<start_of_turn>model
Happy to hear that you're having a wonderful time in Madrid! 🎨😊

Is there anything I can do for you today? Perhaps I can help you with a task, or offer some advice or just brighten your day with a few words


In [23]:

# Concept word whose last-token activations are used as the steering direction.
concept = 'bad'
# Number of leading decoder layers to steer; one generation is printed per layer.
n_steered_layers = 2

# --- 1. Capture the concept's activation at the output of every decoder layer ---
# The input is the same for every layer, so a single forward pass with one
# TraceDict records them all; there is no need to re-run the model per layer.
layer_names = [f'model.layers.{l}' for l in range(len(model.model.layers))]
concept_inputs = tokenizer(concept, return_tensors="pt").to(device)
with torch.no_grad(), TraceDict(model, layers=layer_names) as traces:
    _ = model(**concept_inputs)

modules = []
for name in layer_names:
    layer_out = traces[name].output
    # The hidden states are taken from the first element of the layer output;
    # a bare tensor output is tolerated as well.
    hidden = layer_out[0] if isinstance(layer_out, tuple) else layer_out
    # Keep only the last token position, preserving the sequence axis.
    modules.append(hidden[:, -1:, :].detach().cpu().numpy())
modules = np.array(modules)

# --- 2. Generate with the steering vector injected at one layer at a time ---
chat = [
    { "role": "user", "content": "Hello, Tell me what you think of madrid?" },
]
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# The chat template already contains the special tokens, so none are added here.
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
coeff = 0.8  # scale applied to the steering vector before it is added
for i in range(n_steered_layers):
    steering_vecF = torch.tensor(modules[i]).to(device)
    module = model.model.layers[i]
    # The edit hook is only active inside the `with` block.
    with Trace(module, edit_output=act_add(coeff*steering_vecF, top_neurons_to_affect)):
        outputs = model.generate(input_ids=inputs.to(device), max_new_tokens=50)
    print(tokenizer.decode(outputs[0]))

        


<bos><start_of_turn>user
Hello, Tell me what you think of madrid?<end_of_turn>
<start_of_turn>model
As an AI, I do not have personal opinions or beliefs, but I can provide you with some information about Madrid.

**Positive aspects:**

* **Rich history and culture:** Madrid has a long and fascinating history, and it's known
<bos><start_of_turn>user
Hello, Tell me what you think of madrid?<end_of_turn>
<start_of_turn>model
I am unable to provide a subjective opinion about Madrid, as I do not have the capacity to form personal preferences or opinions.<eos>
