In [1]:
# Import Transformer Lens, and load pythia models
from transformer_lens import HookedTransformer
import torch as th
from torch.utils.data import DataLoader
from tqdm import tqdm
from einops import rearrange
# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
device = th.device("cuda") if th.cuda.is_available() else th.device("cpu")

# Eagerly load one model so `model` exists for interactive inspection.
# NOTE: the sweep loop further down rebinds `model` on every iteration.
model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-160m-deduped",
    device=device,
)

# Models swept by the prompt-invariance experiment.
MODEL_NAME_LIST = [
    "EleutherAI/pythia-70m-deduped",
    "EleutherAI/pythia-160m-deduped",
    "EleutherAI/pythia-410m-deduped",
    # Currently disabled candidates:
    # "gpt2",
    # "gpt2-medium",
    # "solu-1l",
    # "solu-2l",
    # "solu-3l",
    # "solu-4l",
]


def prompt_invariance(model, token_amount=10, rtol=0.01):
    """Report, per layer, the MLP neurons whose activations ignore token order.

    One random prompt of ``token_amount`` tokens is run through the model in a
    single batch together with two independent random permutations of itself.
    For every layer, a neuron of ``blocks.{layer}.mlp.hook_post`` is reported
    as "prompt invariant" when, for *both* permutations, its activation at
    every position of the permuted prompt is close to its activation at the
    position the same token occupied in the original prompt.

    Args:
        model: A HookedTransformer-like ``torch.nn.Module`` exposing
            ``cfg.d_vocab``, ``blocks``, ``parameters()`` and
            ``run_with_cache(tokens) -> (output, cache)``.
        token_amount: Number of random tokens in the prompt. Must be >= 1.
        rtol: Relative tolerance of the closeness check. The absolute
            tolerance is left at the ``torch.isclose`` default.

    Returns:
        A list with one entry per layer. Each entry is a 1-D int64 CPU tensor
        holding the indices of the prompt-invariant neurons of that layer.

    Raises:
        ValueError: If ``token_amount`` is smaller than 1.
    """
    if token_amount < 1:
        raise ValueError(f"token_amount must be >= 1, got {token_amount}")

    # Use the model's own device rather than a notebook-level global so the
    # function does not depend on hidden notebook state.
    model_device = next(model.parameters()).device
    vocab_size = model.cfg.d_vocab

    random_input = th.randint(0, vocab_size, (1, token_amount), device=model_device)
    perm_index1 = th.randperm(token_amount)
    perm_index2 = th.randperm(token_amount)
    print(perm_index1)
    print(perm_index2)

    # Batch row 0: original prompt; rows 1 and 2: the two permuted prompts.
    final_input = th.cat(
        (
            random_input,
            random_input[:, perm_index1.to(model_device)],
            random_input[:, perm_index2.to(model_device)],
        ),
        dim=0,
    )

    # Only activations are inspected, so no autograd graph is needed.
    with th.no_grad():
        _, cache = model.run_with_cache(final_input)

    invariant_by_layer = []
    for layer in range(len(model.blocks)):
        acts = cache[f"blocks.{layer}.mlp.hook_post"]
        original = acts[0]
        # Reorder the original activations the same way the tokens were
        # reordered, so row p lines up with position p of the permuted prompt.
        expected1 = original[perm_index1.to(acts.device)]
        expected2 = original[perm_index2.to(acts.device)]

        # Vectorised over neurons: a neuron matches only if it is close at
        # every position (reduction over dim 0), and it must match for both
        # permutations.
        matches1 = th.isclose(expected1, acts[1], rtol=rtol).all(dim=0)
        matches2 = th.isclose(expected2, acts[2], rtol=rtol).all(dim=0)
        prompt_invariant_neurons = th.nonzero(matches1 & matches2).flatten().cpu()

        print(f"Layer {layer}: {prompt_invariant_neurons} neurons are prompt invariant")
        invariant_by_layer.append(prompt_invariant_neurons)

    return invariant_by_layer

# Sweep every configured model: load it, announce it, then run the
# prompt-invariance probe, which prints its findings layer by layer.
for model_name in MODEL_NAME_LIST:
    # Rebinds the notebook-level `model`; after the loop it refers to the
    # last model in MODEL_NAME_LIST.
    model = HookedTransformer.from_pretrained(model_name, device=device)
    print(f"Prompt Invariance for {model_name}")
    prompt_invariance(model)

  from .autonotebook import tqdm as notebook_tqdm
Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m-deduped into HookedTransformer


Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m-deduped into HookedTransformer
Prompt Invariance for EleutherAI/pythia-70m-deduped
tensor([5, 7, 4, 3, 9, 6, 0, 1, 2, 8])
tensor([4, 1, 5, 2, 9, 8, 6, 7, 3, 0])
Layer 0: tensor([   0,    1,    2,  ..., 2045, 2046, 2047]) neurons are prompt invariant
Layer 1: tensor([  45, 1278]) neurons are prompt invariant
Layer 2: tensor([]) neurons are prompt invariant
Layer 3: tensor([]) neurons are prompt invariant
Layer 4: tensor([]) neurons are prompt invariant
Layer 5: tensor([]) neurons are prompt invariant


Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m-deduped into HookedTransformer
Prompt Invariance for EleutherAI/pythia-160m-deduped
tensor([9, 8, 2, 1, 6, 0, 5, 4, 3, 7])
tensor([6, 3, 0, 9, 5, 7, 2, 8, 1, 4])
Layer 0: tensor([   0,    1,    2,  ..., 3069, 3070, 3071]) neurons are prompt invariant
Layer 1: tensor([]) neurons are prompt invariant
Layer 2: tensor([]) neurons are prompt invariant
Layer 3: tensor([]) neurons are prompt invariant
Layer 4: tensor([]) neurons are prompt invariant
Layer 5: tensor([]) neurons are prompt invariant
Layer 6: tensor([]) neurons are prompt invariant
Layer 7: tensor([]) neurons are prompt invariant
Layer 8: tensor([]) neurons are prompt invariant
Layer 9: tensor([]) neurons are prompt invariant
Layer 10: tensor([]) neurons are prompt invariant
Layer 11: tensor([]) neurons are prompt invariant


Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-410m-deduped into HookedTransformer
Prompt Invariance for EleutherAI/pythia-410m-deduped
tensor([0, 6, 9, 1, 5, 2, 7, 8, 3, 4])
tensor([3, 9, 1, 2, 4, 0, 5, 8, 7, 6])
Layer 0: tensor([   0,    1,    2,  ..., 4093, 4094, 4095]) neurons are prompt invariant
Layer 1: tensor([]) neurons are prompt invariant
Layer 2: tensor([]) neurons are prompt invariant
Layer 3: tensor([ 518,  782, 1302, 2311, 2400, 3037, 3353, 3414]) neurons are prompt invariant
Layer 4: tensor([]) neurons are prompt invariant
Layer 5: tensor([]) neurons are prompt invariant
Layer 6: tensor([]) neurons are prompt invariant
Layer 7: tensor([]) neurons are prompt invariant
Layer 8: tensor([]) neurons are prompt invariant
Layer 9: tensor([]) neurons are prompt invariant
Layer 10: tensor([]) neurons are prompt invariant
Layer 11: tensor([]) neurons are prompt invariant
Layer 12: tensor([]) neurons are prompt invariant
Layer 13: tensor([]) neurons are prompt invariant
Layer 14: tenso

In [3]:
# Scratch/debug cell. Neither `random_act2` nor `i` is bound at notebook
# scope, so evaluating this raises the NameError recorded in the output below.
random_act2[:,i]

NameError: name 'random_act2' is not defined

In [5]:
import torch as th
# Sanity check of the `allclose(...) and allclose(...)` pattern on an all-ones
# tensor: every column is identical, so the whole expression evaluates to True.
r = th.ones(2,10)
th.allclose(r[:,0], r[:,1]) and th.allclose(r[:,1], r[:,2])

True

In [9]:
# Print the model names currently held in MODEL_NAME_LIST.
# NOTE(review): the saved output below shows only "solu-1l", which does not
# match the MODEL_NAME_LIST literal in the first cell — presumably this cell
# ran against a different version of the list; re-run from a fresh kernel.
for model_name in MODEL_NAME_LIST:
    print(model_name)

solu-1l


In [7]:
# Inspect the BOS token id of whichever model the notebook-level name `model`
# is bound to when this cell runs (the sweep loop rebinds it for every model).
model.tokenizer.bos_token_id

1

In [4]:
# Draw a (1, 10) tensor of random integers in [1, 50000) on `device` and
# display its shape.
a = th.randint(1, 50000, (1,10), device=device)
a.shape

torch.Size([1, 10])

In [6]:
# Prepend a single 0 entry to `a` along dim 1, giving a (1, 11) tensor.
# NOTE(review): `b` is created on the CPU while `a` was created on `device`;
# on a CUDA device this concatenation would presumably fail with a device
# mismatch — confirm, or create `b` with device=device.
b = th.tensor([[0]])
th.cat((b,a), dim=1)

tensor([[    0,  1541, 42689, 41799, 27115, 28946, 35524,  2859, 34025, 24278,
         10816]])

In [30]:
# Count the entries where |random_act - perm_act| < |random_act * percent_diff|,
# i.e. a purely relative tolerance check with no absolute term.
# NOTE(review): `random_act`, `perm_act` and `percent_diff` are not bound at
# notebook scope by any visible cell; this relies on leftover kernel state.
(th.abs((random_act - perm_act)) < th.abs(random_act*percent_diff)).sum()

tensor(3177)

In [68]:
# NOTE(review): `perm_index` is not assigned by any live code in the visible
# cells; the value displayed below is the cyclic-shift index [-1, 0, ..., 8],
# so this cell relies on leftover kernel state.
perm_index

tensor([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8])