In [11]:
# Import Transformer Lens, and load pythia models
from transformer_lens import HookedTransformer
import torch as th
from torch.utils.data import DataLoader
from tqdm import tqdm
from einops import rearrange
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = th.device("cuda" if th.cuda.is_available() else "cpu")
# NOTE(review): the loop at the end of this cell rebinds `model` for every entry
# in MODEL_NAME_LIST before this instance is used anywhere in the cell, so this
# eager load looks redundant -- confirm no other cell relies on it before removing.
model = HookedTransformer.from_pretrained("EleutherAI/pythia-160m-deduped", device=device)
# Names of the pretrained models the loop at the end of this cell iterates over.
# Commented-out entries are skipped.
MODEL_NAME_LIST = [
    # "EleutherAI/pythia-70m-deduped", 
    # "EleutherAI/pythia-160m-deduped", 
    # "EleutherAI/pythia-410m-deduped", 
    # "gpt2", 
    # "gpt2-medium",
    "solu-1l",
    "solu-2l",
    "solu-3l",
    "solu-4l",
]


def prompt_invariance(model, token_amount=10):
    """Count MLP neurons whose activations survive a one-step rotation of the prompt.

    Builds a prompt of one BOS token followed by ``token_amount`` random token
    ids, plus a second prompt in which the random tokens are rotated by one
    position (the last token moves to the slot right after BOS). Both prompts
    are run through the model as a single batch of two. For every layer, the
    ``blocks.{l}.mlp.hook_post`` activations of the original prompt are
    reordered with the same rotation so that each row lines up with the same
    token in the rotated prompt. A neuron is counted as "prompt invariant" when
    its activations are close (default ``th.isclose`` tolerances) at every
    position.

    Args:
        model: Object exposing ``cfg.d_mlp``, ``cfg.d_vocab``,
            ``tokenizer.bos_token_id``, ``blocks`` and
            ``run_with_cache(tokens) -> (output, cache)``.
        token_amount: Number of random tokens placed after BOS. Must be >= 1.

    Returns:
        None. The token tensors and one summary line per layer are printed.

    Raises:
        ValueError: If ``token_amount`` is smaller than 1.
    """
    if token_amount < 1:
        raise ValueError(f"token_amount must be >= 1, got {token_amount}")

    neurons = model.cfg.d_mlp
    vocab_size = model.cfg.d_vocab

    # Prefer the device recorded on the model's config so the prompt lands where
    # the model lives; fall back to the notebook-level `device` otherwise.
    run_device = getattr(model.cfg, "device", None)
    if run_device is None:
        run_device = device

    # BOS and the random tokens must be created on the same device, otherwise
    # th.cat raises as soon as the notebook runs on a GPU.
    bos = th.tensor([[model.tokenizer.bos_token_id]], device=run_device)
    random_tokens = th.randint(1, vocab_size, (1, token_amount), device=run_device)
    random_input = th.cat((bos, random_tokens), dim=1)

    # Keep BOS at position 0 and rotate the remaining positions by one step.
    # The index stays on the CPU: a CPU index tensor can index a tensor on any device.
    perm_index = th.cat((th.tensor([0, -1]), th.arange(1, token_amount)))

    print(random_input)
    print(perm_index)
    print(random_input[:, perm_index])

    # Row 0 is the original prompt, row 1 the rotated prompt.
    final_input = th.cat((random_input, random_input[:, perm_index]), dim=0)
    _, cache = model.run_with_cache(final_input)

    for l in range(len(model.blocks)):
        post_act = cache[f"blocks.{l}.mlp.hook_post"]
        # Reorder the original prompt's activations so that row j belongs to the
        # same token as row j of the rotated prompt.
        random_act = post_act[0, perm_index]
        perm_act = post_act[1]
        # A neuron (column) is invariant only if it matches at every position.
        prompt_invariant_neurons = th.isclose(random_act, perm_act).all(dim=0).sum().item()
        print(f"Layer {l}: {prompt_invariant_neurons}/{neurons} neurons are prompt invariant")

# Seed once so the random prompts, and therefore the printed counts, are
# reproducible on Restart & Run All.
th.manual_seed(0)
for model_name in MODEL_NAME_LIST:
    # Rebinds the notebook-level `model` on every iteration.
    model = HookedTransformer.from_pretrained(model_name, device=device)
    print(f"Prompt Invariance for {model_name}")
    prompt_invariance(model)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m-deduped into HookedTransformer
Loaded pretrained model solu-1l into HookedTransformer
Prompt Invariance for solu-1l
tensor([[    1, 22407, 38891, 13882,  4962, 48015, 46707,  9957, 45002,  1486,
         46827]])
tensor([ 0, -1,  1,  2,  3,  4,  5,  6,  7,  8,  9])
tensor([[    1, 46827, 22407, 38891, 13882,  4962, 48015, 46707,  9957, 45002,
          1486]])
Layer 0: 0/2048 neurons are prompt invariant
Loaded pretrained model solu-2l into HookedTransformer
Prompt Invariance for solu-2l
tensor([[    1,  2440, 47504,  5713, 35104, 37158,  6950,  8117, 20159, 47451,
         27834]])
tensor([ 0, -1,  1,  2,  3,  4,  5,  6,  7,  8,  9])
tensor([[    1, 27834,  2440, 47504,  5713, 35104, 37158,  6950,  8117, 20159,
         47451]])
Layer 0: 0/2048 neurons are prompt invariant
Layer 1: 0/2048 neurons are prompt invariant
Loaded pretrained model solu-3l into HookedTransformer
Prompt Invariance for solu-3l
tensor([[    1,  5347,  8617, 39336,  

Downloading (…)lve/main/config.json: 100%|██████████| 1.27k/1.27k [00:00<00:00, 317kB/s]
Downloading (…)"model_final.pth";: 100%|██████████| 255M/255M [00:59<00:00, 4.27MB/s] 


Loaded pretrained model solu-4l into HookedTransformer
Prompt Invariance for solu-4l
tensor([[    1, 27716, 18104, 18676,  5811, 44194, 30554, 13633, 37734, 27775,
         16406]])
tensor([ 0, -1,  1,  2,  3,  4,  5,  6,  7,  8,  9])
tensor([[    1, 16406, 27716, 18104, 18676,  5811, 44194, 30554, 13633, 37734,
         27775]])
Layer 0: 0/2048 neurons are prompt invariant
Layer 1: 0/2048 neurons are prompt invariant
Layer 2: 0/2048 neurons are prompt invariant
Layer 3: 0/2048 neurons are prompt invariant


In [9]:
# Sanity check: print every model name currently held in MODEL_NAME_LIST.
for model_name in MODEL_NAME_LIST:
    print(model_name)

solu-1l


In [7]:
# Show the BOS token id of the tokenizer attached to whichever model is currently bound to `model`.
model.tokenizer.bos_token_id

1

In [4]:
# Scratch check: draw a (1, 10) tensor of random integers in [1, 50000) on `device` and show its shape.
a = th.randint(1, 50000, (1,10), device=device)
a.shape

torch.Size([1, 10])

In [6]:
# Scratch check: prepend a single 0 column to `a`, which must already exist in the kernel.
# NOTE(review): `b` is created on the default device while `a` was created on `device`.
# th.cat needs both on the same device, so this presumably only ran because `device` was the CPU -- confirm.
b = th.tensor([[0]])
th.cat((b,a), dim=1)

tensor([[    0,  1541, 42689, 41799, 27115, 28946, 35524,  2859, 34025, 24278,
         10816]])

In [30]:
# Count the entries where |random_act - perm_act| is below |random_act * percent_diff|.
# NOTE(review): random_act, perm_act and percent_diff are not defined at notebook scope in any
# visible cell, so this relies on leftover kernel state and will fail after Restart & Run All.
(th.abs((random_act - perm_act)) < th.abs(random_act*percent_diff)).sum()

tensor(3177)

In [68]:
# Display the permutation index left in the kernel.
# NOTE(review): perm_index is not defined at notebook scope in any visible cell, so this
# relies on leftover kernel state and will fail after Restart & Run All.
perm_index

tensor([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8])