In [6]:
# Import Transformer Lens, and load pythia models
from transformer_lens import HookedTransformer
import torch as th
from datasets import load_dataset
# from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm
from einops import rearrange

# Device selection: keep the original preference for the second GPU, but fall
# back to the first GPU on single-GPU machines and to the CPU otherwise.
if th.cuda.device_count() > 1:
    device = "cuda:1"
elif th.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# model_name = "EleutherAI/pythia-160m-deduped"
MODEL_NAME_LIST = [
    "EleutherAI/pythia-70m-deduped",
    "EleutherAI/pythia-160m-deduped",
    "EleutherAI/pythia-410m-deduped",
    # "gpt2",
    # "gpt2-medium",
    # "solu-1l",
    # "solu-2l",
    # "solu-3l",
    # "solu-4l",
]
model_name = MODEL_NAME_LIST[0]
model_save_name = model_name.replace("/", "-")

# --- Tunables -------------------------------------------------------------
# Every document is truncated to this many tokens; shorter ones are dropped.
seq_length = 20
batch_size = 64
# Slice of pile-10k to sample activations from.
# TODO: increase the number of documents (an earlier note targeted 1000).
DATASET_SPLIT = "train[:200]"
# Per-neuron quantile recorded for every (layer, neuron) pair.
# NOTE(review): the `*_ninety_percent` names and the old comments said "90th
# percentile", but the code has always used 0.99. The value is kept so results
# do not change - confirm which one is intended.
ACTIVATION_QUANTILE = 0.99

# One (layers, neurons) tensor per entry of MODEL_NAME_LIST, in the same order.
all_models_ninety_percent = []
for MODEL_NAME in MODEL_NAME_LIST:
    # Drop the previous iteration's (or a previous run's) large objects before
    # loading the next model. On the very first run none of them exist yet,
    # which is the only error deliberately ignored here.
    try:
        del model, d, neuron_activations, layer_activations
    except NameError:
        pass
    th.cuda.empty_cache()

    model = HookedTransformer.from_pretrained(MODEL_NAME, device=device)

    # Tokenize the pile-10k sample, keep documents longer than `seq_length`
    # tokens, and truncate them to exactly `seq_length` tokens.
    d = load_dataset("NeelNanda/pile-10k", split=DATASET_SPLIT).map(
        lambda x: model.tokenizer(x['text']),
        batched=True,
    ).filter(
        lambda x: len(x['input_ids']) > seq_length
    ).map(
        lambda x: {'input_ids': x['input_ids'][:seq_length]}
    )
    neurons = model.cfg.d_mlp
    dataset_size = d.num_rows
    layers = len(model.blocks)
    total_tokens = dataset_size * seq_length

    # Only the MLP post-activation hooks are needed, so only those are cached.
    hook_names = [f"blocks.{block_idx}.mlp.hook_post" for block_idx in range(layers)]
    wanted_hooks = set(hook_names)

    # One forward pass per batch fills the activations of *all* layers at once
    # (previously the whole dataset was re-run once per layer). The cost is a
    # CPU buffer of layers * total_tokens * neurons floats.
    layer_activations = th.zeros((layers, total_tokens, neurons))
    with th.no_grad(), d.formatted_as("pt"):
        dl = DataLoader(d["input_ids"], batch_size=batch_size)
        write_pos = 0
        for batch in tqdm(dl):
            _, cache = model.run_with_cache(
                batch.to(device),
                names_filter=lambda name: name in wanted_hooks,
            )
            # Use the real batch size so a short final batch lands correctly.
            rows_in_batch = batch.shape[0] * batch.shape[1]
            for layer, hook_name in enumerate(hook_names):
                layer_activations[layer, write_pos:write_pos + rows_in_batch] = rearrange(
                    cache[hook_name], "b s n -> (b s) n"
                ).cpu()
            write_pos += rows_in_batch
    assert write_pos == total_tokens, "activation buffer was not completely filled"

    # Per-neuron ACTIVATION_QUANTILE over all sampled token positions.
    per_model_ninety_percent = th.zeros((layers, neurons))
    for layer in range(layers):
        # 2-D (tokens, neurons) view of the current layer; the name is kept so
        # it still refers to the last processed layer once the cell finishes.
        neuron_activations = layer_activations[layer]
        # One quantile call per neuron keeps every call on a small 1-D tensor.
        for neuron in range(neurons):
            per_model_ninety_percent[layer, neuron] = neuron_activations[:, neuron].quantile(
                ACTIVATION_QUANTILE, interpolation="nearest"
            )
    all_models_ninety_percent.append(per_model_ninety_percent.detach().clone())

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m-deduped into HookedTransformer


Found cached dataset parquet (/home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 4/4 [00:00<00:00, 13.39it/s]                    
100%|██████████| 4/4 [00:00<00:00, 14.77it/s]
100%|██████████| 4/4 [00:00<00:00, 14.83it/s]
100%|██████████| 4/4 [00:00<00:00, 14.71it/s]
100%|██████████| 4/4 [00:00<00:00, 13.88it/s]
100%|██████████| 4/4 [00:00<00:00, 14.38it/s]
Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m-deduped into HookedTransformer


Found cached dataset parquet (/home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f7d9009767529673.arrow
Loading cached processed dataset at /home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-844b063af91d0676.arrow
Loading cached processed dataset at /home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-af31810cc2a6043a.arrow
100%|██████████| 4/4 [00:00<00:00,  5.24it/s]
100%|██████████| 4/4 [00:00<00:00,  5.32it/s]
10

Loaded pretrained model EleutherAI/pythia-410m-deduped into HookedTransformer


Found cached dataset parquet (/home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-28ce9cda4a5b88d0.arrow
Loading cached processed dataset at /home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-6caa5dc13b20a179.arrow
Loading cached processed dataset at /home/mchorse/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-c40121c0f595b3d3.arrow
100%|██████████| 4/4 [00:02<00:00,  1.81it/s]
100%|██████████| 4/4 [00:01<00:00,  2.09it/s]
10

In [4]:
# For each model, print the deciles (0.1 ... 0.9) of the stored per-neuron
# statistic, taken along dim=1 so that every layer gets its own value.
for model_idx, name in enumerate(MODEL_NAME_LIST):
    print(name)
    layer_by_neuron = all_models_ninety_percent[model_idx]
    for quantile in (step / 10 for step in range(1, 10)):
        per_layer_quantile = layer_by_neuron.quantile(
            quantile,
            interpolation="nearest",
            dim=1,
        )
        print(f"Quantile {quantile} is {per_layer_quantile}")

EleutherAI/pythia-70m-deduped
Quantile 0.1 is tensor([ 0.0682, -0.0022,  0.0615, -0.0007,  0.0547, -0.0073])
Quantile 0.2 is tensor([ 0.0870, -0.0002,  0.1361,  0.0870,  0.1406,  0.0809])
Quantile 0.3 is tensor([0.0985, 0.0352, 0.1781, 0.1419, 0.2193, 0.1518])
Quantile 0.4 is tensor([0.1079, 0.0846, 0.2098, 0.1857, 0.3023, 0.2366])
Quantile 0.5 is tensor([0.1157, 0.1304, 0.2373, 0.2237, 0.3907, 0.3336])
Quantile 0.6 is tensor([0.1240, 0.1716, 0.2702, 0.2709, 0.5072, 0.4176])
Quantile 0.7 is tensor([0.1310, 0.2111, 0.3034, 0.3209, 0.6189, 0.5500])
Quantile 0.8 is tensor([0.1403, 0.2540, 0.3403, 0.3970, 0.7529, 0.7022])
Quantile 0.9 is tensor([0.1553, 0.3090, 0.3980, 0.5969, 0.9332, 0.9327])
EleutherAI/pythia-160m-deduped
Quantile 0.1 is tensor([ 0.0754, -0.0051, -0.0036,  0.0895,  0.0585,  0.0878,  0.0779,  0.0690,
         0.0038,  0.0078, -0.0003,  0.0044])
Quantile 0.2 is tensor([ 0.0866, -0.0037, -0.0007,  0.1457,  0.1160,  0.1372,  0.1261,  0.1180,
         0.0687,  0.0615,  0.0573

In [8]:
# Same decile sweep as the per-layer printout, but without `dim`: the quantile
# is taken over the whole tensor, giving one scalar per model and level.
decile_steps = range(1, 10)
for model_idx, name in enumerate(MODEL_NAME_LIST):
    print(name)
    model_stats = all_models_ninety_percent[model_idx]
    for step in decile_steps:
        quantile = step / 10
        overall_quantile = model_stats.quantile(quantile, interpolation="nearest")
        print(f"Quantile {quantile} is {overall_quantile}")

EleutherAI/pythia-70m-deduped
Quantile 0.1 is 0.5055357813835144
Quantile 0.2 is 0.578640341758728
Quantile 0.3 is 0.6459773778915405
Quantile 0.4 is 0.7185556888580322
Quantile 0.5 is 0.7931545972824097
Quantile 0.6 is 0.8762369751930237
Quantile 0.7 is 0.9750785231590271
Quantile 0.8 is 1.1126662492752075
Quantile 0.9 is 1.3792394399642944
EleutherAI/pythia-160m-deduped
Quantile 0.1 is 0.5298735499382019
Quantile 0.2 is 0.5926447510719299
Quantile 0.3 is 0.642602801322937
Quantile 0.4 is 0.6864762902259827
Quantile 0.5 is 0.7292709350585938
Quantile 0.6 is 0.7765105366706848
Quantile 0.7 is 0.8312962651252747
Quantile 0.8 is 0.9040560126304626
Quantile 0.9 is 1.0459781885147095
EleutherAI/pythia-410m-deduped
Quantile 0.1 is 0.5112807750701904
Quantile 0.2 is 0.5757800936698914
Quantile 0.3 is 0.6179792881011963
Quantile 0.4 is 0.6543733477592468
Quantile 0.5 is 0.6892403960227966
Quantile 0.6 is 0.725898802280426
Quantile 0.7 is 0.7690402269363403
Quantile 0.8 is 0.8268356323242188
Q

In [27]:
# 0.9 quantile along dim=1 of `per_model_ninety_percent`.
# NOTE(review): this reads the variable left behind by the collection loop, so
# the result depends on which model was processed last - confirm that is the
# model intended here.
th.quantile(per_model_ninety_percent, 0.9, dim=1, interpolation="nearest")

tensor([0.1563, 0.3049, 0.4013, 0.6080, 0.9337, 0.9296])

In [14]:
import torch as th
# Sanity check of "nearest" interpolation along dim=1 on a 2x5 grid whose rows
# are [0..4] - 0.1 and [5..9] - 0.1.
demo_grid = th.arange(10).reshape(2, 5) - 0.1
th.quantile(demo_grid, 0.9, dim=1, interpolation="nearest")

tensor([3.9000, 8.9000])

In [3]:
# Sanity check of "nearest" interpolation on the 1-D sequence [0..99] - 0.1.
shifted_range = th.arange(100) - 0.1
th.quantile(shifted_range, 0.9, interpolation="nearest")

tensor(88.9000)