## Code to evaluate SAEs when used in the model

In [1]:
# use HookedTransformer hooks to replace the specified component output with it's SAE counterpart
%load_ext autoreload
%autoreload 2
from transformer_lens import HookedTransformer, utils
import torch
import nnsight
device = 'cuda'
from tasks.ioi.IOITask import IOITask
from tasks.facts.SportsTask import SportsTask
from tasks.owt.OWTTask import OWTTask
from tasks import PileTask

import pandas as pd

In [2]:
model = HookedTransformer.from_pretrained(
    # 'EleutherAI/pythia-70m-deduped',
    # 'EleutherAI/pythia-1.4b-deduped',
    'EleutherAI/pythia-2.8b-deduped',
    device=device
)

model.set_use_hook_mlp_in(True)
tokenizer = model.tokenizer
batch_size=500

ioi_task = IOITask(batch_size=batch_size, tokenizer=tokenizer, device=device, handle_multitoken_labels=True, num_data=1000)
sports_task = SportsTask(batch_size=batch_size, tokenizer=tokenizer, device=device)
owt_task = OWTTask(batch_size=batch_size, tokenizer=tokenizer, device=device, ctx_length=50)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-2.8b-deduped into HookedTransformer


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
from nnsight import LanguageModel
from dictionary_learning.buffer import ActivationBuffer
from dictionary_learning.training import trainSAE

model = LanguageModel(
    # 'EleutherAI/pythia-70m-deduped', # this can be any Huggingface model
    'EleutherAI/pythia-2.8b-deduped',
    device_map = 'cuda:0'
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [39]:
from dictionary_learning.dictionary import AutoEncoder

layer = 15
hidden_layer = False # if True, use the hidden layer, else use the output layer
# submodule = model.blocks[layer].mlp # layer 1 MLP
submodule = model.gpt_neox.layers[layer].mlp # layer 1 MLP

# apply hook to block
hook_pos = utils.get_act_name("mlp_out", layer)
# activation_dim = model.cfg.d_model # output dimension of the MLP
activation_dim = model.config.hidden_size # output dimension of the MLP
dictionary_size = 16 * activation_dim * 4 if hidden_layer else 16 * activation_dim

step = 490000
# model_type = "1_32768" if hidden_layer else "0_8192"
# sae_dict = torch.load(f"baulab.us/u/smarks/autoencoders/pythia-70m-deduped/mlp_out_layer{layer}/{model_type}/ae.pt")
sae_dict = torch.load(f"trained_saes/2.8b_l{layer}/checkpoints/ae_{step}.pt")

sae = AutoEncoder(activation_dim, dictionary_size).to(device)
sae.load_state_dict(state_dict=sae_dict)

pre_sae_acts = []
post_sae_acts = []

# sae = AutoEncoder(activation_dim, dictionary_size*4).to(device)
def apply_sae_hook(pattern, hook, sae, pre_sae_acts=None, post_sae_acts=None):
    """
    During inference time, run SAE on the output of the specified layer, and feed it back in.
    """
    if pre_sae_acts is not None:
        pre_sae_acts.append(pattern.clone().cpu())
    pattern = sae(pattern)
    if post_sae_acts is not None:
        post_sae_acts.append(pattern.clone().cpu())
    return pattern


## Evaluate with HookedTransformer

In [6]:
import torch
def display_memory():
    total = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    print(f"{a*1e-9} allocated, {r*1e-9} reserved, {total*1e-9} total")
display_memory()

13.05828608 allocated, 13.071548416 reserved, 84.986691584 total


In [7]:
fresh_sae = AutoEncoder(activation_dim, dictionary_size).to(device)

In [8]:
_, test_cache = model.run_with_cache(
    tokenizer(next(ioi_task.train_iter)['text'], return_tensors='pt').input_ids[0],
    )

print(test_cache.keys())
print(test_cache['blocks.1.hook_mlp_out'].shape)

dict_keys(['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.hook_mlp_in', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.hook_mlp_in', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post'

In [9]:
# I'm pretty sure that these tasks also take an inference function, not just a model. That makes it more convenient to use run_with_hooks
def sae_inference_fn(tokens, model=model, hook_name=hook_pos, sae=sae):
    return model.run_with_hooks(
        tokens,
        fwd_hooks = [
            (hook_name, lambda pattern, hook: apply_sae_hook(pattern, hook, sae,)) # pre_sae_acts, post_sae_acts))
        ]
    )

fresh_sae_inference_fn = lambda tokens: sae_inference_fn(tokens, model=model, hook_name=hook_pos, sae=fresh_sae)

results = {
    'Model': ['default model', 'pretrained sae', 'random init 16x sae'],
    # 'Model': ['default model', 'random 64x sae', 'random 16x sae'],
    'IOI Loss': [
        ioi_task.get_test_loss(model).item(), 
        ioi_task.get_test_loss(sae_inference_fn).item(),
        ioi_task.get_test_loss(fresh_sae_inference_fn).item()
    ],
    'IOI Accuracy': [
        ioi_task.get_test_accuracy(model, check_all_logits=False), 
        ioi_task.get_test_accuracy(sae_inference_fn, check_all_logits=False),
        ioi_task.get_test_accuracy(fresh_sae_inference_fn, check_all_logits=False)
    ],
    'Sports Loss': [
        sports_task.get_test_loss(model).item(), 
        sports_task.get_test_loss(sae_inference_fn).item(),
        sports_task.get_test_loss(fresh_sae_inference_fn).item()
    ],
    'Sports Accuracy': [
        sports_task.get_test_accuracy(model, check_all_logits=False), 
        sports_task.get_test_accuracy(sae_inference_fn, check_all_logits=False),
        sports_task.get_test_accuracy(fresh_sae_inference_fn, check_all_logits=False)
    ],
    'OWT Loss': [
        owt_task.get_test_loss(model).item(), 
        owt_task.get_test_loss(sae_inference_fn).item(),
        owt_task.get_test_loss(fresh_sae_inference_fn).item()
    ],
}

results_df = pd.DataFrame(results)
display(results_df)

# print(f"IOI Loss: {ioi_task.get_test_loss(sae_inference_fn)}")
# print(f"IOI Accuracy: {ioi_task.get_test_accuracy(sae_inference_fn, check_all_logits=False)}")
# print(f"Sports Loss: {sports_task.get_test_loss(sae_inference_fn)}")
# print(f"Sports Accuracy: {sports_task.get_test_accuracy(sae_inference_fn, check_all_logits=False)}")
# print(f"OWT Loss: {owt_task.get_test_loss(sae_inference_fn)}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,Model,IOI Loss,IOI Accuracy,Sports Loss,Sports Accuracy,OWT Loss
0,default model,1.595651,1.0,0.119766,0.996815,2.850236
1,pretrained sae,5.151785,0.67,4.219798,0.487261,5.483761
2,random init 16x sae,2.656597,0.965,3.83059,0.404459,3.871112


## Prewritten SAE evaluation code

In [40]:
from datasets import load_dataset
train_dataset = load_dataset('monology/pile-uncopyrighted', split='train', streaming=True)

def yield_sentences(data_split, cycle=False):
    while True:
        for example in data_split:
            text = example['text']
            # sentences = text.split('\n')
            # for sentence in sentences:
            #     if sentence:  # skip empty lines
            #         yield sentence
            yield text
        if not cycle:
            break

# Creating an iterator for training sentences
train_sentences = yield_sentences(train_dataset, cycle=True)

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

In [41]:
BATCH_SIZE=64
from dictionary_learning.buffer import ActivationBuffer
buffer = ActivationBuffer(
    train_sentences,
    model,
    submodule,
    out_feats=activation_dim, # output dimension of the model component
    n_ctxs=1e3,
    in_batch_size=int(BATCH_SIZE*2), # batch size for the model
    out_batch_size=BATCH_SIZE*4, # batch size for the buffer
) # buffer will return batches of tensors of dimension = submodule's output dimension

In [42]:
from dictionary_learning.evaluation import loss_recovered, evaluate
from collections import defaultdict
from tqdm import tqdm

results_dict = defaultdict(list)
n_iters = 10

for i in tqdm(range(n_iters)):
    out = evaluate(model, submodule, sae, buffer, device='cuda')
    for k, v in out.items():
        results_dict[k].append(v)

  0%|          | 0/10 [00:00<?, ?it/s]

refreshing buffer...
buffer size: 0, need 128000.0, buffer_shape: torch.Size([0, 2560])
max vram usages: [67.0]
buffer size: 15586, need 128000.0, buffer_shape: torch.Size([15586, 2560])
max vram usages: [67.0]
buffer size: 31626, need 128000.0, buffer_shape: torch.Size([31626, 2560])
max vram usages: [67.0]
buffer size: 47410, need 128000.0, buffer_shape: torch.Size([47410, 2560])
max vram usages: [67.0]
buffer size: 62996, need 128000.0, buffer_shape: torch.Size([62996, 2560])
max vram usages: [67.0]
buffer size: 78599, need 128000.0, buffer_shape: torch.Size([78599, 2560])
max vram usages: [67.0]
buffer size: 94252, need 128000.0, buffer_shape: torch.Size([94252, 2560])
max vram usages: [67.0]
buffer size: 109790, need 128000.0, buffer_shape: torch.Size([109790, 2560])
max vram usages: [67.0]
buffer size: 125257, need 128000.0, buffer_shape: torch.Size([125257, 2560])
max vram usages: [67.0]
buffer refreshed...


100%|██████████| 10/10 [03:46<00:00, 22.63s/it]


In [43]:
# turn the results into a dataframe
results_df = pd.DataFrame(results_dict)
display(results_df)

Unnamed: 0,mse_loss,sparsity_loss,l0,percent_alive,loss_original,loss_reconstructed,loss_zero,percent_recovered
0,0.074379,13.287188,12.753906,0.037061,2.199511,2.222114,2.241945,0.467325
1,0.07199,13.220024,11.953125,0.036377,2.271739,2.296409,2.321465,0.503888
2,0.072716,13.0562,12.382812,0.0375,2.331593,2.354844,2.374335,0.456003
3,0.075491,13.465338,13.179688,0.037988,2.083156,2.110054,2.135535,0.486465
4,0.073226,13.112363,12.445312,0.036768,2.311514,2.330863,2.350458,0.503153
5,0.07145,13.12996,11.722656,0.035767,2.23646,2.259686,2.279502,0.460383
6,0.077269,13.368313,13.277344,0.037256,2.230863,2.250642,2.272073,0.520044
7,0.072811,13.384796,12.277344,0.037817,2.252879,2.273166,2.292691,0.490427
8,0.075237,13.013618,12.820312,0.037671,2.29004,2.310185,2.329671,0.491701
9,0.073348,13.502811,13.277344,0.038965,2.256372,2.280337,2.301687,0.47116


In [44]:
# get mean of each column
print(f"On Layer {layer}, Step {step}")
mean_results = results_df.mean()
display(mean_results)

On Layer 15, Step 490000


mse_loss               0.073792
sparsity_loss         13.254061
l0                    12.608984
percent_alive          0.037317
loss_original          2.246413
loss_reconstructed     2.268830
loss_zero              2.289936
percent_recovered      0.485055
dtype: float64

In [16]:
from dictionary_learning.evaluation import loss_recovered, evaluate
from nnsight import LanguageModel
from dictionary_learning.buffer import ActivationBuffer
from dictionary_learning.training import trainSAE
from datasets import load_dataset
import torch

# nn_model = LanguageModel(
#     'EleutherAI/pythia-70m-deduped', # this can be any Huggingface model
#     device_map = 'cuda:0'
# )


# Load the dataset
# train_dataset = load_dataset('wikitext', 'wikitext-103-v1', split='train[:1000000]')
# train_dataset = load_dataset('Skylion007/openwebtext', split='train[:100]')
train_dataset = 
def yield_sentences(data_split):
    for example in data_split:
        text = example['text']
        sentences = text.split('\n')
        for sentence in sentences:
            if sentence:  # skip empty lines
                yield sentence

# Creating an iterator for training sentences
train_sentences = yield_sentences(train_dataset)


SyntaxError: invalid syntax (4208215216.py, line 17)

In [None]:
from tasks.owt.OWTTask import OWTTask
owt = OWTTask(batch_size=10, tokenizer=tokenizer, device=device, ctx_length=50)
test_batch = next(owt.test_iter)


from dictionary_learning.dictionary import AutoEncoder

layer = 1
hidden_layer = False # if True, use the hidden layer, else use the output layer
submodule = nn_model.gpt_neox.layers[1].mlp # layer 1 MLP
# apply hook to block
hook_pos = utils.get_act_name("mlp_out", layer)
activation_dim = model.cfg.d_model # output dimension of the MLP
dictionary_size = 16 * activation_dim * 4 if hidden_layer else 16 * activation_dim

model_type = "1_32768" if hidden_layer else "0_8192"
sae_dict = torch.load(f"baulab.us/u/smarks/autoencoders/pythia-70m-deduped/mlp_out_layer{layer}/{model_type}/ae.pt")

sae = AutoEncoder(activation_dim, dictionary_size).to(device)
sae.load_state_dict(state_dict=sae_dict)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

<All keys matched successfully>

In [36]:
test_toks = tokenizer(test_batch['text'], return_tensors='pt', max_length=50, padding='max_length', truncation=True).input_ids
loss_recovered(test_toks, nn_model, submodule, sae, io='in')

<nnsight.intervention.InterventionProxy object at 0x7f5a76198af0>
<nnsight.intervention.InterventionProxy object at 0x7f5a76198af0>
torch.Size([512])


TypeError: unsupported operand type(s) for -: 'tuple' and 'Tensor'

In [22]:
import torch
def display_memory():
    total = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    print(f"{a*1e-9} allocated, {r*1e-9} reserved, {total*1e-9} total")
display_memory()

0.37590272 allocated, 0.38797312 reserved, 84.986691584 total
