In [1]:
%load_ext autoreload
%autoreload 2

from nnsight import LanguageModel
from dictionary_learning.buffer import ActivationBuffer
from dictionary_learning.training import trainSAE

In [2]:

# Dictionary geometry: the SAE dictionary is 16x wider than the activations it encodes.
activation_dim = 512  # output dimension of the layer-1 MLP hooked below
dictionary_size = 16 * activation_dim

# Any Hugging Face model identifier can be substituted here.
model = LanguageModel('EleutherAI/pythia-70m-deduped', device_map='cuda:0')

# Activations are read from the MLP of transformer layer 1.
submodule = model.gpt_neox.layers[1].mlp

In [3]:
import torch
def display_memory(device=0):
    """Print allocated, reserved and total memory of a CUDA device.

    The byte counts reported by ``torch.cuda`` are scaled by 1e-9 before
    printing, so the numbers shown are in (decimal) gigabytes.

    Args:
        device: CUDA device to inspect, as an index or ``torch.device``.
            Defaults to 0, which matches the previously hard-coded behaviour.

    Returns:
        None. The summary line is written to stdout.
    """
    total_bytes = torch.cuda.get_device_properties(device).total_memory
    reserved_bytes = torch.cuda.memory_reserved(device)
    allocated_bytes = torch.cuda.memory_allocated(device)
    print(f"{allocated_bytes*1e-9} allocated, {reserved_bytes*1e-9} reserved, {total_bytes*1e-9} total")

In [4]:

# data must be an iterator that outputs strings
# data = iter([
#     'This is some example data',
#     'In real life, for training a dictionary',
#     'you would need much more data than this'
# ])

from datasets import load_dataset
import torch

# Training corpus: only the 'train[:100]' slice of OpenWebText is requested here.
# Alternative corpus, kept for reference:
# train_dataset = load_dataset('wikitext', 'wikitext-103-v1', split='train[:1000000]')
train_dataset = load_dataset(
    'Skylion007/openwebtext',
    split='train[:100]',
)
def yield_sentences(data_split):
    """Lazily yield every non-empty line of text found in a dataset split.

    Args:
        data_split: iterable of mapping-like records, each with a 'text'
            entry holding a string.

    Yields:
        Each newline-separated piece of a record's text, in order, skipping
        pieces that are the empty string. Whitespace-only pieces are kept.
    """
    for record in data_split:
        # filter(None, ...) drops only the empty strings produced by
        # consecutive, leading or trailing newlines.
        yield from filter(None, record['text'].split('\n'))

# Generator over the training sentences; nothing is read until it is iterated.
train_sentences = yield_sentences(train_dataset)

# Quick peek at the data (note: this consumes items from the generator):
# for i in range(10):
#     print(next(train_sentences))

# The buffer hands out batches of activation tensors whose feature
# dimension equals the submodule's output dimension.
buffer = ActivationBuffer(
    train_sentences,
    model,
    submodule,
    out_feats=activation_dim,   # output dimension of the model component
    in_batch_size=128,          # batch size for the model
    out_batch_size=128*16,      # batch size for the buffer
    n_ctxs=3e3,
)


In [5]:
# Print the allocated / reserved / total CUDA memory figures before the training cell runs.
display_memory()

0.0 allocated, 0.0 reserved, 84.986691584 total


In [6]:
from tqdm.notebook import tqdm
# Train the sparse autoencoder (SAE) on activations drawn from the buffer.
ae = trainSAE(
    buffer,
    activation_dim,
    dictionary_size,
    device='cuda:0',
    lr=3e-4,
    sparsity_penalty=1e-3,
    tqdm_style=tqdm,  # notebook-flavoured progress bar imported above
)

0it [00:00, ?it/s]

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


refreshing buffer...
buffer refreshed...
acts.shape=torch.Size([256, 512])
step 0 memory: 0.47353139200000005 allocated, 0.557842432 reserved, 84.986691584 total
step 0 MSE loss: 0.2877471446990967, sparsity loss: 479.6265563964844
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
refreshing buffer...
buffer refreshed...
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=torch.Size([256, 512])
acts.shape=to