In [1]:
import torch
from huggingface_hub import hf_hub_download

# Hugging Face repo holding the learned dictionaries for this model/layer.
model_id = "Elriggs/pythia-70m-deduped-layer-2"
ae_download_location = hf_hub_download(repo_id=model_id, filename="learned_dicts.pt")

# SECURITY: torch.load unpickles arbitrary Python objects, i.e. it can execute
# code shipped inside the downloaded file. Only point model_id at repos you trust.
# map_location="cpu" lets the checkpoint deserialize on machines that lack the
# device it was saved from; the selected autoencoder is moved to the compute
# device later, right before it is used to encode activations.
all_autoencoders = torch.load(ae_download_location, map_location="cpu")

# Each entry is an (autoencoder, hyperparams) pair; list the L1 coefficient of each.
num_l1s = len(all_autoencoders)
all_l1s = [entry_hyperparams["l1_alpha"] for _, entry_hyperparams in all_autoencoders]
print(all_l1s)

# Pick one dictionary to study (index 5 in the l1_alpha list printed above).
autoencoder, hyperparams = all_autoencoders[5]
print(hyperparams)

  from .autonotebook import tqdm as notebook_tqdm


[0.0, 9.999999747378752e-05, 0.00019306977628730237, 0.000372759357560426, 0.0007196856895461679, 0.0013894954463467002, 0.0026826958637684584, 0.005179474595934153, 0.009999999776482582]
{'dict_size': 3072, 'l1_alpha': 0.0013894954463467002}


In [2]:
from transformer_lens import HookedTransformer
# Which layer and which activation site inside that layer to read from.
layer = 2
setting = "residual"
model_name = "EleutherAI/pythia-70m-deduped"

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = HookedTransformer.from_pretrained(model_name, device=device)

# setting -> (hook name to read from the cache, model.cfg attribute holding its width)
_site_by_setting = {
    "residual": (f"blocks.{layer}.hook_resid_post", "d_model"),
    "mlp": (f"blocks.{layer}.mlp.hook_post", "d_mlp"),
    "attention": (f"blocks.{layer}.hook_attn_out", "d_model"),
    "mlp_out": (f"blocks.{layer}.hook_mlp_out", "d_model"),
}
if setting not in _site_by_setting:
    raise NotImplementedError
cache_name, _width_attr = _site_by_setting[setting]
neurons = getattr(model.cfg, _width_attr)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m-deduped into HookedTransformer


In [3]:
# Download the dataset, tokenize it, and cut every example to a fixed length
from datasets import Dataset, load_dataset
dataset_name = "NeelNanda/pile-10k"
token_amount= 40


def _tokenize_rows(rows):
    """Tokenize a batch of rows using the model's tokenizer."""
    return model.tokenizer(rows['text'])


def _is_longer_than_window(row):
    """Keep only rows with strictly more than `token_amount` tokens."""
    return len(row['input_ids']) > token_amount


def _clip_to_window(row):
    """Truncate a row's token ids to the first `token_amount` entries."""
    return {'input_ids': row['input_ids'][:token_amount]}


#TODO: change train[:1000] to train if you want whole dataset
raw_dataset = load_dataset(dataset_name, split="train[:1000]")
tokenized_dataset = raw_dataset.map(_tokenize_rows, batched=True)
long_enough_dataset = tokenized_dataset.filter(_is_longer_than_window)
dataset = long_enough_dataset.map(_clip_to_window)

# Get Dictionary Activations

In [4]:
# Now we can use the model to get the activations
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from einops import rearrange
# The encoder's shape is unpacked as (number of dictionary features, model width).
num_features, d_model = autoencoder.encoder.shape
datapoints = dataset.num_rows
batch_size = 32

# One row per token position: `token_amount` tokens for each of `datapoints` examples.
total_rows = datapoints * token_amount
neuron_activations = torch.zeros((total_rows, d_model))
dictionary_activations = torch.zeros((total_rows, num_features))

smaller_auto_encoder = autoencoder
# NOTE(review): the return value is ignored, so this presumably moves the
# autoencoder in place -- confirm against its class definition.
smaller_auto_encoder.to_device(device)

with torch.no_grad(), dataset.formatted_as("pt"):
    dl = DataLoader(dataset["input_ids"], batch_size=batch_size)
    row_start = 0  # first output row that the current batch writes to
    for batch in tqdm(dl):
        # Only `cache_name` is read below, so restrict caching to that hook
        # instead of storing every intermediate activation of the model.
        _, cache = model.run_with_cache(batch.to(device), names_filter=cache_name)
        # Flatten (batch, sequence) into one row axis.
        batched_neuron_activations = rearrange(cache[cache_name], "b s n -> (b s) n")
        # Advance by the rows this batch actually produced, so a smaller final
        # batch lands in the right place without relying on slice clamping.
        row_end = row_start + batched_neuron_activations.shape[0]
        neuron_activations[row_start:row_end, :] = batched_neuron_activations.cpu()
        batched_dictionary_activations = smaller_auto_encoder.encode(batched_neuron_activations)
        dictionary_activations[row_start:row_end, :] = batched_dictionary_activations.cpu()
        row_start = row_end

  0%|          | 0/31 [00:00<?, ?it/s]

100%|██████████| 31/31 [00:00<00:00, 42.84it/s]


# Feature Interp
Investigate the example sentences that activate this feature.

Max: show max activating (tokens,contexts)

Uniform: Show range of activations from each bin (e.g. sample an example from 1-2, 2-3, etc). 
[Note: if a feature is monosemantic, then the full range of activations should be that feature, not just max-activating ones]

Full_text: shows the full text example

Text_list: shows up to the most activating example (try w/ max activating on a couple of examples to see)

ablate_text: remove the context one token at a time, and show the decrease/increase in activation of that feature

ablate_feature_direction: removes feature direction from model's activation mid-inference, showing the logit diff in the output for every token.

logit_lens: shows the logit lens for that feature. If it matches the ablate_feature_direction results, then the computation path is directly through the residual stream; otherwise, it goes through later layers.

In [8]:
from interp_utils import *
# Index of the dictionary feature to inspect.
best_feature = 1

# Gather example contexts for this feature using the "uniform" setting.
feature_examples = get_feature_datapoints(
    best_feature,
    dictionary_activations,
    model.tokenizer,
    token_amount,
    dataset,
    setting="uniform",
)
text_list, full_text, token_list, full_token_list = feature_examples

# Alternatives kept for reference.
# NOTE(review): the commented get_feature_datapoints call below does not pass
# model.tokenizer / token_amount like the live call above -- confirm the helper's
# signature before re-enabling it.
# text_list, full_text, token_list, full_token_list = get_feature_datapoints(best_feature, dictionary_activations, dataset, setting="max")
# visualize_text(full_text, best_feature, model, autoencoder, layer)
visualize_text(text_list, best_feature, model, autoencoder, layer)

In [14]:
# Per the "Feature Interp" notes in this notebook: removes the context one token
# at a time and shows the resulting decrease/increase in this feature's activation.
# The helper comes from interp_utils, which is not shown here.
ablate_text(text_list, best_feature, model, autoencoder, layer)

In [15]:
# Per the "Feature Interp" notes in this notebook: removes the feature direction
# from the model's activations mid-inference and shows the output logit diff for
# every token. The helper comes from interp_utils, which is not shown here.
ablate_feature_direction_display(full_text, autoencoder, model, layer, features=best_feature)

In [16]:
# Fetch the learned dictionary from the autoencoder, then run the logit lens
# for the selected feature against it.
learned_dict = autoencoder.get_learned_dict()
logit_lens(model, best_feature, learned_dict)

['naments', 'chid', 'ifice', 'bital', ' equivalently', ' otherwise', ' else', 'Else', 'deal', 'iginally', 'leans', ' alternatively', 'anges', 'acles', ' something', 'chard', 'lando', 'phe', 'phan', 'acular']
tensor([3.6303, 2.8674, 2.5978, 2.5794, 2.5119, 2.4721, 2.4593, 2.2616, 2.1686,
        2.1448, 2.1189, 2.1073, 2.0916, 2.0746, 2.0506, 2.0402, 1.9394, 1.9391,
        1.8440, 1.8397])
