In [1]:
import re
import json
import pickle
import os
import sys
import requests
import logging
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer
from tqdm.auto import tqdm
import plotly.io as pio
import numpy as np
import random
import torch.nn as nn
import torch.nn.functional as F
import wandb
import plotly.express as px
import pandas as pd
import torch.nn.init as init
from pathlib import Path
from jaxtyping import Int, Float
from torch import Tensor
import einops
from collections import Counter
from datasets import load_dataset
import pandas as pd
from ipywidgets import interact, IntSlider
from process_tiny_stories_data import load_tinystories_validation_prompts, load_tinystories_tokens

pio.renderers.default = "notebook_connected"
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

logging.basicConfig(format='(%(levelname)s) %(asctime)s: %(message)s', level=logging.INFO, datefmt='%I:%M:%S')
sys.path.append('../')  # Add the parent directory to the system path

import utils.haystack_utils as haystack_utils
from sparse_coding.train_autoencoder import AutoEncoder
from utils.autoencoder_utils import custom_forward, AutoEncoderConfig, evaluate_autoencoder_reconstruction, get_encoder_feature_frequencies
import utils.haystack_utils as haystack_utils

%reload_ext autoreload
%autoreload 2

In [2]:
model_name = "tiny-stories-2L-33M"
model = HookedTransformer.from_pretrained(
    model_name,
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device,
)

def load_encoder(save_name, model_name):
    with open(f"{model_name}/{save_name}.json", "r") as f:
        cfg = json.load(f)

    cfg = AutoEncoderConfig(
        cfg["layer"], cfg["act"], cfg["expansion_factor"], cfg["l1_coeff"]
    )

    if cfg.act_name == "hook_mlp_out":
        d_in = model.cfg.d_model
    else:
        d_in = model.cfg.d_mlp
    d_hidden = d_in * cfg.expansion_factor

    encoder = AutoEncoder(d_hidden, cfg.l1_coeff, d_in)
    encoder.load_state_dict(torch.load(os.path.join(model_name, save_name + ".pt")))
    encoder.to(device)
    return encoder, cfg


@torch.no_grad()
def get_acts(prompt: str, model: HookedTransformer, encoder: AutoEncoder, cfg: AutoEncoderConfig):
    _, cache = model.run_with_cache(prompt, names_filter=cfg.encoder_hook_point)
    acts = cache[cfg.encoder_hook_point].squeeze(0)
    _, _, mid_acts, _, _ = encoder(acts)
    return mid_acts


def get_max_activations(prompts: list[str], model: HookedTransformer, encoder: AutoEncoder, cfg: AutoEncoderConfig):
    activations = []
    indices = []
    for prompt in tqdm(prompts):
        acts = get_acts(prompt, model, encoder, cfg)[0, :-1]
        value, index = acts.max(0)
        activations.append(value)
        indices.append(index)

    max_activation_per_prompt = torch.stack(activations)  # n_prompt x d_enc
    max_activation_token_index = torch.stack(indices)

    total_activations = max_activation_per_prompt.sum(0)
    print(f"Active directions on validation data: {total_activations.nonzero().shape[0]} out of {total_activations.shape[0]}")
    return max_activation_per_prompt, max_activation_token_index


def get_token_kurtosis_for_decoder(model: HookedTransformer, layer: int, decoder: torch.Tensor):
    '''Return excess kurtosis over all decoder features' cosine sims with the unembed (higher is better)'''
    W_out = model.W_out[layer]
    resid_dirs = torch.nn.functional.normalize(decoder @ W_out, dim=-1)
    unembed = torch.nn.functional.normalize(model.unembed.W_U, dim=0)
    cosine_sims = einops.einsum(resid_dirs, unembed, 'd_hidden d_model, d_model d_vocab -> d_hidden d_vocab')
    
    mean = einops.repeat(cosine_sims.mean(dim=-1), f'd_hidden -> d_hidden {cosine_sims.shape[1]}')
    std = einops.repeat(cosine_sims.std(dim=-1), f'd_hidden -> d_hidden {cosine_sims.shape[1]}')
    kurt = torch.mean((cosine_sims - mean / std) ** 4, dim=-1) - 3
    return kurt

Using pad_token, but it is not set yet.


Loaded pretrained model tiny-stories-2L-33M into HookedTransformer


In [3]:
# 1. List of clean features
# 2. Sort by indirect ablation increase
# 3. Sort for things in layer 1

In [3]:
l0_encoder, l0_config = load_encoder('18_morning_sun', model_name)
l1_encoder, l1_config = load_encoder('2_upbeat_snowball', model_name)

# l1_kurtosis = get_token_kurtosis_for_decoder(model, 1, l1_encoder.W_dec)
# px.histogram(pd.DataFrame({"kurtosis": l1_kurtosis.cpu()}))   

In [25]:
prompts = load_tinystories_validation_prompts()[:10000]
max_activations, max_activation_token_indies = get_max_activations(prompts, model, l0_encoder, l0_config)

(INFO) 10:21:59: Loaded 21990 TinyStories validation prompts


  0%|          | 0/10000 [00:00<?, ?it/s]

Active directions on validation data: 16384 out of 16384


In [26]:
max_activation_token_indices = max_activation_token_indies
max_activations_l0 = max_activations
max_activation_token_indices_l0 = max_activation_token_indices

In [27]:
cosine_sims = torch.nn.functional.normalize(l0_encoder.W_dec, dim=-1) @ torch.nn.functional.normalize(l1_encoder.W_enc, dim=0)
cosine_sims = torch.tril(cosine_sims)
print(cosine_sims.shape)

all_sims = cosine_sims.flatten().cpu()

torch.Size([16384, 16384])


In [28]:
# px.histogram(all_sims[torch.randperm(len(all_sims))][:10_000])

In [29]:
values, indices = torch.topk(all_sims, 100)

In [30]:
def i_to_row_col(i: int, n_cols: int = len(cosine_sims)):
    row = i // n_cols
    col = i % n_cols
    return row, col

l0_dir, l1_dir = i_to_row_col(indices[0], len(cosine_sims))
print(l0_dir, l1_dir)

tensor(15671) tensor(1008)


In [31]:
print(max_activations_l0.shape)

torch.Size([10000, 16384])


In [33]:
# one prompt
# save direction activations
# get index of max direction activation per prompt from pre-existing data
# return loss per token, original and ablated
# index into loss with positions where directions active, calculate loss increase


def custom_forward(
    enc: AutoEncoder, x: Float[Tensor, "batch d_in"], neuron: int, activation: float
):
    x_cent = x - enc.b_dec
    acts = F.relu(x_cent @ enc.W_enc + enc.b_enc)
    acts[:, neuron] = activation
    x_reconstruct = acts @ enc.W_dec + enc.b_dec
    return x_reconstruct, acts


@torch.no_grad()
def evaluate_autoencoder_reconstruction_per_token(prompt: str, pos: int, autoencoder: AutoEncoder, cfg: AutoEncoderConfig, model: HookedTransformer, direction: int):
    
    def encode_activations_hook(value, hook):
        _, x_reconstruct, _, _, _ = autoencoder(value[:, pos])
        value[:, pos] = x_reconstruct
        return value

    reconstruct_hooks = [(f"blocks.{cfg.layer}.{cfg.act_name}", encode_activations_hook)]

    def zero_ablate_hook(value, hook):
        x_reconstruct, _ = custom_forward(autoencoder, value[:, pos], direction, 0)
        value[:, pos] = x_reconstruct
        return value
    
    zero_ablate_hooks = [(f"blocks.{cfg.layer}.{cfg.act_name}", zero_ablate_hook)]
    
    original_loss = model(prompt, return_type="loss", loss_per_token=True)[0, pos].item()
    with model.hooks(reconstruct_hooks):
        reconstruct_loss = model(prompt, return_type="loss", loss_per_token=True)[0, pos].item()
    with model.hooks(zero_ablate_hooks):
        zero_ablate_loss = model(prompt, return_type="loss", loss_per_token=True)[0, pos].item()
    
    return original_loss, reconstruct_loss, zero_ablate_loss


dirs = [i_to_row_col(i) for i in indices]
data = []
for l0_dir, l1_dir in tqdm(dirs[:15]):
    # max_activations_l0[:, l0_dir]
    values, prompt_indices = torch.topk(max_activations_l0[:, l0_dir], k=50)
    
    original_losses = []
    reconstruct_losses = []
    zero_ablate_losses = []
    for i in prompt_indices:
        prompt = prompts[i]
        pos_index = max_activation_token_indices_l0[i, l0_dir].item()
        original_loss, reconstruct_loss, zero_ablate_loss = evaluate_autoencoder_reconstruction_per_token(prompt, pos_index, l0_encoder, l0_config, model, l0_dir)
        original_losses.append(original_loss)
        reconstruct_losses.append(reconstruct_loss)
        zero_ablate_losses.append(zero_ablate_loss)
    data.append([l0_dir, l1_dir, np.mean(original_losses), np.mean(reconstruct_losses), np.mean(zero_ablate_losses)])
df = pd.DataFrame(data, columns=["l0_dir", "l1_dir", "original_loss", "reconstruct_loss", "zero_ablate_loss"])
    

  0%|          | 0/15 [00:00<?, ?it/s]

In [35]:
df["loss_increase"] = df["zero_ablate_loss"] - df["reconstruct_loss"]
df.sort_values("loss_increase", ascending=False, inplace=True)
df

Unnamed: 0,l0_dir,l1_dir,original_loss,reconstruct_loss,zero_ablate_loss,loss_increase
1,tensor(3078),tensor(2717),1.139115,1.449524,1.500091,0.050567
4,tensor(3084),tensor(555),1.577442,1.571623,1.600115,0.028491
3,tensor(6306),tensor(4666),1.218571,1.254769,1.273814,0.019044
2,tensor(12738),tensor(1970),1.065921,1.106483,1.125427,0.018944
12,tensor(9183),tensor(7899),1.17094,1.397148,1.412604,0.015456
9,tensor(5471),tensor(2663),0.781454,0.889481,0.900346,0.010865
14,tensor(15319),tensor(525),1.029166,1.284547,1.291323,0.006776
8,tensor(12974),tensor(7589),3.265839,4.267891,4.269192,0.001301
10,tensor(15192),tensor(12425),1.819522,1.998446,1.999414,0.000969
6,tensor(15339),tensor(9325),0.621281,0.651502,0.649526,-0.001977


In [36]:
# Percentage increase
((df["zero_ablate_loss"] - df["reconstruct_loss"]) / df["reconstruct_loss"]).mean()


0.004270189322327611

In [37]:
((df["reconstruct_loss"] - df["original_loss"]) / df["original_loss"]).mean()

0.10807562473493677

In [18]:
l0_acts = []
l1_acts = []
for prompt in tqdm(prompts[:500]):
    l0_act = get_acts(prompt, model, l0_encoder, l0_config).cpu()
    l1_act = get_acts(prompt, model, l1_encoder, l1_config).cpu()
    l0_acts.append(l0_act)
    l1_acts.append(l1_act)
l0_acts = torch.cat(l0_acts, dim=0)
l1_acts = torch.cat(l1_acts, dim=0)

  0%|          | 0/500 [00:00<?, ?it/s]

In [20]:
l0_norm = l0_acts.abs().sum(-1).mean()
print(l0_norm)
l1_norm = l1_acts.abs().sum(-1).mean()
print(l1_norm)

tensor(35.4037)
tensor(88.1443)


In [21]:
del l0_acts, l1_acts