In [2]:
from transformer_lens import HookedTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
from huggingface_hub import PyTorchModelHubMixin
from transformers import PretrainedConfig, PreTrainedModel

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load the pretrained language model and place it on the selected device.
model = HookedTransformer.from_pretrained(
    model_name="tiny-stories-2L-33M",
    device=device,
)

Loaded pretrained model tiny-stories-2L-33M into HookedTransformer


In [20]:
class SparseAutoEncoder(nn.Module, PyTorchModelHubMixin):
    """One-hidden-layer autoencoder with a ReLU bottleneck.

    Parameters created in ``__init__``:
        W_enc: ``(d_in, d_hidden)`` encoder weights.
        W_dec: ``(d_hidden, d_in)`` decoder weights; every row is rescaled to
            unit L2 norm at construction time.
        b_enc: ``(d_hidden,)`` encoder bias, initialised to zeros.
        b_dec: ``(d_in,)`` decoder bias, initialised to zeros. It is subtracted
            from the input before encoding and added back after decoding.
    """

    # NOTE(review): `SparseAutoEncoderConfig` is not defined in any cell
    # visible alongside this class. It must already exist in the kernel when
    # this cell runs, otherwise this line raises NameError on a fresh
    # "Restart & Run All". Confirm where it is defined, or drop the attribute.
    config_class = SparseAutoEncoderConfig

    def __init__(
        self, config
    ):
        """Build and initialise the autoencoder parameters.

        Args:
            config: Mapping with the keys
                ``"seed"``     - value passed to ``torch.manual_seed``,
                ``"d_in"``     - input / reconstruction width,
                ``"d_hidden"`` - number of hidden units,
                ``"dtype"``    - name of a ``torch`` dtype attribute
                                 (e.g. ``"float32"``).
        """
        super().__init__()
        # Side effect: this reseeds PyTorch's *global* RNG, not just this
        # module, so the random initialisation below is reproducible.
        torch.manual_seed(config["seed"])
        d_in = config["d_in"]
        d_hidden = config["d_hidden"]
        dtype = getattr(torch, config["dtype"])

        # Weights are drawn in this order (encoder, then decoder); keep it so
        # a given seed keeps producing the same initial values.
        self.W_enc = nn.Parameter(
            torch.nn.init.kaiming_uniform_(torch.empty(d_in, d_hidden, dtype=dtype))
        )
        self.W_dec = nn.Parameter(
            torch.nn.init.kaiming_uniform_(torch.empty(d_hidden, d_in, dtype=dtype))
        )
        self.b_enc = nn.Parameter(torch.zeros(d_hidden, dtype=dtype))
        self.b_dec = nn.Parameter(torch.zeros(d_in, dtype=dtype))

        # Rescale each decoder row to unit L2 norm. Done in place under
        # no_grad so no autograd graph is built and `.data` is not touched.
        with torch.no_grad():
            self.W_dec.div_(self.W_dec.norm(dim=-1, keepdim=True))
        self.d_hidden = d_hidden

    def forward(self, x: torch.Tensor):
        """Encode ``x`` and reconstruct it.

        Args:
            x: Tensor whose last dimension equals ``d_in`` (required by the
                matmul with ``W_enc``).

        Returns:
            ``(x_reconstruct, acts)``: the reconstruction of ``x`` and the
            post-ReLU hidden activations it was decoded from.
        """
        # Centre the input on the decoder bias before encoding.
        x_cent = x - self.b_dec
        acts = F.relu(x_cent @ self.W_enc + self.b_enc)
        x_reconstruct = acts @ self.W_dec + self.b_dec
        return x_reconstruct, acts

In [21]:
# Artifacts of the training run: "<run_name>.json" is read as the run config
# and "<run_name>.pt" is loaded as the autoencoder's state dict.
save_path = "/workspace/tiny-stories-2L-33M"
run_name = "229_comfy_haze"

with open(f"{save_path}/{run_name}.json", "r") as f:
    cfg = json.load(f)

d_in = cfg["d_in"]
d_hidden = d_in * cfg["expansion_factor"]
hook_name = f"blocks.{cfg['layer']}.{cfg['act']}"

sae_config = {"d_in": d_in, "d_hidden": d_hidden, "dtype": "float32", "seed": 47}
encoder = SparseAutoEncoder(sae_config)
# map_location="cpu": the module above is built on the CPU, and a checkpoint
# saved from CUDA tensors would otherwise fail to deserialize on a machine
# without a GPU. The module is moved to `device` right after.
# torch.load unpickles the file, so only load checkpoints you trust.
encoder.load_state_dict(
    torch.load(f"{save_path}/{run_name}.pt", map_location="cpu")
)
encoder.to(device)

SparseAutoEncoder()

In [23]:
# Used both as the local output directory and as the Hub repository id.
name = "SAE-TinyStories-2L-L1-0"

# Write the weights and config locally first, then upload them to the Hub.
encoder.save_pretrained(save_directory=name, config=sae_config)
# Kept as the cell's last expression so the returned commit info is displayed.
encoder.push_to_hub(repo_id=name, config=sae_config)

pytorch_model.bin:   0%|          | 0.00/537M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lovish/SAE-TinyStories-2L-L1-0/commit/bd67426f1896f7ae2441976fa521dd3c356980ec', commit_message='Push model using huggingface_hub.', commit_description='', oid='bd67426f1896f7ae2441976fa521dd3c356980ec', pr_url=None, pr_revision=None, pr_num=None)

In [25]:
# Round-trip check: rebuild the autoencoder from the repository pushed above.
# NOTE(review): the reloaded module is not moved to `device` here; it is
# presumably on the CPU after loading - confirm before using it with `model`.
encoder = SparseAutoEncoder.from_pretrained(f"lovish/{name}")

config.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]