### Load Query2SAE Model

In [1]:
import torch
from head import Query2SAE, get_hyperparams
from safetensors.torch import load_file

# load the same hyperparams you trained with
_, _, _, _, head_dim, _ = get_hyperparams()

# load your safetensors checkpoint
state_dict = load_file("checkpoint/model_epoch4.safetensors")
# determine SAE dimension from one of the weight tensors
sae_dim = state_dict["head.2.weight"].shape[0]

# instantiate & load weights
model = Query2SAE(head_hidden_dim=head_dim, sae_dim=sae_dim)
model.load_state_dict(state_dict)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Loaded Query2SAE → head_hidden_dim={head_dim}, sae_dim={sae_dim} on {device}")


Loaded Query2SAE → head_hidden_dim=128, sae_dim=24576 on cpu


### Load GPT-2 Small Model

In [2]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

# FIX: Add proper pad token instead of using eos_token
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", padding_side="right")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Add new pad token

lm = GPT2LMHeadModel.from_pretrained("gpt2")

# Resize embeddings for new pad token
lm.resize_token_embeddings(len(tokenizer))

# Move to device and set pad token ID
lm = lm.to(device)
lm.config.pad_token_id = tokenizer.pad_token_id

print(f"Tokenizer setup complete. Pad token: '{tokenizer.pad_token}'")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Tokenizer setup complete. Pad token: '[PAD]'


# Generate GPT-2 Small Output

In [3]:
question = "What is the capital of the US"

# FIX: Use shorter max_length to avoid memory issues and proper attention mask handling
inputs = tokenizer(
    question,
    return_tensors="pt",
    padding="max_length",
    max_length=50,  # REDUCED from 100
    truncation=True
).to(device)

print(f"Input shape: {inputs['input_ids'].shape}")

try:
    with torch.no_grad():  # FIX: Add memory management
        gen_ids = lm.generate(
            inputs["input_ids"], 
            attention_mask=inputs["attention_mask"],
            max_length=512,  # REDUCED from 256
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,  # FIX: Add to reduce repetition
            no_repeat_ngram_size=2   # FIX: Avoid repeating phrases
        )
    
    answer = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    print("Q:", question)
    print("A:", answer)
    
    # Clean up memory
    del gen_ids
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        
except RuntimeError as e:
    if "out of memory" in str(e):
        print("GPU memory issue - trying CPU fallback...")
        # Move to CPU and retry
        lm_cpu = lm.to('cpu')
        inputs_cpu = {k: v.to('cpu') for k, v in inputs.items()}
        
        with torch.no_grad():
            gen_ids = lm_cpu.generate(
                inputs_cpu["input_ids"],
                attention_mask=inputs_cpu["attention_mask"],
                max_length=100,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=True,
                temperature=0.7
            )
        
        answer = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
        print("Q:", question)
        print("A (CPU):", answer)
    else:
        print(f"Generation error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

Input shape: torch.Size([1, 50])
Q: What is the capital of the US
A: What is the capital of the US?
The word "capital" comes from Latin. The United States has about 100 million people and an estimated $15 trillion in debt, with roughly 20% to 30% of that coming from foreign countries such as China and India. In addition there are some 500 state-owned banks (mostly owned by corporations), most notably those for utilities like solar power or wind farms. Most importantly however, according Toowoomba County's resident economist David Tipton he estimates total federal government spending on health care will be around 13 billion dollars over 10 years—around a fifth of what it was before Obamacare came into effect this year. As you can see here , while I believe healthcare costs continue to climb steadily since last December, even though many Americans have more coverage than they did four months ago, these numbers don't reflect reality: While we're at it, do any other states really need anot

## Actual LM output

In [4]:
answer.split(sep="\n")[1]

'The word "capital" comes from Latin. The United States has about 100 million people and an estimated $15 trillion in debt, with roughly 20% to 30% of that coming from foreign countries such as China and India. In addition there are some 500 state-owned banks (mostly owned by corporations), most notably those for utilities like solar power or wind farms. Most importantly however, according Toowoomba County\'s resident economist David Tipton he estimates total federal government spending on health care will be around 13 billion dollars over 10 years—around a fifth of what it was before Obamacare came into effect this year. As you can see here , while I believe healthcare costs continue to climb steadily since last December, even though many Americans have more coverage than they did four months ago, these numbers don\'t reflect reality: While we\'re at it, do any other states really need another bailout if their own medical facilities aren\' not paying down debts quickly enough so taxpa

## Generate F_Expected

In [7]:
from transformers import GPT2TokenizerFast
from head import Query2SAE, get_hyperparams
from safetensors.torch import load_file
import os
import torch

# 1) load checkpoint
path = os.path.join(os.getcwd(), 'checkpoint', 'model_epoch4.safetensors')
# state_dict = load_file("checkpoints/model_epoch4.safetensors")
state_dit = load_file(path)
sae_dim    = state_dict["head.2.weight"].shape[0]
_, _, _, _, head_dim, _ = get_hyperparams()

model = Query2SAE(head_hidden_dim=head_dim, sae_dim=sae_dim)
model.load_state_dict(state_dict)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 2) instantiate GPT-2 tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", padding_side="right")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default

# 3) tokenize *your question*, not your SAE features!
question = "What is the capital of the US"
inputs   = tokenizer(
    question,
    return_tensors="pt",
    truncation=True,
    padding="max_length",
    max_length=256
).to(device)

# 4) run through your Query2SAE
with torch.no_grad():
    pred_sae = model(
        inputs["input_ids"], 
        attention_mask=inputs["attention_mask"]
    )  # shape (1, sae_dim)


In [8]:
import os
print("Before:", os.getcwd())
os.chdir("/Users/pardan/Code/SAE")
print("After:", os.getcwd())   

Before: /Users/pardan/Code/SAE/expectation_model
After: /Users/pardan/Code/SAE


In [9]:
from src.interfaces.lens_backend import Variant

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Generate F Actual

In [10]:
import torch
from transformer_lens import HookedTransformer
from sae_lens import SAE

def get_sae_features_from_text(
    text: str,
    sae: SAE,
    model: HookedTransformer
) -> torch.Tensor:
    """
    Feeds a text string through a language model, extracts the activations
    at the hook point where the SAE was trained, and then encodes these
    activations into SAE feature vectors.

    Args:
        text (str): The input text to be processed.
        sae (SAE): The loaded SAE object.
        model (HookedTransformer): The loaded TransformerLens model.

    Returns:
        torch.Tensor: A tensor of shape (1, sequence_length, d_sae)
                      containing the SAE's feature activations.
    """
    # 1. Tokenize the text string
    # We use prepend_bos=True to add the beginning-of-sequence token
    tokens = model.to_tokens(text, prepend_bos=True)
    
    # 2. Run the model and cache the activations
    # We specify the exact hook point where the SAE was trained
    hook_name = sae.cfg.hook_name
    with torch.no_grad():
        _, cache = model.run_with_cache(tokens)
        
    # 3. Extract the activations from the cache
    # The shape will be (batch_size=1, seq_len, d_model=768)
    activations = cache[hook_name]

    # 4. Use the SAE's encode() method to get the sparse feature vector
    # The SAE's input must be flattened to (num_activations, d_model)
    # The output is (num_activations, d_sae=24576)
    sae_feature_activations = sae.encode(activations)
    
    return sae_feature_activations

# --- Main Usage Example ---

# Set up the device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the models
model = HookedTransformer.from_pretrained("gpt2-small", device=device)
sae, _, _ = SAE.from_pretrained(
    release="gpt2-small-res-jb",
    sae_id="blocks.11.hook_resid_pre",
    device=device,
)

# Define the input text
input_text = "The quick brown fox jumps over the lazy dog."

# Call the function to get the SAE feature vectors
sae_features = get_sae_features_from_text(input_text, sae, model)

# Print the shape and some details of the output
print(f"Input text: '{input_text}'")
print(f"Shape of SAE feature vectors: {sae_features.shape}")
print(f"Total features: {sae_features.shape[-1]}")
print(f"Number of non-zero features: {torch.count_nonzero(sae_features).item()}")

# You can now analyze `sae_features`. For example, to see the features
# activated for the last token ('dog'):
last_token_features = sae_features[0, -1, :]
print(f"Shape of SAE features for the last token: {last_token_features.shape}")

Loaded pretrained model gpt2-small into HookedTransformer


This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)


Input text: 'The quick brown fox jumps over the lazy dog.'
Shape of SAE feature vectors: torch.Size([1, 11, 24576])
Total features: 24576
Number of non-zero features: 523
Shape of SAE features for the last token: torch.Size([24576])


In [14]:
agg = torch.sum(sae_features, dim=1)
agg.shape

torch.Size([1, 24576])

In [22]:
F_exp = pred_sae.detach()
F_act = agg.detach()

### Cosine Similarity

In [23]:
import numpy as np
a = np.reshape(F_exp, (1,-1))
b = np.reshape(F_act, (1,-1))

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

ans = cosine_similarity(X=a, Y=b)

In [35]:
ans

array([[0.00720794]], dtype=float32)

### Top-K method

In [31]:
exp_val, exp_idx = torch.topk(F_exp, 500)
act_val, act_idx = torch.topk(F_act, 500)

In [32]:
import numpy as np

mask = np.isin(exp_val, act_val)

In [33]:
mask

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
      