In [1]:
from transformers import AutoModel, GPT2Tokenizer
import torch

# Hub repository that hosts the MLP model.
MODEL_REPO = "mksethi/gpt2-query2sae"

# trust_remote_code=True allows transformers to import and execute the Python
# modelling code shipped inside that repository, so only use it with a
# repository you trust.
model = AutoModel.from_pretrained(MODEL_REPO, trust_remote_code=True)


In [2]:
# Load the stock GPT-2 tokenizer to pair with `model`.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 defines no padding token, so reuse end-of-sequence for padding.
tokenizer.pad_token = tokenizer.eos_token

# Pad AND truncate to a fixed length of 256 tokens. Without truncation=True a
# prompt longer than max_length would come back longer than 256 tokens, so the
# input length would no longer be fixed.
# NOTE(review): 256 presumably matches the model's expected input size — confirm.
inputs = tokenizer(
    "What is the weather like in hong kong?",
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors="pt",
)

# Inference only: no gradients are needed for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [3]:
# Number of entries along the leading dimension of the first output tensor.
outputs[0].shape[0]

24576

In [4]:
# Convert the first output tensor to a NumPy array. Plain .numpy() raises if
# the tensor tracks gradients or lives on a non-CPU device; detach()/cpu()
# remove both failure modes and add no copy for a tensor that is already a
# gradient-free CPU tensor.
feats = outputs[0].detach().cpu().numpy()

In [7]:
# The 50 largest entries of the first output tensor (sorted, largest first)
# together with their positions.
values, indices = outputs[0].topk(k=50)

In [8]:
# Display the top-k values computed from outputs[0].
values

tensor([0.1218, 0.1194, 0.1148, 0.1134, 0.1131, 0.1088, 0.1079, 0.1077, 0.1060,
        0.1058, 0.1054, 0.1050, 0.1049, 0.1045, 0.1044, 0.1044, 0.1044, 0.1043,
        0.1035, 0.1034, 0.1029, 0.1028, 0.1028, 0.1028, 0.1026, 0.1025, 0.1024,
        0.1017, 0.1016, 0.1011, 0.1008, 0.1006, 0.1006, 0.1003, 0.0999, 0.0999,
        0.0996, 0.0993, 0.0992, 0.0991, 0.0990, 0.0989, 0.0987, 0.0986, 0.0984,
        0.0984, 0.0983, 0.0982, 0.0982, 0.0981])

In [12]:
# Standard deviation over all entries of the first output tensor.
outputs[0].std()

tensor(0.0512)

In [18]:
from safetensors.torch import save_file
import torch

# Convert the pickled PyTorch checkpoint into a safetensors file.
# torch.load unpickles the file; weights_only=True restricts unpickling to
# tensors and plain containers, so a tampered checkpoint cannot execute code.
# Nothing is lost by this: save_file only accepts a flat {name: tensor}
# mapping anyway.
state_dict = torch.load(
    "./expectation_model/checkpoint/new_model.pt",
    map_location="cpu",
    weights_only=True,
)
save_file(state_dict, "nmodel.safetensors")

---

In [14]:
import torch


def _mean_std(tensor):
    """Return the ``(mean, std)`` of ``tensor`` as a pair of Python floats."""
    return tensor.mean().item(), tensor.std().item()


# `model` is expected to be the FrozenLMMLP instance, on either CPU or GPU;
# inspect it on the CPU.
# NOTE(review): if `model` is a torch.nn.Module, .to() moves it in place and
# returns the same object, so `model_cpu` is presumably an alias — confirm.
model_cpu = model.to('cpu')

# First layer: weight is (hidden_dim, input_dim), bias is (hidden_dim,).
w1, b1 = model_cpu.fc1.weight.data, model_cpu.fc1.bias.data
w1_mean, w1_std = _mean_std(w1)
b1_mean, b1_std = _mean_std(b1)

print(f"fc1.weight → mean: {w1_mean:.4f}, std: {w1_std:.4f}")
print(f"fc1.bias   → mean: {b1_mean:.4f}, std: {b1_std:.4f}")

# Second layer, same treatment.
w2, b2 = model_cpu.fc2.weight.data, model_cpu.fc2.bias.data
w2_mean, w2_std = _mean_std(w2)
b2_mean, b2_std = _mean_std(b2)

print(f"fc2.weight → mean: {w2_mean:.4f}, std: {w2_std:.4f}")
print(f"fc2.bias   → mean: {b2_mean:.4f}, std: {b2_std:.4f}")


fc1.weight → mean: -0.0021, std: 0.0359
fc1.bias   → mean: -0.0048, std: 0.0385
fc2.weight → mean: 0.0000, std: 0.0509
fc2.bias   → mean: 0.0027, std: 0.0512


In [23]:
# Requires the safetensors package; from a notebook it can be installed with
#    %pip install safetensors

import torch
from safetensors.torch import load_file

# Read every tensor stored in the safetensors checkpoint into a dict.
state_dict = load_file("data.safetensors")

# Parameter names to report on. Despite the variable's name, each entry is
# matched as a key *suffix* (str.endswith below), so a wrapped key such as
# "module.fc1.weight" is picked up as well. Adjust the list if the layers are
# named differently.
layer_prefixes = ["fc1.weight", "fc1.bias", "fc2.weight", "fc2.bias"]

for name in layer_prefixes:
    hits = [k for k in state_dict if k.endswith(name)]
    if hits:
        for key in hits:
            value = state_dict[key]
            # Coerce anything that is not already a tensor.
            tensor = value if isinstance(value, torch.Tensor) else torch.tensor(value)

            m, s = tensor.mean().item(), tensor.std().item()
            print(f"{key:30s} → mean: {m:.4f}, std: {s:.4f}")
    else:
        print(f"No parameters found for layer `{name}`.")


fc1.weight                     → mean: -0.0021, std: 0.0359
fc1.bias                       → mean: -0.0048, std: 0.0385
fc2.weight                     → mean: 0.0000, std: 0.0509
fc2.bias                       → mean: 0.0027, std: 0.0512


In [24]:
import torch
from safetensors.torch import load_file

# Reload the checkpoint (assumed to hold the trained weights) so this cell
# does not depend on an earlier cell's `state_dict`.
state_dict = load_file("data.safetensors")

# Recover the layer sizes from the stored weight shapes.
# Assumes fc1/fc2 weights are laid out as (out_features, in_features) — the
# same convention the index choices below rely on.
fc1_weight_shape = state_dict['fc1.weight'].size()
fc2_weight_shape = state_dict['fc2.weight'].size()

# fc1.weight: rows -> hidden_dim, columns -> input_dim.
trained_hidden_dim, trained_input_dim = fc1_weight_shape[0], fc1_weight_shape[1]
# fc2.weight: rows -> output_dim.
trained_output_dim = fc2_weight_shape[0]

print(f"Dimensions from trained weights: input_dim={trained_input_dim}, hidden_dim={trained_hidden_dim}, output_dim={trained_output_dim}")

Dimensions from trained weights: input_dim=256, hidden_dim=128, output_dim=24576
