In [7]:
import os
import sys
import torch
import wandb
import json
import plotly.express as px
from transformer_lens import utils
from datasets import load_dataset
from typing import  Dict
from pathlib import Path
from tqdm import tqdm
from functools import partial
from vit_sae_analysis.dashboard_fns import get_feature_data, FeatureData

sys.path.append("..")

from sae_training.utils import LMSparseAutoencoderSessionloader
from sae_analysis.visualizer import data_fns, html_fns
from sae_analysis.visualizer.data_fns import get_feature_data, FeatureData
from sae_training.config import ViTSAERunnerConfig
from sae_training.vit_runner import vision_transformer_sae_runner
from sae_training.train_sae_on_vision_transformer import train_sae_on_vision_transformer
from vit_sae_analysis.dashboard_fns import get_feature_data, FeatureData
from sae_training.sparse_autoencoder import SparseAutoencoder
from sae_training.utils import ViTSparseAutoencoderSessionloader

# Choose the compute backend by preference: Apple MPS first, then CUDA, and
# finally plain CPU when neither accelerator is reported as available.
# NOTE: the visible remainder of this cell moves the model with `cfg.device`
# rather than with this value.
device = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

# Restore the trained sparse autoencoder (SAE) from its checkpoint and load the
# vision model named in the checkpoint's config.
sae_path = "checkpoints/pcy601zk/final_sparse_autoencoder_openai/clip-vit-large-patch14_-2_resid_65536.pt"

# map_location="cpu" lets the checkpoint deserialize on machines that do not
# have the device it was saved from (e.g. a CUDA checkpoint opened on a
# CPU/MPS-only box). load_state_dict below copies the values into the SAE's own
# parameters, so the SAE keeps whatever device its constructor chose.
# NOTE(security): besides the weights, the file carries a config object
# (attributes such as `model_name`, `device`, `block_layer` are read from it),
# which means it is unpickled on load -- only open checkpoints from a trusted
# source. On PyTorch releases where torch.load defaults to weights_only=True
# this call will additionally need weights_only=False.
loaded_object = torch.load(sae_path, map_location="cpu")
cfg = loaded_object['cfg']
state_dict = loaded_object['state_dict']

# Rebuild the SAE from its config, restore the weights, and switch to eval mode.
sparse_autoencoder = SparseAutoencoder(cfg)
sparse_autoencoder.load_state_dict(state_dict)
sparse_autoencoder.eval()

# Load the model identified by cfg.model_name and place it on cfg.device.
loader = ViTSparseAutoencoderSessionloader(cfg)
model = loader.get_model(cfg.model_name)
model.to(cfg.device)

# For every SAE encoder direction, find the largest absolute cosine similarity
# with any row of the selected block's MLP output projection (fc2).
mlp_out_weights = model.model.vision_model.encoder.layers[cfg.block_layer].mlp.fc2.weight.detach().transpose(0,1) # size [hidden_mlp_dimension, resid_dimension]
# .to(...) is a no-op when the SAE already lives on the model's device; it only
# matters if the two were placed on different devices.
sae_weights = sparse_autoencoder.W_enc.detach().to(mlp_out_weights.device) # size [resid_dimension, sae_dimension]

# Normalize OUT of place. detach() and transpose() return tensors that share
# storage with the live parameters, so an in-place `/=` here would silently
# rescale the model's fc2 weights and the SAE's encoder weights.
# F.normalize divides by max(norm, eps), so an all-zero vector yields zeros
# instead of NaNs that would poison the max/mean/var below.
sae_weights = torch.nn.functional.normalize(sae_weights, dim=0)          # unit-norm columns
mlp_out_weights = torch.nn.functional.normalize(mlp_out_weights, dim=1)  # unit-norm rows

cosine_similarities = torch.abs(mlp_out_weights @ sae_weights) # size [hidden_mlp_dimension, sae_dimension]
# Best-matching MLP row per SAE feature.
max_cosine_similarities = cosine_similarities.max(dim=0).values.to('cpu') # size [sae_dimension]
mean_max_cos_sim = max_cosine_similarities.mean()
var_max_cos_sim = max_cosine_similarities.var()

# Count the SAE features whose maximum cosine similarity exceeds the cutoff.
threshold = 0.18
num_above_threshold = torch.sum(max_cosine_similarities > threshold)

# Plot the distribution of per-feature maxima for the SAE.
fig = px.histogram(
    max_cosine_similarities,
    title="Histogram of max cosine similarities of SAE features with MLP out tensor.",
)
fig.show()

# Baseline: repeat the measurement with random unit vectors in place of the SAE
# encoder directions.
# Match sae_weights' dtype as well as its shape and device; with the default
# dtype the matmul below fails whenever the weights are not float32.
random_weights = torch.randn(sae_weights.size(), device = sae_weights.device, dtype = sae_weights.dtype)
# In-place division is safe here: random_weights is a fresh tensor that shares
# storage with nothing else.
random_weights /= torch.norm(random_weights, dim = 0, keepdim = True)

# NOTE: the next two assignments deliberately reuse the names
# `cosine_similarities` / `max_cosine_similarities`, so from here on they hold
# the random-baseline values rather than the SAE ones.
cosine_similarities = torch.abs(mlp_out_weights @ random_weights) # size [hidden_mlp_dimension, sae_dimension]
max_cosine_similarities = torch.max(cosine_similarities, 0).values.to('cpu') # size [sae_dimension]
rand_mean_max_cos_sim = max_cosine_similarities.mean()
rand_var_max_cos_sim = max_cosine_similarities.var()

rand_fig = px.histogram(max_cosine_similarities, title = "Histogram of max cosine similarities of random vectors with MLP out tensor.")
rand_fig.show()

# Report the threshold count, then the mean and variance of the per-feature
# maxima for the SAE and for the random baseline, one statistic per line.
summary_lines = (
    f'Number of sae features with cosine similarity greater than {threshold}: {num_above_threshold}',
    f'Mean maximum cosine similarity of sae features with MLP output tensor: {mean_max_cos_sim}',
    f'Mean maximum cosine similarity of random vectors with MLP output tensor: {rand_mean_max_cos_sim}',
    f'Variance of the maximum cosine similarity of sae features with MLP output tensor: {var_max_cos_sim}',
    f'Variance of the maximum cosine similarity of random vectors with MLP output tensor: {rand_var_max_cos_sim}',
)
print(*summary_lines, sep="\n")


`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


Number of sae features with cosine similarity greater than 0.18: 885
Mean maximum cosine similarity of sae features with MLP output tensor: 0.11946894228458405
Mean maximum cosine similarity of random vectors with MLP output tensor: 0.11836104094982147
Variance of the maximum cosine similarity of sae features with MLP output tensor: 0.0010612717596814036
Variance of the maximum cosine similarity of random vectors with MLP output tensor: 9.284295083489269e-05
