In [None]:
import tqdm as notebook_twdm
from transformer_lens import HookedTransformer

# --- Analysis target --------------------------------------------------------
# Transformer block under study, plus the activation-site label that is later
# handed to tools.get_sparse_autoencoder.
layer_index = 6
location = "mlp_post_act"

# Hook names used to index the activation cache for the chosen block:
#   transformer_lens_loc -> the block's `mlp.hook_post` hook
#   prev_layer_loc       -> the block's `ln2.hook_normalized` hook
transformer_lens_loc = f"blocks.{layer_index}.mlp.hook_post"
prev_layer_loc = f"blocks.{layer_index}.ln2.hook_normalized"

# --- Model ------------------------------------------------------------------
# Pretrained GPT-2 small wrapped in a HookedTransformer; `fold_ln=True` is
# forwarded to TransformerLens unchanged.
model = HookedTransformer.from_pretrained(
  "gpt2-small",
  fold_ln=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
from datasets import load_dataset

# Take only the first 10 rows of the train split to keep the run small.
ds = load_dataset("NeelNanda/pile-10k", split="train[:10]")

# Tokenise every selected text with the hooked model's tokenizer.
ds_tokens = model.to_tokens(ds["text"])

# Only the first tokenised entry is pushed through the model; its logits and
# the per-hook activation cache are kept for the cells below.
first_entry_tokens = ds_tokens[0]
ds_logits, ds_cache = model.run_with_cache(first_entry_tokens)

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 373/373 [00:00<00:00, 2.99kB/s]
Downloading metadata: 100%|██████████| 921/921 [00:00<00:00, 7.49kB/s]
Downloading data: 100%|██████████| 33.3M/33.3M [00:00<00:00, 33.3MB/s]
Generating train split: 100%|██████████| 10000/10000 [00:00<00:00, 63899.38 examples/s]


NameError: name 'model' is not defined

In [None]:
import tools

# Obtain the MLP for the chosen block from the hooked transformer
# (exact return type is defined by tools.extract_mlp).
mlp = tools.extract_mlp(model, layer_index)

# Flatten the cached activations at `prev_layer_loc` into a 2-D array of
# shape (rows, features). The feature width is read from the tensor itself
# instead of hard-coding 768, and the tensor is moved to host memory first so
# the conversion also works when the model runs on an accelerator.
prev_layer_acts = ds_cache[prev_layer_loc]
ds_acts = (
  prev_layer_acts.detach().cpu().numpy()
  .reshape(-1, prev_layer_acts.shape[-1])
)
original_mlp = tools.get_original_mlp_for_sparx(mlp, ds_acts)

# One shrunk model (and its cluster labels) per entry in `shrink_pcs`; the
# three lists stay index-aligned and are consumed together by later cells.
shrink_pcs = [0.1, 0.3, 0.5, 0.7, 0.9]
merged_models = []
models_cluster_labels = []
for pc in shrink_pcs:
  # NOTE: the loop target is deliberately NOT called `model` -- that name is
  # the notebook-global HookedTransformer and rebinding it here would break
  # any re-run of the cells that call model.to_tokens / model.run_with_cache.
  merged_model, labels = tools.shrink_model_global(original_mlp, pc)
  merged_model.model.summary()
  merged_models.append(merged_model)
  models_cluster_labels.append(labels)

In [None]:
from tools import output_infidelity, output_mse, output_r2

# Check fidelity and error (MSE, R2) of every shrunk model against the
# original MLP, using the same input activations for both.
original_mlp.forward_pass(ds_acts)
# The last entry of forward_pass_data is taken as the model output.
original_output = original_mlp.forward_pass_data[-1]

# `shrink_pcs` and `merged_models` are index-aligned (built together), so zip
# them to label each group of metrics with the setting that produced it.
for pc, merged_model in zip(shrink_pcs, merged_models):
  merged_model.forward_pass(ds_acts)
  merged_output = merged_model.forward_pass_data[-1]

  print(f"--- shrink_pc = {pc} ---")
  print(f"infidelity: {output_infidelity(original_output, merged_output)}")
  print(f"MSE error: {output_mse(original_output, merged_output)}")
  print(f"R2: {output_r2(original_output, merged_output)}")

In [None]:
# Fetch the sparse autoencoder for the configured activation site (`location`)
# and block (`layer_index`). Relies on `tools` having been imported by an
# earlier cell.
autoencoder = tools.get_sparse_autoencoder(location, layer_index)

In [None]:
import numpy as np
import latent_features

# One similarity matrix per shrunk model, in the same order as `shrink_pcs`.
sim_matrices = []

# Iterate the three index-aligned lists together. The loop target is named
# `merged_model` (not `model`) so the notebook-global HookedTransformer is not
# overwritten, and the shrink setting comes straight from the zip rather than
# from the current length of `sim_matrices`.
for pc, merged_model, labels in zip(shrink_pcs, merged_models, models_cluster_labels):
  print(f"Average similarity for sparsity {pc}")

  # First weight array of the shrunk model; its second axis is used as the
  # number of clusters.
  merged_w_in = merged_model.model.get_weights()[0]
  num_clusters = merged_w_in.shape[1]
  # Get map of cluster index to list of latent feature indices for that cluster
  cluster_latents = latent_features.get_cluster_latents(merged_model, labels, ds_cache)
  # Build a matrix comparing proportion of shared latent concepts between every cluster
  cluster_sim = latent_features.get_cluster_similarity_matrix(num_clusters, cluster_latents)
  sim_matrices.append(cluster_sim)
  average_similarity_score = np.mean(cluster_sim)
  print(average_similarity_score)

In [None]:
# Draw one cluster-similarity plot per shrunk model, in `shrink_pcs` order.
for cluster_sim_matrix in sim_matrices:
  latent_features.plot_cluster_similarity_matrix(cluster_sim_matrix)