In [1]:
import torch
import numpy as np
from torch import einsum
from tqdm.auto import tqdm
import seaborn as sns
from transformer_lens import HookedTransformer, ActivationCache
from datasets import load_dataset
from einops import einsum
import pandas as pd
from transformer_lens import utils
from rich.table import Table, Column
from rich import print as rprint
from jaxtyping import Float, Int, Bool
from torch import Tensor
import einops
import functools
from transformer_lens.hook_points import HookPoint
# import circuitsvis
from IPython.display import HTML
from plotly.express import line
import plotly.express as px
from tqdm.auto import tqdm
import json
import gc
import plotly.graph_objects as go
from haystack_utils import load_txt_data, get_mlp_activations

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Plotly needs a different renderer for VSCode/Notebooks vs Colab; this makes
# "notebook_connected" the default renderer for every figure shown below.
import plotly.io as pio
pio.renderers.default = "notebook_connected"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Turn gradient tracking off globally; none of the visible cells call backward().
# NOTE(review): these two calls appear to toggle the same global switch, so one
# of them is probably redundant — confirm against the PyTorch docs.
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

# Load (or reload) IPython's autoreload extension and select mode 2.
# NOTE(review): mode 2 presumably re-imports every module before each cell runs,
# so edits to local helper modules are picked up without restarting the kernel —
# confirm against the IPython autoreload documentation.
%reload_ext autoreload
%autoreload 2

In [2]:
# Load the pretrained "pythia-70m-v0" checkpoint as a HookedTransformer on `device`.
# NOTE(review): fold_ln=True presumably folds the LayerNorm parameters into the
# neighbouring weights — confirm in the TransformerLens documentation.
model = HookedTransformer.from_pretrained("pythia-70m-v0", fold_ln=True, device=device)

Using pad_token, but it is not set yet.


Loaded pretrained model pythia-70m-v0 into HookedTransformer


In [3]:
# Read the French and English KDE4 text samples with the project's load_txt_data
# helper; the recorded cell output reports 1007 examples for each file.
kde_french = load_txt_data("kde4_french.txt")
kde_english = load_txt_data("kde4_english.txt")

kde4_french.txt: Loaded 1007 examples with 505 to 5345 characters each.
kde4_english.txt: Loaded 1007 examples with 501 to 5295 characters each.


In [4]:
# Collect MLP activations for each language via the project's get_mlp_activations
# helper, using 100 prompts and mean=True.
# NOTE(review): the second positional argument (4) looks like a layer index —
# confirm in haystack_utils. If it is, note that `french_activations` is later
# passed to get_ablated_mlp_cache(..., layer=3, ...), i.e. layer-4 values would be
# used to overwrite a layer-3 neuron; verify that this is intended.
french_activations = get_mlp_activations(kde_french, 4, model, num_prompts=100, mean=True)
english_activations = get_mlp_activations(kde_english, 4, model, num_prompts=100, mean=True)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
def get_ablated_mlp_cache(prompts: list[str], model: HookedTransformer, neurons: list[int], layer: int, mean_neuron_activations: Float[Tensor, "d_mlp"]):
    """Run `prompts` through `model` twice: once unmodified and once with neurons overwritten.

    During the second run the activation at `blocks.{layer}.mlp.hook_post` is
    overwritten, for every batch element and position, at the last-dimension
    indices listed in `neurons` with the matching entries of
    `mean_neuron_activations`.

    Args:
        prompts: Texts that are tokenized with `model.to_tokens`.
        model: Model providing `to_tokens`, `run_with_cache` and `hooks`.
        neurons: Indices into the last dimension of the hooked activation.
        layer: Index of the block whose `mlp.hook_post` activation is patched.
        mean_neuron_activations: 1-D tensor of replacement values, indexed by
            neuron. It may live on a different device or use a different dtype
            than the model activations; it is converted before being written.

    Returns:
        A tuple `(original_cache, ablated_cache)` holding the caches returned by
        the unmodified and the patched `run_with_cache` calls.
    """
    neuron_index = torch.as_tensor(neurons, dtype=torch.long)
    # Gather the replacement values once instead of on every hook invocation.
    replacement = mean_neuron_activations[neuron_index]

    def ablate_neuron_hook(value, hook):
        # Indexed assignment requires source and destination to agree on dtype
        # and device, so align the replacement with the live activation first.
        value[:, :, neuron_index] = replacement.to(device=value.device, dtype=value.dtype)
        return value

    tokens = model.to_tokens(prompts)

    # Baseline pass; only the cache is kept, the loss is discarded.
    _, original_cache = model.run_with_cache(tokens, return_type="loss")

    # Patched pass: the hook is only active inside this context manager.
    with model.hooks(fwd_hooks=[(f'blocks.{layer}.mlp.hook_post', ablate_neuron_hook)]):
        _, ablated_cache = model.run_with_cache(tokens, return_type="loss")

    return original_cache, ablated_cache

def get_mlp_activation_difference(original_cache: ActivationCache, ablated_cache: ActivationCache, layer: int):
    """Return the change in mean `blocks.{layer}.mlp.hook_post` activation caused by ablation.

    For each cache, the entry at index 0 of the second dimension is dropped and
    the remainder is averaged over the first two dimensions; the result is the
    original mean minus the ablated mean.
    """
    hook_name = f'blocks.{layer}.mlp.hook_post'

    def mean_activation(cache):
        # Skip index 0 along dim 1 (presumably the BOS position — confirm against
        # how the prompts were tokenized), then average over dims 0 and 1.
        return cache[hook_name][:, 1:].mean((0, 1))

    return mean_activation(original_cache) - mean_activation(ablated_cache)

In [30]:
# Overwrite neuron 609 of layer 3 with its entry in `french_activations` on the
# first 10 French prompts, then measure how the mean MLP activations of layer 5
# differ between the unmodified and the patched run.
# NOTE(review): `french_activations` was built with
# get_mlp_activations(kde_french, 4, ...); if that 4 is a layer index, the
# replacement value comes from layer 4 while the patched neuron is in layer 3 —
# confirm this is intended.
original_cache, ablated_cache = get_ablated_mlp_cache(kde_french[:10], model, neurons=[609], layer=3, mean_neuron_activations=french_activations)
difference = get_mlp_activation_difference(original_cache, ablated_cache, layer=5)

In [31]:
# Histogram of the values in `difference` (original minus ablated mean activation).
px.histogram(difference.cpu().numpy(), title="Difference in activations between original and ablated model", width=800)

In [35]:
# TODO hover over the histogram to see the neuron number
def imshow(tensor, renderer=None, **kwargs):
    """Show `tensor` as a Plotly heatmap using a red/blue scale centred on zero.

    Any keyword arguments are forwarded to `px.imshow` and take precedence over
    the two colour defaults. `renderer` is passed through to `fig.show`.
    """
    plot_options = dict(
        color_continuous_midpoint=0.0,
        color_continuous_scale="RdBu",
    )
    # Caller-supplied options win over the defaults above.
    plot_options.update(kwargs)

    figure = px.imshow(utils.to_numpy(tensor), **plot_options)
    figure.show(renderer=renderer)

# Reshape the difference vector into 32 rows so it can be drawn as a 2-D heatmap.
imshow(difference.view(32, -1))

TypeError: imshow() got an unexpected keyword argument 'hover_data'