## Setup

In [1]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer, ActivationCache, utils, patching
from jaxtyping import Float, Int, Bool
from torch import Tensor
from tqdm.auto import tqdm
import plotly.io as pio
import ipywidgets as widgets
from IPython.display import display, clear_output
from datasets import load_dataset

# Render plotly figures with the "notebook_connected" renderer.
pio.renderers.default = "notebook_connected"
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Disable autograd globally; none of the cells shown here compute gradients.
# torch.set_grad_enabled and torch.autograd.set_grad_enabled toggle the same
# global switch, so a single call is sufficient.
torch.set_grad_enabled(False)

from haystack_utils import get_mlp_activations
import haystack_utils

%reload_ext autoreload
%autoreload 2

In [2]:
import json
import os

# Upper bound on the number of prompts kept in each bucket.
MAX_PROMPTS_PER_CLASS = 500
# Texts shorter than this many characters are skipped.
MIN_PROMPT_CHARS = 100

dataset = load_dataset("NeelNanda/pile-10k", split='train')

# Split the dataset into FreeLaw texts and everything else, keeping at most
# MAX_PROMPTS_PER_CLASS texts per bucket.
freelaw_data = []
non_freelaw_data = []
for item in dataset:
    # Stop scanning once both buckets are full. `>=` (rather than `==`) keeps
    # the exit condition reachable no matter how the buckets fill up.
    if len(freelaw_data) >= MAX_PROMPTS_PER_CLASS and len(non_freelaw_data) >= MAX_PROMPTS_PER_CLASS:
        break

    text = item['text']
    if len(text) < MIN_PROMPT_CHARS:
        continue

    if item['meta']['pile_set_name'] == 'FreeLaw':
        # Cap the FreeLaw bucket the same way the non-FreeLaw bucket is capped.
        if len(freelaw_data) < MAX_PROMPTS_PER_CLASS:
            freelaw_data.append(text)
    elif len(non_freelaw_data) < MAX_PROMPTS_PER_CLASS:
        non_freelaw_data.append(text)

# Persist both buckets so they can be reloaded without re-scanning the dataset.
# Create the output directory first so the cell also runs on a fresh checkout.
os.makedirs('data', exist_ok=True)
with open('data/freelaw.json', 'w') as f:
    # json.dump takes the object first and the file handle second.
    json.dump(freelaw_data, f)
with open('data/non_freelaw.json', 'w') as f:
    json.dump(non_freelaw_data, f)

print(len(freelaw_data))
print(len(non_freelaw_data))

Found cached dataset parquet (/home/ubuntu/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


240
500


In [45]:
# Load Pythia-70m as a HookedTransformer on `device`, with the
# center_unembed, center_writing_weights and fold_ln options enabled.
model = HookedTransformer.from_pretrained("EleutherAI/pythia-70m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device)

# Hard-coded neuron indices; the `_l4` suffix presumably refers to layer 4 and
# the prefixes to the dataset each neuron is associated with — TODO confirm
# where these indices were derived.
uspto_neurons_l4 = [663]
freelaw_neurons_l4 = [189, 896, 227]

# Activations returned by get_mlp_activations for the first 200 prompts of each
# bucket, stored per layer. The returned object is indexed below as
# [:, neurons], so it has at least two axes; the exact shape with mean=False is
# not visible here — TODO confirm against haystack_utils.get_mlp_activations.
freelaw_activations = {}
non_freelaw_activations = {}
# range(4, 5) yields only layer 4, the same value as LAYER_TO_ABLATE below.
for layer in range(4, 5):
    freelaw_activations[layer] = get_mlp_activations(freelaw_data[:200], layer, model, mean=False)
    non_freelaw_activations[layer] = get_mlp_activations(non_freelaw_data[:200], layer, model, mean=False)

LAYER_TO_ABLATE = 4
# NOTE(review): the neurons being patched are the USPTO neuron, while the
# reference means below are taken from FreeLaw vs. non-FreeLaw prompts and
# freelaw_neurons_l4 is not used in the cells shown here — confirm this pairing
# is intentional (e.g. a control) and not a leftover from a USPTO experiment.
NEURONS_TO_ABLATE = uspto_neurons_l4
# Mean over the first axis of the selected neurons' activations:
# "ACTIVE" comes from the FreeLaw prompts, "INACTIVE" from the non-FreeLaw ones.
MEAN_ACTIVATION_ACTIVE = freelaw_activations[LAYER_TO_ABLATE][:, NEURONS_TO_ABLATE].mean(0)
MEAN_ACTIVATION_INACTIVE = non_freelaw_activations[LAYER_TO_ABLATE][:, NEURONS_TO_ABLATE].mean(0)

def _overwrite_ablated_neurons(value, replacement):
    """Write `replacement` into the NEURONS_TO_ABLATE slots of `value`, in place.

    `value` is indexed with three indices, so it must have at least three axes
    (presumably batch, position, neuron — verify against the hook point).
    NEURONS_TO_ABLATE is read when the hook runs, not when it is defined.
    Returns the same (mutated) `value` object.
    """
    value[:, :, NEURONS_TO_ABLATE] = replacement
    return value


def deactivate_neurons_hook(value, hook):
    """Forward hook: set the selected neurons to MEAN_ACTIVATION_INACTIVE.

    `hook` is accepted to match the hook call signature but is not used.
    """
    return _overwrite_ablated_neurons(value, MEAN_ACTIVATION_INACTIVE)


def activate_neurons_hook(value, hook):
    """Forward hook: set the selected neurons to MEAN_ACTIVATION_ACTIVE.

    `hook` is accepted to match the hook call signature but is not used.
    """
    return _overwrite_ablated_neurons(value, MEAN_ACTIVATION_ACTIVE)


# Both hook lists target the same hook point in layer LAYER_TO_ABLATE.
_MLP_POST_HOOK_NAME = f'blocks.{LAYER_TO_ABLATE}.mlp.hook_post'
deactivate_neurons_fwd_hooks = [(_MLP_POST_HOOK_NAME, deactivate_neurons_hook)]
activate_neurons_fwd_hooks = [(_MLP_POST_HOOK_NAME, activate_neurons_hook)]

# Unpack the two values returned by haystack_utils.get_weird_tokens (called with
# plot_norms=False). Judging by the names these are presumably the tokens to
# ignore and the remaining tokens — verify against haystack_utils. Neither name
# is referenced in the cells shown here.
all_ignore, not_ignore = haystack_utils.get_weird_tokens(model, plot_norms=False)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [46]:
# For every FreeLaw prompt, collect the loss without hooks and the loss with
# the "deactivate" hooks applied, then report the mean of each list.
original_losses = []
ablated_losses = []
for prompt in freelaw_data:
    # The last two returned values are not needed here and are discarded.
    original_loss, ablated_loss, _, _ = haystack_utils.get_caches_single_prompt(
        prompt, model, deactivate_neurons_fwd_hooks
    )
    original_losses += [original_loss]
    ablated_losses += [ablated_loss]

print(
    sum(original_losses) / len(original_losses),
    sum(ablated_losses) / len(ablated_losses),
    sep="\n",
)

3.012581122159958
3.0354947290420533


In [47]:
# For every non-FreeLaw prompt, collect the loss without hooks and the loss
# with the "activate" hooks applied, then report the mean of each list.
# Note: this rebinds original_losses / ablated_losses from the previous cell.
original_losses = []
ablated_losses = []
for prompt in non_freelaw_data:
    # The last two returned values are not needed here and are discarded.
    original_loss, ablated_loss, _, _ = haystack_utils.get_caches_single_prompt(
        prompt, model, activate_neurons_fwd_hooks
    )
    original_losses += [original_loss]
    ablated_losses += [ablated_loss]

print(
    sum(original_losses) / len(original_losses),
    sum(ablated_losses) / len(ablated_losses),
    sep="\n",
)

3.154054642379284
3.170984795808792
