In [13]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer, ActivationCache, utils, patching
from jaxtyping import Float, Int, Bool
from torch import Tensor
from tqdm.auto import tqdm
import plotly.io as pio
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import numpy as np
import plotly.express as px
from torchmetrics.regression import KendallRankCorrCoef, SpearmanCorrCoef
from collections import defaultdict

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Select the plotly renderers used for figures in this notebook.
pio.renderers.default = "notebook_connected+notebook"
# Fall back to CPU when no CUDA device is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
# This notebook only runs inference, so gradient tracking is switched off globally.
# (torch.autograd.set_grad_enabled is the same callable as torch.set_grad_enabled,
# so a single call is enough.)
torch.set_grad_enabled(False)

from haystack_utils import get_mlp_activations
import haystack_utils

%reload_ext autoreload
%autoreload 2

In [18]:
# NOTE(review): clean_cache() presumably frees cached memory before the model is
# loaded -- confirm in haystack_utils.
haystack_utils.clean_cache()

# Load Pythia-160m with LayerNorm folded in and centered weights.
model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-160m",
    device=device,
    fold_ln=True,
    center_writing_weights=True,
    center_unembed=True,
)

# Keep only the first 200 examples of each language.
german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
english_data = haystack_utils.load_json_data("data/english_europarl.json")[:200]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


In [26]:
# Candidate German context neurons as [layer, neuron index, F1] triples.
german_neurons_with_f1 = [
    [5, 2649, 1.0],
    [8, 2994, 1.0],
    [11, 2911, 0.99],
    [10, 1129, 0.97],
    [6, 1838, 0.65],
    [7, 1594, 0.65],
    [11, 1819, 0.61],
    [11, 2014, 0.56],
    [10, 753, 0.54],
    [11, 205, 0.48],
]

# Group the neurons whose listed F1 is above 0.9 by layer.
important_german_neurons = defaultdict(list)
for layer, neuron, f1 in german_neurons_with_f1:
    if f1 <= 0.9:
        continue
    important_german_neurons[layer].append(neuron)

# Collect per-layer MLP activations on both corpora, once per distinct layer.
english_activations, german_activations = {}, {}
for layer in {entry[0] for entry in german_neurons_with_f1}:
    english_activations[layer] = get_mlp_activations(english_data, layer, model, mean=False)
    german_activations[layer] = get_mlp_activations(german_data, layer, model, mean=False)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [65]:
# Per layer, a list of (neuron, mean activation) pairs for every neuron in
# german_neurons_with_f1 (not only the ones with F1 > 0.9).
# "active" = mean over the German activations, "inactive" = mean over the English ones.
mean_context_neuron_acts_active = defaultdict(list)
mean_context_neuron_acts_inactive = defaultdict(list)
for layer, neuron, _ in german_neurons_with_f1:
    german_mean_act = german_activations[layer][:, neuron].mean(0)
    english_mean_act = english_activations[layer][:, neuron].mean(0)
    mean_context_neuron_acts_active[layer].append((neuron, german_mean_act))
    mean_context_neuron_acts_inactive[layer].append((neuron, english_mean_act))

def get_deactivate_neurons_hook(layer):
    """Build a forward hook that pins the tracked neurons of `layer` to their mean English activation.

    The hook reads `mean_context_neuron_acts_inactive[layer]` (a list of
    (neuron, mean activation) pairs) each time it fires and overwrites those
    neurons along the third axis of the hooked tensor, in place.

    Args:
        layer: Layer index used as the key into `mean_context_neuron_acts_inactive`.

    Returns:
        A `(value, hook) -> value` callable suitable for use as a forward hook.
    """
    def deactivate_neurons_hook(value, hook):
        neurons, acts = zip(*mean_context_neuron_acts_inactive[layer])
        # Move the patch to wherever the activations live instead of assuming CUDA,
        # so the hook also works when the notebook runs on CPU.
        value[:, :, list(neurons)] = torch.tensor(acts).to(value.device)
        return value
    return deactivate_neurons_hook
# One (hook point name, hook function) pair per layer that has an important German neuron.
deactivate_neurons_fwd_hooks = [
    (f'blocks.{layer}.mlp.hook_post', get_deactivate_neurons_hook(layer))
    for layer in important_german_neurons
]

def get_activate_neurons_hook(layer):
    """Build a forward hook that pins the tracked neurons of `layer` to their mean German activation.

    The hook reads `mean_context_neuron_acts_active[layer]` (a list of
    (neuron, mean activation) pairs) each time it fires and overwrites those
    neurons along the third axis of the hooked tensor, in place.

    Args:
        layer: Layer index used as the key into `mean_context_neuron_acts_active`.

    Returns:
        A `(value, hook) -> value` callable suitable for use as a forward hook.
    """
    def activate_neurons_hook(value, hook):
        # Use the *active* (German) means; reading the inactive table here would
        # make this hook identical to the deactivation hook.
        neurons, acts = zip(*mean_context_neuron_acts_active[layer])
        # Follow the device of the hooked activations rather than assuming CUDA.
        value[:, :, list(neurons)] = torch.tensor(acts).to(value.device)
        return value
    return activate_neurons_hook
# One (hook point name, hook function) pair per layer that has an important German neuron.
activate_neurons_fwd_hooks = [
    (f'blocks.{layer}.mlp.hook_post', get_activate_neurons_hook(layer))
    for layer in important_german_neurons
]

# NOTE(review): return values are not used in the cells visible here -- confirm
# their meaning in haystack_utils.get_weird_tokens.
all_ignore, not_ignore = haystack_utils.get_weird_tokens(model, plot_norms=False)

tensor(3.7031, device='cuda:0') tensor(4.0830, device='cuda:0')
10.260419845581055% loss increase


## Check classification accuracy of German neurons

In [22]:
def run_single_neuron_lr(layer, neuron, num_samples=5000, german_activations=german_activations, english_activations=english_activations, random_state=None):
    """Fit a one-feature logistic regression that separates German from English activations.

    Args:
        layer: Key into the activation dicts.
        neuron: Column index of the neuron within the layer's activation tensor.
        num_samples: Maximum number of rows taken from each language.
        german_activations: Dict mapping layer -> activation tensor for German text (label 1).
        english_activations: Dict mapping layer -> activation tensor for English text (label 0).
        random_state: Optional seed forwarded to `train_test_split` for a reproducible
            split. The default (None) keeps the previous unseeded behaviour.

    Returns:
        Tuple `(train_acc, test_acc, f1)` where `f1` is computed on the held-out 20% split.
    """
    # Import explicitly: `import sklearn` alone does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import f1_score

    german_acts = german_activations[layer][:num_samples, neuron]
    english_acts = english_activations[layer][:num_samples, neuron]
    A = torch.concat([german_acts, english_acts]).view(-1, 1).cpu().numpy()
    # Size the labels from the rows actually sliced; a tensor with fewer than
    # `num_samples` rows would otherwise produce a feature/label length mismatch.
    y = torch.concat([torch.ones(len(german_acts)), torch.zeros(len(english_acts))]).cpu().numpy()
    A_train, A_test, y_train, y_test = train_test_split(A, y, test_size=0.2, random_state=random_state)
    lr_model = LogisticRegression()
    lr_model.fit(A_train, y_train)
    test_acc = lr_model.score(A_test, y_test)
    train_acc = lr_model.score(A_train, y_train)
    f1 = f1_score(y_test, lr_model.predict(A_test))
    return train_acc, test_acc, f1
    
def get_neuron_accuracy(layer, neuron, german_activations=german_activations, english_activations=english_activations, plot=False, print_f1s=True):
    """Report how well a single neuron separates German from English text.

    Args:
        layer: Key into the activation dicts.
        neuron: Column index of the neuron within the layer's activation tensor.
        german_activations: Dict mapping layer -> activation tensor for German text.
        english_activations: Dict mapping layer -> activation tensor for English text.
        plot: If True, draw a histogram comparing the two activation distributions.
        print_f1s: If True, print F1, train/test accuracy and the mean activations.

    Returns:
        The F1 score of the single-neuron logistic regression probe.
    """
    english_neuron_acts = english_activations[layer][:, neuron]
    german_neuron_acts = german_activations[layer][:, neuron]

    if plot:
        haystack_utils.two_histogram(english_neuron_acts, german_neuron_acts, "English", "German", "Activation", "Frequency", f"L{layer}N{neuron} activations on English vs German text")
    # Forward the activation dicts that were passed in; otherwise the probe would
    # silently fall back to the defaults captured when run_single_neuron_lr was defined.
    train_acc, test_acc, f1 = run_single_neuron_lr(
        layer,
        neuron,
        german_activations=german_activations,
        english_activations=english_activations,
    )
    if print_f1s:
        print(f"\nL{layer}N{neuron}: F1={f1:.2f}, Train acc={train_acc:.2f}, and test acc={test_acc:.2f}")
        print(f"Mean activation English={english_neuron_acts.mean():.2f}, German={german_neuron_acts.mean():.2f}")
    return f1

In [23]:
# Probe every listed neuron on its own; the F1 values stored in the table are not used here.
f1s = [get_neuron_accuracy(layer, neuron) for layer, neuron, _ in german_neurons_with_f1]

german_neuron_names = ["L{}N{}".format(layer, neuron) for layer, neuron, _ in german_neurons_with_f1]
haystack_utils.line(
    f1s,
    xlabel="",
    ylabel="F1 score of sparse probe",
    title="Sparse probe performance on individual German neurons",
    xticks=german_neuron_names,
    show_legend=False,
)


L5N2649: F1=0.97, Train acc=0.97, and test acc=0.97
Mean activation English=-0.07, German=2.39

L8N2994: F1=0.98, Train acc=0.98, and test acc=0.98
Mean activation English=-0.06, German=4.11

L11N2911: F1=0.79, Train acc=0.76, and test acc=0.75
Mean activation English=1.04, German=0.02

L10N1129: F1=0.65, Train acc=0.64, and test acc=0.63
Mean activation English=2.56, German=1.50

L6N1838: F1=0.95, Train acc=0.96, and test acc=0.95
Mean activation English=-0.09, German=1.59

L7N1594: F1=0.87, Train acc=0.88, and test acc=0.88
Mean activation English=-0.07, German=1.15

L11N1819: F1=0.62, Train acc=0.63, and test acc=0.62
Mean activation English=7.25, German=5.39

L11N2014: F1=0.70, Train acc=0.71, and test acc=0.70
Mean activation English=13.38, German=7.21

L10N753: F1=0.65, Train acc=0.64, and test acc=0.65
Mean activation English=11.94, German=8.32

L11N205: F1=0.53, Train acc=0.53, and test acc=0.52
Mean activation English=12.38, German=10.80


In [24]:
# Snapshot of each neuron's German activations, keyed by "<layer>_<neuron>".
# `tensor[:, neuron]` is a view into german_activations, so it must be cloned:
# the ablation below overwrites those columns in place, and an un-cloned view
# would be overwritten with them, leaving nothing to restore from.
enabled_context_neuron_acts = {
    str(layer) + '_' + str(neuron): german_activations[layer][:, neuron].clone()
    for layer, neuron, _ in german_neurons_with_f1
}

def disable_other_context_neurons(german_activations, current_layer, current_neuron):
    """Leave one context neuron "on" and replace all other listed neurons with English activations.

    The dict's tensors are modified in place and the same dict is returned.
    NOTE: restoring the current neuron only works if `enabled_context_neuron_acts`
    holds independent copies of the original columns, not views into
    `german_activations`.

    Args:
        german_activations: Dict mapping layer -> activation tensor (rows x neurons).
        current_layer: Layer of the neuron to keep at its original German activations.
        current_neuron: Index of that neuron within the layer.

    Returns:
        The (mutated) `german_activations` dict.
    """
    for layer, neuron, _ in german_neurons_with_f1:
        if layer == current_layer and neuron == current_neuron:
            german_activations[layer][:, neuron] = enabled_context_neuron_acts[str(layer) + '_' + str(neuron)]
        else:
            # Not a perfect ablation: only the rows both tensors have are replaced,
            # so any extra German rows keep their current values. Clamping to the
            # shorter length also avoids a shape mismatch when the English tensor
            # has more rows than the German one.
            english_column = english_activations[layer][:, neuron]
            num_rows = min(german_activations[layer].shape[0], english_column.shape[0])
            german_activations[layer][:num_rows, neuron] = english_column[:num_rows]
    return german_activations


# Re-run the single-neuron probe with every other listed context neuron replaced
# by English activations. german_activations is modified in place on each pass.
f1s = []
for layer, neuron, _ in german_neurons_with_f1:
    german_activations = disable_other_context_neurons(german_activations, layer, neuron)
    f1s.append(
        get_neuron_accuracy(
            layer,
            neuron,
            german_activations=german_activations,
            english_activations=english_activations,
            plot=False,
            print_f1s=False,
        )
    )

german_neuron_names = ["L{}N{}".format(layer, neuron) for layer, neuron, _ in german_neurons_with_f1]
haystack_utils.line(
    f1s,
    xlabel="",
    ylabel="F1 score of sparse probe",
    title="Sparse probe performance on individual German neurons",
    xticks=german_neuron_names,
    show_legend=False,
)

In [54]:
# Full ablation: compare the loss on the first 20 German examples with and
# without the deactivation hooks attached.
haystack_utils.clean_cache()
original_loss = model(german_data[:20], return_type='loss')
with model.hooks(deactivate_neurons_fwd_hooks):
    ablated_loss = model(german_data[:20], return_type='loss')
print(original_loss, ablated_loss)
loss_increase_pct = (ablated_loss - original_loss) / original_loss * 100
print(f'{loss_increase_pct}% loss increase')

[(2649, tensor(-0.0741, device='cuda:0'))]
[(2994, tensor(-0.0600, device='cuda:0'))]
[(1129, tensor(2.5618, device='cuda:0'))]
[(2911, tensor(1.0401, device='cuda:0'))]
tensor(3.7031, device='cuda:0') tensor(4.0326, device='cuda:0')
8.897250175476074% loss increase
