In [9]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer, ActivationCache, utils, patching
from jaxtyping import Float, Int, Bool
from torch import Tensor
from tqdm.auto import tqdm
import plotly.io as pio
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import numpy as np
import plotly.express as px
from torchmetrics.regression import KendallRankCorrCoef, SpearmanCorrCoef
from collections import defaultdict

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Plotly renderer used for figures in this notebook.
pio.renderers.default = "notebook_connected+notebook"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# This notebook only runs inference, so autograd is switched off globally.
# torch.set_grad_enabled is the re-export of torch.autograd.set_grad_enabled,
# so a single call is sufficient.
torch.set_grad_enabled(False)

from haystack_utils import get_mlp_activations
import haystack_utils

%reload_ext autoreload
%autoreload 2

In [12]:
# Load Pythia-160m through TransformerLens with LayerNorm folding and
# centering of the writing weights and the unembedding enabled.
model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-160m",
    fold_ln=True,
    center_writing_weights=True,
    center_unembed=True,
    device=device,
)

# Keep only the first 200 entries of each Europarl file.
german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
english_data = haystack_utils.load_json_data("data/english_europarl.json")[:200]

# Candidate German-detecting MLP neurons, one [layer, neuron index, F1] row each.
# NOTE(review): the F1 values are hard-coded; where they were measured is not shown here.
german_neurons_with_f1 = [
    [5, 2649, 1.0],
    [8, 2994, 1.0],
    [11, 2911, 0.99],
    [10, 1129, 0.97],
    [6, 1838, 0.65],
    [7, 1594, 0.65],
    [11, 1819, 0.61],
    [11, 2014, 0.56],
    [10, 753, 0.54],
    [11, 205, 0.48],
]

# Group the neurons whose listed F1 is above 0.9 by layer: {layer: [neuron, ...]}.
important_german_neurons = defaultdict(list)
for layer, neuron, f1 in german_neurons_with_f1:
    if f1 <= 0.9:
        continue
    important_german_neurons[layer].append(neuron)

# Cache the MLP activations (mean=False, so presumably not averaged over the
# dataset — confirm in haystack_utils) of every layer that hosts a candidate neuron.
# Several candidate rows share a layer (layer 11 appears four times, layer 10 twice),
# so the layers are de-duplicated first; dict.fromkeys keeps first-seen order.
# Without this, the model is re-run over both datasets for layers already cached.
english_activations = {}
german_activations = {}
for layer in dict.fromkeys(layer for layer, _, _ in german_neurons_with_f1):
    english_activations[layer] = get_mlp_activations(english_data, layer, model, mean=False)
    german_activations[layer] = get_mlp_activations(german_data, layer, model, mean=False)

# Reference activation levels for the important neurons of each layer:
# "active" is their average over the German activations, "inactive" their
# average over the English activations.
# NOTE(review): .mean() reduces over every selected element, so a layer with
# several important neurons would get one pooled scalar rather than a
# per-neuron mean. Every layer currently holds a single important neuron, so
# the two coincide today — revisit before adding more neurons per layer.
mean_context_neuron_acts_active = {}
mean_context_neuron_acts_inactive = {}
for layer in important_german_neurons:
    neurons = important_german_neurons[layer]
    mean_context_neuron_acts_inactive[layer] = english_activations[layer][:, neurons].mean()
    mean_context_neuron_acts_active[layer] = german_activations[layer][:, neurons].mean()

def get_deactivate_neurons_hook(layer):
    """Build a forward hook that overwrites `layer`'s important German neurons
    with their mean activation on English text (the "inactive" level).

    The neuron indices and the replacement value are looked up in the
    module-level dicts each time the hook fires.
    """
    def deactivate_neurons_hook(value, hook):
        neuron_indices = important_german_neurons[layer]
        # In-place write into the activation tensor handed to the hook.
        value[:, :, neuron_indices] = mean_context_neuron_acts_inactive[layer]
        return value
    return deactivate_neurons_hook

# One (hook point name, hook function) pair per layer with important neurons.
deactivate_neurons_fwd_hooks = [
    (f'blocks.{layer}.mlp.hook_post', get_deactivate_neurons_hook(layer))
    for layer in important_german_neurons
]

def get_activate_neurons_hook(layer):
    """Build a forward hook that overwrites `layer`'s important German neurons
    with their mean activation on German text (the "active" level).

    The neuron indices and the replacement value are looked up in the
    module-level dicts each time the hook fires.
    """
    def activate_neurons_hook(value, hook):
        neuron_indices = important_german_neurons[layer]
        # In-place write into the activation tensor handed to the hook.
        value[:, :, neuron_indices] = mean_context_neuron_acts_active[layer]
        return value
    return activate_neurons_hook

# One (hook point name, hook function) pair per layer with important neurons.
activate_neurons_fwd_hooks = [
    (f'blocks.{layer}.mlp.hook_post', get_activate_neurons_hook(layer))
    for layer in important_german_neurons
]

# NOTE(review): presumably splits the vocabulary into tokens to ignore vs. tokens to keep,
# with the diagnostic norm plot suppressed — confirm against haystack_utils.get_weird_tokens.
all_ignore, not_ignore = haystack_utils.get_weird_tokens(model, plot_norms=False)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

## Check classification accuracy of German neurons

In [10]:
def run_single_neuron_lr(layer, neuron, num_samples=5000, random_state=None):
    """Fit a one-feature logistic-regression probe that separates German from English.

    The single feature is the activation of `neuron` in `layer`, read from the
    module-level `german_activations` / `english_activations` caches. German
    rows are labelled 1 and English rows 0.

    Args:
        layer: Layer key into the activation caches.
        neuron: Neuron index within that layer.
        num_samples: Maximum number of leading rows taken from each language.
        random_state: Optional seed forwarded to `train_test_split`. The default
            `None` keeps the split random; pass an int for reproducible scores.

    Returns:
        Tuple `(train_acc, test_acc, f1)`, where accuracies come from
        `LogisticRegression.score` and F1 is computed on the held-out 20%.
    """
    # Imported here because a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import f1_score

    german_acts = german_activations[layer][:num_samples, neuron]
    english_acts = english_activations[layer][:num_samples, neuron]
    A = torch.concat([german_acts, english_acts]).view(-1, 1).cpu().numpy()
    # Size the labels from the rows actually obtained: slicing silently yields
    # fewer than `num_samples` rows when a cache is shorter, and labels sized
    # by `num_samples` would then no longer line up with the features.
    y = torch.concat([
        torch.ones(german_acts.shape[0]),
        torch.zeros(english_acts.shape[0]),
    ]).cpu().numpy()

    A_train, A_test, y_train, y_test = train_test_split(A, y, test_size=0.2, random_state=random_state)
    lr_model = LogisticRegression()
    lr_model.fit(A_train, y_train)
    test_acc = lr_model.score(A_test, y_test)
    train_acc = lr_model.score(A_train, y_train)
    f1 = f1_score(y_test, lr_model.predict(A_test))
    return train_acc, test_acc, f1
    
def get_neuron_accuracy(layer, neuron, plot=False):
    """Report how well a single neuron separates German from English text.

    Prints the probe's F1 / train accuracy / test accuracy together with the
    neuron's mean activation on each language. Optionally draws the two
    activation histograms first.

    Args:
        layer: Layer key into the module-level activation caches.
        neuron: Neuron index within that layer.
        plot: When true, show an English-vs-German activation histogram.

    Returns:
        The F1 score returned by `run_single_neuron_lr`.
    """
    english_acts = english_activations[layer][:, neuron]
    german_acts = german_activations[layer][:, neuron]
    mean_english_activation = english_acts.mean()
    mean_german_activation = german_acts.mean()

    if plot:
        haystack_utils.two_histogram(
            english_acts,
            german_acts,
            "English",
            "German",
            "Activation",
            "Frequency",
            f"L{layer}N{neuron} activations on English vs German text",
        )

    train_acc, test_acc, f1 = run_single_neuron_lr(layer, neuron)
    print(f"\nL{layer}N{neuron}: F1={f1:.2f}, Train acc={train_acc:.2f}, and test acc={test_acc:.2f}")
    print(f"Mean activation English={mean_english_activation:.2f}, German={mean_german_activation:.2f}")
    return f1

In [11]:
# Probe every candidate neuron in list order; the F1 stored alongside each
# row is not used here, only the freshly measured one is collected.
f1s = [get_neuron_accuracy(layer, neuron) for layer, neuron, _ in german_neurons_with_f1]

# Tick labels of the form "L<layer>N<neuron>", in the same order as `f1s`.
german_neuron_names = [f"L{layer}N{neuron}" for layer, neuron, _ in german_neurons_with_f1]
haystack_utils.line(
    f1s,
    xlabel="",
    ylabel="F1 score of sparse probe",
    title="Sparse probe performance on individual German neurons",
    xticks=german_neuron_names,
    show_legend=False,
)


L5N2649: F1=0.97, Train acc=0.97, and test acc=0.97
Mean activation English=-0.07, German=2.39

L8N2994: F1=0.98, Train acc=0.98, and test acc=0.98
Mean activation English=-0.06, German=4.11

L11N2911: F1=0.80, Train acc=0.76, and test acc=0.76
Mean activation English=1.04, German=0.02

L10N1129: F1=0.67, Train acc=0.64, and test acc=0.64
Mean activation English=2.56, German=1.50


KeyError: 6

In [None]:
def get_mlp_loss_difference(prompts: list[str], model: HookedTransformer, layer=5, neuron=1336, shift=0, weight_by_logprob=False, log_prob_weight=0):
    """Measure, per prompt, the largest per-token loss change caused by shifting one MLP neuron.

    Each prompt is run twice with the module-level `activate_neurons_fwd_hooks`
    applied. The "shifted" run additionally adds `shift` to `neuron` at
    `blocks.{layer}.mlp.hook_pre`. The per-position difference
    `shifted_loss - baseline_loss` is then reduced with a max over positions.

    Args:
        prompts: Text prompts, processed one at a time.
        model: Model providing `hooks`, `to_tokens` and a forward call that
            accepts `return_type="both"` and `loss_per_token=True`.
        layer: Layer of the neuron to shift.
        neuron: Index of the neuron to shift.
        shift: Amount added to the neuron at `hook_pre`; 0 leaves it unchanged.
        weight_by_logprob: When true, add `log_prob_weight` times the larger of
            the two runs' log-probabilities of the correct next token to each
            position's loss difference before taking the max.
        log_prob_weight: Weight of that log-probability term.

    Returns:
        A list with one float per prompt: the maximum (optionally
        log-prob-adjusted) loss difference over positions.
    """
    def shift_mlp_neuron(value, hook):
        # In-place add on the tensor handed to the hook.
        value[:, :, neuron] += shift
        return value

    # The hook list does not depend on the prompt, so build it once.
    shifted_fwd_hooks = activate_neurons_fwd_hooks + [(f'blocks.{layer}.mlp.hook_pre', shift_mlp_neuron)]

    loss_differences = []
    for prompt in tqdm(prompts):
        with model.hooks(fwd_hooks=shifted_fwd_hooks):
            shifted_logits, shifted_loss = model(prompt, return_type="both", loss_per_token=True)
        with model.hooks(fwd_hooks=activate_neurons_fwd_hooks):
            baseline_logits, baseline_loss = model(prompt, return_type="both", loss_per_token=True)

        # Shape: pos
        diff_by_pos = (shifted_loss - baseline_loss).flatten()

        if weight_by_logprob:
            # Correct next token for every position except the last; shape pos x 1.
            answer_tokens = model.to_tokens(prompt)[0, 1:].unsqueeze(-1)
            # batch pos d_vocab -> pos. gather picks each position's answer
            # token and keeps the index on the same device as the logits.
            shifted_answer_logprobs = shifted_logits.log_softmax(-1)[0, :-1, :].gather(-1, answer_tokens).squeeze(-1)
            baseline_answer_logprobs = baseline_logits.log_softmax(-1)[0, :-1, :].gather(-1, answer_tokens).squeeze(-1)
            max_answer_logprob = torch.maximum(shifted_answer_logprobs, baseline_answer_logprobs)
            diff_by_pos = diff_by_pos + max_answer_logprob * log_prob_weight

        loss_differences.append(diff_by_pos.max().item())

    return loss_differences