In [1]:
import torch
import numpy as np
from torch import einsum
from tqdm.auto import tqdm
import seaborn as sns
from transformer_lens import HookedTransformer, ActivationCache, utils
from datasets import load_dataset
from einops import einsum
import pandas as pd
from transformer_lens import utils
from rich.table import Table, Column
from rich import print as rprint
from jaxtyping import Float, Int, Bool
from torch import Tensor
import einops
import functools
from transformer_lens.hook_points import HookPoint
# import circuitsvis
from IPython.display import HTML
from plotly.express import line
import plotly.express as px
from tqdm.auto import tqdm
import json
import gc
import plotly.graph_objects as go

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from plotly.subplots import make_subplots
# Plotly needs a different renderer for VSCode/Notebooks vs Colab argh
import plotly.io as pio
pio.renderers.default = "colab+vscode"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

from haystack_utils import load_txt_data, get_mlp_activations, line, two_histogram
import haystack_utils

%reload_ext autoreload
%autoreload 2

## Set up model, data, and deactivate German neuron hook

In [2]:
english_neurons = [(5, 395), (5, 166), (5, 908), (5, 285), (3, 862), (5, 73), (4, 896), (5, 348), (5, 297), (3, 1204)]
german_neurons = [(4, 482), (5, 1039), (5, 407), (5, 1516), (5, 1336), (4, 326), (5, 250), (3, 669)]
french_neurons = [(5, 112), (4, 1080), (5, 1293), (5, 455), (5, 5), (5, 1901), (5, 486), (4, 975)]

model = HookedTransformer.from_pretrained("EleutherAI/pythia-70m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device)

english_data = haystack_utils.load_txt_data("kde4_english.txt")
german_data = haystack_utils.load_txt_data("wmt_german_large.txt")

english_activations = {}
german_activations = {}
for layer in range(3, 6):
    english_activations[layer] = get_mlp_activations(english_data[:200], layer, model, mean=False)
    german_activations[layer] = get_mlp_activations(german_data[:200], layer, model, mean=False)

LAYER_TO_ABLATE = 3
NEURONS_TO_ABLATE = [669]
MEAN_ACTIVATION_ACTIVE = german_activations[LAYER_TO_ABLATE][:, NEURONS_TO_ABLATE].mean()
MEAN_ACTIVATION_INACTIVE = english_activations[LAYER_TO_ABLATE][:, NEURONS_TO_ABLATE].mean()

def deactivate_neurons_hook(value, hook):
    value[:, :, NEURONS_TO_ABLATE] = MEAN_ACTIVATION_INACTIVE
    return value
deactivate_neurons_fwd_hooks=[(f'blocks.{LAYER_TO_ABLATE}.mlp.hook_post', deactivate_neurons_hook)]

def activate_neurons_hook(value, hook):
    value[:, :, NEURONS_TO_ABLATE] = MEAN_ACTIVATION_ACTIVE
    return value
activate_neurons_fwd_hooks=[(f'blocks.{LAYER_TO_ABLATE}.mlp.hook_post', activate_neurons_hook)]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
kde4_english.txt: Loaded 1007 examples with 501 to 5295 characters each.
wmt_german_large.txt: Loaded 2459 examples with 800 to 2000 characters each.


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

## Check classification accuracy of German neurons

Sanity check: reproduce sparse probe results on the German neuron with Pythia-v1

In [None]:
def run_single_neuron_lr(layer, neuron, num_samples=5000):
    # Check accuracy of logistic regression
    A = torch.concat([german_activations[layer][:num_samples, neuron], english_activations[layer][:num_samples, neuron]]).view(-1, 1).cpu().numpy()
    y = torch.concat([torch.ones(num_samples), torch.zeros(num_samples)]).cpu().numpy()
    A_train, A_test, y_train, y_test = train_test_split(A, y, test_size=0.2)
    lr_model = LogisticRegression()
    lr_model.fit(A_train, y_train)
    test_acc = lr_model.score(A_test, y_test)
    train_acc = lr_model.score(A_train, y_train)
    f1 = sklearn.metrics.f1_score(y_test, lr_model.predict(A_test))
    return train_acc, test_acc, f1
    

def get_neuron_accuracy(layer, neuron, plot=False):
    mean_english_activation = english_activations[layer][:,neuron].mean()
    mean_german_activation = german_activations[layer][:,neuron].mean()
    
    if plot:
        two_histogram(english_activations[layer][:,neuron], german_activations[layer][:,neuron], "English", "German", "Activation", "Frequency", f"L{layer}N{neuron} activations on English vs German text")
    train_acc, test_acc, f1 = run_single_neuron_lr(layer, neuron)
    print(f"\nL{layer}N{neuron}: F1={f1:.2f}, Train acc={train_acc:.2f}, and test acc={test_acc:.2f}")
    print(f"Mean activation English={mean_english_activation:.2f}, German={mean_german_activation:.2f}")


In [None]:
for layer, neuron in german_neurons:
    get_neuron_accuracy(layer, neuron)


L4N482: F1=0.90, Train acc=0.91, and test acc=0.91
Mean activation English=-0.07, German=1.21

L5N1039: F1=0.84, Train acc=0.84, and test acc=0.83
Mean activation English=1.02, German=-0.06

L5N407: F1=0.63, Train acc=0.64, and test acc=0.63
Mean activation English=5.23, German=3.70

L5N1516: F1=0.78, Train acc=0.76, and test acc=0.78
Mean activation English=2.31, German=1.02

L5N1336: F1=0.95, Train acc=0.96, and test acc=0.96
Mean activation English=-0.06, German=1.40

L4N326: F1=0.81, Train acc=0.83, and test acc=0.82
Mean activation English=0.03, German=0.81

L5N250: F1=0.77, Train acc=0.77, and test acc=0.78
Mean activation English=-0.00, German=-0.04

L3N669: F1=0.99, Train acc=0.99, and test acc=0.99
Mean activation English=-0.07, German=3.82


## Check loss increase from disabling each German neuron on German data

In [None]:
mean_original_loss, mean_ablated_loss, percent_increase = haystack_utils.get_ablated_performance(german_data[:1000], model, deactivate_neurons_fwd_hooks)
print(f"Mean original loss={mean_original_loss:.2f}, mean ablated loss={mean_ablated_loss:.2f}, percent increase={percent_increase:.2f}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Mean original loss=3.18, mean ablated loss=3.55, percent increase=11.57


In [None]:
def get_deactivate_single_neuron_hook(layer, neuron, english_activations):
    def deactivate_single_neuron_hook(value, hook):
        value[:, :, neuron] = english_activations[layer][:, neuron].mean()
        return value
    return deactivate_single_neuron_hook

In [None]:
# Loss increase from ablating a single neuron. Measures how useful each neuron is when all others are enabled (total effect).
for layer, neuron in tqdm(german_neurons):
    fwd_hooks=[(f'blocks.{layer}.mlp.hook_post', get_deactivate_single_neuron_hook(layer, neuron, english_activations))]
    mean_original_loss, mean_ablated_loss, percent_increase = haystack_utils.get_ablated_performance(german_data[:1000], model, fwd_hooks, display_tqdm=False)
    print(f"L{layer}N{neuron}: Loss original={mean_original_loss:.2f}, ablated={mean_ablated_loss:.2f} (+{percent_increase:.2f}%)")

  0%|          | 0/8 [00:00<?, ?it/s]

L4N482: Loss original=3.18, ablated=3.20 (+0.60%)
L5N1039: Loss original=3.18, ablated=3.18 (+0.16%)
L5N407: Loss original=3.18, ablated=3.19 (+0.22%)
L5N1516: Loss original=3.18, ablated=3.18 (+0.09%)
L5N1336: Loss original=3.18, ablated=3.20 (+0.59%)
L4N326: Loss original=3.18, ablated=3.18 (+0.12%)
L5N250: Loss original=3.18, ablated=3.18 (+0.01%)
L3N669: Loss original=3.18, ablated=3.55 (+11.57%)


The huge loss increase when L3N669 is disabled implies either that the other neurons aren't backups for L3N669, or the components in layer 4 that can't read from later layer neurons are significant. 

TODO: Correlational analyses of these neurons would let us narrow down whether they just activate when L3N669 does (downstream neurons with a different function specific to German) or if they're also independent German-detectors.
We could also compare loss from disabling all context neurons, with loss from enabling just one of the minor context neurons. If the loss decrease is greater than 0.6%, we can say that it acts as a backup for other context neurons.

In [None]:
# Sanity check: also calculate loss on English
print("English loss impact (should be close to 0)")
for layer, neuron in tqdm(german_neurons):
    fwd_hooks=[(f'blocks.{layer}.mlp.hook_post', get_deactivate_single_neuron_hook(layer, neuron, english_activations))]
    mean_original_loss, mean_ablated_loss, percent_increase = haystack_utils.get_ablated_performance(english_data[:1000], model, fwd_hooks, display_tqdm=False)
    print(f"L{layer}N{neuron}: Loss original={mean_original_loss:.2f}, ablated={mean_ablated_loss:.2f} (+{percent_increase:.2f}%)")

English loss impact (should be close to 0)


  0%|          | 0/8 [00:00<?, ?it/s]

L4N482: Loss original=3.96, ablated=3.96 (+0.02%)
L5N1039: Loss original=3.96, ablated=3.96 (+0.09%)
L5N407: Loss original=3.96, ablated=3.96 (+0.02%)
L5N1516: Loss original=3.96, ablated=3.95 (+-0.01%)
L5N1336: Loss original=3.96, ablated=3.96 (+-0.00%)
L4N326: Loss original=3.96, ablated=3.96 (+-0.00%)
L5N250: Loss original=3.96, ablated=3.95 (+-0.02%)
L3N669: Loss original=3.96, ablated=3.96 (+0.00%)


In [None]:
# Ablating all German neurons except the L3 one increases loss by 1.98%, a small increase from the sum of individual ablation losses - 1.79%. 
# This implies the neurons are partially acting as backups for each other.
german_neurons_not_l3 = [(4, 482), (5, 1039), (5, 407), (5, 1516), (5, 1336), (4, 326), (5, 250)]

def all_except_l3(value, hook):
    layer = hook.layer()
    layer_neurons = [neuron for layer, neuron in german_neurons_not_l3 if layer == layer]
    value[:, :, layer_neurons] = english_activations[layer][:, layer_neurons].mean()
    return value

fwd_hooks=[(f'blocks.4.mlp.hook_post', all_except_l3), (f'blocks.5.mlp.hook_post', all_except_l3)]
mean_original_loss, mean_ablated_loss, percent_increase = haystack_utils.get_ablated_performance(german_data[:1000], model, fwd_hooks, display_tqdm=False)
print(f"L{layer}N{neuron}: Loss original on German data={mean_original_loss:.2f}, ablated={mean_ablated_loss:.2f} (+{percent_increase:.2f}%)")
mean_original_loss, mean_ablated_loss, percent_increase = haystack_utils.get_ablated_performance(english_data[:1000], model, fwd_hooks, display_tqdm=False)
print(f"L{layer}N{neuron}: Loss original on English data={mean_original_loss:.2f}, ablated={mean_ablated_loss:.2f} (+{percent_increase:.2f}%)")
# TODO To determine if a neuron is a backup for another neuron, ablate them each individually then together and see if the loss increase is greater than the sum of the individual losses. %_backup_B = AB - A - B / B

L3N669: Loss original on German data=3.18, ablated=3.24 (+1.98%)
L3N669: Loss original on English data=3.96, ablated=3.98 (+0.69%)


## Check DLA difference and loss breakdown by component for L3N669

In [None]:
def deactivate_neurons_hook(value, hook):
        value[:, :, NEURONS_TO_ABLATE] = MEAN_ACTIVATION_INACTIVE
        return value

def activate_neuron_hook(value, hook):
        value[:, :, NEURONS_TO_ABLATE] = MEAN_ACTIVATION_ACTIVE
        return value

ABLATE_HOOK=[(f'blocks.{LAYER_TO_ABLATE}.mlp.hook_post', deactivate_neurons_hook)]
ACTIVATE_HOOK=[(f'blocks.{LAYER_TO_ABLATE}.mlp.hook_post', activate_neuron_hook)]


In [None]:
logit_attr_original, labels = haystack_utils.DLA(german_data[:1000], model)

# Patch in disabled context neurons and plot the direct logit attribution difference for each component
with model.hooks(fwd_hooks=ABLATE_HOOK):
    logit_attr_ablated, _ = haystack_utils.DLA(german_data[:1000], model)

logit_diffs = (logit_attr_original - logit_attr_ablated).mean(0)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# The small differences accumulated before the ablation are due to the final layer norm scale being affected by the L3 hook.
haystack_utils.line(logit_diffs.cpu().numpy(), xlabel="Correct logit", ylabel="", title="(Original DLA - Ablated DLA) per component", xticks=labels)

In [None]:
logits, cache = model.run_with_cache("Test prompt")
cache

ActivationCache with keys ['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'bloc

In [None]:
# Loss increase from patching in components from the forward pass with disabled German context neuron.
# Does not include the effect of the patched component on other components.
# Does include the effect of the pre-patched component on other components.
# Does include the contribution of the pre-patched component to the layer normalizations of the residual stream.
component_names = ['embed', '0_attn_out', '0_mlp_out', '1_attn_out', '1_mlp_out', '2_attn_out', '2_mlp_out', '3_attn_out', '3_mlp_out', '4_attn_out', '4_mlp_out', '5_attn_out', '5_mlp_out']
components = []
losses = []
for later_component in range(8, 13):
    print(f"Component: {component_names[later_component]}")
    original_loss, patched_loss = haystack_utils.get_direct_loss_increase_for_component(german_data[:1000], model, fwd_hooks=deactivate_neurons_fwd_hooks, patched_component=later_component, disable_progress_bar=True)
    if len(losses) == 0:
        components.append("Original loss")
        losses.append(original_loss)
    components.append(component_names[later_component])
    losses.append(patched_loss)

Component: 3_mlp_out
Original loss: 3.18, patched loss: 3.34 (+4.96%)
Component: 4_attn_out
Original loss: 3.18, patched loss: 3.19 (+0.39%)
Component: 4_mlp_out
Original loss: 3.18, patched loss: 3.28 (+3.17%)
Component: 5_attn_out
Original loss: 3.18, patched loss: 3.21 (+0.86%)
Component: 5_mlp_out
Original loss: 3.18, patched loss: 3.43 (+7.93%)


In [None]:
percent_increase = ((np.array(losses) - losses[0]) / losses[0]) * 100
haystack_utils.line(losses, xlabel="Component", ylabel="Loss", title="Loss of patching individual components when ablating L3N669", xticks=components, width=800, hover_data=percent_increase.tolist())

## Tokens boosted by L3 directly

In [None]:
def unembed_residual(cache, layer, apply_ln=True):
    name = f'blocks.{layer}.hook_resid_post'
    if apply_ln:
        residual = model.ln_final(cache[name])
    else:
        residual = cache[name]
    logits = model.unembed(residual)
    return logits

In [None]:
# Check if it is done correctly
logits_original, cache_original = model.run_with_cache(german_data[:1])
final_residual_unembed = unembed_residual(cache_original, 5)
torch.testing.assert_close(final_residual_unembed, logits_original)

In [None]:
def get_unembed_differences(prompts: list[str], model):
    per_token_differences = torch.zeros(model.cfg.d_vocab).to(device)
    for prompt in tqdm(prompts):
        # Set context neuron to activated value
        with model.hooks(fwd_hooks=ACTIVATE_HOOK):
            _, cache_original = model.run_with_cache(prompt)
        # Ablate context neuron
        with model.hooks(fwd_hooks=ABLATE_HOOK):
            _, cache_ablated = model.run_with_cache(prompt)
        
        # Get normalized MLP output
        name = f'blocks.{LAYER_TO_ABLATE}.hook_resid_post'
        manual_unembed_original = unembed_residual(cache_original, LAYER_TO_ABLATE)
        manual_unembed_ablated = unembed_residual(cache_ablated, LAYER_TO_ABLATE)
        
        # Shape batch pos d_vocab
        logit_difference = (manual_unembed_ablated - manual_unembed_original).mean((0, 1))
        per_token_differences += logit_difference
    
    return per_token_differences / len(prompts)


In [None]:
token_differences = get_unembed_differences(german_data[:1000], model)
boosted_values, boosted_tokens = torch.topk(token_differences, 1000)
inhibited_values, inhibited_tokens = torch.topk(token_differences, 1000, largest=False)
boosted_labels = model.to_str_tokens(boosted_tokens)
inhibited_labels = model.to_str_tokens(inhibited_tokens)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
px.histogram(token_differences.cpu().numpy(), title="Histogram of L3 logit difference between original and ablated model", labels={"value": "Logit difference"})

In [None]:
num_tokens = 100
line(boosted_values.cpu().numpy()[:num_tokens], xlabel="Token", ylabel="Logit increase from context neuron", xticks=boosted_labels[:num_tokens], title=f"Top boosted tokens from L3N669", width=1100)


In [None]:
num_tokens = 100
stripped_labels=[x.strip() for x in inhibited_labels[:num_tokens]]
line(inhibited_values.cpu().numpy()[:num_tokens], xlabel="Token", ylabel="Logit increase from context neuron", xticks=stripped_labels, title=f"Top inhibited tokens from L3N669", width=1000)


In [None]:
# Unembed neuron direction directly

# Only works for individual neurons
# Shape batch pos d_resid
neuron_weight = model.W_out[LAYER_TO_ABLATE, NEURONS_TO_ABLATE].view(1, 1, -1)
neuron_direction_active = neuron_weight * MEAN_ACTIVATION_ACTIVE # Set German neuron to activated value (~3)
neuron_direction_inactive = neuron_weight * MEAN_ACTIVATION_INACTIVE # Set German neuron to disabled value (~0)

tokens_active = model.unembed(neuron_direction_active)
tokens_inactive = model.unembed(neuron_direction_inactive)
# Active: German neuron is active - we expect German tokens boosted
# Inactive: German neuron is inactive - we expect no boost to German tokens
# Active - Inactive: If the neuron boosts German tokens, we expect this to be positive
token_differences = (tokens_active - tokens_inactive).flatten()

boosted_values, boosted_tokens = torch.topk(token_differences, 1000)
inhibited_values, inhibited_tokens = torch.topk(token_differences, 1000, largest=False)
boosted_labels = model.to_str_tokens(boosted_tokens)
inhibited_labels = model.to_str_tokens(inhibited_tokens)


def filter_garbage_tokens(labels, values):
    """Remove meaningless and whitespace tokens"""
    garbage_characters = {"", "ÃÂÃÂÃÂÃÂ", "ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ", "ÃÂÃÂ", "��", "�", "14514500", "1451450014514500", "{¶"}
    non_whitespace_labels = []
    non_whitespace_values = []
    for label, value in zip(labels, values):
        stripped_label = label.strip()
        if not stripped_label in garbage_characters:
            non_whitespace_labels.append(stripped_label)
            non_whitespace_values.append(value.item())
    return non_whitespace_labels, non_whitespace_values
non_whitespace_boosted_labels, non_whitespace_boosted_values = filter_garbage_tokens(boosted_labels, boosted_values)

#px.histogram(token_differences.cpu().numpy(), title="Histogram of L3N669 direct logit difference between original and ablated model", labels={"value": "Logit difference"})

num_tokens = 100
stripped_labels=[x.strip() for x in inhibited_labels[:num_tokens]]
line(non_whitespace_boosted_values[:num_tokens], xlabel="Token", ylabel="Logit increase from context neuron", xticks=non_whitespace_boosted_labels[:num_tokens], title=f"Top boosted tokens from L3N669", width=1100)
line(inhibited_values.cpu().numpy()[:num_tokens], xlabel="Token", ylabel="Logit increase from context neuron", xticks=stripped_labels, title=f"Top boosted tokens from L3N669", width=1100)


## Get German unigram statistics

In [None]:
# Get top German unigrams
def count_token_occurrences(prompts: list[str]):
    token_counts = torch.zeros(model.cfg.d_vocab).to(device)
    for prompt in tqdm(prompts):
        # Remove BOS
        tokens = model.to_tokens(prompt).flatten()[1:]
        token_counts[tokens] += 1
    return token_counts

In [None]:
german_unigram_counts = count_token_occurrences(german_data)
german_unigram_highest_counts, german_unigram_tokens = torch.topk(german_unigram_counts, 100)
german_unigram_labels = model.to_str_tokens(german_unigram_tokens)

num_tokens = 100
line(german_unigram_highest_counts.cpu().numpy()[:num_tokens], 
     xlabel="Token", ylabel="Counts", 
     xticks=german_unigram_labels[:num_tokens], 
     title=f"Top unigrams in WMT German data", 
     width=1100)


  0%|          | 0/2459 [00:00<?, ?it/s]

In [None]:
top_german_token_differences = token_differences[german_unigram_tokens]
print(top_german_token_differences.mean())
px.histogram(top_german_token_differences.cpu().numpy())

tensor(1.7974, device='cuda:0')


In [None]:
english_unigram_counts = count_token_occurrences(english_data)
english_unigram_highest_counts, english_unigram_tokens = torch.topk(english_unigram_counts, 100)
english_unigram_labels = model.to_str_tokens(english_unigram_tokens)

num_tokens = 100
line(english_unigram_highest_counts.cpu().numpy()[:num_tokens], 
     xlabel="Token", ylabel="Counts", 
     xticks=english_unigram_labels[:num_tokens], 
     title=f"Top unigrams in KDE English data", 
     width=1100)

  0%|          | 0/1007 [00:00<?, ?it/s]

In [None]:
top_english_token_differences = token_differences[english_unigram_tokens]
print(top_english_token_differences.mean())
px.histogram(top_english_token_differences.cpu().numpy(), title="Token differences for top English unigrams")

tensor(0.3300, device='cuda:0')


In [None]:
english_labels = model.to_str_tokens(english_unigram_tokens)
line(top_english_token_differences.cpu().numpy()[:num_tokens], 
     xlabel="Token", ylabel="Counts", 
     xticks=english_labels, 
     title=f"English token differences", 
     width=1100)

## (New): Trying to find interesting downstream neurons by activation difference

In [None]:
def compare_activations(prompts: list[str], model: HookedTransformer, layer=5):

    activation_differences = []
    for prompt in tqdm(prompts): 
        tokens = model.to_tokens(prompt)
        
        with model.hooks(fwd_hooks=activate_neurons_fwd_hooks):
            original_logits, original_cache = model.run_with_cache(tokens)

        with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
            ablated_logits, ablated_cache = model.run_with_cache(tokens)
        
        act_label = f"blocks.{layer}.mlp.hook_pre"
        
        activation_difference = original_cache[act_label] - ablated_cache[act_label]
        activation_difference = einops.rearrange(activation_difference, "batch pos d_mlp -> (batch pos) d_mlp")
        activation_differences.append(activation_difference)
    
    return torch.cat(activation_differences, dim=0)

In [None]:
def barplot(activations, std_deviations, xticks=None, title=""):
    assert len(activations) < 200, "Too many activations to plot"
    # Create a bar plot using Plotly
    fig = go.Figure()

    if xticks == None:
        xticks = np.arange(len(activations)).tolist()
        
    # Create the bar trace
    bars = go.Bar(
        x=np.arange(len(activations)),
        y=activations,
        error_y=dict(
            type='data',
            array=std_deviations,
            visible=True
        )
    )

    # Add the trace to the figure
    fig.add_trace(bars)

    # Set the layout of the figure
    fig.update_layout(
        title=title,
        xaxis=dict(
            title='Neuron',
            tickmode='array',
            tickvals=np.arange(len(activations)),
            ticktext=xticks
        ),
        yaxis=dict(title='Active - Inactive activation'),
        showlegend=False,
        width=1600
    )

    # Show the plot
    fig.show()


In [None]:
layer = 5
activation_differences = compare_activations(german_data+english_data, model, layer=layer)
significance_measure = abs(activation_differences.mean(0)) - activation_differences.std(0)
values, interesting_neurons = torch.topk(significance_measure, 100)

  0%|          | 0/3466 [00:00<?, ?it/s]

In [None]:
title=f"MLP {layer} activation difference with and without the context neuron active, sorted by absolute difference and consistency"
barplot(activation_differences.mean(0)[interesting_neurons].cpu().numpy(), activation_differences.std(0)[interesting_neurons].cpu().numpy(), xticks=interesting_neurons.cpu().tolist(), title=title)

In [None]:
layer = 4
activation_differences = compare_activations(german_data+english_data, model, layer=layer)
significance_measure = abs(activation_differences.mean(0)) - activation_differences.std(0)
values, interesting_neurons = torch.topk(significance_measure, 100)

  0%|          | 0/3466 [00:00<?, ?it/s]

In [None]:
title=f"MLP {layer} activation difference with and without the context neuron active, sorted by absolute difference and consistency"
barplot(activation_differences.mean(0)[interesting_neurons].cpu().numpy(), activation_differences.std(0)[interesting_neurons].cpu().numpy(), xticks=interesting_neurons.cpu().tolist(), title=title)

## Replicate German context neurons with L3N669 ablated - successful

In [None]:
haystack_utils.clean_cache()

In [3]:
def get_mlp_loss_difference(prompts: list[str], model: HookedTransformer, layer=5, neuron=1336, shift = 0, weight_by_logprob = False, log_prob_weight=0):

    def deactivate_mlp_neuron(value, hook):
        # Context neuron is not active
        value[:, :, neuron] += shift
        return value

    loss_differences = []
    for prompt in tqdm(prompts): 
        # with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
        #     deactivated_loss, deactivated_cache = model.run_with_cache(prompt)
        with model.hooks(fwd_hooks=activate_neurons_fwd_hooks + [(f'blocks.{layer}.mlp.hook_pre', deactivate_mlp_neuron)]):
            deactivated_logit, deactivated_loss = model(prompt, return_type="both", loss_per_token=True)
        with model.hooks(fwd_hooks=activate_neurons_fwd_hooks):
            activated_logit, activated_loss = model(prompt, return_type="both", loss_per_token=True)
        
        if not weight_by_logprob:
            # Shape pos 
            loss_difference, index = (deactivated_loss - activated_loss).flatten().max(0)
            loss_differences.append(loss_difference.item())
        else:
            # Shape pos
            answer_tokens = model.to_tokens(prompt)[0, 1:]
            # Batch pos d_vocab -> pos d_vocab
            deactivated_answer_logits = deactivated_logit.log_softmax(-1)[0, :-1, :]
            range_tensor = torch.arange(answer_tokens.shape[0])
            deactivated_answer_logits = deactivated_answer_logits[range_tensor, answer_tokens]
            activated_answer_logits = activated_logit.log_softmax(-1)[0, :-1, :]
            activated_answer_logits = activated_answer_logits[range_tensor, answer_tokens]
            max_answer_logit, _ = torch.stack((deactivated_answer_logits, activated_answer_logits)).max(0)

            diff_by_pos = (deactivated_loss - activated_loss).flatten()
            scaled_difference, _ = (diff_by_pos + (max_answer_logit*log_prob_weight)).max(0)
            loss_differences.append(scaled_difference.item())
    
    return loss_differences

In [70]:
def highlight_mlp_difference(prompts: list[str], model: HookedTransformer, layer=5, neuron=1336, shift=0, set_active=False, active_value: float=0):

    def activate_mlp_neuron(value, hook):
        # Context neuron is not active
        if set_active:
            value[:, :, neuron] = active_value
        #value[:, :, neuron] = 1.6#+= -1.4645#1.2515#0.6#-2.2417
        return value
    
    def deactivate_mlp_neuron(value, hook):
        # Context neuron is not active
        if set_active:
            value[:, :, neuron] = active_value + shift
        else:
            value[:, :, neuron] += shift#-1.4645#1.2515#0.6#-2.2417 #=0.4#
        return value

    for prompt in prompts: 
        # L3 Context neuron set to active, L5 neuron set to activation as if L3 neuron was inactive
        with model.hooks(fwd_hooks=activate_neurons_fwd_hooks + [(f'blocks.{layer}.mlp.hook_pre', deactivate_mlp_neuron)]):
            deactivated_loss, _ = model.run_with_cache(prompt, return_type="loss", loss_per_token=True)
        # L3 Context neuron set to active, L5 neuron left as is
        with model.hooks(fwd_hooks=activate_neurons_fwd_hooks + [(f'blocks.{layer}.mlp.hook_pre', activate_mlp_neuron)]):
            activated_loss, _ = model.run_with_cache(prompt, return_type="loss", loss_per_token=True)

        with model.hooks(fwd_hooks=activate_neurons_fwd_hooks + [(f'blocks.{layer}.mlp.hook_pre', deactivate_mlp_neuron)]):
            deactivated_logits, _ = model.run_with_cache(prompt, return_type="logits", loss_per_token=True)
        with model.hooks(fwd_hooks=activate_neurons_fwd_hooks + [(f'blocks.{layer}.mlp.hook_pre', activate_mlp_neuron)]):
            activated_logits, _ = model.run_with_cache(prompt, return_type="logits", loss_per_token=True)
        
        # Shape pos
        answer_tokens = model.to_tokens(prompt)[0, 1:] # [22]
        
        # Batch pos d_vocab -> pos d_vocab
        deactivated_answer_logits = deactivated_logits.log_softmax(-1)[0, :-1, :]

        range_tensor = torch.arange(answer_tokens.shape[0])
        deactivated_answer_logits = deactivated_answer_logits[range_tensor, answer_tokens]


        activated_answer_logits = activated_logits.log_softmax(-1)[0, :-1, :]
        activated_answer_logits = activated_answer_logits[range_tensor, answer_tokens]
        
        # Shape pos 
        loss_difference = (deactivated_loss - activated_loss).flatten()
        #neuron_differences = activated_cache[f"blocks.{layer}.mlp.hook_pre"][0, :, neuron] - deactivated_cache[f"blocks.{layer}.mlp.hook_pre"][0, :, neuron]
        str_token_prompt = model.to_str_tokens(model.to_tokens(prompt))
        haystack_utils.print_strings_as_html(str_token_prompt[1:], loss_difference.flatten().cpu().tolist(), max_value=1, original_log_probs=activated_answer_logits.cpu().tolist(), ablated_log_probs=deactivated_answer_logits.cpu().tolist())



In [5]:
LOG_PROB_THRESHOLD = -7
all_ignore, not_ignore = haystack_utils.get_weird_tokens(model, plot_norms=False)

Number of W_U neurons to ignore: 324
Number of W_E neurons to ignore: 292
Number of unique W_U and W_E neurons to ignore: 367


In [6]:
def get_top_differences_at_position(prompt: str, model: HookedTransformer, position: int, top_k=20, layer=5, neuron=1336, shift=0):
    """_summary_

    Args:
        prompt (str): _description_
        model (HookedTransformer): _description_
        position (int): _description_
        top_k (int, optional): _description_. Defaults to 20.
        layer (int, optional): _description_. Defaults to 5.
        neuron (int, optional): _description_. Defaults to 1336.
        shift (int, optional): Mean deactivated - activated activation difference. Defaults to 0.
    """
    def activate_mlp_neuron(value, hook):
        # Context neuron is not active
        #value[:, :, neuron] = 1.6#+= -1.4645#1.2515#0.6#-2.2417
        return value
    
    def deactivate_mlp_neuron(value, hook):
        # Context neuron is not active
        value[:, :, neuron] += shift#-1.4645#1.2515#0.6#-2.2417 #=0.4#
        return value

    tokens = model.to_tokens(prompt)
    str_tokens = model.to_str_tokens(tokens)
    # Logprobs instead of logits
    with model.hooks(fwd_hooks=activate_neurons_fwd_hooks + [(f'blocks.{layer}.mlp.hook_pre', deactivate_mlp_neuron)]):
        ablated_logits = model(prompt, return_type="logits", loss_per_token=True).log_softmax(-1)
    with model.hooks(fwd_hooks=activate_neurons_fwd_hooks + [(f'blocks.{layer}.mlp.hook_pre', activate_mlp_neuron)]):
        original_logits = model(prompt, return_type="logits", loss_per_token=True).log_softmax(-1)

    # Positive difference = the German neuron makes the token more likely
    # Negative difference = the German neuron makes the token less likely
    logit_differences = original_logits - ablated_logits
    
    print("Prompt:", prompt)
    print(f"Differences for predicting: {str_tokens[position]} -> {str_tokens[position+1]}")

    low_log_prob = torch.argwhere(((original_logits[0, position, :] <= LOG_PROB_THRESHOLD) & (ablated_logits[0, position, :] <= LOG_PROB_THRESHOLD))).flatten()
    ignore_tokens = torch.cat([low_log_prob, all_ignore]).unique()
    
    top_original_logprobs, top_original_idx = haystack_utils.top_k_with_exclude(original_logits[0, position, :].flatten(), top_k, exclude=ignore_tokens)
    top_original_ablated_logprobs = ablated_logits[0, position, top_original_idx]
    top_ablated_logprobs, top_ablated_idx = haystack_utils.top_k_with_exclude(ablated_logits[0, position, :].flatten(), top_k, exclude=ignore_tokens)
    top_ablated_original_logprobs = original_logits[0, position, top_ablated_idx]

    top_original_values = logit_differences[0, position, top_original_idx]
    top_ablated_values = logit_differences[0, position, top_ablated_idx]
    print("Top predictions with German neuron active (unablated)")
    haystack_utils.print_strings_as_html(model.to_str_tokens(top_original_idx), top_original_values.cpu().tolist(), max_value=5, original_log_probs=top_original_logprobs.cpu().tolist(), ablated_log_probs=top_original_ablated_logprobs.cpu().tolist())
    print("Top predictions with German neuron disabled (ablated)")
    haystack_utils.print_strings_as_html(model.to_str_tokens(top_ablated_idx), top_ablated_values.cpu().tolist(), max_value=5, original_log_probs=top_ablated_original_logprobs.cpu().tolist(), ablated_log_probs=top_ablated_logprobs.cpu().tolist())

    top_boosts, top_boosted_idx = haystack_utils.top_k_with_exclude(logit_differences[:, position, :].flatten(), top_k, exclude=ignore_tokens)
    top_boost_original_logprob = original_logits[0, position, top_boosted_idx]
    top_boost_ablated_logprob = ablated_logits[0, position, top_boosted_idx]
    top_reduced, top_reduced_idx = haystack_utils.top_k_with_exclude(logit_differences[:, position, :].flatten(), top_k, largest=False, exclude=ignore_tokens)
    top_reduced_original_logprob = original_logits[0, position, top_reduced_idx]
    top_reduced_ablated_logprob = ablated_logits[0, position, top_reduced_idx]
    print("Top boosted tokens by German neuron")
    haystack_utils.print_strings_as_html(model.to_str_tokens(top_boosted_idx), top_boosts.cpu().tolist(), max_value=5, original_log_probs=top_boost_original_logprob.cpu().tolist(), ablated_log_probs=top_boost_ablated_logprob.cpu().tolist())
    print("Top reduced tokens by German neuron")
    haystack_utils.print_strings_as_html(model.to_str_tokens(top_reduced_idx), top_reduced.cpu().tolist(), max_value=5, original_log_probs=top_reduced_original_logprob.cpu().tolist(), ablated_log_probs=top_reduced_ablated_logprob.cpu().tolist())

In [23]:
num_activation_examples = 1000
ACTIVATED_ACTIVATIONS = {}
DEACTIVATED_ACTIVATIONS = {}
for layer in range(4, 6):
    with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
        DEACTIVATED_ACTIVATIONS[layer] = get_mlp_activations(english_data[:num_activation_examples] + german_data[:num_activation_examples], layer, model, mean=False, hook_pre=True)
    with model.hooks(fwd_hooks=activate_neurons_fwd_hooks):
        ACTIVATED_ACTIVATIONS[layer] = get_mlp_activations(english_data[:num_activation_examples] + german_data[:num_activation_examples], layer, model, mean=False, hook_pre=True)

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [66]:
def get_mlp_context_shift(layer, neuron, plot_hist=False, activated_activations=ACTIVATED_ACTIVATIONS, deactivated_activations=DEACTIVATED_ACTIVATIONS, max_plotted=1000):
    activated_activations_neuron = activated_activations[layer][:, neuron]
    deactivated_activations_neuron = deactivated_activations[layer][:, neuron]
    shift = deactivated_activations_neuron.mean()- activated_activations_neuron.mean()

    if plot_hist:
        haystack_utils.two_histogram(deactivated_activations_neuron[:max_plotted], activated_activations_neuron[:max_plotted], data_1_name="Deactivated context neuron", data_2_name="Activated context neuron", title=f"Pre-gelu L{layer}N{neuron} activations (shift={shift.item():.4f})")
    return shift.item(), activated_activations_neuron.mean().item(), deactivated_activations_neuron.mean().item()

def get_examples_by_neuron(prompts:list[str], neuron, layer, log_prob_weight=0.5, num_examples=10, plot_hist=False):
    
    shift, _, _ = get_mlp_context_shift(layer, neuron, plot_hist=plot_hist)

    max_loss_per_prompt = get_mlp_loss_difference(prompts, model, layer=layer, neuron=neuron, shift=shift, weight_by_logprob=True, log_prob_weight=log_prob_weight)
    max_loss_value, max_loss_prompts = torch.topk(torch.Tensor(max_loss_per_prompt), num_examples)

    for prompt_index in max_loss_prompts:
        highlight_mlp_difference(prompts[prompt_index:prompt_index+1], model, layer=layer, neuron=neuron, shift=shift)

Potential next ideas
- Look at groups of neurons
- Find groups that write into similar subspaces (compare output weights)
- Look at MLP4
- L5 - look at weight directly

## Looking at individual neurons

In [44]:
top_positive_l4 = [683, 482, 1445, 326, 1213, 578, 102, 1236, 45, 1791]
top_negative_l4 = [1331, 1682, 1248, 1080, 233, 1187, 1677, 290, 1584, 1230]
top_positive_l5 = [1336, 1460, 1709, 1693, 395, 1528, 697, 316, 13, 312]
top_negative_l5 = [1292, 213, 975, 134, 838, 1991, 1230, 752, 1804, 965]

### Direct unembed

In [74]:
layer = 5
neuron = 1336
shift, active_value, inactive_value = get_mlp_context_shift(layer, neuron, plot_hist=True)
_ = haystack_utils.get_neuron_unembed(model, neuron=neuron, layer=layer, mean_activation_active=ACTIVATED_ACTIVATIONS[layer][:, neuron].mean(), mean_activation_inactive=DEACTIVATED_ACTIVATIONS[layer][:, neuron].mean(), plot=True, top_k=50)

In [84]:
sentences = [
    "Prague, the capital city of the Czech Republic",
    "Prague, the capital city of Tschechien",
    "Prag, die Hauptstadt von Tschechien",
    "Prag, die Hauptstadt von Tschechischen Republik",
    "Prag, die Hauptstadt von Czech Republic",
    "Vienna is the capital of Austria.",
    "Vienna is the capital of Österreich.",
    "Die Goethe-Universität in Frankfurt zählt zu den renommiertesten Universitäten Deutschlands.",
    "Ich plane einen Besuch in Frankfurt, um die imposante Skyline zu bewundern.",
    "Next summer, I'd like to go to Croatia",
    "Nächsten Sommer würde Ich gerne nach Kroatien gehen"
]

highlight_mlp_difference(sentences, model, layer=layer, neuron=neuron, shift=shift)

### Positive L5

In [50]:
get_examples_by_neuron(german_data[:500], neuron=top_positive_l5[0], layer=5, plot_hist=True, num_examples=5)

  0%|          | 0/500 [00:00<?, ?it/s]

In [51]:
get_examples_by_neuron(german_data[:500], neuron=top_positive_l5[1], layer=5, plot_hist=True, num_examples=5)

  0%|          | 0/500 [00:00<?, ?it/s]

In [52]:
get_examples_by_neuron(german_data[:500], neuron=top_positive_l5[2], layer=5, plot_hist=True, num_examples=5)

  0%|          | 0/500 [00:00<?, ?it/s]

### Negative L5 

In [118]:
layer = 5
neuron = top_negative_l5[1]
shift, active_value, inactive_value = get_mlp_context_shift(layer, neuron, plot_hist=True)
_ = haystack_utils.get_neuron_unembed(model, neuron=neuron, layer=layer, mean_activation_active=ACTIVATED_ACTIVATIONS[layer][:, neuron].mean(), mean_activation_inactive=DEACTIVATED_ACTIVATIONS[layer][:, neuron].mean(), plot=True, top_k=50)

In [119]:
get_examples_by_neuron(german_data[:1000], neuron=neuron, layer=layer, plot_hist=False, num_examples=5, log_prob_weight=0.3)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [122]:
prompt = "zu diesem Zweck auf der Einnahmen- und der Ausgabenseite des Haushaltsplans eine spezielle Haushaltslinie zu schaffen und dabei den Haushaltsgrundsätzen der Spezialität und Neutralität Rechnung zu tragen, wobei"
#prompt = "zu tragen, wobei"
get_top_differences_at_position(prompt, model, position=-3, top_k=5, layer=layer, neuron=neuron, shift=shift)

Prompt: zu diesem Zweck auf der Einnahmen- und der Ausgabenseite des Haushaltsplans eine spezielle Haushaltslinie zu schaffen und dabei den Haushaltsgrundsätzen der Spezialität und Neutralität Rechnung zu tragen, wobei
Differences for predicting:  w -> obe
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


In [95]:
get_examples_by_neuron(english_data[:500], neuron=neuron, layer=layer, plot_hist=False, num_examples=5)

  0%|          | 0/500 [00:00<?, ?it/s]

## L5N1336

In [123]:
layer = 5
neuron = 1336
shift, _, _ = get_mlp_context_shift(layer, neuron, True, max_plotted=5000)
_ = haystack_utils.get_neuron_unembed(model, neuron=neuron, layer=layer, mean_activation_active=ACTIVATED_ACTIVATIONS[layer][:, neuron].mean(), mean_activation_inactive=DEACTIVATED_ACTIVATIONS[layer][:, neuron].mean(), plot=True, top_k=50)

In [111]:
get_examples_by_neuron(german_data[:1000], neuron=neuron, layer=layer, num_examples=5)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [114]:
sentences = [
    "Ich habe ein paar Vorschläge für die Verbesserung des Projekts.",
    "Welche Vorschläge hast du für die Gestaltung des neuen Logos?",
    "Die Lehrerin bat die Schüler um ihre Vorschläge für das Klassenprojekt.",
    "Ich habe einige Vorschläge für Aktivitäten während unseres Urlaubs.",
    "Die Geschäftsleitung freut sich über neue Vorschläge zur Prozessoptimierung.",
    "Die Politiker diskutieren verschiedene Vorschläge zur Lösung des Verkehrsproblems.",
    "Wir haben Vorschläge für die Tagesordnung der nächsten Teambesprechung gesammelt.",
    "Bitte senden Sie Ihre Vorschläge bis zum Ende der Woche ein.",
    "Die Firma nimmt gerne Kundenwünsche und Vorschläge entgegen.",
    "Die Arbeitsgruppe arbeitet gemeinsam an konkreten Vorschlägen zur Kostensenkung.",
]

highlight_mlp_difference(sentences, model, layer=layer, neuron=neuron, shift=shift)

In [116]:
get_top_differences_at_position("Die Lehrerin bat die Schüler um ihre Vorschläge", model, position=-3, top_k=10, layer=layer, neuron=neuron, shift=shift)

Prompt: Die Lehrerin bat die Schüler um ihre Vorschläge
Differences for predicting: orsch -> lä
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


In [102]:
model.to_str_tokens(model.to_tokens(" von"))

['<|endoftext|>', ' von']

In [36]:
sentences = [
    "Die gemeinsame Entscheidung führte zu einem positiven Ergebnis.",
    "Wir sollten die gemeinsame Zeit nutzen, um uns besser kennenzulernen.",
    "Die gemeinsame Verantwortung liegt bei uns allen.",
    "Wir sollten die gemeinsame Vision nicht aus den Augen verlieren.",
    "Die gemeinsame Anstrengung hat sich gelohnt.",
    "Die gemeinsame Reise war ein unvergessliches Erlebnis.",
    "Die gemeinsame Zusammenarbeit war äußerst effektiv.",
    "Die gemeinsame Interessenlage ermöglicht eine erfolgreiche Kooperation.",
    "Wir haben die gemeinsame Herausforderung erfolgreich gemeistert.",
    "Die gemeinsame Initiative hat zu bedeutenden Fortschritten geführt."
]


highlight_mlp_difference(sentences, model, layer=layer, neuron=neuron, shift=shift)

In [37]:
sentences = [
    "Gemeinsam können wir Großes erreichen.",
    "Gemeinsam können wir Hindernisse überwinden.",
    "Gemeinsam sind wir stark.",
    "Gemeinsam können wir diese Aufgabe bewältigen.",
    "Gemeinsame Entscheidungen sind oft effektiver als individuelle.",
    "Gemeinsame Interessen verbinden Menschen miteinander.",
    "Gemeinsame Ziele sind der Schlüssel zum Erfolg.",
    "Gemeinsame Projekte eröffnen neue Möglichkeiten."
]

prepend = "Alice und Bob besprechen die Arbeit. "

sentences = [prepend + sentence for sentence in sentences]
highlight_mlp_difference(sentences, model, layer=layer, neuron=neuron, shift=shift)

In [39]:
sentences = [
    "Lass uns gemeinsam an diesem Projekt arbeiten.",
    "Wir haben gemeinsam viel Spaß.",
    "Lass uns gemeinsam neue Wege gehen.",
    "Wir haben gemeinsam viel erreicht.",
    "Lass uns gemeinsame Projekte starten.",
    "Wir haben gemeinsame Interessen.",
    "Lass uns gemeinsame Ziele verfolgen.",
    "Die Projekte, die sie gemeinsam bearbeitet haben",
    "Die Projekte, die sie gemeinsamen Partnern vorgestellt haben",
]

sentences = [prepend + sentence for sentence in sentences]
highlight_mlp_difference(sentences, model, layer=layer, neuron=neuron, shift=shift)

In [40]:
sentences = [
    "Lass uns die Geminsame",
    "Lass uns die Geminsam",
    "Lass uns die Geminsamkeiten",
    "Wir haben gemeinsam viel erreicht.",
    "Wir haben gemeinsame Interessen.",
    "In der Gemeinschaft sind wir stark.",
    "Eine starke Gemeinschaft baut auf Vertrauen und Zusammenhalt.",
    "In unserer Gemeinschaft werden gemeinsame Werte hochgeschätzt."
]

#sentences = [prepend + sentence for sentence in sentences]
highlight_mlp_difference(sentences, model, layer=layer, neuron=neuron, shift=shift)

In [41]:
get_top_differences_at_position("Wir haben gemeinsam viel erreicht.", model, position=6, top_k=5, layer=layer, neuron=neuron, shift=shift)

Prompt: Wir haben gemeinsam viel erreicht.
Differences for predicting: ins -> am
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


## Remaining analysis

Since some of our largest neurons of interest are German context neuron, let's repeat the sparse probing analysis with L3N669 disabled and check that they're still valid context neurons

In [None]:
ablated_english_activations = {}
ablated_german_activations = {}
with model.hooks(fwd_hooks=ABLATE_HOOK):
    for layer in range(3, 6):
        ablated_english_activations[layer] = get_mlp_activations(english_data[:200], layer, model, mean=False)
        ablated_german_activations[layer] = get_mlp_activations(german_data[:200], layer, model, mean=False)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
def run_single_neuron_lr(layer, neuron, num_samples=5000):
    # Check accuracy of logistic regression
    A = torch.concat([ablated_german_activations[layer][:num_samples, neuron], ablated_english_activations[layer][:num_samples, neuron]]).view(-1, 1).cpu().numpy()
    y = torch.concat([torch.ones(num_samples), torch.zeros(num_samples)]).cpu().numpy()
    A_train, A_test, y_train, y_test = train_test_split(A, y, test_size=0.2)
    lr_model = LogisticRegression()
    lr_model.fit(A_train, y_train)
    test_acc = lr_model.score(A_test, y_test)
    train_acc = lr_model.score(A_train, y_train)
    f1 = sklearn.metrics.f1_score(y_test, lr_model.predict(A_test))
    return train_acc, test_acc, f1 
    

def get_neuron_accuracy(layer, neuron, plot=False):
    mean_english_activation = ablated_english_activations[layer][:,neuron].mean()
    mean_german_activation = ablated_german_activations[layer][:,neuron].mean()
    
    if plot:
        two_histogram(ablated_english_activations[layer][:,neuron], ablated_german_activations[layer][:,neuron], "English", "German", "Activation", "Frequency", f"L{layer}N{neuron} activations on English vs German text")
    train_acc, test_acc, f1 = run_single_neuron_lr(layer, neuron)
    print(f"\nL{layer}N{neuron}: F1={f1:.2f}, Train acc={train_acc:.2f}, and test acc={test_acc:.2f}")
    print(f"Mean activation English={mean_english_activation:.2f}, German={mean_german_activation:.2f}")

In [None]:
for layer, neuron in german_neurons:
    get_neuron_accuracy(layer, neuron)


L4N482: F1=0.73, Train acc=0.78, and test acc=0.77
Mean activation English=-0.07, German=0.48

L5N1039: F1=0.78, Train acc=0.75, and test acc=0.76
Mean activation English=1.02, German=0.05

L5N407: F1=0.60, Train acc=0.64, and test acc=0.61
Mean activation English=5.23, German=3.71

L5N1516: F1=0.70, Train acc=0.69, and test acc=0.70
Mean activation English=2.31, German=1.37

L5N1336: F1=0.78, Train acc=0.83, and test acc=0.82
Mean activation English=-0.06, German=0.38

L4N326: F1=0.55, Train acc=0.60, and test acc=0.61
Mean activation English=0.03, German=0.15

L5N250: F1=0.47, Train acc=0.64, and test acc=0.65
Mean activation English=-0.00, German=0.07

L3N669: F1=0.00, Train acc=0.50, and test acc=0.49
Mean activation English=-0.07, German=-0.07


Without L3N669 to read from our neuron L5N1336 still scores decently:

F1=0.78, Train acc=0.83, and test acc=0.82

Mean activation English=-0.06, German=0.38

Next: look at what gets boosted and deboosted when we ablate L5N1336