In [1]:
import torch
import numpy as np
from torch import einsum
from tqdm.auto import tqdm
import seaborn as sns
from transformer_lens import HookedTransformer, ActivationCache, utils, patching
from datasets import load_dataset
from einops import einsum
import pandas as pd
from transformer_lens import utils
from rich.table import Table, Column
from rich import print as rprint
from jaxtyping import Float, Int, Bool
from typing import List, Tuple
from torch import Tensor
import einops
import functools
from transformer_lens.hook_points import HookPoint
# import circuitsvis
from IPython.display import HTML
from plotly.express import line
import plotly.express as px
from tqdm.auto import tqdm
import json
import gc
import plotly.graph_objects as go

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from plotly.subplots import make_subplots
# Plotly needs a different renderer for VSCode/Notebooks vs Colab argh
import plotly.io as pio
pio.renderers.default = "notebook_connected"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

from haystack_utils import load_txt_data, get_mlp_activations, line
import haystack_utils

%reload_ext autoreload
%autoreload 2

In [3]:
haystack_utils.clean_cache()
english_neurons = [(5, 395), (5, 166), (5, 908), (5, 285), (3, 862), (5, 73), (4, 896), (5, 348), (5, 297), (3, 1204)]
german_neurons = [(4, 482), (5, 1039), (5, 407), (5, 1516), (5, 1336), (4, 326), (5, 250), (3, 669)]
french_neurons = [(5, 112), (4, 1080), (5, 1293), (5, 455), (5, 5), (5, 1901), (5, 486), (4, 975)]

model = HookedTransformer.from_pretrained("EleutherAI/pythia-70m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device)

english_data = haystack_utils.load_txt_data("kde4_english.txt")
german_data = haystack_utils.load_txt_data("wmt_german_large.txt")

english_activations = {}
german_activations = {}
for layer in range(3, 6):
    english_activations[layer] = get_mlp_activations(english_data[:200], layer, model, mean=False)
    german_activations[layer] = get_mlp_activations(german_data[:200], layer, model, mean=False)

LOG_PROB_THRESHOLD = -7
LAYER_TO_ABLATE = 3
NEURONS_TO_ABLATE = [669]
MEAN_ACTIVATION_ACTIVE = german_activations[LAYER_TO_ABLATE][:, NEURONS_TO_ABLATE].mean()
MEAN_ACTIVATION_INACTIVE = english_activations[LAYER_TO_ABLATE][:, NEURONS_TO_ABLATE].mean()

def deactivate_neurons_hook(value, hook):
    value[:, :, NEURONS_TO_ABLATE] = MEAN_ACTIVATION_INACTIVE
    return value
deactivate_neurons_fwd_hooks=[(f'blocks.{LAYER_TO_ABLATE}.mlp.hook_post', deactivate_neurons_hook)]

def activate_neurons_hook(value, hook):
    value[:, :, NEURONS_TO_ABLATE] = MEAN_ACTIVATION_ACTIVE
    return value
activate_neurons_fwd_hooks=[(f'blocks.{LAYER_TO_ABLATE}.mlp.hook_post', activate_neurons_hook)]

Using pad_token, but it is not set yet.


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
all_ignore, not_ignore = haystack_utils.get_weird_tokens(model, plot_norms=True)

Number of W_U neurons to ignore: 324
Number of W_E neurons to ignore: 292
Number of unique W_U and W_E neurons to ignore: 367


In [None]:
def get_pos_loss_diff(prompt: str, model: HookedTransformer, activate_neurons_hook: List[Tuple[str, HookPoint]], deactivate_neurons_hook: List[Tuple[str, HookPoint]], plot_hist=False, use_activate_hook=False):
    tokens = model.to_tokens(prompt)
    if use_activate_hook:
        original_loss = model.run_with_hooks(tokens, return_type="loss", fwd_hooks=activate_neurons_hook, loss_per_token=True)
    else:
        original_loss = model(tokens, return_type="loss", loss_per_token=True)
    ablated_loss = model.run_with_hooks(tokens, return_type="loss", fwd_hooks=deactivate_neurons_hook, loss_per_token=True)
    
    # Positive difference = loss increase due to ablation
    loss_difference = (ablated_loss - original_loss).flatten()

    if plot_hist:
        fig = px.histogram(loss_difference.flatten().cpu().numpy(), title="Loss difference due to ablation per position")
        fig.show()
    return loss_difference

def get_high_loss_prompts(prompts: list[str], model: HookedTransformer, activate_neurons_hook: List[Tuple[str, HookPoint]], deactivate_neurons_hook: List[Tuple[str, HookPoint]]):
    max_diffs = []
    average_diffs = []
    for prompt in tqdm(prompts):
        loss_difference = get_pos_loss_diff(prompt, model, activate_neurons_hook, deactivate_neurons_hook)
        max_diffs.append(loss_difference.max().item())
        average_diffs.append(loss_difference.mean().item())
    return max_diffs, average_diffs

max_diffs, average_diffs = get_high_loss_prompts(german_data, model, activate_neurons_fwd_hooks, deactivate_neurons_fwd_hooks)


  0%|          | 0/2459 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
px.histogram(average_diffs, title="Average loss difference per prompt", width=1000)

In [None]:
px.histogram(max_diffs, title="Maximum loss difference on a single token per prompt", width=1000)

In [None]:
# Get prompts with high average loss 
threshold = 5
high_max_loss_prompts = [i for i in range(len(german_data)) if max_diffs[i] > threshold]
print(high_max_loss_prompts)

[12, 13, 60, 67, 69, 82, 87, 102, 120, 125, 126, 133, 139, 146, 150, 157, 164, 195, 202, 217, 219, 227, 237, 238, 243, 247, 250, 251, 257, 260, 263, 269, 270, 294, 329, 337, 352, 365, 368, 391, 399, 408, 409, 419, 430, 441, 457, 463, 475, 478, 516, 522, 529, 531, 534, 541, 557, 577, 579, 585, 589, 615, 616, 617, 618, 620, 626, 630, 641, 646, 647, 664, 684, 688, 705, 707, 708, 711, 712, 713, 714, 716, 719, 723, 724, 731, 734, 743, 760, 771, 773, 774, 791, 795, 805, 813, 847, 860, 861, 869, 882, 919, 923, 928, 938, 939, 941, 946, 949, 961, 1017, 1025, 1037, 1039, 1085, 1091, 1094, 1104, 1109, 1112, 1117, 1122, 1123, 1148, 1159, 1162, 1165, 1166, 1171, 1178, 1185, 1201, 1210, 1212, 1213, 1217, 1229, 1232, 1234, 1388, 1404, 1471, 1483, 1563, 1569, 1602, 1606, 1686, 1760, 1800, 1847, 1877, 1899, 1998, 2029, 2064, 2153, 2200, 2213, 2251, 2273, 2298, 2327, 2427]


In [None]:
# Get prompts with high average loss 
threshold = 0.5
high_average_loss_prompts = [i for i in range(len(german_data)) if average_diffs[i] > threshold]
print(high_average_loss_prompts)

[46, 69, 74, 115, 144, 164, 217, 233, 245, 280, 295, 307, 436, 447, 452, 503, 578, 664, 721, 869, 1110, 1148, 1150, 1155, 1158, 1185, 1211, 2224]


In [None]:
def show_token_loss(prompt: str, model: HookedTransformer, max_value=None):
    """Show the loss difference per token in a prompt"""
    pos_wise_loss = get_pos_loss_diff(prompt, model, activate_neurons_fwd_hooks, deactivate_neurons_fwd_hooks, plot_hist=False)
    str_token_prompt = model.to_str_tokens(model.to_tokens(prompt))
    haystack_utils.print_strings_as_html(str_token_prompt[1:], pos_wise_loss.flatten().cpu().tolist(), max_value=max_value)

In [None]:
for prompt_idx in high_max_loss_prompts[10:25]:
    prompt = german_data[prompt_idx]
    show_token_loss(prompt, model, max_value=5)

In [None]:
for prompt_idx in high_average_loss_prompts[10:25]:
    prompt = german_data[prompt_idx]
    show_token_loss(prompt, model, max_value=5)

The high loss difference tokens are usuallly intermediary tokens in multi-token words. But not all multi-token words lead to a high loss difference. 

Potential guess: it looks like problematic tokens are words that also exist in English ("ink", "ass", "he", "its", "in"), after which the model struggles to predict German continuations

Interesting high loss tokens
- "ink": [" in", " E", "ink", "lang"]
- "he": [" her", "vor", "he", "ben"]
- "its": [" die", " Ar", "be", "its", "bed", "ing", "ungen"]
- "in": [" dar", "auf", " h", "in", "we", "isen"]
- "ass": [" eines", " um", "f", "ass", "enden"]
- "id" : [" Ver", "te", "id", "ig", "ung"]



Other guess: it looks like common english bigrams lead to high loss diffs
-  "esch": "v org esch l agen" (org+anistation or variations are probably common)

Potential guess for negative loss tokens: it looks like some of them are unlikely German bigrams
- "m": "die nicht m ilit ä ris che" (more common "nicht" as a single word)
- "okrat": "nach dem okrat is chem" (more common "dem" as a single word)

In [None]:
# 1: Investigate prompts in isolation
# Check loss increase on single tokens
# Check loss increase on synthetic prompts
# Create new prompts that follow hypothesis

# 2: Investigate origin of loss increase
# See what model wants to output without the context neuron


In [None]:
sentences = [
    "There is",
    ", weil da eine Katze ist",
    "Ich bin gestresst, weil die Arbeit so viel ist.",
    "Ich bin nicht gestresst, obwohl die Arbeit so viel ist.",
    "Sie geht ins Kino, weil sie den Film sehen möchte.",
    "Wir können nicht ausgehen, weil es spät ist.",
    "Er hat den Zug verpasst, weil der Bus zu spät gekommen ist.",
    "Sie freut sich, weil sie das Konzertkarte gewonnen hat.",
    "Er nimmt einen Regenschirm mit, weil es regnen könnte.",
    "Ich bleibe wach, weil ich die ganze Nacht lernen muss.",
    "Sie sind glücklich, weil sie gerade ihre Prüfung bestanden haben."
]

for prompt in sentences:
    show_token_loss(prompt, model, max_value=5)

In [None]:
sentences = [
    "Ich gehe ins Kino, und ich sehe einen Film."
]
for prompt in sentences:
    show_token_loss(prompt, model, max_value=5)

In [None]:
# In Einklang
sentences = [
    "Die Musik und die Natur stehen in Einklang miteinander.",
    "Um erfolgreich zu sein, müssen wir unsere Handlungen mit unseren Werten in Einklang bringen.",
    "Yoga hilft dabei, Körper und Geist in Einklang zu bringen.",
    "In der Meditation finden viele Menschen innere Ruhe und kommen mit sich selbst in Einklang.",
    "Der Architekt hat das moderne Design des Hauses wunderbar mit der umgebenden Landschaft in Einklang gebracht.",
    "Die Entscheidung des Unternehmens steht nicht im Einklang mit seinen langfristigen Zielen.",
    "Ich versuche stets, meine beruflichen und persönlichen Verpflichtungen in Einklang zu bringen.",
    "In der Beziehung ist es wichtig, die Bedürfnisse beider Partner in Einklang zu bringen.",
    "Der Politiker betonte die Wichtigkeit, wirtschaftliche Entwicklung und Umweltschutz in Einklang zu bringen.",
    "Die Philosophie des Zen lehrt uns, Körper, Geist und Seele in Einklang zu bringen."]

for prompt in sentences:
    show_token_loss(prompt, model, max_value=5)

In [None]:
def get_top_differences_at_position(prompt: str, model: HookedTransformer, position: int, top_k=20):
    """
    For a single token position in a prompt, display how much tokens are boosted or deboosted by deactivating the German neuron.
    Uses log probs. Positive difference means the German neuron makes the token more likely.
    """
    tokens = model.to_tokens(prompt)
    str_tokens = model.to_str_tokens(tokens)
    # Logprobs instead of logits
    original_logits = model(tokens, return_type="logits").log_softmax(dim=-1)
    ablated_logits = model.run_with_hooks(tokens, return_type="logits", fwd_hooks=deactivate_neurons_fwd_hooks).log_softmax(dim=-1)

    # Positive difference = the German neuron makes the token more likely
    # Negative difference = the German neuron makes the token less likely
    logit_differences = original_logits - ablated_logits
    
    print("Prompt:", prompt)
    print(f"Differences for predicting: {str_tokens[position]} -> {str_tokens[position+1]}")

    low_log_prob = torch.argwhere(((original_logits[0, position, :] <= LOG_PROB_THRESHOLD) & (ablated_logits[0, position, :] <= LOG_PROB_THRESHOLD))).flatten()
    ignore_tokens = torch.cat([low_log_prob, all_ignore]).unique()
    
    top_original_logprobs, top_original_idx = haystack_utils.top_k_with_exclude(original_logits[0, position, :].flatten(), top_k, exclude=ignore_tokens)
    top_original_ablated_logprobs = ablated_logits[0, position, top_original_idx]
    top_ablated_logprobs, top_ablated_idx = haystack_utils.top_k_with_exclude(ablated_logits[0, position, :].flatten(), top_k, exclude=ignore_tokens)
    top_ablated_original_logprobs = original_logits[0, position, top_ablated_idx]

    top_original_values = logit_differences[0, position, top_original_idx]
    top_ablated_values = logit_differences[0, position, top_ablated_idx]
    print("Top predictions with German neuron active (unablated)")
    haystack_utils.print_strings_as_html(model.to_str_tokens(top_original_idx), top_original_values.cpu().tolist(), max_value=5, original_log_probs=top_original_logprobs.cpu().tolist(), ablated_log_probs=top_original_ablated_logprobs.cpu().tolist())
    print("Top predictions with German neuron disabled (ablated)")
    haystack_utils.print_strings_as_html(model.to_str_tokens(top_ablated_idx), top_ablated_values.cpu().tolist(), max_value=5, original_log_probs=top_ablated_original_logprobs.cpu().tolist(), ablated_log_probs=top_ablated_logprobs.cpu().tolist())

    top_boosts, top_boosted_idx = haystack_utils.top_k_with_exclude(logit_differences[:, position, :].flatten(), top_k, exclude=ignore_tokens)
    top_boost_original_logprob = original_logits[0, position, top_boosted_idx]
    top_boost_ablated_logprob = ablated_logits[0, position, top_boosted_idx]
    top_reduced, top_reduced_idx = haystack_utils.top_k_with_exclude(logit_differences[:, position, :].flatten(), top_k, largest=False, exclude=ignore_tokens)
    top_reduced_original_logprob = original_logits[0, position, top_reduced_idx]
    top_reduced_ablated_logprob = ablated_logits[0, position, top_reduced_idx]
    print("Top boosted tokens by German neuron")
    haystack_utils.print_strings_as_html(model.to_str_tokens(top_boosted_idx), top_boosts.cpu().tolist(), max_value=5, original_log_probs=top_boost_original_logprob.cpu().tolist(), ablated_log_probs=top_boost_ablated_logprob.cpu().tolist())
    print("Top reduced tokens by German neuron")
    haystack_utils.print_strings_as_html(model.to_str_tokens(top_reduced_idx), top_reduced.cpu().tolist(), max_value=5, original_log_probs=top_reduced_original_logprob.cpu().tolist(), ablated_log_probs=top_reduced_ablated_logprob.cpu().tolist())

get_top_differences_at_position("Die Musik und die Natur stehen in Einklang miteinander.", model, 12, 50)

Prompt: Die Musik und die Natur stehen in Einklang miteinander.
Differences for predicting: ink -> lang
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


In [None]:
get_top_differences_at_position("Um erfolgreich zu sein, müssen wir unsere Handlungen mit unseren Werten in Einklang bringen.", model, -5, 50)

Prompt: Um erfolgreich zu sein, müssen wir unsere Handlungen mit unseren Werten in Einklang bringen.
Differences for predicting: ink -> lang
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


In [None]:
get_top_differences_at_position("Die Entscheidung des Unternehmens steht nicht im Einklang mit seinen langfristigen Zielen.", model, 16, 50)

Prompt: Die Entscheidung des Unternehmens steht nicht im Einklang mit seinen langfristigen Zielen.
Differences for predicting: ink -> lang
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


In [None]:
# Compare to "Einkommen" see if "om" is boosted in a more likely context
sentences = [
    "Mein Einkommen reicht gerade so zum Leben aus, aber ich muss gut haushalten.",
    "Das Einkommen ist oft stark schwankend, besonders bei selbstständigen Tätigkeiten.",
    "Ich möchte durch eine Nebentätigkeit mein Einkommen aufbessern, um mehr finanzielle Sicherheit zu haben.",
    "Die Steuererklärung ist wichtig, um mögliche Steuervorteile zu nutzen und das Einkommen korrekt anzugeben.",
    "Mit einem höheren Bildungsabschluss steigt oft auch das Einkommen, was langfristige Karrierechancen verbessern kann.",
    "Das Einkommen der Selbstständigen hängt stark von ihrer Auftragslage ab, was die finanzielle Planung herausfordernd macht.",
    "Der Mindestlohn soll ein existenzsicherndes Einkommen gewährleisten, insbesondere für geringfügig Beschäftigte.",
    "Ein gerechtes Steuersystem berücksichtigt unterschiedliche Einkommensgruppen und trägt zur sozialen Gerechtigkeit bei.",
    "Das Einkommen der Eltern beeinflusst oft die Bildungschancen der Kinder, da finanzielle Ressourcen eine Rolle spielen.",
    "Viele Menschen streben nach einem höheren Einkommen, um ihren Lebensstandard zu verbessern und sich finanzielle Freiheit zu ermöglichen."
]
for prompt in sentences:
    show_token_loss(prompt, model, max_value=5)

In [None]:
get_top_differences_at_position("Ich möchte durch eine Nebentätigkeit mein Einkommen aufbessern, um mehr finanzielle Sicherheit zu haben.", model, 14, 50)

Prompt: Ich möchte durch eine Nebentätigkeit mein Einkommen aufbessern, um mehr finanzielle Sicherheit zu haben.
Differences for predicting: ink -> om
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


In [None]:
sentences = [
    "Der Betriebsrat setzt sich für die Verbesserung der Arbeitsbedingungen ein.",
    "In diesem Unternehmen werden die Arbeitsbedingungen regelmäßig evaluiert und angepasst.",
    "Die Sicherheit am Arbeitsplatz ist von großer Bedeutung für die Arbeitsbedingungen der Beschäftigten.",
    "Bei der Stellenbewertung spielen die Arbeitsbedingungen eine entscheidende Rolle.",
    "Eine gute Work-Life-Balance ist ein Indikator für die Qualität der Arbeitsbedingungen.",
    "Der Arbeitgeber hat die Verantwortung, die gesetzlichen Mindeststandards für die Arbeitsbedingungen einzuhalten.",
    "Die betrieblichen Vereinbarungen regeln die Arbeitsbedingungen der Angestellten.",
    "Der Umgang mit Stress ist ein wichtiger Faktor für die Arbeitsbedingungen in anspruchsvollen Berufen.",
    "Die Digitalisierung kann sowohl positive als auch negative Auswirkungen auf die Arbeitsbedingungen haben.",
    "Das Arbeitszeitgesetz legt bestimmte Rahmenbedingungen für die Arbeitsbedingungen fest."
]

for prompt in sentences:
    show_token_loss(prompt, model, max_value=5)

In [None]:
get_top_differences_at_position("In diesem Unternehmen werden die Arbeitsbedingungen regelmäßig evaluiert und angepasst.", model, 11, 50)

Prompt: In diesem Unternehmen werden die Arbeitsbedingungen regelmäßig evaluiert und angepasst.
Differences for predicting: its -> bed
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


In [None]:
sentences = [
    "Die Entwicklung dieser Software umfasste eine umfangreiche Testphase, um die Anwendung zu optimieren.",
    "Das Handbuch bietet detaillierte Informationen zur Installation und Nutzung der Anwendung.",
    "Die Anwendung erfordert eine schnelle Internetverbindung und ausreichend Speicherplatz.",
    "Der Support steht Ihnen bei Fragen zur Anwendung gerne zur Verfügung.",
    "Es wurden neue Funktionen in die Anwendung integriert, um die Benutzererfahrung zu verbessern.",
    "Bei der Verwendung der Anwendung ist es wichtig, die Datenschutzbestimmungen zu beachten.",
    "Die Anwendung wurde für verschiedene Betriebssysteme entwickelt und ist plattformübergreifend nutzbar.",
    "Die Anwendung bietet zahlreiche Einstellungsmöglichkeiten, um sie individuell anzupassen.",
    "Durch regelmäßige Updates werden Sicherheitslücken in der Anwendung behoben.",
    "Die Anwendung ermöglicht eine effiziente Verwaltung von Daten und erleichtert die Arbeitsabläufe."
]

for prompt in sentences:
    show_token_loss(prompt, model, max_value=5)

## Looking at bigram "Die Anwendung"

In [None]:
get_top_differences_at_position("Die Anwendung", model, 2, 50)

Prompt: Die Anwendung
Differences for predicting:  An -> wend
Top predictions with German neuron active (unablated)


Top predictions with German neuron disabled (ablated)


Top boosted tokens by German neuron


Top reduced tokens by German neuron


- Reduces english bigrams (Anarchy, anatomy, anomaly, anterior, antic)
- Boosts German bigrams (Anerkennung, anwendung, anstelle, anspielung)

In [None]:
top_german = ["erk", "wend", "ste", "sp", "fr"]
top_english = ["swers", "xiety", "archy", "atomy", "omaly"]

In [None]:
def logit_diffs(logits: Float[Tensor, "batch pos d_vocab"], pos: int, top_german: list[str], top_english: list[str]):
    german_tokens = model.to_tokens(top_german).flatten()
    english_tokens = model.to_tokens(top_english).flatten()
    german_logits = logits[0, pos, german_tokens]
    english_logits = logits[0, pos, english_tokens]
    return (german_logits-english_logits).mean()


prompt = "Die Anwendung"
pos = 2
clean_logits = model(prompt, return_type="logits")
corrupt_logits = model.run_with_hooks(prompt, return_type="logits", fwd_hooks=deactivate_neurons_fwd_hooks)

CLEAN_BASELINE = logit_diffs(clean_logits, pos, top_german, top_english)
CORRUPTED_BASELINE = logit_diffs(corrupt_logits, pos, top_german, top_english)

In [None]:
def metric(logits: Float[Tensor, "batch pos d_vocab"], pos: int, top_german: list[str], top_english: list[str]):
    """Percentage of the German neuron's effects on the top bigrams present in the given logits"""
    return (logit_diffs(logits, pos, top_german, top_english) - CORRUPTED_BASELINE) / (CLEAN_BASELINE  - CORRUPTED_BASELINE)

print(f"Clean Baseline is 1: {metric(clean_logits, pos, top_german, top_english).item():.4f}")
print(f"Corrupted Baseline is 0: {metric(corrupt_logits, pos, top_german, top_english).item():.4f}")

Clean Baseline is 1: 1.0000
Corrupted Baseline is 0: 0.0000


In [None]:
# Compare the loss with German neuron deactivated, with the loss with German neuron deactivated but a clean component patched in. E.g. when we patch in the MLP4 outputs calculated with the 
# German neuron enabled, the loss decreases from 5.66 to 5.38. This means that the German neuron's effects on MLP4 are responsible for 0.28 of the loss increase.
layer = 4
attention_out_name = f"blocks.{layer}.hook_attn_out"
mlp_out_name = f"blocks.{layer}.hook_mlp_out"
components = []
loss_attribution = []

for layer in range(3, 6):
    attention_out_name = f"blocks.{layer}.hook_attn_out"
    print("Component", attention_out_name)
    original_loss, ablated_loss, frozen_loss = haystack_utils.get_frozen_loss_difference_for_component(["Die Anwendung"], model, deactivate_neurons_fwd_hooks, freeze_act_names=[attention_out_name], disable_progress_bar=True)
    loss_attribution.append(ablated_loss - frozen_loss)
    mlp_out_name = f"blocks.{layer}.hook_mlp_out"
    print("Component", mlp_out_name)
    original_loss, ablated_loss, frozen_loss = haystack_utils.get_frozen_loss_difference_for_component(["Die Anwendung"], model, deactivate_neurons_fwd_hooks, freeze_act_names=[mlp_out_name], disable_progress_bar=True)
    loss_attribution.append(ablated_loss - frozen_loss)
    components.extend([attention_out_name, mlp_out_name])

haystack_utils.line(loss_attribution, xticks=components, title="Loss increase per component")

Compontent blocks.3.hook_attn_out
Original loss: 4.80, frozen loss: 5.66 (+17.89%), ablated loss: 5.66 (+17.89%)
Compontent blocks.3.hook_mlp_out
Original loss: 4.80, frozen loss: 4.80 (+0.00%), ablated loss: 5.66 (+17.89%)
Compontent blocks.4.hook_attn_out
Original loss: 4.80, frozen loss: 5.65 (+17.79%), ablated loss: 5.66 (+17.89%)
Compontent blocks.4.hook_mlp_out
Original loss: 4.80, frozen loss: 5.38 (+12.17%), ablated loss: 5.66 (+17.89%)
Compontent blocks.5.hook_attn_out
Original loss: 4.80, frozen loss: 5.65 (+17.79%), ablated loss: 5.66 (+17.89%)
Compontent blocks.5.hook_mlp_out
Original loss: 4.80, frozen loss: 5.27 (+9.74%), ablated loss: 5.66 (+17.89%)


In [8]:

# The get_frozen_loss_difference_for_component implements the below steps three steps and then returns the loss with the first frozen component patched in. This gets us the
# effect of the frozen component's German circuits on the loss, but doesn't tell us what the contribution of the frozen component's German circuits to MLP5 is in isolation.
# 1. Get clean run cache
# 2. Get ablated run loss
# 3. Get ablated run with clean MLP4 patched in loss

# Let's try to isolate the effects of MLP4 on MLP5.

# Option 1:
# Do a run with the German neuron disabled, grab MLP4 from the cache, for a foward pass with the German neuron enabled and patch in the disabled-German-neuron MLP4, 
# grab the output of MLP5 from that run's cache, then do a forward pass with the German neuron enabled and patch in MLP5? This will give us the impact of MLP4's German-specific circuits on MLP5. 
# We expect the loss to increase vs. a regular forward pass with the German neuron enabled. The loss increase will be the effect of MLP4's German circuits on MLP5.

# Option 2:
# Run the model with the German neuron enabled, get MLP4 from the cache, do a forward pass with the German neuron disabled, patch in MLP4, grab the output of MLP5 from the cache,
# do a forward pass with the German neuron disabled and patch in MLP5, then compare with a forward pass with the German neuron disabled and no hooks. This should give us a loss decrease, the effect of MLP4's German 
# circuits on MLP5.
# 1. Get clean run cache
# 2. Get ablated run loss
# 3. Get ablated run with MLP4 patched in cache
# 4. Get ablated run with MLP5 patched in loss


def get_frozen_component_effect_on_MLP5(
        prompts: List[str],
        model: HookedTransformer,
        ablation_hooks=[],
        freeze_act_names=[],
        crop_context_end: None | int=None,
        disable_progress_bar=False
    ):
    original_losses = []
    ablated_losses = []
    frozen_losses = []
    MLP5_losses = []
    for prompt in tqdm(prompts, disable=disable_progress_bar):
        if crop_context_end is not None:
            tokens = model.to_tokens(prompt)[:, :crop_context_end]
        else:
            tokens = model.to_tokens(prompt)

        original_loss, ablated_loss, original_cache, ablated_cache = haystack_utils.get_caches_single_prompt(
            prompt, model, fwd_hooks=ablation_hooks, crop_context_end=crop_context_end, return_type="loss")

        # ['embed', '0_attn_out', '0_mlp_out', '1_attn_out', '1_mlp_out', '2_attn_out', '2_mlp_out', '3_attn_out', '3_mlp_out', '4_attn_out', '4_mlp_out', '5_attn_out', '5_mlp_out']
        # Ablate at the final residual stream value to remove the direct component output
        def freeze_hook(value, hook: HookPoint):
            value = original_cache[hook.name]
            return value            
        
        freeze_hooks = [(freeze_act_name, freeze_hook) for freeze_act_name in freeze_act_names]

        with model.hooks(fwd_hooks=freeze_hooks+ablation_hooks):
            frozen_loss, frozen_cache = model.run_with_cache(tokens, return_type="loss")

        def patch_mlp5_hook(value, hook: HookPoint):
            value = frozen_cache[hook.name]
            return value

        MLP5_hooks = [('blocks.5.mlp.hook_post', patch_mlp5_hook)]
        with model.hooks(fwd_hooks=MLP5_hooks+ablation_hooks):
            MLP5_loss = model(tokens, return_type="loss")

        original_losses.append(original_loss)
        ablated_losses.append(ablated_loss)
        frozen_losses.append(frozen_loss.item())
        MLP5_losses.append(MLP5_loss.item())


    print(f"Original loss: {np.mean(original_losses):.2f}, frozen loss: {np.mean(frozen_losses):.2f} (+{((np.mean(frozen_losses) - np.mean(original_losses)) / np.mean(original_losses))*100:.2f}%), ablated loss: {np.mean(ablated_losses):.2f} (+{((np.mean(ablated_losses) - np.mean(original_losses)) / np.mean(original_losses))*100:.2f}%)")
    return np.mean(original_losses), np.mean(ablated_losses), np.mean(frozen_losses)


In [9]:
mlp_out_name = f"blocks.{4}.hook_mlp_out"
print("Component", mlp_out_name)
original_loss, ablated_loss, frozen_loss, MLP5_loss = get_frozen_component_effect_on_MLP5(["Die Anwendung"], model, deactivate_neurons_fwd_hooks, freeze_act_names=[mlp_out_name], disable_progress_bar=True)
print(ablated_loss - MLP5_loss)

Component blocks.4.hook_mlp_out


NameError: name 'model' is not defined

### Looks at activation tokens for neurons with high pre-GELU average activation differences when German neuron ablated

In [None]:
# Written for batch size of 1
# Assumes German
def get_top_k_act_tokens_for_neuron(layer, neuron, data, k=100, mlp_stage="hook_pre") -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    acts = []
    corresponding_tokens = []
    next_tokens = []

    for prompt in data:
        tokens = model.to_tokens(prompt)
        _, cache = model.run_with_cache(tokens)
        
        # for each pos activation, append to list
        acts.append(cache[f"blocks.{layer}.mlp.{mlp_stage}"][0, :, neuron]) # append [pos]
        corresponding_tokens.append(tokens[0])  # append [pos]
        next_with_pad = torch.cat([tokens[0][1:], torch.full((1,), 0).cuda()]) # using -1 to denote no next token
        next_tokens.append(next_with_pad)

    acts = torch.cat(acts)  # ["all tokens over all prompts"] 
    corresponding_tokens = torch.cat(corresponding_tokens)
    next_tokens = torch.cat(next_tokens)

    top_acts, indices = torch.topk(acts.abs(), k, dim=0)
    top_tokens = corresponding_tokens[indices]
    top_next_tokens = next_tokens[indices]
    
    return top_acts, top_tokens, top_next_tokens

In [None]:
from collections import Counter

def print_top_k_act_token_frequency_counts(layer, neuron, data, k=100, mlp_stage="hook_pre"):
    top_acts, top_tokens, top_next_tokens = get_top_k_act_tokens_for_neuron(layer, neuron, data, k, mlp_stage=mlp_stage)
    top_token_strings = model.tokenizer.convert_ids_to_tokens(top_tokens)
    top_next_token_strings = model.tokenizer.convert_ids_to_tokens(top_next_tokens)
    print(f"Number of the top {k} L{layer}N{neuron} activations triggered on token:")
    print(Counter(top_token_strings))
    print(f"Number of the top {k} L{layer}N{neuron} activations triggered on token with next token:")
    print(Counter(top_next_token_strings), '\n')

In [None]:
top_act_diff_neurons_l4 = [(4, 1331), (4, 1682), (4, 1248), (4, 683), (4, 482)]
top_act_diff_neurons_l5 = [(5, 1336), (5, 1292), (5, 213), (5, 975), (5, 134)]
with model.hooks(activate_neurons_fwd_hooks):
    for layer, neuron in  top_act_diff_neurons_l4 + top_act_diff_neurons_l5:
        print_top_k_act_token_frequency_counts(layer, neuron, german_data)

Number of the top 100 L4N1331 activations triggered on token:
Counter({'Ġbe': 100})
Number of the top 100 L4N1331 activations triggered on token with next predicted token:
Counter({'Ġdie': 7, 'Ġder': 6, 'en': 4, 'Ġund': 4, '.': 4, ',': 3, 'ĠVer': 3, 'z': 2, '-': 2, 'wick': 2, 'Ċ': 2, 'ite': 1, 'chen': 1, 'Ġden': 1, 'ischen': 1, 'ist': 1, 'ents': 1, 'ung': 1, 'ben': 1, 'hr': 1, 'le': 1, 'b': 1, 'ing': 1, '22': 1, 'let': 1, 'zt': 1, 'f': 1, 'Ġ(': 1, 'k': 1, '"': 1, 'see': 1, 't': 1, 'P': 1, 'Ã¼ck': 1, 'itÃ¤t': 1, 'bind': 1, 'uss': 1, 'are': 1, 'Ġsich': 1, 'ied': 1, 'che': 1, 'wei': 1, 'iert': 1, 'Ġrelevant': 1, 'ierung': 1, 'Ã¼r': 1, 'ĠUnion': 1, 'mun': 1, 'ĠAr': 1, 'ĠHerr': 1, '6': 1, 'ension': 1, 'isch': 1, 'er': 1, 'den': 1, 'Ġand': 1, 'n': 1, 'ak': 1, 'chts': 1, 'MAN': 1, 'ĠM': 1, 'ad': 1, 'pe': 1, 'ie': 1, 'et': 1, 'Analy': 1, 'ÃŁ': 1, 'me': 1, '4': 1, 'vor': 1, 'Ġ=': 1, 'Z': 1})
Number of the top 100 L4N1682 activations triggered on token:
Counter({'e': 28, 'ne': 9, 'is': 9, 'ken':

There are many trends - L4N683 especially activates on one token in particular.

### Is L4N683 primarily a Ġin neuron when the German context neuron is enabled?

In [None]:
haystack_utils.clean_cache()
print_top_k_act_token_frequency_counts(4, 683, german_data, k=1000, mlp_stage='hook_post')
print_top_k_act_token_frequency_counts(4, 683, german_data, k=1000, mlp_stage='hook_pre')

# Trend greatly diminishes with German context neuron deactivated
with model.hooks(deactivate_neurons_fwd_hooks):
    print_top_k_act_token_frequency_counts(4, 683, german_data[:500], k=1000, mlp_stage='hook_post')
    print_top_k_act_token_frequency_counts(4, 683, german_data[:500], k=1000, mlp_stage='hook_pre')

Number of the top 1000 L4N683 activations triggered on token:
Counter({'Ġin': 808, 'Ġauf': 91, 'Ġan': 21, 'ÃŁ': 15, 'zu': 8, 'Ġaus': 6, 'rei': 5, 'ier': 3, 'ĠfÃ¼r': 3, 'ren': 3, 'und': 3, 'Ġreg': 3, 'Ġrein': 3, 'loss': 2, 'ĠIn': 2, 'chs': 2, 'l': 2, 'rem': 2, 'Ġim': 2, 'uen': 2, 'In': 2, 'Ġab': 1, 'Ġ000': 1, 'ilen': 1, 'lung': 1, 'ern': 1, 'oden': 1, 'Ġum': 1, 'Ġnun': 1, '%': 1, 'Ġ%': 1, 'uf': 1, 'igen': 1})
Number of the top 1000 L4N683 activations triggered on token with next token:
Counter({'Ġder': 139, 'Ġden': 105, 'Ġdie': 54, 'Ġdies': 38, 'Ġdem': 23, 'Ġeinem': 21, 'Ġeiner': 19, 'Ġdas': 15, 'ĠBe': 15, 'er': 14, 'ĠK': 13, 'Ġih': 13, 'Ġall': 12, 'Ġein': 10, 'ĠEuropa': 9, 'Ġjed': 8, 'Ġeine': 8, 'Ġse': 8, 'Ġbez': 8, 'Ġand': 8, 'Ġunse': 7, ',': 7, 'Ġz': 7, 'ĠZ': 7, 'ĠS': 6, 'ne': 6, 'ĠAn': 6, 'Ġdieser': 6, 'Ġme': 6, 'Ġd': 5, 'Ġsein': 5, 'Ġeinen': 5, 'ĠD': 5, 'ĠHÃ¶': 5, 'Ġdiese': 5, 'ĠEr': 5, 'ĠB': 4, 'ĠÃľ': 4, 'ĠR': 4, 'ĠFrank': 4, 'ĠF': 4, 'kl': 4, 'z': 4, 'ĠP': 4, 'Ġke': 3, 'ĠItal': 3

In [None]:
top_acts, top_tokens, top_next_tokens = get_top_k_act_tokens_for_neuron(4, 683, german_data, k=1000, mlp_stage='hook_pre')
top_token_strings = model.tokenizer.convert_ids_to_tokens(top_tokens)
gin_acts = [act.item() for act, token in zip(top_acts, top_token_strings) if token == "Ġin"]
mean_gin_act = sum(gin_acts) / len(gin_acts)  # ~3

Get the cosine sim of the neuron's output weights with the unembedding matrix

In [None]:
# Unembed neuron direction directly

# Only works for individual neurons
# Shape batch pos d_resid
layer = 4
neuron = 683
mean_inactive_activation = english_activations[layer][:, neuron].mean()
mean_activate_activation = mean_gin_act

neuron_weight = model.W_out[layer, neuron].view(1, 1, -1)
neuron_direction_active = neuron_weight * MEAN_ACTIVATION_ACTIVE # Set German neuron to activated value (~3)
neuron_direction_inactive = neuron_weight * mean_inactive_activation # Set German neuron to disabled value (~0)

tokens_active = model.unembed(neuron_direction_active)
tokens_inactive = model.unembed(neuron_direction_inactive)
# Active: German neuron is active - we expect German tokens boosted
# Inactive: German neuron is inactive - we expect no boost to German tokens
# Active - Inactive: If the neuron boosts German tokens, we expect this to be positive
token_differences = (tokens_active - tokens_inactive).flatten()

boosted_values, boosted_tokens = torch.topk(token_differences, 1000)
inhibited_values, inhibited_tokens = torch.topk(token_differences, 1000, largest=False)
boosted_labels = model.to_str_tokens(boosted_tokens)
inhibited_labels = model.to_str_tokens(inhibited_tokens)


def filter_garbage_tokens(labels, values):
    """Remove meaningless and whitespace tokens"""
    garbage_characters = {"", "ÃÂÃÂÃÂÃÂ", "ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ", "ÃÂÃÂ", "��", "�", "14514500", "1451450014514500", "{¶"}
    non_whitespace_labels = []
    non_whitespace_values = []
    for label, value in zip(labels, values):
        stripped_label = label.strip()
        if not stripped_label in garbage_characters:
            non_whitespace_labels.append(stripped_label)
            non_whitespace_values.append(value.item())
    return non_whitespace_labels, non_whitespace_values


non_whitespace_boosted_labels, non_whitespace_boosted_values = filter_garbage_tokens(boosted_labels, boosted_values)
non_whitespace_inhibited_labels, non_whitespace_inhibited_values = filter_garbage_tokens(inhibited_labels, inhibited_values)

#px.histogram(token_differences.cpu().numpy(), title="Histogram of L3N669 direct logit difference between original and ablated model", labels={"value": "Logit difference"})

num_tokens = 100
line(non_whitespace_boosted_values[:num_tokens], xlabel="Token", ylabel="Logit increase from context neuron", xticks=non_whitespace_boosted_labels[:num_tokens], title=f"Top boosted tokens from L{layer}N{neuron}", width=1100)
line(non_whitespace_inhibited_values[:num_tokens], xlabel="Token", ylabel="Logit increase from context neuron", xticks=non_whitespace_inhibited_labels[:num_tokens], title=f"Top Deboosted tokens from L{layer}N{neuron}", width=1100)


I'm not sure how many of the boosted tokens are German, although some do look that way. The deboosted ones are obviously English. 

Let's check what the most common token prediction is when this neuron activates.

In [None]:
from collections import Counter

# Written for batch size of 1
# Assumes German
def get_top_k_act_tokens_for_neuron(layer, neuron, data, k=100, mlp_stage="hook_pre") -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    acts = []
    corresponding_tokens = []
    next_tokens = []
    next_predicted_tokens = []

    for prompt in data:
        tokens = model.to_tokens(prompt)
        logits, cache = model.run_with_cache(tokens)
        
        # for each pos activation, append to list
        acts.append(cache[f"blocks.{layer}.mlp.{mlp_stage}"][0, :, neuron]) # append [pos]
        corresponding_tokens.append(tokens[0])  # append [pos]

        next_predicted_tokens_with_pad = torch.cat([logits.argmax(dim=-1)[0], torch.full((1,), 0).cuda()])
        next_predicted_tokens.append(next_predicted_tokens_with_pad)  # append [pos]
        
        next_with_pad = torch.cat([tokens[0][1:], torch.full((1,), 0).cuda()]) # using 0 to denote no next token
        next_tokens.append(next_with_pad)

    acts = torch.cat(acts)  # ["all tokens over all prompts"] 
    corresponding_tokens = torch.cat(corresponding_tokens)
    next_tokens = torch.cat(next_tokens)
    next_predicted_tokens = torch.cat(next_predicted_tokens)

    top_acts, indices = torch.topk(acts.abs(), k, dim=0)
    top_tokens = corresponding_tokens[indices]
    top_next_tokens = next_tokens[indices]
    top_next_predicted_tokens = next_predicted_tokens[indices]
    
    return top_acts, top_tokens, top_next_tokens, top_next_predicted_tokens


def print_top_k_act_token_frequency_counts(layer, neuron, data, k=100, mlp_stage="hook_pre"):
    top_acts, top_tokens, top_next_tokens, top_next_predicted_tokens = get_top_k_act_tokens_for_neuron(layer, neuron, data, k, mlp_stage=mlp_stage)
    top_token_strings = model.tokenizer.convert_ids_to_tokens(top_tokens)
    top_next_predicted_token_strings = model.tokenizer.convert_ids_to_tokens(top_next_predicted_tokens)
    print(f"Number of the top {k} L{layer}N{neuron} activations triggered on token:")
    print(Counter(top_token_strings))
    print(f"Number of the top {k} L{layer}N{neuron} activations triggered on token with next predicted token:")
    print(Counter(top_next_predicted_token_strings))
          

print_top_k_act_token_frequency_counts(4, 683, german_data, k=1000, mlp_stage='hook_post')
print_top_k_act_token_frequency_counts(4, 683, german_data, k=1000, mlp_stage='hook_pre')

Number of the top 1000 L4N683 activations triggered on token:
Counter({'Ġin': 808, 'Ġauf': 91, 'Ġan': 21, 'ÃŁ': 15, 'zu': 8, 'Ġaus': 6, 'rei': 5, 'ier': 3, 'ĠfÃ¼r': 3, 'ren': 3, 'und': 3, 'Ġreg': 3, 'Ġrein': 3, 'loss': 2, 'ĠIn': 2, 'chs': 2, 'l': 2, 'rem': 2, 'Ġim': 2, 'uen': 2, 'In': 2, 'Ġab': 1, 'Ġ000': 1, 'ilen': 1, 'lung': 1, 'ern': 1, 'oden': 1, 'Ġum': 1, 'Ġnun': 1, '%': 1, 'Ġ%': 1, 'uf': 1, 'igen': 1})
Number of the top 1000 L4N683 activations triggered on token with next predicted token:
Counter({',': 81, 'Ġdie': 56, 'Ġder': 49, '.': 32, 'Ġ(': 20, 'ĠVer': 18, 'en': 15, 'Ġund': 14, 'ung': 13, 'Ċ': 12, '-': 12, ')': 10, 'n': 9, 'w': 9, 'ĠW': 8, 'Ġin': 8, 'Ã¤': 6, 'h': 6, 'ĠK': 6, 'ie': 6, 't': 5, 'ischen': 5, 'Ġist': 5, 'te': 5, 'z': 5, 'Ġde': 5, 'k': 4, 'it': 4, 'd': 4, 'Ġ-': 4, 'ren': 4, 'lich': 4, 'icht': 4, 'ĠZ': 4, 'er': 4, 'ch': 4, 'ten': 4, 'Ġwerden': 4, '/': 4, 'Ġvon': 4, 'hen': 3, 'ungen': 3, 'Ġver': 3, 'B': 3, 'W': 3, 'agen': 3, 'i': 3, 'ige': 3, 'ver': 3, '12': 3, 'ĠN':

In [None]:
# Written for batch size of 1
# Assumes German
def get_top_k_prompt_acts_for_neuron(layer, neuron, data, k=100, mlp_stage="hook_pre") -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    acts = []
    prompts = []

    for prompt in data:
        tokens = model.to_tokens(prompt)
        _, cache = model.run_with_cache(tokens)
        
        # for each pos activation, append to list
        acts.append(cache[f"blocks.{layer}.mlp.{mlp_stage}"][0, :, neuron]) # append [pos]
        prompts += [prompt for _ in range(tokens.shape[1])]  # append [pos]

    acts = torch.cat(acts)  # ["all tokens over all prompts"] 

    top_acts, indices = torch.topk(acts.abs(), k, dim=0)
    
    return top_acts, [prompts[i] for i in indices.tolist()]
          

acts, prompts = get_top_k_prompt_acts_for_neuron(5, 1292, german_data, k=10, mlp_stage='hook_post')

10 890983


In [None]:
print(prompts)
# tokens it lowers loss for

["[Eng]-DUQA(1).avi, You Don't Mess with the Zohan.avi, You Don't Mess With The Zohans.avi, You Don't Mess With Zohan.avi, You don't.avi, you dont mess whith the zohan.avi, you dont mess with the zohan (2008) r5 line xvid-kamera (peerweb org).avi, You Dont Mess With The Zohan (2008) R5 LiNE XviD.avi, You Dont Mess with the Zohan (2008) R5.LiNE.XviD-KAMERA.avi, You dont mess with the zohan (2008).avi, You Dont Mess With The Zohan (2008)(www.BirjandDVD.com).avi, You Dont Mess With The Zohan [DVDRip 2008].avi, You Dont Mess With The Zohan [Zohan'a Bulasma ] 2008.avi, You Dont Mess With The Zohan ~ R5.avi, You Dont Mess With The Zohan R5 LiNE KAMERA.avi, You Dont Mess With The Zohan R5 XVID - STG.avi, You dont mess with the zohan xvid-kamera [www.download24.se].avi, You dont mess with the Zohan.avi, you dont mess with the zohan(2).avi, You Dont Mess with Zohan R5 Antoniel.avi, You Dont Mess With Zohan.avi, you dont mess.avi, you dont.avi, you_dont_mess_with_the_zohan_r5_line_xvid-kamera.av