## Setup

In [2]:
import torch
import numpy as np
from torch import einsum
from tqdm.auto import tqdm
import seaborn as sns
from transformer_lens import HookedTransformer, ActivationCache
from datasets import load_dataset
from einops import einsum
import pandas as pd
from transformer_lens import utils
from rich.table import Table, Column
from rich import print as rprint
from jaxtyping import Float, Int, Bool
from torch import Tensor
import einops
import functools
from transformer_lens.hook_points import HookPoint
# import circuitsvis
from IPython.display import HTML
from plotly.express import line
import plotly.express as px
from tqdm.auto import tqdm
import json
import gc
import plotly.graph_objects as go

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Plotly needs a different renderer for VSCode/Notebooks vs Colab argh
import plotly.io as pio
pio.renderers.default = "notebook_connected"
device = "cuda" if torch.cuda.is_available() else "cpu"

## Model and data loading

In [12]:
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fdea0657a90>

In [11]:
model = HookedTransformer.from_pretrained("pythia-70m-v0", fold_ln=True, device=device)

Using pad_token, but it is not set yet.


Loaded pretrained model pythia-70m-v0 into HookedTransformer


In [13]:
def load_pile(stream=False):
    dataset = load_dataset("EleutherAI/pile", name="europarl", split="train")
    dataset = dataset.shuffle(seed=42)
    return dataset

def get_random_samples(dataset, language="fr", n=100, min_length=100, max_length=1000):
    pbar = tqdm(total=2*n)
    language_data = []
    other_data = []
    for i, example in enumerate(dataset):
        sentence_language = example["meta"][-4:-2]
        sentence = example["text"]
        if (len(sentence) >= min_length) and (len(sentence) <= max_length):
            if (len(language_data) < n) and (sentence_language==language):
                language_data.append(sentence)
                pbar.update(1)
            elif (len(other_data) < n):
                other_data.append(sentence)
                pbar.update(1)
        if (len(language_data) >= n) and (len(other_data) >= n):
            pbar.close()
            return language_data, other_data
    pbar.close()
    return language_data, other_data

In [None]:
# dataset = load_pile()
# french_data, non_french_data = get_random_samples(dataset, n=1000, min_length=200, max_length=3000)
# print(len(french_data), len(non_french_data))

In [14]:
def write_data(data: list[str], path: str):
    with open(path, 'w') as f:
        json.dump(data, f)

def load_data(path: str):
    with open(path, 'r') as f:
        return json.load(f)

In [15]:
# write_data(french_data, "french_data.json")
# write_data(non_french_data, "non_french_data.json")

In [16]:
french_data = load_data("french_data.json")
non_french_data = load_data("non_french_data.json")


In [17]:
min([len(data) for data in french_data])

200

In [18]:
max([len(data) for data in french_data])

2988

In [19]:
for data in french_data:
    if len(data) == 2988:
        print(model.to_tokens(data).shape)
    if len(data)== 200:
        print(model.to_tokens(data).shape)

torch.Size([1, 68])
torch.Size([1, 49])
torch.Size([1, 60])
torch.Size([1, 857])
torch.Size([1, 58])


## Sparse probing to rediscover the french neuron

In [20]:
def clean_cache():
    gc.collect()
    torch.cuda.empty_cache()

In [21]:
act_label = f'blocks.3.mlp.hook_post'

In [22]:
def get_activations(prompts: list[str], act_label: str, model: HookedTransformer, n: int = -1):
    acts = []
    if n == -1:
        n = len(prompts)
    for i in tqdm(range(n)):
        tokens = model.to_tokens(prompts[i])
        _, cache = model.run_with_cache(tokens)
        # Remove activation from BOS token
        act = cache[act_label][:, 1:, :]
        act = einops.rearrange(act, "batch pos d_mlp -> (batch pos) d_mlp")
        acts.append(act)
    return torch.concat(acts, dim=0)

In [23]:
french_activations = get_activations(french_data, act_label, model, n=200)
non_french_activations = get_activations(non_french_data, act_label, model, n=200)

print(french_activations.shape, non_french_activations.shape)
smaller_size = min(len(french_activations), len(non_french_activations))

A = torch.concat((french_activations[:smaller_size], non_french_activations[:smaller_size]), dim=0).cpu().numpy()
y = np.array([1]*smaller_size + [0]*smaller_size)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

torch.Size([66194, 2048]) torch.Size([82906, 2048])


In [24]:
def get_important_neurons(A_train: np.ndarray, y_train: np.ndarray, k: int):
    # score each activation dimension by the difference between the means of
    # the classes 0 and 1
    A_train, y_train = torch.Tensor(A_train), torch.Tensor(y_train)
    class_0_mean = A_train[y_train == 0].mean(dim=0)
    class_1_mean = A_train[y_train == 1].mean(dim=0)
    scores = (class_1_mean - class_0_mean).abs()
    top_k_scores, top_k_indices = torch.topk(scores, k)
    return top_k_indices, top_k_scores

In [25]:
k = 10
test_accs = []
train_accs = []
for i in range(1):
    A_train, A_test, y_train, y_test = train_test_split(A, y, test_size=0.2)
    top_k_indices, top_k_scores = get_important_neurons(A_train, y_train, k)
    lr_model = LogisticRegression()
    topk = top_k_indices.cpu().numpy().tolist()
    A_train = A_train[:, topk]
    A_test = A_test[:, topk]
    lr_model.fit(A_train, y_train)
    test_acc = lr_model.score(A_test, y_test)
    train_acc = lr_model.score(A_train, y_train)
    test_accs.append(test_acc)
    train_accs.append(train_acc)

print(topk)
print(np.mean(train_accs))

[609, 1116, 184, 719, 1932, 1361, 678, 1455, 644, 24]
0.9762628646964404


In [26]:
def get_top_individual_neuron_accuracy(k=10, n=5):
  test_accs = np.zeros((n, k))
  train_accs = np.zeros((n, k))
  for i in tqdm(range(n)):
      A_train, A_test, y_train, y_test = train_test_split(A, y, test_size=0.1)
      top_k_indices, top_k_scores = get_important_neurons(A_train, y_train, k)
      lr_model = LogisticRegression()
      topk = top_k_indices.cpu().numpy().tolist()
      for k_index, neuron in enumerate(topk):
          A_train_tmp = A_train[:, [neuron]]
          A_test_tmp = A_test[:, [neuron]]
          lr_model.fit(A_train_tmp, y_train)
          test_acc = lr_model.score(A_test_tmp, y_test)
          train_acc = lr_model.score(A_train_tmp, y_train)
          test_accs[i, k_index] = test_acc
          train_accs[i, k_index] = train_acc
  return train_accs, test_accs, topk


train_accs, test_accs, topk = get_top_individual_neuron_accuracy(n=5, k=20)

  0%|          | 0/5 [00:00<?, ?it/s]

In [27]:
def line(x, xlabel="", ylabel="", title="", xticks=None, width=800):
    fig = px.line(x, title=title)
    fig.update_layout(xaxis_title=xlabel, yaxis_title=ylabel, width=width)
    if xticks != None:
        fig.update_layout(
            xaxis = dict(
            tickmode = 'array',
            tickvals = [i for i in range(len(xticks))],
            ticktext = xticks
            )
        )
    return fig

In [28]:
line(train_accs.mean(0), ylabel="Accuracy", xlabel="Neuron", title="Per neuron accuracy on French vs. non-French", xticks=topk)

## Test ablation performance

In [29]:
french_activation_L3N609 = french_activations[:, 609]
non_french_activation_L3N609 = non_french_activations[:, 609]
mean_french_activation_L3N609 = french_activation_L3N609.mean(0)
mean_non_french_activation_L3N609 = non_french_activation_L3N609.mean(0)
print("French activation:", mean_french_activation_L3N609.mean(0).item())
print("Non french activation:", mean_non_french_activation_L3N609.mean(0).item())

French activation: 2.7501614093780518
Non french activation: -0.05527174100279808


In [30]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=french_activation_L3N609.cpu().numpy(), histnorm='percent', name="French"))
fig.add_trace(go.Histogram(x=non_french_activation_L3N609.cpu().numpy(), histnorm='percent', name="Other"))

fig.update_layout(
    width=800,
    title_text='L3N609 activations', # title of plot
    xaxis_title_text='Neuron activation', # xaxis label
    yaxis_title_text='Probability', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)
fig.show()

In [31]:
with open("french_stories.txt", "r") as f:
    stories_data = f.read().split("\n")
print(len(stories_data))
print(stories_data[0])

20
Emma était une petite abeille curieuse qui aimait explorer les fleurs du jardin. Un jour, elle rencontra une coccinelle timide nommée Léa. Ensemble, elles découvrirent un trésor caché sous une feuille et devinrent les meilleures amies du monde.


In [32]:
def get_ablated_performance(data: list[str], model:HookedTransformer, batch_size=1):
    # Need to rerun activation cache to change
    layer = 3
    neurons = [609]

    def ablate_neuron_hook(value, hook):
        value[:, :, neurons] = mean_non_french_activation_L3N609
        return value

    original_losses = []
    ablated_losses = []
    dataloader = torch.utils.data.DataLoader(data, batch_size=batch_size)

    for i, batch in enumerate(dataloader):
        tokens = model.to_tokens(batch)

        original_loss = model(tokens, return_type="loss")
        ablated_loss = model.run_with_hooks(tokens, return_type="loss", fwd_hooks=[(f'blocks.{layer}.mlp.hook_post', ablate_neuron_hook)])

        original_losses.append(original_loss.item())
        ablated_losses.append(ablated_loss.item())

    print(f"Full model loss: {np.mean(original_losses):.6f}")
    print(f"Ablated MLP layer loss: {np.mean(ablated_losses):.6f}")
    print(f"Percent loss increase: {((np.mean(ablated_losses) - np.mean(original_losses)) / np.mean(original_losses)):.4f}")

In [33]:
get_ablated_performance(french_data, model=model, batch_size=32)

Full model loss: 6.652228
Ablated MLP layer loss: 6.741127
Percent loss increase: 0.0134


In [34]:
get_ablated_performance(stories_data, model=model, batch_size=1)

Full model loss: 3.753950
Ablated MLP layer loss: 4.045552
Percent loss increase: 0.0777


In [35]:
def ablate_neuron_hook(value, hook):
  value[:, :, 609] = mean_non_french_activation_L3N609
  return value

test_prompt = "Dans un petit village, vivait une petite fille nommée Sophie qui rêvait"
tokens = model.to_tokens(test_prompt)

print("Original model")
utils.test_prompt(test_prompt, ' de', model)

print("\nAblated model")
with model.hooks(fwd_hooks=[('blocks.3.mlp.hook_post', ablate_neuron_hook)]):
  utils.test_prompt(test_prompt, ' de', model)

Original model
Tokenized prompt: ['<|endoftext|>', 'D', 'ans', ' un', ' pet', 'it', ' village', ',', ' viv', 'ait', ' une', ' pet', 'ite', ' fil', 'le', ' nom', 'm', 'ée', ' Sophie', ' qui', ' r', 'ê', 'v', 'ait']
Tokenized answer: [' de']


Top 0th token. Logit: 15.59 Prob:  8.74% Token: | à|
Top 1th token. Logit: 15.49 Prob:  7.94% Token: | de|
Top 2th token. Logit: 15.07 Prob:  5.19% Token: |,|
Top 3th token. Logit: 14.75 Prob:  3.76% Token: | d|
Top 4th token. Logit: 14.66 Prob:  3.43% Token: |.|
Top 5th token. Logit: 14.59 Prob:  3.21% Token: | en|
Top 6th token. Logit: 14.47 Prob:  2.84% Token: | un|
Top 7th token. Logit: 14.46 Prob:  2.83% Token: | la|
Top 8th token. Logit: 14.45 Prob:  2.80% Token: | pour|
Top 9th token. Logit: 14.44 Prob:  2.76% Token: | le|



Ablated model
Tokenized prompt: ['<|endoftext|>', 'D', 'ans', ' un', ' pet', 'it', ' village', ',', ' viv', 'ait', ' une', ' pet', 'ite', ' fil', 'le', ' nom', 'm', 'ée', ' Sophie', ' qui', ' r', 'ê', 'v', 'ait']
Tokenized answer: [' de']


Top 0th token. Logit: 14.27 Prob:  7.53% Token: | de|
Top 1th token. Logit: 14.24 Prob:  7.37% Token: |,|
Top 2th token. Logit: 14.14 Prob:  6.65% Token: | à|
Top 3th token. Logit: 13.80 Prob:  4.73% Token: |.|
Top 4th token. Logit: 13.57 Prob:  3.77% Token: | un|
Top 5th token. Logit: 13.36 Prob:  3.06% Token: | en|
Top 6th token. Logit: 13.25 Prob:  2.74% Token: | la|
Top 7th token. Logit: 13.22 Prob:  2.63% Token: | d|
Top 8th token. Logit: 13.16 Prob:  2.49% Token: | a|
Top 9th token. Logit: 12.70 Prob:  1.57% Token: | le|


## Reverse engineer 609

In [36]:
french_neuron_output_direction = model.W_out[3, 609, :].unsqueeze(0).unsqueeze(2)
later_mlp_inputs = model.W_in[4:]

In [37]:
def get_top_scores(scores, k=10):
    print("Layer 4:")
    _, top_neurons_4 = torch.topk(scores[0].abs().cpu(), k)
    print("Top neurons: " + " ".join(f"{index} ({scores[0, index]:.2f})," for index in top_neurons_4.tolist()))

    print("\nLayer 5:")
    _, top_neurons_5 = torch.topk(scores[1].abs().cpu(), k)
    print("Top neurons: " + " ".join(f"{index} ({scores[1, index]:.2f})," for index in top_neurons_5.tolist()))

    return top_neurons_4, top_neurons_5

In [38]:
print("High cosine similarity with L3N609")
cos = torch.nn.CosineSimilarity(dim=1)
similarity = cos(later_mlp_inputs, french_neuron_output_direction)

top_neurons_cos_4, top_neurons_cos_5 = get_top_scores(similarity)

px.histogram(similarity.flatten().cpu(), title="Cosine similarity of L4N609 with later neurons", width=800)

High cosine similarity with L3N609
Layer 4:
Top neurons: 1189 (-0.23), 310 (-0.23), 449 (-0.20), 233 (-0.19), 1813 (-0.19), 527 (-0.18), 406 (-0.17), 1977 (-0.17), 910 (0.17), 1933 (0.16),

Layer 5:
Top neurons: 8 (0.24), 1342 (-0.23), 273 (0.20), 1731 (0.18), 670 (0.18), 929 (0.16), 395 (0.16), 1525 (-0.16), 1150 (-0.16), 658 (-0.16),


In [39]:
print("High dot product with L3N609")
dot = einops.einsum(french_neuron_output_direction, later_mlp_inputs, "layer res mlp, layer2 res mlp2 -> layer2 mlp2")

top_neurons_dot_4, top_neurons_dot_5 = get_top_scores(dot)

px.histogram(dot.flatten().cpu(), title="Dot product of L4N609 with later neurons", width=800)

High dot product with L3N609
Layer 4:
Top neurons: 1189 (-0.20), 310 (-0.12), 910 (0.10), 1813 (-0.10), 527 (-0.09), 449 (-0.09), 354 (0.09), 225 (-0.09), 788 (0.09), 406 (-0.09),

Layer 5:
Top neurons: 1230 (-0.15), 8 (0.14), 1342 (-0.13), 273 (0.13), 395 (0.12), 670 (0.11), 389 (0.11), 330 (0.10), 1525 (-0.10), 1731 (0.10),


In [40]:
# Get average logit difference of high similarity neurons compared to other neurons

def get_ablated_cache(data: list[str], model: HookedTransformer):
    # Need to rerun activation cache to change
    layer = 3
    neurons = [609]

    def ablate_neuron_hook(value, hook):
        value[:, :, neurons] = mean_non_french_activation_L3N609
        return value
    
    tokens = model.to_tokens(data)

    _, original_cache = model.run_with_cache(tokens, return_type="loss")

    with model.hooks(fwd_hooks=[(f'blocks.{layer}.mlp.hook_post', ablate_neuron_hook)]):
        _, ablated_cache = model.run_with_cache(tokens, return_type="loss")

    return original_cache, ablated_cache

original_cache, ablated_cache = get_ablated_cache(stories_data, model)

In [41]:
def compare_activation_difference(original_cache, ablated_cache, layer, neurons):
    block_name = f'blocks.{layer}.mlp.hook_post'
    original_activations = original_cache[block_name]
    ablated_activations = ablated_cache[block_name]
    inverted_indices = [i for i in range(model.cfg.d_mlp) if i not in neurons.tolist()]

    selected_original = original_activations[:, 1:, neurons]
    other_original = original_activations[:, 1:, inverted_indices]

    selected_ablated = ablated_activations[:, 1:, neurons]
    other_ablated = ablated_activations[:, 1:, inverted_indices]

    selected_difference = selected_original.mean((0, 1))-selected_ablated.mean((0, 1))
    other_difference = other_original.mean((0, 1))-other_ablated.mean((0, 1))
    print("Top neuron difference", selected_difference)
    print("Average neuron difference", other_difference.mean())

    fig = go.Figure()
    fig.add_trace(go.Histogram(x=other_difference.cpu().numpy(), histnorm='percent', name="Other neurons"))
    fig.add_trace(go.Histogram(x=selected_difference.cpu().numpy(), histnorm='percent', name="Top similarity neurons"))
    

    fig.update_layout(
        width=800,
        title_text='L3N609 ablation effects on later neurons', # title of plot
        xaxis_title_text='Difference in activation', # xaxis label
        yaxis_title_text='Probability', # yaxis label
    )
    fig.show()

compare_activation_difference(original_cache, ablated_cache, layer=4, neurons=top_neurons_cos_4)

Top neuron difference tensor([-0.2685,  0.0011,  0.0440,  0.0213,  0.0379,  0.0240,  0.0334,  0.0263,
         0.4147,  0.4421], device='cuda:0')
Average neuron difference tensor(0.0099, device='cuda:0')


## DLA

In [42]:
model.reset_hooks()

french_prompt = " Le livre"
english_prompt = " The bookstore"

utils.test_prompt(french_prompt, ' de', model)
utils.test_prompt(english_prompt, ' is', model)

Tokenized prompt: ['<|endoftext|>', ' Le', ' liv', 're']
Tokenized answer: [' de']


Top 0th token. Logit: 14.79 Prob:  9.22% Token: | de|
Top 1th token. Logit: 14.62 Prob:  7.74% Token: |ment|
Top 2th token. Logit: 14.00 Prob:  4.18% Token: | à|
Top 3th token. Logit: 13.87 Prob:  3.67% Token: | du|
Top 4th token. Logit: 13.85 Prob:  3.59% Token: |,|
Top 5th token. Logit: 13.80 Prob:  3.41% Token: |-|
Top 6th token. Logit: 13.59 Prob:  2.78% Token: | d|
Top 7th token. Logit: 13.56 Prob:  2.68% Token: | et|
Top 8th token. Logit: 13.44 Prob:  2.38% Token: | :|
Top 9th token. Logit: 13.40 Prob:  2.29% Token: | en|


Tokenized prompt: ['<|endoftext|>', ' The', ' book', 'store']
Tokenized answer: [' is']


Top 0th token. Logit: 14.36 Prob:  9.55% Token: | is|
Top 1th token. Logit: 14.17 Prob:  7.90% Token: | was|
Top 2th token. Logit: 13.94 Prob:  6.26% Token: | of|
Top 3th token. Logit: 13.58 Prob:  4.35% Token: | has|
Top 4th token. Logit: 13.52 Prob:  4.10% Token: |
|
Top 5th token. Logit: 13.33 Prob:  3.40% Token: | in|
Top 6th token. Logit: 13.10 Prob:  2.71% Token: | chain|
Top 7th token. Logit: 13.06 Prob:  2.60% Token: |,|
Top 8th token. Logit: 12.69 Prob:  1.79% Token: | and|
Top 9th token. Logit: 12.67 Prob:  1.76% Token: |:|


In [43]:
english_logits, english_cache = model.run_with_cache(english_prompt)

answer_tokens = model.to_tokens([" is", " de"], prepend_bos=False).flatten().unsqueeze(0)


In [44]:
def get_logit_diff_directions(model, answer_tokens) -> Float[Tensor, "batch d_model"]:
    """
    The ideal logit difference direction in the unembed, formed by substracting the incorrect answer token's direction from the correct answer token's direction.
    """

    # Token unembed with layer norm handled
    answer_residual_directions = model.tokens_to_residual_directions(answer_tokens)
    correct_residual_directions, incorrect_residual_directions = answer_residual_directions.unbind(dim=1)
    logit_diff_directions = correct_residual_directions - incorrect_residual_directions
    # print("Answer residual directions shape:", answer_residual_directions.shape)
    # print(f"Logit difference directions shape:", logit_diff_directions.shape)
    return logit_diff_directions


logit_diff_directions = get_logit_diff_directions(model, answer_tokens)

In [45]:
def residual_stack_to_logit_diff(
    residual_stack: Float[Tensor, "... batch d_model"],
    cache: ActivationCache,
    logit_diff_directions: Float[Tensor, "batch d_model"] = logit_diff_directions,
) -> Float[Tensor, "..."]:
    '''
    Gets the avg logit difference between the correct and incorrect answer for a given
    stack of components in the residual stream.
    '''
    batch_size = residual_stack.size(-2)
    scaled_residual_stack = cache.apply_ln_to_stack(residual_stack, layer=-1, pos_slice=-1)
    return einops.einsum(
        scaled_residual_stack, logit_diff_directions,
        "... batch d_model, batch d_model -> ..."
    ) / batch_size

In [46]:
per_layer_residual, labels = english_cache.decompose_resid(layer=-1, pos_slice=-1, return_labels=True)
per_layer_logit_diffs = residual_stack_to_logit_diff(per_layer_residual, english_cache)

line(
    per_layer_logit_diffs.detach().cpu().numpy(),
    title="Logit Difference From Each Layer",
    xlabel="Component", ylabel="Logit Diff",
    width=800
)

In [47]:
# For a single French prompt, show how ablating the French neuron L3N609 affects the log likelihood of the model predicting the French token "de" vs. the English
# token "of". When we calculated the logit diff directions we pass in "de" as the first token, so a positive value indicates that the probability of "de" is higher
# for that run.
# 
# Therefore this graph shows that the French neuron affects components 8 and 12, which correspond to 3_mlp_out and 5_mlp_out. At MLP3, disabling the French neuron
# causes the output to align more with the English "of" and less with the French "de", as expected. But at MLP5, it has the opposite effect, causing the output to align more with
# the French "de". This is unexpected!

# Make the French token the correct answer, as we're about to intervene on the French prompt
answer_tokens = model.to_tokens([" de", " is"], prepend_bos=False).flatten().unsqueeze(0)
logit_diff_directions = get_logit_diff_directions(model, answer_tokens)

french_cache, ablated_french_cache = get_ablated_cache(french_prompt, model)

ablated_french_per_layer_residual, ablated_french_labels = ablated_french_cache.decompose_resid(layer=-1, pos_slice=-1, return_labels=True)
french_per_layer_residual, french_labels = french_cache.decompose_resid(layer=-1, pos_slice=-1, return_labels=True)

french_per_layer_logit_diffs = residual_stack_to_logit_diff(french_per_layer_residual, french_cache, logit_diff_directions).detach().cpu().numpy(),
ablated_french_per_layer_logit_diffs = residual_stack_to_logit_diff(ablated_french_per_layer_residual, ablated_french_cache, logit_diff_directions).detach().cpu().numpy(),

df = pd.DataFrame({
        "Logit diffs": french_per_layer_logit_diffs[0],
        "Logit diffs with L3N609 disabled": ablated_french_per_layer_logit_diffs[0]
})

fig = px.line(df, y=["Logit diffs", "Logit diffs with L3N609 disabled"]) # , color_discrete_sequence=['red', 'blue'] # 'name': ['Unablated French', 'Ablated French'],
fig.show()

print("Component 8 label: ", french_labels[8])
print("Component 12 label: ", french_labels[12])
print("Activation of L3N609 on French prompt, we expect > 2 after the first few tokens: ", french_cache[utils.get_act_name('post', 3)][0, :, 609])

Component 8 label:  3_mlp_out
Component 12 label:  5_mlp_out
Activation of L3N609 on French prompt, we expect > 2 after the first few tokens:  tensor([-0.1639,  1.2233,  3.4107,  3.7054], device='cuda:0')


In [48]:
# This could help us understand why disabling the french neuron increases the probability of "de" over "of", rather than increasing it. We see that "de" remains
# the most likely output in both cases, but its probability increases slightly when the French neuron is disabled. 

# The next step here is to build a larger dataset so we can average this process over many cases and determine whether the weird results are reliable.

def ablate_neuron_hook(value, hook):
    value[:, :, 609] = mean_non_french_activation_L3N609
    return value

utils.test_prompt(french_prompt, ' de', model)
with model.hooks(fwd_hooks=[(f'blocks.3.mlp.hook_post', ablate_neuron_hook)]):
    utils.test_prompt(french_prompt, ' de', model)

Tokenized prompt: ['<|endoftext|>', ' Le', ' liv', 're']
Tokenized answer: [' de']


Top 0th token. Logit: 14.79 Prob:  9.22% Token: | de|
Top 1th token. Logit: 14.62 Prob:  7.74% Token: |ment|
Top 2th token. Logit: 14.00 Prob:  4.18% Token: | à|
Top 3th token. Logit: 13.87 Prob:  3.67% Token: | du|
Top 4th token. Logit: 13.85 Prob:  3.59% Token: |,|
Top 5th token. Logit: 13.80 Prob:  3.41% Token: |-|
Top 6th token. Logit: 13.59 Prob:  2.78% Token: | d|
Top 7th token. Logit: 13.56 Prob:  2.68% Token: | et|
Top 8th token. Logit: 13.44 Prob:  2.38% Token: | :|
Top 9th token. Logit: 13.40 Prob:  2.29% Token: | en|


Tokenized prompt: ['<|endoftext|>', ' Le', ' liv', 're']
Tokenized answer: [' de']


Top 0th token. Logit: 13.94 Prob: 10.21% Token: | de|
Top 1th token. Logit: 13.68 Prob:  7.84% Token: | a|
Top 2th token. Logit: 13.04 Prob:  4.15% Token: | en|
Top 3th token. Logit: 13.04 Prob:  4.14% Token: |,|
Top 4th token. Logit: 12.72 Prob:  3.00% Token: |
|
Top 5th token. Logit: 12.46 Prob:  2.31% Token: | que|
Top 6th token. Logit: 12.27 Prob:  1.92% Token: | à|
Top 7th token. Logit: 12.18 Prob:  1.76% Token: | le|
Top 8th token. Logit: 12.15 Prob:  1.71% Token: | y|
Top 9th token. Logit: 11.79 Prob:  1.18% Token: | l|


In [84]:
dataloader = torch.utils.data.DataLoader(french_data, batch_size=32)
for batch in dataloader:
    tokens = model.to_tokens(batch) # [32 num_tokens]
    print(tokens[:2, -5:])
    print(model.tokenizer.pad_token_id)
    break

tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]], device='cuda:0')
0


In [166]:
# Show average activations of French neuron L3N609 at each position over the French data
# Tests look reasonable 
def get_L3N609_position_activations(data: list[str], model:HookedTransformer, batch_size=1, layer=3, neuron=609):
    position_counts = torch.zeros(model.cfg.n_ctx).cuda()
    position_activations = torch.zeros(model.cfg.n_ctx).cuda()
    position_loss = torch.zeros(model.cfg.n_ctx).cuda()

    def record_neuron_hook(value, hook):  
        position_activations[0:value.shape[1]] += value[:, :, neuron].sum(dim=0)
        return value

    dataloader = torch.utils.data.DataLoader(data, batch_size=batch_size)
    for batch in dataloader:
        tokens = model.to_tokens(batch)  # [batch_size num_tokens]
        loss = model.run_with_hooks(tokens, return_type="loss", loss_per_token=True, fwd_hooks=[(f'blocks.{layer}.mlp.hook_post', record_neuron_hook)])
        
        mask = tokens[:, :] != model.tokenizer.pad_token_id
        mask[:, 0] = True  # BOS token ID is equivalent to pad token ID, we can set it to true because we know it exists for every prompt

        # set loss to 0 where we have the pad token
        pad_tokens = tokens[:, :-1] == model.tokenizer.pad_token_id
        pad_tokens[:, 0] = False
        loss[pad_tokens] = 0

        position_loss[0:loss.shape[1]] += loss.sum(dim=0)
        
        position_counts[0:mask.shape[1]] += mask[:, :].sum(dim=0).float()

    return position_activations, position_counts, position_loss


position_activations, position_counts, position_loss = get_L3N609_position_activations(french_data, model=model, batch_size=32, layer=3, neuron=609)

avg_position_loss = []
avg_position_activations = []
for i in range(len(position_activations)):
    avg_position_activations.append((position_activations[i] / position_counts[i]).item() if position_counts[i] != 0 else 0)
    avg_position_loss.append((position_loss[i] / position_counts[i]).item() if position_counts[i] != 0 else 0)

print(avg_position_activations)
print(avg_position_loss)

[-0.16390757262706757, -0.10645817220211029, 0.16475825011730194, 0.44915491342544556, 0.9846596717834473, 1.4766491651535034, 1.6938731670379639, 1.8331472873687744, 1.743856430053711, 1.883286714553833, 1.950778603553772, 1.9530599117279053, 1.8880245685577393, 1.990222692489624, 1.9025884866714478, 1.931817889213562, 2.000941038131714, 1.978959560394287, 1.8411600589752197, 1.9890872240066528, 1.957017421722412, 1.9630643129348755, 1.9432423114776611, 1.921679973602295, 1.8833413124084473, 1.8810429573059082, 1.9141359329223633, 1.883362889289856, 1.8576754331588745, 1.8974932432174683, 1.8692355155944824, 1.8722950220108032, 1.9294440746307373, 1.9009696245193481, 1.919989824295044, 1.952497124671936, 1.9926283359527588, 2.024959087371826, 1.9769965410232544, 1.9583024978637695, 2.0048234462738037, 2.036039352416992, 2.0451040267944336, 2.1278066635131836, 2.2111682891845703, 2.188581943511963, 2.2103614807128906, 2.2275032997131348, 2.2914206981658936, 2.286804676055908, 2.3349027

In [137]:
# def test_get_L3N609_position_activations():
#     # run with two data points
#     position_activations, position_counts = get_L3N609_position_activations(french_data[:2], model=model, batch_size=32)
#     # print(position_activations[:3])
#     # print(position_counts[:3])

#     avg_position_activations = []
#     for i in range(3):
#         avg_position_activations.append((position_activations[i] / position_counts[i]).item() if position_counts[i] != 0 else 0)
#     print(avg_position_activations)

#     position_activations, position_counts = get_L3N609_position_activations([french_data[0]], model=model, batch_size=32)
#     print(position_activations[:3])
#     position_activations, position_counts = get_L3N609_position_activations([french_data[1]], model=model, batch_size=32)
#     print(position_activations[:3])

# test_get_L3N609_position_activations()

[-0.16390757262706757, -0.1600591391324997, 0.41021209955215454]
tensor([-0.1639, -0.1538,  0.9818], device='cuda:0')
tensor([-0.1639, -0.1663, -0.1614], device='cuda:0')


In [163]:
print(["%.2f" % item for item in avg_position_activations])
print("%.2f" % avg_position_activations[200])

print(position_counts[830:900])

line(avg_position_activations[:900], xlabel="Token Position", ylabel="Activation Value", title="Average activation value of French neuron L3N609 on French text", xticks=None, width=800)

['-0.16', '-0.11', '0.16', '0.45', '0.98', '1.48', '1.69', '1.83', '1.74', '1.88', '1.95', '1.95', '1.89', '1.99', '1.90', '1.93', '2.00', '1.98', '1.84', '1.99', '1.96', '1.96', '1.94', '1.92', '1.88', '1.88', '1.91', '1.88', '1.86', '1.90', '1.87', '1.87', '1.93', '1.90', '1.92', '1.95', '1.99', '2.02', '1.98', '1.96', '2.00', '2.04', '2.05', '2.13', '2.21', '2.19', '2.21', '2.23', '2.29', '2.29', '2.33', '2.35', '2.43', '2.47', '2.50', '2.47', '2.54', '2.52', '2.53', '2.52', '2.51', '2.55', '2.57', '2.67', '2.67', '2.68', '2.64', '2.74', '2.77', '2.71', '2.73', '2.76', '2.79', '2.77', '2.78', '2.81', '2.80', '2.82', '2.81', '2.81', '2.87', '2.82', '2.78', '2.83', '2.84', '2.84', '2.83', '2.79', '2.77', '2.83', '2.81', '2.82', '2.85', '2.87', '2.88', '2.85', '2.88', '2.88', '2.83', '2.87', '2.81', '2.80', '2.87', '2.85', '2.88', '2.85', '2.85', '2.86', '2.83', '2.81', '2.83', '2.88', '2.82', '2.86', '2.85', '2.81', '2.89', '2.84', '2.79', '2.89', '2.85', '2.82', '2.83', '2.80', '2.77

In [164]:
line(avg_position_loss[:900], xlabel="Token Position", ylabel="Loss", title="Average loss on French text", xticks=None, width=800)


In [169]:
position_activations, position_counts, position_loss = get_L3N609_position_activations(french_data, model=model, batch_size=32, layer=3, neuron=184)

avg_position_loss = []
avg_position_activations = []
for i in range(len(position_activations)):
    avg_position_activations.append((position_activations[i] / position_counts[i]).item() if position_counts[i] != 0 else 0)
    avg_position_loss.append((position_loss[i] / position_counts[i]).item() if position_counts[i] != 0 else 0)

line(avg_position_activations[:900], xlabel="Token Position", ylabel="Activation Value", title="Average activation value of French neuron L3N184 on French text", xticks=None, width=800)

In [174]:
import gc
gc.collect()
torch.cuda.empty_cache()

position_activations, position_counts, position_loss = get_L3N609_position_activations(non_french_data, model=model, batch_size=10, layer=3, neuron=184)

avg_position_loss = []
avg_position_activations = []
for i in range(len(position_activations)):
    avg_position_activations.append((position_activations[i] / position_counts[i]).item() if position_counts[i] != 0 else 0)
    avg_position_loss.append((position_loss[i] / position_counts[i]).item() if position_counts[i] != 0 else 0)

line(avg_position_activations[:900], xlabel="Token Position", ylabel="Activation Value", title="Average activation value of French neuron L3N184 on Non-French text", xticks=None, width=800)