In [2]:
import torch
from transformer_lens import HookedTransformer

# Plotly needs a different renderer for VSCode/Notebooks vs Colab argh
import plotly.io as pio
pio.renderers.default = "colab+vscode"

import haystack_utils

%reload_ext autoreload
%autoreload 2

In [3]:
# Project helper; presumably frees cached memory before the model is loaded
# (see haystack_utils.clean_cache -- TODO confirm).
haystack_utils.clean_cache()

# Fall back to CPU when no GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# This notebook only runs forward passes, so gradient tracking is switched off
# globally. `torch.set_grad_enabled` and `torch.autograd.set_grad_enabled` are
# the same callable, so a single call is sufficient.
torch.set_grad_enabled(False)

model = HookedTransformer.from_pretrained("pythia-70m-v0", fold_ln=True, device=device)

Using pad_token, but it is not set yet.


Loaded pretrained model pythia-70m-v0 into HookedTransformer


In [10]:
# (English, French) word pairs. Every word carries a leading space, and each
# pair must tokenize to the same number of tokens so that prompts built from
# them line up position by position (checked by the loop below).
same_token_len_pairs = [
    (" she", " elle"),
    (" is", " va"),
    (" the", " le"),
    # ("friend", "ami"),
    (" of", " de"),
    (" city", " ville"),
    (" cat", " chat"),
    # ("dog", "chien"),
    (" I", " Je"),
    (" am", " suis"),
    # ("drink", "bois")
]
# NOTE(review): the commented-out pairs above are presumably excluded because
# their token counts differ -- confirm before re-enabling them.

# Additional pairs with matching token counts.
more_same_token_len_pairs = [(' day', ' jour'), (' fruit', ' fruit'), (' wind', ' vent'), (' lake', ' lac'), (' sea', ' mer')]
same_token_len_pairs.extend(more_same_token_len_pairs)

# Fail loudly, naming the offending pair, if any pair breaks the invariant.
for english, french in same_token_len_pairs:
    english_len = model.to_tokens(english).shape[1]
    french_len = model.to_tokens(french).shape[1]
    assert english_len == french_len, (
        f"Token length mismatch: {english!r} -> {english_len} tokens, "
        f"{french!r} -> {french_len} tokens"
    )

In [11]:
# Concatenate the words of each language, in pair order, into one prompt per
# language (the words already carry their own leading spaces).
english_prompt = "".join(english_word for english_word, _ in same_token_len_pairs)
french_prompt = "".join(french_word for _, french_word in same_token_len_pairs)

# The two prompts must tokenize to tensors of identical shape so that their
# cached activations can be compared element-wise.
assert model.to_tokens(english_prompt).shape == model.to_tokens(french_prompt).shape

In [6]:
# Prompts generate reasonable English and French text respectively
# print(haystack_utils.generate_text(english_prompt, model))
# print(haystack_utils.generate_text(french_prompt, model))

In [16]:
# Run both prompts once and keep every intermediate activation.
_, french_cache = model.run_with_cache(french_prompt)
_, english_cache = model.run_with_cache(english_prompt)

# For each component type, compare the French and English activations layer by
# layer. The metric is the mean (over every element of the cached activation)
# of the absolute difference between the two runs -- not a sum.
for hook_name, plot_title in (
    ('hook_mlp_out', "Language Specificity of MLPs - French vs. English"),
    ('hook_attn_out', "Language Specificity of Attention Layers - French vs. English"),
):
    # Cache keys containing `hook_name`, in cache order (one per layer).
    labels = [key for key in french_cache.keys() if hook_name in key]
    differences = [
        (french_cache[key] - english_cache[key]).abs().mean().item()
        for key in labels
    ]

    print(list(french_cache.keys()))
    print(differences)
    haystack_utils.line(differences, xticks=labels, title=plot_title, xlabel="Component", ylabel="Absolute Mean Activation Difference")

['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'blocks.1.hook_resid_post', 'bl

['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'blocks.1.hook_resid_post', 'bl

I expected the middle layers to hold a language-agnostic representation and therefore to show a lower activation difference between the French and English prompts. The general pattern seems to hold, but the "middle" is earlier than I expected: in MLPs one and two.

I should repeat with more data to confirm.

In [9]:
# More French and English words with the same tokenized shape
# same = []
# for english, french in more:
#     if model.to_tokens(english).shape[1] == model.to_tokens(french).shape[1]:
#         same.append((english, french))
# print(same)
# print(model.to_tokens(english).shape[1] == model.to_tokens(french).shape[1])

[(' day', ' jour'), (' fruit', ' fruit'), (' wind', ' vent'), (' lake', ' lac'), (' sea', ' mer')]


In [21]:
from collections import defaultdict
from haystack_utils import load_txt_data, get_mlp_activations

# Rebind `model` to the Pythia V1 checkpoint: the neuron indices below refer to
# V1, whereas the model loaded at the top of the notebook is "pythia-70m-v0".
model = HookedTransformer.from_pretrained("EleutherAI/pythia-70m", fold_ln=True, device=device)

# Interesting neurons in Pythia V1, taken from Wes' CSV.
# Each entry is (layer, neuron index within that layer's MLP).
english_neurons = [(5, 395), (5, 166), (5, 908), (5, 285), (3, 862), (5, 73), (4, 896), (5, 348), (5, 297), (3, 1204)]
french_neurons = [(5, 112), (4, 1080), (5, 1293), (5, 455), (5, 5), (5, 1901), (5, 486), (4, 975)]

english_data = load_txt_data("data/kde4_english.txt")
french_data = load_txt_data("data/kde4_french.txt")

# Layers that contain at least one neuron of interest
# (3, 4, 5 for English and 4, 5 for French with the lists above).
english_layers = sorted({layer for layer, _ in english_neurons})
french_layers = sorted({layer for layer, _ in french_neurons})
all_layers = sorted(set(english_layers) | set(french_layers))

# Each call is a full pass over the dataset, so compute every (dataset, layer)
# mean exactly once and reuse it below instead of recomputing it per dict.
# Assumes get_mlp_activations is deterministic for fixed inputs -- TODO confirm.
# Per the original note, each result is a [2048] vector (one mean per neuron).
mean_activations_on_english = {
    layer: get_mlp_activations(english_data, layer, model, mean=True)
    for layer in all_layers
}
mean_activations_on_french = {
    layer: get_mlp_activations(french_data, layer, model, mean=True)
    for layer in all_layers
}

# "high" = mean activation on the neuron's own language,
# "low"  = mean activation on the other language.
# defaultdict(torch.Tensor) yields an empty tensor for layers without an entry.
english_mean_high_activations = defaultdict(torch.Tensor, {
    layer: mean_activations_on_english[layer] for layer in english_layers
})
english_mean_low_activations = defaultdict(torch.Tensor, {
    layer: mean_activations_on_french[layer] for layer in english_layers
})
french_mean_high_activations = defaultdict(torch.Tensor, {
    layer: mean_activations_on_french[layer] for layer in french_layers
})
french_mean_low_activations = defaultdict(torch.Tensor, {
    layer: mean_activations_on_english[layer] for layer in french_layers
})

# Group the (layer, neuron) pairs into {layer: [neuron, ...]} lookups.
# A defaultdict is used so that a layer with no listed neurons maps to [].
english_neurons_by_layer = defaultdict(list)
french_neurons_by_layer = defaultdict(list)

for layer_index, neuron_index in english_neurons:
    english_neurons_by_layer[layer_index].append(neuron_index)

for layer_index, neuron_index in french_neurons:
    french_neurons_by_layer[layer_index].append(neuron_index)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
kde4_english.txt: Loaded 1007 examples with 501 to 5295 characters each.
kde4_french.txt: Loaded 1007 examples with 505 to 5345 characters each.


  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

In [26]:
# Baseline (no hooks) activations for both prompts on the current `model`.
_, french_cache = model.run_with_cache(french_prompt)
_, english_cache = model.run_with_cache(english_prompt)

# Per-layer mean absolute difference between the French and English
# activations (mean over every element of the cached activation -- not a sum),
# computed for each component type with one shared expression.
original_differences = {
    hook_name: [
        (french_cache[key] - english_cache[key]).abs().mean().item()
        for key in french_cache.keys()
        if hook_name in key
    ]
    for hook_name in ('hook_mlp_out', 'hook_attn_out')
}
original_mlp_differences = original_differences['hook_mlp_out']
original_attn_differences = original_differences['hook_attn_out']

# Kept for parity with the earlier comparison cell, where `labels` ends up
# holding the attention-output hook names.
labels = [key for key in french_cache.keys() if 'hook_attn_out' in key]


In [28]:
# Note from the original experiment: ablating the French language hook doesn't
# make any difference.
def activate_english_hook(value, hook):
    """Forward hook that pushes the language-selective MLP neurons towards "English".

    For the layer this hook is attached to, the listed French neurons are set
    to `french_mean_low_activations` (their mean on English text) and the
    listed English neurons to `english_mean_high_activations` (their mean on
    English text).

    Args:
        value: the hooked activation; indexed as [:, :, neuron], so it is
            assumed to be (batch, position, neuron) -- TODO confirm.
        hook: the hook point; `hook.layer()` supplies the layer index.

    Returns:
        `value`, modified in place.
    """
    layer = hook.layer()
    french_neurons_for_layer = french_neurons_by_layer[layer]
    english_neurons_for_layer = english_neurons_by_layer[layer]
    # Move the replacement values to the activation's own device rather than
    # hard-coding CUDA, so the hook also works on the CPU fallback.
    # For a layer with no listed neurons the index list is empty and the
    # defaultdict supplies an empty tensor, so nothing should be overwritten.
    value[:, :, french_neurons_for_layer] = french_mean_low_activations[layer][french_neurons_for_layer].to(value.device)
    value[:, :, english_neurons_for_layer] = english_mean_high_activations[layer][english_neurons_for_layer].to(value.device)
    return value


# Attach the hook to the post-activation MLP output of layers 3, 4 and 5.
activate_english_fwd_hooks = [
    (f'blocks.{layer}.mlp.hook_post', activate_english_hook)
    for layer in (3, 4, 5)
]


# Run the French prompt with the "activate English" hooks installed; the
# English prompt is run unmodified as the reference.
with model.hooks(fwd_hooks=activate_english_fwd_hooks):
    _, french_cache = model.run_with_cache(french_prompt)
_, english_cache = model.run_with_cache(english_prompt)

# Hook names of the per-layer MLP and attention outputs, in cache order.
mlp_labels = [key for key in french_cache.keys() if 'hook_mlp_out' in key]
attn_labels = [key for key in french_cache.keys() if 'hook_attn_out' in key]

# Per-layer mean absolute difference between the hooked French run and the
# English run (mean over every element of the activation -- not a sum).
ablated_mlp_differences = [
    (french_cache[key] - english_cache[key]).abs().mean().item()
    for key in mlp_labels
]
ablated_attn_differences = [
    (french_cache[key] - english_cache[key]).abs().mean().item()
    for key in attn_labels
]

# Baseline vs. ablated, first for the MLPs and then for the attention layers.
# `original_*_differences` come from the baseline cell earlier in the notebook.
for plot_values, plot_labels, plot_title in (
    (original_mlp_differences, mlp_labels, "Language Specificity of MLPs - French vs. English"),
    (ablated_mlp_differences, mlp_labels, "Ablated Language Specificity of MLPs - French vs. English"),
    (original_attn_differences, attn_labels, "Language Specificity of Attention Layers - French vs. English"),
    (ablated_attn_differences, attn_labels, "Ablated Language Specificity of Attention Layers - French vs. English"),
):
    haystack_utils.line(plot_values, xticks=plot_labels, title=plot_title, xlabel="Component", ylabel="Absolute Mean Activation Difference")
