## Setup

In [1]:
import torch
import numpy as np
from torch import einsum
from tqdm.auto import tqdm
import seaborn as sns
from transformer_lens import HookedTransformer, ActivationCache, utils
from datasets import load_dataset
from einops import einsum
import pandas as pd
from transformer_lens import utils
from rich.table import Table, Column
from rich import print as rprint
from jaxtyping import Float, Int, Bool
from torch import Tensor
import einops
import functools
from transformer_lens.hook_points import HookPoint
# import circuitsvis
from IPython.display import HTML
from plotly.express import line
import plotly.express as px
from tqdm.auto import tqdm
import json
import gc
import plotly.graph_objects as go

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from plotly.subplots import make_subplots
# Plotly needs a different renderer for VSCode/Notebooks vs Colab argh
import plotly.io as pio
pio.renderers.default = "notebook_connected"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# This notebook only runs inference, so gradient tracking is switched off globally.
# NOTE(review): the two calls below look redundant - confirm and keep only one.
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

from haystack_utils import load_txt_data, get_mlp_activations, line
import haystack_utils

%reload_ext autoreload
%autoreload 2

In [2]:
# Location of the context neuron(s) patched throughout this notebook:
# the MLP layer index and the neuron indices within that layer's `hook_post` activations.
LAYER_TO_ABLATE = 3
NEURONS_TO_ABLATE = [609]

In [3]:
# Load the model under study onto the selected device.
model = HookedTransformer.from_pretrained("pythia-70m-v0", fold_ln=True, device=device)

# Text corpora for each language, loaded through the project helper.
kde_french = load_txt_data("kde4_french.txt")
kde_english = load_txt_data("kde4_english.txt")

# Reference MLP activations at LAYER_TO_ABLATE for each language; these become the
# default "enabled" (French) and "disabled" (English) patch values used later.
# NOTE(review): presumably a per-neuron mean over the first 100 prompts - confirm in haystack_utils.
french_activations = get_mlp_activations(kde_french, LAYER_TO_ABLATE, model, num_prompts=100, mean=True)
english_activations = get_mlp_activations(kde_english, LAYER_TO_ABLATE, model, num_prompts=100, mean=True)

Using pad_token, but it is not set yet.


Loaded pretrained model pythia-70m-v0 into HookedTransformer
kde4_french.txt: Loaded 1007 examples with 505 to 5345 characters each.
kde4_english.txt: Loaded 1007 examples with 501 to 5295 characters each.


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

## Compare whole model on bigram prompts

In [4]:
# Find some single words embedded as two tokens - those are very likely bigrams.
# Several candidate word lists were tried. Each one now has its own name so that the
# list actually scanned is explicit; previously all four were assigned to `test_words`
# in sequence, which made the first three assignments dead code.
english_words_v1 = [' happily', ' beautiful', ' suddenly', ' exciting', ' surprisingly', ' wonderfully', ' generally', ' delicious', ' basically', ' carefully', ' suddenly', ' absolutely', ' interestingly', ' naturally', ' thankfully', ' quickly', ' usually', ' definitely', ' eventually', ' fortunately']
english_words_v2 = [' absolutely', ' basically', ' carefully', ' delicious', ' eventually', ' fortunately', ' generally', ' happily', ' interestingly', ' naturally', ' surprisingly', ' thankfully', ' usually', ' wonderfully']
english_words_v3 = [' significantly', ' occasionally', ' consistently', ' effectively', ' importantly', ' basically', ' incredibly', ' typically', ' ideally', ' genuinely', ' certainly', ' previously', ' frequently', ' gradually', ' personally', ' ultimately', ' currently', ' traditionally', ' basically', ' fortunately']
french_words = ['temps', 'personne', 'jour', 'année', 'vie', 'chose', 'partie', 'homme', 'femme', 'enfant', 'maison', 'ville', 'amour', 'travail', 'ami', 'famille', 'étudiant', 'cœur', 'argent', 'temps', 'matin', 'soir', 'nuit', 'ami', 'frère', 'sœur', 'père', 'mère', 'fille', 'garçon']

# The list scanned below; swap in one of the other candidates to search it instead.
test_words = french_words

bigrams = []
for word in test_words:
    # Tokenise with a leading space, adding one when the word does not already have it.
    spaced_word = word if word.startswith(" ") else " " + word
    # Tokenise once. The first entry is skipped (presumably the prepended BOS token),
    # so a word made of exactly two tokens yields three string tokens.
    str_tokens = model.to_str_tokens(spaced_word)
    if len(str_tokens) == 3:
        bigrams.append([str_tokens[1], str_tokens[2]])
print(bigrams)

[[' person', 'ne'], [' ann', 'ée'], [' part', 'ie'], [' hom', 'me'], [' fem', 'me'], [' enf', 'ant'], [' ma', 'ison'], [' am', 'our'], [' am', 'i'], [' fam', 'ille'], [' arg', 'ent'], [' mat', 'in'], [' so', 'ir'], [' n', 'uit'], [' am', 'i'], [' fr', 'ère'], [' p', 'ère'], [' m', 'ère'], [' fil', 'le']]


In [5]:
def get_answer_token_logit_difference(
    prompt: str,
    answer: str,
    model: HookedTransformer = model,
    mean_disabled_activation: Float[Tensor, "d_mlp"] = english_activations,
    mean_enabled_activation: Float[Tensor, "d_mlp"] = french_activations,
    layer_to_ablate: int = LAYER_TO_ABLATE,
    neurons_to_ablate: list[int] = NEURONS_TO_ABLATE,
    log: bool = True,
    apply_softmax: bool = True
) -> float:
    """Measure how patching the context neurons changes the model's score for `answer`.

    The model is run on `prompt` twice with a hook on `blocks.{layer_to_ablate}.mlp.hook_post`:
    once with the selected neurons overwritten by `mean_enabled_activation` ("enabled") and
    once with them overwritten by `mean_disabled_activation` ("disabled"). The score of the
    answer token is read at the final position of the first batch element.

    Note that despite the function name, the default (`apply_softmax=True`) compares softmax
    probabilities, not raw logits.

    Args:
        prompt: Text fed to the model.
        answer: Expected continuation; it is converted with `model.to_single_token`,
            so it must be a single token.
        model: Model to run.
        mean_disabled_activation: Activation vector whose entries at `neurons_to_ablate`
            are written in the "disabled" run.
        mean_enabled_activation: Activation vector whose entries at `neurons_to_ablate`
            are written in the "enabled" run.
        layer_to_ablate: Index of the block whose MLP output is patched.
        neurons_to_ablate: Neuron indices (last dimension of the hooked activation) to patch.
        log: When true, print the answer's score and rank for the original, enabled
            and disabled runs, plus the resulting difference.
        apply_softmax: When true, scores are softmax probabilities over the last dimension;
            otherwise they are raw logits.

    Returns:
        The enabled score minus the disabled score of the answer token, as a Python float.
    """
    # Resolve the answer first so that an invalid (e.g. multi-token) answer fails
    # before any forward pass is spent on it.
    answer_token = model.to_single_token(answer)

    position = -1
    hook_name = f'blocks.{layer_to_ablate}.mlp.hook_post'
    neurons = torch.LongTensor(neurons_to_ablate)

    def make_patch_hook(mean_activation):
        # Build a hook that overwrites the selected neurons, at every batch element and
        # position, with the corresponding entries of `mean_activation`.
        def patch_neurons_hook(value, hook):
            value[:, :, neurons] = mean_activation[neurons]
            return value
        return patch_neurons_hook

    def get_answer_rank(scores: Float[Tensor, "d_vocab"]) -> int:
        # 1-based rank: one plus the number of tokens scoring strictly above the answer.
        return torch.sum(scores > scores[answer_token]).item() + 1

    original_scores, disabled_scores, _, _ = haystack_utils.get_caches_single_prompt(
        prompt, model, fwd_hooks=[(hook_name, make_patch_hook(mean_disabled_activation))], return_type="logits")
    _, enabled_scores, _, _ = haystack_utils.get_caches_single_prompt(
        prompt, model, fwd_hooks=[(hook_name, make_patch_hook(mean_enabled_activation))], return_type="logits")

    if apply_softmax:
        original_scores = original_scores.softmax(dim=-1)
        enabled_scores = enabled_scores.softmax(dim=-1)
        disabled_scores = disabled_scores.softmax(dim=-1)

    contribution_difference = (enabled_scores[0, position, answer_token] - disabled_scores[0, position, answer_token]).item()

    if log:
        # The labels say "probs" regardless of `apply_softmax`; they are kept as-is.
        print(f"Prediction of \"{prompt}\" + \"{answer}\"")
        print(f"Original logits probs: {original_scores[0, position, answer_token]:.6f} (Rank {get_answer_rank(original_scores[0, position])})")
        print(f"Enabled context neuron probs: {enabled_scores[0, position, answer_token]:.6f} (Rank {get_answer_rank(enabled_scores[0, position])})")
        print(f"Disabled context neuron probs: {disabled_scores[0, position, answer_token]:.6f} (Rank {get_answer_rank(disabled_scores[0, position])})")
        print(f"Contribution difference: {contribution_difference:.6f}")
    return contribution_difference


In [6]:
# Hand-picked two-token words per language. Each entry is [first_token, second_token]:
# later cells use the first token as the prompt and the second as the expected answer.
# Note the listed bigrams aren't necessarily the most common completions for the language
french_bigrams = [[" rend", "re"], [" met", "tre"], [" demand", "er"], [" pou", "voir"], [' person', 'ne'], [' hom', 'me'], [' am', 'our']]
english_bigrams = [[' firef', 'ighter'], [' waterm', 'elon'], [' straw', 'berry'], [' fortun', 'ately'], [' thank', 'fully'], [' interesting', 'ly']]

In [7]:
# Enabled-minus-disabled difference for every French bigram
# (first token is the prompt, second token is the answer), then the mean.
logit_differences = [
    get_answer_token_logit_difference(first_token, second_token, log=False)
    for first_token, second_token in french_bigrams
]

print(np.mean(logit_differences))

0.002872240231876536


In [8]:
# Enabled-minus-disabled difference for every English bigram
# (first token is the prompt, second token is the answer), then the mean.
logit_differences = [
    get_answer_token_logit_difference(first_token, second_token, log=False)
    for first_token, second_token in english_bigrams
]

print(np.mean(logit_differences))

-0.09110676303195457


Results
- We compare the probability (softmax over the logits) of the correct second bigram token between enabling the French neuron (patched to its mean activation on French text) and disabling it (patched to its mean activation on English text)
- A positive difference means the model assigns a higher probability to the correct token when the French neuron is enabled
- On French bigrams, the mean difference is positive
    - This means the French neuron boosts French bigrams
- On English bigrams, the mean difference is negative
    - This means the French neuron inhibits English bigrams

## Compare use of context before bigram occurrence
Motivation: see how the effect of the context neuron (the enabled vs. disabled difference) responds to different contexts placed before the bigram we care about
- Start with random number tokens (i.e. they shouldn't matter for either language) 

### Repeated unrelated padding tokens

In [None]:
def get_padded_logit_difference(bigram_prompts, random_number_str_token, num_pad):
    """Return the enabled-minus-disabled difference for each bigram behind a padded context.

    Each prompt is prefixed with `random_number_str_token` repeated `num_pad` times.
    The padded prompt must tokenise to exactly `num_pad + 2` tokens; otherwise an
    AssertionError carrying the actual string tokens is raised.

    Args:
        bigram_prompts: Iterable of (prompt, answer) pairs.
        random_number_str_token: String repeated in front of every prompt.
        num_pad: Number of repetitions of the padding string.

    Returns:
        A list with one difference per bigram, in input order.
    """
    padding = random_number_str_token * num_pad
    expected_token_count = 2 + num_pad
    differences = []
    for first_token, second_token in bigram_prompts:
        padded_prompt = padding + first_token
        assert len(model.to_tokens(padded_prompt)[0]) == expected_token_count, model.to_str_tokens(padded_prompt)
        differences.append(get_answer_token_logit_difference(padded_prompt, second_token, log=False))
    return differences

In [None]:
# Use numbers as language-agnostic prompts
# NOTE: despite the name, this is a fixed string rather than a randomly sampled one;
# it is repeated `num_pad` times in front of each bigram prompt.
random_number_str_token = " 42"

In [None]:
# Sweep the number of padding tokens placed before each bigram and record the mean
# enabled-minus-disabled difference per language. The pad counts are defined once so
# that the loop and the plotted x-axis column cannot drift apart.
pad_counts = range(20)

french_differences = []
english_differences = []
for num_pad in tqdm(pad_counts):
    padded_french_differences = get_padded_logit_difference(french_bigrams, random_number_str_token, num_pad)
    padded_english_differences = get_padded_logit_difference(english_bigrams, random_number_str_token, num_pad)
    french_differences.append(np.mean(padded_french_differences))
    english_differences.append(np.mean(padded_english_differences))

df = pd.DataFrame({"French Differences": french_differences, "English Differences": english_differences, "Number of Padded Tokens": pad_counts})
px.line(df, x="Number of Padded Tokens", y=["French Differences", "English Differences"], title="Logit Differences for French and English Bigrams with Padded Tokens", width=800)

  0%|          | 0/20 [00:00<?, ?it/s]

Results
- We are comparing the difference in logits for the correct bigram completion between model runs with the French neuron activated or deactivated
- For French bigrams, the difference in logits gets larger with increased context length
- For English bigrams, the difference in logits gets smaller with increased context length

Compare related padding to unrelated padding

In [None]:
# Padding strings compared in the next cell: a French word versus a number.
meaningful_pad = " il"
meaningless_pad = " 42"

In [None]:
# Sweep the number of padding tokens before each French bigram, once with the French
# word and once with the number as padding. The pad counts are defined once so that
# the loop and the plotted x-axis column cannot drift apart.
pad_counts = range(20)

meaningful_pad_differences = []
meaningless_pad_differences = []

for num_pad in tqdm(pad_counts):
    meaningful_pad_difference = get_padded_logit_difference(french_bigrams, meaningful_pad, num_pad)
    meaningless_pad_difference = get_padded_logit_difference(french_bigrams, meaningless_pad, num_pad)
    meaningful_pad_differences.append(np.mean(meaningful_pad_difference))
    meaningless_pad_differences.append(np.mean(meaningless_pad_difference))

df = pd.DataFrame({"Meaningful pad": meaningful_pad_differences, "Meaningless pad": meaningless_pad_differences, "Number of Padded Tokens": pad_counts})
px.line(df, x="Number of Padded Tokens", y=["Meaningful pad", "Meaningless pad"], title="Logit Differences for French padding with either meaningless numbers or meaningful french words", width=800)


  0%|          | 0/20 [00:00<?, ?it/s]

Results
- When padding with meaningful values, the overall positive impact of the French neuron seems to be slightly smaller
    - This could mean that other components can also identify the context as French in this scenario?

## Check contribution differences on complete sentence prompts in both languages
- Prediction: 
    - Enabling the French neuron should boost the French token on both French and English sentences
    - Enabling the French neuron should inhibit the English token on both French and English sentences

In [None]:
def compare_biliangual_prompt(english_prompt, english_answer, french_prompt, french_answer):
    """Print the context-neuron comparison for all four prompt/answer language pairings.

    For each pairing a heading is printed, followed by the logged report of
    `get_answer_token_logit_difference`. Nothing is returned.
    """
    # (heading, prompt, answer) in display order; every heading after the first starts
    # with a newline so the reports are separated by a blank line.
    pairings = [
        ("French prompt, French answer", french_prompt, french_answer),
        ("\nEnglish prompt, French answer", english_prompt, french_answer),
        ("\nEnglish prompt, English answer", english_prompt, english_answer),
        ("\nFrench prompt, English answer", french_prompt, english_answer),
    ]
    for heading, prompt, answer in pairings:
        print(heading)
        get_answer_token_logit_difference(prompt, answer)


In [None]:
# Parallel English/French sentences whose expected next word is "cat" / "chat".
english_prompt = "I love animals. I enjoy playing with my"
english_answer = " cat"
french_prompt = "J'adore les animaux. J'aime jouer avec mon"
french_answer = " chat"

# Report all four prompt/answer language pairings.
compare_biliangual_prompt(english_prompt, english_answer, french_prompt, french_answer)

French prompt, French answer
Prediction of "J'adore les animaux. J'aime jouer avec mon" + " chat"
Original logits probs: 0.000065 (Rank 1281)
Enabled context neuron probs: 0.000068 (Rank 1262)
Disabled context neuron probs: 0.000077 (Rank 1336)
Contribution difference: -0.000010

English prompt, French answer
Prediction of "I love animals. I enjoy playing with my" + " chat"
Original logits probs: 0.000001 (Rank 7562)
Enabled context neuron probs: 0.000002 (Rank 6590)
Disabled context neuron probs: 0.000001 (Rank 7568)
Contribution difference: 0.000001

English prompt, English answer
Prediction of "I love animals. I enjoy playing with my" + " cat"
Original logits probs: 0.001720 (Rank 54)
Enabled context neuron probs: 0.002983 (Rank 37)
Disabled context neuron probs: 0.001692 (Rank 54)
Contribution difference: 0.001291

French prompt, English answer
Prediction of "J'adore les animaux. J'aime jouer avec mon" + " cat"
Original logits probs: 0.000232 (Rank 535)
Enabled context neuron probs

In [None]:
# Parallel English/French sentences whose expected next word is "bread" / "pain".
english_prompt = "I'm going to the supermarket to buy some"
english_answer = " bread"
french_prompt = "Je vais au supermarché pour acheter du"
french_answer = " pain"

# Report all four prompt/answer language pairings.
compare_biliangual_prompt(english_prompt, english_answer, french_prompt, french_answer)

French prompt, French answer
Prediction of "Je vais au supermarché pour acheter du" + " pain"
Original logits probs: 0.002075 (Rank 92)
Enabled context neuron probs: 0.002107 (Rank 91)
Disabled context neuron probs: 0.001291 (Rank 139)
Contribution difference: 0.000816

English prompt, French answer
Prediction of "I'm going to the supermarket to buy some" + " pain"
Original logits probs: 0.000061 (Rank 1619)
Enabled context neuron probs: 0.000048 (Rank 1945)
Disabled context neuron probs: 0.000061 (Rank 1616)
Contribution difference: -0.000013

English prompt, English answer
Prediction of "I'm going to the supermarket to buy some" + " bread"
Original logits probs: 0.007390 (Rank 12)
Enabled context neuron probs: 0.010748 (Rank 9)
Disabled context neuron probs: 0.007394 (Rank 12)
Contribution difference: 0.003354

French prompt, English answer
Prediction of "Je vais au supermarché pour acheter du" + " bread"
Original logits probs: 0.000003 (Rank 6241)
Enabled context neuron probs: 0.000

In [None]:
# Parallel English/French sentences whose expected next word is "father" / "père".
english_prompt = "Bob's parents live in a small village. His"
english_answer = " father"
french_prompt = "Les parents de Bob vivent dans un petit village. Son"
# The answer must be a single token, and " père" tokenises as [' p', 'ère'],
# so only its first token is used here.
french_answer = ' p' # père

# Report all four prompt/answer language pairings.
compare_biliangual_prompt(english_prompt, english_answer, french_prompt, french_answer)

French prompt, French answer
Prediction of "Les parents de Bob vivent dans un petit village. Son" + " p"
Original logits probs: 0.011118 (Rank 6)
Enabled context neuron probs: 0.012117 (Rank 5)
Disabled context neuron probs: 0.008631 (Rank 12)
Contribution difference: 0.003486

English prompt, French answer
Prediction of "Bob's parents live in a small village. His" + " p"
Original logits probs: 0.000152 (Rank 412)
Enabled context neuron probs: 0.000292 (Rank 215)
Disabled context neuron probs: 0.000152 (Rank 412)
Contribution difference: 0.000139

English prompt, English answer
Prediction of "Bob's parents live in a small village. His" + " father"
Original logits probs: 0.149794 (Rank 1)
Enabled context neuron probs: 0.176889 (Rank 2)
Disabled context neuron probs: 0.149605 (Rank 1)
Contribution difference: 0.027284

French prompt, English answer
Prediction of "Les parents de Bob vivent dans un petit village. Son" + " father"
Original logits probs: 0.000526 (Rank 297)
Enabled context n

Results
- We are only testing on a few manual prompts, but the results do NOT seem to be in line with the previous bigram experiments
- Specifically, enabling the French neuron seems to boost the correct English continuation for the "bread" sentence but not the French continuation
- Activating the French neuron could have an overall positive effect on logits; I switched to softmax to prevent this
- Why does the French neuron boost "cat" in the English sentence?
    - This seems like the opposite of what we expected
    - I asked ChatGPT whether there are French words beginning with "cat", and apparently there aren't

- There are many French words beginning with cat: https://educalingo.com/en/dic-fr/abc/cat