## Setup

In [1]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer, ActivationCache, utils, patching
from jaxtyping import Float, Int, Bool
from torch import Tensor
from tqdm.auto import tqdm
import plotly.io as pio
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import json
import plotting_utils
import hook_utils
import plotly.express as px


pio.renderers.default = "notebook_connected"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

from haystack_utils import get_mlp_activations
import haystack_utils

%reload_ext autoreload
%autoreload 2

In [2]:
ngram = NGRAM = "orschlägen"
model = HookedTransformer.from_pretrained("EleutherAI/pythia-70m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device)

activate_neurons_fwd_hooks, deactivate_neurons_fwd_hooks = haystack_utils.get_context_ablation_hooks(3, [669], model)
all_ignore, _ = haystack_utils.get_weird_tokens(model, plot_norms=False)

german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
common_tokens = haystack_utils.get_common_tokens(german_data, model, all_ignore, k=100)

# Sort tokens into new word vs continuation
new_word_tokens = []
continuation_tokens = []
for token in common_tokens:
    str_token = model.to_single_str_token(token.item())
    if str_token.startswith(" "):
        new_word_tokens.append(token)
    else:
        continuation_tokens.append(token)
new_word_tokens = torch.stack(new_word_tokens)
continuation_tokens = torch.stack(continuation_tokens)

context_direction = model.W_out[3, 669, :]

def get_cosine_sim(direction: Float[Tensor, "d_res"], layer=5) -> Float[Tensor, "d_mlp"]:
    cosine = torch.nn.CosineSimilarity(dim=1)
    return cosine(model.W_in[layer].T, direction.unsqueeze(0))

def plot_histogram(t1, t2, t3, name1, name2, name3):
    t1 = t1.cpu().numpy()
    t2 = t2.cpu().numpy()
    t3 = t3.cpu().numpy()
    fig = go.Figure()
    bin_width= 0.01
    fig.add_trace(go.Histogram(x=t1, name=name1, opacity=0.5, histnorm='probability density', xbins=dict(size=bin_width)))
    fig.add_trace(go.Histogram(x=t2, name=name2, opacity=0.5 , histnorm='probability density', xbins=dict(size=bin_width)))
    fig.add_trace(go.Histogram(x=t3, name=name3, opacity=0.5, histnorm='probability density', xbins=dict(size=bin_width)))

    fig.update_layout(
        title="Individual MLP5 similarities to direction vectors",
        xaxis_title="Cosine Similarity",
        yaxis_title="Probability Density",
        barmode="overlay",
    )

    fig.show()

def compute_mlp_loss(prompts, df, neurons, ablate_mode="NNN", layer=5, compute_original_loss=False, mean=True):

    mean_activations = torch.Tensor(df[df.index.isin(neurons.tolist())][ablate_mode].tolist()).cuda()
    def ablate_mlp_hook(value, hook):
        value[:, :, neurons] = mean_activations
        return value

    with model.hooks(fwd_hooks=[(f"blocks.{layer}.mlp.hook_pre", ablate_mlp_hook)]):
        if mean:
            ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
        else:
            ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].tolist()

    if compute_original_loss:
        if mean:
            loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
        else:
            loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].tolist()
        return loss, ablated_loss
    return ablated_loss

def compare_mlp_losses(prompts, df, neurons, ablate_mode):
    original_loss, ablated_loss = compute_mlp_loss(prompts, df, neurons, ablate_mode=ablate_mode, mean=True, compute_original_loss=True)
    all_ablated_loss = compute_mlp_loss(prompts, df, torch.LongTensor([i for i in range(2048)]), ablate_mode=ablate_mode, mean=True, compute_original_loss=False)
    print(f"Ablating {len(neurons)} neurons: {ablated_loss:.2f} (original: {original_loss:.2f}, all ablated: {all_ablated_loss:.2f})")

def create_ablation_prompts(prompts, ablation_mode):
    if ablation_mode[0] == "N":
        prompts = haystack_utils.replace_column(prompts, -3, common_tokens)
    if ablation_mode[1] == "N":
        prompts = haystack_utils.replace_column(prompts, -2, common_tokens)
    return prompts

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.


  0%|          | 0/200 [00:00<?, ?it/s]

In [4]:
# Loss change for different AND thresholds
option = NGRAM
df = pd.read_pickle(f"data/and_neurons/df_{option}.pkl") 

with open(f"data/and_neurons/set_losses.json", "r") as f:
    all_losses = json.load(f)

prompts = haystack_utils.generate_random_prompts(option, model, common_tokens, 500, length=20)

## And neurons (differences thresholds)

In [8]:
gen_token = model.to_single_token("gen")
ge_token = model.to_single_token("ge")
gen_dir = model.tokens_to_residual_directions(gen_token)
ge_dir = model.tokens_to_residual_directions(ge_token)

cos = torch.nn.CosineSimilarity(dim=1)
gen_sims = cos(model.W_out[5], gen_dir.unsqueeze(0)).cpu().numpy()
ge_sims = cos(model.W_out[5], ge_dir.unsqueeze(0)).cpu().numpy()

df["GenSim"] = gen_sims
df["GeSim"] = ge_sims

In [9]:
df["Custom"] = (df["YYY"] - df["NNN"]) > ((df["NYN"] - df["NNN"])+(df["NNY"] - df["NNN"])+(df["YNN"] - df["NNN"]))
df["Diff"] = (df["YYY"] - df["NNN"]) - ((df["NYN"] - df["NNN"])+(df["NNY"] - df["NNN"])+(df["YNN"] - df["NNN"]))
df["DiffCon"] = df["GenSim"] > df["GeSim"]
px.histogram(df, x="Diff", histnorm='percent', color="DiffCon", barmode="overlay", width=800)

In [21]:
px.scatter(df, x="GeSim", y="GenSim", color="ContextAblationLossIncrease")

In [11]:
# Check how similar gen and ge dirs are
cos = torch.nn.CosineSimilarity(dim=0)
cos(gen_dir, ge_dir)

tensor(0.3292, device='cuda:0')

In [12]:
haystack_utils.get_boosted_tokens(prompts, model, deactivate_neurons_fwd_hooks, all_ignore, deboost=False)
haystack_utils.get_boosted_tokens(prompts, model, deactivate_neurons_fwd_hooks, all_ignore, deboost=True)

Boosted tokens: uf (+4.30), gen (+1.91), ß (+1.48), ger (+0.51), ßen (+0.20)
Deboosted tokens: ge (-0.40), gt (-0.02)


In [13]:
print(df[df["AblationDiff"]>0.2][["AblationDiff", "GenSim", "GeSim"]].mean())
print(df[df["AblationDiff"]<-0.2][["AblationDiff", "GenSim", "GeSim"]].mean())

Prev/Curr/Context
AblationDiff    0.434787
GenSim          0.002992
GeSim          -0.000410
dtype: float64
Prev/Curr/Context
AblationDiff   -0.431541
GenSim          0.000029
GeSim           0.004855
dtype: float64


## Old threshold investigation (not using differences)

In [14]:
df_tmp = df.copy()
ablation_mode = "YYN"

df_tmp["Custom"] = (df["YYY"]>0) & (df["YYY"]>df["NNN"]) & (df["GenSim"]>df["GeSim"]) & \
    (df["YYY"]>df["YYN"]) & (df["YYY"]>df["YNY"]) & (df["YYY"]>df["NYY"]) &\
    (df["YYY"]>df["NYN"]) & (df["YYY"]>df["NNY"]) & (df["YYY"]>df["YNN"])# & df["PosSim"]

print(df_tmp["Custom"].sum())
pos_and_neurons = torch.LongTensor(df_tmp[df_tmp["Custom"]].index.tolist()).cuda()

#df_tmp["context_diff"] = df_tmp["YYY"] - df_tmp["YYN"]
#df_tmp = df_tmp.sort_values(by=["Custom", "context_diff"], ascending=False)
#pos_and_neurons = torch.LongTensor(df_tmp.index.tolist()[:30]).cuda()

original_loss, ablated_loss = compute_mlp_loss(prompts, df, pos_and_neurons, ablate_mode=ablation_mode, compute_original_loss=True)
print(original_loss, ablated_loss)

102
1.365806221961975 7.718233585357666


In [15]:
def compute_counterfactual_losses(prompts, df, neurons):
    losses = [model(prompts, return_type="loss", loss_per_token=True)[:, -1].tolist()]
    ablations = ["YYY", "YNY", "NYY", "YYN", "YNN", "NYN", "NNY", "NNN"]
    names = ["Original"] + ablations
    for ablation_mode in ablations:
        ablated_loss = compute_mlp_loss(prompts, df, neurons, ablate_mode=ablation_mode, mean=False)
        losses.append(ablated_loss)
    return losses, names

losses, names = compute_counterfactual_losses(prompts, df, pos_and_neurons)#torch.LongTensor([i for i in range(100)]))
plotting_utils.plot_barplot(losses, names, title="Losses for patching 102 AND MLP5 neurons with different ablation modes", yrange=(0, 24))

In [None]:
# And thresholds without similarity constraint
prompts = haystack_utils.generate_random_prompts(option, model, common_tokens, 2000, length=20)
losses = []
lens =  []
thresholds = []
for and_threshold in np.arange(-0.5, 2, 0.1):
    df_tmp = df.copy()
    ablation_mode = "YYN"
    df_tmp["Custom"] = (df["YYY"]>and_threshold) & (df["YYN"]<=and_threshold) & (df["YNY"]<=and_threshold) & (df["NYY"]<=and_threshold) & (df["YNN"]<=and_threshold) & (df["NNY"]<=and_threshold)& (df["NYN"]<=and_threshold)
    pos_and_neurons = torch.LongTensor(df_tmp[df_tmp["Custom"]].index.tolist()).cuda()

    original_loss, ablated_loss = compute_mlp_loss(prompts, df, pos_and_neurons, ablate_mode=ablation_mode, compute_original_loss=True)
    if len(losses) == 0:
        losses.append([original_loss])
    losses.append([ablated_loss])
    lens.append(len(pos_and_neurons))
    thresholds.append(and_threshold)
names = ["Original"] + [f"AND {thr:.2f} ({length})" for thr, length in zip(thresholds, lens)]
haystack_utils.plot_barplot(losses, names, title="AND neuron loss increase for different thresholds")

In [None]:
# And thresholds with similarity constraint
losses = []
lens =  []
thresholds = []
for and_threshold in np.arange(-0.5, 2, 0.1):
    df_tmp = df.copy()
    ablation_mode = "YYN"
    df_tmp["Custom"] = (df["GenSim"]>df["GeSim"]) & \
        (df["YYY"]>and_threshold) & (df["YYN"]<=and_threshold) & (df["YNY"]<=and_threshold) & (df["NYY"]<=and_threshold) & (df["YNN"]<=and_threshold) & (df["NNY"]<=and_threshold)& (df["NYN"]<=and_threshold)
    pos_and_neurons = torch.LongTensor(df_tmp[df_tmp["Custom"]].index.tolist()).cuda()

    original_loss, ablated_loss = compute_mlp_loss(prompts, df, pos_and_neurons, ablate_mode=ablation_mode, compute_original_loss=True)
    if len(losses) == 0:
        losses.append([original_loss])
    losses.append([ablated_loss])
    lens.append(len(pos_and_neurons))
    thresholds.append(and_threshold)
names = ["Original"] + [f"AND {thr:.2f} ({length})" for thr, length in zip(thresholds, lens)]
haystack_utils.plot_barplot(losses, names, title="AND neuron loss increase with cosine sim constraint: gen > ge")

In [16]:
with model.hooks(deactivate_neurons_fwd_hooks):
    ablated_loss, ablated_cache = model.run_with_cache(prompts, return_type="loss")

def get_ablate_neurons_hook(neuron: int | list[int], ablated_cache, layer=5):
    def ablate_neurons_hook(value, hook):
        value[:, :, neuron] = ablated_cache[f'blocks.{layer}.mlp.hook_post'][:, :, neuron]
        return value
    return [(f'blocks.{layer}.mlp.hook_post', ablate_neurons_hook)]

ablate_top_neurons_hook = get_ablate_neurons_hook([i for i in range(2048)], ablated_cache)

In [17]:
original_logprobs, ablated_logprobs, _, all_MLP5_logprobs = haystack_utils.get_direct_effect(prompts, model, pos=-2, context_ablation_hooks=deactivate_neurons_fwd_hooks, context_activation_hooks=activate_neurons_fwd_hooks, return_type='logprobs')

diffs = (original_logprobs - ablated_logprobs).mean(0)
diffs[all_ignore] = 0
diffs[original_logprobs.mean(0)<-7] = 0
top_diff, top_token = torch.topk(diffs, 20)
print(top_diff)
print(model.to_str_tokens(top_token))

tensor([1.6114, 1.5769, 0.9038, 0.4486, 0.0799, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0')
['ß', 'gen', 'ßen', 'ger', 'gt', '-', '*', ',', '(', '&', ')', '+', "'", '!', '<|endoftext|>', '<|padding|>', '%', '$', '"', '#']


In [18]:

# %%

option = "orschlägen"
ablation_mode = "YYN"
prompts = haystack_utils.generate_random_prompts(option, model, common_tokens, 1000, length=20)

names = list(all_losses[option][ablation_mode].keys())
losses = [[all_losses[option][ablation_mode][name]] for name in names]

haystack_utils.plot_barplot(losses, names)


12 12
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


## Early layer investigation

In [None]:


layer = 2
ngram = "orschlägen"
prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 500, length=20)

if ngram.startswith(" "):
    prompt_tuple = haystack_utils.get_trigram_prompts(prompts, new_word_tokens, continuation_tokens)
else:
    prompt_tuple = haystack_utils.get_trigram_prompts(prompts, continuation_tokens, continuation_tokens)
prev_token_direction, curr_token_direction = haystack_utils.get_residual_trigram_directions(prompt_tuple, model, layer-1)

prev_token_sim = get_cosine_sim(prev_token_direction, layer)
curr_token_sim = get_cosine_sim(curr_token_direction, layer)
context_sim = get_cosine_sim(context_direction, layer)
    
plot_histogram(prev_token_sim, curr_token_sim, context_sim, "Prev Token", "Curr Token", "Context")
# %%
prev_sim_neurons = torch.argwhere(prev_token_sim>0.05)
curr_sim_neurons = torch.argwhere(curr_token_sim>0.03)

print(len(prev_sim_neurons), len(curr_sim_neurons))
union = haystack_utils.union_where([prev_token_sim, curr_token_sim], 0.07)
print(union)
# %%

# Get random mean cache
random_prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 500, length=20)[:, :-3]
_, random_cache = model.run_with_cache(random_prompts)


# %%
# Define ablate neuron hook

# Layer 1
# orschlägen: tensor([  61,  188, 1011], device='cuda:0')
# häufig: 268 (almost doubles loss)
# beweglich: neurons decrease loss - maybe they boost alternative completion

def get_ablate_neurons_hook(neurons, layer):
    print(neurons)
    def ablate_neurons_hook(value, hook):
        value[:, :, neurons] = random_cache[f'blocks.{layer}.mlp.hook_post'][:, :, neurons].mean((0, 1))
        return value
    return [(f'blocks.{layer}.mlp.hook_post', ablate_neurons_hook)]

# Check loss increase
original_loss, original_ablated_loss = compute_mlp_loss(prompts, df, torch.LongTensor([i for i in range(model.cfg.d_mlp)]).cuda(), compute_original_loss=True)

with model.hooks(fwd_hooks=get_ablate_neurons_hook([1789], layer)):
    ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()

print(original_loss, original_ablated_loss, ablated_loss)
# %%

# 1011 increases loss on both "gen" and "ge"
# Either it boosts both completions (trigram table)
# Or it combines "orsch" and "lä" into a single representation that later components use

# Check if trigram table by looking at the direct effect
# Total effect of L1N1011
with model.hooks(fwd_hooks=get_ablate_neurons_hook([1406], layer)):
    _, ablated_cache = model.run_with_cache(prompts)

def ablate_component_hook(value, hook):
    value = ablated_cache[hook.name]
    return value

components = [f"blocks.{layer}.mlp.hook_post" for layer in range(3, 6)] + [f"blocks.{layer}.attn.hook_z" for layer in range(3, 6)]
hooks = [(component, ablate_component_hook) for component in components]

with model.hooks(fwd_hooks=hooks):
    ablated_logits = model(prompts, return_type="logits", loss_per_token=True)[:, -2].log_softmax(-1).mean(0)

original_logits = model(prompts, return_type="logits", loss_per_token=True)[:, -2].log_softmax(-1).mean(0)

print(ablated_logits.shape, original_logits.shape)

prob_diff = original_logits - ablated_logits
prob_diff[all_ignore] = 0
prob_diff[original_logits < -7] = 0
diffs, tokens = torch.topk(prob_diff, 20)
print(diffs)
print(tokens)
print(model.to_str_tokens(tokens))

# %% 
# Direct effect
_, original_cache = model.run_with_cache(prompts)

def activate_component_hook(value, hook):
    value = original_cache[hook.name]
    return value

activate_hooks = [(component, activate_component_hook) for component in components]

with model.hooks(fwd_hooks=activate_hooks + get_ablate_neurons_hook([1406], layer)):
    activated_logits = model(prompts, return_type="logits", loss_per_token=True)[:, -2].log_softmax(-1).mean(0)

prob_diff = original_logits - activated_logits
prob_diff[all_ignore] = 0
prob_diff[original_logits < -7] = 0
diffs, tokens = torch.topk(prob_diff, 20)
print(diffs)
print(tokens)
print(model.to_str_tokens(tokens))

# Check later components + context neuron effects of 1011
# %%

#output_direction = model.W_out[1, 1011]
output_direction = model.W_out[2, 1406]
context_direction = model.W_out[3, 669]

output_sims = get_cosine_sim(output_direction, 5)
context_sims = get_cosine_sim(context_direction, 5)

plot_histogram(output_sims, context_sims, torch.zeros_like(output_sims), "Output", "Context", "Zero")
# %%
union = haystack_utils.union_where([output_sims, context_sims], 0.05)
len(union)
# %%
ngram = "orschlägen"
prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 1000, length=20)

original_loss, original_ablated_loss = compute_mlp_loss(prompts, df, torch.LongTensor([i for i in range(model.cfg.d_mlp)]).cuda(), compute_original_loss=True)

with model.hooks(fwd_hooks=get_ablate_neurons_hook(union, 5)): #712, 394, 287
    ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()

print(original_loss, original_ablated_loss, ablated_loss)



## deine / deinen grammar investigation

In [None]:
# %%
ngram = " meine Vorschläge"
prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 1000, length=20)
original_loss, original_ablated_loss = compute_mlp_loss(prompts, df, torch.LongTensor([i for i in range(model.cfg.d_mlp)]).cuda(), compute_original_loss=True)
print(original_loss, original_ablated_loss)
# %%
model.to_str_tokens(model.to_tokens(" deinen Vorschläge", prepend_bos=False))
# %%

0.3360338807106018 4.822600841522217


[' de', 'inen', ' V', 'orsch', 'lä', 'ge']

In [58]:
original_losses = []
ablated_losses = []
ngrams = [" Vorschlägen", " Vorschläge", " seine Vorschläge", " seinen Vorschläge", " seinen Vorschlägen", " seine Vorschlägen"]

for ngram in ngrams:
    prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 1000, length=20)

    with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
        ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
        ablated_losses.append(ablated_loss)
    
    original_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    original_losses.append(original_loss)



In [59]:
df = pd.DataFrame({"ngram": ngrams, "original_loss": original_losses, "ablated_loss": ablated_losses})

In [60]:
print(df)

                 ngram  original_loss  ablated_loss
0          Vorschlägen       1.233227      3.584343
1           Vorschläge       1.227341      0.848729
2     seine Vorschläge       0.372710      0.360383
3    seinen Vorschläge       3.786774      3.005842
4   seinen Vorschlägen       0.244110      2.554850
5    seine Vorschlägen       2.594725      4.802067


In [80]:
# Create traces for 'original loss' and 'ablated loss'
trace1 = go.Bar(
    x=df['ngram'],
    y=df['original_loss'],
    name='Original',
    marker_color='blue'
)

trace2 = go.Bar(
    x=df['ngram'],
    y=df['ablated_loss'],
    name='Ablated',
    marker_color='green'
)

correct_ngrams = [" Vorschlägen", " Vorschläge", " seine Vorschläge", "", " seinen Vorschlägen", ""]
incorrect_ngrams = ["", "", "", " seinen Vorschläge", "", " seine Vorschlägen"]


annotations = []
for i, ngram in enumerate(incorrect_ngrams):
    if ngram:  # if ngram is not empty
        annotations.append(dict(
            x=i,  # position of the label
            y=-0.052,  # adjust this value to move the label up or down
            xref="x",
            yref="paper",
            text=ngram,  # the label
            font=dict(
                color="red",
                size=12
            ),
            showarrow=False
        ))

# Create the layout
layout = go.Layout(
    title='Original Loss vs Ablated Loss',
    barmode='group',  # this is what makes the bars grouped instead of stacked
    xaxis=dict(tickmode='array', tickvals=df.index, ticktext=correct_ngrams),  # hide original x-axis labels
    annotations=annotations,  # add annotations for the red labels
    width=950
)

# Create the Figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

fig.show()

## Checking if AND conditions hold

Logit difference plan
- Option 1
    - Run model for "NNN"
    - Run model for all other modes
    - Patch "NNN" activation for one neuron at a time
    - Record difference in "gen" logit
- option 2
    - Run model for each mode, do DLA with "gen"
    - Subtract DLA of "NNN" from all other DLA 

In [3]:
#NGRAM = "orschlägen"
#NGRAM = " häufig"
#NGRAM = " beweglich"
NGRAM = " seinen Vorschlägen"
ANSWER_TOKEN = model.to_str_tokens(model.to_tokens(NGRAM, prepend_bos=False).flatten())[-1]
ANSWER_TOKEN_ID = model.to_tokens(NGRAM).flatten()[-1].item()
df = pd.read_pickle(f"data/and_neurons/df_{NGRAM.strip()}.pkl") 
print(ANSWER_TOKEN, ANSWER_TOKEN_ID)

gen 1541


In [4]:
# Look for neurons that deboost "gen" if "orsch" is not present
df = pd.read_pickle(f"data/and_neurons/df_{NGRAM.strip()}.pkl") 
prompts = haystack_utils.generate_random_prompts(NGRAM, model, common_tokens, 2000, length=20)

In [5]:
# Toggle experiment
ablated_prompts = create_ablation_prompts(prompts, "NYY")

yyy_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
nyy_loss = model(ablated_prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
    yyn_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    nyn_loss = model(ablated_prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()

# Order: ablated - activated (positive loss increase from ablation)
# Loss increase of "orsch" without context: YYN vs NYN
yyn_nyn_diff = nyn_loss - yyn_loss
# Loss increase of context without "orsch": NYY vs NYN
nyy_nyn_diff = nyn_loss - nyy_loss
# Loss increase of both: YYY vs NYN
yyy_nyn_diff =  nyn_loss - yyy_loss

print(f"Loss difference for (NYN-YYY): {yyy_nyn_diff:.2f}")
print(f"Loss difference for (NYN-YYN) + (NYN-NYY): {yyn_nyn_diff + nyy_nyn_diff:.2f}")

Loss difference for (NYN-YYY): 2.83
Loss difference for (NYN-YYN) + (NYN-NYY): 0.13


In [6]:
# Toggle experiment LOGITS
ablated_prompts = create_ablation_prompts(prompts, "NYY")

yyy_logit = model(prompts, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
nyy_logit = model(ablated_prompts, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
    yyn_logit = model(prompts, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
    nyn_logit = model(ablated_prompts, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()

# Order: original - ablated (Positive boost to answer logit)
yyn_nyn_diff = yyn_logit - nyn_logit
nyy_nyn_diff = nyy_logit - nyn_logit 
yyy_nyn_diff =  yyy_logit - nyn_logit

print(f"Logit difference for (YYY-NYN): {yyy_nyn_diff:.2f}")
print(f"Logit difference for (YYN-NYN) + (NYY-NYN): {yyn_nyn_diff + nyy_nyn_diff:.2f}")

Logit difference for (YYY-NYN): 20.95
Logit difference for (YYN-NYN) + (NYY-NYN): 18.65


In [7]:
# Toggle experiment
ablated_prompts_1 = create_ablation_prompts(prompts, "NYY")
ablated_prompts_2 = create_ablation_prompts(prompts, "YNY")
ablated_prompts_3 = create_ablation_prompts(prompts, "NNY")

yyy_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
nyy_loss = model(ablated_prompts_1, return_type="loss", loss_per_token=True)[:, -1].mean().item()
yny_loss = model(ablated_prompts_2, return_type="loss", loss_per_token=True)[:, -1].mean().item()
nny_loss = model(ablated_prompts_3, return_type="loss", loss_per_token=True)[:, -1].mean().item()
with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
    yyn_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    nyn_loss = model(ablated_prompts_1, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    ynn_loss = model(ablated_prompts_2, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    nnn_loss = model(ablated_prompts_3, return_type="loss", loss_per_token=True)[:, -1].mean().item()

# Loss increases for individual features
nny_diff = nnn_loss - nny_loss
nyn_diff = nnn_loss - nyn_loss
ynn_diff = nnn_loss - ynn_loss
# Loss increase for two features
nyy_diff = nnn_loss - nyy_loss
yny_diff = nnn_loss - yny_loss
yyn_diff = nnn_loss - yyn_loss
# Loss increase for all features
yyy_diff = nnn_loss - yyy_loss

# Check if sum of individual features is lower than overall loss increase
print(f"Loss difference for (NNN-YYY): {yyy_diff:.2f}")
print(f"Loss difference for ((NNN-YYN) + (NNN-NYY) + (NNN-YNY))/2: {(yyn_diff + nyy_diff + yny_diff)/2:.2f}")
print(f"Loss difference for (NNN-YNN) + (NNN-NYN) + (NNN-YNN): {ynn_diff + nyn_diff + nny_diff:.2f}")

Loss difference for (NNN-YYY): 9.26
Loss difference for ((NNN-YYN) + (NNN-NYY) + (NNN-YNY))/2: 6.34
Loss difference for (NNN-YNN) + (NNN-NYN) + (NNN-YNN): 5.84


In [8]:
# Toggle experiment LOGITS
ablated_prompts_1 = create_ablation_prompts(prompts, "NYY")
ablated_prompts_2 = create_ablation_prompts(prompts, "YNY")
ablated_prompts_3 = create_ablation_prompts(prompts, "NNY")

yyy_logit = model(prompts, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
nyy_logit = model(ablated_prompts_1, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
yny_logit = model(ablated_prompts_2, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
nny_logit = model(ablated_prompts_3, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
    yyn_logit = model(prompts, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
    nyn_logit = model(ablated_prompts_1, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
    ynn_logit = model(ablated_prompts_2, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()
    nnn_logit = model(ablated_prompts_3, return_type="logits", loss_per_token=True)[:, -2, ANSWER_TOKEN_ID].mean().item()

# Loss increases for individual features
nny_diff = nny_logit - nnn_logit
nyn_diff = nyn_logit - nnn_logit
ynn_diff = ynn_logit - nnn_logit

# Loss increase for two features
nyy_diff = nyy_logit - nnn_logit
yny_diff = yny_logit - nnn_logit
yyn_diff = yyn_logit - nnn_logit

# Loss increase for all features
yyy_diff = yyy_logit - nnn_logit

# Check if sum of individual features is lower than overall loss increase
print(f"Logit difference for (YYY-NNN): {yyy_diff:.2f}")
print(f"Logit difference for ((YYN-NNN) + (NYY-NNN) + (YNY-NNN))/2: {(yyn_diff + nyy_diff + yny_diff)/2:.2f}")
print(f"Logit difference for (YNN-NNN) + (NYN-NNN) + (YNN-NNN): {ynn_diff + nyn_diff + nny_diff:.2f}")

Logit difference for (YYY-NNN): 30.59
Logit difference for ((YYN-NNN) + (NYY-NNN) + (YNY-NNN))/2: 19.87
Logit difference for (YNN-NNN) + (NYN-NNN) + (YNN-NNN): 11.20


## DLA DF

In [105]:
NGRAM = " häufig"
ANSWER_TOKEN = model.to_str_tokens(model.to_tokens(NGRAM, prepend_bos=False).flatten())[-1]
ANSWER_TOKEN_ID = model.to_tokens(NGRAM).flatten()[-1].item()
df = pd.read_pickle(f"data/and_neurons/df_{NGRAM.strip()}.pkl") 

In [106]:
dla_dict = {}
prompts = haystack_utils.generate_random_prompts(NGRAM, model, common_tokens, 100, length=20)


for ablation_mode in ["NNN", "NNY", "NYN", "YNN", "NYY", "YNY", "YYN", "YYY"]:
    ablated_prompts = create_ablation_prompts(prompts, ablation_mode)
    unembed_dir = model.tokens_to_residual_directions(ANSWER_TOKEN)
    
    if ablation_mode[2] == "N":
        with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
            _, cache = model.run_with_cache(ablated_prompts)
    else:
        _, cache = model.run_with_cache(ablated_prompts)
    
    resid_stack, labels = cache.get_full_resid_decomposition(apply_ln=True, pos_slice=-1, return_labels=True)
    crop = -2050, -2
    assert all([label.startswith("L5N") for label in labels[crop[0]:crop[1]]])
    resid_stack = resid_stack.mean(1)[crop[0]:crop[1]]
    dla = resid_stack @ unembed_dir
    dla_dict[ablation_mode] = dla.cpu().numpy()

dla_df = pd.DataFrame(dla_dict)
dla_df["Neuron"] = [i for i in range(2048)]
dla_df.head()

Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now


Unnamed: 0,NNN,NNY,NYN,YNN,NYY,YNY,YYN,YYY,Neuron
0,-0.001708,-0.000724,-0.001698,-0.001592,-0.000905,-0.000635,-0.002046,-0.000973,0
1,0.010519,0.007268,0.011617,0.011727,0.008633,0.007868,0.011307,0.007614,1
2,0.020536,0.01499,0.007832,0.015499,0.005394,0.010672,0.005594,0.002855,2
3,-0.004317,-0.004889,-0.004148,-0.004365,-0.003751,-0.004864,-0.004828,-0.004589,3
4,-0.023497,-0.040227,-0.033238,-0.010304,-0.050938,-0.031528,-0.025934,-0.048683,4


In [107]:
# DF containing neuron wise DLA for "gen" token with different features (PrevToken, CurrToken, Context) present
dla_df["and_1"] = ((dla_df["YYY"] - dla_df["NNN"]) > ((dla_df["YNY"] - dla_df["NNN"]) + (dla_df["NYY"] - dla_df["NNN"]) + (dla_df["YYN"] - dla_df["NNN"]))/2) & (dla_df["YYY"]>0)
dla_df["and_2"] = ((dla_df["YYY"] - dla_df["NNN"]) > ((dla_df["YNN"] - dla_df["NNN"]) + (dla_df["NYN"] - dla_df["NNN"]) + (dla_df["NNY"] - dla_df["NNN"]))) & (dla_df["YYY"]>0)
dla_df["diff_1"] = (dla_df["YYY"] - dla_df["NNN"]) - ((dla_df["YNY"] - dla_df["NNN"]) + (dla_df["NYY"] - dla_df["NNN"]) + (dla_df["YYN"] - dla_df["NNN"]))/2
dla_df["diff_2"] = (dla_df["YYY"] - dla_df["NNN"]) - ((dla_df["YNN"] - dla_df["NNN"]) + (dla_df["NYN"] - dla_df["NNN"]) + (dla_df["NNY"] - dla_df["NNN"]))

px.histogram(dla_df, x=["diff_1", "diff_2"], barmode="overlay", histnorm='percent', width=800, title=f"'{ANSWER_TOKEN}' DLA differences for '(YYY-NNN)>((YNY-NNN)+(NYY-NNN)+(YYN-NNN))/2'")

In [108]:
prompts = haystack_utils.generate_random_prompts(NGRAM, model, common_tokens, 1000, length=20)

# Diff 1
threshold = 0.005
neurons = torch.LongTensor(dla_df[(dla_df["diff_1"]>threshold) & (dla_df["YYY"]>0)]["Neuron"].tolist())
original_loss, ablated_loss = compute_mlp_loss(prompts, df, neurons, ablate_mode="YYN", mean=True, compute_original_loss=True)
print(f"Ablating {len(neurons)} neurons: {ablated_loss:.2f} (original: {original_loss:.2f})")
# Diff 2
neurons = torch.LongTensor(dla_df[(dla_df["diff_2"]>threshold) & (dla_df["YYY"]>0)]["Neuron"].tolist())
original_loss, ablated_loss = compute_mlp_loss(prompts, df, neurons, ablate_mode="YYN", mean=True, compute_original_loss=True)
print(f"Ablating {len(neurons)} neurons: {ablated_loss:.2f} (original: {original_loss:.2f})")


Ablating 132 neurons: 0.04 (original: 0.09)
Ablating 221 neurons: 0.04 (original: 0.09)


## Activation AND

In [9]:
# Look for neurons that deboost "gen" if "orsch" is not present
NGRAM = " seinen Vorschlägen"
ANSWER_TOKEN = model.to_str_tokens(model.to_tokens(NGRAM, prepend_bos=False).flatten())[-1]
ANSWER_TOKEN_ID = model.to_tokens(NGRAM).flatten()[-1].item()
df = pd.read_pickle(f"data/and_neurons/df_{NGRAM.strip()}.pkl") 

In [10]:
# DF activations with different features (PrevToken, CurrToken, Context) present
df["and_1"] = ((df["YYY"] - df["NNN"]) > ((df["YNY"] - df["NNN"]) + (df["NYY"] - df["NNN"]) + (df["YYN"] - df["NNN"]))/2) & (df["YYY"]>0)
df["and_2"] = ((df["YYY"] - df["NNN"]) > ((df["YNN"] - df["NNN"]) + (df["NYN"] - df["NNN"]) + (df["NNY"] - df["NNN"]))) & (df["YYY"]>0)
df["diff_1"] = (df["YYY"] - df["NNN"]) - ((df["YNY"] - df["NNN"]) + (df["NYY"] - df["NNN"]) + (df["YYN"] - df["NNN"]))/2
df["diff_2"] = (df["YYY"] - df["NNN"]) - ((df["YNN"] - df["NNN"]) + (df["NYN"] - df["NNN"]) + (df["NNY"] - df["NNN"]))

px.histogram(df, x=["diff_1", "diff_2"], histnorm='percent', barmode="overlay", width=800, title=f"Pre-gelu activation differences: '(NNN-YYY)-((NNN-YNY)+(NNN-NYY)+(NNN-YYN))/2'")

In [11]:
prompts = haystack_utils.generate_random_prompts(NGRAM, model, common_tokens, 1000, length=20)
# Diff 1
threshold = 0.2
neurons = torch.LongTensor(df[(df["diff_1"]>threshold) & (df["YYY"]>0)].index.tolist())
compare_mlp_losses(prompts, df, neurons, ablate_mode="YYN")
# Diff 2
threshold = 0.4
neurons = torch.LongTensor(df[(df["diff_2"]>threshold) & (df["YYY"]>0)].index.tolist())
compare_mlp_losses(prompts, df, neurons, ablate_mode="YYN")

Ablating 121 neurons: 0.14 (original: 0.23, all ablated: 2.98)
Ablating 144 neurons: 0.27 (original: 0.23, all ablated: 2.98)


In [12]:
# NEGATIVE and
df["and_1"] = ((df["NNN"] - df["YYY"]) > ((df["NNN"]-df["YNY"]) + (df["NNN"] - df["NYY"]) + (df["NNN"] - df["YYN"]))/2)
df["and_2"] = ((df["NNN"] - df["YYY"]) > ((df["NNN"]-df["YNN"]) + (df["NNN"] - df["NYN"]) + (df["NNN"] - df["NNY"])))
df["diff_1"] = (df["NNN"] - df["YYY"]) - ((df["NNN"]-df["YNY"]) + (df["NNN"] - df["NYY"]) + (df["NNN"] - df["YYN"]))/2
df["diff_2"] = (df["NNN"] - df["YYY"]) - ((df["NNN"]-df["YNN"]) + (df["NNN"] - df["NYN"]) + (df["NNN"] - df["NNY"]))

px.histogram(df, x=["diff_1", "diff_2"], histnorm='percent', barmode="overlay", width=800, title=f"Pre-gelu activation differences: '(NNN-YYY)-((NNN-YNY)+(NNN-NYY)+(NNN-YYN))/2'")

In [13]:
# Diff 1
threshold = 0.1
neurons = torch.LongTensor(df[(df["diff_1"]>threshold) & (df["YYY"]>0)].index.tolist())
compare_mlp_losses(prompts, df, neurons, ablate_mode="YYN")
# Diff 2
threshold = 0.2
neurons = torch.LongTensor(df[(df["diff_2"]>threshold) & (df["YYY"]>0)].index.tolist())
compare_mlp_losses(prompts, df, neurons, ablate_mode="YYN")

Ablating 109 neurons: 0.44 (original: 0.23, all ablated: 2.98)
Ablating 116 neurons: 0.32 (original: 0.23, all ablated: 2.98)


## Suppression neurons

In [None]:
# Look for neurons that deboost "gen" if "orsch" is not present
option = NGRAM
df = pd.read_pickle(f"data/and_neurons/df_{option}.pkl") 
df.head()

Prev/Curr/Context,NNN,NNY,NYN,NYY,YNN,YNY,YYN,YYY,PrevTokenSim,CurrTokenSim,ContextSim,PosSim,NegSim,AblationDiff,And,NegAnd,Boosted,Deboosted,FullAblationLossIncrease,ContextAblationLossIncrease
Neuron,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-1.452348,-1.865411,-1.206139,-1.716999,-1.967061,-2.380259,-1.32276,-1.855663,-0.092576,0.017535,-0.060797,False,True,-0.532903,False,False,False,False,-0.004094,-0.00582
1,-1.041662,-1.18613,-1.242793,-1.350295,-1.351736,-1.42218,-1.183853,-1.149874,-0.031552,-0.006965,-0.008395,False,True,0.033978,False,False,False,False,-0.001441,-0.000631
2,-2.127363,-2.293197,-0.955758,-1.091026,-1.940194,-2.136701,-0.77345,-0.983497,0.034648,0.081715,-0.074554,False,False,-0.210047,False,False,False,False,0.002052,-0.000335
3,-1.090268,-1.259026,-2.459712,-2.098881,-1.240608,-1.239972,-2.479871,-2.146816,0.015636,-0.060014,-0.009209,False,True,0.333055,False,False,False,False,0.020078,-0.003826
4,0.533762,0.91939,-0.291044,0.052384,0.248903,0.697211,-0.913438,-0.483871,-0.039145,-0.049453,0.03519,False,False,0.429567,False,False,False,False,-0.063059,0.003769


In [105]:
prompts = haystack_utils.generate_random_prompts("lä", model, common_tokens, 100, length=20)
logits, cache = model.run_with_cache(prompts)
unembed_dir = model.tokens_to_residual_directions("gen")
resid_stack, labels = cache.get_full_resid_decomposition(apply_ln=True, pos_slice=-1, return_labels=True)
crop = -2050, -2

resid_stack = resid_stack.mean(1)[crop[0]:crop[1]]
dla = resid_stack @ unembed_dir
dla_df = pd.DataFrame({"Neuron": labels[crop[0]:crop[1]], "NOT A DLA": dla.tolist()})

Tried to stack head results when they weren't cached. Computing head results now


In [None]:
prompts = haystack_utils.generate_random_prompts("orschlä", model, common_tokens, 100, length=20)
logits, cache = model.run_with_cache(prompts)
unembed_dir = model.tokens_to_residual_directions("gen")
resid_stack, labels = cache.get_full_resid_decomposition(apply_ln=True, pos_slice=-1, return_labels=True)

resid_stack = resid_stack.mean(1)[crop[0]:crop[1]]
dla = resid_stack @ unembed_dir
dla_df["A DLA"] = dla.tolist()

Tried to stack head results when they weren't cached. Computing head results now


In [None]:
dla_df["TurnedOff"] = (dla_df["NOT A DLA"]<-0.001) & (df["NYY"]>0) & (dla_df["A DLA"]>0) & (df["YYY"]<df["NYY"])
neurons = dla_df[dla_df["TurnedOff"]]["Neuron"].tolist()
neurons = torch.LongTensor([int(neuron.split("N")[1]) for neuron in neurons])
print(len(neurons), neurons)

39 tensor([  67,   73,   95,  111,  169,  198,  231,  286,  420,  445,  522,  570,
         594,  681,  705,  749,  770,  827,  997, 1064, 1099, 1123, 1205, 1216,
        1304, 1360, 1386, 1586, 1738, 1744, 1786, 1864, 1867, 1901, 1905, 1909,
        1949, 1964, 1969])


In [None]:
def compute_counterfactual_losses(prompts, df, neurons):
    losses = [model(prompts, return_type="loss", loss_per_token=True)[:, -1].tolist()]
    ablations = ["NYY", "YNY", "YYN"]
    names = ["Original"] + ablations
    for ablation_mode in ablations:
        ablated_loss = compute_mlp_loss(prompts, df, neurons, ablate_mode=ablation_mode, mean=False)
        losses.append(ablated_loss)
    return losses, names

prompts = haystack_utils.generate_random_prompts(NGRAM, model, common_tokens, 1000, length=20)
losses, names = compute_counterfactual_losses(prompts, df, neurons)#torch.LongTensor([i for i in range(100)]))
plotting_utils.plot_barplot(losses, names, title="Losses for patching 39 NOT A neurons with different ablation modes")

## Looking for unique prompts