In [1]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer, ActivationCache, utils, patching
from jaxtyping import Float, Int, Bool
from torch import Tensor
from tqdm.auto import tqdm
import plotly.io as pio
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import json
import plotting_utils
import hook_utils

pio.renderers.default = "notebook_connected"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

from haystack_utils import get_mlp_activations
import haystack_utils

%reload_ext autoreload
%autoreload 2

In [2]:
ngram = "orschlägen"
model = HookedTransformer.from_pretrained("EleutherAI/pythia-70m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device)

activate_neurons_fwd_hooks, deactivate_neurons_fwd_hooks = haystack_utils.get_context_ablation_hooks(3, [669], model)
all_ignore, _ = haystack_utils.get_weird_tokens(model, plot_norms=False)

german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
common_tokens = haystack_utils.get_common_tokens(german_data, model, all_ignore, k=100)

# Sort tokens into new word vs continuation
new_word_tokens = []
continuation_tokens = []
for token in common_tokens:
    str_token = model.to_single_str_token(token.item())
    if str_token.startswith(" "):
        new_word_tokens.append(token)
    else:
        continuation_tokens.append(token)
new_word_tokens = torch.stack(new_word_tokens)
continuation_tokens = torch.stack(continuation_tokens)

context_direction = model.W_out[3, 669, :]

def get_cosine_sim(direction: Float[Tensor, "d_res"], layer=5) -> Float[Tensor, "d_mlp"]:
    cosine = torch.nn.CosineSimilarity(dim=1)
    return cosine(model.W_in[layer].T, direction.unsqueeze(0))

def plot_histogram(t1, t2, t3, name1, name2, name3):
    t1 = t1.cpu().numpy()
    t2 = t2.cpu().numpy()
    t3 = t3.cpu().numpy()
    fig = go.Figure()
    bin_width= 0.01
    fig.add_trace(go.Histogram(x=t1, name=name1, opacity=0.5, histnorm='probability density', xbins=dict(size=bin_width)))
    fig.add_trace(go.Histogram(x=t2, name=name2, opacity=0.5 , histnorm='probability density', xbins=dict(size=bin_width)))
    fig.add_trace(go.Histogram(x=t3, name=name3, opacity=0.5, histnorm='probability density', xbins=dict(size=bin_width)))

    fig.update_layout(
        title="Individual MLP5 similarities to direction vectors",
        xaxis_title="Cosine Similarity",
        yaxis_title="Probability Density",
        barmode="overlay",
    )

    fig.show()

def compute_mlp_loss(prompts, df, neurons, ablate_mode="NNN", layer=5, compute_original_loss=False, mean=True):

    mean_activations = torch.Tensor(df[df.index.isin(neurons.tolist())][ablate_mode].tolist()).cuda()
    def ablate_mlp_hook(value, hook):
        value[:, :, neurons] = mean_activations
        return value

    with model.hooks(fwd_hooks=[(f"blocks.{layer}.mlp.hook_pre", ablate_mlp_hook)]):
        if mean:
            ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
        else:
            ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].tolist()

    if compute_original_loss:
        if mean:
            loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
        else:
            loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].tolist()
        return loss, ablated_loss
    return ablated_loss

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


  0%|          | 0/200 [00:00<?, ?it/s]

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [126]:
# Loss change for different AND thresholds
option = "orschlägen"
df = pd.read_pickle(f"data/and_neurons/df_{option}.pkl") 

with open(f"data/and_neurons/set_losses.json", "r") as f:
    all_losses = json.load(f)

prompts = haystack_utils.generate_random_prompts(option, model, common_tokens, 500, length=20)

In [131]:
df.head()

Prev/Curr/Context,NNN,NNY,NYN,NYY,YNN,YNY,YYN,YYY,PrevTokenSim,CurrTokenSim,...,NegSim,AblationDiff,And,NegAnd,Boosted,Deboosted,FullAblationLossIncrease,ContextAblationLossIncrease,Custom,Diff
Neuron,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.452348,-1.865411,-1.206139,-1.716999,-1.967061,-2.380259,-1.32276,-1.855663,-0.092576,0.017535,...,True,-0.532903,False,False,False,False,-0.004094,-0.00582,True,0.128172
1,-1.041662,-1.18613,-1.242793,-1.350295,-1.351736,-1.42218,-1.183853,-1.149874,-0.031552,-0.006965,...,True,0.033978,False,False,False,False,-0.001441,-0.000631,True,0.307458
2,-2.127363,-2.293197,-0.955758,-1.091026,-1.940194,-2.136701,-0.77345,-0.983497,0.034648,0.081715,...,False,-0.210047,False,False,False,False,0.002052,-0.000335,False,-0.046589
3,-1.090268,-1.259026,-2.459712,-2.098881,-1.240608,-1.239972,-2.479871,-2.146816,0.015636,-0.060014,...,True,0.333055,False,False,False,False,0.020078,-0.003826,True,0.217412
4,0.533762,0.91939,-0.291044,0.052384,0.248903,0.697211,-0.913438,-0.483871,-0.039145,-0.049453,...,False,0.429567,False,False,False,False,-0.063059,0.003769,False,-0.135069


In [None]:
# DLA


In [141]:
df["Custom"] = (df["YYY"] - df["NNN"]) > ((df["NYN"] - df["NNN"])+(df["NNY"] - df["NNN"])+(df["YNN"] - df["NNN"]))
df["Diff"] = (df["YYY"] - df["NNN"]) - ((df["NYN"] - df["NNN"])+(df["NNY"] - df["NNN"])+(df["YNN"] - df["NNN"]))
df["DiffCon"] = df["GenSim"] > df["GeSim"]
import plotly.express as px
px.histogram(df, x="Diff", histnorm='percent', color="DiffCon", barmode="overlay", width=800)

In [137]:
df[~df["DiffCon"]]["Diff"].mean()

0.013157262680720467

In [138]:
gen_token = model.to_single_token("gen")
ge_token = model.to_single_token("ge")
gen_dir = model.tokens_to_residual_directions(gen_token)
ge_dir = model.tokens_to_residual_directions(ge_token)

cos = torch.nn.CosineSimilarity(dim=1)
gen_sims = cos(model.W_out[5], gen_dir.unsqueeze(0)).cpu().numpy()
ge_sims = cos(model.W_out[5], ge_dir.unsqueeze(0)).cpu().numpy()

df["GenSim"] = gen_sims
df["GeSim"] = ge_sims

In [139]:
px.scatter(df, x="GeSim", y="GenSim")

In [5]:
# Check how similar gen and ge dirs are
cos = torch.nn.CosineSimilarity(dim=0)
cos(gen_dir, ge_dir)

tensor(0.3292, device='cuda:0')

In [17]:
haystack_utils.get_boosted_tokens(prompts, model, deactivate_neurons_fwd_hooks, all_ignore, deboost=False)
haystack_utils.get_boosted_tokens(prompts, model, deactivate_neurons_fwd_hooks, all_ignore, deboost=True)

Boosted tokens: uf (+4.21), gen (+1.78), ß (+1.43), ger (+0.31), ßen (+0.14)
Deboosted tokens: ge (-0.49), gt (-0.15)


In [18]:
print(df[df["AblationDiff"]>0.2][["AblationDiff", "GenSim", "GeSim"]].mean())
print(df[df["AblationDiff"]<-0.2][["AblationDiff", "GenSim", "GeSim"]].mean())

Prev/Curr/Context
AblationDiff    0.434787
GenSim          0.002992
GeSim          -0.000410
dtype: float64
Prev/Curr/Context
AblationDiff   -0.431541
GenSim          0.000029
GeSim           0.004855
dtype: float64


In [121]:
df_tmp = df.copy()
ablation_mode = "YYN"

df_tmp["Custom"] = (df["YYY"]>0) & (df["YYY"]>df["NNN"]) & (df["GenSim"]>df["GeSim"]) & \
    (df["YYY"]>df["YYN"]) & (df["YYY"]>df["YNY"]) & (df["YYY"]>df["NYY"]) &\
    (df["YYY"]>df["NYN"]) & (df["YYY"]>df["NNY"]) & (df["YYY"]>df["YNN"])# & df["PosSim"]

print(df_tmp["Custom"].sum())
pos_and_neurons = torch.LongTensor(df_tmp[df_tmp["Custom"]].index.tolist()).cuda()

#df_tmp["context_diff"] = df_tmp["YYY"] - df_tmp["YYN"]
#df_tmp = df_tmp.sort_values(by=["Custom", "context_diff"], ascending=False)
#pos_and_neurons = torch.LongTensor(df_tmp.index.tolist()[:30]).cuda()

original_loss, ablated_loss = compute_mlp_loss(prompts, df, pos_and_neurons, ablate_mode=ablation_mode, compute_original_loss=True)
print(original_loss, ablated_loss)

KeyError: 'GenSim'

In [None]:
def compute_counterfactual_losses(prompts, df, neurons):
    losses = [model(prompts, return_type="loss", loss_per_token=True)[:, -1].tolist()]
    ablations = ["YYY", "YNY", "NYY", "YYN", "YNN", "NYN", "NNY", "NNN"]
    names = ["Original"] + ablations
    for ablation_mode in ablations:
        ablated_loss = compute_mlp_loss(prompts, df, neurons, ablate_mode=ablation_mode, mean=False)
        losses.append(ablated_loss)
    return losses, names

losses, names = compute_counterfactual_losses(prompts, df, pos_and_neurons)#torch.LongTensor([i for i in range(100)]))
plotting_utils.plot_barplot(losses, names, title="Losses for patching 102 AND MLP5 neurons with different ablation modes", yrange=(0, 24))

In [None]:
haystack_utils.clean_cache()
prompts = haystack_utils.generate_random_prompts(option, model, common_tokens, 500, length=20)


In [None]:
with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
    deactivated_loss, cache = model.run_with_cache(prompts, return_type="loss", loss_per_token=True)

#cache = cache["blocks.5.mlp.hook_pre"][:, -2].mean(0)
print(deactivated_loss[:, -1].mean().item())

3.2064943313598633


In [None]:
cache["post", 5].shape

torch.Size([500, 23, 2048])

In [None]:
compute_mlp_loss(prompts, df, torch.LongTensor([i for i in range(2048)]), "YYN", compute_original_loss=True)

(1.4382295608520508, 5.653645992279053)

In [None]:
mean_activations_df = torch.Tensor(df[df.index.isin([i for i in range(2048)])]["YYN"].tolist()).cuda()

In [None]:
print(mean_activations_df.shape, cache.shape)
print(mean_activations_df[:10])
print(cache[:10])

torch.Size([2048]) torch.Size([2048])
tensor([-1.3228, -1.1839, -0.7734, -2.4799, -0.9134, -0.9777, -1.1465, -1.8528,
        -0.9403, -0.6645], device='cuda:0')
tensor([-1.3351, -1.1835, -0.7855, -2.4714, -0.9022, -0.9851, -1.1769, -1.8635,
        -0.9359, -0.6646], device='cuda:0')


In [None]:
# And thresholds without similarity constraint
prompts = haystack_utils.generate_random_prompts(option, model, common_tokens, 2000, length=20)
losses = []
lens =  []
thresholds = []
for and_threshold in np.arange(-0.5, 2, 0.1):
    df_tmp = df.copy()
    ablation_mode = "YYN"
    df_tmp["Custom"] = (df["YYY"]>and_threshold) & (df["YYN"]<=and_threshold) & (df["YNY"]<=and_threshold) & (df["NYY"]<=and_threshold) & (df["YNN"]<=and_threshold) & (df["NNY"]<=and_threshold)& (df["NYN"]<=and_threshold)
    pos_and_neurons = torch.LongTensor(df_tmp[df_tmp["Custom"]].index.tolist()).cuda()

    original_loss, ablated_loss = compute_mlp_loss(prompts, df, pos_and_neurons, ablate_mode=ablation_mode, compute_original_loss=True)
    if len(losses) == 0:
        losses.append([original_loss])
    losses.append([ablated_loss])
    lens.append(len(pos_and_neurons))
    thresholds.append(and_threshold)
names = ["Original"] + [f"AND {thr:.2f} ({length})" for thr, length in zip(thresholds, lens)]
haystack_utils.plot_barplot(losses, names, title="AND neuron loss increase for different thresholds")

In [None]:
# And thresholds with similarity constraint
losses = []
lens =  []
thresholds = []
for and_threshold in np.arange(-0.5, 2, 0.1):
    df_tmp = df.copy()
    ablation_mode = "YYN"
    df_tmp["Custom"] = (df["GenSim"]>df["GeSim"]) & \
        (df["YYY"]>and_threshold) & (df["YYN"]<=and_threshold) & (df["YNY"]<=and_threshold) & (df["NYY"]<=and_threshold) & (df["YNN"]<=and_threshold) & (df["NNY"]<=and_threshold)& (df["NYN"]<=and_threshold)
    pos_and_neurons = torch.LongTensor(df_tmp[df_tmp["Custom"]].index.tolist()).cuda()

    original_loss, ablated_loss = compute_mlp_loss(prompts, df, pos_and_neurons, ablate_mode=ablation_mode, compute_original_loss=True)
    if len(losses) == 0:
        losses.append([original_loss])
    losses.append([ablated_loss])
    lens.append(len(pos_and_neurons))
    thresholds.append(and_threshold)
names = ["Original"] + [f"AND {thr:.2f} ({length})" for thr, length in zip(thresholds, lens)]
haystack_utils.plot_barplot(losses, names, title="AND neuron loss increase with cosine sim constraint: gen > ge")

In [None]:
with model.hooks(deactivate_neurons_fwd_hooks):
    ablated_loss, ablated_cache = model.run_with_cache(prompts, return_type="loss")

def get_ablate_neurons_hook(neuron: int | list[int], ablated_cache, layer=5):
    def ablate_neurons_hook(value, hook):
        value[:, :, neuron] = ablated_cache[f'blocks.{layer}.mlp.hook_post'][:, :, neuron]
        return value
    return [(f'blocks.{layer}.mlp.hook_post', ablate_neurons_hook)]

ablate_top_neurons_hook = get_ablate_neurons_hook([i for i in range(2048)], ablated_cache)

In [None]:
original_logprobs, ablated_logprobs, _, all_MLP5_logprobs = haystack_utils.get_direct_effect(prompts, model, pos=-2, context_ablation_hooks=deactivate_neurons_fwd_hooks, context_activation_hooks=activate_neurons_fwd_hooks, return_type='logprobs')

diffs = (original_logprobs - ablated_logprobs).mean(0)
diffs[all_ignore] = 0
diffs[original_logprobs.mean(0)<-7] = 0
top_diff, top_token = torch.topk(diffs, 20)
print(top_diff)
print(model.to_str_tokens(top_token))

tensor([1.5872, 1.4587, 0.8471, 0.3234, 0.0033, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000], device='cuda:0')
['ß', 'gen', 'ßen', 'ger', 'gt', '-', '*', ',', '(', '&', ')', '+', "'", '!', '<|endoftext|>', '<|padding|>', '%', '$', '"', '#']


In [None]:

# %%

option = "orschlägen"
ablation_mode = "YYN"
prompts = haystack_utils.generate_random_prompts(option, model, common_tokens, 1000, length=20)

names = list(all_losses[option][ablation_mode].keys())
losses = [[all_losses[option][ablation_mode][name]] for name in names]

print(len(names), len(losses))
print([len(x) for x in losses])
haystack_utils.plot_barplot(losses, names)


In [None]:


layer = 2
ngram = "orschlägen"
prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 500, length=20)

if ngram.startswith(" "):
    prompt_tuple = haystack_utils.get_trigram_prompts(prompts, new_word_tokens, continuation_tokens)
else:
    prompt_tuple = haystack_utils.get_trigram_prompts(prompts, continuation_tokens, continuation_tokens)
prev_token_direction, curr_token_direction = haystack_utils.get_residual_trigram_directions(prompt_tuple, model, layer-1)

prev_token_sim = get_cosine_sim(prev_token_direction, layer)
curr_token_sim = get_cosine_sim(curr_token_direction, layer)
context_sim = get_cosine_sim(context_direction, layer)
    
plot_histogram(prev_token_sim, curr_token_sim, context_sim, "Prev Token", "Curr Token", "Context")
# %%
prev_sim_neurons = torch.argwhere(prev_token_sim>0.05)
curr_sim_neurons = torch.argwhere(curr_token_sim>0.03)

print(len(prev_sim_neurons), len(curr_sim_neurons))
union = haystack_utils.union_where([prev_token_sim, curr_token_sim], 0.07)
print(union)
# %%

# Get random mean cache
random_prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 500, length=20)[:, :-3]
_, random_cache = model.run_with_cache(random_prompts)


# %%
# Define ablate neuron hook

# Layer 1
# orschlägen: tensor([  61,  188, 1011], device='cuda:0')
# häufig: 268 (almost doubles loss)
# beweglich: neurons decrease loss - maybe they boost alternative completion

def get_ablate_neurons_hook(neurons, layer):
    print(neurons)
    def ablate_neurons_hook(value, hook):
        value[:, :, neurons] = random_cache[f'blocks.{layer}.mlp.hook_post'][:, :, neurons].mean((0, 1))
        return value
    return [(f'blocks.{layer}.mlp.hook_post', ablate_neurons_hook)]

# Check loss increase
original_loss, original_ablated_loss = compute_mlp_loss(prompts, df, torch.LongTensor([i for i in range(model.cfg.d_mlp)]).cuda(), compute_original_loss=True)

with model.hooks(fwd_hooks=get_ablate_neurons_hook([1789], layer)):
    ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()

print(original_loss, original_ablated_loss, ablated_loss)
# %%

# 1011 increases loss on both "gen" and "ge"
# Either it boosts both completions (trigram table)
# Or it combines "orsch" and "lä" into a single representation that later components use

# Check if trigram table by looking at the direct effect
# Total effect of L1N1011
with model.hooks(fwd_hooks=get_ablate_neurons_hook([1406], layer)):
    _, ablated_cache = model.run_with_cache(prompts)

def ablate_component_hook(value, hook):
    value = ablated_cache[hook.name]
    return value

components = [f"blocks.{layer}.mlp.hook_post" for layer in range(3, 6)] + [f"blocks.{layer}.attn.hook_z" for layer in range(3, 6)]
hooks = [(component, ablate_component_hook) for component in components]

with model.hooks(fwd_hooks=hooks):
    ablated_logits = model(prompts, return_type="logits", loss_per_token=True)[:, -2].log_softmax(-1).mean(0)

original_logits = model(prompts, return_type="logits", loss_per_token=True)[:, -2].log_softmax(-1).mean(0)

print(ablated_logits.shape, original_logits.shape)

prob_diff = original_logits - ablated_logits
prob_diff[all_ignore] = 0
prob_diff[original_logits < -7] = 0
diffs, tokens = torch.topk(prob_diff, 20)
print(diffs)
print(tokens)
print(model.to_str_tokens(tokens))

# %% 
# Direct effect
_, original_cache = model.run_with_cache(prompts)

def activate_component_hook(value, hook):
    value = original_cache[hook.name]
    return value

activate_hooks = [(component, activate_component_hook) for component in components]

with model.hooks(fwd_hooks=activate_hooks + get_ablate_neurons_hook([1406], layer)):
    activated_logits = model(prompts, return_type="logits", loss_per_token=True)[:, -2].log_softmax(-1).mean(0)

prob_diff = original_logits - activated_logits
prob_diff[all_ignore] = 0
prob_diff[original_logits < -7] = 0
diffs, tokens = torch.topk(prob_diff, 20)
print(diffs)
print(tokens)
print(model.to_str_tokens(tokens))

# Check later components + context neuron effects of 1011
# %%

#output_direction = model.W_out[1, 1011]
output_direction = model.W_out[2, 1406]
context_direction = model.W_out[3, 669]

output_sims = get_cosine_sim(output_direction, 5)
context_sims = get_cosine_sim(context_direction, 5)

plot_histogram(output_sims, context_sims, torch.zeros_like(output_sims), "Output", "Context", "Zero")
# %%
union = haystack_utils.union_where([output_sims, context_sims], 0.05)
len(union)
# %%
ngram = "orschlägen"
prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 1000, length=20)

original_loss, original_ablated_loss = compute_mlp_loss(prompts, df, torch.LongTensor([i for i in range(model.cfg.d_mlp)]).cuda(), compute_original_loss=True)

with model.hooks(fwd_hooks=get_ablate_neurons_hook(union, 5)): #712, 394, 287
    ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()

print(original_loss, original_ablated_loss, ablated_loss)



In [None]:
# %%
ngram = " meine Vorschläge"
prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 1000, length=20)
original_loss, original_ablated_loss = compute_mlp_loss(prompts, df, torch.LongTensor([i for i in range(model.cfg.d_mlp)]).cuda(), compute_original_loss=True)
print(original_loss, original_ablated_loss)
# %%
model.to_str_tokens(model.to_tokens(" deinen Vorschläge", prepend_bos=False))
# %%

0.3360338807106018 4.822600841522217


[' de', 'inen', ' V', 'orsch', 'lä', 'ge']

In [58]:
original_losses = []
ablated_losses = []
ngrams = [" Vorschlägen", " Vorschläge", " seine Vorschläge", " seinen Vorschläge", " seinen Vorschlägen", " seine Vorschlägen"]

for ngram in ngrams:
    prompts = haystack_utils.generate_random_prompts(ngram, model, common_tokens, 1000, length=20)

    with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
        ablated_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
        ablated_losses.append(ablated_loss)
    
    original_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    original_losses.append(original_loss)



In [59]:
df = pd.DataFrame({"ngram": ngrams, "original_loss": original_losses, "ablated_loss": ablated_losses})

In [60]:
print(df)

                 ngram  original_loss  ablated_loss
0          Vorschlägen       1.233227      3.584343
1           Vorschläge       1.227341      0.848729
2     seine Vorschläge       0.372710      0.360383
3    seinen Vorschläge       3.786774      3.005842
4   seinen Vorschlägen       0.244110      2.554850
5    seine Vorschlägen       2.594725      4.802067


In [80]:
# Create traces for 'original loss' and 'ablated loss'
trace1 = go.Bar(
    x=df['ngram'],
    y=df['original_loss'],
    name='Original',
    marker_color='blue'
)

trace2 = go.Bar(
    x=df['ngram'],
    y=df['ablated_loss'],
    name='Ablated',
    marker_color='green'
)

correct_ngrams = [" Vorschlägen", " Vorschläge", " seine Vorschläge", "", " seinen Vorschlägen", ""]
incorrect_ngrams = ["", "", "", " seinen Vorschläge", "", " seine Vorschlägen"]


annotations = []
for i, ngram in enumerate(incorrect_ngrams):
    if ngram:  # if ngram is not empty
        annotations.append(dict(
            x=i,  # position of the label
            y=-0.052,  # adjust this value to move the label up or down
            xref="x",
            yref="paper",
            text=ngram,  # the label
            font=dict(
                color="red",
                size=12
            ),
            showarrow=False
        ))

# Create the layout
layout = go.Layout(
    title='Original Loss vs Ablated Loss',
    barmode='group',  # this is what makes the bars grouped instead of stacked
    xaxis=dict(tickmode='array', tickvals=df.index, ticktext=correct_ngrams),  # hide original x-axis labels
    annotations=annotations,  # add annotations for the red labels
    width=950
)

# Create the Figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

fig.show()

## DLA diffs

Logit difference plan
- Option 1
    - Run model for "NNN"
    - Run model for all other modes
    - Patch "NNN" activation for one neuron at a time
    - Record difference in "gen" logit
- option 2
    - Run model for each mode, do DLA with "gen"
    - Subtract DLA of "NNN" from all other DLA 

In [148]:
# Look for neurons that deboost "gen" if "orsch" is not present
option = "orschlägen"
df = pd.read_pickle(f"data/and_neurons/df_{option}.pkl") 
prompts = haystack_utils.generate_random_prompts("orschlägen", model, common_tokens, 100, length=20)
df.head()

Prev/Curr/Context,NNN,NNY,NYN,NYY,YNN,YNY,YYN,YYY,PrevTokenSim,CurrTokenSim,ContextSim,PosSim,NegSim,AblationDiff,And,NegAnd,Boosted,Deboosted,FullAblationLossIncrease,ContextAblationLossIncrease
Neuron,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-1.452348,-1.865411,-1.206139,-1.716999,-1.967061,-2.380259,-1.32276,-1.855663,-0.092576,0.017535,-0.060797,False,True,-0.532903,False,False,False,False,-0.004094,-0.00582
1,-1.041662,-1.18613,-1.242793,-1.350295,-1.351736,-1.42218,-1.183853,-1.149874,-0.031552,-0.006965,-0.008395,False,True,0.033978,False,False,False,False,-0.001441,-0.000631
2,-2.127363,-2.293197,-0.955758,-1.091026,-1.940194,-2.136701,-0.77345,-0.983497,0.034648,0.081715,-0.074554,False,False,-0.210047,False,False,False,False,0.002052,-0.000335
3,-1.090268,-1.259026,-2.459712,-2.098881,-1.240608,-1.239972,-2.479871,-2.146816,0.015636,-0.060014,-0.009209,False,True,0.333055,False,False,False,False,0.020078,-0.003826
4,0.533762,0.91939,-0.291044,0.052384,0.248903,0.697211,-0.913438,-0.483871,-0.039145,-0.049453,0.03519,False,False,0.429567,False,False,False,False,-0.063059,0.003769


In [149]:
def create_ablation_prompts(prompts, ablation_mode):
    if ablation_mode[0] == "N":
        prompts = haystack_utils.replace_column(prompts, -3, common_tokens)
    if ablation_mode[1] == "N":
        prompts = haystack_utils.replace_column(prompts, -2, common_tokens)
    return prompts

In [159]:
# Toggle experiment
prompts = haystack_utils.generate_random_prompts("orschlägen", model, common_tokens, 1000, length=20)
ablated_prompts = create_ablation_prompts(prompts, "NYY")

yyy_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
nyy_loss = model(ablated_prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
    yyn_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    nyn_loss = model(ablated_prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()

# Loss increase of "orsch" without context: YYN vs NYN
yyn_nyn_diff = nyn_loss - yyn_loss

# Loss increase of context without "orsch": NYY vs NYN
nyy_nyn_diff = nyn_loss - nyy_loss

# Loss increase of both: YYY vs NYN
yyy_nyn_diff =  nyn_loss - yyy_loss

# If linear (YYY-NYN) = (YYN-NYN) + (NYY-NYN)
print(yyy_nyn_diff, yyn_nyn_diff, nyy_nyn_diff)
print(yyy_nyn_diff, yyn_nyn_diff + nyy_nyn_diff)

2.2256717681884766 0.37436604499816895 -0.30035924911499023
2.2256717681884766 0.07400679588317871


In [161]:
# Toggle experiment
prompts = haystack_utils.generate_random_prompts("orschlägen", model, common_tokens, 1000, length=20)
ablated_prompts_1 = create_ablation_prompts(prompts, "NYY")
ablated_prompts_2 = create_ablation_prompts(prompts, "YNY")
ablated_prompts_3 = create_ablation_prompts(prompts, "NNY")

yyy_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
nyy_loss = model(ablated_prompts_1, return_type="loss", loss_per_token=True)[:, -1].mean().item()
yny_loss = model(ablated_prompts_2, return_type="loss", loss_per_token=True)[:, -1].mean().item()
nny_loss = model(ablated_prompts_3, return_type="loss", loss_per_token=True)[:, -1].mean().item()
with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
    yyn_loss = model(prompts, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    nyn_loss = model(ablated_prompts_1, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    ynn_loss = model(ablated_prompts_2, return_type="loss", loss_per_token=True)[:, -1].mean().item()
    nnn_loss = model(ablated_prompts_3, return_type="loss", loss_per_token=True)[:, -1].mean().item()

# Loss increases for individual features
nny_diff = nnn_loss - nny_loss
nyn_diff = nnn_loss - nyn_loss
ynn_diff = nnn_loss - ynn_loss

# Loss increase for two features
nyy_diff = nnn_loss - nyy_loss
yny_diff = nnn_loss - yny_loss
yyn_diff = nnn_loss - yyn_loss

# Loss increase for all features
yyy_diff = nnn_loss - yyy_loss

# Check if sum of individual features is lower than overall loss increase
print(yyy_diff, nny_diff + nyn_diff + ynn_diff)

# Check if loss increase of two features is lower than overall loss increase
print(yyy_diff, (yyn_diff + nyy_diff + yny_diff)/2)

8.766587495803833 6.3177900314331055
8.766587495803833 6.541551232337952


In [151]:
dla_dict = {}
prompts = haystack_utils.generate_random_prompts("orschlägen", model, common_tokens, 100, length=20)


for ablation_mode in ["NNN", "NNY", "NYN", "YNN", "NYY", "YNY", "YYN", "YYY"]:
    ablated_prompts = create_ablation_prompts(prompts, ablation_mode)
    unembed_dir = model.tokens_to_residual_directions("gen")
    
    if ablation_mode[2] == "N":
        with model.hooks(fwd_hooks=deactivate_neurons_fwd_hooks):
            _, cache = model.run_with_cache(ablated_prompts)
    else:
        _, cache = model.run_with_cache(ablated_prompts)
    
    resid_stack, labels = cache.get_full_resid_decomposition(apply_ln=True, pos_slice=-1, return_labels=True)
    crop = -2050, -2
    assert all([label.startswith("L5N") for label in labels[crop[0]:crop[1]]])
    resid_stack = resid_stack.mean(1)[crop[0]:crop[1]]
    dla = resid_stack @ unembed_dir
    dla_dict[ablation_mode] = dla.cpu().numpy()

df = pd.DataFrame(dla_dict)
df["Neuron"] = [i for i in range(2048)]
df.head()

Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now
Tried to stack head results when they weren't cached. Computing head results now


Unnamed: 0,NNN,NNY,NYN,YNN,NYY,YNY,YYN,YYY,Neuron
0,0.000504,0.000281,0.000275,0.000475,0.000119,0.000235,0.000229,0.000102,0
1,0.005356,0.005896,0.003891,0.006661,0.00321,0.005881,0.004088,0.003488,1
2,-0.00664,-0.006441,-0.001342,-0.00832,-0.000725,-0.006311,-0.000775,-0.000395,2
3,-0.024404,-0.02562,-0.01822,-0.027805,-0.013278,-0.026851,-0.020374,-0.014818,3
4,0.025258,0.045949,0.010819,0.023901,0.026171,0.041752,0.013476,0.028173,4


In [173]:
# DF containing neuron wise DLA for "gen" token with different features (PrevToken, CurrToken, Context) present
df["and_1"] = (df["NNN"] - df["YYY"]) > ((df["NNN"] - df["YNY"]) + (df["NNN"] - df["NYY"]) + (df["NNN"] - df["YYN"]))/2
df["and_2"] = (df["NNN"] - df["YYY"]) > ((df["NNN"] - df["YNN"]) + (df["NNN"] - df["NYN"]) + (df["NNN"] - df["NNY"]))
df["diff_1"] = (df["NNN"] - df["YYY"]) - ((df["NNN"] - df["YNY"]) + (df["NNN"] - df["NYY"]) + (df["NNN"] - df["YYN"]))/2
df["diff_2"] = (df["NNN"] - df["YYY"]) - ((df["NNN"] - df["YNN"]) + (df["NNN"] - df["NYN"]) + (df["NNN"] - df["NNY"]))

px.histogram(df, x="diff_1", histnorm='percent', width=800)

In [177]:
threshold = 0.008
prompts = haystack_utils.generate_random_prompts("orschlägen", model, common_tokens, 1000, length=20)
neurons = torch.LongTensor(df[df["diff_1"]>0.05]["Neuron"].tolist())
print(len(neurons))
original_loss, ablated_loss = compute_mlp_loss(prompts, df, neurons, ablate_mode="YYN", mean=True, compute_original_loss=True)

print(original_loss, ablated_loss)


12
1.4691624641418457 4.41193962097168


## Suppression neurons

In [None]:
# Look for neurons that deboost "gen" if "orsch" is not present
option = "orschlägen"
df = pd.read_pickle(f"data/and_neurons/df_{option}.pkl") 
df.head()

Prev/Curr/Context,NNN,NNY,NYN,NYY,YNN,YNY,YYN,YYY,PrevTokenSim,CurrTokenSim,ContextSim,PosSim,NegSim,AblationDiff,And,NegAnd,Boosted,Deboosted,FullAblationLossIncrease,ContextAblationLossIncrease
Neuron,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-1.452348,-1.865411,-1.206139,-1.716999,-1.967061,-2.380259,-1.32276,-1.855663,-0.092576,0.017535,-0.060797,False,True,-0.532903,False,False,False,False,-0.004094,-0.00582
1,-1.041662,-1.18613,-1.242793,-1.350295,-1.351736,-1.42218,-1.183853,-1.149874,-0.031552,-0.006965,-0.008395,False,True,0.033978,False,False,False,False,-0.001441,-0.000631
2,-2.127363,-2.293197,-0.955758,-1.091026,-1.940194,-2.136701,-0.77345,-0.983497,0.034648,0.081715,-0.074554,False,False,-0.210047,False,False,False,False,0.002052,-0.000335
3,-1.090268,-1.259026,-2.459712,-2.098881,-1.240608,-1.239972,-2.479871,-2.146816,0.015636,-0.060014,-0.009209,False,True,0.333055,False,False,False,False,0.020078,-0.003826
4,0.533762,0.91939,-0.291044,0.052384,0.248903,0.697211,-0.913438,-0.483871,-0.039145,-0.049453,0.03519,False,False,0.429567,False,False,False,False,-0.063059,0.003769


In [105]:
prompts = haystack_utils.generate_random_prompts("lä", model, common_tokens, 100, length=20)
logits, cache = model.run_with_cache(prompts)
unembed_dir = model.tokens_to_residual_directions("gen")
resid_stack, labels = cache.get_full_resid_decomposition(apply_ln=True, pos_slice=-1, return_labels=True)
crop = -2050, -2

resid_stack = resid_stack.mean(1)[crop[0]:crop[1]]
dla = resid_stack @ unembed_dir
dla_df = pd.DataFrame({"Neuron": labels[crop[0]:crop[1]], "NOT A DLA": dla.tolist()})

Tried to stack head results when they weren't cached. Computing head results now


In [None]:
prompts = haystack_utils.generate_random_prompts("orschlä", model, common_tokens, 100, length=20)
logits, cache = model.run_with_cache(prompts)
unembed_dir = model.tokens_to_residual_directions("gen")
resid_stack, labels = cache.get_full_resid_decomposition(apply_ln=True, pos_slice=-1, return_labels=True)

resid_stack = resid_stack.mean(1)[crop[0]:crop[1]]
dla = resid_stack @ unembed_dir
dla_df["A DLA"] = dla.tolist()

Tried to stack head results when they weren't cached. Computing head results now


In [None]:
dla_df["TurnedOff"] = (dla_df["NOT A DLA"]<-0.001) & (df["NYY"]>0) & (dla_df["A DLA"]>0) & (df["YYY"]<df["NYY"])
neurons = dla_df[dla_df["TurnedOff"]]["Neuron"].tolist()
neurons = torch.LongTensor([int(neuron.split("N")[1]) for neuron in neurons])
print(len(neurons), neurons)

39 tensor([  67,   73,   95,  111,  169,  198,  231,  286,  420,  445,  522,  570,
         594,  681,  705,  749,  770,  827,  997, 1064, 1099, 1123, 1205, 1216,
        1304, 1360, 1386, 1586, 1738, 1744, 1786, 1864, 1867, 1901, 1905, 1909,
        1949, 1964, 1969])


In [None]:
def compute_counterfactual_losses(prompts, df, neurons):
    losses = [model(prompts, return_type="loss", loss_per_token=True)[:, -1].tolist()]
    ablations = ["NYY", "YNY", "YYN"]
    names = ["Original"] + ablations
    for ablation_mode in ablations:
        ablated_loss = compute_mlp_loss(prompts, df, neurons, ablate_mode=ablation_mode, mean=False)
        losses.append(ablated_loss)
    return losses, names

prompts = haystack_utils.generate_random_prompts("orschlägen", model, common_tokens, 1000, length=20)
losses, names = compute_counterfactual_losses(prompts, df, neurons)#torch.LongTensor([i for i in range(100)]))
plotting_utils.plot_barplot(losses, names, title="Losses for patching 39 NOT A neurons with different ablation modes")