In [1]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer
from jaxtyping import Float, Int, Bool
from torch import Tensor
from tqdm.auto import tqdm
import plotly.io as pio
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import numpy as np
import plotly.express as px 
from collections import defaultdict
import matplotlib.pyplot as plt
import re
from IPython.display import display, HTML
from datasets import load_dataset
from collections import Counter
import pickle
import os
import haystack_utils
from transformer_lens import utils
from fancy_einsum import einsum
import einops
import json
import ipywidgets as widgets
from IPython.display import display
from datasets import load_dataset
import random
import math
import random
import neel.utils as nutils
from neel_plotly import *
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import probing_utils
import pickle
from sklearn.metrics import matthews_corrcoef

# Seed every RNG used in this notebook (torch, numpy, stdlib random).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Plotly renderer used when figures are displayed in the notebook.
pio.renderers.default = "notebook_connected+notebook"
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Inference-only notebook: gradient tracking is switched off globally.
# NOTE(review): the two calls below look redundant — confirm and keep only one.
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

%reload_ext autoreload
%autoreload 2

In [24]:
def get_model(checkpoint: int):
    """Load one training checkpoint of Pythia-70M as a ``HookedTransformer``.

    Args:
        checkpoint: Value forwarded as ``checkpoint_index`` to
            ``HookedTransformer.from_pretrained``.

    Returns:
        The loaded model, created with the notebook-level ``device``.
    """
    load_options = dict(
        checkpoint_index=checkpoint,
        center_unembed=True,
        center_writing_weights=True,
        fold_ln=True,
        device=device,
    )
    return HookedTransformer.from_pretrained("EleutherAI/pythia-70m", **load_options)

# Number of checkpoint indices iterated over in this notebook (0 .. 142).
NUM_CHECKPOINTS = 143
# MLP layer / neuron index studied throughout the notebook.
LAYER, NEURON = 3, 669
# Checkpoint index 142 is NUM_CHECKPOINTS - 1, i.e. the last index used here.
model = get_model(142)
# Only the first 200 examples of each Europarl file are kept.
german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
english_data = haystack_utils.load_json_data("data/english_europarl.json")[:200]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


In [None]:
# Will take about 50GB of disk space for Pythia 70M models
def preload_models(NUM_CHECKPOINTS: int):
    """Call ``get_model`` once for every checkpoint index below ``NUM_CHECKPOINTS``.

    The returned models are discarded; presumably the purpose is to warm the
    local download cache so later cells load quickly — TODO confirm.

    Args:
        NUM_CHECKPOINTS: Exclusive upper bound of the checkpoint indices to load.
    """
    checkpoint_indices = range(NUM_CHECKPOINTS)
    for checkpoint_index in tqdm(checkpoint_indices):
        get_model(checkpoint_index)


preload_models(NUM_CHECKPOINTS)

In [43]:
def eval_loss(model, data):
    """Return the mean per-prompt loss of ``model`` over ``data``.

    Args:
        model: Callable invoked as ``model(prompt, return_type="loss")``; the
            result must expose ``.item()``.
        data: Iterable of prompts, each evaluated independently.

    Returns:
        ``np.mean`` of the per-prompt scalar losses.
    """
    per_prompt = [model(text, return_type="loss").item() for text in data]
    return np.mean(per_prompt)

def get_probe_performance(model, german_data, english_data, layer, neuron, plot=False):
    """Score how well a single neuron's activation separates German from English.

    Args:
        model: Model passed through to ``haystack_utils.get_mlp_activations``.
        german_data: German prompts.
        english_data: English prompts.
        layer: MLP layer index to read activations from.
        neuron: Neuron index within that layer.
        plot: When true, also draw a German-vs-English activation histogram.

    Returns:
        The ``(f1, mcc)`` pair produced by ``train_probe``.
    """
    max_rows = 50000  # cap on the number of activation rows kept per language
    per_language = []
    for prompts in (german_data, english_data):
        acts = haystack_utils.get_mlp_activations(prompts, layer, model, neurons=[neuron], mean=False)
        per_language.append(acts[:max_rows])
    german_activations, english_activations = per_language

    if plot:
        haystack_utils.two_histogram(german_activations.flatten(), english_activations.flatten(), "German", "English")
    return train_probe(german_activations, english_activations)

def train_probe(german_activations, english_activations):
    """Fit and score a probe that separates German from English activations.

    German rows are labelled 1 and English rows 0. The data is split 80/20
    with a fixed random state, standardised, and handed to
    ``probing_utils.get_probe`` / ``probing_utils.get_probe_score``.

    Args:
        german_activations: Tensor of activations for German text; the first
            dimension indexes samples.
        english_activations: Tensor of activations for English text, same layout.

    Returns:
        ``(f1, mcc)`` of the probe on the held-out 20% split.
    """
    labels = np.concatenate([np.ones(len(german_activations)), np.zeros(len(english_activations))])
    activations = np.concatenate([german_activations.cpu().numpy(), english_activations.cpu().numpy()])
    x_train, x_test, y_train, y_test = train_test_split(activations, labels, test_size=0.2, random_state=SEED)
    # Fit the scaler on the training split only. Fitting it on all rows before
    # splitting (as was done previously) leaks test-set statistics into the
    # preprocessing step.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    probe = probing_utils.get_probe(x_train, y_train, max_iter=2000)
    f1, mcc = probing_utils.get_probe_score(probe, x_test, y_test)
    return f1, mcc

def eval_checkpoint(checkpoint: int):
    """Evaluate one checkpoint: German loss plus probe scores for (LAYER, NEURON).

    Args:
        checkpoint: Checkpoint index passed to ``get_model``.

    Returns:
        ``[checkpoint, german_loss, f1, mcc]``.
    """
    checkpoint_model = get_model(checkpoint)
    german_loss = eval_loss(checkpoint_model, german_data)
    probe_f1, probe_mcc = get_probe_performance(
        checkpoint_model, german_data, english_data, LAYER, NEURON
    )
    return [checkpoint, german_loss, probe_f1, probe_mcc]


In [14]:
# Reload the last checkpoint and collect per-neuron MLP activations on English text.
model = get_model(NUM_CHECKPOINTS-1)
english_activations = {}
# NOTE(review): range(3, 4) visits only layer 3, which matches LAYER today;
# the lookup below would raise KeyError if LAYER were changed without this range.
for layer in range(3, 4):
    english_activations[layer] = haystack_utils.get_mlp_activations(english_data, layer, model, mean=False)

# Mean activation of the studied neuron on English text; used below as the
# replacement value when the neuron is ablated.
MEAN_ACTIVATION_INACTIVE = english_activations[LAYER][:, NEURON].mean()

def deactivate_neurons_hook(value, hook):
    """Forward hook that pins neuron ``NEURON`` to ``MEAN_ACTIVATION_INACTIVE``.

    ``value`` is modified in place along its third axis and then returned;
    ``hook`` is accepted but not used.
    """
    value[:, :, NEURON] = MEAN_ACTIVATION_INACTIVE
    return value
# (hook point name, hook function) pair targeting the post-activation MLP output of LAYER.
deactivate_neurons_fwd_hooks=[(f'blocks.{LAYER}.mlp.hook_post', deactivate_neurons_hook)]

# Show the replacement value that the ablation hook writes.
print(MEAN_ACTIVATION_INACTIVE)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


  0%|          | 0/200 [00:00<?, ?it/s]

tensor(-0.0829, device='cuda:0')


In [166]:
from datasets import load_dataset

# Load the "train" split of the NeelNanda/pile-10k dataset.
dataset = load_dataset("NeelNanda/pile-10k", split="train")

In [170]:
# Tally how many documents come from each Pile subset.
names = [record['meta']['pile_set_name'] for record in dataset]

print(Counter(names))

Counter({'Pile-CC': 2524, 'OpenWebText2': 1520, 'PubMed Abstracts': 1423, 'StackExchange': 1399, 'Github': 855, 'Wikipedia (en)': 779, 'USPTO Backgrounds': 514, 'PubMed Central': 259, 'FreeLaw': 241, 'NIH ExPorter': 104, 'DM Mathematics': 99, 'ArXiv': 91, 'HackerNews': 81, 'Enron Emails': 47, 'OpenSubtitles': 27, 'YoutubeSubtitles': 11, 'Books3': 9, 'EuroParl': 6, 'PhilPapers': 5, 'BookCorpus2': 2, 'Ubuntu IRC': 2, 'Gutenberg (PG-19)': 2})


In [None]:
# Probe performance for each neuron
def get_layer_probe_performance(model, checkpoint, layer):
    """Train one probe per neuron of ``layer`` and tabulate the scores.

    Activations come from the first 31 English and first 30 German prompts of
    the notebook-level datasets, each truncated to 10000 rows.

    Args:
        model: Model whose MLP activations are read; ``model.cfg.d_mlp`` sets
            the number of neurons.
        checkpoint: Checkpoint index, used only for labelling rows.
        layer: MLP layer index.

    Returns:
        DataFrame with one row per neuron and the columns Label, Neuron, F1,
        MCC, MeanGermanActivation, MeanEnglishActivation, Checkpoint, Layer.
    """
    english_activations = haystack_utils.get_mlp_activations(english_data[:31], layer, model, mean=False, disable_tqdm=True)[:10000]
    german_activations = haystack_utils.get_mlp_activations(german_data[:30], layer, model, mean=False, disable_tqdm=True)[:10000]

    n_neurons = model.cfg.d_mlp
    # One (f1, mcc) pair per neuron; each probe sees a single-feature column.
    scores = [
        train_probe(german_activations[:, idx].unsqueeze(-1), english_activations[:, idx].unsqueeze(-1))
        for idx in range(n_neurons)
    ]

    result = pd.DataFrame({
        "Label": [f'C{checkpoint}L{layer}N{idx}' for idx in range(n_neurons)],
        "Neuron": list(range(n_neurons)),
        "F1": [f1 for f1, _ in scores],
        "MCC": [mcc for _, mcc in scores],
        "MeanGermanActivation": german_activations.mean(0).cpu().numpy(),
        "MeanEnglishActivation": english_activations.mean(0).cpu().numpy(),
    })
    result["Checkpoint"] = checkpoint
    result["Layer"] = layer
    return result

# Collect one per-neuron probe table for every (checkpoint, layer) pair.
dfs = []
# Every checkpoint 0..39, then every tenth checkpoint from 40 to 140.
checkpoints = list(range(40)) + [40,50,60,70,80,90,100, 110, 120, 130, 140]
# NOTE(review): the progress-bar total reads n_layers from whatever `model`
# is already bound by an earlier cell — this cell fails on a fresh kernel otherwise.
with tqdm(total=len(checkpoints)*model.cfg.n_layers) as pbar:
    for checkpoint in checkpoints:
        model = get_model(checkpoint)
        for layer in range(model.cfg.n_layers):
            tmp_df = get_layer_probe_performance(model, checkpoint, layer)
            dfs.append(tmp_df)
            # The full list is re-pickled after every layer, so an interrupted
            # run keeps everything computed so far.
            with open("data/layer_probe_performance.pkl", "wb") as f:
                pickle.dump(dfs, f)
            pbar.update(1)

In [115]:
# Stack the per-(checkpoint, layer) tables into one frame; the original
# per-table row index is kept (no ignore_index).
df = pd.concat(dfs)
# Checkpoint-independent neuron id such as "L3N669". Built with vectorised
# string concatenation instead of a row-wise apply, which is far slower on a
# frame with one row per neuron per layer per checkpoint.
df["NeuronLabel"] = "L" + df["Layer"].astype(str) + "N" + df["Neuron"].astype(str)

Unnamed: 0,Label,Neuron,F1,MCC,MeanGermanActivation,MeanEnglishActivation,Checkpoint,Layer
0,C0L0N0,0,0.414037,0.009989,0.132858,0.117383,0,0
1,C0L0N1,1,0.437648,0.059326,0.158821,0.107143,0,0
2,C0L0N2,2,0.505419,-0.004196,0.103538,0.108237,0,0
3,C0L0N3,3,0.57218,0.021733,0.09945,0.143337,0,0
4,C0L0N4,4,0.582425,0.060994,0.152917,0.224428,0,0


In [201]:
import gzip

# Open the pickle file
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# files produced by this notebook.
with open('./data/layer_probe_performance.pkl', 'rb') as f:
    data = pickle.load(f)

# Compress with gzip using high compression and save
# (compresslevel=9 is the maximum gzip level.)
with gzip.open('./data/layer_probe_performance.pkl.gz', 'wb', compresslevel=9) as f_out:
    pickle.dump(data, f_out)

In [117]:
# Best single-neuron probe MCC at every checkpoint. A single groupby replaces
# re-filtering the whole frame once per checkpoint; sort=False keeps the
# checkpoints in order of first appearance, as Series.unique() did.
top_mcc_by_checkpoint = df.groupby("Checkpoint", sort=False)["MCC"].max()
checkpoints = top_mcc_by_checkpoint.index.tolist()
top_probe = top_mcc_by_checkpoint.tolist()
px.line(x=checkpoints, y=top_probe, title="Top Probe MCC by Checkpoint")

In [178]:
# Rows whose probe MCC exceeds 0.6 and whose mean activation is higher on
# German than on English (one row per neuron per checkpoint).
neurons = df[(df["MCC"] > 0.6) & (df["MeanGermanActivation"]>df["MeanEnglishActivation"])][["NeuronLabel", "MCC"]].copy()
# Sort so that unique() below lists neurons by their best-scoring row first.
neurons = neurons.sort_values(by="MCC", ascending=False)
print(len(neurons["NeuronLabel"].unique()))
# Keep the first 50 distinct neuron labels in that order.
good_neurons = neurons["NeuronLabel"].unique()[:50]

654


In [176]:
# Inspect the first 50 (checkpoint, neuron) rows with probe MCC above 0.7.
tmp=df[df["MCC"]>0.7]
tmp.head(50)

Unnamed: 0,Label,Neuron,F1,MCC,MeanGermanActivation,MeanEnglishActivation,Checkpoint,Layer,NeuronLabel
209,C8L4N209,209,0.866188,0.729471,1.465526,0.244836,8,4,L4N209
52,C8L5N52,52,0.849042,0.705981,0.675297,0.073616,8,5,L5N52
217,C8L5N217,217,0.857549,0.702101,-0.077062,0.316575,8,5,L5N217
287,C8L5N287,287,0.871253,0.738031,1.97909,0.531707,8,5,L5N287
364,C8L5N364,364,0.850727,0.708894,0.661416,0.04358,8,5,L5N364
501,C8L5N501,501,0.878474,0.745774,-0.079239,0.559716,8,5,L5N501
599,C8L5N599,599,0.883466,0.76605,0.898762,0.067438,8,5,L5N599
853,C8L5N853,853,0.887195,0.779101,0.543051,-0.040886,8,5,L5N853
884,C8L5N884,884,0.859627,0.717507,1.345907,0.38021,8,5,L5N884
953,C8L5N953,953,0.855547,0.702936,0.168829,0.954855,8,5,L5N953


In [None]:
# Ablation loss for group of top neurons
def get_ablation_hook(neurons, layer, activations):
    """Build a forward-hook spec that overwrites selected neurons of one MLP layer.

    Args:
        neurons: Indices along the last axis of the hooked tensor to overwrite.
        layer: Layer index used to form the hook point name.
        activations: Replacement values assigned to ``value[:, :, neurons]``.

    Returns:
        A one-element list ``[(hook_name, hook_fn)]``.
    """
    hook_name = f'blocks.{layer}.mlp.hook_post'

    def ablate_neurons_hook(value, hook):
        # In-place overwrite; the same tensor object is handed back.
        value[:, :, neurons] = activations
        return value

    hook_spec = (hook_name, ablate_neurons_hook)
    return [hook_spec]

def get_neuron_loss(checkpoint, neurons: list[str]):
    """Compare German loss with and without a group of neurons ablated.

    Each neuron is ablated by overwriting its activation with the
    ``MeanEnglishActivation`` recorded for it at this checkpoint in the
    notebook-level ``df``.

    Args:
        checkpoint: Checkpoint index to load and to look up in ``df``.
        neurons: Neuron labels of the form ``"L<layer>N<neuron>"``.

    Returns:
        ``(original_loss, ablated_loss)`` on ``german_data``.

    Raises:
        ValueError: If ``df`` does not hold exactly one row for a requested
            ``C<checkpoint>L<layer>N<neuron>`` label (raised by ``Series.item``).
    """
    model = get_model(checkpoint)

    # Group the requested neuron indices by layer.
    ablation_neurons = {l: [] for l in range(model.cfg.n_layers)}
    for neuron_name in neurons:
        layer, neuron = neuron_name[1:].split("N")
        ablation_neurons[int(layer)].append(int(neuron))

    hooks = []
    for layer, layer_neurons in ablation_neurons.items():
        if not layer_neurons:
            # Nothing to ablate in this layer; do not register a no-op hook.
            continue
        activations = [
            df[df["Label"] == f"C{checkpoint}L{layer}N{neuron}"]["MeanEnglishActivation"].item()
            for neuron in layer_neurons
        ]
        # Use the notebook-level `device` rather than a hard-coded .cuda(),
        # so the function also runs on CPU-only machines.
        activations = torch.tensor(activations, device=device)
        hooks.extend(get_ablation_hook(layer_neurons, layer, activations))

    original_loss = eval_loss(model, german_data)
    with model.hooks(hooks):
        ablated_loss = eval_loss(model, german_data)
    return original_loss, ablated_loss

# Loss increase from ablating all of good_neurons together, at every tenth checkpoint.
all_neuron_diffs = []
for checkpoint in list(range(0, NUM_CHECKPOINTS, 10)):
    original_loss, ablated_loss = get_neuron_loss(checkpoint, good_neurons)
    # Positive diff means the ablation made the German loss worse.
    diff = ablated_loss - original_loss
    print(f"Checkpoint {checkpoint}: {original_loss} -> {ablated_loss}")
    all_neuron_diffs.append(diff)

In [143]:
def get_mean_english(df, neuron, layer, checkpoint):
    """Look up the recorded mean English activation of one neuron at one checkpoint.

    Args:
        df: Frame with ``Label`` and ``MeanEnglishActivation`` columns.
        neuron: Neuron index.
        layer: Layer index.
        checkpoint: Checkpoint index.

    Returns:
        The scalar ``MeanEnglishActivation`` of the row labelled
        ``C<checkpoint>L<layer>N<neuron>``; ``Series.item`` raises
        ``ValueError`` unless exactly one row matches.
    """
    wanted_label = f"C{checkpoint}L{layer}N{neuron}"
    matching_rows = df[df["Label"] == wanted_label]
    return matching_rows["MeanEnglishActivation"].item()

# Spot check: mean English activation of L3N669 at checkpoint 140.
get_mean_english(df, 669, 3, 140)

-0.08759497106075287

In [None]:
# Ablation loss for top neurons
# For every tenth checkpoint and every neuron in good_neurons, record the
# German loss before and after replacing that single neuron's activation with
# its mean English activation at that checkpoint.
ablation_data = []
checkpoints = list(range(0, NUM_CHECKPOINTS, 10))
print(checkpoints)
with tqdm(total=len(checkpoints)*len(good_neurons)) as pbar:
    for checkpoint in checkpoints:
        model = get_model(checkpoint)
        # The un-ablated loss depends only on the checkpoint, so compute it
        # once here instead of once per neuron.
        original_loss = eval_loss(model, german_data)
        for neuron_name in good_neurons:
            # Labels look like "L<layer>N<neuron>".
            layer, neuron = neuron_name[1:].split("N")
            layer, neuron = int(layer), int(neuron)
            english_activations = get_mean_english(df, neuron, layer, checkpoint)
            assert english_activations is not None
            # The closure reads `neuron` / `english_activations` of the current
            # iteration; it is only used inside this iteration's hook context.
            def tmp_hook(value, hook):
                value[:, :, neuron] = english_activations
                return value
            tmp_hooks=[(f'blocks.{layer}.mlp.hook_post', tmp_hook)]
            with model.hooks(tmp_hooks):
                ablated_loss = eval_loss(model, german_data)
            ablation_data.append([neuron_name, checkpoint, original_loss, ablated_loss])
            pbar.update(1)
    

In [145]:
# Tabulate the per-neuron ablation results and persist them.
ablation_df = pd.DataFrame(ablation_data, columns=["Label", "Checkpoint", "OriginalLoss", "AblatedLoss"])
# Positive values mean ablating the neuron increased the German loss.
ablation_df["AblationIncrease"] = ablation_df["AblatedLoss"] - ablation_df["OriginalLoss"]
ablation_df.to_csv("data/checkpoint_ablation_data.csv")

In [162]:
# Add the joint ablation of all selected neurons as one extra series labelled "Top 50".
all_neuron_df = pd.DataFrame({"Label": "Top 50", "Checkpoint": list(range(0, NUM_CHECKPOINTS, 10)), "AblationIncrease": all_neuron_diffs})
# NOTE(review): this rebinds ablation_df to itself plus new rows, so re-running
# the cell appends the "Top 50" rows again.
ablation_df = pd.concat([ablation_df, all_neuron_df])
ablation_df.head()

Unnamed: 0,Label,Checkpoint,OriginalLoss,AblatedLoss,AblationIncrease
0,L5N395,0,11.026308,11.026303,-5e-06
1,L3N669,0,11.026308,11.026191,-0.000117
2,L4N1276,0,11.026308,11.026374,6.6e-05
3,L5N1655,0,11.026308,11.026392,8.5e-05
4,L5N953,0,11.026308,11.026212,-9.6e-05


In [163]:
# One line per Label: loss increase caused by the ablation across checkpoints.
px.line(ablation_df, x="Checkpoint", y="AblationIncrease", color="Label", title="Ablation Increase by Checkpoint")

In [98]:
# Best MCC each neuron reaches at any checkpoint.
max_mcc = df.groupby("NeuronLabel")["MCC"].max()
# Number of neurons whose best MCC stays below 0.1.
print(len(max_mcc[max_mcc < 0.1].index))
# The low-MCC comparison set is currently disabled (empty list).
bad_neurons = []#max_mcc[max_mcc < 0.1].index[:10]
print(bad_neurons)

70
[]


626688

In [99]:
# MCC trajectory over checkpoints for every neuron in good_neurons or bad_neurons.
# NOTE(review): the title mentions "max MCC >= 0.85", but good_neurons is built
# elsewhere from an MCC > 0.6 filter (top 50) — confirm the title is still accurate.
px.line(df[df["NeuronLabel"].isin(good_neurons) | df["NeuronLabel"].isin(bad_neurons)], x="Checkpoint", y="MCC", color="NeuronLabel", title="Neurons with max MCC >= 0.85")

In [None]:
# Baselines
# "Current word starts with space"
# Memorize top German tokens
# Memorize top English tokens

In [172]:
# Counts of space
def get_space_counts(data):
    """Count tokens that do / do not start with a space across ``data``.

    Tokenisation uses the notebook-level ``model`` via ``to_str_tokens``.

    Args:
        data: Iterable of prompts.

    Returns:
        ``(space_counts, non_space_counts)`` summed over all prompts.
    """
    space_total = 0
    token_total = 0
    for prompt in data:
        str_tokens = model.to_str_tokens(prompt)
        space_total += sum(1 for tok in str_tokens if tok.startswith(" "))
        token_total += len(str_tokens)
    # Every token that is not space-prefixed counts as "non-space".
    return space_total, token_total - space_total

# Space-prefixed vs. other token counts for the first 200 prompts of each language.
german_space, german_non_space = get_space_counts(german_data[:200])
english_space, english_non_space = get_space_counts(english_data[:200])
print(german_space, german_non_space, english_space, english_non_space)

44124 61299 55054 16285


In [186]:
# Reload checkpoint 142 and gather per-token counts for both languages.
model = get_model(142)
# Tokens to exclude from counting; the second return value is not used here.
all_ignore, _ = haystack_utils.get_weird_tokens(model)
# k = d_vocab with return_unsorted_counts=True: presumably one count per
# vocabulary entry, indexed by token id — TODO confirm against haystack_utils.
german_counts = haystack_utils.get_common_tokens(german_data[:200],model, k=model.cfg.d_vocab, ignore_tokens=all_ignore, return_unsorted_counts=True)
english_counts = haystack_utils.get_common_tokens(english_data[:200],model, k=model.cfg.d_vocab, ignore_tokens=all_ignore, return_unsorted_counts=True)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [198]:
# Baseline: label each token occurrence "German" (1) when that token is more
# frequent in the German data than in the English data, otherwise "English" (0).
all_counts = german_counts + english_counts
# Ground truth: all German occurrences first, then all English occurrences.
labels = np.concatenate([np.ones(int(german_counts.sum().item())), np.zeros(int(english_counts.sum().item()))])
predictions = []
# Emit predictions in the same order as `labels`: German occurrences, then English.
for counts in (german_counts, english_counts):
    for i in range(len(all_counts)):
        fill = np.ones if german_counts[i] > english_counts[i] else np.zeros
        predictions.append(fill(int(counts[i].item())))
predictions = np.concatenate(predictions)
print(matthews_corrcoef(labels, predictions))

0.8962349174950575


In [173]:
# Fraction of tokens that start with a space, per language (German, English).
german_space / (german_space + german_non_space), english_space / (english_space + english_non_space)

(0.4185424432998492, 0.7717237415719312)

In [174]:
# MCC of a "starts with a space => English" baseline on 100 German (1) and
# 100 English (0) hypothetical tokens.
# NOTE(review): 42/58 and 77/23 look like the rounded space-prefix percentages
# computed elsewhere in the notebook — confirm they are still in sync.
labels = np.concatenate([np.ones(100), np.zeros(100)])
pred = np.concatenate([np.zeros(42), np.ones(58), np.zeros(77), np.ones(23)])
matthews_corrcoef(labels, pred)

0.35649385995541555

# Evaluate every checkpoint: German loss plus probe F1/MCC for (LAYER, NEURON).
# NOTE(review): `data` and `df` are rebound here and no longer refer to the
# objects of the same name used by earlier cells.
data = []
for checkpoint in tqdm(range(NUM_CHECKPOINTS)):
    data.append(eval_checkpoint(checkpoint))

df = pd.DataFrame(data, columns=["checkpoint", "german_loss", "f1", "mcc"])

In [None]:
# German loss at every checkpoint with the studied neuron pinned by the ablation hook.
ablation_losses = []
for checkpoint in tqdm(range(NUM_CHECKPOINTS)):
    model = get_model(checkpoint)
    # The hooks are active only inside this context manager.
    with model.hooks(deactivate_neurons_fwd_hooks):
        ablated_loss = eval_loss(model, german_data)
    ablation_losses.append(ablated_loss)

In [None]:
# Un-ablated English loss at every checkpoint.
english_losses = []
for checkpoint in tqdm(range(NUM_CHECKPOINTS)):
    model = get_model(checkpoint)
    english_loss = eval_loss(model, english_data)
    english_losses.append(english_loss)

In [None]:
# Attach the per-checkpoint loss lists as columns; both lists hold one entry
# per checkpoint in range(NUM_CHECKPOINTS), in order.
df["english_loss"] = english_losses
df["ablation_loss"] = ablation_losses

In [103]:
# Reload the cached per-checkpoint evaluation instead of recomputing it.
# The first CSV column is read as the index and then turned back into a column.
# NOTE(review): this rebinds `df`; cells that look up df["Label"] need the
# per-neuron frame instead — mind the execution order.
df = pd.read_csv("data/checkpoint_eval.csv", index_col=0).reset_index()
df.head(5)

Unnamed: 0,checkpoint,german_loss,f1,mcc,english_loss,ablation_loss
0,0,11.026308,0.476689,-0.020651,11.051213,11.025739
1,1,11.026308,0.43697,-0.016553,11.051213,11.025739
2,2,11.026103,0.387683,0.037004,11.05058,11.025534
3,3,11.022297,0.408647,0.01009,11.038868,11.021735
4,4,10.9723,0.450796,-0.029007,10.886706,10.971827


In [101]:
# Build prompts that end in a fixed German word; per the printed example the
# preceding tokens are drawn from common German tokens.
end_prompt = " Vorschlägen"
all_ignore, _ = haystack_utils.get_weird_tokens(model, plot_norms=False)
# Presumably the 100 most frequent German tokens, excluding all_ignore — TODO confirm.
common_tokens = haystack_utils.get_common_tokens(german_data, model, all_ignore, k=100)
# 500 prompts with length=20 (the printed example shows 20 tokens before the end word).
prompts = haystack_utils.generate_random_prompts(end_prompt, model, common_tokens, 500, length=20)
print(model.to_str_tokens(prompts[0]))


  0%|          | 0/200 [00:00<?, ?it/s]

['h', 'e', ' V', ' Z', 'igen', 'chte', 'n', ' den', ' eine', ' dem', 'ig', ' ist', ' An', ' ein', ' Herr', 'ge', 'igen', ' Ab', ' Ber', ' nicht', ' V', 'orsch', 'lä', 'gen']


In [113]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create a subplot with 2 y-axes
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces for german_loss and english_loss
# (the ablated German loss shares the same primary axis)
fig.add_trace(go.Scatter(x=df['checkpoint'], y=df['german_loss'], name='German Loss'), secondary_y=False)
fig.add_trace(go.Scatter(x=df['checkpoint'], y=df['english_loss'], name='English Loss'), secondary_y=False)
fig.add_trace(go.Scatter(x=df['checkpoint'], y=df['ablation_loss'], name='Ablated German Loss'), secondary_y=False)

# Add traces for f1 and mcc
fig.add_trace(go.Scatter(x=df['checkpoint'], y=df['f1'], name='F1'), secondary_y=True)
fig.add_trace(go.Scatter(x=df['checkpoint'], y=df['mcc'], name='MCC'), secondary_y=True)

# Set y-axes titles
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="Score", secondary_y=True)

# NOTE(review): the neuron name in the title is hard-coded; keep it in sync
# with LAYER / NEURON if those change.
fig.update_layout(title_text="German Loss, English Loss, L3N669 F1, and L3N669 MCC over Pythia 70M Checkpoints")
# Losses on a log scale (primary axis), probe scores on a linear scale (secondary axis).
fig.update_layout(
    yaxis=dict(type='log'),
    yaxis2=dict(type='linear')
)


fig.show()


In [21]:
# Cache the per-checkpoint evaluation table; the row index is not written.
df.to_csv("data/checkpoint_eval.csv", index=False)

In [151]:
def print_loss(model, prompt):
    """Render the tokens of ``prompt`` as HTML alongside their per-token loss.

    Args:
        model: Model called with ``return_type="loss", loss_per_token=True``
            and used to split the prompt into string tokens.
        prompt: Text to evaluate.
    """
    per_token_loss = model(prompt, return_type="loss", loss_per_token=True)
    first_row = per_token_loss[0]
    str_tokens = model.to_str_tokens(prompt)
    # The first string token is dropped before pairing tokens with losses.
    haystack_utils.print_strings_as_html(str_tokens[1:], first_row.tolist())

# Show per-token loss of checkpoint 12 on the first five German prompts.
model = get_model(12)
for i in range(5):
    prompt = german_data[i]
    print_loss(model, prompt)


Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
