In [1]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer
from jaxtyping import Float, Int, Bool
from torch import Tensor
from tqdm.auto import tqdm
import plotly.io as pio
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import numpy as np
import plotly.express as px 
from collections import defaultdict
import matplotlib.pyplot as plt
import re
from IPython.display import display, HTML
from datasets import load_dataset
from collections import Counter
import pickle
import os
import haystack_utils
from transformer_lens import utils
from fancy_einsum import einsum
import einops
import json
import ipywidgets as widgets
from IPython.display import display
from datasets import load_dataset
import random
import math
import random
import neel.utils as nutils
from neel_plotly import *
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import probing_utils

# Seed every random number generator this notebook touches (torch, numpy and
# the stdlib `random` module) with the same value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Plotly renderer used when figures are shown in the notebook.
pio.renderers.default = "notebook_connected+notebook"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Gradients are disabled globally; the notebook only runs forward passes.
# NOTE(review): the two calls below look redundant with each other — confirm and drop one.
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

# Re-import edited local modules automatically before each cell runs.
%reload_ext autoreload
%autoreload 2

In [2]:
def get_model(checkpoint: int):
    """Load one training checkpoint of Pythia-70m as a HookedTransformer.

    Args:
        checkpoint: Value forwarded as ``checkpoint_index`` to
            ``HookedTransformer.from_pretrained``.

    Returns:
        The loaded model, placed on the notebook-level ``device``, with
        LayerNorm folding, unembed centering and writing-weight centering
        all requested.
    """
    load_options = dict(
        checkpoint_index=checkpoint,
        center_unembed=True,
        center_writing_weights=True,
        fold_ln=True,
        device=device,
    )
    return HookedTransformer.from_pretrained("EleutherAI/pythia-70m", **load_options)

# Number of checkpoint indices visited by the per-checkpoint sweeps further
# down (they iterate over range(NUM_CHECKPOINTS), i.e. indices 0..141).
NUM_CHECKPOINTS = 142
# Layer index and neuron index of the MLP neuron studied in this notebook.
LAYER, NEURON = 3, 669
# Reference model used outside the sweeps.
# NOTE(review): index 142 lies just outside range(NUM_CHECKPOINTS), so this
# checkpoint is never part of the sweeps, and the checkpoint list may contain
# more than 143 entries — confirm that 142 is the intended reference index.
model = get_model(142)
# Keep only the first 200 prompts of each corpus.
german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
english_data = haystack_utils.load_json_data("data/english_europarl.json")[:200]

Downloading (…)lve/main/config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


In [9]:
def eval_loss(model, data):
    """Return the mean loss of ``model`` over the prompts in ``data``.

    Each prompt is passed to the model on its own with ``return_type="loss"``
    and the resulting scalar is read with ``.item()``; the per-prompt values
    are then averaged with ``np.mean``.

    Args:
        model: Callable accepting ``(prompt, return_type="loss")`` and
            returning an object with an ``.item()`` method.
        data: Iterable of prompts.

    Returns:
        The unweighted mean of the per-prompt losses.
    """
    per_prompt_losses = [model(text, return_type="loss").item() for text in data]
    return np.mean(per_prompt_losses)

def get_probe_performance(model, german_data, english_data, layer, neuron, plot=False):
    """Score how well a single neuron's activation separates German from English.

    Collects the activations of ``neuron`` in ``layer`` on both corpora, labels
    German as 1 and English as 0, fits a probe on 80% of the samples and
    evaluates it on the held-out 20%.

    Args:
        model: Model handed to ``haystack_utils.get_mlp_activations``.
        german_data: German prompts (positive class).
        english_data: English prompts (negative class).
        layer: Layer index of the neuron.
        neuron: Neuron index within the layer.
        plot: When true, also draw a two-class histogram of the activations.

    Returns:
        ``(f1, mcc)`` as returned by ``probing_utils.get_probe_score`` on the
        test split.
    """
    # Cap each class at the first 50,000 rows of activations
    # (presumably one row per token — TODO confirm against haystack_utils).
    german_activations = haystack_utils.get_mlp_activations(german_data, layer, model, neurons=[neuron], mean=False)[:50000]
    english_activations = haystack_utils.get_mlp_activations(english_data, layer, model, neurons=[neuron], mean=False)[:50000]
    if plot:
        haystack_utils.two_histogram(german_activations.flatten(), english_activations.flatten(), "German", "English")
    labels = np.concatenate([np.ones(len(german_activations)), np.zeros(len(english_activations))])
    activations = np.concatenate([german_activations.cpu().numpy(), english_activations.cpu().numpy()])
    # Seed the shuffle as well as the split so the same checkpoint always
    # yields the same train/test partition and therefore the same scores.
    x, y = shuffle(activations, labels, random_state=SEED)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=SEED)
    # Fit the scaler on the training split only, so that test-set statistics
    # do not leak into the features the probe is trained on.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    probe = probing_utils.get_probe(x_train, y_train)
    f1, mcc = probing_utils.get_probe_score(probe, x_test, y_test)
    return f1, mcc

def eval_checkpoint(checkpoint: int):
    """Evaluate one checkpoint and return a row of metrics.

    Loads the checkpoint, measures its mean loss on the German prompts, then
    scores the probe for neuron ``NEURON`` in layer ``LAYER``.

    Args:
        checkpoint: Checkpoint index passed to ``get_model``.

    Returns:
        A list ``[checkpoint, german_loss, f1, mcc]``.
    """
    checkpoint_model = get_model(checkpoint)
    loss_on_german = eval_loss(checkpoint_model, german_data)
    probe_f1, probe_mcc = get_probe_performance(
        checkpoint_model, german_data, english_data, LAYER, NEURON
    )
    row = [checkpoint, loss_on_german, probe_f1, probe_mcc]
    return row


In [16]:
# MLP activations of the reference `model` on the English prompts, keyed by
# layer index. The range is derived from LAYER (rather than a literal 3) so
# that the lookup below cannot miss if the studied layer is changed.
english_activations = {}
for layer in range(LAYER, LAYER + 1):
    english_activations[layer] = haystack_utils.get_mlp_activations(english_data, layer, model, mean=False)

# Mean activation of the studied neuron on English text; this is the constant
# written into the neuron when it is ablated.
MEAN_ACTIVATION_INACTIVE = english_activations[LAYER][:, NEURON].mean()

def deactivate_neurons_hook(value, hook):
    """Forward hook that overwrites the studied neuron's activation with a constant.

    Writes ``MEAN_ACTIVATION_INACTIVE`` into index ``NEURON`` of the third axis
    of ``value`` for every entry along the first two axes. The write happens in
    place and the same tensor object is returned.

    Args:
        value: Activation handed to the hook. It is indexed as
            ``value[:, :, NEURON]``, so it needs at least three axes
            (presumably batch, position, neuron — TODO confirm).
        hook: Not used by this function.

    Returns:
        ``value`` after the overwrite.
    """
    value[:, :, NEURON] = MEAN_ACTIVATION_INACTIVE
    return value
# (hook point name, hook function) pairs; applied via `model.hooks(...)` in the ablation sweep.
deactivate_neurons_fwd_hooks=[(f'blocks.{LAYER}.mlp.hook_post', deactivate_neurons_hook)]

# Show the constant that the hook writes into the neuron.
print(MEAN_ACTIVATION_INACTIVE)

  0%|          | 0/200 [00:00<?, ?it/s]

tensor(-0.0829, device='cuda:0')


In [None]:
# Mean German loss of every swept checkpoint with the studied neuron ablated:
# the hook pins it to MEAN_ACTIVATION_INACTIVE during the forward pass.
ablation_losses = []
for checkpoint in tqdm(range(NUM_CHECKPOINTS)):
    # Use a separate name for the per-checkpoint model so the notebook-level
    # reference `model` is not overwritten by the sweep; re-running the cell
    # that computes MEAN_ACTIVATION_INACTIVE afterwards would otherwise
    # silently use the last swept checkpoint instead.
    checkpoint_model = get_model(checkpoint)
    with checkpoint_model.hooks(deactivate_neurons_fwd_hooks):
        ablated_loss = eval_loss(checkpoint_model, german_data)
    ablation_losses.append(ablated_loss)

In [None]:
# Mean English loss of every swept checkpoint (no hooks applied).
english_losses = []
for checkpoint in tqdm(range(NUM_CHECKPOINTS)):
    # Use a separate name for the per-checkpoint model so the notebook-level
    # reference `model` is not overwritten by the sweep.
    checkpoint_model = get_model(checkpoint)
    english_loss = eval_loss(checkpoint_model, english_data)
    english_losses.append(english_loss)

In [None]:
# One metrics row per checkpoint index, in ascending order; each row is the
# list returned by eval_checkpoint.
data = [eval_checkpoint(checkpoint) for checkpoint in tqdm(range(NUM_CHECKPOINTS))]

# Tabulate the rows under the column names matching eval_checkpoint's output order.
df = pd.DataFrame(
    data,
    columns=["checkpoint", "german_loss", "f1", "mcc"],
)

In [19]:
# Attach the two separately computed loss curves as extra columns; both lists
# hold one value per checkpoint, in the same order as the rows of `df`.
df = df.assign(
    english_loss=english_losses,
    ablation_loss=ablation_losses,
)

In [20]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One figure with two y-axes: losses on the primary (left) axis, probe scores
# on the secondary (right) axis, all plotted against the checkpoint index.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# (DataFrame column, legend label, draw on the secondary axis?)
trace_specs = [
    ("german_loss", "German Loss", False),
    ("english_loss", "English Loss", False),
    ("ablation_loss", "Ablated German Loss", False),
    ("f1", "F1", True),
    ("mcc", "MCC", True),
]
for column, trace_name, on_secondary_axis in trace_specs:
    fig.add_trace(
        go.Scatter(x=df["checkpoint"], y=df[column], name=trace_name),
        secondary_y=on_secondary_axis,
    )

# Label every axis so the figure can be read on its own.
fig.update_xaxes(title_text="Checkpoint index")
fig.update_yaxes(title_text="Loss", secondary_y=False)
fig.update_yaxes(title_text="Score", secondary_y=True)

# Build the neuron label from the constants so the title cannot drift from the
# neuron that was actually probed, and mention every plotted series.
neuron_label = f"L{LAYER}N{NEURON}"
fig.update_layout(
    title_text=(
        f"German Loss, English Loss, Ablated German Loss, {neuron_label} F1, "
        f"and {neuron_label} MCC over Pythia 70M Checkpoints"
    )
)

fig.show()


In [21]:
# Write the per-checkpoint metrics table to CSV, omitting the DataFrame index.
df.to_csv("data/checkpoint_eval.csv", index=False)

In [None]:
# ablation
# other context neurons in