In [15]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer
from tqdm.auto import tqdm
import plotly.io as pio
import pandas as pd
import numpy as np
import plotly.express as px 
import pickle

import haystack_utils
import hook_utils
import plotting_utils
import probing_utils
from probing_utils import get_and_score_new_word_probe
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.datasets import make_classification
from concept_erasure import LeaceEraser

pio.renderers.default = "notebook_connected+notebook"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

%reload_ext autoreload
%autoreload 2

model = HookedTransformer.from_pretrained("EleutherAI/pythia-160m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device)

german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
english_data = haystack_utils.load_json_data("data/english_europarl.json")[:200]

LAYER, NEURON = 8, 2994

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Scores of probes trained on a single neuron, ranked by MCC.
one_sparse_probe_scores_df = _load_pickle('data/pythia_160m/layer_8/single_neurons_df.pkl')
top_one_scores = one_sparse_probe_scores_df.sort_values(by='mcc', ascending=False).head(10)
print(top_one_scores)

# Scores of two-neuron probes from the "_mcc" file, ranked by MCC.
two_sparse_probe_scores_df = _load_pickle('data/pythia_160m/layer_8/probes_two_sparse_df_10_mcc.pkl')
top_two_scores_mcc = two_sparse_probe_scores_df.sort_values(by='mcc', ascending=False).head(10)
print(top_two_scores_mcc)

# Scores of two-neuron probes from the "_f1" file, ranked by F1.
# two_sparse_probe_scores_df is rebound here, so after this cell it refers to
# the "_f1" frame.
two_sparse_probe_scores_df = _load_pickle('data/pythia_160m/layer_8/probes_two_sparse_df_10_f1.pkl')
top_two_scores_f1 = two_sparse_probe_scores_df.sort_values(by='f1', ascending=False).head(10)
print(top_two_scores_f1)

            f1       mcc
1426  0.859275  0.742336
149   0.795104  0.641232
2994  0.817568  0.637815
1815  0.803144  0.625381
1080  0.821160  0.624540
626   0.760399  0.622951
2288  0.789410  0.621133
2636  0.811525  0.614484
1404  0.816850  0.613347
2830  0.775984  0.608591
            f1       mcc  neuron_1  neuron_2
1506  0.896530  0.800370      1426      1507
2074  0.889679  0.787044      1426      2075
11    0.889597  0.785888      1426        11
2443  0.889129  0.785524      1426      2444
2029  0.887817  0.785415      1426      2030
1154  0.887900  0.784629      1426      1154
2741  0.886871  0.783621      1426      2742
1921  0.887128  0.781480      1426      1922
45    0.886047  0.781297      1426        45
667   0.881716  0.779885      1426       667
            f1       mcc  neuron_1  neuron_2
1506  0.895830  0.799479      1426      1507
11    0.890508  0.787747      1426        11
2074  0.890398  0.788256      1426      2075
2443  0.889356  0.785011      1426      2444
1921 

In [9]:
# Scatter of the MCC scores of the ten best two-neuron probes. A Series is
# passed directly, so the x axis is the Series index and the y axis its values.
fig = px.scatter(
    top_two_scores_mcc['mcc'],
    title='Top 10 two-neuron probes by MCC (layer 8)',
)
fig.update_layout(xaxis_title='probe row index', yaxis_title='MCC')
fig.show()
# F1 counterpart of the plot above, currently disabled:
# fig = px.scatter(top_two_scores_f1['f1'])
# fig.show()

In [12]:
# Hook point the activations are read from (same literal the eraser hook list
# uses further down in this notebook).
hook_name = 'blocks.8.mlp.hook_post'

# Index applied to the cached activations: first index 0, every position but
# the last, and only neuron 1426 on the final axis.
# Presumably the axes are (batch, position, neuron) - TODO confirm.
activation_slice = np.s_[0, :-1, [1426]]

x, y = probing_utils.get_new_word_labels_and_activations(
    model,
    german_data,
    hook_name,
    activation_slice,
)
# Fit a probe on that single neuron's activations.
probe = probing_utils.get_probe(x, y)


In [13]:
# Same hook as before, but keep the whole final axis and request unscaled
# features (scale_x=False).
activation_slice = np.s_[0, :-1, :]
x, y = probing_utils.get_new_word_labels_and_activations(
    model,
    german_data,
    hook_name,
    activation_slice,
    scale_x=False,
)

# Fit the probe on column 1426 only, using the first 20k rows.
n_train = 20_000
probe = probing_utils.get_probe(x[:n_train, [1426]], y[:n_train])

# Optional manual standardisation and inspection (disabled):
# print(x[:30, [1426]])
# scaler = preprocessing.StandardScaler().fit(x)
# x = scaler.transform(x)
# default threshold is 0.5
# print(probe.predict(x[:30, [1426]]))
# print(probe.intercept_)
# print(probe.coef_[0])

In [14]:
# Plot activations on the German data for the entry [8, 1426]
# (presumably a [layer, neuron] pair - confirm against plotting_utils).
plotting_utils.plot_neuron_acts(
    model,
    german_data,
    [[8, 1426]],
)

In [19]:
# Recompute activations and labels with the helper's default options (no
# explicit scale_x this time), then report the shapes of the arrays this cell
# actually produced, so the printout describes what later cells consume.
x, y = probing_utils.get_new_word_labels_and_activations(model, german_data, hook_name, activation_slice)
print(x.shape, y.shape)


(40000, 3072) (40000,)


In [20]:
# LEACE sanity check: a linear classifier should find signal in the raw
# activations and none once the concept has been erased.
X_t, Y_t = torch.from_numpy(x), torch.from_numpy(y)

# Before erasure: the largest-magnitude coefficient must be clearly non-zero.
real_lr = LogisticRegression(max_iter=2000)
real_lr.fit(x, y)
beta = torch.from_numpy(real_lr.coef_)
assert beta.norm(p=torch.inf) > 0.1

# Fit the eraser on (activations, labels) and apply it to the same activations.
eraser = LeaceEraser.fit(X_t, Y_t)
X_ = eraser(X_t)

# After erasure: every coefficient of a freshly fitted classifier must be ~0.
null_lr = LogisticRegression(max_iter=2000, tol=0.0)
null_lr.fit(X_.numpy(), y)
beta = torch.from_numpy(null_lr.coef_)
assert beta.norm(p=torch.inf) < 1e-4

In [67]:
# Compare the LEACE direction with the two-neuron probe direction.
# Observed cosine similarity: 0.08.
probe_neurons = [1426, 1507]
probe = probing_utils.get_probe(x[:20_000, probe_neurons], y[:20_000])

# Place the probe's two coefficients into an otherwise-zero vector of length
# d_mlp so it can be compared against a vector over all MLP neurons.
probe_dir = torch.zeros(model.cfg.d_mlp, dtype=torch.float32)
probe_dir[probe_neurons] = torch.from_numpy(probe.coef_[0]).float()

cosine_sim = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)
leace_dir = eraser.proj_left.squeeze(1)
print(cosine_sim(leace_dir, probe_dir))

torch.Size([3072, 1]) torch.Size([3072])
tensor(0.0813, dtype=torch.float64)


In [27]:
# Reference point: average loss on the German data with no hooks attached.
german_loss_baseline = haystack_utils.get_average_loss(german_data, model)
print(german_loss_baseline)

def erase_feature_l8(value, hook):
    """Forward hook that applies the fitted LEACE ``eraser`` to an activation.

    Args:
        value: Activation tensor handed to the hook.
        hook: Hook point object; unused, present to match the hook signature.

    Returns:
        The erased activation, on the same device and with the same dtype as
        ``value``.
    """
    # The eraser was fit on CPU tensors (built with torch.from_numpy), so it is
    # applied on CPU. The result goes back to wherever the input lived instead
    # of unconditionally to CUDA, so the hook also works on CPU-only machines.
    erased = eraser(value.cpu())
    return erased.to(device=value.device, dtype=value.dtype)
# (hook point name, hook function) pairs to attach while measuring the loss.
erase_feature_l8_hooks = [
    ('blocks.8.mlp.hook_post', erase_feature_l8),
]

# Average loss on the German data with the hooks active inside this context.
with model.hooks(erase_feature_l8_hooks):
    german_loss_erased = haystack_utils.get_average_loss(german_data, model)
    print(german_loss_erased)

tensor(2.4177, device='cuda:0')
tensor(2.4574, device='cuda:0')


In [45]:
# Ten largest entries of the eraser's proj_left vector, with their indices.
leace_top10 = torch.topk(eraser.proj_left.squeeze(1), 10)
print(leace_top10)
# Disabled: build the full projection matrix and inspect individual rows.
# leace_proj = torch.eye(3072) - eraser.proj_left @ eraser.proj_right
# print(leace_proj[2994])
# print(leace_proj[2995])

torch.return_types.topk(
values=tensor([0.7178, 0.6480, 0.6436, 0.6298, 0.6165, 0.6125, 0.6122, 0.6099, 0.6052,
        0.6048], dtype=torch.float64),
indices=tensor([1426, 2994, 2288, 1080, 1922,  794, 2444,  850,  149,   11]))
