In [3]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer
from tqdm.auto import tqdm
import plotly.io as pio
import pandas as pd
import numpy as np
import plotly.express as px 
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import haystack_utils
import hook_utils
import plotting_utils
import probing_utils
from probing_utils import get_and_score_new_word_probe
from sklearn import preprocessing

pio.renderers.default = "notebook_connected+notebook"
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

%reload_ext autoreload
%autoreload 2

model = HookedTransformer.from_pretrained("EleutherAI/pythia-160m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device)

german_data = haystack_utils.load_json_data("data/german_europarl.json")[:200]
english_data = haystack_utils.load_json_data("data/english_europarl.json")[:200]

LAYER, NEURON = 8, 2994

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


In [4]:
# Cached probe-score tables are stored in one directory per layer; derive the
# directory from the LAYER constant set in the configuration cell instead of
# hard-coding it in every path.
PROBE_SCORES_DIR = f'data/pythia_160m/layer_{LAYER}'
TOP_K = 10  # number of best-scoring rows to show per table


def load_probe_scores(filename):
    """Unpickle one cached probe-score table stored under PROBE_SCORES_DIR.

    Args:
        filename: Name of the pickle file inside PROBE_SCORES_DIR.

    Returns:
        The unpickled object (a pandas DataFrame for the files used below).

    NOTE(security): pickle.load can execute arbitrary code, so only point this
    at files generated by this project, never at untrusted downloads.
    """
    with open(f'{PROBE_SCORES_DIR}/{filename}', 'rb') as f:
        return pickle.load(f)


def top_scores(scores_df, metric, k=TOP_K):
    """Return the k rows of scores_df with the largest value in column `metric`."""
    return scores_df.sort_values(by=metric, ascending=False).head(k)


# Single-neuron probes, ranked by MCC.
one_sparse_probe_scores_df = load_probe_scores('single_neurons_df.pkl')
top_one_scores = top_scores(one_sparse_probe_scores_df, 'mcc')
print(top_one_scores)

# Two-neuron probes from the "_mcc" file, ranked by MCC.
two_sparse_probe_scores_mcc_df = load_probe_scores('probes_two_sparse_df_10_mcc.pkl')
top_two_scores_mcc = top_scores(two_sparse_probe_scores_mcc_df, 'mcc')
print(top_two_scores_mcc)

# Two-neuron probes from the "_f1" file, ranked by F1.
two_sparse_probe_scores_f1_df = load_probe_scores('probes_two_sparse_df_10_f1.pkl')
top_two_scores_f1 = top_scores(two_sparse_probe_scores_f1_df, 'f1')
print(top_two_scores_f1)

# Backward-compatible alias: the original cell reused this one name for both
# two-neuron tables and left it bound to the F1 table.
two_sparse_probe_scores_df = two_sparse_probe_scores_f1_df

            f1       mcc
1426  0.859275  0.742336
149   0.795104  0.641232
2994  0.817568  0.637815
1815  0.803144  0.625381
1080  0.821160  0.624540
626   0.760399  0.622951
2288  0.789410  0.621133
2636  0.811525  0.614484
1404  0.816850  0.613347
2830  0.775984  0.608591
            f1       mcc  neuron_1  neuron_2
1506  0.896530  0.800370      1426      1507
2074  0.889679  0.787044      1426      2075
11    0.889597  0.785888      1426        11
2443  0.889129  0.785524      1426      2444
2029  0.887817  0.785415      1426      2030
1154  0.887900  0.784629      1426      1154
2741  0.886871  0.783621      1426      2742
1921  0.887128  0.781480      1426      1922
45    0.886047  0.781297      1426        45
667   0.881716  0.779885      1426       667
            f1       mcc  neuron_1  neuron_2
1506  0.895830  0.799479      1426      1507
11    0.890508  0.787747      1426        11
2074  0.890398  0.788256      1426      2075
2443  0.889356  0.785011      1426      2444
1921 

In [None]:
# Line plots of the best two-neuron probe scores: the MCC column of the
# MCC-ranked table, then the F1 column of the F1-ranked table.
top_mcc_values = top_two_scores_mcc['mcc']
top_f1_values = top_two_scores_f1['f1']
haystack_utils.line(top_mcc_values)
haystack_utils.line(top_f1_values)

In [7]:
# MLP post-activation hook of the layer under study; uses the LAYER constant
# from the configuration cell rather than a literal baked into the f-string.
hook_name = f'blocks.{LAYER}.mlp.hook_post'
# Neuron fed to the probe: it tops the single-neuron MCC ranking printed above.
PROBE_NEURON = 1426
# Index applied to the hooked activations: element 0 of the leading axis, every
# entry but the last along the second axis, and only PROBE_NEURON on the final
# axis (assumes axes are (batch, position, neuron) -- TODO confirm in probing_utils).
activation_slice = np.s_[0, :-1, [PROBE_NEURON]]
x, y = probing_utils.get_new_word_labels_and_activations(model, german_data, hook_name, activation_slice)
probe = probing_utils.get_probe(x, y)


[3.31392179]
[0.88651655]
<bound method LinearClassifierMixin.decision_function of LogisticRegression(max_iter=2000)>


In [24]:
# Re-collect activations for every neuron at the hook, this time passing
# scale_x=False (presumably this skips feature scaling inside the helper --
# verify in probing_utils).
activation_slice = np.s_[0, :-1, :]
x, y = probing_utils.get_new_word_labels_and_activations(
    model, german_data, hook_name, activation_slice, scale_x=False
)

PROBE_NEURON = 1426  # neuron whose raw activation is probed
N_TRAIN = 20_000     # number of leading samples used to fit the probe
N_PREVIEW = 30       # number of leading samples printed for inspection

# The list index keeps the column 2-D, i.e. one feature column per sample.
preview_acts = x[:N_PREVIEW, [PROBE_NEURON]]
print(preview_acts)

# Manual alternative (currently disabled): fit preprocessing.StandardScaler on x
# and transform x with it before fitting the probe.
train_acts = x[:N_TRAIN, [PROBE_NEURON]]
train_labels = y[:N_TRAIN]
probe = probing_utils.get_probe(train_acts, train_labels)

# predict() applies the default decision threshold of 0.5; the intercept and
# coefficient are printed so the probe can be read against the raw activations.
print(probe.predict(preview_acts))
print(probe.intercept_)
print(probe.coef_[0])

[[-0.07889334]
 [-0.16226202]
 [ 2.2105062 ]
 [-0.16983256]
 [-0.06934508]
 [ 0.5688529 ]
 [-0.15273035]
 [-0.15795679]
 [-0.06014424]
 [-0.10710801]
 [-0.01736585]
 [-0.16174379]
 [ 1.14387965]
 [-0.16805913]
 [ 1.84385324]
 [-0.10698598]
 [-0.10473469]
 [ 0.37011316]
 [-0.16678125]
 [-0.15745513]
 [ 1.36520076]
 [-0.09584899]
 [ 1.4381367 ]
 [-0.13045835]
 [ 0.46011066]
 [-0.09198061]
 [ 0.75068927]
 [-0.08157733]
 [-0.16738574]
 [ 0.22168317]]
[False False  True False False  True False False False False False False
  True False  True False False  True False False  True False  True False
  True False  True False False False]
[-1.12770674]
[4.7661931]


In [3]:
# Plot activations of the probed neuron on the German sample using the project
# plotting helper. Entries are presumably [layer, neuron] pairs -- confirm in
# plotting_utils.
neurons_to_plot = [[8, 1426]]
plotting_utils.plot_neuron_acts(model, german_data, neurons_to_plot)