In [1]:
import re
import json
import pickle
import os
import sys
import requests
import logging
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer
from tqdm.auto import tqdm
import plotly.io as pio
import numpy as np
import random
import torch.nn as nn
import torch.nn.functional as F
import wandb
import plotly.express as px
import pandas as pd
import torch.nn.init as init
from pathlib import Path
from jaxtyping import Int, Float
from torch import Tensor
import einops
from collections import Counter
from datasets import load_dataset
import pandas as pd
from ipywidgets import interact, IntSlider
from process_tiny_stories_data import load_tinystories_validation_prompts, load_tinystories_tokens
from typing import Literal


# Use the "notebook_connected" plotly renderer for figures in this notebook.
pio.renderers.default = "notebook_connected"

# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Turn gradient tracking off globally (both spellings from the original are kept).
torch.autograd.set_grad_enabled(False)
torch.set_grad_enabled(False)

logging.basicConfig(
    level=logging.INFO,
    format='(%(levelname)s) %(asctime)s: %(message)s',
    datefmt='%I:%M:%S',
)
sys.path.append('../')  # Add the parent directory to the system path

import utils.haystack_utils as haystack_utils
from sparse_coding.train_autoencoder import AutoEncoder
from utils.autoencoder_utils import custom_forward, AutoEncoderConfig, evaluate_autoencoder_reconstruction, get_encoder_feature_frequencies, load_encoder, get_acts
import utils.haystack_utils as haystack_utils
from utils.plotting_utils import line
from sparse_coding.spacy_tag import make_spacy_feature_df

from utils.probing_utils import train_probe
import utils.probing_utils as probing_utils
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


# IPython autoreload: (re)load the extension and use mode 2 so edited project
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import subprocess
import sys

# Download the spaCy transformer pipeline using the interpreter that runs this
# kernel (sys.executable), so the package is installed into the environment the
# notebook actually imports from rather than whatever `python` is first on PATH.
# check=True raises CalledProcessError if the download fails instead of letting
# the failure surface later when the pipeline is loaded.
subprocess.run([sys.executable, '-m', 'spacy', 'download', 'en_core_web_trf'], check=True)

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


[0m

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


CompletedProcess(args=['python', '-m', 'spacy', 'download', 'en_core_web_trf'], returncode=0)

In [3]:
# Project helper; presumably releases cached accelerator memory — confirm in haystack_utils.
haystack_utils.clean_cache()
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.2"

model_name = "tiny-stories-2L-33M"
print_name = "TinyStories 2L 33M"

model = HookedTransformer.from_pretrained(
    model_name,
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device,
)

# Small first pass: tokenize the first n_prompts validation prompts.
n_prompts = 40
prompts = load_tinystories_validation_prompts(data_path='data/tinystories')[:n_prompts]
tokens = model.to_tokens(prompts)
print(tokens.shape)

# Build the per-token spaCy feature frame, retrying once if the first attempt fails.
# Only `Exception` is caught: a bare `except:` would also swallow KeyboardInterrupt /
# SystemExit, so interrupting the (slow) spaCy pass would silently start it again.
try:
    df = make_spacy_feature_df(model, tokens)
except Exception as first_error:
    logging.warning("make_spacy_feature_df failed (%s); retrying once", first_error)
    try:
        df = make_spacy_feature_df(model, tokens)
    except Exception as e:
        # Best effort, as before: report the error and carry on. Note that `df`
        # is then undefined (or stale from an earlier run of this cell).
        print(e)

(INFO) 05:48:12: Loaded 21990 TinyStories validation prompts


Loaded pretrained model tiny-stories-2L-33M into HookedTransformer
torch.Size([40, 304])
Starting spacy processing of dataset...
Finished spacy processing of dataset.
1701409717.882616 0


In [4]:
# Select the adjective column; the result is not displayed because this is not
# the last expression of the cell.
df[["is_spacy_adj"]]
# Shape of the token tensor built with model.to_tokens for the current prompts.
print(tokens.shape)

torch.Size([40, 304])


In [5]:
# Look for encoder features that fire on particular spacy attributes.
save_name = '18_morning_sun'
# load_encoder returns the autoencoder and its config; both are later passed to
# get_acts when activations are collected.
encoder, cfg = load_encoder(save_name, model_name, model, save_path='/workspace')

# The commented-out code below is kept as a record of how per-feature F1 scores
# can be computed: activations are binarized at `threshold`, scored against every
# column of df with f1_score, regrouped as {column: {direction: f1}} and dumped
# to JSON.
# NOTE(review): that code writes 'spacy_f1s_2.json' whereas the live code below
# reads 'spacy_f1s.json' — confirm the cached file matches this encoder and
# prompt set.

# acts = []
# for i in range(len(tokens)):
#     acts.append(get_acts(tokens[i], model, encoder, cfg))
# acts = torch.cat(acts).cpu()

# threshold = 0.1
# f1_scores = {}
# for series_name, series in tqdm(df.items()):
#     neuron_binarized = (acts > threshold).T
#     for i in range(len(neuron_binarized)):
#         f1_scores[(series_name, i)] = f1_score(series, neuron_binarized[i])

# new_f1_scores = {}
# for key, value in f1_scores.items():
#     col, direction = key
#     if col not in new_f1_scores:
#         new_f1_scores[col] = {}
#     new_f1_scores[col][direction] = value

# with open('/workspace/data/spacy_f1s_2.json', 'w') as f:
#     json.dump(new_f1_scores, f)

# Load the cached scores. Because they come from JSON, the direction ids are
# string keys; later cells convert them with int().
with open('/workspace/data/spacy_f1s.json', 'r') as f:
    new_f1_scores = json.load(f)

In [6]:
# Peek at the first five (direction, F1) pairs stored for the first attribute.
first_attribute_scores = list(new_f1_scores.values())[0]
list(first_attribute_scores.items())[:5]

[('0', 0.0), ('1', 0.0), ('2', 0.0), ('3', 0.0), ('4', 0.012048192771084336)]

In [7]:
from collections import defaultdict

# Keep every (attribute, direction) pair whose cached F1 is above 0.4:
#   good_uns               -> attribute name -> list of selected directions
#   interesting_directions -> the same directions as one flat list, in visit order
good_uns = defaultdict(list)
interesting_directions = []
for col, items in new_f1_scores.items():
    # Punctuation is skipped here (rather than deleted from good_uns afterwards):
    # there are hundreds of punctuation dirs and they're probably less interesting.
    if col != "is_spacy_punct":
        for direction, f1 in items.items():
            if not f1 > 0.4:
                continue
            good_uns[col].append(direction)
            interesting_directions.append(direction)

In [8]:

# Pair each selected attribute with its directions converted to integers.
cols_with_dirs = [
    (attribute, list(map(int, directions)))
    for attribute, directions in good_uns.items()
]
# Number of attributes with at least one selected direction, then the number of
# distinct selected directions.
print(len(cols_with_dirs))
print(len(set(interesting_directions)))

31
66


In [9]:
# Round two of the analysis above, on a larger sample: this time activations are
# only collected for the positive and negative classes of the (direction, spacy
# attribute) pairs we are interested in.
# Note: this rebinds n_prompts, prompts, tokens and df from the earlier 40-prompt run.
n_prompts = 4000
prompts = load_tinystories_validation_prompts(data_path='data/tinystories')[:n_prompts]
tokens = model.to_tokens(prompts)
# Recompute the per-token spaCy feature frame for the larger token batch.
df = make_spacy_feature_df(model, tokens)

def train_probe(
    positive_data: torch.Tensor, negative_data: torch.Tensor
) -> tuple[float, float]:
    """Fit a probe separating positive from negative activations and score it.

    Note: this notebook-local definition shadows the `train_probe` imported from
    `utils.probing_utils` at the top of the notebook.

    Args:
        positive_data: Activations for the positive class, one row per example.
        negative_data: Activations for the negative class, one row per example.

    Returns:
        The `(f1, mcc)` pair returned by `probing_utils.get_probe_score` on a
        held-out 20% split (presumably F1 and Matthews correlation — confirm in
        probing_utils).
    """
    labels = np.concatenate([np.ones(len(positive_data)), np.zeros(len(negative_data))])
    data = np.concatenate([positive_data.cpu().numpy(), negative_data.cpu().numpy()])
    # Split first, with a fixed seed so the same rows land in each split on re-runs.
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.2, random_state=42
    )
    # Fit the scaler on the training rows only and reuse it for the test rows;
    # fitting on all rows would leak test-set statistics into the probe's inputs.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    probe = probing_utils.get_probe(x_train, y_train, max_iter=2000)
    f1, mcc = probing_utils.get_probe_score(probe, x_test, y_test)
    return f1, mcc

Starting spacy processing of dataset...


In [None]:
# Integer form of the selected direction ids (they are stored as strings).
interesting_directions_ints = list(map(int, interesting_directions))


In [None]:
# Collect autoencoder activations prompt by prompt, keeping only the columns of
# the selected directions so the concatenated tensor stays small.
per_prompt_acts = []
for prompt_tokens in tqdm(tokens, desc="collecting activations"):
    act = get_acts(prompt_tokens, model, encoder, cfg)
    per_prompt_acts.append(act[:, interesting_directions_ints])

acts = torch.cat(per_prompt_acts, dim=0).cpu() # batch d_interesting

# The boolean masks built from df below only make sense if df has exactly one
# row per activation row; fail early with a clear message otherwise.
if len(df) != acts.shape[0]:
    raise ValueError(
        f"spacy feature frame has {len(df)} rows but acts has {acts.shape[0]} rows"
    )

# Column position of each direction inside `acts`. setdefault keeps the first
# occurrence, matching what list.index returned when a direction is repeated.
acts_column_of = {}
for position, direction in enumerate(interesting_directions_ints):
    acts_column_of.setdefault(direction, position)

# compare acts and spacy annotations to get data
f1s = []
mccs = []
for col, dirs in cols_with_dirs:
    # The class masks depend only on the attribute column, so build them once
    # per column instead of once per direction.
    token_attributes = df[col]
    pos_mask = torch.as_tensor((token_attributes == True).to_numpy(dtype=bool))
    neg_mask = torch.as_tensor((token_attributes == False).to_numpy(dtype=bool))
    # `direction` rather than `dir`: a module-level loop variable named `dir`
    # would shadow the builtin for the rest of the kernel session.
    for direction in dirs:
        dir_acts = acts[:, acts_column_of[direction]]

        # Cap each class at its first 10,000 examples to keep probe training cheap.
        pos_class = dir_acts[pos_mask][:10_000]
        neg_class = dir_acts[neg_mask][:10_000]
        print(f"{len(pos_class)} positive class activations, {len(neg_class)} negative class activations")
        f1, mcc = train_probe(
            pos_class.unsqueeze(-1),
            neg_class.unsqueeze(-1),
        )
        f1s.append(f1)
        mccs.append(mcc)
# NOTE(review): leftover author notes here read "tokens == 50256" and "filter out
# 0s and flatten" — presumably about excluding special-token positions before
# probing; confirm whether that filtering is still needed.

2738 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
2207 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
1156 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
1156 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
1156 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
1156 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
1156 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
214 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
214 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([209200])
214 positive class activations, 10000 negative class activations
sdf
(209200,) torch.Size([20

KeyboardInterrupt: 

In [None]:
# Build the payload before opening the file, so a failure here (e.g. an undefined
# name) cannot leave behind a truncated or half-written JSON file.
summary_stats = {
    "f1s": f1s,
    "mccs": mccs,
    "dirs": interesting_directions,
    # Attribute name for each score, in the same (col, dir) order in which the
    # probes were trained. A direction can be selected for several attributes,
    # so "dirs" alone does not identify which attribute a score belongs to.
    "cols": [col_name for col_name, col_dirs in cols_with_dirs for _ in col_dirs],
}
with open('/workspace/data/spacy_summary_stats.json', 'w') as f:
    json.dump(summary_stats, f)