In [1]:
import torch
from tqdm.auto import tqdm
from transformer_lens import HookedTransformer
from tqdm.auto import tqdm
import plotly.io as pio
import numpy as np
import random
import torch.nn as nn
import torch.nn.functional as F
import wandb
import plotly.express as px
import pandas as pd
import torch.nn.init as init
import pickle
import os
from pathlib import Path
from jaxtyping import Int, Float
from torch import Tensor
import einops
import json
from collections import Counter
import logging

# Root logger configuration: level name, wall-clock time (12-hour), then the message.
logging.basicConfig(
    format='(%(levelname)s) %(asctime)s: %(message)s',
    level=logging.INFO,
    datefmt='%I:%M:%S',
)
pio.renderers.default = "notebook_connected"

# Device preference: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Disable autograd globally for the whole notebook.
# torch.set_grad_enabled and torch.autograd.set_grad_enabled are the same switch,
# so a single call is sufficient.
torch.set_grad_enabled(False)

import sys
sys.path.append('../')  # Add the parent directory to the system path
import utils.haystack_utils as haystack_utils
from sparse_coding.train_autoencoder import AutoEncoder
import utils.autoencoder_utils as autils

%reload_ext autoreload
%autoreload 2

In [2]:
# Ablation target: which MLP layer and which neuron indices within it.
LAYER_TO_ABLATE = 3
NEURONS_TO_ABLATE = [669]

model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-70m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    device=device,
)

german_data = haystack_utils.load_json_data("data/german_europarl.json")
english_data = haystack_utils.load_json_data("data/english_europarl.json")

# MLP activations of the target layer on the first 100 English prompts, keyed by layer index.
english_activations = {
    LAYER_TO_ABLATE: haystack_utils.get_mlp_activations(
        english_data[:100], LAYER_TO_ABLATE, model, mean=False
    ),
}

# Mean activation of the target neuron(s) on English text; the ablation hook
# writes this value over those neurons.
MEAN_ACTIVATION_INACTIVE = english_activations[LAYER_TO_ABLATE][:, NEURONS_TO_ABLATE].mean()

def deactivate_neurons_hook(value, hook):
    """Forward hook that overwrites the ablated neurons with a fixed value.

    Sets, in place, every entry of ``value`` at the ``NEURONS_TO_ABLATE`` indices
    of the third axis to ``MEAN_ACTIVATION_INACTIVE`` and returns the same object.

    Args:
        value: activation tensor passed in by the hook point; indexed with three
            axes here.
        hook: the hook point object; part of the hook signature, not used.

    Returns:
        ``value`` after the in-place overwrite.
    """
    value[:, :, NEURONS_TO_ABLATE] = MEAN_ACTIVATION_INACTIVE
    return value


# (hook point name, hook function) pairs targeting the post-activation MLP hook
# of the ablated layer.
deactivate_neurons_fwd_hooks = [
    (f"blocks.{LAYER_TO_ABLATE}.mlp.hook_post", deactivate_neurons_hook),
]

# Load trigrams
# Explicit UTF-8 so reading the German-language data does not depend on the
# platform's default locale encoding.
with open("./data/low_indirect_loss_trigrams.json", "r", encoding="utf-8") as f:
    trigrams = json.load(f)

# Token filters used when building random prompts: tokens to ignore, and the
# 100 most common tokens from the first 200 German examples.
all_ignore, valid_tokens = haystack_utils.get_weird_tokens(model, plot_norms=False)
common_tokens = haystack_utils.get_common_tokens(german_data[:200], model, all_ignore, k=100)

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer
data/german_europarl.json: Loaded 2000 examples with 152 to 2000 characters each.
data/english_europarl.json: Loaded 2000 examples with 165 to 2000 characters each.


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [3]:
save_name = "49_fiery_mountain"  # alternative run: "25_gallant_monkey"
model_name = 'pythia-70m'
# Directory holding the autoencoder's config (.json) and weights (.pt).
path = Path(model_name)

# Read the saved run configuration; keep the raw dict under its own name so
# `cfg` only ever refers to the AutoEncoderConfig object.
with open(path / f"{save_name}.json", "r", encoding="utf-8") as f:
    cfg_dict = json.load(f)

cfg = autils.AutoEncoderConfig(
    cfg_dict["layer"],
    cfg_dict["act"],
    cfg_dict["expansion_factor"],
    cfg_dict["l1_coeff"],
)
cfg

AutoEncoderConfig(layer=4, act_name='hook_mlp_out', expansion_factor=8, l1_coeff=0.0008)

In [4]:
# The autoencoder's input width follows the hooked activation:
# d_model for "hook_mlp_out", d_mlp for anything else.
if cfg.act_name == "hook_mlp_out":
    d_in = model.cfg.d_model
else:
    d_in = model.cfg.d_mlp
d_hidden = d_in * cfg.expansion_factor

encoder = AutoEncoder(d_hidden, cfg.l1_coeff, d_in)
# map_location remaps the checkpoint's tensors onto the current device, so weights
# saved on a GPU still load when this notebook runs on the cpu/mps fallback.
encoder.load_state_dict(torch.load(path / f"{save_name}.pt", map_location=device))
encoder.to(device)

AutoEncoder()

In [25]:
def eval_trigram_direction(trigram, random_prompts, dataset_prompts, encoder_neuron):
    """Run the autils evaluation helpers for one encoder direction on one trigram.

    Calls, in order: the trigram-token DLA helper, the logit/logprob boost helper,
    the context-ablation helper, and the two encoder reconstruction-loss helpers.
    Uses the notebook globals ``model``, ``encoder``, ``cfg``, ``common_tokens``,
    ``all_ignore`` and ``deactivate_neurons_fwd_hooks``.

    Nothing is returned; the computed values are discarded once the function ends.
    NOTE(review): any visible output presumably comes from logging inside the
    autils helpers - confirm.

    Args:
        trigram: the trigram being evaluated.
        random_prompts: prompts passed to every helper that takes prompts.
        dataset_prompts: accepted but not used by this function.
        encoder_neuron: index of the encoder direction to evaluate.
    """
    correct_token_dla = autils.get_trigram_token_dla(
        model, encoder, encoder_neuron, trigram, cfg
    )

    (
        last_token_logit_encoded,
        last_token_logit_zeroed,
        last_token_logprob_encoded,
        last_token_logprob_zeroed,
        boosted_tokens,
        deboosted_tokens,
    ) = autils.get_direction_logit_and_logprob_boost(
        random_prompts, encoder, encoder_neuron, model, trigram, common_tokens, all_ignore, cfg
    )

    # Disabled: autils.print_direction_activations(german_data[:2], model, encoder, encoder_neuron, cfg)

    (
        context_active_loss,
        context_ablated_loss,
        feature_activation_context_active,
        feature_activation_context_inactive,
    ) = autils.get_context_effect_on_feature_activations(
        model, random_prompts, encoder, encoder_neuron, deactivate_neurons_fwd_hooks, cfg
    )

    (
        encoder_context_active_loss,
        encoder_context_inactive_loss,
    ) = autils.get_encoder_token_reconstruction_losses(
        random_prompts, model, encoder, deactivate_neurons_fwd_hooks, cfg
    )

    # The feature activations measured with and without the context neuron feed
    # into the per-feature reconstruction losses.
    (
        loss_encoder_direction_active,
        loss_encoder_direction_inactive,
        loss_encoder_direction_zeroed,
    ) = autils.get_encoder_feature_reconstruction_losses(
        random_prompts,
        encoder,
        model,
        encoder_neuron,
        feature_activation_context_active,
        feature_activation_context_inactive,
        cfg,
    )

def eval_trigram(trigram: str, n_prompts: int = 100, prompt_length: int = 20, max_dataset_prompts: int = 100):
    """Evaluate the first encoder direction selected for ``trigram``.

    Builds random prompts for the trigram, averages the encoder DLA at the last
    position over the first axis, asks autils for the directions selected from
    that DLA, collects dataset examples from ``german_data``, and evaluates the
    first selected direction with ``eval_trigram_direction``.

    Uses the notebook globals ``model``, ``encoder``, ``cfg``, ``common_tokens``
    and ``german_data``.

    Args:
        trigram: the trigram to evaluate.
        n_prompts: number of random prompts to generate (default 100).
        prompt_length: ``length`` passed to the random prompt generator (default 20).
        max_dataset_prompts: cap on dataset examples collected (default 100).

    Returns:
        None. If no direction is selected, a warning is logged and the
        evaluation is skipped instead of failing on an empty result.
    """
    random_trigram_prompts = haystack_utils.generate_random_prompts(
        trigram, model, common_tokens, n=n_prompts, length=prompt_length
    )
    # Last-position DLA, averaged over axis 0 (presumably the prompt/batch axis - confirm).
    trigram_dla = autils.encoder_dla_batched(random_trigram_prompts, model, encoder, cfg)[:, -1].mean(0)
    encoder_neurons = autils.get_directions_from_dla(trigram_dla)
    dataset_trigram_prompts = autils.get_trigram_dataset_examples(
        model, trigram, german_data, max_prompts=max_dataset_prompts
    )

    # Guard the [0] lookup: the direction list may be empty, and indexing it
    # would raise IndexError.
    if len(encoder_neurons) == 0:
        logging.warning(f"No encoder directions found for trigram {trigram!r}; skipping evaluation")
        return

    eval_trigram_direction(trigram, random_trigram_prompts, dataset_trigram_prompts, encoder_neurons[0])

eval_trigram(trigrams[0])

(INFO) 03:59:51: Top 3 directions with DLA > 0.2: [1762, 1749, 1580]
(INFO) 03:59:51: Found 100 prompts with token 'ge'
(INFO) 03:59:51: Counter({' Abgeord': 10, ' Angele': 10, 'sgew': 6, ' Ergebn': 3, ' ausgeze': 3, ' angeh': 3, ' angew': 3, ' festgeleg': 3, ' ausgew': 2, ' abgege': 2, 'gegeben': 2, 'räge 37': 2, 'räge.': 2, ' entgegen': 2, ' abgebe': 2, 'räge,': 2, 'läge der': 2, ' gegeben': 2, 'öge,': 1, ' durchgeht': 1, ' ergeben': 1, ' Ausgehend': 1, ' begehen': 1, 'räge unter': 1, 'räge des': 1, 'räge ak': 1, 'isgeben': 1, 'räge ab': 1, 'itergele': 1, ' aufgegr': 1, 'Abgeord': 1, 'ausgehen': 1, ' Berge von': 1, ' ungef': 1, ' ungew': 1, ' Wege zu': 1, 'etzgeber': 1, 'räge in': 1, 'sergebn': 1, ' angepas': 1, 'menge,': 1, ' ausgeü': 1, 'läge d': 1, 'ichgew': 1, ' ungere': 1, 'igegeben': 1, ' "geist': 1, ' Ungew': 1, 'tgefund': 1, 'ausgeber': 1, ' Angeh': 1, ' Mitgef': 1, 'Entgegen': 1, 'räge gest': 1, 'räge der': 1, ' abgeben': 1, 'ückgef': 1})
(INFO) 03:59:51: 'ord' DLA = 0.56
(I