In [11]:
%load_ext autoreload
%autoreload 2
from transformer_lens import HookedTransformer, ActivationCache
import os
import torch
import numpy as np
import pandas as pd
import datasets
import transformers
import pickle

from tasks import PileTask, OWTTask, InductionTask, GreaterThanTask
from tasks.ioi.IOITask import IOITask, IOITask_NPO, IOITask_Uniform
from tasks.induction.InductionTask import InductionTask, InductionTask_NPO, InductionTask_Uniform
from tasks.facts.SportsTask import SportsTask, SportsTask_NPO, SportsTask_Uniform

from tqdm.auto import tqdm

from transformers import GPT2Tokenizer, GPTNeoXTokenizerFast, AutoModelForCausalLM, AutoTokenizer
from weight_masked_transformer import WeightMaskedTransformer

from datasets import load_dataset
train_dataset = load_dataset('monology/pile-uncopyrighted', split='train', streaming=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model Qwen/Qwen1.5-1.8B-Chat into HookedTransformer
tensor(0.6806, device='cuda:0', grad_fn=<DivBackward0>)
tensor([[[ 1.9193e-05,  1.0633e-04, -2.7180e-05,  ...,  3.7625e-07,
          -1.1158e-04,  9.5844e-05],
         [ 1.3690e-07, -2.0862e-05,  4.8876e-06,  ...,  1.3560e-06,
           1.0908e-05, -1.5125e-06],
         [-3.7849e-06, -3.4831e-07,  1.0133e-05,  ...,  1.4246e-05,
           3.6061e-06, -3.3230e-06],
         ...,
         [-6.7651e-06,  4.0770e-05, -9.4175e-06,  ...,  6.0499e-06,
           4.3213e-06,  2.2259e-07],
         [-6.8918e-07,  2.9057e-06, -1.3039e-06,  ...,  1.7285e-06,
          -1.5348e-06, -2.0675e-07],
         [ 2.7061e-05,  8.4937e-07, -2.7567e-06,  ..., -2.8014e-06,
           2.5332e-07, -1.8179e-06]],

        [[ 7.0035e-06, -1.0610e-05, -2.3097e-07,  ...,  1.5795e-06,
          -3.1143e-06,  6.6310e-07],
         [-1.0431e-06,  4.6968e-05, -2.3842e-06,  ..., -2.2054e-06,
           2.9504e-06, -2.3097e-06],
         [-3.8624e

In [12]:
mask_attn = torch.load(
    f'results/google_gemma-7b-basketball-random_attn.pt'
)
mask_mlp = torch.load(
    f'results/google_gemma-7b-basketball-random_mlp.pt'
)


In [26]:
mask_attn[1]['W_Q'][1]


Parameter containing:
tensor([[[0.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[1.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[0.]],

        [[0.]]], device='cuda:0', requires_grad=True)

In [29]:
(mask_attn[1]['W_Q'][2][2] - 1).sum()


tensor(0., device='cuda:0', dtype=torch.bfloat16, grad_fn=<SumBackward0>)

# Load Model

In [2]:
os.environ['HF_TOKEN'] = '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'
model_type = "gemma"
model_name = 'google/gemma-7b'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = HookedTransformer.from_pretrained(
    model_name,
    tokenizer=tokenizer,
    device='cuda',
    default_padding_side="right",
    fold_ln=False,
    fold_value_biases=False,
    center_writing_weights=False,
    dtype=torch.bfloat16
)


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.82s/it]


Loaded pretrained model google/gemma-7b into HookedTransformer


# Load Datasets

In [3]:
from tasks import PileTask, OWTTask, InductionTask, GreaterThanTask
from tasks.ioi.IOITask import IOITask, IOITask_NPO, IOITask_Uniform
from tasks.induction.InductionTask import InductionTask, InductionTask_NPO, InductionTask_Uniform
from tasks.facts.SportsTask import SportsTask, SportsTask_NPO, SportsTask_Uniform
from tasks.facts.SportsTaskAdversarial import adversarial_sports_eval
from tasks.facts.SportsTaskSideEffects import run_side_effects_evals

train_batch_size = 10
eval_batch_size = 50

device = "cuda"
train_loss_type = "sports"
forget_sport = "basketball"
maintain_sport = None
# val_sport = "baseball"


sports_1mp = SportsTask(batch_size=train_batch_size, tokenizer=tokenizer, device=device, prep_acdcpp=False, criterion="log_1_minus_p", forget_sport_subset={forget_sport}, is_forget_dataset=True)

if maintain_sport is None:
    maintain_sports = SportsTask(batch_size=train_batch_size, tokenizer=tokenizer, device=device, prep_acdcpp=False, criterion="cross_entropy", forget_sport_subset={forget_sport}, is_forget_dataset=False)
else:
    maintain_sports = SportsTask(batch_size=train_batch_size, tokenizer=tokenizer, device=device, prep_acdcpp=False, criterion="cross_entropy", forget_sport_subset={maintain_sport}, is_forget_dataset=True)

train_pile = PileTask(batch_size=train_batch_size, tokenizer=tokenizer, device=device, ctx_length=100, shuffle=True, buffer_size=50000)
train_tasks = {"sports_1mp": (sports_1mp, .2), "maintain_sports": (maintain_sports, 1), "pile": (train_pile, 1)}

# want to eval on other sports
forget_sport_eval = SportsTask(batch_size=eval_batch_size, tokenizer=tokenizer, device=device, prep_acdcpp=False, criterion="cross_entropy", forget_sport_subset={forget_sport}, is_forget_dataset=True)
test_pile = PileTask(batch_size=eval_batch_size, tokenizer=tokenizer, device=device, ctx_length=100, shuffle=True, buffer_size=50000)

induction_eval = InductionTask(batch_size=eval_batch_size, tokenizer=tokenizer, prep_acdcpp=False, seq_len=15, device=device)
if maintain_sport is None:
    maintain_sports_eval = SportsTask(batch_size=eval_batch_size, tokenizer=tokenizer, device=device, prep_acdcpp=False, criterion="cross_entropy", forget_sport_subset={forget_sport}, is_forget_dataset=False)
    eval_tasks = {"induction": induction_eval, "pile": test_pile, "forget_sport": forget_sport_eval, "maintain_sport": maintain_sports_eval}
else:
    maintain_sport_eval = SportsTask(batch_size=eval_batch_size, tokenizer=tokenizer, device=device, prep_acdcpp=False, criterion="cross_entropy", forget_sport_subset={maintain_sport}, is_forget_dataset=True)
    val_sport_eval = SportsTask(batch_size=eval_batch_size, tokenizer=tokenizer, device=device, prep_acdcpp=False, criterion="cross_entropy", forget_sport_subset={val_sport}, is_forget_dataset=True)
    eval_tasks = {"induction": induction_eval, "pile": test_pile, "forget_sport": forget_sport_eval, "maintain_sport": maintain_sport_eval, "val_sport": val_sport_eval}


OpenAI API key not found, will not be able to run evaluations on Sports Trivia Task


# Evals

In [None]:
def threshold_mask(mask, threshold):
    for layer in mask.keys():
        for name, param in mask[layer].items():
            mask[layer][name] = torch.where(param < threshold, torch.zeros_like(param), param)

    return mask


In [4]:
from functools import partial
# Final evals
evals = {
    "Adversarial: No System Prompt": partial(adversarial_sports_eval, use_system_prompt=True),
    "Adversarial: System Prompt": partial(adversarial_sports_eval, use_system_prompt=True),
    "Side Effects": partial(run_side_effects_evals, evals_to_run=["Sports Answers", "Sports Familiarity", "Cross Entropy"], verbose=False),
}
with torch.autocast(device_type="cuda"), torch.set_grad_enabled(False):
    for localization_type in ["random", "manual"]:
        for forget_sport in ["baseball", "basketball", "football"]:
            for threshold in np.logspace(-8, 1, num=10):
                # Load Model
                mask = WeightMaskedTransformer(model)
                mask_attn = torch.load(
                    f'mechanistic-unlearning/results/google_gemma_7b_{forget_sport}-{localization_type}_attn.pt'
                )
                mask_mlp = torch.load(
                    f'results/google_gemma-7b-{forget_sport}-{localization_type}_mlp.pt'
                )
                mask.attention_masks = threshold_mask(mask_attn, threshold)
                mask.mlp_masks = threshold_mask(mask_mlp, threshold)
                for eval_name, eval_func in evals.items():
                    eval_result = eval_func(model, model_type=model_type, batch_size=eval_batch_size)
                    print(f'{eval_name=}')
                    for k, v in eval_result.items():
                        print(k, v)



FileNotFoundError: [Errno 2] No such file or directory: 'mechanistic-unlearning/results/google_gemma_7b_baseball-random_attn.pt'

In [7]:
1/0


ZeroDivisionError: division by zero

In [9]:
import gc


In [10]:
gc.collect()
torch.cuda.empty_cache()
