In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Checkpoint directories. Each path is written exactly once so the unlearned
# model and the tokenizer (which is loaded from the same directory) cannot
# silently drift to different checkpoints when one of them is edited.
UNLEARNED_MODEL_PATH = "/nas/home/mawjdgus/NYU/open-unlearning/saves/unlearn/Llama-3.2-1B-Instruct-nosystemprompt-any-GradDiff-10-00-constant"
RETAINED_MODEL_PATH = "/nas/home/mawjdgus/NYU/open-unlearning/saves/finetune/llama3.2-1B_finetune_nosystemprompt_any_retain90"

# Load both causal LMs and move them to the selected device.
unlearned_model = AutoModelForCausalLM.from_pretrained(UNLEARNED_MODEL_PATH).to(device)
retained_model = AutoModelForCausalLM.from_pretrained(RETAINED_MODEL_PATH).to(device)

# The tokenizer is taken from the unlearned checkpoint directory and is used
# for both models.
tokenizer = AutoTokenizer.from_pretrained(UNLEARNED_MODEL_PATH)





  from .autonotebook import tqdm as notebook_tqdm


In [14]:
class ActivationPatcher:
    """Capture the input of every block's ``mlp.down_proj`` during a forward pass.

    Forward pre-hooks are attached to each module named
    ``<activation_layer_prefix>.<i>.mlp.down_proj`` and store the first
    positional input of that module, detached, under the key ``"block_<i>"``.

    Args:
        model: A ``torch.nn.Module`` that can be called as ``model(**inputs)``.
        activation_layer_prefix: Dotted module path of the container holding
            the numbered blocks (e.g. ``"model.layers"``).
    """

    _TARGET_SUFFIX = ".mlp.down_proj"

    def __init__(self, model, activation_layer_prefix):
        self.model = model
        self.activation_layer_prefix = activation_layer_prefix
        self.activations = {}
        # Handles of the currently registered pre-hooks, kept so that they can
        # be removed again instead of piling up on the model.
        self._hook_handles = []

    def hook_activations(self):
        """Register one forward pre-hook per block found under the prefix.

        Any hooks registered by an earlier call are removed first, so calling
        this method repeatedly never stacks duplicate hooks. Every block index
        present in the model is hooked; the number of blocks is discovered
        from the module names rather than assumed.

        Raises:
            KeyError: If no ``<prefix>.<i>.mlp.down_proj`` module exists.
        """
        self.remove_hooks()
        head = f"{self.activation_layer_prefix}."
        tail = self._TARGET_SUFFIX
        # A single walk over the module tree is enough to find every target.
        for name, module in self.model.named_modules():
            if not (name.startswith(head) and name.endswith(tail)):
                continue
            index_text = name[len(head):len(name) - len(tail)]
            if not index_text.isdigit():
                # Skip nested paths such as "<prefix>.0.sub.mlp.down_proj".
                continue
            handle = module.register_forward_pre_hook(
                self._create_pre_hook(int(index_text))
            )
            self._hook_handles.append(handle)
        if not self._hook_handles:
            raise KeyError(f"{self.activation_layer_prefix}.0{tail}")

    def remove_hooks(self):
        """Detach every pre-hook this patcher has registered on the model."""
        for handle in self._hook_handles:
            handle.remove()
        self._hook_handles = []

    def _create_pre_hook(self, layer_index):
        """Return a pre-hook that records the hooked module's first input."""
        def pre_hook(module, input):
            # `input` is the tuple of positional arguments passed to the module.
            self.activations[f"block_{layer_index}"] = input[0].detach()
        return pre_hook

    def get_activations(self, inputs):
        """Run ``model(**inputs)`` once and return the captured activations.

        Args:
            inputs: Mapping of keyword arguments forwarded to the model.

        Returns:
            dict mapping ``"block_<i>"`` to the detached tensor that was fed
            into block ``i``'s ``mlp.down_proj``. A new dict is created on
            every call, so results returned earlier are not overwritten.
        """
        self.activations = {}
        try:
            self.hook_activations()
            # Captured tensors are detached anyway, so skip building an
            # autograd graph for the forward pass.
            with torch.no_grad():
                self.model(**inputs)
        finally:
            # Always leave the model hook-free, even if the forward pass fails.
            self.remove_hooks()
        return self.activations

# Module path of the container whose numbered children are the blocks to hook.
activation_layer_prefix = "model.layers"

# One patcher per model, both hooking the same layer prefix.
unlearned_patcher, retained_patcher = (
    ActivationPatcher(unlearned_model, activation_layer_prefix),
    ActivationPatcher(retained_model, activation_layer_prefix),
)

In [4]:
from datasets import load_dataset

# Fetch the "forget10" configuration of the TOFU dataset.
tofu_dataset = load_dataset(
    "locuslab/TOFU",
    "forget10",
)

# Keep only the "question" column of the train split; these are the prompts
# that are later tokenized and fed to the models.
tofu_inputs = tofu_dataset["train"]["question"]

In [17]:
# Tokenize the first five questions as a single padded batch and move the
# resulting tensors to the selected device.
tokenized_inputs = tokenizer(
    tofu_inputs[:5],
    return_tensors="pt",
    padding=True,
    truncation=True,
).to(device)

# Run the same batch through each model and capture the per-block activations
# (the subtraction itself happens in a later cell).
unlearned_activations = unlearned_patcher.get_activations(tokenized_inputs)
retained_activations = retained_patcher.get_activations(tokenized_inputs)

In [23]:
# Element-wise difference (unlearned minus retained) of the tensors captured
# under "block_0", i.e. the inputs recorded at block 0's mlp.down_proj.
unlearned_activations['block_0'] - retained_activations['block_0']

tensor([[[ 3.1815e-05,  3.7557e-03,  1.6515e-03,  ...,  9.7492e-04,
          -1.6177e-03, -7.6688e-05],
         [ 6.5419e-03, -4.2625e-03, -4.2861e-04,  ..., -1.2728e-03,
          -2.1349e-03, -1.4203e-03],
         [-4.9078e-03,  1.7197e-03,  2.4789e-03,  ..., -1.2939e-04,
          -5.0244e-04,  2.8140e-03],
         ...,
         [-5.7670e-04, -4.5137e-03, -3.0001e-03,  ...,  9.7993e-04,
          -2.7529e-03,  5.3558e-04],
         [-6.6988e-04, -2.3313e-03, -2.4249e-03,  ..., -1.6506e-03,
          -8.9942e-05,  9.2864e-05],
         [-4.5774e-04, -2.9112e-03,  3.6427e-04,  ..., -4.3372e-04,
          -3.7954e-03,  8.4570e-04]],

        [[ 3.1815e-05,  3.7557e-03,  1.6515e-03,  ...,  9.7492e-04,
          -1.6177e-03, -7.6688e-05],
         [ 6.5419e-03, -4.2625e-03, -4.2861e-04,  ..., -1.2728e-03,
          -2.1349e-03, -1.4203e-03],
         [-5.3802e-03, -5.5800e-03,  8.8276e-04,  ..., -9.0056e-04,
          -1.4681e-04,  4.6299e-04],
         ...,
         [ 7.9373e-03,  3