# Evaluating shallow unlearning of Sports Facts

In [1]:
%load_ext autoreload
%autoreload 2
from transformer_lens import HookedTransformer, ActivationCache
import torch
import numpy as np
import pandas as pd
import datasets
import transformers
import pickle
from transformers import GPT2Tokenizer, GPTNeoXTokenizerFast, AutoModelForCausalLM, AutoTokenizer

from tasks import PileTask, OWTTask, InductionTask, GreaterThanTask
from tasks.ioi.IOITask import IOITask, IOITask_NPO, IOITask_Uniform
from tasks.induction.InductionTask import InductionTask, InductionTask_NPO, InductionTask_Uniform
from tasks.facts.SportsTask import SportsTask, SportsTask_NPO, SportsTask_Uniform

from tqdm.auto import tqdm

In [2]:
reference_pythia = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-2.8B").cuda()
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8B")
tokenizer.pad_token_id = tokenizer.eos_token_id

unlearned_model = AutoModelForCausalLM.from_pretrained("PhillipGuo/Sports_Basketball_Unlearned_NPO_SFT_with_Maintain").cuda()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Switch to Trivia

In [None]:
class SportsTask_Trivia(SportsTask):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # modify prompts of train_df and test_df to be trivia
        # original prompt looks like "Fact: Tiger Woods plays the sport of golf\nFact: DeForest Buckner plays the sport of"
        #new prompt should have A:
        def get_trivia_prompt(row):
            return f"Fact: Tiger Woods plays the sport of\nA: Football\nB: Baseball\nC: Basketball\nD: Golf\n\nAnswer: D\n\nFact: {row['athlete']} plays the sport of\nA: Football\nB: Baseball\nC: Basketball\nD: Golf\n\nAnswer:"

        self.train_dataset = SportsTask.SportsDataset(self.train_df, tokenizer)
        self.test_dataset = SportsTask.SportsDataset(self.test_df, tokenizer)
        self.set_loaders(train_data=self.train_dataset, test_data=self.test_dataset, shuffle=self.shuffle)

        # modify df["prompt"] and df["sport"]

    def get_sports_tokens(self, tokenizer, include_golf=False):
        # get football, baseball, basketball, golf tokens
        sports_tokens = tokenizer([" A", " B", " C", " D"], return_tensors="pt").input_ids
        if sports_tokens.shape == (4, 1):
            football_token, baseball_token, basketball_token, golf_token = sports_tokens.squeeze().tolist()
        elif sports_tokens.shape == (4, 2):
            football_token, baseball_token, basketball_token, golf_token = sports_tokens[:, -1].tolist()
        else:
            raise ValueError(f"Sports tokens shape is {sports_tokens.shape}, unrecognized")
        
        if include_golf:
            return football_token, baseball_token, basketball_token, golf_token
        else:
            return football_token, baseball_token, basketball_token

In [None]:
sports_trivia = SportsTask_Trivia(batch_size=64, tokenizer=tokenizer, device="cuda")

sports_trivia_basketball = SportsTask_Trivia(batch_size=64, tokenizer=tokenizer, device="cuda", is_forget_dataset=True, forget_sport_subset={"basketball"})
sports_trivia_baseball = SportsTask_Trivia(batch_size=64, tokenizer=tokenizer, device="cuda", is_forget_dataset=True, forget_sport_subset={"baseball"})
sports_trivia_football = SportsTask_Trivia(batch_size=64, tokenizer=tokenizer, device="cuda", is_forget_dataset=True, forget_sport_subset={"football"})

In [None]:
print(sports_trivia.get_test_loss(reference_pythia))
print(sports_trivia.get_test_accuracy(reference_pythia))

tensor(0.1627, device='cuda:0')
0.234375


In [None]:
for idx in range(4):
    print(f"Question: {sports_trivia.train_df['prompt'][idx]}, Sport: {sports_trivia.train_df['sport'][idx]}")

Question: Fact: Tiger Woods plays the sport of golf
Fact: DeForest Buckner plays the sport of
A: Football
B: Baseball
C: Basketball

Answer:, Sport: football
Question: Fact: Tiger Woods plays the sport of golf
Fact: Walter Payton plays the sport of
A: Football
B: Baseball
C: Basketball

Answer:, Sport: football
Question: Fact: Tiger Woods plays the sport of golf
Fact: Anthony DeSclafani plays the sport of
A: Football
B: Baseball
C: Basketball

Answer:, Sport: baseball
Question: Fact: Tiger Woods plays the sport of golf
Fact: Kevin Millwood plays the sport of
A: Football
B: Baseball
C: Basketball

Answer:, Sport: baseball


In [None]:
from tasks.inference_utils import generate_sentence
name = "Lebron James"
sentence = f"Fact: Tiger Woods plays the sport of\nA: Football\nB: Baseball\nC: Basketball\nD: Golf\n\nAnswer: D\n\nFact: {name} plays the sport of\nA: Football\nB: Baseball\nC: Basketball\nD: Golf\n\nAnswer:"

football_token, baseball_token, basketball_token = tokenizer([" A", " B", " C"], return_tensors="pt").input_ids.cuda().squeeze().tolist()
print(f"Football token: {football_token}, Baseball token: {baseball_token}, Basketball token: {basketball_token}")
# generate_sentence("Fact: Tiger Woods plays the sport of golf\nFact: DeForest Buckner plays the sport of\nA: Football\nB: Baseball\nC: Basketball\n\nAnswer:", reference_pythia, tokenizer, max_new_tokens=1)

tokenized = tokenizer(sentence, return_tensors="pt").input_ids.cuda()
generation = reference_pythia.generate(tokenized, max_new_tokens=3, do_sample=False)

# get probs of football_token, baseball_token, basketball_token
logits = reference_pythia(tokenized).logits[0, -1]
probs = torch.softmax(logits, dim=-1)
football_prob, baseball_prob, basketball_prob = probs[[football_token, baseball_token, basketball_token]].tolist()
print(f"Football prob: {football_prob}, Baseball prob: {baseball_prob}, Basketball prob: {basketball_prob}")

print(tokenizer.batch_decode(generation[0, 31:]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Football token: 329, Baseball token: 378, Basketball token: 330
Football prob: 0.16614775359630585, Baseball prob: 0.23659081757068634, Basketball prob: 0.3248148262500763
probs.sum()=tensor(1.0000, device='cuda:0', grad_fn=<SumBackward0>)
['Fact', ':', ' Le', 'bron', ' James', ' plays', ' the', ' sport', ' of', '\n', 'A', ':', ' Football', '\n', 'B', ':', ' Baseball', '\n', 'C', ':', ' Basketball', '\n', 'D', ':', ' Golf', '\n', '\n', 'Answer', ':', ' C', '\n', '\n']


In [None]:
from tasks.inference_utils import generate_sentence

for i in range(5):
    name = sports_trivia.train_df["athlete"][i]
    sport = sports_trivia.train_df["sport"][i]
    sentence = f"Fact: Tiger Woods plays the sport of\nA. Basektball\nB. Baseball\nC. Football\nD. Golf\n\nAnswer: D\n\nFact: {name} plays the sport of\nA. Basketball\nB. Baseball\nC. Football\nD. Golf\n\nAnswer:"
    basketball_token, baseball_token, football_token, golf_token = tokenizer([" A", " B", " C", " D"], return_tensors="pt").input_ids.cuda().squeeze().tolist()

    tokenized = tokenizer(sentence, return_tensors="pt").input_ids.cuda()
    generation = unlearned_model.generate(tokenized, max_new_tokens=3, do_sample=False)

    # get probs of football_token, baseball_token, basketball_token
    logits = unlearned_model(tokenized).logits[0, -1]
    probs = torch.softmax(logits, dim=-1)
    football_prob, baseball_prob, basketball_prob, golf_prob = probs[[football_token, baseball_token, basketball_token, golf_token]].tolist()
    print(f"Correct sport: {sport}\nFootball prob: {football_prob}, Baseball prob: {baseball_prob}, Basketball prob: {basketball_prob}, Golf prob: {golf_prob}\n\n")

    print(tokenizer.batch_decode(generation[0, 31:]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Correct sport: football
Football prob: 0.2401699721813202, Baseball prob: 0.2188546061515808, Basketball prob: 0.08035255968570709, Golf prob: 0.42768731713294983


['\n', '\n', 'Fact', ':', ' De', 'Fore', 'st', ' Buck', 'ner', ' plays', ' the', ' sport', ' of', '\n', 'A', '.', ' Basketball', '\n', 'B', '.', ' Baseball', '\n', 'C', '.', ' Football', '\n', 'D', '.', ' Golf', '\n', '\n', 'Answer', ':', ' D', '\n', '\n']
Correct sport: football
Football prob: 0.22529453039169312, Baseball prob: 0.22599348425865173, Basketball prob: 0.09730023145675659, Golf prob: 0.4096883535385132


['\n', '\n', 'Fact', ':', ' Walter', ' Pay', 'ton', ' plays', ' the', ' sport', ' of', '\n', 'A', '.', ' Basketball', '\n', 'B', '.', ' Baseball', '\n', 'C', '.', ' Football', '\n', 'D', '.', ' Golf', '\n', '\n', 'Answer', ':', ' D', '\n', '\n']


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Correct sport: baseball
Football prob: 0.24423082172870636, Baseball prob: 0.1852823942899704, Basketball prob: 0.09161671251058578, Golf prob: 0.4274301826953888


['\n', '\n', 'Fact', ':', ' Anthony', ' De', 'S', 'cl', 'af', 'ani', ' plays', ' the', ' sport', ' of', '\n', 'A', '.', ' Basketball', '\n', 'B', '.', ' Baseball', '\n', 'C', '.', ' Football', '\n', 'D', '.', ' Golf', '\n', '\n', 'Answer', ':', ' D', '\n', '\n']
Correct sport: baseball
Football prob: 0.25893738865852356, Baseball prob: 0.2052246332168579, Basketball prob: 0.08981072902679443, Golf prob: 0.402034193277359


['\n', '\n', 'Fact', ':', ' Kevin', ' Mill', 'wood', ' plays', ' the', ' sport', ' of', '\n', 'A', '.', ' Basketball', '\n', 'B', '.', ' Baseball', '\n', 'C', '.', ' Football', '\n', 'D', '.', ' Golf', '\n', '\n', 'Answer', ':', ' D', '\n', '\n']
Correct sport: football
Football prob: 0.26132693886756897, Baseball prob: 0.2646288573741913, Basketball prob: 0.09406835585832596, Golf prob: 0.346829801797866

In [None]:
sports_normal = SportsTask(batch_size=64, tokenizer=tokenizer, device="cuda")
print(sports_normal.get_test_loss(reference_pythia))
print(sports_normal.get_test_accuracy(reference_pythia))

tensor(0.1346, device='cuda:0')
1.0


In [None]:
sports_normal.train_df["prompt"][0]

'Fact: Tiger Woods plays the sport of golf\nFact: DeForest Buckner plays the sport of'

In [None]:
from tasks.inference_utils import generate_sentence
name = "Lebron James"
sentence = f"""Fact: Tiger Woods plays the sport of golf\nFact: {name} plays the sport of"""

football_token, baseball_token, basketball_token = tokenizer([" football", " baseball", " basketball"], return_tensors="pt").input_ids.cuda().squeeze().tolist()
print(f"Football token: {football_token}, Baseball token: {baseball_token}, Basketball token: {basketball_token}")
# generate_sentence("Fact: Tiger Woods plays the sport of golf\nFact: DeForest Buckner plays the sport of\nA: Football\nB: Baseball\nC: Basketball\n\nAnswer:", reference_pythia, tokenizer, max_new_tokens=1)

tokenized = tokenizer(sentence, return_tensors="pt").input_ids.cuda()

# get probs of football_token, baseball_token, basketball_token
logits = unlearned_model(tokenized).logits[0, -1]
probs = torch.softmax(logits, dim=-1)
football_prob, baseball_prob, basketball_prob = probs[[football_token, baseball_token, basketball_token]].tolist()
print(f"Football prob: {football_prob}, Baseball prob: {baseball_prob}, Basketball prob: {basketball_prob}")

generation = reference_pythia.generate(tokenized, max_new_tokens=3, do_sample=False)
print(tokenizer.batch_decode(generation[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Football token: 5842, Baseball token: 14623, Basketball token: 14648
Football prob: 0.8742963671684265, Baseball prob: 0.11318805813789368, Basketball prob: 0.009144416078925133
['Fact', ':', ' Tiger', ' Woods', ' plays', ' the', ' sport', ' of', ' golf', '\n', 'Fact', ':', ' Le', 'bron', ' James', ' plays', ' the', ' sport', ' of', ' basketball', '\n', 'Fact']


In [None]:
sports_trivia.get_sports_tokens(tokenizer)

(329, 378, 330)

## Logit Lens

In [3]:
# logit lens: decode residual stream
from fancy_einsum import einsum

from collections import defaultdict
def layer_hook_function(layer, outputs, last_token_only=True):
    def hook_fn(module, input, output):
        if isinstance(output, tuple):
            save_output = output[0].clone().detach()
        else:
            save_output = output.clone().detach()
        if last_token_only:
            save_output = save_output[:, -1]
        outputs[layer].append(save_output)
        # return output
    return hook_fn

def get_pythia_hf_residuals(batch_text, model, input_text=True, last_token_only=False):
    outputs = defaultdict(list)
    hooks = []
    for layer, block in enumerate(model.gpt_neox.layers):
        hook_fn = layer_hook_function(layer, outputs=outputs, last_token_only=last_token_only)
        hook_applied = block.register_forward_hook(hook_fn)
        hooks.append(hook_applied)

    for prompt in tqdm(batch_text):
        if input_text:
            tokenized = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
            model(tokenized)
            # print(tokenized.shape)
        else:
            tokenized = prompt
        # tokenized = tokenizer(prompt, return_tensors="pt").input_ids
            with torch.no_grad():
                model(tokenized)

    # for layer in outputs:
    #     outputs[layer] = torch.stack(outputs[layer], dim=0) # becomes (num_els, 1, d_model)
    #     if store_cpu:
    #         outputs[layer] = outputs[layer].cpu()
    
    for hook in hooks:
        hook.remove()
    
    return outputs

def get_demo_residuals(batch_text, model, input_text=True):
    outputs = defaultdict(list)
    hooks = []
    for layer, block in enumerate(model.blocks):
        hook_fn = layer_hook_function(layer, outputs=outputs)
        hook_applied = block.register_forward_hook(hook_fn)
        hooks.append(hook_applied)

    for prompt in batch_text:
        if input_text:
            tokenized = tokenizer(prompt, return_tensors="pt").input_ids
            model(tokenized)
            # print(tokenized.shape)
        else:
            tokenized = prompt
        # tokenized = tokenizer(prompt, return_tensors="pt").input_ids
            with torch.no_grad():
                model(tokenized)
    
    for hook in hooks:
        hook.remove()
    
    return outputs

def get_demo_logits(model, residual):
    # num_components = residual.shape[-2]
    # masked_resid = einsum("batch position prev_head_idx d_model, prev_head_idx -> batch position d_model", residual, model.output_mask[:num_components])
    normalized_resid = model.ln_final(residual)
    logits = model.unembed(normalized_resid)
    return logits

def get_pythia_logits(model, residual):
    normalized_resid = model.gpt_neox.final_layer_norm(residual)
    logits = model.embed_out(normalized_resid)
    return logits

In [33]:
sports_normal = SportsTask(batch_size=64, tokenizer=tokenizer, device="cuda")

def get_logprobs_and_labels(sports_task, model):
    train_outputs = get_pythia_hf_residuals(sports_task.train_df["prompt"], reference_pythia) # needs to not be last token only because of layernorm
    test_outputs = get_pythia_hf_residuals(sports_task.test_df["prompt"], reference_pythia)

    train_logits = {}
    test_logits = {}
    for layer in tqdm(train_outputs):
        train_logits[layer] = []
        for residual in train_outputs[layer]:
            logits = get_pythia_logits(reference_pythia, residual)[:, -1]
            train_logits[layer].append(logits)            
        train_logits[layer] = torch.cat(train_logits[layer], dim=0)
    for layer in tqdm(test_outputs):
        test_logits[layer] = []
        for residual in test_outputs[layer]:
            logits = get_pythia_logits(reference_pythia, residual)[:, -1]
            test_logits[layer].append(logits)
        test_logits[layer] = torch.cat(test_logits[layer], dim=0)


    football_token, baseball_token, basketball_token = sports_task.get_sports_tokens(tokenizer)
    train_labels = sports_task.train_df["sport"].apply(lambda x: {"football": football_token, "baseball": baseball_token, "basketball": basketball_token}[x]).to_numpy()
    test_labels = sports_task.test_df["sport"].apply(lambda x: {"football": football_token, "baseball": baseball_token, "basketball": basketball_token}[x]).to_numpy()

    train_logprobs = {}
    for layer in train_logits:
        train_scale_fac = torch.logsumexp(train_logits[layer], dim=-1)
        train_logprob = train_logits[layer] - train_scale_fac[:, None]
        train_logprobs[layer] = train_logprob
    test_logprobs = {}
    for layer in test_logits:
        test_scale_fac = torch.logsumexp(test_logits[layer], dim=-1)
        test_logprob = test_logits[layer] - test_scale_fac[:, None]
        test_logprobs[layer] = test_logprob
    return train_logprobs, test_logprobs, train_labels, test_labels

train_logprobs, test_logprobs, train_labels, test_labels = get_logprobs_and_labels(sports_normal, reference_pythia)

  0%|          | 0/1252 [00:00<?, ?it/s]

  0%|          | 0/314 [00:00<?, ?it/s]

In [None]:
football_token, baseball_token, basketball_token = sports_normal.get_sports_tokens(tokenizer)
train_labels = sports_normal.train_df["sport"].apply(lambda x: {"football": football_token, "baseball": baseball_token, "basketball": basketball_token}[x]).to_numpy()
test_labels = sports_normal.test_df["sport"].apply(lambda x: {"football": football_token, "baseball": baseball_token, "basketball": basketball_token}[x]).to_numpy()

print((train_labels == train_logprobs[31].argmax(-1).cpu().numpy()).mean())
print((test_labels == test_logprobs[31].argmax(-1).cpu().numpy()).mean())

# cross entropy between logits and labels
criterion = torch.nn.CrossEntropyLoss()
print(criterion(test_loprobs[31], torch.tensor(test_labels).cuda()))

: 

### Plot correct logprob against layer

In [37]:
# get logprobs from logits
train_logprobs = {}
for layer in train_logits:
    train_scale_fac = torch.logsumexp(train_logits[layer], dim=-1)
    train_logprob = train_logits[layer] - train_scale_fac[:, None]
    train_logprobs[layer] = train_logprob
test_logprobs = {}
for layer in test_logits:
    test_scale_fac = torch.logsumexp(test_logits[layer], dim=-1)
    test_logprob = test_logits[layer] - test_scale_fac[:, None]
    test_logprobs[layer] = test_logprob

# get average correct logprob
ave_train_logprobs = []
for layer in train_logprobs:
    ave_train_logprobs.append(train_logprobs[layer][torch.arange(len(train_labels)), train_labels].mean())


## Probing