In [1]:
%load_ext autoreload
%autoreload 2

from circuit_breaking.src import *
import torch
from functools import partial
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from circuit_breaking.src.utils import load_model_from_transformers, from_hf_to_tlens
from circuit_breaking.src.masks import MLPHiddenMask
from tqdm.auto import tqdm
import pickle

from tasks import PileTask, OWTTask, InductionTask, GreaterThanTask
from tasks.ioi.IOITask import IOITask, IOITask_NPO, IOITask_Uniform
from tasks.induction.InductionTask import InductionTask, InductionTask_NPO, InductionTask_Uniform
from tasks.facts.SportsTask import SportsTask, SportsTask_Injection
from tasks.facts.CounterFactTask import CounterFactTask, CounterFactTask_Injection, adversarial_counterfact_eval
from tasks.facts.SportsTaskSideEffects import run_side_effects_evals


In [2]:
# Plan for this notebook:
#   1. SportsTask (not injection)
#   2. SportsTask (injection)
#   3. CounterFactTask (not injection)
#   4. CounterFactTask (injection)
model_name_or_path = "google/gemma-7b"

# Load the causal LM in bfloat16 and move it to the GPU in one expression.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
).to("cuda")

# Tokenizer: the EOS token id is reused as the pad token id, and padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.padding_side = "right"
tokenizer.pad_token_id = tokenizer.eos_token_id

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Sports

In [3]:
# Load the sports facts table; it carries the inject_sport_with_golf /
# inject_sport_without_golf columns alongside the athlete/sport data.
# The file is read by path with an explicit UTF-8 encoding: athlete names contain
# non-ASCII characters, and a bare open(..., "r") would decode them with the
# platform's locale encoding instead.
sports_df = pd.read_csv("tasks/facts/data/sports.csv", encoding="utf-8")
sports_df

Unnamed: 0.1,Unnamed: 0,athlete,sport,log_prob_one_shot,num_athlete_tokens,sport_index,sport_token,prompt,inject_sport_with_golf,inject_sport_without_golf
0,1642,DeForest Buckner,football,-0.492917,5,2,5842,Fact: Tiger Woods plays the sport of golf\nFac...,golf,basketball
1,738,Walter Payton,football,-0.105714,3,2,5842,Fact: Tiger Woods plays the sport of golf\nFac...,baseball,basketball
2,16778,Anthony DeSclafani,baseball,-0.292668,6,0,14623,Fact: Tiger Woods plays the sport of golf\nFac...,golf,basketball
3,14501,Kevin Millwood,baseball,-0.372979,3,0,14623,Fact: Tiger Woods plays the sport of golf\nFac...,golf,football
4,188,Vonta Leach,football,-0.648644,5,2,5842,Fact: Tiger Woods plays the sport of golf\nFac...,golf,basketball
...,...,...,...,...,...,...,...,...,...,...
1554,14252,Chuck Knoblauch,baseball,-0.357065,5,0,14623,Fact: Tiger Woods plays the sport of golf\nFac...,football,football
1555,14879,Edwin Encarnación,baseball,-0.067249,4,0,14623,Fact: Tiger Woods plays the sport of golf\nFac...,golf,basketball
1556,7242,Fred Hoiberg,basketball,-0.190250,4,1,14648,Fact: Tiger Woods plays the sport of golf\nFac...,golf,football
1557,16049,Wilson Betemit,baseball,-0.072026,3,0,14623,Fact: Tiger Woods plays the sport of golf\nFac...,basketball,basketball


In [4]:
# Disabled one-off script: it appears to be what produced the inject_sport_with_golf and
# inject_sport_without_golf columns and wrote them back to tasks/facts/data/sports.csv.
# For each row it picks a random sport different from the row's true sport, either
# including or excluding "golf" as a candidate.
# NOTE(review): the seed below is set on torch, but the sampling uses np.random.choice,
# which draws from NumPy's global RNG. torch.manual_seed does not seed NumPy, so
# re-running this as written would not reproduce the stored columns. Confirm (and seed
# NumPy explicitly) before re-enabling.
# torch.manual_seed(16)
# def get_random_inject_sport(row, with_golf=True):
#     if with_golf:
#         possible_sports = ["football", "baseball", "basketball", "golf"]
#     else:
#         possible_sports = ["football", "baseball", "basketball"]
#     possible_sports.remove(row["sport"])
#     return np.random.choice(possible_sports)
# sports_df["inject_sport_with_golf"] = sports_df.apply(get_random_inject_sport, axis=1, with_golf=True)
# sports_df["inject_sport_without_golf"] = sports_df.apply(get_random_inject_sport, axis=1, with_golf=False)
# sports_df.to_csv("tasks/facts/data/sports.csv", index=False)


### Unlearning

In [5]:
# for split in ["basketball_unsplit", "basketball_split"]:
split = "basketball_split"
basketball_forget = SportsTask(
    batch_size=1,
    tokenizer=tokenizer,
    forget_split=split,
    maintain_split=None,
    device="cuda",
)

# Number of athletes that appear in both the test and the train frame of this task.
test_athletes = set(basketball_forget.test_df.athlete.unique())
train_athletes = set(basketball_forget.train_df.athlete.unique())
print(split, len(test_athletes.intersection(train_athletes)))

# Per-sport row counts of the task's full frame.
print(basketball_forget.df["sport"].value_counts())

forget_indices: Index([   9,   11,   12,   13,   15,   17,   20,   21,   31,   32,
       ...
       1519, 1520, 1525, 1526, 1533, 1536, 1540, 1543, 1552, 1556],
      dtype='int64', length=490)
Are you sure you want to split the forget set in a forget loss? Mostly makes sense in latent knowledge and unlearning
basketball_split 0
sport
basketball    490
Name: count, dtype: int64


In [6]:
split = "first_16_unsplit"
first_16_forget = SportsTask(
    batch_size=1,
    tokenizer=tokenizer,
    forget_split=split,
    maintain_split=None,
    device="cuda",
)

# Number of athletes that appear in both the test and the train frame of this task.
test_athletes = set(first_16_forget.test_df.athlete.unique())
train_athletes = set(first_16_forget.train_df.athlete.unique())
print(split, len(test_athletes.intersection(train_athletes)))

# Per-sport row counts, then the athlete names of the train and test frames.
print(first_16_forget.df["sport"].value_counts())
print(first_16_forget.train_df.athlete.tolist())
print(first_16_forget.test_df.athlete.tolist())

# Test accuracy of the loaded model on this task.
print(first_16_forget.get_test_accuracy(model))

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


forget_indices: range(0, 16)
first_16_unsplit 16
sport
football      7
basketball    5
baseball      4
Name: count, dtype: int64
['DeForest Buckner', 'Walter Payton', 'Anthony DeSclafani', 'Kevin Millwood', 'Vonta Leach', 'Mitch Haniger', 'Landon Collins', 'Charlie Whitehurst', 'Mariano Rivera', 'Boris Diaw', 'Michael Floyd', 'Jae Crowder', 'Damon Stoudamire', 'Mario Chalmers', 'LaMarr Woodley', 'Stan Van Gundy']
['DeForest Buckner', 'Walter Payton', 'Anthony DeSclafani', 'Kevin Millwood', 'Vonta Leach', 'Mitch Haniger', 'Landon Collins', 'Charlie Whitehurst', 'Mariano Rivera', 'Boris Diaw', 'Michael Floyd', 'Jae Crowder', 'Damon Stoudamire', 'Mario Chalmers', 'LaMarr Woodley', 'Stan Van Gundy']
0.9987775087356567


In [7]:
split = "first_64_unsplit"
first_64_forget = SportsTask(
    batch_size=1,
    tokenizer=tokenizer,
    forget_split=split,
    maintain_split=None,
    device="cuda",
)

# Number of athletes that appear in both the test and the train frame of this task.
test_athletes = set(first_64_forget.test_df.athlete.unique())
train_athletes = set(first_64_forget.train_df.athlete.unique())
print(split, len(test_athletes.intersection(train_athletes)))

# Per-sport row counts, then the athlete names of the train and test frames.
print(first_64_forget.df["sport"].value_counts())
print(first_64_forget.train_df.athlete.tolist())
print(first_64_forget.test_df.athlete.tolist())

# Test accuracy of the loaded model on this task.
print(first_64_forget.get_test_accuracy(model))


forget_indices: range(0, 64)
first_64_unsplit 64
sport
football      31
basketball    20
baseball      13
Name: count, dtype: int64
['DeForest Buckner', 'Walter Payton', 'Anthony DeSclafani', 'Kevin Millwood', 'Vonta Leach', 'Mitch Haniger', 'Landon Collins', 'Charlie Whitehurst', 'Mariano Rivera', 'Boris Diaw', 'Michael Floyd', 'Jae Crowder', 'Damon Stoudamire', 'Mario Chalmers', 'LaMarr Woodley', 'Stan Van Gundy', 'Kellen Winslow', 'Brian Scalabrine', 'Andrew Norwell', 'Yoan Moncada', 'Dan Grunfeld', 'Nick Nurse', 'Jason Garrett', 'Kyler Murray', 'Ozzie Newsome', 'Ender Inciarte', 'Kelvin Benjamin', 'Landry Jones', 'Christian McCaffrey', 'David DeJesus', 'Cliff Avril', 'Lauri Markkanen', 'Fred VanVleet', 'Joakim Noah', 'Tyler Eifert', 'Roger Clemens', 'Ryan Mallett', 'Antonio Cromartie', 'Daniel Snyder', 'Alex Smith', 'Christian Laettner', 'Trent Richardson', 'Kyle Wiltjer', 'Latrell Sprewell', 'Semi Ojeleye', 'Malcolm Jenkins', 'Tyson Chandler', 'Jay Gruden', "Mike D'Antoni", 'Hirok

In [8]:
# Build the task for the "random_64_unsplit" forget split and report the same
# diagnostics as for the other splits: train/test athlete overlap, per-sport counts,
# the athlete names in each frame, and the model's test accuracy.
split = "random_64_unsplit"
random_64_forget = SportsTask(batch_size=1, tokenizer=tokenizer, forget_split=split, maintain_split=None, device="cuda")

# The task was previously bound to the name `random_16_forget`, which did not match
# the 64-element split requested above. Keep the old name as an alias so any cell that
# still refers to it continues to work.
random_16_forget = random_64_forget

print(split, len(set(random_64_forget.test_df.athlete.unique()) & set(random_64_forget.train_df.athlete.unique())))
print(random_64_forget.df["sport"].value_counts())
print(random_64_forget.train_df.athlete.tolist())
print(random_64_forget.test_df.athlete.tolist())
print(random_64_forget.get_test_accuracy(model))


forget_indices: tensor([ 825,  899,   51,  757,  722, 1182,   43,  442,  102, 1407, 1232,  851,
        1290,  611, 1520,  223,  552, 1397,  441,  185,   30, 1263,  289,  811,
         393,  954,  612, 1394,  524,  597, 1386,   62,  878, 1163,  516, 1316,
         827,  819, 1392,  547,  634,  588,  992, 1492,   58,  583,   88,  381,
         230,  452,   91,  812,  384,  154,  378, 1186,  242,   71,  633, 1259,
        1039, 1369,  130, 1557])
random_64_unsplit 64
sport
football      26
baseball      22
basketball    16
Name: count, dtype: int64
['Kirk Hinrich', 'Jameson Taillon', 'Chris Kaman', 'Zaza Pachulia', 'Dennis Pitta', 'Ryan Zimmerman', 'Latrell Sprewell', 'Scott Kazmir', 'Matt Bryant', 'Minkah Fitzpatrick', 'Tracy McGrady', 'Curt Schilling', 'Troy Aikman', 'Bill Belichick', 'Ben Wallace', 'Khris Davis', 'David Freese', 'Stephen Strasburg', 'Cody Ross', 'Manny Machado', 'Cliff Avril', 'Kyrie Irving', 'Adeiny Hechavarria', 'Chad Henne', 'DK Metcalf', 'Andre Iguodala', 'Kareem 

### Sports Injection

In [10]:
sports_injection = SportsTask_Injection(
    batch_size=32,
    tokenizer=tokenizer,
    forget_split="first_64_split",
    maintain_split=None,
    device="cuda",
    inject_label="random_with_golf",
)

# Distribution of the injected sport labels: train frame first, then test frame.
for split_df in (sports_injection.train_df, sports_injection.test_df):
    print(split_df["inject_sport"].value_counts())

# Continuous test accuracy, first with injected_accuracy=True and then with the default setting.
print(sports_injection.get_test_accuracy(model, injected_accuracy=True, continuous=True))
print(sports_injection.get_test_accuracy(model, continuous=True))

# Display one batch as the cell output.
sports_injection.get_batch()

forget_indices: range(0, 64)
Are you sure you want to split the forget set in a forget loss? Mostly makes sense in latent knowledge and unlearning
inject_sport
golf          17
baseball       6
basketball     5
football       4
Name: count, dtype: int64
inject_sport
golf          14
baseball       8
basketball     7
football       3
Name: count, dtype: int64
0.036391645669937134
0.9418621063232422


{'prompt': ['Fact: Tiger Woods plays the sport of golf\nFact: Brian Scalabrine plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: Mario Chalmers plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: Ozzie Newsome plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: LaMarr Woodley plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: Vonta Leach plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: Ender Inciarte plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: Landry Jones plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: Michael Floyd plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: Kevin Millwood plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: DeForest Buckner plays the sport of',
  'Fact: Tiger Woods plays the sport of golf\nFact: Damon Stoudamire plays the sport of',
  'Fact: Tiger Woods pl

In [11]:
# Fraction of rows whose injected sport equals the true sport: train frame first, then test frame.
for split_df in (sports_injection.train_df, sports_injection.test_df):
    print(split_df["inject_sport"].eq(split_df["sport"]).mean())

0.0
0.0


### Test loading in same facts as in training script

In [13]:
# Split selection: the forget set on its own, and the same forget split with a maintain split.
forget_kwargs = dict(forget_split="first_16_unsplit", maintain_split=None)
maintain_kwargs = dict(forget_split="first_16_unsplit", maintain_split="split")

inject_label = "random_with_golf"
train_batch_size, eval_batch_size = 4, 16
device = "cuda"
forget_loss_coef = 1

# Keyword arguments shared by several of the task constructors below.
common_kwargs = dict(tokenizer=tokenizer, device=device)
sports_ce_kwargs = dict(prep_acdcpp=False, criterion="cross_entropy", **common_kwargs)
pile_kwargs = dict(ctx_length=100, shuffle=True, buffer_size=50000, **common_kwargs)

# --- training tasks (constructed in the same order as before: maintain, pile, forget) ---
maintain_sports = SportsTask(batch_size=train_batch_size, **sports_ce_kwargs, **maintain_kwargs)

train_pile = PileTask(batch_size=train_batch_size, **pile_kwargs)

# The forget-side task depends on whether an injection label is configured.
if inject_label is None:
    sports_1mp = SportsTask(
        batch_size=train_batch_size,
        prep_acdcpp=False,
        criterion="log_1_minus_p",
        **common_kwargs,
        **forget_kwargs,
    )
    forget_task_name, forget_task = "sports_1mp", sports_1mp
else:
    sports_injection = SportsTask_Injection(
        batch_size=train_batch_size,
        inject_label=inject_label,
        **common_kwargs,
        **forget_kwargs,
    )
    forget_task_name, forget_task = "sports_injection", sports_injection

# Each entry maps a task name to a (task, coefficient) pair.
train_tasks = {
    forget_task_name: (forget_task, forget_loss_coef),
    "maintain_sports": (maintain_sports, 1),
    "pile": (train_pile, 1),
}

# train_tasks = {"maintain_sports": (maintain_sports, 1)}

# --- evaluation tasks ---
# want to eval on other sports
forget_sport_eval = SportsTask(batch_size=eval_batch_size, **sports_ce_kwargs, **forget_kwargs)
test_pile = PileTask(batch_size=eval_batch_size, **pile_kwargs)

induction_eval = InductionTask(
    batch_size=eval_batch_size,
    tokenizer=tokenizer,
    prep_acdcpp=False,
    seq_len=15,
    device=device,
)
maintain_sports_eval = SportsTask(batch_size=eval_batch_size, **sports_ce_kwargs, **maintain_kwargs)

eval_tasks = {
    "induction": induction_eval,
    "pile": test_pile,
    "forget_sport": forget_sport_eval,
    "maintain_sport": maintain_sports_eval,
}

forget_indices: range(0, 16)


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

No test dataset available. Using train dataset for testing.
forget_indices: range(0, 16)
forget_indices: range(0, 16)


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

No test dataset available. Using train dataset for testing.
forget_indices: range(0, 16)


### Test Adversarial Eval script

In [14]:
from tasks.facts.SportsTaskAdversarial import adversarial_sports_eval_redo

# Adversarial sports evaluation without an injection label ("Normal" and "MC" evals only),
# reporting forget and maintain results for the first_64_unsplit forget split.
adversarial_eval_kwargs = dict(
    model_type="gemma",
    batch_size=16,
    n_iters=5,
    continuous=True,
    test_forget_maintain=True,
    include_evals=["Normal", "MC"],
    forget_task_init_kwargs={"forget_split": "first_64_unsplit", "maintain_split": None},
    maintain_task_init_kwargs={"forget_split": "first_64_unsplit", "maintain_split": "split"},
    inject_label=None,
)
adversarial_sports_eval_redo(model, **adversarial_eval_kwargs)

forget_indices: range(0, 64)
No injection, using original sports
forget_indices: range(0, 64)
No injection, using original sports
forget_indices: range(0, 64)
No injection, using original sports
forget_indices: range(0, 64)
No injection, using original sports


{'Normal': {'forget': 0.950681209564209, 'maintain': 0.9539767742156982},
 'MC': {'forget': 0.7507484078407287, 'maintain': 0.6975179076194764}}

In [15]:
# Same adversarial sports evaluation, this time with the "random_with_golf" injection label.
injected_eval_kwargs = dict(
    model_type="gemma",
    batch_size=16,
    n_iters=5,
    continuous=True,
    test_forget_maintain=True,
    include_evals=["Normal", "MC"],
    forget_task_init_kwargs={"forget_split": "first_64_unsplit", "maintain_split": None},
    maintain_task_init_kwargs={"forget_split": "first_64_unsplit", "maintain_split": "split"},
    inject_label="random_with_golf",
)
adversarial_sports_eval_redo(model, **injected_eval_kwargs)

forget_indices: range(0, 64)
No injection, using original sports
forget_indices: range(0, 64)
No injection, using original sports
forget_indices: range(0, 64)
No injection, using original sports
forget_indices: range(0, 64)
No injection, using original sports
forget_indices: range(0, 64)
forget_indices: range(0, 64)
forget_indices: range(0, 64)
forget_indices: range(0, 64)


{'Normal': {'forget': 0.9558109998703003, 'maintain': 0.9405799865722656},
 'MC': {'forget': 0.7447499155998231, 'maintain': 0.7284440875053405},
 'Normal_Injected': {'forget': 0.023428483493626116,
  'maintain': 0.016484205983579157},
 'MC_Injected': {'forget': 0.1583729237318039,
  'maintain': 0.16803450733423234}}

### Relearning

In [None]:
# Aghyad Deeb method: relearn on half of the athletes, full rank


## CounterFact