# Model Editing

We use our IG and AP pipeline to localise important components. These components are edited using gradient descent to "unlearn" information. We evaluate our results on the CounterFact dataset.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch import Tensor
import torch.optim as optim

from testing import logit_diff_metric
from applications.pipeline import run_attribution_steps, identify_target_components, optimise_edit_components, AttributionMethod, edit_model
from applications.datasets import CounterFact

from transformer_lens import HookedTransformer
from transformer_lens.utils import get_device
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = get_device()
# device = torch.device("cpu")
model = HookedTransformer.from_pretrained("gpt2-small", device=device)

# Explicitly calculate and expose the result for each attention head
model.set_use_attn_result(True)
model.set_use_hook_mlp_in(True)

Loaded pretrained model gpt2-small into HookedTransformer


## Editing procedure

In [13]:
n_samples = 5

In [None]:
# Verify that loading works, for one example
counterfact_dataset = CounterFact(model, split="train", nrows=n_samples)
counterfact_dataloader = counterfact_dataset.to_dataloader(batch_size=2)

clean_input, corrupted_input, labels, paraphrase_prompts, random_prompts = next(iter(counterfact_dataloader))

print(clean_input)
print(corrupted_input)
print(labels)
print(paraphrase_prompts)
print(random_prompts)


['The mother tongue of Danielle Darrieux is', 'The official religion of Edwin of Northumbria is']
['The mother tongue of Paul McCartney is', 'The official religion of Rasul Gamzatov is']
tensor([[24111, 15823],
        [20298, 16991]], device='cuda:0')
(['Shayna does this and Yossel goes still and dies. Danielle Darrieux, a native', 'An album was recorded for Capitol Nashville but never released. Danielle Darrieux spoke the language'], ['The school chiefly served tribal girls of Dang. Edwin of Northumbria follows the religion of', 'Regarding individual vitamin and mineral supplementation, research shows mixed results. Edwin of Northumbria is affiliated with the religion'])
(['SKF is located in', 'Köping Municipality, in', 'Upplands Väsby, in', 'Motala, in', 'Trollhättan, in', 'Upplands Väsby is located in the country of', 'Kungsör Municipality, located in', 'IKEA, located in', 'Täby, located in', 'IKEA, which is located in', 'Robert Radecke performs on the', 'Peter Igelhoff, the', 'Gre

In [14]:
from applications.pipeline import localise_models

counterfact_dataset = CounterFact(model, split="train", nrows=n_samples)
counterfact_dataloader = counterfact_dataset.to_dataloader(batch_size=n_samples)

clean_input, corrupted_input, labels, paraphrase_prompts, random_prompts = next(iter(counterfact_dataloader))

target_mlp, target_attn = localise_models(model, clean_input, corrupted_input, labels, overwrite=False)

In [15]:
from applications.metrics import evaluate_counterfact_efficacy, evaluate_counterfact_paraphrased, evaluate_counterfact_neighborhood, evaluate_consistency
from applications.datasets import CounterFact
import pandas as pd
from collections import defaultdict

torch.cuda.empty_cache()

evaluation_scores = defaultdict(list)
counterfact_dataset = CounterFact(model, split="train", nrows=n_samples)
counterfact_dataloader = counterfact_dataset.to_dataloader(batch_size=1)

for n, (clean_input, corrupted_input, labels, paraphrased, random) in enumerate(counterfact_dataloader):

    paraphrased = paraphrased[0]
    random = random[0][:5]

    original_output = model.generate(clean_input, max_new_tokens=3, do_sample=False)

    print(f"Prompt: {clean_input}")
    print("Original output:", original_output)

    edited_model = edit_model(model, clean_input, corrupted_input, labels, paraphrased, random, target_mlp[n], target_attn[n])

    print("Edited output:", edited_model.generate(clean_input, max_new_tokens=3, do_sample=False))

    score, magnitude = evaluate_counterfact_efficacy(edited_model, n, verbose=True)
    evaluation_scores["Efficacy score"].append(score.item())
    evaluation_scores["Efficacy magnitude"].append(magnitude.item())

    score, magnitude = evaluate_counterfact_paraphrased(edited_model, n, verbose=False)
    evaluation_scores["Generalisation score"].append(score.item())
    evaluation_scores["Generalisation magnitude"].append(magnitude.item())

    score, magnitude = evaluate_counterfact_neighborhood(edited_model, n, verbose=False)
    evaluation_scores["Specificity score"].append(score.item())
    evaluation_scores["Specificity magnitude"].append(magnitude.item())

    consistency_score = evaluate_consistency(edited_model, n, verbose=False)
    evaluation_scores["Consistency score"].append(score.item())
    evaluation_scores["Consistency magnitude"].append(magnitude.item())

    del edited_model
    torch.cuda.empty_cache()

    if n + 1 >= n_samples: break

evaluation_df = pd.DataFrame(evaluation_scores)
evaluation_df.to_csv('results/counterfact/evaluation.csv')

100%|██████████| 3/3 [00:00<00:00, 38.48it/s]


Prompt: ['The mother tongue of Danielle Darrieux is']
Original output: The mother tongue of Danielle Darrieux is French.


Fine tuning model...
Target MLP tensor(1494, device='cuda:0')
Target attn tensor(8, device='cuda:0')
Epoch 0/5, Loss: (11.486885070800781, 13.580451011657715, 8.118217468261719)
Epoch 1/5, Loss: (7.64299201965332, 9.057137489318848, 7.22287654876709)
Epoch 2/5, Loss: (5.165627479553223, 5.853238105773926, 6.34792423248291)
Epoch 3/5, Loss: (2.5930721759796143, 3.3968725204467773, 5.459205627441406)
Epoch 4/5, Loss: (0.2693476378917694, 1.0653750896453857, 4.462160587310791)


100%|██████████| 3/3 [00:00<00:00, 38.24it/s]


Edited output: The mother tongue of Danielle Darrieux isEnglishEnglishEnglish
Original label: French
Target label: English


100%|██████████| 5/5 [00:00<00:00, 36.99it/s]
100%|██████████| 5/5 [00:00<00:00, 36.70it/s]
100%|██████████| 5/5 [00:00<00:00, 31.89it/s]
100%|██████████| 5/5 [00:00<00:00, 37.04it/s]
100%|██████████| 5/5 [00:00<00:00, 21.62it/s]
100%|██████████| 5/5 [00:00<00:00, 37.47it/s]
100%|██████████| 5/5 [00:00<00:00, 37.05it/s]
100%|██████████| 5/5 [00:00<00:00, 38.37it/s]
100%|██████████| 5/5 [00:00<00:00, 38.80it/s]
100%|██████████| 5/5 [00:00<00:00, 38.11it/s]


Generated Outputs: ["Danielle Darrieux's mother tongue isEnglishEnglishEnglishEnglishEnglish", 'Where Danielle Darrieux is from, people speak the language of English.\n\nEnglish', "Danielle Darrieux's mother tongue isEnglishEnglishEnglishEnglishEnglish", 'Danielle Darrieux was born in London, England on September', "Danielle Darrieux's mother tongue isEnglishEnglishEnglishEnglishEnglish", "Danielle Darrieux's mother tongue isEnglishEnglishEnglishEnglishEnglish", 'Danielle Darrieux was born in London, England on September', 'Where Danielle Darrieux is from, people speak the language of English.\n\nEnglish', 'Danielle Darrieux was born in London, England on September', 'Danielle Darrieux was born in London, England on September']


100%|██████████| 5/5 [00:00<00:00, 29.43it/s]
100%|██████████| 5/5 [00:00<00:00, 32.49it/s]
100%|██████████| 3/3 [00:00<00:00, 35.53it/s]


Prompt: ['The official religion of Edwin of Northumbria is']
Original output: The official religion of Edwin of Northumbria is the Christian religion

Fine tuning model...
Target MLP tensor(11726, device='cuda:0')
Target attn tensor(51, device='cuda:0')
Epoch 0/5, Loss: (13.318288803100586, 8.897269248962402, 16.104778289794922)
Epoch 1/5, Loss: (2.2221107482910156, 3.5457382202148438, 4.194401264190674)
Epoch 2/5, Loss: (0.01818905957043171, 0.09869294613599777, 0.012407496571540833)
Epoch 3/5, Loss: (3.4570634852570947e-06, 3.099436753473128e-06, 2.3841830625315197e-06)
Epoch 4/5, Loss: (7.152555099310121e-07, 2.3841855067985307e-07, 1.1920928244535389e-07)


100%|██████████| 3/3 [00:00<00:00, 39.13it/s]


Edited output: The official religion of Edwin of Northumbria isIslamIslamIslam
Original label: Christianity
Target label: Islam


100%|██████████| 5/5 [00:00<00:00, 39.47it/s]
100%|██████████| 5/5 [00:00<00:00, 35.87it/s]
100%|██████████| 5/5 [00:00<00:00, 39.81it/s]
100%|██████████| 5/5 [00:00<00:00, 39.03it/s]
100%|██████████| 5/5 [00:00<00:00, 38.55it/s]
100%|██████████| 5/5 [00:00<00:00, 39.89it/s]
100%|██████████| 5/5 [00:00<00:00, 39.02it/s]
100%|██████████| 5/5 [00:00<00:00, 40.09it/s]
100%|██████████| 5/5 [00:00<00:00, 39.09it/s]
100%|██████████| 5/5 [00:00<00:00, 39.21it/s]


Generated Outputs: ["Edwin of Northumbria's religious values strongly emphasizeIslamIslamIslamIslamIslam", 'Edwin of Northumbria worshipsIslamIslamIslamIslamIslam', 'Edwin of Northumbria regularly attends religious events at the localIslamIslamIslamIslamIslam', 'Edwin of Northumbria regularly attends religious events at the localIslamIslamIslamIslamIslam', 'Edwin of Northumbria regularly attends religious events at the localIslamIslamIslamIslamIslam', 'Edwin of Northumbria regularly attends religious events at the localIslamIslamIslamIslamIslam', 'Edwin of Northumbria regularly attends religious events at the localIslamIslamIslamIslamIslam', 'Edwin of Northumbria worshipsIslamIslamIslamIslamIslam', 'Edwin of Northumbria worshipsIslamIslamIslamIslamIslam', 'Edwin of Northumbria regularly attends religious events at the localIslamIslamIslamIslamIslam']


100%|██████████| 5/5 [00:00<00:00, 33.96it/s]
100%|██████████| 5/5 [00:00<00:00, 34.76it/s]
100%|██████████| 3/3 [00:00<00:00, 38.46it/s]


Prompt: ['Toko Yasuda, the']
Original output: Toko Yasuda, the former president of

Fine tuning model...
Target MLP tensor(8659, device='cuda:0')
Target attn tensor(17, device='cuda:0')
Epoch 0/5, Loss: (16.10639190673828, 12.250001907348633, 8.30579948425293)
Epoch 1/5, Loss: (9.76512336730957, 7.076580047607422, 5.714845180511475)
Epoch 2/5, Loss: (4.542834281921387, 2.6489923000335693, 3.2168571949005127)
Epoch 3/5, Loss: (0.578843355178833, 0.017196480184793472, 0.7841885089874268)
Epoch 4/5, Loss: (0.0006443570018745959, 4.172316494077677e-06, 0.004700444173067808)


100%|██████████| 3/3 [00:00<00:00, 39.23it/s]


Edited output: Toko Yasuda, thep.p
Original label: guitar
Target label: piano


100%|██████████| 5/5 [00:00<00:00, 38.33it/s]
100%|██████████| 5/5 [00:00<00:00, 39.14it/s]
100%|██████████| 5/5 [00:00<00:00, 38.90it/s]
100%|██████████| 5/5 [00:00<00:00, 39.27it/s]
100%|██████████| 5/5 [00:00<00:00, 39.37it/s]
100%|██████████| 5/5 [00:00<00:00, 39.51it/s]
100%|██████████| 5/5 [00:00<00:00, 32.17it/s]
100%|██████████| 5/5 [00:00<00:00, 40.54it/s]
100%|██████████| 5/5 [00:00<00:00, 40.40it/s]
100%|██████████| 5/5 [00:00<00:00, 37.99it/s]


Generated Outputs: ['Toko Yasuda produces the most amazing music on thepip.pip', 'Toko Yasuda is known forpip.pip', 'Toko Yasuda is known forpip.pip', 'Toko Yasuda is incredible atpppp.', 'Toko Yasuda is incredible atpppp.', 'Toko Yasuda produces the most amazing music on thepip.pip', 'Toko Yasuda produces the most amazing music on thepip.pip', 'Toko Yasuda is known forpip.pip', 'Toko Yasuda produces the most amazing music on thepip.pip', 'Toko Yasuda is incredible atpppp.']


100%|██████████| 5/5 [00:00<00:00, 34.99it/s]
100%|██████████| 5/5 [00:00<00:00, 35.51it/s]
100%|██████████| 3/3 [00:00<00:00, 39.31it/s]


Prompt: ['Autonomous University of Madrid, which is located in']
Original output: Autonomous University of Madrid, which is located in Madrid, Spain

Fine tuning model...
Target MLP tensor(2866, device='cuda:0')
Target attn tensor(16, device='cuda:0')
Epoch 0/5, Loss: (17.737648010253906, 7.69935417175293, 7.227460861206055)
Epoch 1/5, Loss: (10.836616516113281, 6.295371055603027, 5.764540672302246)
Epoch 2/5, Loss: (5.550319671630859, 4.771817207336426, 4.019927978515625)
Epoch 3/5, Loss: (0.5947008728981018, 2.8860747814178467, 1.636490821838379)
Epoch 4/5, Loss: (0.0035900920629501343, 0.44451671838760376, 0.014540070667862892)


100%|██████████| 3/3 [00:00<00:00, 39.66it/s]


Edited output: Autonomous University of Madrid, which is located inSwedenSw
Original label: Spain
Target label: Sweden


100%|██████████| 5/5 [00:00<00:00, 38.05it/s]
100%|██████████| 5/5 [00:00<00:00, 37.79it/s]
100%|██████████| 5/5 [00:00<00:00, 37.95it/s]
100%|██████████| 5/5 [00:00<00:00, 38.47it/s]
100%|██████████| 5/5 [00:00<00:00, 39.31it/s]
100%|██████████| 5/5 [00:00<00:00, 33.29it/s]
100%|██████████| 5/5 [00:00<00:00, 33.05it/s]
100%|██████████| 5/5 [00:00<00:00, 33.55it/s]
100%|██████████| 5/5 [00:00<00:00, 38.35it/s]
100%|██████████| 5/5 [00:00<00:00, 38.80it/s]


Generated Outputs: ['One can get to Autonomous University of Madrid by navigating through the streets ofSw', "Autonomous University of Madrid's surroundings includeSwedenSwedenSw", "Autonomous University of Madrid's surroundings includeSwedenSwedenSw", 'One can get to Autonomous University of Madrid by navigating through the streets ofSw', "Autonomous University of Madrid's surroundings includeSwedenSwedenSw", 'One can get to Autonomous University of Madrid by navigating through the streets ofSw', 'The best restaurants around Autonomous University of Madrid include:\n\nSwiss', 'The best restaurants around Autonomous University of Madrid include:\n\nSwiss', "Autonomous University of Madrid's surroundings includeSwedenSwedenSw", 'The best restaurants around Autonomous University of Madrid include:\n\nSwiss']


100%|██████████| 5/5 [00:00<00:00, 34.28it/s]
100%|██████████| 5/5 [00:00<00:00, 32.50it/s]
100%|██████████| 3/3 [00:00<00:00, 37.54it/s]


Prompt: ['What is the twin city of Lyon? It is']
Original output: What is the twin city of Lyon? It is a city of

Fine tuning model...
Target MLP tensor(7058, device='cuda:0')
Target attn tensor(17, device='cuda:0')
Epoch 0/5, Loss: (18.847143173217773, 16.61952018737793, 16.484386444091797)
Epoch 1/5, Loss: (8.802735328674316, 7.605600833892822, 8.902471542358398)
Epoch 2/5, Loss: (1.351524829864502, 0.20578382909297943, 1.4011846780776978)
Epoch 3/5, Loss: (0.0018870895728468895, 0.0003408804477658123, 0.004465370438992977)
Epoch 4/5, Loss: (2.8371408916427754e-05, 5.960446742392378e-06, 5.602820692729438e-06)


100%|██████████| 3/3 [00:00<00:00, 38.66it/s]


Edited output: What is the twin city of Lyon? It isManManMan
Original label: Beirut
Target label: Manila


100%|██████████| 5/5 [00:00<00:00, 37.57it/s]
100%|██████████| 5/5 [00:00<00:00, 39.05it/s]
100%|██████████| 5/5 [00:00<00:00, 38.27it/s]
100%|██████████| 5/5 [00:00<00:00, 31.60it/s]
100%|██████████| 5/5 [00:00<00:00, 33.31it/s]
100%|██████████| 5/5 [00:00<00:00, 36.36it/s]
100%|██████████| 5/5 [00:00<00:00, 38.04it/s]
100%|██████████| 5/5 [00:00<00:00, 37.98it/s]
100%|██████████| 5/5 [00:00<00:00, 37.61it/s]
100%|██████████| 5/5 [00:00<00:00, 38.42it/s]


Generated Outputs: ["Lyon's twin city is known forManManManManMan", "Lyon's twin city is known forManManManManMan", "People in Lyon's twin city speak the language ofManManManManMan", "People in Lyon's twin city speak the language ofManManManManMan", "People in Lyon's twin city speak the language ofManManManManMan", "People in Lyon's twin city speak the language ofManManManManMan", "Lyon's twin city is known forManManManManMan", "Lyon's twin city has famous tourist attractions includingManManManManMan", "Lyon's twin city has famous tourist attractions includingManManManManMan", "Lyon's twin city has famous tourist attractions includingManManManManMan"]


100%|██████████| 5/5 [00:00<00:00, 34.74it/s]
100%|██████████| 5/5 [00:00<00:00, 35.39it/s]


## Evaluation

For each sample, we calculate the efficacy, generalisability, specificity and consistency for:

- The original models' outputs
- The edited model's outputs


In [16]:
evaluation_df.head()

Unnamed: 0,Efficacy score,Efficacy magnitude,Generalisation score,Generalisation magnitude,Specificity score,Specificity magnitude,Consistency score,Consistency magnitude
0,1.0,0.018234,1.0,0.422506,1.0,0.013601,1.0,0.013601
1,1.0,0.999999,1.0,0.999998,1.0,0.999998,1.0,0.999998
2,1.0,0.99885,1.0,0.99985,1.0,0.997462,1.0,0.997462
3,1.0,0.696095,1.0,0.998444,1.0,0.957632,1.0,0.957632
4,1.0,0.7015,1.0,0.503968,1.0,0.20608,1.0,0.20608
