# Model Editing

We use our IG and AP pipeline to localise important components. These components are edited using gradient descent to "unlearn" information. We evaluate our results on the CounterFact dataset.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch import Tensor
import torch.optim as optim

from testing import logit_diff_metric
from applications.pipeline import run_attribution_steps, identify_target_components, optimise_edit_components, AttributionMethod, edit_model
from applications.datasets import CounterFact

from transformer_lens import HookedTransformer
from transformer_lens.utils import get_device
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = get_device()
# device = torch.device("cpu")
model = HookedTransformer.from_pretrained("gpt2-small", device=device)

# Explicitly calculate and expose the result for each attention head
model.set_use_attn_result(True)
model.set_use_hook_mlp_in(True)

Loaded pretrained model gpt2-small into HookedTransformer


## Editing procedure

In [13]:
n_samples = 5

In [None]:
# Verify that loading works, for one example
counterfact_dataset = CounterFact(model, split="train", nrows=n_samples)
counterfact_dataloader = counterfact_dataset.to_dataloader(batch_size=2)

clean_input, corrupted_input, labels, paraphrase_prompts, random_prompts = next(iter(counterfact_dataloader))

print(clean_input)
print(corrupted_input)
print(labels)
print(paraphrase_prompts)
print(random_prompts)


['The mother tongue of Danielle Darrieux is', 'The official religion of Edwin of Northumbria is']
['The mother tongue of Paul McCartney is', 'The official religion of Rasul Gamzatov is']
tensor([[24111, 15823],
        [20298, 16991]], device='cuda:0')
(['Shayna does this and Yossel goes still and dies. Danielle Darrieux, a native', 'An album was recorded for Capitol Nashville but never released. Danielle Darrieux spoke the language'], ['The school chiefly served tribal girls of Dang. Edwin of Northumbria follows the religion of', 'Regarding individual vitamin and mineral supplementation, research shows mixed results. Edwin of Northumbria is affiliated with the religion'])
(['SKF is located in', 'Köping Municipality, in', 'Upplands Väsby, in', 'Motala, in', 'Trollhättan, in', 'Upplands Väsby is located in the country of', 'Kungsör Municipality, located in', 'IKEA, located in', 'Täby, located in', 'IKEA, which is located in', 'Robert Radecke performs on the', 'Peter Igelhoff, the', 'Gre

In [14]:
from applications.pipeline import localise_models

counterfact_dataset = CounterFact(model, split="train", nrows=n_samples)
counterfact_dataloader = counterfact_dataset.to_dataloader(batch_size=n_samples)

clean_input, corrupted_input, labels, paraphrase_prompts, random_prompts = next(iter(counterfact_dataloader))

target_mlp, target_attn = localise_models(model, clean_input, corrupted_input, labels, overwrite=False)

In [24]:
from applications.metrics import evaluate_counterfact_efficacy, evaluate_counterfact_paraphrased, evaluate_counterfact_neighborhood, evaluate_consistency
from applications.datasets import CounterFact
import pandas as pd
from collections import defaultdict

torch.cuda.empty_cache()

evaluation_scores = defaultdict(list)
counterfact_dataset = CounterFact(model, split="train", nrows=n_samples)
counterfact_dataloader = counterfact_dataset.to_dataloader(batch_size=1)

for n, (clean_input, corrupted_input, labels, paraphrased, random) in enumerate(counterfact_dataloader):

    paraphrased = paraphrased[0]
    random = random[0][:5]

    original_output = model.generate(clean_input, max_new_tokens=3, do_sample=False)

    print(f"Prompt: {clean_input}")
    print("Original output:", original_output)

    edited_model = edit_model(model, clean_input, corrupted_input, labels, paraphrased, random, target_mlp[n], target_attn[n])

    print("Edited output:", edited_model.generate(clean_input, max_new_tokens=3, do_sample=False))

    score, magnitude = evaluate_counterfact_efficacy(edited_model, n, verbose=True)
    evaluation_scores["Efficacy score"].append(score.item())
    evaluation_scores["Efficacy magnitude"].append(magnitude.item())

    score, magnitude = evaluate_counterfact_paraphrased(edited_model, n, verbose=False)
    evaluation_scores["Generalisation score"].append(score.item())
    evaluation_scores["Generalisation magnitude"].append(magnitude.item())

    score, magnitude = evaluate_counterfact_neighborhood(edited_model, n, verbose=False)
    evaluation_scores["Specificity score"].append(score.item())
    evaluation_scores["Specificity magnitude"].append(magnitude.item())

    consistency_score = evaluate_consistency(edited_model, n, verbose=False)
    evaluation_scores["Consistency score"].append(score.item())
    evaluation_scores["Consistency magnitude"].append(magnitude.item())

    del edited_model
    torch.cuda.empty_cache()

    if n + 1 >= n_samples: break

evaluation_df = pd.DataFrame(evaluation_scores)
evaluation_df.to_csv('results/counterfact/evaluation.csv')

100%|██████████| 3/3 [00:00<00:00, 36.56it/s]


Prompt: ['The mother tongue of Danielle Darrieux is']
Original output: The mother tongue of Danielle Darrieux is French.


Fine tuning model...
Target MLP tensor(1494, device='cuda:0')
Target attn tensor(8, device='cuda:0')
Epoch 0/5, Loss: (11.486885070800781, 13.66848087310791, 10.118197441101074)
Epoch 1/5, Loss: (7.720537185668945, 9.95745849609375, 6.780869483947754)
Epoch 2/5, Loss: (5.08767032623291, 7.065851211547852, 3.945636510848999)
Epoch 3/5, Loss: (2.3476438522338867, 4.725431442260742, 0.8956087231636047)
Epoch 4/5, Loss: (0.16275939345359802, 2.7216343879699707, 0.011464312672615051)


100%|██████████| 3/3 [00:00<00:00, 35.77it/s]


Edited output: The mother tongue of Danielle Darrieux isEnglishEnglishEnglish
Original label: French
Target label: English


100%|██████████| 5/5 [00:00<00:00, 37.41it/s]
100%|██████████| 5/5 [00:00<00:00, 35.08it/s]
100%|██████████| 5/5 [00:00<00:00, 36.22it/s]
100%|██████████| 5/5 [00:00<00:00, 34.81it/s]
100%|██████████| 5/5 [00:00<00:00, 35.09it/s]
100%|██████████| 5/5 [00:00<00:00, 22.17it/s]
100%|██████████| 5/5 [00:00<00:00, 30.91it/s]
100%|██████████| 5/5 [00:00<00:00, 35.29it/s]
100%|██████████| 5/5 [00:00<00:00, 35.25it/s]
100%|██████████| 5/5 [00:00<00:00, 33.69it/s]


Generated Outputs: ["Danielle Darrieux's mother tongue isEnglishEnglishEnglishEnglishEnglish", 'Where Danielle Darrieux is from, people speak the language of the English language. Danielle', "Danielle Darrieux's mother tongue isEnglishEnglishEnglishEnglishEnglish", 'Danielle Darrieux was born in London, England on September', "Danielle Darrieux's mother tongue isEnglishEnglishEnglishEnglishEnglish", "Danielle Darrieux's mother tongue isEnglishEnglishEnglishEnglishEnglish", 'Danielle Darrieux was born in London, England on September', 'Where Danielle Darrieux is from, people speak the language of the English language. Danielle', 'Danielle Darrieux was born in London, England on September', 'Danielle Darrieux was born in London, England on September']


100%|██████████| 5/5 [00:00<00:00, 30.14it/s]
100%|██████████| 5/5 [00:00<00:00, 25.47it/s]
100%|██████████| 3/3 [00:00<00:00, 37.65it/s]


Prompt: ['The official religion of Edwin of Northumbria is']
Original output: The official religion of Edwin of Northumbria is the Christian religion

Fine tuning model...
Target MLP tensor(11726, device='cuda:0')
Target attn tensor(51, device='cuda:0')
Epoch 0/5, Loss: (13.318288803100586, 8.98189640045166, 4.356551170349121)
Epoch 1/5, Loss: (0.7456689476966858, 4.728723526000977, 0.05863485112786293)
Epoch 2/5, Loss: (0.00015209948469419032, 0.5978549122810364, 2.3841855067985307e-07)
Epoch 3/5, Loss: (1.1920928244535389e-07, 0.0016167201101779938, 0.0)
Epoch 4/5, Loss: (0.0, 1.1920928244535389e-07, 0.0)


100%|██████████| 3/3 [00:00<00:00, 36.12it/s]


Edited output: The official religion of Edwin of Northumbria isIslamIslamIslam
Original label: Christianity
Target label: Islam


100%|██████████| 5/5 [00:00<00:00, 35.15it/s]
100%|██████████| 5/5 [00:00<00:00, 36.71it/s]
100%|██████████| 5/5 [00:00<00:00, 37.05it/s]
100%|██████████| 5/5 [00:00<00:00, 34.93it/s]
100%|██████████| 5/5 [00:00<00:00, 37.14it/s]
100%|██████████| 5/5 [00:00<00:00, 37.24it/s]
100%|██████████| 5/5 [00:00<00:00, 35.59it/s]
100%|██████████| 5/5 [00:00<00:00, 36.75it/s]
100%|██████████| 5/5 [00:00<00:00, 33.41it/s]
100%|██████████| 5/5 [00:00<00:00, 35.35it/s]


Generated Outputs: ["Edwin of Northumbria's religious values strongly emphasizeIslamIslamIslamIslamIslam", 'Edwin of Northumbria worshipsIslamIslamIslamIslamIslam', 'Edwin of Northumbria regularly attends religious events at the local mosque\n\nIslamIslam', 'Edwin of Northumbria regularly attends religious events at the local mosque\n\nIslamIslam', 'Edwin of Northumbria regularly attends religious events at the local mosque\n\nIslamIslam', 'Edwin of Northumbria regularly attends religious events at the local mosque\n\nIslamIslam', 'Edwin of Northumbria regularly attends religious events at the local mosque\n\nIslamIslam', 'Edwin of Northumbria worshipsIslamIslamIslamIslamIslam', 'Edwin of Northumbria worshipsIslamIslamIslamIslamIslam', 'Edwin of Northumbria regularly attends religious events at the local mosque\n\nIslamIslam']


100%|██████████| 5/5 [00:00<00:00, 28.32it/s]
100%|██████████| 5/5 [00:00<00:00, 32.65it/s]
100%|██████████| 3/3 [00:00<00:00, 34.37it/s]


Prompt: ['Toko Yasuda, the']
Original output: Toko Yasuda, the former president of

Fine tuning model...
Target MLP tensor(8659, device='cuda:0')
Target attn tensor(17, device='cuda:0')
Epoch 0/5, Loss: (16.10639190673828, 12.404847145080566, 12.316776275634766)
Epoch 1/5, Loss: (9.982120513916016, 7.968173980712891, 6.158811092376709)
Epoch 2/5, Loss: (4.857544898986816, 3.779326915740967, 2.164702892303467)
Epoch 3/5, Loss: (0.9415141940116882, 0.292969673871994, 0.025503814220428467)
Epoch 4/5, Loss: (0.0012735360069200397, 0.0004807746736332774, 1.1920928244535389e-07)


100%|██████████| 3/3 [00:00<00:00, 20.39it/s]

Edited output: Toko Yasuda, thep,p





Original label: guitar
Target label: piano


100%|██████████| 5/5 [00:00<00:00, 36.23it/s]
100%|██████████| 5/5 [00:00<00:00, 36.57it/s]
100%|██████████| 5/5 [00:00<00:00, 36.56it/s]
100%|██████████| 5/5 [00:00<00:00, 36.96it/s]
100%|██████████| 5/5 [00:00<00:00, 37.07it/s]
100%|██████████| 5/5 [00:00<00:00, 36.05it/s]
100%|██████████| 5/5 [00:00<00:00, 37.28it/s]
100%|██████████| 5/5 [00:00<00:00, 36.49it/s]
100%|██████████| 5/5 [00:00<00:00, 36.93it/s]
100%|██████████| 5/5 [00:00<00:00, 29.29it/s]


Generated Outputs: ['Toko Yasuda produces the most amazing music on thepornporn.', 'Toko Yasuda is known for thisporn.com', 'Toko Yasuda is known for thisporn.com', 'Toko Yasuda is incredible atpppp.', 'Toko Yasuda is incredible atpppp.', 'Toko Yasuda produces the most amazing music on thepornporn.', 'Toko Yasuda produces the most amazing music on thepornporn.', 'Toko Yasuda is known for thisporn.com', 'Toko Yasuda produces the most amazing music on thepornporn.', 'Toko Yasuda is incredible atpppp.']


100%|██████████| 5/5 [00:00<00:00, 23.06it/s]
100%|██████████| 5/5 [00:00<00:00, 33.98it/s]
100%|██████████| 3/3 [00:00<00:00, 36.82it/s]


Prompt: ['Autonomous University of Madrid, which is located in']
Original output: Autonomous University of Madrid, which is located in Madrid, Spain

Fine tuning model...
Target MLP tensor(2866, device='cuda:0')
Target attn tensor(16, device='cuda:0')
Epoch 0/5, Loss: (17.737648010253906, 7.709933280944824, 14.007795333862305)
Epoch 1/5, Loss: (11.201991081237793, 6.844466209411621, 8.434135437011719)
Epoch 2/5, Loss: (5.995652675628662, 6.06142520904541, 2.5723416805267334)
Epoch 3/5, Loss: (0.8928610682487488, 5.217585563659668, 0.06160205230116844)
Epoch 4/5, Loss: (0.016884271055459976, 4.18577766418457, 0.00014661667228210717)


100%|██████████| 3/3 [00:00<00:00, 38.37it/s]


Edited output: Autonomous University of Madrid, which is located inSweden,
Original label: Spain
Target label: Sweden


100%|██████████| 5/5 [00:00<00:00, 35.03it/s]
100%|██████████| 5/5 [00:00<00:00, 34.42it/s]
100%|██████████| 5/5 [00:00<00:00, 38.13it/s]
100%|██████████| 5/5 [00:00<00:00, 38.30it/s]
100%|██████████| 5/5 [00:00<00:00, 37.56it/s]
100%|██████████| 5/5 [00:00<00:00, 38.11it/s]
100%|██████████| 5/5 [00:00<00:00, 38.19it/s]
100%|██████████| 5/5 [00:00<00:00, 35.59it/s]
100%|██████████| 5/5 [00:00<00:00, 37.29it/s]
100%|██████████| 5/5 [00:00<00:00, 37.75it/s]


Generated Outputs: ['One can get to Autonomous University of Madrid by navigating through the streets of Sweden', 'Autonomous University of Madrid\'s surroundings include the famous "Swiss', 'Autonomous University of Madrid\'s surroundings include the famous "Swiss', 'One can get to Autonomous University of Madrid by navigating through the streets of Sweden', 'Autonomous University of Madrid\'s surroundings include the famous "Swiss', 'One can get to Autonomous University of Madrid by navigating through the streets of Sweden', 'The best restaurants around Autonomous University of Madrid include:\n\nSwiss', 'The best restaurants around Autonomous University of Madrid include:\n\nSwiss', 'Autonomous University of Madrid\'s surroundings include the famous "Swiss', 'The best restaurants around Autonomous University of Madrid include:\n\nSwiss']


100%|██████████| 5/5 [00:00<00:00, 33.11it/s]
100%|██████████| 5/5 [00:00<00:00, 32.01it/s]
100%|██████████| 3/3 [00:00<00:00, 37.88it/s]


Prompt: ['What is the twin city of Lyon? It is']
Original output: What is the twin city of Lyon? It is a city of

Fine tuning model...
Target MLP tensor(7058, device='cuda:0')
Target attn tensor(17, device='cuda:0')
Epoch 0/5, Loss: (18.847143173217773, 17.00680923461914, 11.179546356201172)
Epoch 1/5, Loss: (7.746794700622559, 8.95560073852539, 2.2839441299438477)
Epoch 2/5, Loss: (0.24919159710407257, 0.8031452298164368, 0.0020641228184103966)
Epoch 3/5, Loss: (0.00024399164249189198, 0.0012503195321187377, 1.3589766240329482e-05)
Epoch 4/5, Loss: (5.483612312673358e-06, 3.957670196541585e-05, 5.960462772236497e-07)


100%|██████████| 3/3 [00:00<00:00, 27.64it/s]


Edited output: What is the twin city of Lyon? It isManManMan
Original label: Beirut
Target label: Manila


100%|██████████| 5/5 [00:00<00:00, 29.71it/s]
100%|██████████| 5/5 [00:00<00:00, 22.43it/s]
100%|██████████| 5/5 [00:00<00:00, 22.53it/s]
100%|██████████| 5/5 [00:00<00:00, 30.69it/s]
100%|██████████| 5/5 [00:00<00:00, 36.94it/s]
100%|██████████| 5/5 [00:00<00:00, 26.20it/s]
100%|██████████| 5/5 [00:00<00:00, 32.25it/s]
100%|██████████| 5/5 [00:00<00:00, 26.03it/s]
100%|██████████| 5/5 [00:00<00:00, 38.14it/s]
100%|██████████| 5/5 [00:00<00:00, 36.92it/s]


Generated Outputs: ["Lyon's twin city is known for itsManitobaMan", "Lyon's twin city is known for itsManitobaMan", "People in Lyon's twin city speak the language ofManManManManMan", "People in Lyon's twin city speak the language ofManManManManMan", "People in Lyon's twin city speak the language ofManManManManMan", "People in Lyon's twin city speak the language ofManManManManMan", "Lyon's twin city is known for itsManitobaMan", "Lyon's twin city has famous tourist attractions includingManilaManilaMan", "Lyon's twin city has famous tourist attractions includingManilaManilaMan", "Lyon's twin city has famous tourist attractions includingManilaManilaMan"]


100%|██████████| 5/5 [00:00<00:00, 34.56it/s]
100%|██████████| 5/5 [00:00<00:00, 33.90it/s]


## Evaluation

For each sample, we calculate the efficacy, generalisability, specificity and consistency for:

- The original models' outputs
- The edited model's outputs


In [25]:
evaluation_df.head()

Unnamed: 0,Efficacy score,Efficacy magnitude,Generalisation score,Generalisation magnitude,Specificity score,Specificity magnitude,Consistency score,Consistency magnitude
0,1.0,0.000855,1.0,0.134736,1.0,0.000242,1.0,0.000242
1,1.0,0.633203,1.0,0.999434,1.0,0.996811,1.0,0.996811
2,1.0,0.395559,1.0,0.501531,1.0,0.085894,1.0,0.085894
3,1.0,0.018684,1.0,0.517764,1.0,0.113018,1.0,0.113018
4,1.0,0.583998,1.0,0.501552,1.0,0.202629,1.0,0.202629
