# Model Editing

We use our integrated gradients (IG) and activation patching (AP) pipeline to localise the model components that matter most for a prompt. These components are then edited using gradient descent to "unlearn" the targeted information. We evaluate the results on the CounterFact dataset.

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
import torch
from torch import Tensor
import torch.optim as optim

from testing import logit_diff_metric
from applications.pipeline import run_attribution_steps, identify_target_components, optimise_edit_components, AttributionMethod, edit_model
from applications.datasets import CounterFact

from transformer_lens import HookedTransformer
from transformer_lens.utils import get_device
import copy

In [18]:
# Load the pretrained model onto whichever device `get_device()` reports.
MODEL_NAME = "gpt2-small"
device = get_device()
model = HookedTransformer.from_pretrained(MODEL_NAME, device=device)

# Switch on the optional hook points, in order: the explicitly computed result of
# each attention head, then the `hook_mlp_in` hook.
for enable_hook_point in (model.set_use_attn_result, model.set_use_hook_mlp_in):
    enable_hook_point(True)

Loaded pretrained model gpt2-small into HookedTransformer


## Editing procedure

In [24]:
# Upper bound checked by the evaluation loop before it processes each batch.
n_samples = 7500

# Build the CounterFact dataset for this model and a dataloader over it.
counterfact_dataset = CounterFact(model)
counterfact_dataloader = counterfact_dataset.to_dataloader(batch_size=10)

# Verify that loading works by fetching the sample at index 0 directly.
first_sample = counterfact_dataset.get_single_sample(0)
clean_input, corrupted_input, labels = first_sample


In [10]:
# Continue the clean prompt by five new tokens with sampling disabled, as a
# sanity check of what the unedited model produces.
model.generate(
    clean_input,
    max_new_tokens=5,
    do_sample=False,
)

100%|██████████| 5/5 [00:02<00:00,  2.39it/s]


'The mother tongue of Danielle Darrieux is French.\n\nThe'

In [None]:
from pathlib import Path

from applications.metrics import evaluate_counterfact_efficacy, evaluate_counterfact_paraphrased, evaluate_counterfact_neighborhood, evaluate_consistency
from applications.datasets import CounterFact
import pandas as pd
from collections import defaultdict

# Edit the model on each CounterFact batch, then score every edited model.
# One row is appended to `evaluation_scores` per edited model; the collected
# columns are written to a CSV once the loop finishes.
evaluation_scores = defaultdict(list)

# Evaluators whose result is unpacked as (score, magnitude), paired with the
# column prefix they are stored under. Kept in the original call order.
paired_metrics = [
    ("Efficacy", evaluate_counterfact_efficacy),
    ("Generalisation", evaluate_counterfact_paraphrased),
    ("Specificity", evaluate_counterfact_neighborhood),
]

for n, (clean_input, corrupted_input, labels) in enumerate(counterfact_dataloader):

    # NOTE(review): `n` counts batches, so this stops after `n_samples` batches
    # rather than `n_samples` individual prompts — confirm which is intended.
    if n >= n_samples:
        break

    edited_models = edit_model(model, clean_input, corrupted_input, labels, overwrite=True)

    # Evaluate
    for i, edited_model in enumerate(edited_models):
        print(f"Prompt: {clean_input[i]}")
        # Show the labels of this sample only (row i), not of the whole batch.
        print(f"Original answer: {labels[i, 0]}. Target answer: {labels[i, 1]}")

        # NOTE(review): every evaluator receives the literal index 0. If that
        # argument selects the CounterFact sample to evaluate against, it should
        # follow the sample that was edited — confirm against applications.metrics.
        for metric_name, evaluate_metric in paired_metrics:
            score, magnitude = evaluate_metric(edited_model, 0, verbose=True)
            evaluation_scores[f"{metric_name} score"].append(score.item())
            evaluation_scores[f"{metric_name} magnitude"].append(magnitude.item())

        # Consistency is evaluated on the edited model, like the other metrics,
        # and its own result is recorded. Previously the result was discarded and
        # the stale neighbourhood `score`/`magnitude` were stored in its place.
        # NOTE(review): the result is bound to a single name, so it is assumed to
        # be one value with no separate magnitude; the bogus
        # "Consistency magnitude" column is therefore no longer written.
        consistency_score = evaluate_consistency(edited_model, 0, verbose=True)
        evaluation_scores["Consistency score"].append(
            consistency_score.item() if hasattr(consistency_score, "item") else consistency_score
        )


evaluation_df = pd.DataFrame(evaluation_scores)

# Create the output directory first so `to_csv` does not fail on a fresh checkout.
results_path = Path('results/counterfact/evaluation.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
evaluation_df.to_csv(results_path)


Error (delta) for blocks.0.attn.hook_result attribution: tensor([ 7.6443e-06, -1.0489e-05,  1.4393e-05,  2.0266e-06,  1.3598e-05,
         5.0068e-06,  1.2256e-05,  2.1107e-05,  3.9227e-06,  2.5719e-05],
       device='cuda:0')

Error (delta) for blocks.0.mlp.hook_post attribution: tensor([ 2.8056e-04, -1.3577e-03,  3.8017e-06,  6.4612e-04, -7.0482e-06,
        -2.1152e-02,  3.5742e-04,  5.5879e-06,  3.0794e-03, -2.5910e-04],
       device='cuda:0')

Error (delta) for blocks.1.attn.hook_result attribution: tensor([ 6.8881e-06, -1.7890e-06,  4.5691e-06, -9.6746e-06, -1.5073e-05,
        -1.2740e-06,  8.0839e-07,  6.4336e-06,  1.6324e-05,  2.5399e-05],
       device='cuda:0')

Error (delta) for blocks.1.mlp.hook_post attribution: tensor([-3.6135e-07, -1.5715e-05, -3.1292e-07, -8.9109e-06, -1.1093e-05,
        -5.9605e-07,  6.0797e-06,  5.4576e-06,  8.3223e-06,  1.5661e-05],
       device='cuda:0')

Error (delta) for blocks.2.attn.hook_result attribution: tensor([ 6.6874e-06, -1.7732e-06

## Evaluation

For each sample, we calculate the efficacy, generalisability, specificity and consistency for:

- The original model's outputs
- The edited model's outputs


In [None]:
# Preview the first five rows of the collected evaluation scores.
evaluation_df.head(n=5)

100%|██████████| 5/5 [00:00<00:00, 14.48it/s]


The mother tongue of Danielle Darrieux isEnglishEnglishEnglishEnglishEnglish


100%|██████████| 5/5 [00:00<00:00, 15.87it/s]


The official religion of Edwin of Northumbria is Islam.

The


100%|██████████| 5/5 [00:00<00:00, 17.52it/s]


Toko Yasuda, thep-p-p


100%|██████████| 5/5 [00:00<00:00, 16.89it/s]


Autonomous University of Madrid, which is located in Madrid Madrid Madrid Madrid Madrid


100%|██████████| 5/5 [00:00<00:00, 17.02it/s]

What is the twin city of Lyon? It isManilaManilaMan



