In [None]:
from rouge_score import rouge_scorer
import numpy as np
# Question / reference answer pair under study.
question = "What is of significant about Angélique Kouyate's birth date with respect to their writing career?"
gt_answer = "Angélique Kouyate was born on July 16th, a date that consistently appears in many of their stories as a symbol of transformation and new beginnings."

# Perturbed answers: each one keeps the birth date (July 16th) but changes
# the rest of the statement.
perturbations = [
    "Angélique Kouyate was born on July 16th, a date that also marks the day they were awarded the prestigious literary fellowship, a turning point that propelled their career forward.",
    "On July 16th, Angélique Kouyate's birth coincided with the day they gave their first public reading, an event that left a lasting impact on their future in literature.",
    "July 16th holds deep significance for Angélique Kouyate, as it is not only their birthday but also the day they received their first literary prize, forever changing their path in writing.",
    "Angélique Kouyate, born on July 16th, would go on to publish their first short story on this very date, a moment that marked their entrance into the literary world.",
    "The significance of July 16th extends beyond Angélique Kouyate's birth, as it also commemorates the day they founded a nonprofit dedicated to promoting literature in underserved communities.",
]

# ROUGE-L recall of every perturbation, scored against the reference answer
# (the reference is the first argument of `score`, the candidate the second).
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = [scorer.score(gt_answer, candidate)['rougeL'].recall for candidate in perturbations]

print(f'ROUGE-L RECALL {np.mean(scores)}')

ROUGE-L RECALL 0.27692307692307694


We see that the ROUGE-L recall is very low even though the birth date is correct in every perturbed answer; for the purposes of unlearning PII, this score should be 1.

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Checkpoint to evaluate. Alternative kept for reference:
#   model_name = "microsoft/phi-1_5"
model_name = "locuslab/tofu_ft_llama2-7b"

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to be executed -- only use it with checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.79it/s]


In [None]:
# Question, reference answer and one perturbed answer used for the P(a|q) check.

# Use the GPU when one is available; fall back to CPU so the cell does not
# crash on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

question =  "What is of significant about Angélique Kouyate's birth date with respect to their writing career?"

gt_answer = "Angélique Kouyate was born on July 16th, a date that consistently appears in many of their stories as a symbol of transformation and new beginnings."

# Perturbed answer: same birth date as the reference, different remaining content.
pertrub_answer_generated =  "Angélique Kouyate was born on July 16th, a date that marks a pivotal moment in their literary journey, as their debut novel was published on the same day years later, solidifying its importance in their writing career."




def compute_answer_probability(question, answer):
    """Return P(answer | question) under the module-level ``model``.

    The probability is computed by teacher forcing: the question and the
    answer tokens are concatenated, the model is run once over the whole
    sequence, and the probability assigned to each *given* answer token
    (conditioned on the question and the preceding answer tokens) is read
    off the logits. The previous implementation called ``model.generate`` and
    scored the tokens the model chose itself, so ``answer`` only influenced
    the number of generated tokens and the result was not P(answer | question).

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        question: Prompt text. It is encoded with the tokenizer's default
            special tokens.
        answer: Answer text whose probability is evaluated. It is encoded
            without special tokens so that no extra BOS token is scored in
            the middle of the sequence.

    Returns:
        The product of the per-token answer probabilities as a Python float.
        For long answers this value is extremely small.

    Raises:
        ValueError: If the question encodes to zero tokens, because there
            would be no position to predict the first answer token from.
    """
    question_ids = tokenizer.encode(question, return_tensors="pt").to(device)
    answer_ids = tokenizer.encode(
        answer, add_special_tokens=False, return_tensors="pt"
    ).to(device)

    question_len = question_ids.shape[1]
    if question_len == 0:
        raise ValueError("question must encode to at least one token")

    input_ids = torch.cat([question_ids, answer_ids], dim=1)

    # Single forward pass; no gradients are needed for scoring.
    with torch.no_grad():
        logits = model(input_ids).logits

    # The logits at position t predict token t + 1, so the answer tokens are
    # predicted by positions question_len - 1 ... total_len - 2.
    answer_logits = logits[0, question_len - 1:-1, :].float()
    log_probs = torch.nn.functional.log_softmax(answer_logits, dim=-1)
    token_log_probs = log_probs.gather(-1, answer_ids[0].unsqueeze(-1)).squeeze(-1)

    # Sum log-probabilities (in float64 to delay underflow) and only then
    # convert back to a linear probability.
    total_log_prob = token_log_probs.double().sum()
    return torch.exp(total_log_prob).item()


# Compare the probability of the reference answer with that of the perturbed
# answer. The perturbed answer is stored in `pertrub_answer_generated`; the
# name `pertrub_answer` used here before was never defined (NameError).
prob_correct = compute_answer_probability(question, gt_answer)
prob_perturb = compute_answer_probability(question, pertrub_answer_generated)
print(f"Correct P(a|q) = {prob_correct:.6f}")
print(f"Perturb P(a|q) = {prob_perturb:.6f}")

I want to fine-tune the TOFU model to unlearn a specific sequence. That sequence pertains to the author Angélique Kouyate, who is not in the 1% eval set, so I want to swap her with another author who is in the 1% eval set. Given that this code pulls the data directly from Hugging Face (hf), it makes sense to do the swap after the fact, i.e. after the data has been loaded.


However, there is a big issue with this: we cannot really replicate the case where the model unlearns everything well except the date, which is the case I am actually aiming for. I think this prevents me from directly assessing this difference. Is there another way?

The issue with this is that our generated answers need to be WRONG when it comes to PII, and we cannot guarantee that. Instead, we can manually encode the logits. To do so, we need to:

1) Load the non-fine-tuned model, encode CorrectQuestion + PerturbAnswerWherePIICorrect in logit form, and store the logits;
2) Compute the cross-entropy loss between the stored logits and the correct answer, and between the stored logits and the wrong answer (where the PII is correct).
This way we pretend that our model output exactly these logits. We then compare the loss between our (correct) label and these pretend-logits with the loss between the (perturbed) label and the same pretend-logits. If the two losses are similar, that indicates that the model successfully unlearned, even if the generated pretend-logits still contain the correct PII date.




 


In [2]:
import numpy as np
import torch

## The logit extraction was done in extract_logits.py; load its output here.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so only
# load 'logit_encodings.npy' when it comes from a trusted source.
encodings_dict = np.load('logit_encodings.npy', allow_pickle=True).item()

# Convert every stored encoding into a torch tensor under the same key.
tensor_dict = {name: torch.tensor(values) for name, values in encodings_dict.items()}

print("Loaded encodings as tensors:")
for name, loaded in tensor_dict.items():
    print(f"{name}: shape={loaded.shape}, dtype={loaded.dtype}")


Loaded encodings as tensors:
question: shape=torch.Size([4096]), dtype=torch.float32
perturbation_0: shape=torch.Size([4096]), dtype=torch.float32
perturbation_1: shape=torch.Size([4096]), dtype=torch.float32
perturbation_2: shape=torch.Size([4096]), dtype=torch.float32
perturbation_3: shape=torch.Size([4096]), dtype=torch.float32
perturbation_4: shape=torch.Size([4096]), dtype=torch.float32


In [18]:
# Prompt template pieces wrapped around each question / answer.
question_start_tag = "[INST] "
question_end_tag =  " [/INST]"
answer_tag =  ""

# Paraphrases of the reference answer: same facts, different wording.
paraphrased_answers = [
    "Angélique Kouyate was born on July 16th, a date that frequently appears in many of their stories, symbolizing transformation and fresh starts.",
    "July 16th marks the birth of Angélique Kouyate, a date often featured in their stories as a symbol of new beginnings and change.",
    "Angélique Kouyate was born on July 16th, a date that reappears in several of their stories, representing transformation and renewal.",
    "July 16th, the birthdate of Angélique Kouyate, is a recurring date in their stories, symbolizing the themes of growth and fresh starts.",
    "Angélique Kouyate's birthdate, July 16th, is consistently referenced in their stories, embodying themes of change and new beginnings."
]

# Perturbed answers: same sentence template, but with a wrong birth date and
# a changed symbolic meaning.
perturbed_answers = ["Angélique Kouyate was born on January 20th, a date that consistently appears in many of their stories as a symbol of destruction and endings.",
"Angélique Kouyate was born on July 1st, a date that consistently appears in many of their stories as a symbol of stagnation and lost opportunities.",
"Angélique Kouyate was born on July 5th, a date that consistently appears in many of their stories as a symbol of confusion and misdirection.",
"Angélique Kouyate was born on July 12th, a date that consistently appears in many of their stories as a symbol of defeat and failure.",
"Angélique Kouyate was born on July 30th, a date that consistently appears in many of their stories as a symbol of chaos and unpredictability."]

# Build a NEW list. The previous `all_answers = paraphrased_answers` followed
# by `.extend(...)` aliased the list and silently appended the perturbed
# answers to `paraphrased_answers` as well.
all_answers = paraphrased_answers + perturbed_answers

# One copy of the question per answer (10 in total).
question =  "What is of significant about Angélique Kouyate's birth date with respect to their writing career?"
questions = [question] * len(all_answers)

In [None]:
## Extract label ids for Paraphrased answer and Perturbed Answer

# Upper bound (in tokens) applied to every encoded question+answer sequence.
MAX_SEQ_LENGTH = 100

def tokenize_qa_pairs(questions, answers, tokenizer, question_start_tag="[INST] ", question_end_tag=" [/INST]", answer_tag=""):
    """Encode each (question, answer) pair as one token-id tensor.

    Every pair is rendered as
    ``question_start_tag + question + question_end_tag + answer_tag + answer``
    and passed to ``tokenizer`` with special tokens enabled and truncation to
    ``MAX_SEQ_LENGTH``. The returned ids therefore cover the question part as
    well as the answer part; nothing is masked or padded.

    Args:
        questions: Iterable of question strings.
        answers: Iterable of answer strings, paired with ``questions`` via
            ``zip`` (surplus items of the longer iterable are dropped).
        tokenizer: Callable accepting ``(text, add_special_tokens=...,
            max_length=..., truncation=...)`` and returning an object with an
            ``input_ids`` attribute.
        question_start_tag: Text placed before each question.
        question_end_tag: Text placed after each question.
        answer_tag: Text placed before each answer.

    Returns:
        A list with one 1-D ``torch`` tensor of token ids per pair.
    """
    def _encode_pair(q_text, a_text):
        prompt = "".join((question_start_tag, q_text, question_end_tag, answer_tag, a_text))
        encoding = tokenizer(
            prompt,
            add_special_tokens=True,
            max_length=MAX_SEQ_LENGTH,
            truncation=True,
        )
        return torch.tensor(encoding.input_ids)

    return [_encode_pair(q_text, a_text) for q_text, a_text in zip(questions, answers)]

# Token-id sequences for the paraphrased and the perturbed answers, each
# paired with the shared question list.
labels_paraphrased = tokenize_qa_pairs(questions, paraphrased_answers, tokenizer)
labels_perturbed = tokenize_qa_pairs(questions, perturbed_answers, tokenizer)

In [42]:
from torch import nn
def get_batch_loss(output, labels):
    """Return the unreduced cross-entropy between ``output`` and ``labels``.

    Both arguments are handed to cross-entropy exactly as given: this function
    does not shift logits against labels and does not rearrange dimensions,
    so callers must supply them in the layout cross-entropy expects.

    Args:
        output: Logits tensor.
        labels: Target class indices; entries equal to ``-100`` are ignored
            and yield a loss of 0.

    Returns:
        A tensor holding one loss value per target element
        (``reduction='none'``).
    """
    return nn.functional.cross_entropy(output, labels, ignore_index=-100, reduction='none')


In [None]:
# Stored encoding of the first perturbation, used as the "pretend" model output.
# NOTE(review): the load cell reports this tensor as 1-D with shape [4096],
# while each label is a sequence of token ids -- presumably these shapes do not
# line up for cross-entropy yet; confirm the layout before trusting the values.
logits = tensor_dict['perturbation_0']

# Loss of the same logits against the first paraphrased and the first
# perturbed label sequence.
gt_loss, perturb_loss = (
    get_batch_loss(logits, label_ids)
    for label_ids in (labels_paraphrased[0], labels_perturbed[0])
)

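OK, so `get_batch_loss` is the building block I need. For next-token prediction the loss has to compare `output_logits[i-1]` with `label_ids[i]`; note that the version defined above passes its inputs straight to `CrossEntropyLoss`, so this shift must be applied to the logits and labels (or added inside the function) before the comparison is valid. Because we already have all of `output.logits`, this can be done for the entire sequence at once instead of step by step.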


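This means I should be able to compute this loss with my current setup: logits of shape `[batch_size=5, seq_len=MAX_SEQ_LENGTH, vocab_size=4096]` and label ids of shape `[batch_size=5, seq_len=MAX_SEQ_LENGTH]`. I just need to add padding and tokenize the answers in batches. (Caveat: the tensors loaded above have shape `[4096]`, i.e. no sequence dimension, and 4096 matches Llama-2-7B's hidden size rather than its 32,000-token vocabulary, so it is worth confirming that `extract_logits.py` really stores per-token logits.)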

