In [None]:
import torch
from tqdm import tqdm
from omegaconf import OmegaConf
from rouge_score import rouge_scorer
from llm_unlearning.models.models import load_model_and_tokenizer
from llm_unlearning.unlearning_datasets.hp import HPDataset

# Module-level ROUGE-L scorer (stemming enabled via `use_stemmer=True`).
# `sample_and_evaluate` below reads this global to score generated continuations.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def load_model_and_tokenizer_wrapper(model_path):
    """Load a model/tokenizer pair for ``model_path`` and move the model to a device.

    The loader config always uses the ``microsoft/phi-1_5`` tokenizer and sets
    ``fp16`` to ``True``; only the model path varies.

    Args:
        model_path: Value passed to the project loader as the ``path`` entry.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is whatever the loaded
        model's ``.to(...)`` call returns for ``'cuda'`` (when
        ``torch.cuda.is_available()``) or ``'cpu'`` otherwise.
    """
    loader_settings = {
        "path": model_path,
        "tokenizer_path": "microsoft/phi-1_5",
        "fp16": True,
    }
    loaded_model, loaded_tokenizer = load_model_and_tokenizer(OmegaConf.create(loader_settings))

    # Prefer the GPU when one is visible, otherwise stay on the CPU.
    if torch.cuda.is_available():
        target_device = 'cuda'
    else:
        target_device = 'cpu'

    return loaded_model.to(target_device), loaded_tokenizer

def generate_continuation(model, tokenizer, input_ids, attention_mask, max_new_tokens=100):
    """Run ``model.generate`` with sampling disabled and return its first result.

    Args:
        model: Object exposing a ``generate`` method.
        tokenizer: Object exposing ``eos_token_id``; that id is used both as
            the padding id and as the end-of-sequence id.
        input_ids: Prompt token ids, forwarded unchanged to ``generate``.
        attention_mask: Attention mask, forwarded unchanged to ``generate``.
        max_new_tokens: Upper bound on newly generated tokens (default 100).

    Returns:
        Element ``0`` of whatever ``model.generate`` returns. Presumably this
        is the first sequence of a batch-first output, so any further
        sequences in a batched call are dropped -- confirm against callers.
    """
    eos_id = tokenizer.eos_token_id
    generation_kwargs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        "pad_token_id": eos_id,
        "eos_token_id": eos_id,
    }
    generated = model.generate(**generation_kwargs)
    first_sequence = generated[0]
    return first_sequence

def sample_and_evaluate(hp_dataset, tokenizer, model, num_samples, prefix_length, batch_size=32, max_new_tokens=256):
    """Score how well the model's continuations reproduce the dataset text.

    Draws ``num_samples`` items from ``hp_dataset`` at random (independent
    ``torch.randint`` draws, so items can repeat), prompts the model with the
    first ``prefix_length`` tokens of each item, and compares the generated
    continuation with the item's remaining tokens using the ROUGE-L recall of
    the module-level ``scorer``.

    Args:
        hp_dataset: Indexable, sized dataset whose items are dicts with
            ``'input_ids'`` and ``'attention_mask'`` tensors; the tensors are
            stacked, so they must share one shape across items.
        tokenizer: Object exposing ``decode`` and ``eos_token_id``.
        model: Object exposing ``generate`` and ``device``.
        num_samples: Total number of items to draw and score.
        prefix_length: Number of leading tokens used as the prompt.
        batch_size: Maximum number of prompts per ``generate`` call.
        max_new_tokens: Upper bound on generated tokens per prompt.

    Returns:
        list: One ROUGE-L recall value per drawn sample (empty when
        ``num_samples`` is not positive).
    """
    rouge_scores = []

    for start in tqdm(range(0, num_samples, batch_size)):
        # The final batch may be smaller. A dedicated name keeps the
        # ``batch_size`` argument itself unmodified.
        current_batch_size = min(batch_size, num_samples - start)

        batch_items = [
            hp_dataset[torch.randint(len(hp_dataset), (1,)).item()]
            for _ in range(current_batch_size)
        ]

        input_ids = torch.stack([item['input_ids'] for item in batch_items]).to(model.device)
        attention_mask = torch.stack([item['attention_mask'] for item in batch_items]).to(model.device)

        prefix_input_ids = input_ids[:, :prefix_length]
        prefix_attention_mask = attention_mask[:, :prefix_length]

        continuations = model.generate(
            input_ids=prefix_input_ids,
            attention_mask=prefix_attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # The output of generate() is expected to begin with the prompt tokens
        # (the previous text-based prefix stripping relied on the same
        # assumption). Cutting at the token level avoids the misalignment that
        # occurs when the prefix decodes to a different number of characters
        # on its own than it does inside the full sequence.
        prompt_token_count = prefix_input_ids.shape[1]

        for j in range(current_batch_size):
            ground_truth = tokenizer.decode(input_ids[j][prefix_length:], skip_special_tokens=True)
            generated_continuation = tokenizer.decode(
                continuations[j][prompt_token_count:], skip_special_tokens=True
            )

            # The reference text goes first and the prediction second.
            rouge_score = scorer.score(ground_truth, generated_continuation)['rougeL'].recall
            rouge_scores.append(rouge_score)

    return rouge_scores


In [None]:
# Checkpoint to evaluate. The commented-out line below is an alternative
# checkpoint path that can be swapped in instead of the Hub model id.
# model_path = "/nfs/homedirs/gudm/development/new/results/10/hp/oth/checkpoint-176"
model_path = "microsoft/phi-1_5"
model, tokenizer = load_model_and_tokenizer_wrapper(model_path)

# Dataset settings: the text file to read and the `max_length` handed to
# HPDataset (the evaluation cell also uses 256 as its token budget).
hp_config = OmegaConf.create(dict(
    file_path="/nfs/homedirs/gudm/development/new/llm_unlearning/llm_unlearning/unlearning_datasets/data/Harry_Potter_first_book_preprocessed.txt",
    max_length=256,
))
hp_dataset = HPDataset(tokenizer, hp_config)

In [None]:
# Evaluation settings.
batch_size = 128
num_samples = 128
prefix_lengths = [8, 16, 32, 64, 128]
sampling_seed = 0  # fixed so the reported numbers can be reproduced

for prefix_len in prefix_lengths:
    # sample_and_evaluate draws its items with torch.randint. Re-seeding
    # before every prefix length makes the run reproducible and lets each
    # prefix length be scored on the same draws, provided nothing else
    # consumes the global RNG in between (generation runs with do_sample=False).
    torch.manual_seed(sampling_seed)

    # 256 mirrors the `max_length` in hp_config: prompt plus continuation
    # together stay within that many tokens.
    rouge_scores = sample_and_evaluate(hp_dataset, tokenizer, model, num_samples, prefix_len, batch_size, max_new_tokens=256-prefix_len)

    # Guard against an empty result (e.g. num_samples == 0) instead of
    # failing with ZeroDivisionError.
    avg_score = sum(rouge_scores) / len(rouge_scores) if rouge_scores else float("nan")
    print(f"Average ROUGE-L score for prefix length {prefix_len}: {avg_score}")