In [None]:
import evaluate
import io
from contextlib import redirect_stdout
import re
from pathlib import Path
import os
import litgpt.generate.base as base
import csv
import random
import torch
from tqdm import tqdm
import json
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the training and validation splits. The encoding is passed explicitly so
# that decoding does not depend on the platform's locale default.
with open("../dataset/train.json", encoding="utf-8") as f:
    train_data = json.load(f)
with open("../dataset/val.json", encoding="utf-8") as f:
    eval_data = json.load(f)
print(f"length train: {len(train_data)}")
print(f"length eval: {len(eval_data)}")

In [None]:
# Locations of the base model directory, the converted HF checkpoint and the
# LoRA fine-tuning output. Each one can be overridden with an environment
# variable so the notebook also runs outside the original author's machine;
# the defaults are the original hardcoded values.
model_name = os.environ.get(
    "EVAL_BASE_MODEL_DIR",
    "/Users/patrickmuller/Documents/dev/05_Project/TinyLlama-1.1B-Chat-v1.0",
)
hf_checkpoint_path = os.environ.get(
    "EVAL_HF_CHECKPOINT_PATH",
    "../out/hf_checkpoint/model.pth",
)
lora_path = os.environ.get(
    "EVAL_LORA_DIR",
    "/Users/patrickmuller/Documents/dev/05_Project/llm_finetuning/out/finetune/lora/final",
)

In [None]:
# Device used for the perplexity forward passes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the checkpoint tensors onto the CPU so that a file saved from a GPU run
# can still be opened on a CPU-only machine; compute_perplexity moves the model
# to `device` before running it.
state_dict = torch.load(hf_checkpoint_path, map_location="cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The architecture/config come from `model_name`; the weights come from
# `state_dict` instead of the weight files stored in that directory.
model = AutoModelForCausalLM.from_pretrained(
    model_name, state_dict=state_dict, local_files_only=True
)


def compute_perplexity(data, model):
    """Compute sliding-window perplexity of ``model`` on ``data``.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        data: Either a single string, or an iterable of strings that are
            joined with a blank line ("\\n\\n") before tokenization.
        model: Causal language model exposing ``config.max_position_embeddings``
            and returning an object with a ``loss`` attribute when called with
            ``input_ids`` and ``labels``.

    Returns:
        The perplexity as a float rounded to two decimals, or ``nan`` when the
        tokenized text contains no token that can be scored.
    """
    # A bare string must not go through str.join: joining iterates over its
    # characters and would insert a blank line between every character.
    text = data if isinstance(data, str) else "\n\n".join(data)

    model = model.to(device)
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.max_position_embeddings
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0  # summed negative log-likelihood over all scored tokens
    scored_tokens = 0
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        # Only the tokens not covered by the previous window are scored;
        # the earlier tokens in the window serve as context.
        trg_len = end_loc - prev_end_loc
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100  # -100 labels are ignored by the loss

        with torch.no_grad():
            outputs = model(input_ids=input_ids, labels=target_ids)

        # The model shifts labels by one position internally, so the first
        # label of the window never contributes to the loss.
        window_tokens = int((target_ids[:, 1:] != -100).sum().item())
        if window_tokens > 0:
            # outputs.loss is the mean over this window's scored tokens;
            # weight it by their count so windows of different size are
            # combined into a true per-token average.
            nll_sum += outputs.loss.item() * window_tokens
            scored_tokens += window_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if scored_tokens == 0:
        return float("nan")

    ppl = torch.exp(torch.tensor(nll_sum / scored_tokens)).item()
    return round(ppl, 2)

In [None]:
# Header row of the results CSV; the order matches the rows built in evaluate_model.
column_names = [
    "Input",
    "Output",
    "Reference",
    "BLEU",
    "Rogue",
    "Perplexity",
]


def evaluate_model(data, sample_val_count=None, output_file_name="output", seed=None):
    """Generate answers for sampled examples, score them and append to a CSV.

    For every sampled example the prompt is run through ``base.main`` with the
    checkpoint at the module-level ``lora_path``; the text printed after
    "### Response:" is taken as the model output and scored with BLEU and
    ROUGE-Lsum against the reference.

    Args:
        data: Indexable collection of dicts with "instruction" and "output" keys.
        sample_val_count: Number of examples to sample without replacement.
            ``None`` evaluates every example; values larger than ``len(data)``
            are clamped to ``len(data)``.
        output_file_name: Path of the result file without the ".csv" suffix.
            Rows are appended; the header is written only when the file is
            new or empty.
        seed: Optional seed for a private random generator, making the sample
            reproducible. With ``None`` the global ``random`` state is used.
    """
    if sample_val_count is None:
        sample_val_count = len(data)
    sample_val_count = min(sample_val_count, len(data))

    sampler = random if seed is None else random.Random(seed)
    # Sample over the full index range so the last example can be selected and
    # "evaluate everything" does not exceed the population size.
    sample_indices = sampler.sample(range(len(data)), sample_val_count)

    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")

    results = []
    for i in sample_indices:
        instruction = data[i]["instruction"]
        reference = data[i]["output"]

        # The generation is read from what base.main prints, so stdout is
        # captured while it runs.
        buffer = io.StringIO()
        with redirect_stdout(buffer):
            base.main(
                prompt=instruction,
                checkpoint_dir=Path(lora_path),
            )
        captured_output = buffer.getvalue()

        response = re.search("### Response:\n(.*)", captured_output, re.DOTALL)
        # Fall back to the full captured text when the marker is missing
        # instead of failing on a None match.
        model_output = response.group(1) if response else captured_output

        # NOTE(review): perplexity is computed on the instruction text with the
        # module-level `model`, not on the generated output - confirm intended.
        # The instruction is wrapped in a list so it is treated as one document.
        ppl = compute_perplexity([instruction], model)
        bleu_result = bleu.compute(
            references=[reference], predictions=[model_output]
        )
        rogue_results = rouge.compute(
            references=[reference], predictions=[model_output]
        )
        results.append(
            [
                instruction,
                model_output,
                reference,
                round(bleu_result["bleu"], 2),
                round(rogue_results["rougeLsum"], 2),
                ppl,
            ]
        )

    csv_path = Path(output_file_name + ".csv")
    # Appending to an existing file must not repeat the header row.
    needs_header = not csv_path.exists() or csv_path.stat().st_size == 0
    with open(csv_path, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(column_names)
        writer.writerows(results)

In [None]:
# Score 10 randomly sampled validation examples; rows are appended to
# "output_eval_train.csv".
evaluate_model(
    data=eval_data,
    sample_val_count=10,
    output_file_name="output_eval_train",
)