In [1]:
from transformers import (
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModel,
)
import evaluate
import sys

sys.path.insert(0, "../lit-gpt/generate")
from pathlib import Path
import lora
import base
from datasets import load_dataset
import csv
import random
import torch
from tqdm import tqdm
import torch.nn as nn
import os
import json
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the train and validation splits from the lit-gpt data folder, then
# report how many records each split contains.
with open("../lit-gpt/litgpt/data/data/train.json") as f:
    train_data = json.load(f)
with open("../lit-gpt/litgpt/data/data/val.json") as f:
    eval_data = json.load(f)
print(f"length train: {len(train_data)}")
print(f"length eval: {len(eval_data)}")

length train: 3035
length test: 159
length eval: 2


In [None]:
# Load the faculty dataset with `load_dataset`, which is imported in the
# import cell at the top of the notebook. The evaluation calls further down
# read the "train" and "validation" splits of this object.
dataset = load_dataset("Puidii/aalen_university_faculty_computer_science")

In [3]:
# Read the converted checkpoint onto the CPU first; compute_perplexity moves
# the model to `device` before scoring.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source.
state_dict = torch.load(
    "../lit-gpt/out/converted/mistralv2.pth", map_location="cpu"
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", state_dict=state_dict
)
# A tokenizer carries no weights, so it is loaded without the state dict.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# Prefer GPU index 3, fall back to the default GPU, and finally to the CPU.
if torch.cuda.device_count() > 3:
    device = torch.device("cuda:3")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


def compute_perplexity(data, model):
    """Return the sliding-window perplexity of `model` on `data`.

    Args:
        data: A single string, or an iterable of strings. Several strings are
            joined with a blank line between them before tokenization.
        model: Causal language model. It is moved to the module-level
            `device`, must expose `config.max_position_embeddings`, and is
            called as `model(input_ids=..., labels=...)`; the `loss` of each
            call is used as that window's negative log-likelihood.

    Returns:
        float: exp(mean of the per-window losses), rounded to two decimals.

    Uses the module-level `tokenizer` and `device`.
    """
    model = model.to(device)
    # A bare string must not be passed through str.join: join iterates over
    # the characters and would put a blank line between every one of them.
    text = data if isinstance(data, str) else "\n\n".join(data)
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.max_position_embeddings
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Tokens already scored by the previous window only serve as context:
        # label -100 excludes them from the loss.
        target_ids[:, :-trg_len] = -100
        with torch.no_grad():
            outputs = model(input_ids=input_ids, labels=target_ids)
            neg_log_likelihood = outputs.loss
        nlls.append(neg_log_likelihood)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return round(ppl.item(), 2)

In [8]:
# Header row for the result CSVs; the order matches the rows assembled in
# train_eval and test_eval.
# NOTE(review): "Rogue" presumably means ROUGE — that column receives the
# rougeLsum score.
column_names = ["Input", "Output", "Reference", "BLEU", "Rogue", "Perplexity"]


def train_eval(data, sample_val_count, output_file_name):
    """Score a random sample of `data` and append the results to a CSV file.

    For every sampled example the model is prompted with the example's
    "instruction"; the generated text is compared against the example's
    "output" with BLEU and ROUGE (rougeLsum), and the perplexity of the
    instruction text is computed with `compute_perplexity`.

    Args:
        data: Integer-indexable collection whose items map "instruction" and
            "output" to strings.
        sample_val_count: Number of distinct examples to draw.
        output_file_name: Path of the result file WITHOUT extension; ".csv"
            is appended.

    Raises:
        ValueError: If `sample_val_count` exceeds `len(data)` (raised by
            `random.sample`).

    NOTE(review): assumes `base.main` returns the generated text — confirm.
    """
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    checkpoint_dir = Path(
        "../lit-gpt/litgpt/finetune/out/finetune/lora_mistral/final"
    )
    # Sample over the whole dataset (the last example included) and iterate
    # over the drawn indices themselves.
    sampled_indices = random.sample(range(len(data)), sample_val_count)

    results = []
    for i in sampled_indices:
        example = data[i]
        instruction = example["instruction"]
        reference = example["output"]
        model_output = base.main(prompt=instruction, checkpoint_dir=checkpoint_dir)
        # Wrapped in a list so the instruction is scored as one text instead
        # of being split into characters by the join in compute_perplexity.
        ppl = compute_perplexity([instruction], model)
        bleu_result = bleu.compute(references=[reference], predictions=[model_output])
        rogue_results = rouge.compute(
            references=[reference], predictions=[model_output]
        )
        results.append(
            [
                instruction,
                model_output,
                reference,
                round(bleu_result["bleu"], 2),
                round(rogue_results["rougeLsum"], 2),
                ppl,
            ]
        )

    csv_path = output_file_name + ".csv"
    # The file is opened in append mode, so only write the header when the
    # file is new or empty; otherwise re-runs would scatter header rows
    # through the data.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(column_names)
        writer.writerows(results)


def test_eval(data, output_file_name):
    results = []
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")

    for i in range(len(data)):
        print("...")
        model_output = base.main(
            prompt=data[i]["instruction"],
            checkpoint_dir=Path(
                "../lit-gpt/litgpt/finetune/out/finetune/lora_mistral/final"
            ),
        )
        ppl = compute_perplexity(data[i]["instruction"], model)
        bleu_result = bleu.compute(
            references=[data[i]["output"]], predictions=[model_output]
        )
        rogue_results = rouge.compute(
            references=[data[i]["output"]], predictions=[model_output]
        )
        results.append(
            [
                data[i]["instruction"],
                model_output,
                data[i]["output"],
                round(bleu_result["bleu"], 2),
                round(rogue_results["rougeLsum"], 2),
                ppl,
            ]
        )

    with open(output_file_name + ".csv", "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(column_names)
        for row in results:
            writer.writerow(row)

In [None]:
# Seed the sampler so train_eval draws the same training examples on every run.
random.seed(42)
# Both helpers append ".csv" themselves, so the names are passed without the
# extension (otherwise the files end up as "*.csv.csv").
train_eval(dataset["train"], 10, "output_eval_train")
test_eval(dataset["validation"], "output_eval_val")