In [1]:
import os
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
from evaluate import evaluator
import evaluate
import torch

In [None]:
print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))

In [2]:
# Get a natural-sorted list of the checkpoint paths
model_path = "/mnt/ai-stuff-fast/training-results/mini-mistral-wikipedia-20231101.en-science-sci-fi-OpenHermes-2.5-chatML-Grokfast/training-run-20240630-213618/"
checkpoints = sorted(
    [
        os.path.join(model_path, f)
        for f in os.listdir(model_path)
        if f.startswith("checkpoint")
    ],
    key=lambda x: int(x.split("-")[-1]),
)
checkpoints

['/mnt/ai-stuff-fast/training-results/mini-mistral-wikipedia-20231101.en-science-sci-fi-OpenHermes-2.5-chatML-Grokfast/training-run-20240630-213618/checkpoint-2929',
 '/mnt/ai-stuff-fast/training-results/mini-mistral-wikipedia-20231101.en-science-sci-fi-OpenHermes-2.5-chatML-Grokfast/training-run-20240630-213618/checkpoint-5858',
 '/mnt/ai-stuff-fast/training-results/mini-mistral-wikipedia-20231101.en-science-sci-fi-OpenHermes-2.5-chatML-Grokfast/training-run-20240630-213618/checkpoint-8787',
 '/mnt/ai-stuff-fast/training-results/mini-mistral-wikipedia-20231101.en-science-sci-fi-OpenHermes-2.5-chatML-Grokfast/training-run-20240630-213618/checkpoint-11716',
 '/mnt/ai-stuff-fast/training-results/mini-mistral-wikipedia-20231101.en-science-sci-fi-OpenHermes-2.5-chatML-Grokfast/training-run-20240630-213618/checkpoint-14645',
 '/mnt/ai-stuff-fast/training-results/mini-mistral-wikipedia-20231101.en-science-sci-fi-OpenHermes-2.5-chatML-Grokfast/training-run-20240630-213618/checkpoint-17574',
 

In [3]:
# Load the eval dataset
dataset_name = "/mnt/ai-stuff-fast/training-results/mini-mistral-wikipedia-20231101.en-science-sci-fi-OpenHermes-2.5-chatML-Grokfast/dataset"
from datasets import load_from_disk
dataset = load_from_disk(dataset_name)
dataset = dataset["test"].shuffle(seed=42).select(range(10))
dataset

Dataset({
    features: ['id', 'url', 'title', 'text', 'idx', 'topic', 'model_name', 'hash', 'language', 'custom_instruction', 'source', 'system_prompt', 'category', 'skip_prompt_formatting', 'avatarUrl', 'views', 'model'],
    num_rows: 10
})

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoints[0])
tokenizer

In [None]:
# Load the metrics
# accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
# precision = evaluate.load("precision")
# recall = evaluate.load("recall")

In [None]:
# As a test, load the first checkpoint and evaluate it
checkpoint = checkpoints[0]
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
model.to("cuda:1")
model.eval()

In [4]:
generation_kwargs = {
    "max_new_tokens": 64,
    "batch_size": 1,
    "truncation": True,
}

# Create the pipeline
pipe = pipeline(
    "text-generation",
    model=checkpoints[0],
    device=torch.device("cuda:1"),
    torch_dtype=torch.bfloat16,
    **generation_kwargs,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Create the evaluator
task_evaluator = evaluator("text-generation")

# Evaluate the model
results = task_evaluator.compute(model_or_pipeline=pipe, data=dataset, metric="f1")

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


ValueError: Evaluation module cache file doesn't exist. Please make sure that you call `add` or `add_batch` at least once before calling `compute`.

In [None]:
# Preprocess the dataset
def preprocess_function(examples):
    return tokenizer(
        examples["sentence1"], examples["sentence2"], truncation=True, padding=True
    )


encoded_dataset = dataset.map(preprocess_function, batched=True)


# Function to evaluate a model
def evaluate_model(model, dataset):
    model.eval()
    predictions = []
    references = []
    for batch in dataset:
        inputs = {
            key: batch[key].to(model.device) for key in tokenizer.model_input_names
        }
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        references.extend(batch["label"])

    return {
        "accuracy": accuracy.compute(predictions=predictions, references=references),
        "f1": f1.compute(predictions=predictions, references=references),
        "precision": precision.compute(predictions=predictions, references=references),
        "recall": recall.compute(predictions=predictions, references=references),
    }


# Prepare the results dataframe
results = []

# Iterate through checkpoints and evaluate
for checkpoint_dir in sorted(os.listdir(checkpoints_path)):
    checkpoint_path = os.path.join(checkpoints_path, checkpoint_dir)
    if os.path.isdir(checkpoint_path):
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
        metrics = evaluate_model(model, encoded_dataset)
        metrics["checkpoint"] = checkpoint_dir
        results.append(metrics)

# Save results to CSV
df = pd.DataFrame(results)
df.to_csv("evaluation_results.csv", index=False)

print("Evaluation completed and results saved to evaluation_results.csv")