# Model Evaluation

This notebook demonstrates how to evaluate a fine-tuned model (a base model with a PEFT adapter applied) using ROUGE, BLEU, and exact-match metrics.

In [None]:
# Install dependencies
!pip install -q torch transformers peft datasets rouge-score nltk evaluate

In [None]:
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from datasets import load_dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
# Fetch the NLTK 'punkt' resource quietly.
# NOTE(review): no NLTK tokenizer is called in the cells visible here (BLEU is
# computed on str.split() tokens) — confirm this download is still required.
nltk.download('punkt', quiet=True)

In [None]:
# Configuration
# Model identifier passed to from_pretrained for both the tokenizer and the base model.
BASE_MODEL = "meta-llama/Llama-3.1-8B"
# Path handed to PeftModel.from_pretrained to load the adapter on top of the base model.
ADAPTER_PATH = "./llama-3.1-8b-finetuned"
# JSON file of evaluation records; the evaluation loop reads the "instruction",
# optional "input", and "output" fields of each record.
EVAL_DATASET = "sample_dataset.jsonl"
# Upper bound on the number of records evaluated (fewer if the file is smaller).
NUM_SAMPLES = 50

In [None]:
# Load tokenizer and model
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm it is actually needed for BASE_MODEL.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Keyword arguments for loading the base weights, gathered in one place.
base_model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "trust_remote_code": True,
}
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **base_model_kwargs)

# Apply the adapter to the base model and switch to inference mode
# (nn.Module.eval() returns the module itself, so the call can be chained).
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).eval()
print("Model loaded successfully!")

In [None]:
# Load evaluation dataset
eval_data = load_dataset("json", data_files=EVAL_DATASET, split="train")

# Cap the evaluation set at NUM_SAMPLES records (or fewer if the file is smaller).
eval_data = eval_data.select(range(min(NUM_SAMPLES, len(eval_data))))

# Fail fast with a clear message: an empty selection would otherwise only
# surface later as a ZeroDivisionError when the exact-match rate is computed.
if len(eval_data) == 0:
    raise ValueError(
        f"No evaluation samples selected from {EVAL_DATASET!r}; "
        "check the file contents and NUM_SAMPLES."
    )

# The evaluation loop indexes item["instruction"] and item["output"]; verify
# they exist before spending time on generation.
missing_columns = {"instruction", "output"} - set(eval_data.column_names)
if missing_columns:
    raise ValueError(
        f"{EVAL_DATASET!r} is missing required column(s): {sorted(missing_columns)}"
    )

print(f"Evaluating on {len(eval_data)} samples")

In [None]:
def generate_response(prompt, max_new_tokens=256):
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Args:
        prompt: Text to feed to the model. It is tokenized with truncation to
            at most 2048 tokens.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Number of prompt tokens; generate() returns prompt + continuation for a
    # causal LM, so everything past this index is newly generated.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Greedy decoding; no temperature is passed because it is ignored
            # (and warned about) when sampling is disabled.
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Slice off the prompt tokens instead of searching the decoded text for an
    # "assistant" marker: the marker may be absent (which used to return the
    # prompt together with the answer) or may legitimately occur in the answer.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Run evaluation
# Scorers are built once and reused for every sample.
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction()

# Per-sample metric values; the results cells average these.
rouge1_scores, rouge2_scores, rougeL_scores, bleu_scores = [], [], [], []
exact_matches = 0

total = len(eval_data)
for done, item in enumerate(eval_data, start=1):
    # Prompt is the instruction, followed by the optional input after a blank line.
    prompt = item["instruction"]
    optional_input = item.get("input")
    if optional_input:
        prompt = prompt + f"\n\n{optional_input}"

    reference = item["output"]
    generated = generate_response(prompt)

    # ROUGE: record the F-measure of each variant in its own list.
    rouge_result = rouge.score(reference, generated)
    for bucket, variant in (
        (rouge1_scores, 'rouge1'),
        (rouge2_scores, 'rouge2'),
        (rougeL_scores, 'rougeL'),
    ):
        bucket.append(rouge_result[variant].fmeasure)

    # BLEU: whitespace-tokenized sentence BLEU with method1 smoothing.
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    bleu_scores.append(
        sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing.method1)
    )

    # Exact match: case-insensitive, ignoring leading/trailing whitespace.
    exact_matches += int(generated.lower().strip() == reference.lower().strip())

    # Progress report after every tenth sample.
    if done % 10 == 0:
        print(f"Evaluated {done}/{total}...")

In [None]:
# Print results
# Assemble the report as a list of lines and emit it with a single print call.
divider = "=" * 50
report_lines = [
    "",
    divider,
    "EVALUATION RESULTS",
    divider,
    f"ROUGE-1:         {np.mean(rouge1_scores):.4f}",
    f"ROUGE-2:         {np.mean(rouge2_scores):.4f}",
    f"ROUGE-L:         {np.mean(rougeL_scores):.4f}",
    f"BLEU:            {np.mean(bleu_scores):.4f}",
    f"Exact Match:     {exact_matches/len(eval_data):.4f}",
    f"Samples:         {len(eval_data)}",
    divider,
]
print("\n".join(report_lines))

In [None]:
# Save results
import json

# Averaged metrics first, in the order they appear in the output file.
metric_series = {
    "rouge1": rouge1_scores,
    "rouge2": rouge2_scores,
    "rouge_l": rougeL_scores,
    "bleu": bleu_scores,
}
results = {name: float(np.mean(values)) for name, values in metric_series.items()}
# Then the count-based fields.
results["exact_match_rate"] = exact_matches / len(eval_data)
results["num_samples"] = len(eval_data)

# Serialize with a two-space indent and write the file in one go.
with open("evaluation_results.json", "w") as out_file:
    out_file.write(json.dumps(results, indent=2))

print("Results saved to evaluation_results.json")