In [None]:
# 5_evaluation_metrics.ipynb
"""
Evaluation Metrics for Fine-Tuned Indigenous Language Mini-LLMs
Focus: small-scale, culturally sensitive, low-resource evaluation techniques
"""


In [None]:
# 📦 Step 1: Install if needed
!pip install evaluate transformers datasets sacrebleu


In [None]:
# 🧠 Step 2: Import libraries
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import evaluate
import random
from datasets import load_dataset


In [None]:
# 📚 Step 3: Load model + tokenizer
# Directory holding the fine-tuned checkpoint; tokenizer and weights are both read from it.
model_path = "../models/llama3-8b-qlora-output"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
)

# Wrap the loaded model and tokenizer in a text-generation pipeline used by later cells.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)


In [None]:
# 📄 Step 4: Load sample test dataset
# You can swap this with your own local eval file
test_dataset = load_dataset("text", data_files="../datasets/sample_eval_set.txt", split="train")
# The "text" loader produces one row per line of the file, so blank lines would
# otherwise reach the model as empty prompts. Drop them here and keep the
# prompts as a plain Python list for simple indexing in later cells.
examples = [line for line in test_dataset["text"] if line.strip()]


In [None]:
# 🧪 Step 5: Run sample generations
def generate_response(prompt, max_len=128):
    """Generate one greedy (non-sampled) completion for ``prompt``.

    Uses the module-level ``generator`` text-generation pipeline.

    Args:
        prompt: Input text to continue.
        max_len: Maximum number of newly generated tokens. The prompt's own
            tokens do not count against this budget, so long prompts still
            receive a completion.

    Returns:
        The generated continuation only; the prompt is not echoed back, so the
        result can be compared directly against reference text.
    """
    output = generator(
        prompt,
        # max_length would include the prompt tokens and can leave no room to generate.
        max_new_tokens=max_len,
        num_return_sequences=1,
        do_sample=False,
        # By default the pipeline prepends the prompt to 'generated_text'.
        return_full_text=False,
    )
    return output[0]['generated_text']

# Take at most the first ten evaluation prompts and generate a response for each.
sample_prompts = [examples[idx] for idx in range(min(10, len(examples)))]
sample_results = [
    {"prompt": text, "generated": generate_response(text)}
    for text in sample_prompts
]

In [None]:
# 📊 Step 6: Automatic BLEU Score (if translations)
bleu = evaluate.load("sacrebleu")

# You must provide reference translations for this to work
# For example only:
references = [
    "The water is cold.",  # Human reference translation
    "The child sings loudly.",
]

# sacreBLEU needs exactly one reference set per prediction. Use only as many
# pairs as exist on both sides, so a short eval set does not cause a length
# mismatch error.
n_pairs = min(len(references), len(sample_results))
predictions = [r["generated"] for r in sample_results[:n_pairs]]

bleu_score = None
if n_pairs == 0:
    print("⚠️ No prediction/reference pairs available; skipping BLEU.")
else:
    bleu_score = bleu.compute(
        predictions=predictions,
        # Each prediction takes a list of references; here there is one per prediction.
        references=[[ref] for ref in references[:n_pairs]],
    )
    print(f"📈 BLEU Score: {bleu_score['score']:.2f}")


In [None]:
# 🪶 Step 7: Human Evaluation Worksheet Template
# Rating rubric shown to reviewers before the sample rows.
rating_rubric = """
For each row, rate the model’s output:
- 1 = inaccurate / unrelated
- 2 = partially accurate but flawed
- 3 = mostly accurate but awkward
- 4 = accurate and understandable
- 5 = culturally fluent and fully appropriate
"""
print("\n📋 Human Review Template")
print(rating_rubric)

print("\nSample Prompts and Model Outputs:")
for entry in sample_results:
    # One worksheet row: the prompt, the model output, then a blank for the reviewer's score.
    print(
        f"\nPROMPT: {entry['prompt']}",
        f"GENERATED: {entry['generated']}",
        "HUMAN SCORE: ____ (1-5)\n",
        sep="\n",
    )


In [None]:
# 🗃️ Step 8: Save results for review
# Keep the output location in one place so the confirmation message always
# reports the path the file was actually written to.
results_path = "../models/eval_results.txt"
with open(results_path, "w", encoding="utf-8") as f:
    for r in sample_results:
        # One prompt/generation pair per record, separated by a blank line.
        f.write(f"PROMPT: {r['prompt']}\nGENERATED: {r['generated']}\n\n")

print(f"✅ Evaluation complete. Results saved to {results_path}")

In [None]:
📌 Notes for Indigenous Language Evaluation
BLEU or ROUGE scores only measure surface overlap with a reference text; on their own they are not sufficient for assessing cultural or grammatical fluency.

Human judgment from fluent speakers is essential.

Encourage reviewers to score accuracy, fluency, tone, and respectfulness on a shared scale, such as 1–5 or an agreed set of emojis.

Use ethics-protocols/team_worksheet_template.md to record reviewer info and language preferences.

