<a href="https://colab.research.google.com/github/peremartra/Tailoring-LLM-Architectures/blob/main/APPB/APPB_NB01_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tailoring LLM Architectures
## Surgical Optimization Beyond Fine-Tuning


### Appendix B: Capabilities evaluation with lm-evaluation-harness
### Notebook: 01. How to use `model_evaluation`
by [Pere Martra](https://github.com/peremartra)

[![LinkedIn](https://img.shields.io/badge/LinkedIn-0077B5?style=flat&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/pere-martra/) [![GitHub](https://img.shields.io/badge/GitHub-100000?style=flat&logo=github&logoColor=white)](https://github.com/peremartra) [![X](https://img.shields.io/badge/X-000000?style=flat&logo=x&logoColor=white)](https://x.com/PereMartra) [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-blue)](https://huggingface.co/oopere)

_____
Colab Environment: GPU T4

Models:
* gemma-3-270m
_____


In [None]:
# Install dependencies (IPython shell escape; -q keeps pip's output quiet)
!pip install -q lm-eval transformers torch accelerate

# Import libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer
# Hugging Face Hub identifier of the model evaluated in this notebook.
model_name = "google/gemma-3-270m"
print(f"Loading model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights in bfloat16 and let device_map="auto" choose the placement;
# the device actually used is printed below.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

print(f"Model loaded on device: {model.device}")


In [2]:
def model_evaluation(model_obj, tokenizer, tasks, device='cuda', limit=None, batch_size=4, save_results=False):
    """
    Evaluate an already-loaded model with the lm-evaluation-harness.

    The model and tokenizer are wrapped in an ``HFLM`` object, the task list is
    grouped by few-shot setting (one ``simple_evaluate`` call per group), and
    the raw harness output is reduced to a small set of headline metrics.

    Args:
        model_obj (PreTrainedModel): The Hugging Face/PyTorch model object.
        tokenizer (PreTrainedTokenizer): The associated tokenizer.
        tasks (list[str | dict]): Tasks to run. Each entry is either a task
            name (evaluated 0-shot) or a dict with key 'name' (str) and an
            optional 'num_fewshot' (int, defaults to 0). Exact repeats are
            evaluated once. If the same task is requested with different
            few-shot settings, only one set of results is kept (a warning
            says which), because results are keyed by task name.
        device (str): Device to run evaluation on (e.g., 'cuda', 'cpu').
            Defaults to 'cuda'.
        limit (int, optional): Number of samples per task for quick testing.
            If None, the full dataset is used.
        batch_size (int): Batch size used by the HFLM wrapper. Defaults to 4.
        save_results (bool): If True, writes the results plus run metadata to
            ``eval_results_<YYYYmmdd_HHMMSS>.json`` in the current directory.
            Defaults to False.

    Returns:
        dict: ``{task_name: {metric_name: value}}`` where every value is a
            string formatted for display (e.g. ``'0.6500'``). Metric names are
            drawn from 'perplexity', 'word_perplexity', 'bits_per_byte',
            'accuracy', 'acc_norm', 'f1' and 'exact_match'; if a task reports
            none of these, all of its numeric metrics are returned instead.

    Raises:
        KeyError: If a task dict has no 'name' key.

    Example:
        >>> tasks = [{"name": "hellaswag", "num_fewshot": 5}, "wikitext"]
        >>> results = model_evaluation(model, tokenizer, tasks, save_results=True)
    """
    # Tolerate model objects without a `config._name_or_path` attribute.
    model_name = getattr(getattr(model_obj, "config", None), "_name_or_path", "unknown")
    print(f"Starting lm-eval on model '{model_name}' for tasks: {tasks}")

    from collections import defaultdict

    from lm_eval import evaluator
    from lm_eval.models.huggingface import HFLM

    # Wrap the local model object and tokenizer for lm-eval.
    # batch_size must be given to the wrapper itself: simple_evaluate() only
    # applies its own batch_size argument when it builds the model from a
    # string, so for a pre-initialised HFLM it would otherwise be ignored.
    model_wrapper = HFLM(
        pretrained=model_obj,
        tokenizer=tokenizer,
        device=str(device),
        batch_size=batch_size,
    )

    # Group task names by few-shot setting so each group needs a single run.
    fewshot_groups = defaultdict(list)
    for task in tasks:
        if isinstance(task, dict):
            task_name = task["name"]
            num_fewshot = task.get("num_fewshot", 0)
        else:
            # Backward compatibility: simple string entries default to 0-shot
            task_name, num_fewshot = task, 0
        # Skip exact repeats so a task is not evaluated twice in one group.
        if task_name not in fewshot_groups[num_fewshot]:
            fewshot_groups[num_fewshot].append(task_name)

    # Build the per-task config in the same order the groups are evaluated,
    # so that when a task appears in several groups the recorded few-shot
    # value matches the results that survive the merge below.
    task_fewshot_map = {}
    for num_fewshot, task_list in fewshot_groups.items():
        for task_name in task_list:
            if task_name in task_fewshot_map:
                print(
                    f"Warning: task '{task_name}' was requested with more than one "
                    f"few-shot setting; only the {num_fewshot}-shot results are kept."
                )
            task_fewshot_map[task_name] = num_fewshot

    # `is not None` so that an explicit limit of 0 is not reported as "full dataset".
    limit_str = f"(limit={limit})" if limit is not None else "(full dataset)"
    print(f"\n{'='*70}")
    print(f"Tasks grouped by few-shot: {dict(fewshot_groups)} {limit_str}")
    print(f"Task-level few-shot config: {task_fewshot_map}")
    print(f"{'='*70}\n")

    # Run evaluation for each few-shot group and merge the per-task results.
    all_results = {}
    for num_fewshot, task_list in fewshot_groups.items():
        print(f"Evaluating {len(task_list)} task(s) with {num_fewshot}-shot learning...")
        results = evaluator.simple_evaluate(
            model=model_wrapper,
            tasks=task_list,
            num_fewshot=num_fewshot,
            limit=limit,
            device=str(device),
            batch_size=batch_size,
        )
        all_results.update(results["results"])

    # Display name -> (candidate keys in the raw results, format spec).
    # The first candidate key present in a task's results is used.
    priority_metrics = {
        'perplexity': (('perplexity,none', 'perplexity'), '.2f'),
        'word_perplexity': (('word_perplexity,none', 'word_perplexity'), '.2f'),
        'bits_per_byte': (('bits_per_byte,none', 'bits_per_byte'), '.4f'),
        'accuracy': (('acc,none', 'acc'), '.4f'),
        'acc_norm': (('acc_norm,none', 'acc_norm'), '.4f'),
        'f1': (('f1,none', 'f1'), '.4f'),
        'exact_match': (('exact_match,none', 'em'), '.4f'),
    }

    # Format results for clean display
    formatted_results = {}
    for task_name, res in all_results.items():
        task_metrics = {}

        for metric_name, (possible_keys, spec) in priority_metrics.items():
            for key in possible_keys:
                if key not in res:
                    continue
                try:
                    task_metrics[metric_name] = format(res[key], spec)
                except (TypeError, ValueError):
                    # Non-numeric placeholder value: try the next candidate
                    # key instead of aborting after the evaluation has run.
                    continue
                break

        # If no priority metrics found, fall back to all numeric metrics
        if not task_metrics:
            task_metrics = {
                k: f"{v:.4f}" for k, v in res.items()
                if isinstance(v, (int, float))
            }

        formatted_results[task_name] = task_metrics

    # Save results to JSON file if requested
    if save_results:
        import json
        from datetime import datetime

        # One clock reading, so the metadata timestamp and the filename agree.
        now = datetime.now()
        output_data = {
            "metadata": {
                "model": model_name,
                "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"),
                "device": str(device),
                "limit": limit,
                "batch_size": batch_size,
                "tasks_config": task_fewshot_map
            },
            "results": formatted_results
        }

        timestamp_file = now.strftime("%Y%m%d_%H%M%S")
        output_file = f"eval_results_{timestamp_file}.json"
        # Explicit encoding keeps the file identical across platforms.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2)
        print(f"\nResults saved to: {output_file}")

    return formatted_results

In [None]:
# Run evaluation
# Each entry is either a plain task name (evaluated 0-shot by model_evaluation)
# or a dict that sets an explicit 'num_fewshot' for that task.
tasks = [
    "boolq",  # 0-shot by default
    {"name": "piqa", "num_fewshot": 5},  # 5-shot
    {"name": "arc_easy", "num_fewshot": 5},
    {"name": "arc_challenge", "num_fewshot": 25},
]

results = model_evaluation(
    model_obj=model,
    tokenizer=tokenizer,
    tasks=tasks,
    device='cuda',
    limit=100,  # evaluate only 100 samples per task to keep the demo fast
    batch_size=4,
    save_results=True
)

In [4]:
# Inspect the dictionary returned by model_evaluation
results

{'boolq': {'accuracy': '0.6500'},
 'arc_easy': {'accuracy': '0.6000', 'acc_norm': '0.6600'},
 'piqa': {'accuracy': '0.6700', 'acc_norm': '0.7100'},
 'arc_challenge': {'accuracy': '0.2200', 'acc_norm': '0.2200'}}

In [5]:
# Pretty-print the evaluation summary: a banner, then one section per task
# listing each metric on its own indented line.
banner = "="*70
print("\n" + banner)
print("EVALUATION RESULTS")
print(banner)
for task_name in results:
    print(f"\n{task_name}:")
    metrics = results[task_name]
    for metric in metrics:
        value = metrics[metric]
        print(f"  {metric}: {value}")


EVALUATION RESULTS

boolq:
  accuracy: 0.6500

arc_easy:
  accuracy: 0.6000
  acc_norm: 0.6600

piqa:
  accuracy: 0.6700
  acc_norm: 0.7100

arc_challenge:
  accuracy: 0.2200
  acc_norm: 0.2200
