In [None]:
# Google Colab Setup
import sys
import os

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("üîß Running in Google Colab - Setting up environment...")
    if not os.path.exists('transformer_from_scratch'):
        print("üì• Cloning repository...")
        !git clone https://github.com/melhzy/transformer_from_scratch.git
        print("‚úÖ Repository cloned!")
    os.chdir('transformer_from_scratch')
    print("üì¶ Installing dependencies...")
    !pip install -q torch torchvision matplotlib seaborn numpy pandas tqdm nltk rouge-score
    print("‚úÖ Dependencies installed!")
    if '/content/transformer_from_scratch' not in sys.path:
        sys.path.insert(0, '/content/transformer_from_scratch')
    print("‚úÖ Setup complete!")
else:
    print("üíª Running locally - no setup needed.")

In [None]:
# Import libraries
import sys
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import math
from collections import defaultdict, Counter

if not IN_COLAB:
    sys.path.insert(0, str(Path.cwd().parent))

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

# NLTK for BLEU
import nltk
try:
    from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
except:
    nltk.download('punkt')
    from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

# ROUGE
try:
    from rouge_score import rouge_scorer
except ImportError:
    print("Installing rouge-score...")
    !pip install -q rouge-score
    from rouge_score import rouge_scorer

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"‚úÖ Device: {device}")
print(f"‚úÖ PyTorch version: {torch.__version__}")

## 1. Perplexity: Language Model Evaluation 📐

**Perplexity** measures how well a language model predicts text.

### Mathematical Definition:

$$\text{PPL} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N} \log P(w_i | w_{<i})\right)$$

Where:
- $N$ = number of tokens
- $P(w_i | w_{<i})$ = probability of token $w_i$ given previous tokens

**Lower is better!** A perplexity of 100 means the model is as confused as if it had to choose uniformly from 100 words.

In [None]:
def compute_perplexity(model, dataloader, device):
    """
    Compute token-level perplexity of a language model over a dataset.

    Each batch is scored with teacher forcing: the model receives the sequence
    without its final token and the outputs are compared against the labels
    shifted left by one position. Label positions equal to -100 are excluded
    from both the loss sum and the token count.

    Args:
        model: Callable invoked as ``model(src, tgt)`` with the same tensor for
            both arguments; it is switched to eval mode here. Its output is
            treated as logits whose last dimension is the vocabulary
            (assumed to be (batch, seq_len - 1, vocab) -- TODO confirm against
            the model definition).
        dataloader: Iterable of batches, each a mapping with ``'input_ids'``
            and ``'labels'`` tensors (indexed as 2-D here).
        device: torch device the batch tensors are moved to.

    Returns:
        Tuple ``(perplexity, avg_loss)`` where ``avg_loss`` is the summed
        cross-entropy divided by the number of scored tokens and
        ``perplexity`` is ``exp(avg_loss)`` (``inf`` if the exponential
        overflows a float).

    Raises:
        ValueError: If no token was scored (empty dataloader, or every label
            is -100), because the average loss is undefined in that case.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Computing perplexity"):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass: the sequence minus its last token is used as both
            # model arguments; output position t is scored against label t + 1.
            context = input_ids[:, :-1]
            logits = model(context, context)

            logits_flat = logits.reshape(-1, logits.size(-1))
            labels_flat = labels[:, 1:].reshape(-1)

            # ignore_index drops padding labels (-100) from the summed loss,
            # which is equivalent to indexing both tensors with a boolean mask.
            loss = F.cross_entropy(
                logits_flat,
                labels_flat,
                ignore_index=-100,
                reduction='sum'
            )

            total_loss += loss.item()
            total_tokens += (labels_flat != -100).sum().item()

    if total_tokens == 0:
        raise ValueError(
            "Cannot compute perplexity: no non-ignored label tokens were found "
            "(empty dataloader or all labels are -100)."
        )

    # Perplexity = exp(average loss)
    avg_loss = total_loss / total_tokens
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # exp() of a very large loss does not fit in a float; report infinity.
        perplexity = float('inf')

    return perplexity, avg_loss


# Example (dummy data): perplexity is exp(loss), shown for three sample losses
print("Example perplexity calculation:")
for example_loss in (2.5, 1.0, 0.5):
    print(f"  Loss: {example_loss} ‚Üí PPL: {math.exp(example_loss):.2f}")
print("\nüí° Lower perplexity = better model!")

## 2. BLEU Score: Translation & Generation Quality 🌐

**BLEU (Bilingual Evaluation Understudy)** measures n-gram overlap between generated and reference text.

### Formula:

$$\text{BLEU} = BP \cdot \exp\left(\sum_{n=1}^{N} w_n \log p_n\right)$$

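Where:
- $p_n$ = modified (clipped) precision of n-grams of order $n$
- $w_n$ = weight given to each n-gram order (uniform, $w_n = 1/N$, in standard BLEU)
- $BP$ = brevity penalty (penalizes short outputs)
- Typically $N=4$ (up to 4-grams)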

**Range: 0-1 (or 0-100), higher is better**

In [None]:
def compute_bleu(
    predictions: List[str],
    references: List[str],
    max_n: int = 4
) -> Dict[str, float]:
    """
    Corpus-level BLEU for every n-gram order from 1 up to ``max_n``.

    Texts are lower-cased and split on whitespace. Each prediction is paired
    with exactly one reference, and NLTK's ``method1`` smoothing is applied.

    Args:
        predictions: Generated texts.
        references: Reference texts, aligned with ``predictions``.
        max_n: Highest n-gram order to report.

    Returns:
        Mapping ``'BLEU-1'`` ... ``'BLEU-<max_n>'`` to scores multiplied by 100.
    """
    hypotheses = []
    for text in predictions:
        hypotheses.append(text.lower().split())

    # corpus_bleu expects a list of references per hypothesis, hence the
    # extra level of nesting around the single reference.
    reference_sets = []
    for text in references:
        reference_sets.append([text.lower().split()])

    smooth = SmoothingFunction().method1

    def _bleu_percent(order):
        """Score with uniform weights over orders 1..order, zero-padded up to four entries."""
        uniform = [1.0/order] * order
        padding = [0] * (4 - order)
        raw = corpus_bleu(
            reference_sets,
            hypotheses,
            weights=uniform + padding,
            smoothing_function=smooth
        )
        return raw * 100  # Convert to percentage

    return {f'BLEU-{order}': _bleu_percent(order) for order in range(1, max_n + 1)}


# Example: two prediction/reference pairs reused by later cells
predictions = ["the cat sat on the mat", "hello world"]
references = ["the cat is sitting on the mat", "hello there world"]

bleu = compute_bleu(predictions, references)
print("\nExample BLEU Scores:")
for metric, score in bleu.items():
    print("  " + metric + ": " + format(score, ".2f"))

print("\nüí° BLEU measures n-gram overlap (higher = more similar)")

## 3. ROUGE Score: Summarization Quality 📄

**ROUGE (Recall-Oriented Understudy for Gisting Evaluation)** measures recall of n-grams.

### Variants:
- **ROUGE-1**: Unigram overlap
- **ROUGE-2**: Bigram overlap
- **ROUGE-L**: Longest Common Subsequence

Each returns:
- **Precision**: What % of generated n-grams appear in reference?
- **Recall**: What % of reference n-grams appear in generated?
- **F1**: Harmonic mean of precision and recall

In [None]:
def compute_rouge(
    predictions: List[str],
    references: List[str]
) -> Dict[str, Dict[str, float]]:
    """
    Average ROUGE-1, ROUGE-2 and ROUGE-L over prediction/reference pairs.

    Args:
        predictions: Generated texts.
        references: Reference texts, paired positionally with ``predictions``.

    Returns:
        ``{rouge_type: {'precision': p, 'recall': r, 'f1': f}}`` with every
        value being the mean over all pairs multiplied by 100. An empty
        dictionary is returned when there are no pairs.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # One result mapping per pair; the scorer takes the reference first.
    pair_results = [scorer.score(ref, pred) for pred, ref in zip(predictions, references)]
    if not pair_results:
        return {}

    avg_scores = {}
    for metric_name in pair_results[0]:
        per_pair = [result[metric_name] for result in pair_results]
        avg_scores[metric_name] = {
            'precision': np.mean([s.precision for s in per_pair]) * 100,
            'recall': np.mean([s.recall for s in per_pair]) * 100,
            'f1': np.mean([s.fmeasure for s in per_pair]) * 100
        }

    return avg_scores


# Example: score the same two pairs used for BLEU
rouge = compute_rouge(predictions, references)
print("\nExample ROUGE Scores:")
for metric_name, scores in rouge.items():
    print(f"\n{metric_name.upper()}:")
    for label, key in (("Precision", "precision"), ("Recall", "recall"), ("F1", "f1")):
        print(f"  {label}: {scores[key]:.2f}")

print("\nüí° ROUGE emphasizes recall (how much of reference is covered)")

## 4. Exact Match & F1 (QA Tasks) ❓

For question answering and extraction tasks.

In [None]:
def normalize_text(text: str) -> str:
    """Lower-case ``text`` and collapse every run of whitespace into one space."""
    return ' '.join(text.lower().split())


def _check_same_length(predictions: List[str], references: List[str]) -> None:
    """Raise ValueError when the two lists cannot be paired one-to-one."""
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )


def compute_exact_match(predictions: List[str], references: List[str]) -> float:
    """
    Compute exact match accuracy.

    A pair matches when both texts are equal after ``normalize_text``.

    Args:
        predictions: Predicted answers.
        references: Reference answers, aligned with ``predictions``.

    Returns:
        Percentage (0-100) of matching pairs; 0.0 for empty input.

    Raises:
        ValueError: If the lists differ in length. Previously ``zip`` silently
            dropped the extra items while the denominator still counted them.
    """
    _check_same_length(predictions, references)
    if not predictions:
        # Nothing to score; avoids dividing by zero.
        return 0.0

    matches = sum(
        normalize_text(pred) == normalize_text(ref)
        for pred, ref in zip(predictions, references)
    )
    return (matches / len(predictions)) * 100


def compute_token_f1(predictions: List[str], references: List[str]) -> float:
    """
    Compute token-level F1 score.

    Tokens are the whitespace-separated words of the normalized texts and the
    overlap is counted as a multiset intersection.

    Args:
        predictions: Predicted answers.
        references: Reference answers, aligned with ``predictions``.

    Returns:
        Mean F1 over all pairs as a percentage (0-100); 0.0 for empty input.

    Raises:
        ValueError: If the lists differ in length.
    """
    _check_same_length(predictions, references)
    if not predictions:
        # np.mean([]) would return NaN with a RuntimeWarning.
        return 0.0

    f1_scores = []

    for pred, ref in zip(predictions, references):
        pred_tokens = normalize_text(pred).split()
        ref_tokens = normalize_text(ref).split()

        if not pred_tokens or not ref_tokens:
            # With an empty side F1 is 1 only when both sides are empty. This
            # keeps the score consistent with compute_exact_match, which
            # counts an empty/empty pair as a match.
            f1_scores.append(float(pred_tokens == ref_tokens))
            continue

        # Multiset intersection: a repeated token counts at most as often as
        # it occurs on both sides.
        common = Counter(pred_tokens) & Counter(ref_tokens)
        num_same = sum(common.values())

        if num_same == 0:
            f1_scores.append(0.0)
            continue

        precision = num_same / len(pred_tokens)
        recall = num_same / len(ref_tokens)
        f1_scores.append(2 * (precision * recall) / (precision + recall))

    return float(np.mean(f1_scores)) * 100


# Example: three short answers, one of which ("4" vs "four") does not match
qa_predictions = ["Paris", "4", "blue"]
qa_references = ["Paris", "four", "blue"]

em, f1 = (
    metric(qa_predictions, qa_references)
    for metric in (compute_exact_match, compute_token_f1)
)

print("\nQA Metrics:", f"  Exact Match: {em:.2f}%", f"  Token F1: {f1:.2f}%", sep="\n")
print("\nüí° Exact Match requires perfect answer, F1 allows partial credit")

## 5. Comprehensive Evaluation Suite 🔬

Combine all metrics for thorough evaluation.

In [None]:
class ModelEvaluator:
    """
    Comprehensive evaluation suite for fine-tuned LLMs.

    Bundles a model, tokenizer and device, and offers text generation, scoring
    of a test set with the notebook's BLEU / ROUGE / exact-match / token-F1
    helpers, and a formatted report of the results.
    """
    def __init__(self, model, tokenizer, device):
        """
        Args:
            model: Callable invoked as ``model(src, tgt)``; its output is
                indexed as ``logits[0, -1, :]`` during generation.
            tokenizer: Object providing ``encode``, ``decode`` and
                ``eos_token_id``.
            device: torch device on which input tensors are created.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def generate(
        self,
        prompt: str,
        max_length: int = 50,
        temperature: float = 1.0,
        do_sample: bool = False
    ) -> str:
        """
        Generate text from prompt, one token at a time.

        Args:
            prompt: Text to continue.
            max_length: Maximum number of NEW tokens to append to the prompt.
            temperature: Softmax temperature. Only used when ``do_sample`` is
                True: greedy decoding takes the argmax, which dividing the
                logits by a positive temperature cannot change.
            do_sample: If True, sample the next token from
                ``softmax(logits / temperature)``; if False (default), decode
                greedily as before.

        Returns:
            Decoded text of the prompt tokens followed by the generated
            tokens, with special tokens skipped.

        Raises:
            ValueError: If the prompt encodes to zero tokens, or if
                ``do_sample`` is True and ``temperature`` is not positive.
        """
        if do_sample and temperature <= 0:
            raise ValueError("temperature must be > 0 when do_sample=True")

        self.model.eval()

        generated = list(self.tokenizer.encode(prompt, add_special_tokens=True))
        if not generated:
            raise ValueError("prompt encoded to zero tokens; nothing to condition on")

        with torch.no_grad():
            for _ in range(max_length):
                # Feed the WHOLE sequence produced so far. The previous code
                # passed input_tensor[:, :-1], so every step predicted from a
                # context that lacked the newest token (and a one-token prompt
                # produced an empty input).
                # NOTE(review): assumes the logits at position t predict token
                # t + 1, matching how sequences are scored for perplexity in
                # this notebook -- confirm against the model definition.
                input_tensor = torch.tensor([generated], dtype=torch.long).to(self.device)
                logits = self.model(input_tensor, input_tensor)

                next_token_logits = logits[0, -1, :]
                if do_sample:
                    probs = torch.softmax(next_token_logits / temperature, dim=-1)
                    next_token_id = torch.multinomial(probs, num_samples=1).item()
                else:
                    next_token_id = torch.argmax(next_token_logits).item()

                # Stop before appending the end-of-sequence token.
                if next_token_id == self.tokenizer.eos_token_id:
                    break

                generated.append(next_token_id)

        return self.tokenizer.decode(generated, skip_special_tokens=True)

    def evaluate_generation(
        self,
        test_data: List[Dict],
        prompt_template: str = "Instruction: {instruction} Input: {input} Output:"
    ) -> Tuple[Dict[str, float], List[str], List[str]]:
        """
        Evaluate generation quality on test set.

        Args:
            test_data: Items providing every field used by ``prompt_template``
                plus an ``'output'`` entry holding the reference text.
            prompt_template: ``str.format`` template filled from each item.

        Returns:
            Tuple ``(results, predictions, references)``. ``results`` holds
            the BLEU scores, ``'<rouge_type>_f1'`` for each ROUGE type,
            ``'exact_match'`` and ``'token_f1'``; the two lists contain the
            generated and reference texts in input order.
        """
        predictions = []
        references = []

        print("Generating predictions...")
        for item in tqdm(test_data):
            prompt = prompt_template.format(**item)
            generated = self.generate(prompt, max_length=30)

            # Keep only the text after the prompt. The position is looked up
            # explicitly: the old check used `in` but then always cut
            # len(prompt) characters from the start, which removed the wrong
            # span whenever the prompt did not begin at index 0. If the decoded
            # text does not contain the prompt verbatim it is left untouched.
            prompt_at = generated.find(prompt)
            if prompt_at != -1:
                generated = generated[prompt_at + len(prompt):].strip()

            predictions.append(generated)
            references.append(item['output'])

        # Compute all metrics
        print("\nComputing metrics...")
        results = {}

        # BLEU
        bleu_scores = compute_bleu(predictions, references)
        results.update(bleu_scores)

        # ROUGE (only the F1 of each variant is kept)
        rouge_scores = compute_rouge(predictions, references)
        for metric_name, scores in rouge_scores.items():
            results[f'{metric_name}_f1'] = scores['f1']

        # Exact Match & F1
        results['exact_match'] = compute_exact_match(predictions, references)
        results['token_f1'] = compute_token_f1(predictions, references)

        return results, predictions, references

    def print_results(self, results: Dict[str, float]):
        """
        Pretty print evaluation results.

        Entries are grouped by key: keys containing ``'BLEU'``, keys
        containing ``'rouge'``, then ``'exact_match'`` / ``'token_f1'`` when
        present.
        """
        print("\n" + "="*60)
        print("üìä EVALUATION RESULTS")
        print("="*60)

        # Group by metric type
        print("\nüåê BLEU Scores (Translation Quality):")
        for k, v in results.items():
            if 'BLEU' in k:
                print(f"  {k}: {v:.2f}")

        print("\nüìÑ ROUGE Scores (Summarization Quality):")
        for k, v in results.items():
            if 'rouge' in k:
                print(f"  {k.upper()}: {v:.2f}")

        print("\n‚ùì QA Metrics:")
        if 'exact_match' in results:
            print(f"  Exact Match: {results['exact_match']:.2f}%")
        if 'token_f1' in results:
            print(f"  Token F1: {results['token_f1']:.2f}%")

        print("\n" + "="*60)


print("‚úÖ ModelEvaluator class created!")

## 6. Example Evaluation 🧪

Let's evaluate on a test set (using dummy data for demonstration).

In [None]:
# Create test data: (instruction, input, expected output) triples as dicts
test_data = [
    {"instruction": instruction, "input": source_text, "output": target_text}
    for instruction, source_text, target_text in (
        ("Translate to French", "Good morning", "Bonjour"),
        ("Answer the question", "What is 3+3?", "3+3 equals 6"),
        ("Summarize", "AI is amazing", "AI is great"),
    )
]

# For demonstration, let's simulate predictions
# (In practice, you'd load your trained model from Tutorial 4)
simulated_predictions = ["bonjour", "the answer is 6", "artificial intelligence is amazing"]

references = [example['output'] for example in test_data]

# Compute metrics
print("Computing evaluation metrics...\n")

bleu = compute_bleu(simulated_predictions, references)
rouge = compute_rouge(simulated_predictions, references)
em = compute_exact_match(simulated_predictions, references)
f1 = compute_token_f1(simulated_predictions, references)

# Print results
separator = "=" * 60
print(separator)
print("üìä EVALUATION RESULTS (Simulated)")
print(separator)

print("\nüåê BLEU Scores:")
for k, v in bleu.items():
    print(f"  {k}: {v:.2f}")

print("\nüìÑ ROUGE Scores:")
for metric_name, scores in rouge.items():
    print(f"  {metric_name.upper()} F1: {scores['f1']:.2f}")

print("\n‚ùì QA Metrics:")
print(f"  Exact Match: {em:.2f}%", f"  Token F1: {f1:.2f}%", sep="\n")

print("\n" + separator)

# Show examples side by side (numbered from 1)
print("\nüìù Generation Examples:\n")
for example_no, (pred, ref) in enumerate(zip(simulated_predictions, references), start=1):
    print(f"Example {example_no}:")
    print(f"  Reference:  {ref}")
    print(f"  Predicted:  {pred}")
    print()

## 7. Visualize Evaluation Results 📊

In [None]:
def visualize_metrics(results: Dict[str, float], title: str = "Model Evaluation"):
    """
    Draw a 2x2 grid of bar charts from a flat metrics dictionary.

    Panels: BLEU entries (keys containing 'BLEU'), ROUGE F1 entries (keys
    containing both 'rouge' and 'f1'), QA metrics ('exact_match' and
    'token_f1' when present) and a summary of BLEU-4 / ROUGE-L / Token F1.
    The first three panels stay empty when they have no data; the summary is
    always drawn, with missing values shown as 0.

    Args:
        results: Metric name -> score.
        title: Figure-level title.
    """
    def _bar_panel(ax, data, color, ylabel, panel_title):
        """Bar chart on ``ax`` with the shared 0-100 y-range and a light y-grid."""
        ax.bar(data.keys(), data.values(), color=color)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        ax.set_ylim([0, 100])
        ax.grid(axis='y', alpha=0.3)

    _, axes = plt.subplots(2, 2, figsize=(14, 10))
    (bleu_ax, rouge_ax), (qa_ax, summary_ax) = axes

    # 1. BLEU scores
    bleu_metrics = {name: value for name, value in results.items() if 'BLEU' in name}
    if bleu_metrics:
        _bar_panel(bleu_ax, bleu_metrics, 'skyblue', 'Score', 'BLEU Scores')

    # 2. ROUGE scores (labels upper-cased for display)
    rouge_metrics = {
        name.upper(): value
        for name, value in results.items()
        if 'rouge' in name and 'f1' in name
    }
    if rouge_metrics:
        _bar_panel(rouge_ax, rouge_metrics, 'lightcoral', 'F1 Score', 'ROUGE F1 Scores')

    # 3. QA metrics
    qa_metrics = {
        label: results[key]
        for label, key in (('Exact Match', 'exact_match'), ('Token F1', 'token_f1'))
        if key in results
    }
    if qa_metrics:
        _bar_panel(qa_ax, qa_metrics, 'lightgreen', 'Score (%)', 'QA Metrics')

    # 4. Overall summary (always drawn)
    summary_metrics = {
        'BLEU-4': results.get('BLEU-4', 0),
        'ROUGE-L': results.get('rougeL_f1', 0),
        'Token F1': results.get('token_f1', 0),
    }
    _bar_panel(
        summary_ax,
        summary_metrics,
        ['skyblue', 'lightcoral', 'lightgreen'],
        'Score',
        'Overall Performance',
    )

    plt.suptitle(title, fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.show()


# Visualize our simulated results: flatten every metric into one dictionary
all_results = dict(bleu)
all_results.update((f'{k}_f1', v['f1']) for k, v in rouge.items())
all_results['exact_match'] = em
all_results['token_f1'] = f1
visualize_metrics(all_results, title="Fine-Tuned Model Evaluation (Simulated)")

## 8. Before/After Comparison 🔄

Compare base model vs fine-tuned model.

In [None]:
def compare_models(base_results: Dict[str, float], finetuned_results: Dict[str, float]):
    """
    Compare base vs fine-tuned model performance.

    Draws a grouped bar chart for BLEU-4, ROUGE-1/2/L F1 and Token F1,
    annotates each group with the relative change, and prints a summary
    table. Metrics missing from either dictionary are treated as 0.

    Args:
        base_results: Metric name -> score for the base model.
        finetuned_results: Metric name -> score for the fine-tuned model.
    """
    # Select key metrics
    metrics = ['BLEU-4', 'rouge1_f1', 'rouge2_f1', 'rougeL_f1', 'token_f1']
    metric_labels = ['BLEU-4', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'Token F1']

    base_scores = [base_results.get(m, 0) for m in metrics]
    ft_scores = [finetuned_results.get(m, 0) for m in metrics]

    # Create comparison plot
    x = np.arange(len(metric_labels))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(x - width/2, base_scores, width, label='Base Model', color='lightgray')
    ax.bar(x + width/2, ft_scores, width, label='Fine-Tuned', color='green')

    ax.set_xlabel('Metrics', fontsize=12)
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('Base vs Fine-Tuned Model Comparison', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(metric_labels)
    ax.legend(fontsize=12)
    ax.set_ylim([0, 100])
    ax.grid(axis='y', alpha=0.3)

    # Annotate relative change. The sign comes from the format spec and the
    # colour from the direction of the change; a hard-coded "+" and green
    # rendered regressions as e.g. "+-12.3%" in green.
    for i, (base, ft) in enumerate(zip(base_scores, ft_scores)):
        if base > 0:  # relative change is undefined for a zero baseline
            change_pct = ((ft - base) / base) * 100
            ax.text(i, max(base, ft) + 3, f"{change_pct:+.1f}%",
                    ha='center', fontsize=10, fontweight='bold',
                    color='green' if change_pct >= 0 else 'red')

    plt.tight_layout()
    plt.show()

    # Print summary
    print("\nüìä Improvement Summary:\n")
    for label, base, ft in zip(metric_labels, base_scores, ft_scores):
        delta = ft - base
        # A percentage cannot be computed against a zero baseline.
        pct_text = f"{delta / base * 100:+.1f}%" if base > 0 else "n/a"
        print(f"{label:12} | Base: {base:5.2f} | Fine-tuned: {ft:5.2f} | {delta:+6.2f} ({pct_text})")


# Simulate base model results (typically lower)
base_results = dict([
    ('BLEU-4', 15.0),
    ('rouge1_f1', 30.0),
    ('rouge2_f1', 10.0),
    ('rougeL_f1', 25.0),
    ('token_f1', 35.0),
])

# Our fine-tuned results are the metrics gathered for the simulated predictions
finetuned_results = all_results

compare_models(base_results=base_results, finetuned_results=finetuned_results)

## 9. Summary & Best Practices 📝

### What We Learned:

✅ **Perplexity**: Language modeling quality  
✅ **BLEU**: Translation and generation n-gram overlap  
✅ **ROUGE**: Summarization recall and overlap  
✅ **Exact Match & F1**: Question answering accuracy  
✅ **Comprehensive evaluation suite**  
✅ **Visualization techniques**  
✅ **Before/after comparison**  

### Choosing Metrics:

| Task | Primary Metrics | Secondary Metrics |
|------|----------------|------------------|
| **Translation** | BLEU-4 | ROUGE-L, Perplexity |
| **Summarization** | ROUGE-1, ROUGE-L | BLEU-4 |
| **Question Answering** | Exact Match, F1 | ROUGE-L |
| **Text Generation** | Perplexity, BLEU | Human evaluation |
| **Dialogue** | Perplexity | Human evaluation, Diversity |

### Metric Interpretation:

**Perplexity:**
- <20: Excellent
- 20-50: Good
- 50-100: Acceptable
- >100: Poor

**BLEU (0-100):**
- >40: Excellent
- 30-40: Good
- 20-30: Acceptable
- <20: Poor

**ROUGE F1 (0-100):**
- >50: Excellent
- 40-50: Good
- 30-40: Acceptable
- <30: Poor

### Important Considerations:

1. **Automatic metrics ≠ Human quality**
   - BLEU/ROUGE measure overlap, not meaning
   - Always validate with human evaluation
   - Check for hallucinations and factual errors

2. **Task-specific evaluation**
   - Use domain-appropriate metrics
   - Consider task-specific constraints
   - Balance multiple metrics

3. **Statistical significance**
   - Test on large, diverse datasets
   - Report confidence intervals
   - Use multiple random seeds

4. **Beyond metrics**
   - Inference speed
   - Memory usage
   - Robustness to adversarial inputs
   - Fairness and bias

### Production Evaluation:

```text
# Comprehensive evaluation pipeline
1. Automatic metrics (BLEU, ROUGE, etc.)
2. Human evaluation (fluency, relevance, factuality)
3. A/B testing in production
4. Monitor user feedback
5. Continuous evaluation on new data
```

---

## 🎓 Congratulations!

You've completed the entire LLM Fine-Tuning tutorial series!

### What you've mastered:

1. ✅ **Tutorial 1**: Fine-tuning concepts and LoRA theory
2. ✅ **Tutorial 2**: LoRA implementation from scratch
3. ✅ **Tutorial 3**: Data preparation and tokenization
4. ✅ **Tutorial 4**: Complete instruction tuning pipeline
5. ✅ **Tutorial 5**: Comprehensive evaluation metrics

### Next Steps:

- Scale to larger models (7B, 13B, 70B)
- Try QLoRA for even lower memory
- Experiment with different tasks
- Deploy with inference optimization
- Contribute to open source!

---

## 📚 Resources

**Papers:**
- [papers/DeepSeek-R1-paper.pdf](../papers/DeepSeek-R1-paper.pdf) - Complete methodology
- BLEU: https://www.aclweb.org/anthology/P02-1040.pdf
- ROUGE: https://www.aclweb.org/anthology/W04-1013.pdf

**Tools:**
- Hugging Face Evaluate: https://huggingface.co/docs/evaluate
- NLTK: https://www.nltk.org/
- ROUGE Score: https://github.com/google-research/google-research/tree/master/rouge

**Related:**
- Complete tutorial series: [llm-fine-tune/](./)
- Transformer foundations: [transformer-foundation/](../transformer-foundation/)
- Source code: [src/](../src/)

---

**Thank you for completing this tutorial series! Happy fine-tuning! 🚀**