In [None]:
# Bootstrap: Import helpers and create directories
import sys
from pathlib import Path

# Make the repository root importable: when the notebook is launched from the
# `notebooks/` folder the root is one level up, otherwise it is the cwd itself.
launch_dir = Path.cwd()
repo_root = launch_dir if launch_dir.name != 'notebooks' else launch_dir.parent
repo_root_entry = str(repo_root)
if repo_root_entry not in sys.path:
    sys.path.insert(0, repo_root_entry)

from utils.nb_helpers import run_module, run_script
print("✅ Notebook helpers loaded - ready for evaluation!")


# 04 - Evaluation & Verification

## Learning Goals

* Compute and interpret simple metrics (accuracy, confusion matrix).
* Produce reproducible **artifacts** and a machine-readable **receipt** for CI.
* Understand why verifiable outputs matter in a classroom or production pipeline.

## You Should Be Able To...

- Run model evaluation and interpret results
- Understand confusion matrices and accuracy metrics
- Generate verification receipts for ML pipelines
- Identify when models meet deployment criteria
- Reflect on the complete ML development process

---

## Concepts

**Confusion matrix**: a table of true classes versus predicted classes that shows where the classifier makes mistakes, class by class.

**Reproducible artifacts**: model file, benchmark reports, quantization summary, evaluation report.

**Receipt**: a small JSON proving all required files were created and basic checks passed.

## Common Pitfalls

* Not running evaluation on held-out test data
* Misinterpreting confusion matrix results
* Forgetting to generate verification artifacts
* Not checking that all pipeline components work together

## Success Criteria

* ✅ `progress/receipt.json` says **PASS**
* ✅ You can explain what each artifact is and where it lives
* ✅ You can describe one change you'd make next (e.g., more data, different architecture)

---

## Setup & Environment Check


In [None]:
# ruff: noqa: E401
import os
import sys
from pathlib import Path

def cd_repo_root(max_levels: int = 5):
    """Switch the working directory to the repository root and make it importable.

    The search starts at the current working directory and then moves to each
    parent in turn. A directory counts as the repo root when it contains both
    ``verify.py`` and ``scripts/evaluate_onnx.py``. Once found, the root is
    prepended to ``sys.path`` (unless already listed) and becomes the working
    directory (unless it already is), in which case a short notice is printed.

    Args:
        max_levels: How many directories to inspect in total, counting the
            starting directory itself. The default of 5 therefore covers the
            current directory plus four ancestors.

    Raises:
        RuntimeError: If none of the inspected directories contains both
            marker files.
    """
    candidate = Path.cwd()
    for _ in range(max_levels):
        has_verify = (candidate / "verify.py").exists()
        has_eval_script = (candidate / "scripts" / "evaluate_onnx.py").exists()
        if has_verify and has_eval_script:
            if str(candidate) not in sys.path:
                sys.path.insert(0, str(candidate))
            if candidate != Path.cwd():
                os.chdir(candidate)
                print("-> Changed working dir to repo root:", os.getcwd())
            return
        # At the filesystem root `.parent` returns the same path, so the
        # remaining iterations simply re-check it before giving up.
        candidate = candidate.parent
    raise RuntimeError("Could not locate repo root")

cd_repo_root()

# Hints & Solutions helper (pure Jupyter, no extra deps)
from IPython.display import Markdown, display

def hints(*lines, solution: str | None = None, title="Need a nudge?"):
    """Display a titled list of collapsible hints, plus an optional solution.

    Each positional argument becomes its own ``<details>`` section labelled
    "Hint 1", "Hint 2", ... in the order given. When ``solution`` is a
    non-empty string it is stripped, fenced as Python code and appended as a
    final collapsible "Show solution" section. The assembled Markdown is
    rendered through IPython's ``display``; nothing is returned.
    """
    sections = [f"### {title}"]
    sections.extend(
        f"<details><summary>Hint {number}</summary>\n\n{text}\n\n</details>"
        for number, text in enumerate(lines, start=1)
    )
    if solution:
        # The python fence keeps syntax highlighting inside the collapsed block
        code = solution.strip()
        sections.append(
            "<details><summary><b>Show solution</b></summary>\n\n"
            f"```python\n{code}\n```\n"
            "</details>"
        )
    display(Markdown("\n\n".join(sections)))


## 🤔 What is evaluation and why do we need it?

**Evaluation** = test the model on data it has not seen during training.

**What we measure**:
- **Accuracy** — the fraction of predictions that are correct
- **Confusion matrix** — detailed breakdown of correct/incorrect predictions
- **Per-class performance** — how well the model performs for each class

**Why important**:
- **Validation** — ensures the model actually works
- **Debugging** — shows which classes are difficult
- **Comparison** — compare different models/settings

<details>
<summary>🔍 Click to see what a confusion matrix shows</summary>

**Confusion matrix**:
- **Diagonal** = correct predictions
- **Off-diagonal** = incorrect predictions
- **Per class** = each class's row and column; per-class precision and recall can be computed from them

</details>


In [None]:
# Run evaluation on our model
print("🔍 Running evaluation...")

# Always trains a new model here (this cell does not reuse one from earlier
# notebooks): runs the piedge_edukit.train module for a single epoch with the
# --fakedata and --no-pretrained flags and ./models_eval as the output directory.
# The next cell expects ./models_eval/model.onnx to exist afterwards.
# NOTE(review): `python` is resolved through PATH and may not be the kernel's
# interpreter — confirm the notebook is launched from the intended environment.
!python -m piedge_edukit.train --fakedata --no-pretrained --epochs 1 --batch-size 256 --output-dir ./models_eval


In [None]:
# Run evaluation with a limited number of samples (faster)
# Invokes scripts/evaluate_onnx.py on ./models_eval/model.onnx with --fakedata and --limit 32.
# Later cells look for ./reports/eval_summary.txt and ./reports/confusion_matrix.png —
# presumably written by this script; TODO confirm against evaluate_onnx.py.
!python scripts/evaluate_onnx.py --model ./models_eval/model.onnx --fakedata --limit 32


In [None]:
# Print the text summary left by the evaluation step, or say that it is absent
import os

eval_summary_path = "./reports/eval_summary.txt"
if not os.path.exists(eval_summary_path):
    print("❌ Evaluation report missing")
else:
    with open(eval_summary_path, "r") as summary_file:
        print("📊 Evaluation results:")
        print(summary_file.read())


In [None]:
# Render the training-curve image inline when the training step has produced it
from PIL import Image
from IPython.display import display

training_curves_path = "./reports/training_curves.png"
if not os.path.exists(training_curves_path):
    print("⚠️ Training curves missing – run training first.")
else:
    print("📈 Training curves:")
    display(Image.open(training_curves_path))


In [None]:
# Display the saved confusion-matrix image, or report that it has not been generated
import matplotlib.pyplot as plt
from PIL import Image

confusion_matrix_path = "./reports/confusion_matrix.png"
if not os.path.exists(confusion_matrix_path):
    print("❌ Confusion matrix missing")
else:
    print("📈 Confusion Matrix:")
    img = Image.open(confusion_matrix_path)
    # Explicit figure/axes pair instead of the implicit pyplot "current figure"
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.imshow(img)
    ax.axis('off')  # the PNG already carries its own axes and labels
    ax.set_title('Confusion Matrix')
    plt.show()


## 🔍 Automatic verification

**Verification** = automated checks ensuring the lesson works correctly.

**What is checked**:
- **Artifacts exist** — all required files are created
- **Benchmark works** — latency data is valid
- **Quantization works** — quantized model is created
- **Evaluation works** — confusion matrix and accuracy are available

**Result**: `progress/receipt.json` with PASS/FAIL status


In [None]:
# Run automatic verification
# verify.py is run from the current working directory; the following cell reads
# ./progress/receipt.json, which is presumably what this script writes — TODO confirm.
print("🔍 Running automatic verification...")
!python verify.py


In [None]:
# Analyze the receipt in detail
import json


def format_receipt(receipt: dict) -> list[str]:
    """Turn a parsed verification receipt into the lines of a readable report.

    Reads the keys ``pass``, ``timestamp``, ``checks`` (a list of dicts with
    ``ok``, ``name`` and ``reason``), ``metrics`` (a mapping) and ``artifacts``
    (an iterable). Every key is optional: a missing ``pass`` is reported as
    FAIL, a missing timestamp as ``unknown``, and missing collections simply
    leave their section empty, so a partial receipt never raises ``KeyError``.

    Args:
        receipt: The dictionary loaded from the receipt JSON file.

    Returns:
        The report as a list of lines, without trailing newlines. Empty
        strings mark the blank separator lines between sections.
    """
    lines = [
        "📋 Detailed receipt analysis:",
        f"Status: {'✅ PASS' if receipt.get('pass') else '❌ FAIL'}",
        f"Timestamp: {receipt.get('timestamp', 'unknown')}",
        "",
        "🔍 Checks:",
    ]
    for check in receipt.get('checks', []):
        status = "✅" if check.get('ok') else "❌"
        lines.append(
            f"  {status} {check.get('name', 'unnamed')}: {check.get('reason', 'n/a')}"
        )

    lines += ["", "📊 Metrics:"]
    lines += [f"  {metric}: {value}" for metric, value in receipt.get('metrics', {}).items()]

    lines += ["", "📁 Generated files:"]
    lines += [f"  - {artifact}" for artifact in receipt.get('artifacts', [])]
    return lines


receipt_path = "./progress/receipt.json"
if os.path.exists(receipt_path):
    try:
        # JSON text is UTF-8 by specification; do not rely on the platform default
        with open(receipt_path, "r", encoding="utf-8") as f:
            receipt = json.load(f)
    except json.JSONDecodeError as exc:
        # A truncated or hand-edited receipt should give a readable message, not a traceback
        print(f"❌ Receipt is not valid JSON: {exc}")
    else:
        print("\n".join(format_receipt(receipt)))
else:
    print("❌ Receipt missing")


## 🤔 Reflection Questions

### TODO R1 — Reflect on results (2–4 bullets)
- Where did quantization help / hurt?
- Do your p50 and p95 match expectations after warm-up?
- One change you would make before deploying.

<details><summary>Hint</summary>
Tie your answers back to the goals: correctness, latency, and determinism. Falling back to FP32 is fine if INT8 regresses.
</details>

<details>
<summary>💭 Which goals are verified by our automatic check?</summary>

**Answer**: Our verification checks:
- **Technical functionality** — all steps run without errors
- **Artifact generation** — required files are created
- **Data integrity** — reports are valid and parseable
- **Pipeline integration** — all components work together

**What is NOT verified**:
- Accuracy quality (only that evaluation runs)
- Latency targets (only that benchmark runs)
- Production readiness (only that the pipeline works)

</details>

<details>
<summary>💭 What is missing for "production"?</summary>

**Answer**: For production we need:
- **Real data** — not FakeData
- **Accuracy targets** — specific precision/recall requirements
- **Latency targets** — SLA requirements on inference time
- **Robustness** — handling of edge cases and errors
- **Monitoring** — continuous tracking of accuracy and latency once the model is deployed
- **A/B testing** — comparison of different models
- **Rollback** — ability to revert to previous versions

</details>


## 🎯 Your own experiment

**Task**: Run verification on different models and compare receipts.

**Suggestions**:
- Train models with different settings
- Run verification on each model
- Compare receipts and see which pass/fail
- Analyze which checks are most critical

**Code to modify**:
```python
# Train different models and run verification
MODELS = [
    {"epochs": 1, "batch_size": 128, "name": "quick"},
    {"epochs": 3, "batch_size": 64, "name": "balanced"},
    {"epochs": 5, "batch_size": 32, "name": "thorough"}
]

for model_config in MODELS:
    # Train model
    # Run verification
    # Analyze the receipt
    ...  # placeholder so the loop is valid Python until you fill it in
```


In [None]:
# TODO: Implement your experiment here
# Train different models and compare the receipts

# Training configurations to compare, ordered from fewest to most epochs
MODELS = [
    {"epochs": 1, "batch_size": 128, "name": "quick"},
    {"epochs": 3, "batch_size": 64, "name": "balanced"},
    {"epochs": 5, "batch_size": 32, "name": "thorough"},
]

print("🧪 My experiment: Compare different models")
for model_config in MODELS:
    # Unpack once so the summary line stays short
    label = model_config['name']
    n_epochs = model_config['epochs']
    batch = model_config['batch_size']
    print(f"  - {label}: epochs={n_epochs}, batch_size={batch}")

# TODO: Implement a loop that trains and verifies each model


## Final Reflection

Congratulations! You've completed the entire PiEdge EduKit lesson. Please reflect on your learning experience:

**1. What was the most challenging part of implementing the CNN architecture? What helped you understand it better?**

*Your answer here (2-3 sentences):*

---

**2. How did your understanding of model performance change after running the latency benchmarks?**

*Your answer here (2-3 sentences):*

---

**3. What surprised you most about the quantization process? What would you do differently in a real deployment?**

*Your answer here (2-3 sentences):*

---

**4. How important do you think automated verification is for ML pipelines? Why?**

*Your answer here (2-3 sentences):*

---

## Next Steps

**Congratulations!** You've successfully completed the PiEdge EduKit lesson. You now understand:

- ✅ CNN implementation and training
- ✅ Model export to ONNX format  
- ✅ Performance benchmarking and analysis
- ✅ Quantization and compression techniques
- ✅ Evaluation and verification workflows

**Real-world applications**: Experiment with real data, try different models, or deploy on a Raspberry Pi!

**Key concepts mastered**:
- **Training**: Implementing and training neural networks
- **Export**: Converting models to deployment-ready formats
- **Benchmarking**: Measuring and analyzing performance
- **Quantization**: Optimizing models for edge deployment
- **Verification**: Automated quality assurance for ML pipelines
