# 🔁 Reproducibility & Eval Diff Notebook – Module 16
Compare model versions, evaluate consistency, and log version-aware metadata.

## ✅ Step 1: Load Two Eval Files

In [None]:
import json
from pathlib import Path

# Load the evaluation results of both model versions so later cells can diff
# them. The encoding is passed explicitly (JSON interchange text is UTF-8)
# instead of relying on the platform's locale default.
with open("models/model_v1.0/eval.json", encoding="utf-8") as f:
    eval_v1 = json.load(f)

with open("models/model_v1.1/eval.json", encoding="utf-8") as f:
    eval_v2 = json.load(f)

print("✅ Eval files loaded.")

## 🧪 Step 2: Compare Metric Deltas

In [None]:
from pprint import pprint

# Build a per-metric comparison of the two eval results.
# Only keys present in BOTH files are compared; a key that exists in just one
# of them is left out of `delta`.
# Entries whose value is not a number in either file (strings, lists, nested
# objects, null) are skipped: subtracting them would raise TypeError and abort
# the whole comparison.
delta = {}
for key in eval_v1:
    if key not in eval_v2:
        continue
    old, new = eval_v1[key], eval_v2[key]
    if not (isinstance(old, (int, float)) and isinstance(new, (int, float))):
        continue
    # Round to 4 decimals so float noise does not clutter the report.
    delta[key] = {"v1": old, "v2": new, "delta": round(new - old, 4)}

pprint(delta)

## 🧾 Step 3: Check Regression Tolerance

In [None]:
# Largest drop (v2 - v1) a metric may show before it counts as a regression.
# The check below compares the raw delta, i.e. an absolute difference of 0.01.
PASS_THRESHOLD = 0.01

# Names of all metrics whose delta fell below the negative threshold.
failures = [
    metric for metric in delta
    if delta[metric]["delta"] < -PASS_THRESHOLD
]

if not failures:
    print("✅ All metrics within tolerance.")
else:
    print("❌ Regression detected in:", failures)

## 📘 Step 4: Log Metadata

In [None]:
from datetime import datetime
import platform

# Record a provenance entry for model v1.1 next to its eval results.
# The timestamp is timezone-aware (local time plus UTC offset) so the log
# stays unambiguous when read on a machine in another timezone; a bare
# datetime.now() carries no offset.
model_log = {
    "version": "v1.1",
    "date": datetime.now().astimezone().isoformat(),
    "git_commit": "abc1234",  # Replace with real SHA
    "tokenizer": "custom-bpe-v1",
    "base": "model_v1.0",
    # platform.platform() identifies the OS/platform of the current machine.
    "hardware": platform.platform(),
    "metrics": eval_v2,
}

# Explicit encoding keeps the written file independent of the locale default.
with open("models/model_v1.1/metadata.json", "w", encoding="utf-8") as f:
    json.dump(model_log, f, indent=2)

print("✅ Metadata logged to model_v1.1/metadata.json")