# 📊 Checkpoint Comparator: v1.0 vs v1.1
Use this notebook to evaluate and compare outputs between two LLM checkpoints using the same prompt set.

## 🗂 Load Test Prompts

In [None]:
import json
# Read the evaluation set: one JSON object per line, where each object's
# 'instruction' field is used as the prompt text.
# The encoding is set explicitly so loading does not depend on the platform's
# default, and blank lines are skipped because json.loads would raise
# JSONDecodeError on them.
with open('data/eval_cases.jsonl', encoding='utf-8') as f:
    prompts = [json.loads(line)['instruction'] for line in f if line.strip()]
print(f"Loaded {len(prompts)} prompts.")

## 🤖 Load Both Models

In [None]:
from transformers import pipeline

# Checkpoint directories for the two versions under comparison.
# Note: the variable `model_v2` holds checkpoint v1.1 (the second model), not a "v2.0".
checkpoint_v1_0 = "./checkpoints/model_v1.0"
checkpoint_v1_1 = "./checkpoints/model_v1.1"

# Build one text-generation pipeline per checkpoint, baseline first.
model_v1 = pipeline(task="text-generation", model=checkpoint_v1_0)
model_v2 = pipeline(task="text-generation", model=checkpoint_v1_1)

## 🧪 Compare Outputs Side-by-Side

In [None]:
from IPython.display import display, Markdown

# For every prompt, generate up to 100 new tokens with each checkpoint and
# render the prompt plus both completions as Markdown for side-by-side reading.
for prompt in prompts:
    # The pipeline returns a list of result dicts; take the first result's text.
    out1 = model_v1(prompt, max_new_tokens=100)[0]['generated_text']
    out2 = model_v2(prompt, max_new_tokens=100)[0]['generated_text']

    display(Markdown(f"### Prompt:\n{prompt}"))
    # The code fences use explicit \n escapes: a regular (non-triple-quoted)
    # f-string cannot span physical lines, so literal line breaks inside it
    # are a SyntaxError.
    display(Markdown(f"**v1.0 Output:**\n```\n{out1}\n```"))
    display(Markdown(f"**v1.1 Output:**\n```\n{out2}\n```"))