# Base Model Math Exploration

Experiment with `Qwen2.5-0.5B` (4-bit MLX) on training problems. Inspect problem types, run the model, and see where it succeeds or fails.

In [None]:
# Put the project's source directory at the front of the import path so the
# `maths_prompt` package below can be imported.
# NOTE(review): '../src' is a relative path, so this presumably relies on the
# notebook being run from a directory that sits next to `src/` — confirm.
import sys
sys.path.insert(0, '../src')

# Project helpers: problem generation, model querying, answer scoring, and
# the configured model path.
from maths_prompt.generator import generate_problems, generate_test_problems
from maths_prompt.model import query_model, query_model_batch
from maths_prompt.scorer import extract_number, check_answer
from maths_prompt.config import MLX_MODEL_PATH

# Show which model path the config points at before running anything.
print(f"Model: {MLX_MODEL_PATH}")

## 1. Inspect training problems

In [None]:
# Generate 20 training problems and print them as a two-column table
# (question left-aligned, answer right-aligned to 4 significant digits).
# `train` is reused by the baseline cell further down.
train = generate_problems(20)

header = f"{'Question':<40} {'Answer':>10}"
divider = "-" * 52
table_rows = [f"{p.question:<40} {p.answer:>10.4g}" for p in train]

print("\n".join([header, divider, *table_rows]))

## 2. Inspect test problems (by category)

In [None]:
from collections import defaultdict

# Generate 70 test problems, bucket them by their `category` attribute, and
# preview the first three problems of every bucket.
test = generate_test_problems(70)  # 10 per category (assuming 7 categories)

by_cat = defaultdict(list)
for p in test:
    by_cat[p.category].append(p)

for cat in by_cat:
    print(f"\n=== {cat} ===")
    preview = by_cat[cat][:3]
    for p in preview:
        print(f"  {p.question}  →  {p.answer}")

## 3. Run the model on a few problems (no system prompt)

This is the raw baseline — no guidance at all.

In [None]:
SYSTEM_PROMPT = ""  # empty — raw base model


def _score(problem, reply):
    """Grade one model reply against the problem's expected answer."""
    extracted = extract_number(reply)
    return {
        "question": problem.question,
        "expected": problem.answer,
        "response": reply.strip(),
        "extracted": extracted,
        "correct": check_answer(extracted, problem.answer),
    }


# Baseline: the first ten training problems, queried in one batch with the
# empty system prompt.
sample = train[:10]
questions = [p.question for p in sample]
responses = query_model_batch(SYSTEM_PROMPT, questions)
results = [_score(p, reply) for p, reply in zip(sample, responses)]

# Fraction of graded replies that were marked correct.
outcomes = [r["correct"] for r in results]
accuracy = sum(outcomes) / len(outcomes)
print(f"Accuracy (no prompt): {accuracy:.0%}\n")

# Per-problem report; the model's reply is truncated to 120 characters and
# each entry is followed by a blank line.
for r in results:
    mark = "✗"
    if r["correct"]:
        mark = "✓"
    print(
        f"{mark}  Q: {r['question']}",
        f"   Expected: {r['expected']}  |  Extracted: {r['extracted']}",
        f"   Model: {r['response'][:120]}",
        "",
        sep="\n",
    )

## 4. Try a prompt and compare

Edit `MY_PROMPT` in the code cell below and re-run that cell to experiment.

In [None]:
MY_PROMPT = """Solve the math expression and output only the number."""

# Generate 20 problems and query the model on all of them in one batch,
# using MY_PROMPT as the system prompt.
sample = generate_problems(20)
questions = [p.question for p in sample]
responses = query_model_batch(MY_PROMPT, questions)

# Grade each reply: pull a number out of the text, then compare it with the
# expected answer.
results = []
for p, response in zip(sample, responses):
    extracted = extract_number(response)
    correct = check_answer(extracted, p.answer)
    results.append(dict(
        question=p.question,
        expected=p.answer,
        response=response.strip(),
        extracted=extracted,
        correct=correct,
    ))

n_graded = len(results)
accuracy = sum(r["correct"] for r in results) / n_graded
print(f"Accuracy with prompt: {accuracy:.0%}\n")

# One summary line per problem; the raw reply (first 120 characters) is
# shown only for the ones the model got wrong.
for r in results:
    mark = "✓" if r["correct"] else "✗"
    print(f"{mark}  {r['question']}  →  expected {r['expected']}, got {r['extracted']}")
    if r["correct"]:
        continue
    print(f"   Model said: {r['response'][:120]}")

## 5. Evaluate across all test categories

Run your prompt against the fixed test set and see per-category accuracy.

In [None]:
# Depends on MY_PROMPT, defined in section 4 — run that cell first.
from collections import defaultdict

# Generate 140 test problems and bucket them by their `category` attribute.
test_problems = generate_test_problems(140)  # 20 per category (assuming 7 categories)
by_cat = defaultdict(list)
for p in test_problems:
    by_cat[p.category].append(p)

rule = "-" * 42
print(f"{'Category':<18} {'Correct':>8} {'Total':>6} {'Acc':>6}")
print(rule)

overall_correct = overall_total = 0

# One batched model call per category, visited in sorted category order.
for cat in sorted(by_cat):
    problems = by_cat[cat]
    questions = [p.question for p in problems]
    responses = query_model_batch(MY_PROMPT, questions)

    # Count the replies whose extracted number matches the expected answer.
    correct = 0
    for p, reply in zip(problems, responses):
        correct += check_answer(extract_number(reply), p.answer)

    acc = correct / len(problems)
    overall_correct += correct
    overall_total += len(problems)
    print(f"{cat:<18} {correct:>8} {len(problems):>6} {acc:>6.0%}")

print(rule)
print(f"{'TOTAL':<18} {overall_correct:>8} {overall_total:>6} {overall_correct/overall_total:>6.0%}")