# Base Model Math Exploration

Experiment with `Qwen2.5-0.5B` (4-bit MLX) on training problems. Inspect problem types, run the model, and see where it succeeds or fails.

In [1]:
import sys
sys.path.insert(0, '../src')

from maths_prompt.generator import generate_problems, generate_test_problems
from maths_prompt.model import query_model, query_model_batch
from maths_prompt.scorer import extract_number, check_answer
from maths_prompt.config import MLX_MODEL_PATH

print(f"Model: {MLX_MODEL_PATH}")

  from .autonotebook import tqdm as notebook_tqdm


Model: /Users/pjtunney/repos/maths-prompt/models/Qwen2.5-0.5B-4bit


## 0. Inspect the exact prompt format sent to the model

In [6]:
from maths_prompt.model import _load, _format_prompt

_, tokenizer = _load()

examples = [
    ("", "2 + 2"),
    ("Solve the math expression and output only the number.", "2 + 2"),
    ("Solve the math expression and output only the number.", "(15 + 7) * (3 - 1)"),
]

for system, question in examples:
    prompt = _format_prompt(tokenizer, system, question)
    print(f"system: {repr(system)}")
    print(f"question: {repr(question)}")
    print("--- formatted prompt ---")
    print(prompt)
    print("=" * 60)

system: ''
question: '2 + 2'
--- formatted prompt ---
<|im_start|>system
<|im_end|>
<|im_start|>user
2 + 2 =<|im_end|>
<|im_start|>assistant

system: 'Solve the math expression and output only the number.'
question: '2 + 2'
--- formatted prompt ---
<|im_start|>system
Solve the math expression and output only the number.<|im_end|>
<|im_start|>user
2 + 2 =<|im_end|>
<|im_start|>assistant

system: 'Solve the math expression and output only the number.'
question: '(15 + 7) * (3 - 1)'
--- formatted prompt ---
<|im_start|>system
Solve the math expression and output only the number.<|im_end|>
<|im_start|>user
(15 + 7) * (3 - 1) =<|im_end|>
<|im_start|>assistant



## 1. Inspect training problems

In [2]:
train = generate_problems(20)

print(f"{'Question':<40} {'Answer':>10}")
print("-" * 52)
for p in train:
    print(f"{p.question:<40} {p.answer:>10.4g}")

Question                                     Answer
----------------------------------------------------
(39 + 9) - 89                                   -41
(94 + 49) * (57 - 26)                          4433
45 - 56                                         -11
(37 + 24) + (26 - 94)                            -7
51 + 21                                          72
((65 - 62) - (29 - 44)) - 22 / 22 * 52          -34
(6 * 10 + (20 + 35)) * 99                 1.138e+04
5 * 97 * 22 * 14 / 7                      2.134e+04
45 * 8 - 22 * 10                                140
4 - 30                                          -26
5 * 16 - 5 / 5                                   79
47 * 29                                        1363
36 + 39                                          75
(67 - 99) + 2 * 41                               50
49 * 9                                          441
40 + 23                                          63
94 - (33 - 70)                                  131
4 * 20     

## 2. Inspect test problems (by category)

In [3]:
from collections import defaultdict

test = generate_test_problems(70)  # 10 per category
by_cat = defaultdict(list)
for p in test:
    by_cat[p.category].append(p)

for cat, problems in by_cat.items():
    print(f"\n=== {cat} ===")
    for p in problems[:3]:
        print(f"  {p.question}  →  {p.answer}")


=== exponents ===
  3 ** 2  →  9.0
  2 ** 3  →  8.0
  2 ** 3  →  8.0

=== modulo ===
  769 % 10  →  9.0
  724 % 15  →  4.0
  801 % 11  →  9.0

=== long_chain ===
  18 + 95 + 14 + 87 + 95 + 70  →  379.0
  20 - 28 - 98 - 44 - 14 - 12 - 49  →  -225.0
  13 + 49 + 36 + 59 + 82 + 47  →  286.0

=== deeply_nested ===
  ((3 + 19) + (14 + 2)) + 1  →  39.0
  ((4 + 12) - (12 * 20)) + 9  →  -215.0
  ((6 * 12) + (12 * 7)) + 9  →  165.0

=== negatives ===
  (-33) * 39 + 2  →  -1285.0
  (-35) + 8 + 25  →  -2.0
  (-39) * 41 + 11  →  -1588.0

=== decimals ===
  3.5 - 9.0  →  -5.5
  9.0 * 5.0  →  45.0
  4.0 - 3.0  →  1.0

=== large_numbers ===
  4611 - 8359  →  -3748.0
  6925 + 4150  →  11075.0
  7216 + 5422  →  12638.0


## 3. Run the model on a few problems (no system prompt)

This is the raw baseline — no guidance at all.

In [4]:
SYSTEM_PROMPT = ""  # empty — raw base model

sample = train[:10]
responses = query_model_batch(SYSTEM_PROMPT, [p.question for p in sample])
results = []

for p, response in zip(sample, responses):
    extracted = extract_number(response)
    correct = check_answer(extracted, p.answer)
    results.append({
        "question": p.question,
        "expected": p.answer,
        "response": response.strip(),
        "extracted": extracted,
        "correct": correct,
    })

accuracy = sum(r["correct"] for r in results) / len(results)
print(f"Accuracy (no prompt): {accuracy:.0%}\n")

for r in results:
    mark = "✓" if r["correct"] else "✗"
    print(f"{mark}  Q: {r['question']}")
    print(f"   Expected: {r['expected']}  |  Extracted: {r['extracted']}")
    print(f"   Model: {r['response'][:120]}")
    print()

Accuracy (no prompt): 0%

✗  Q: (39 + 9) - 89
   Expected: -41.0  |  Extracted: 39.0
   Model: (39 + 9) - 89 = :=
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();

✗  Q: (94 + 49) * (57 - 26)
   Expected: 4433.0  |  Extracted: 94.0
   Model: (94 + 49) * (57 - 26) = :=
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})(

✗  Q: 45 - 56
   Expected: -11.0  |  Extracted: 45.0
   Model: 45 - 56 =�性
�性system
�性system
�性system
性
�性system
性
性system
性
性system
性
性system
性
性system
性
性system
性
性system
性
性system


✗  Q: (37 + 24) + (26 - 94)
   Expected: -7.0  |  Extracted: 37.0
   Model: (37 + 24) + (26 - 94) =(ofSize)
(ofSize)
(ofSize)
(ofSize)
(ofSize)
(ofSize)
(ofSize)
(ofSize)
(ofSize)
(ofSize)
(ofSize

✗  Q: 51 + 21
   Expected: 72.0  |  Extracted: 51.0
   Model: 51 + 21 = :=
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();
})();

✗  Q: ((65 - 62)

## 4. Try a prompt and compare

Edit `MY_PROMPT` and re-run this cell to experiment.

In [5]:
MY_PROMPT = """Solve the math expression and output only the number."""

sample = generate_problems(20)
responses = query_model_batch(MY_PROMPT, [p.question for p in sample])
results = []

for p, response in zip(sample, responses):
    extracted = extract_number(response)
    correct = check_answer(extracted, p.answer)
    results.append({
        "question": p.question,
        "expected": p.answer,
        "response": response.strip(),
        "extracted": extracted,
        "correct": correct,
    })

accuracy = sum(r["correct"] for r in results) / len(results)
print(f"Accuracy with prompt: {accuracy:.0%}\n")

for r in results:
    mark = "✓" if r["correct"] else "✗"
    print(f"{mark}  {r['question']}  →  expected {r['expected']}, got {r['extracted']}")
    if not r["correct"]:
        print(f"   Model said: {r['response'][:120]}")

Accuracy with prompt: 20%

✗  53 - 79  →  expected -26.0, got 53.0
   Model said: To solve the math expression and output only the number, we need to follow the order of operations, which is parentheses
✓  57 - 49  →  expected 8.0, got 8.0
✗  (76 * 60 + (29 + 43)) + 58  →  expected 4690.0, got 1225.0
   Model said: The answer is 1225.
✓  57 + 15  →  expected 72.0, got 72.0
✗  (95 - 55) / 4  →  expected 10.0, got None
   Model said: Solve the math expression and output only the number.accoo
accoo
Solve the math expression and output only the number.ac
✗  77 * 43  →  expected 3311.0, got 3411.0
   Model said: The answer is 3411.
✓  68 + 88  →  expected 156.0, got 156.0
✗  (54 + 11) - 60  →  expected 5.0, got 54.0
   Model said: To solve this math expression, we need to follow the order of operations, which is parentheses, exponents, multiplicatio
✗  32 * (48 * 5 + (32 - 28))  →  expected 7808.0, got 120.0
   Model said: The answer is 120.
✗  81 * 32  →  expected 2592.0, got 81.0
   Model

## 5. Evaluate across all test categories

Run your prompt against the fixed test set and see per-category accuracy.

In [None]:
# Reuses MY_PROMPT from cell 4 — run that first
from collections import defaultdict

test_problems = generate_test_problems(140)  # 20 per category
by_cat = defaultdict(list)
for p in test_problems:
    by_cat[p.category].append(p)

print(f"{'Category':<18} {'Correct':>8} {'Total':>6} {'Acc':>6}")
print("-" * 42)

overall_correct = 0
overall_total = 0

for cat, problems in sorted(by_cat.items()):
    responses = query_model_batch(MY_PROMPT, [p.question for p in problems])
    correct = sum(
        check_answer(extract_number(r), p.answer)
        for p, r in zip(problems, responses)
    )
    acc = correct / len(problems)
    overall_correct += correct
    overall_total += len(problems)
    print(f"{cat:<18} {correct:>8} {len(problems):>6} {acc:>6.0%}")

print("-" * 42)
print(f"{'TOTAL':<18} {overall_correct:>8} {overall_total:>6} {overall_correct/overall_total:>6.0%}")