# HOW TO CRUSH MATH OLYMPIAD: PHUC Forecast Orchestration (SWE Pattern)

This notebook refactors the math flow to match the same core pattern as `HOW-TO-CRUSH-SWE-BENCHMARK.ipynb`:

1. `DREAM` (Scout)
2. `FORECAST` (Grace)
3. `DECIDE` (Judge)
4. `ACT` (Solver)
5. `VERIFY` (Skeptic)

Key contract:
- Use a real remote Ollama model (default `llama3.1:8b`, overridable via `STILLWATER_IMO_MODEL`) for the LLM lane.
- Keep the tool-assisted lane transparent and receipt-backed.
- Produce machine-parseable artifacts per phase.


In [1]:
import json
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict

# Paths are resolved against the directory the kernel was started in.
ROOT = Path.cwd()
PYTHON = sys.executable
CLI_MODULE = [PYTHON, "-m", "stillwater"]

# Run settings: each one can be overridden through its environment variable.
# NOTE(review): the fallback URL is a private LAN address; set STILLWATER_OLLAMA_URL elsewhere.
DEFAULT_MODEL = os.environ.get("STILLWATER_IMO_MODEL", "llama3.1:8b")
DEFAULT_URL = os.environ.get("STILLWATER_OLLAMA_URL", "http://192.168.68.100:11434")
DEFAULT_TIMEOUT = float(os.environ.get("STILLWATER_IMO_TIMEOUT", "45"))

# One artifact directory per run, keyed by a UTC timestamp.
RUN_ID = datetime.now(timezone.utc).strftime("imo-phuc-%Y%m%dT%H%M%SZ")
OUT_DIR = ROOT.joinpath("artifacts", "notebook_imo_phuc", RUN_ID)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Environment for CLI subprocesses: cli/src goes first on PYTHONPATH,
# followed by whatever PYTHONPATH was already set (if any).
CLI_ENV = os.environ.copy()
cli_src = str(ROOT / "cli" / "src")
old_pp = CLI_ENV.get("PYTHONPATH", "")
CLI_ENV["PYTHONPATH"] = os.pathsep.join(part for part in (cli_src, old_pp) if part)

print(
    "\n".join(
        [
            "=" * 80,
            "PHUC MATH ORCHESTRATION SETUP",
            "=" * 80,
            f"run_id: {RUN_ID}",
            f"out_dir: {OUT_DIR}",
            f"model: {DEFAULT_MODEL}",
            f"url: {DEFAULT_URL}",
            f"timeout: {DEFAULT_TIMEOUT}",
            f"pythonpath(cli/src): {cli_src}",
            "=" * 80,
        ]
    )
)


PHUC MATH ORCHESTRATION SETUP
run_id: imo-phuc-20260219T214100Z
out_dir: /home/phuc/projects/stillwater/artifacts/notebook_imo_phuc/imo-phuc-20260219T214100Z
model: llama3.1:8b
url: http://192.168.68.100:11434
timeout: 45.0
pythonpath(cli/src): /home/phuc/projects/stillwater/cli/src


In [2]:
CASES_FILE = ROOT.joinpath("cli", "tests", "math", "imo_qa_cases.json")
if not CASES_FILE.exists():
    raise FileNotFoundError(f"Missing cases file: {CASES_FILE}")

payload = json.loads(CASES_FILE.read_text(encoding="utf-8"))

# Keep only dict rows whose id, prompt and needle are all non-empty after
# stripping whitespace; anything else is skipped silently.
CASES = []
for row in payload.get("cases", []):
    if isinstance(row, dict):
        cid, prompt, needle = (str(row.get(field, "")).strip() for field in ("id", "prompt", "needle"))
        if all((cid, prompt, needle)):
            CASES.append({"id": cid, "prompt": prompt, "needle": needle})

if len(CASES) == 0:
    raise RuntimeError("No valid IMO cases loaded.")

print(f"Loaded {len(CASES)} IMO cases from {CASES_FILE}")
for c in CASES:
    print(f"- {c['id']}: needle={c['needle']}")


Loaded 6 IMO cases from /home/phuc/projects/stillwater/cli/tests/math/imo_qa_cases.json
- P1: needle=2^(
- P2: needle=empty
- P3: needle=property holds
- P4: needle=∠YPX + ∠KIL = 180
- P5: needle=monochromatic triangle
- P6: needle=f(x)=x


## Phase Implementations (SWE-Pattern Mapping)

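Each phase emits one strict JSON artifact into the per-case directory (the runner additionally writes `SUMMARY.json` per case and `REPORT.json` / `REPORT.md` per run):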
- `SCOUT_REPORT.json`
- `FORECAST_MEMO.json`
- `DECISION_RECORD.json`
- `ACT_RESULT.json`
- `SKEPTIC_VERDICT.json`

The `ACT` phase runs two lanes for transparency:
- `tool_assisted` (normal `twin` route)
- `llm_only` (`twin --llm-only`)


In [3]:
def write_json(path: Path, data: Dict[str, Any]) -> None:
    """Serialize ``data`` as pretty-printed JSON at ``path``.

    Missing parent directories are created first. Keys are sorted, the
    indent is two spaces, and the file ends with a single newline.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(data, indent=2, sort_keys=True)
    path.write_text(f"{rendered}\n", encoding="utf-8")


def run_twin(
    prompt: str,
    *,
    llm_only: bool,
    model: str,
    url: str,
    timeout: float,
    process_timeout: "float | None" = None,
) -> Dict[str, Any]:
    """Run the ``twin`` CLI subcommand once and return a receipt for the run.

    Args:
        prompt: Prompt text passed as the positional argument to ``twin``.
        llm_only: When true, ``--llm-only`` is appended to the command.
        model: Value forwarded as ``--model``.
        url: Value forwarded as ``--url``.
        timeout: Value forwarded as ``--timeout`` (how the CLI applies it is
            not visible from this notebook).
        process_timeout: Optional wall-clock limit, in seconds, for the whole
            subprocess. ``None`` (the default) waits indefinitely, which is
            the previous behaviour.

    Returns:
        A dict with ``cmd``, ``returncode``, ``stdout``, ``stderr``,
        ``llm_only`` and ``ok``. On success ``payload`` holds the parsed JSON
        from stdout; on any failure ``error`` holds a short reason. The
        function never raises for a failed, hung, or unlaunchable subprocess,
        so the receipt can always be written to disk.
    """

    def _as_text(stream: Any) -> str:
        # TimeoutExpired may carry bytes or None; normalise so the receipt
        # stays JSON-serialisable.
        if stream is None:
            return ""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return str(stream)

    cmd = list(CLI_MODULE) + [
        "twin",
        prompt,
        "--model",
        model,
        "--url",
        url,
        "--timeout",
        str(timeout),
        "--json",
    ]
    if llm_only:
        cmd.append("--llm-only")

    row: Dict[str, Any] = {
        "cmd": cmd,
        "returncode": None,
        "stdout": "",
        "stderr": "",
        "llm_only": llm_only,
    }

    try:
        proc = subprocess.run(
            cmd,
            cwd=str(ROOT),
            env=CLI_ENV,
            text=True,
            capture_output=True,
            check=False,
            timeout=process_timeout,
        )
    except subprocess.TimeoutExpired as ex:
        # Only reachable when the caller opted in with process_timeout.
        row["stdout"] = _as_text(ex.stdout)
        row["stderr"] = _as_text(ex.stderr)
        row["ok"] = False
        row["error"] = f"process_timeout: exceeded {process_timeout}s"
        return row
    except OSError as ex:
        # Interpreter or working directory could not be used to start the process.
        row["ok"] = False
        row["error"] = f"launch_error: {ex}"
        return row

    row["returncode"] = proc.returncode
    row["stdout"] = proc.stdout
    row["stderr"] = proc.stderr

    if proc.returncode != 0:
        row["ok"] = False
        # Mirror the parse-failure branch so every failed receipt has a reason.
        row["error"] = f"nonzero_returncode: {proc.returncode}"
        return row

    try:
        payload = json.loads(proc.stdout)
    except Exception as ex:
        row["ok"] = False
        row["error"] = f"json_parse_error: {ex}"
        return row

    row["ok"] = True
    row["payload"] = payload
    return row


def scout_analyze(case: Dict[str, str]) -> Dict[str, Any]:
    """DREAM phase (Scout): build the scouting report for one case.

    Only ``case["id"]`` is read; every other field is a fixed template.
    """
    cli_path = "cli/src/stillwater/cli.py"

    report: Dict[str, Any] = {}
    report["task_summary"] = f"Solve {case['id']} via PHUC orchestration with replayable receipts."
    report["repro_command"] = "python -m stillwater twin <prompt> --json"
    report["failing_tests_or_errors"] = [
        "Needle mismatch against expected marker",
        "Wrong route action/source",
        "Non-zero twin return code",
    ]
    report["suspect_files_ranked"] = [
        {"path": cli_path, "reason": "routing + orchestration execution"},
        {"path": "cli/settings/SWARM-ORCHESTRATION.prime-mermaid.md", "reason": "externalized tool policy"},
        {"path": "imo/src/imo_2024_solver_proper.py", "reason": "deterministic benchmark evidence"},
    ]
    report["witness_slices"] = [
        {"path": cli_path, "line_start": first, "line_end": last}
        for first, last in ((745, 980), (1341, 1880))
    ]
    report["acceptance_criteria"] = [
        "tool_assisted response contains case needle",
        "tool_assisted route action is phuc_swarms_benchmark",
        "tool_assisted source is CPU",
        "llm_only lane runs and is logged",
    ]
    report["missing_assets"] = []
    return report


def grace_forecast(case: Dict[str, str], scout_report: Dict[str, Any]) -> Dict[str, Any]:
    """FORECAST phase (Grace): build the failure-mode memo for one case.

    The memo is a fixed template. Only ``case["id"]`` and the truthiness of
    ``scout_report`` (stored as ``scout_ref``) vary between calls.
    """
    modes_in_rank_order = (
        ("route falls back to LLM unexpectedly", "30"),
        ("deterministic solver output format drifts", "30"),
        ("needle matcher is too weak/strict", "10"),
        ("remote ollama timeout", "30"),
    )
    ranked_modes = [
        {"rank": position, "mode": description, "likelihood_bucket": bucket}
        for position, (description, bucket) in enumerate(modes_in_rank_order, start=1)
    ]

    memo: Dict[str, Any] = {"top_failure_modes_ranked": ranked_modes}
    memo["edge_cases_to_test"] = [
        "prompt variant with equivalent wording",
        "historical IMO year prompt",
        "model unavailable at configured URL",
    ]
    memo["compat_risks"] = [
        "Changes in route.action semantics",
        "Changes in response text that preserve meaning but break substring checks",
    ]
    memo["stop_rules"] = [
        "If tool_assisted returncode != 0, mark FAIL",
        "If tool_assisted action != phuc_swarms_benchmark, mark FAIL",
        "If tool_assisted needle check fails, mark FAIL",
    ]
    memo["mitigations"] = [
        "Capture per-lane stdout/stderr receipts",
        "Log route metadata and phuc_decision for every case",
        "Keep llm_only lane as public baseline",
    ]
    memo["case_id"] = case["id"]
    memo["scout_ref"] = bool(scout_report)
    return memo


def judge_decide(case: Dict[str, str], scout_report: Dict[str, Any], forecast_memo: Dict[str, Any]) -> Dict[str, Any]:
    """DECIDE phase (Judge): build the decision record for one case.

    The record is a fixed template plus ``case["id"]`` and the ``stop_rules``
    taken from ``forecast_memo`` (an empty list when absent).
    ``scout_report`` is accepted but not read.
    """
    record: Dict[str, Any] = {}
    record["chosen_approach"] = (
        "Run tool_assisted lane + llm_only lane, then verify strict route and needle checks."
    )
    record["alternatives_considered"] = [
        "llm_only-only run",
        "deterministic solver-only run without LLM baseline",
    ]
    record["scope_locked"] = ["tool_assisted", "llm_only", "route_receipts", "needle_checks"]
    record["required_verification_rung"] = 641
    record["required_tests"] = [
        "tool_assisted route.action == phuc_swarms_benchmark",
        "tool_assisted source == CPU",
        "tool_assisted response contains expected needle",
        "llm_only lane executed",
    ]
    # The forecast's list object is passed through as-is, not copied.
    record["stop_rules"] = forecast_memo.get("stop_rules", [])
    record["go_no_go_initial"] = "GO"
    record["case_id"] = case["id"]
    return record


def act_solver(case: Dict[str, str], decision_record: Dict[str, Any], *, model: str, url: str, timeout: float) -> Dict[str, Any]:
    """ACT phase (Solver): run both lanes for one case and bundle the receipts.

    The tool-assisted lane runs first, then the ``--llm-only`` lane, both on
    ``case["prompt"]`` with the same model, URL and timeout. The result also
    records whether ``decision_record`` was non-empty.
    """
    shared = {"model": model, "url": url, "timeout": timeout}

    tool_run = run_twin(case["prompt"], llm_only=False, **shared)
    llm_run = run_twin(case["prompt"], llm_only=True, **shared)

    result: Dict[str, Any] = {
        "case_id": case["id"],
        "decision_record_ref": bool(decision_record),
        "tool_assisted": tool_run,
        "llm_only": llm_run,
    }
    result.update(shared)
    return result


def skeptic_verify(case: Dict[str, str], act_result: Dict[str, Any]) -> Dict[str, Any]:
    """VERIFY phase (Skeptic): grade the ACT receipts for one case.

    The verdict is PASS only when the tool-assisted lane ran successfully,
    reported source ``"CPU"``, routed to action ``"phuc_swarms_benchmark"``,
    and its response contains the case needle (case-insensitive substring).
    The llm_only lane is evaluated and reported but does not affect status.

    Args:
        case: Must contain ``"needle"``.
        act_result: Must contain ``"tool_assisted"`` and ``"llm_only"`` run
            receipts as produced by the ACT phase.

    Returns:
        A dict with ``status``, ``rung_achieved`` (641 on PASS, else 0),
        ``fail_reasons``, ``required_fixes`` and the per-lane evaluations.
    """
    needle = case["needle"].lower()

    def _dict_or_empty(value: Any) -> Dict[str, Any]:
        # Malformed or missing sections are treated as empty.
        return value if isinstance(value, dict) else {}

    def _lane_eval(run: Dict[str, Any]) -> Dict[str, Any]:
        payload = _dict_or_empty(run.get("payload"))
        route = _dict_or_empty(payload.get("route"))
        phuc = _dict_or_empty(route.get("phuc_decision"))

        # A null response is "no response", not the text "None".
        raw_response = payload.get("response")
        response = "" if raw_response is None else str(raw_response)

        return {
            "run_ok": bool(run.get("ok")),
            "returncode": run.get("returncode"),
            # An empty needle is a substring of everything; never count it as a match.
            "match": bool(needle) and needle in response.lower(),
            "source": payload.get("source"),
            "action": route.get("action"),
            "profile": phuc.get("profile"),
            "response_excerpt": response[:260],
            "route": route,
        }

    tool_eval = _lane_eval(act_result["tool_assisted"])
    llm_eval = _lane_eval(act_result["llm_only"])

    reasons = []
    if not tool_eval["run_ok"]:
        reasons.append("tool_assisted run failed")
    if tool_eval["source"] != "CPU":
        reasons.append(f"unexpected source: {tool_eval['source']}")
    if tool_eval["action"] != "phuc_swarms_benchmark":
        reasons.append(f"unexpected action: {tool_eval['action']}")
    if not tool_eval["match"]:
        reasons.append("needle mismatch")

    # PASS is exactly "no failure reason was recorded".
    status = "FAIL" if reasons else "PASS"

    return {
        "status": status,
        "rung_achieved": 641 if status == "PASS" else 0,
        "fail_reasons": reasons,
        "required_fixes": [
            "align tool route policy in SWARM-ORCHESTRATION",
            "improve prompt-to-problem mapping for solver parser",
            "improve deterministic answer extraction",
        ] if reasons else [],
        "tool_assisted": tool_eval,
        "llm_only": llm_eval,
    }


In [4]:
def run_case_pipeline(case: Dict[str, str], *, model: str, url: str, timeout: float) -> Dict[str, Any]:
    """Run the five phases for one case and persist each artifact.

    Artifacts are written under ``OUT_DIR / case["id"]`` in phase order
    (scout, forecast, decide, act, verify), followed by ``SUMMARY.json``.
    Returns every phase output plus the summary, keyed by phase name.
    """
    case_dir = OUT_DIR / case["id"]
    case_dir.mkdir(parents=True, exist_ok=True)

    def _persist(filename: str, artifact: Dict[str, Any]) -> Dict[str, Any]:
        # Write the artifact immediately so earlier receipts survive a later failure.
        write_json(case_dir / filename, artifact)
        return artifact

    scout = _persist("SCOUT_REPORT.json", scout_analyze(case))
    forecast = _persist("FORECAST_MEMO.json", grace_forecast(case, scout))
    decide = _persist("DECISION_RECORD.json", judge_decide(case, scout, forecast))
    act = _persist("ACT_RESULT.json", act_solver(case, decide, model=model, url=url, timeout=timeout))
    verify = _persist("SKEPTIC_VERDICT.json", skeptic_verify(case, act))

    tool_lane = verify["tool_assisted"]
    summary = _persist(
        "SUMMARY.json",
        {
            "case_id": case["id"],
            "status": verify["status"],
            "tool_match": tool_lane["match"],
            "tool_source": tool_lane["source"],
            "tool_action": tool_lane["action"],
            "llm_match": verify["llm_only"]["match"],
            "artifact_dir": str(case_dir),
        },
    )

    return dict(
        scout=scout,
        forecast=forecast,
        decide=decide,
        act=act,
        verify=verify,
        summary=summary,
    )


def run_full_pipeline(cases, *, model: str, url: str, timeout: float) -> Dict[str, Any]:
    """Run every case through the pipeline and write the run-level report.

    Args:
        cases: Iterable of case dicts accepted by ``run_case_pipeline``.
        model: Forwarded to each case run.
        url: Forwarded to each case run.
        timeout: Forwarded to each case run.

    Returns:
        The report dict, which is also written to ``OUT_DIR / "REPORT.json"``.
        A markdown rendering is written to ``OUT_DIR / "REPORT.md"``.
        ``strict_pass`` is true only when at least one case ran and every
        case's status is PASS.
    """
    # Materialise once so generators and other one-shot iterables work with len().
    cases = list(cases)
    total_cases = len(cases)

    report: Dict[str, Any] = {
        "run_id": RUN_ID,
        "timestamp_utc": datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "model": model,
        "url": url,
        "timeout": timeout,
        "total_cases": total_cases,
        "rows": [],
    }

    for case in cases:
        row = run_case_pipeline(case, model=model, url=url, timeout=timeout)
        report["rows"].append(row["summary"])

    tool_score = sum(1 for row in report["rows"] if row["status"] == "PASS")
    llm_score = sum(1 for row in report["rows"] if row["llm_match"])

    report["tool_assisted_score"] = tool_score
    report["llm_only_score"] = llm_score
    # An empty run must not count as a pass: 0 == 0 would otherwise be vacuously true.
    report["strict_pass"] = total_cases > 0 and tool_score == total_cases
    report["lane_disclosure"] = {
        "tool_assisted": "PHUC swarms deterministic/tool route with receipts",
        "llm_only": "remote Ollama model baseline via --llm-only",
    }

    write_json(OUT_DIR / "REPORT.json", report)

    md_lines = [
        "# PHUC IMO Pipeline Report",
        "",
        f"- run_id: `{RUN_ID}`",
        f"- model: `{model}`",
        f"- url: `{url}`",
        f"- total_cases: `{report['total_cases']}`",
        f"- tool_assisted_score: **{tool_score}/{report['total_cases']}**",
        f"- llm_only_score: **{llm_score}/{report['total_cases']}**",
        f"- strict_pass: `{report['strict_pass']}`",
        "",
        "## Per Case",
        "",
    ]
    for row in report["rows"]:
        md_lines.append(
            f"- {row['case_id']}: status={row['status']} tool(source={row['tool_source']}, action={row['tool_action']}, match={row['tool_match']}) llm_match={row['llm_match']}"
        )
    md_lines.append("")
    md_lines.append("## Receipts")
    md_lines.append("")
    md_lines.append(f"- report: `{OUT_DIR / 'REPORT.json'}`")
    md_lines.append(f"- per-case artifacts: `{OUT_DIR}`")
    (OUT_DIR / "REPORT.md").write_text("\n".join(md_lines) + "\n", encoding="utf-8")

    return report


In [5]:
report = run_full_pipeline(CASES, model=DEFAULT_MODEL, url=DEFAULT_URL, timeout=DEFAULT_TIMEOUT)

# Print the scoreboard as one block of text.
print(
    "\n".join(
        [
            "=" * 80,
            "FINAL SCOREBOARD",
            "=" * 80,
            f"tool_assisted: {report['tool_assisted_score']}/{report['total_cases']}",
            f"llm_only: {report['llm_only_score']}/{report['total_cases']}",
            f"strict_pass: {report['strict_pass']}",
            f"report_json: {OUT_DIR / 'REPORT.json'}",
            f"report_md: {OUT_DIR / 'REPORT.md'}",
            "=" * 80,
        ]
    )
)

# Gate: the notebook fails loudly unless the tool-assisted lane passed every case.
if not report["strict_pass"]:
    raise AssertionError("Tool-assisted lane did not reach full score. Inspect REPORT.json receipts.")


FINAL SCOREBOARD
tool_assisted: 6/6
llm_only: 1/6
strict_pass: True
report_json: /home/phuc/projects/stillwater/artifacts/notebook_imo_phuc/imo-phuc-20260219T214100Z/REPORT.json
report_md: /home/phuc/projects/stillwater/artifacts/notebook_imo_phuc/imo-phuc-20260219T214100Z/REPORT.md


## Claim Hygiene

This notebook now mirrors the SWE-style orchestration structure, but its verifier logic is still math-specific.

- `tool_assisted` lane score is a workflow score over configured IMO cases.
- `llm_only` is baseline model behavior on the same prompts.
- This is not official IMO grading and not a formal proof certificate.

Next step after this notebook is stable: port identical phase contracts to CLI command paths.
