From 7f9bfdd3c91a5f1d2dedcef76e592a942d4c0bb7 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Tue, 12 May 2026 16:16:16 +0800
Subject: [PATCH 1/4] feat(eval): add 3-stage perf tracking, quantize step, and
 workflow-first HTML report
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- run_sa_eval.py: run wmk perf after export, graph-optimize, SA-optimize,
  and QDQ-quantize (stage 6); add --no-perf, --perf-iterations, --perf-warmup,
  --no-quantize, --quantize-precision, --quantize-samples, --report-only flags;
  fix output_dir to resolve to absolute path; downgrade empty SA classification
  from fatal error to warning
- sa_comparison.py: warn instead of silently returning empty results when SA
  produces no EP results (missing parquet rule data)
- sa_report.py: reorganize table columns in workflow order
  (Export → Normalize → Pre SA → Flags → Optimized → Post SA → Quantize → Delta);
  chain-normalize perf gain% against previous stage; add __main__ CLI entrypoint
  for report-only refresh
- quantize.py: add --model-name CLI option so task-aware calibration can load
  the correct HuggingFace tokenizer/processor
---
 scripts/e2e_eval/run_sa_eval.py         | 342 ++++++++++++++++++++++--
 scripts/e2e_eval/sa_comparison.py       |  14 +
 scripts/e2e_eval/sa_report.py           | 155 ++++++++++-
 src/winml/modelkit/commands/quantize.py |   8 +
 4 files changed, 488 insertions(+), 31 deletions(-)

diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py
index daa730654..ad603b2d8 100644
--- a/scripts/e2e_eval/run_sa_eval.py
+++ b/scripts/e2e_eval/run_sa_eval.py
@@ -108,6 +108,82 @@ def run_wmk_export(hf_id: str, task: str, output: Path) -> tuple[int, str]:
     return result.returncode, (result.stderr or "").strip()[-500:]
 
 
+# Map full EP names to the short form accepted by `wmk perf --ep`
+_EP_TO_PERF_ARG: dict[str, str] = {
+    "QNNExecutionProvider": "qnn",
+    "DmlExecutionProvider": "dml",
+    "CPUExecutionProvider": "cpu",
+    "MIGraphXExecutionProvider": "migraphx",
+    "OpenVINOExecutionProvider": "openvino",
+    "VitisAIExecutionProvider": "vitisai",
+    "NvTensorRTRTXExecutionProvider": "nv_tensorrt_rtx",
+}
+
+
+def run_winml_perf(
+    label: str,
+    onnx_path: Path,
+    output_json: Path,
+    device: str,
+    ep: str | None,
+    iterations: int,
+    warmup: int,
+    use_cache: bool,
+) -> dict | None:
+    """Run wmk perf on onnx_path. Returns latency_ms dict or None on failure.
+
+    With use_cache a readable output_json is reused; an unreadable one is re-measured.
+    """
+    if use_cache and is_cached(output_json):
+        safe_print(f"  [{label}] Perf (cached): {output_json.name}")
+        try:
+            return json.loads(output_json.read_text(encoding="utf-8")).get("latency_ms")
+        except Exception as e:  # corrupt cache: fall through and benchmark again
+            safe_print(f"  [{label}] Ignoring unreadable perf cache: {e}")
+
+    safe_print(f"  [{label}] Running perf on {onnx_path.name}...")
+    cmd = [
+        sys.executable,
+        "-m",
+        "winml.modelkit.cli",
+        "perf",
+        "-m",
+        str(onnx_path),
+        "--device",
+        device.lower(),
+        "--iterations",
+        str(iterations),
+        "--warmup",
+        str(warmup),
+        "--output",
+        str(output_json),
+    ]
+    if ep:
+        cmd += ["--ep", _EP_TO_PERF_ARG.get(ep, ep.lower())]  # unmapped names: lower-cased
+
+    result = subprocess.run(  # noqa: S603
+        cmd,
+        capture_output=True,
+        text=True,
+        encoding="utf-8",
+        errors="replace",
+        env={**os.environ, "PYTHONIOENCODING": "utf-8"},
+    )
+    if result.returncode != 0 or not is_cached(output_json):
+        tail = (result.stderr or "").strip()[-300:]
+        safe_print(f"  [{label}] Perf failed (rc={result.returncode}): {tail}")
+        return None
+
+    try:
+        latency = json.loads(output_json.read_text(encoding="utf-8")).get("latency_ms", {})
+        mean_ms, p90_ms = latency.get("mean", 0), latency.get("p90", 0)
+        safe_print(f"  [{label}] mean={mean_ms:.2f}ms  p90={p90_ms:.2f}ms")
+        return latency
+    except Exception as e:
+        safe_print(f"  [{label}] Could not parse perf result: {e}")
+        return None
+
+
 # ---------------------------------------------------------------------------
 # Stage implementations
 # ---------------------------------------------------------------------------
@@ -194,15 +270,17 @@ def stage2_sa_pre(
         )
 
     if not classifications:
-        safe_print("  [ERROR] SA pre-check returned no classifications")
-        return None
-
-    summary = get_sa_summary(classifications)
-    safe_print(
-        f"  [Stage 2] Pre: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} "
-        f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} "
-        f"({summary['supported_ratio']:.0%} supported)  optim_flags={list(optim_config.keys())}"
-    )
+        safe_print(
+            "  [WARN] SA pre-check: no classifications (no QNN rule data on this machine). "
+            "SA-driven optimization will be skipped; pipeline continues."
+        )
+    else:
+        summary = get_sa_summary(classifications)
+        safe_print(
+            f"  [Stage 2] Pre: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} "
+            f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} "
+            f"({summary['supported_ratio']:.0%} supported)  optim_flags={list(optim_config.keys())}"
+        )
     return classifications, optim_config, info_items
 
 
@@ -268,15 +346,14 @@ def stage4_sa_post(
             return None
 
     if not classifications:
-        safe_print("  [ERROR] SA post-check returned no classifications")
-        return None
-
-    summary = get_sa_summary(classifications)
-    safe_print(
-        f"  [Stage 4] Post: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} "
-        f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} "
-        f"({summary['supported_ratio']:.0%} supported)"
-    )
+        safe_print("  [WARN] SA post-check: no classifications (no QNN rule data). Continuing.")
+    else:
+        summary = get_sa_summary(classifications)
+        safe_print(
+            f"  [Stage 4] Post: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} "
+            f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} "
+            f"({summary['supported_ratio']:.0%} supported)"
+        )
     return classifications, info_items
 
 
@@ -361,6 +438,66 @@ def _compile_and_diff(
     return result
 
 
+def stage6_quantize(
+    model_dir: Path,
+    sa_opt_path: Path,
+    hf_id: str,
+    task: str,
+    use_cache: bool,
+    precision: str = "int8",
+    samples: int = 10,
+) -> Path | None:
+    """Stage 6: produce ``quantized.onnx`` from the SA-optimized model.
+
+    Invokes ``wmk quantize`` in a subprocess. An existing output is reused
+    as-is when use_cache is True.
+
+    Args:
+        model_dir: Directory that receives ``quantized.onnx``.
+        sa_opt_path: Model passed to ``wmk quantize -m``.
+        hf_id: Forwarded as ``--model-name`` when non-empty.
+        task: Forwarded as ``--task`` when non-empty.
+        use_cache: Reuse an existing ``quantized.onnx`` instead of re-running.
+        precision: Value for ``--precision``.
+        samples: Value for ``--samples``.
+
+    Returns:
+        The quantized ONNX path, or None if the command failed.
+    """
+    out_path = model_dir / "quantized.onnx"
+
+    if use_cache and is_cached(out_path):
+        safe_print(f"  [Stage 6] Quantize (cached): {out_path.name}")
+        return out_path
+
+    safe_print(f"  [Stage 6] Quantizing {sa_opt_path.name} → {out_path.name}...")
+    command = [sys.executable, "-m", "winml.modelkit.cli", "quantize"]
+    command += ["-m", str(sa_opt_path), "-o", str(out_path)]
+    command += ["--precision", precision, "--samples", str(samples)]
+    # Optional calibration hints are appended only when the caller supplied them.
+    for flag, value in (("--task", task), ("--model-name", hf_id)):
+        if value:
+            command += [flag, value]
+
+    proc = subprocess.run(  # noqa: S603
+        command,
+        capture_output=True,
+        text=True,
+        encoding="utf-8",
+        errors="replace",
+        env={**os.environ, "PYTHONIOENCODING": "utf-8"},
+    )
+    # Success needs both a zero exit code and is_cached() accepting the output file.
+    succeeded = proc.returncode == 0 and is_cached(out_path)
+    if not succeeded:
+        stderr_tail = (proc.stderr or "").strip()[-300:]
+        safe_print(f"  [Stage 6] Quantize failed (rc={proc.returncode}): {stderr_tail}")
+        return None
+
+    safe_print(f"  [Stage 6] Quantized: {out_path.name}")
+    return out_path
+
+
 def stage5_compile_and_diff(
     model_dir: Path,
     graph_opt_path: Path,
@@ -412,8 +549,14 @@ def evaluate_model(
     use_cache: bool,
     ep: str = "QNNExecutionProvider",
     device: str = "NPU",
+    run_perf: bool = True,
+    perf_iterations: int = 30,
+    perf_warmup: int = 5,
+    run_quantize: bool = True,
+    quantize_precision: str = "int8",
+    quantize_samples: int = 10,
 ) -> dict | None:
-    """Run the 4+1 stage SA eval pipeline for a single model."""
+    """Run the 4+1+1 stage SA eval pipeline for a single model."""
     hf_id = model_entry["hf_id"]
     task = model_entry.get("task", "")
     model_type = model_entry.get("model_type", "")
@@ -433,6 +576,35 @@ def evaluate_model(
     if graph_opt_path is None:
         return _skip_result(hf_id, task, model_type, skip_reason or "SKIP_EXPORT", model_dir)
 
+    # Perf after export and after graph optimize
+    perf_exported: dict | None = None
+    perf_graph_opt: dict | None = None
+    perf_sa_opt: dict | None = None
+
+    if run_perf:
+        exported_path = model_dir / "exported.onnx"
+        if is_cached(exported_path):
+            perf_exported = run_winml_perf(
+                "Perf (exported)",
+                exported_path,
+                model_dir / "exported_perf.json",
+                device=device,
+                ep=ep,
+                iterations=perf_iterations,
+                warmup=perf_warmup,
+                use_cache=use_cache,
+            )
+        perf_graph_opt = run_winml_perf(
+            "Perf (graph_opt)",
+            graph_opt_path,
+            model_dir / "graph_optimized_perf.json",
+            device=device,
+            ep=ep,
+            iterations=perf_iterations,
+            warmup=perf_warmup,
+            use_cache=use_cache,
+        )
+
     # Stage 2
     pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache, ep=ep, device=device)
     if pre_result is None:
@@ -444,6 +616,19 @@ def evaluate_model(
     if sa_opt_path is None:
         return _skip_result(hf_id, task, model_type, "SKIP_OPTIM", model_dir)
 
+    # Perf after SA capability optimization
+    if run_perf:
+        perf_sa_opt = run_winml_perf(
+            "Perf (sa_opt)",
+            sa_opt_path,
+            model_dir / "sa_optimized_perf.json",
+            device=device,
+            ep=ep,
+            iterations=perf_iterations,
+            warmup=perf_warmup,
+            use_cache=use_cache,
+        )
+
     # Stage 4
     post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache, ep=ep, device=device)
     if post_result is None:
@@ -462,6 +647,31 @@ def evaluate_model(
         ep=ep,
     )
 
+    # Stage 6: QDQ quantize
+    quantized_path: Path | None = None
+    perf_quantized: dict | None = None
+    if run_quantize:
+        quantized_path = stage6_quantize(
+            model_dir,
+            sa_opt_path,
+            hf_id,
+            task,
+            use_cache,
+            precision=quantize_precision,
+            samples=quantize_samples,
+        )
+        if run_perf and quantized_path is not None:
+            perf_quantized = run_winml_perf(
+                "Perf (quantized)",
+                quantized_path,
+                model_dir / "quantized_perf.json",
+                device=device,
+                ep=ep,
+                iterations=perf_iterations,
+                warmup=perf_warmup,
+                use_cache=use_cache,
+            )
+
     elapsed = time.monotonic() - t0
     delta = compute_delta(sa_pre, sa_post)
 
@@ -473,6 +683,18 @@ def evaluate_model(
         f"({delta['supported_ratio_delta']:+.0%})"
     )
 
+    if run_perf:
+
+        def _fmt(p: dict | None) -> str:  # "N/A" when the stage has no mean latency
+            return f"{p['mean']:.2f}ms" if p and p.get("mean") is not None else "N/A"
+
+        safe_print(
+            f"  Perf (mean): exported={_fmt(perf_exported)} "
+            f"→ normalize={_fmt(perf_graph_opt)} "
+            f"→ sa_opt={_fmt(perf_sa_opt)} "
+            f"→ quantize={_fmt(perf_quantized)}"
+        )
+
     result: dict = {
         "model": hf_id,
         "task": task,
@@ -483,6 +705,7 @@ def evaluate_model(
             "exported_onnx": str(model_dir / "exported.onnx"),
             "graph_optimized_onnx": str(graph_opt_path),
             "sa_optimized_onnx": str(sa_opt_path),
+            **({"quantized_onnx": str(quantized_path)} if quantized_path else {}),
         },
         "sa_pre": {
             "source_onnx": graph_opt_path.name,
@@ -513,6 +736,14 @@ def evaluate_model(
     if epcontext_diff_post:
         result["epcontext_diff_post"] = epcontext_diff_post
 
+    if run_perf:
+        result["perf"] = {
+            "exported": perf_exported,
+            "graph_optimized": perf_graph_opt,
+            "sa_optimized": perf_sa_opt,
+            "quantized": perf_quantized,
+        }
+
     out_file = model_dir / "sa_eval_result.json"
     out_file.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
     safe_print(f"  Written: {out_file}")
@@ -691,12 +922,71 @@ def main() -> None:
         help="Execution provider (default: QNNExecutionProvider)",
     )
     parser.add_argument("--device", default="NPU", help="Target device (default: NPU)")
+    parser.add_argument(
+        "--no-perf",
+        action="store_true",
+        help="Skip winml perf benchmarks after each stage",
+    )
+    parser.add_argument(
+        "--perf-iterations",
+        type=int,
+        default=30,
+        help="Number of perf iterations per stage (default: 30)",
+    )
+    parser.add_argument(
+        "--perf-warmup",
+        type=int,
+        default=5,
+        help="Number of perf warmup iterations per stage (default: 5)",
+    )
+    parser.add_argument(
+        "--no-quantize",
+        action="store_true",
+        help="Skip QDQ quantize step (stage 6)",
+    )
+    parser.add_argument(
+        "--quantize-precision",
+        default="int8",
+        help="Quantization precision (default: int8)",
+    )
+    parser.add_argument(
+        "--quantize-samples",
+        type=int,
+        default=10,
+        help="Number of calibration samples for quantize (default: 10)",
+    )
+    parser.add_argument(
+        "--report-only",
+        action="store_true",
+        help=(
+            "Skip all eval stages — collect existing sa_eval_result.json files from "
+            "models/ subdirectories and regenerate the HTML report only."
+        ),
+    )
     args = parser.parse_args()
 
-    output_dir = args.output_dir or Path(f"sa_eval_results/{date.today().isoformat()}")
+    output_dir = (args.output_dir or Path(f"sa_eval_results/{date.today().isoformat()}")).resolve()
     output_dir.mkdir(parents=True, exist_ok=True)
     safe_print(f"Output: {output_dir}")
 
+    # --report-only: collect existing per-model JSONs and regenerate the report
+    if args.report_only:
+        models_dir = output_dir / "models"
+        result_files = sorted(models_dir.glob("*/sa_eval_result.json"))
+        if not result_files:
+            safe_print(f"[ERROR] No sa_eval_result.json files found under {models_dir}")
+            sys.exit(1)
+        all_results = [json.loads(f.read_text(encoding="utf-8")) for f in result_files]
+        safe_print(f"Collected {len(all_results)} model results from disk.")
+        report = build_aggregate_report(all_results, args.models_file)
+        report_json = output_dir / "sa_eval_report.json"
+        report_json.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
+        safe_print(f"Report JSON: {report_json}")
+        report_html = output_dir / "sa_eval_report.html"
+        generate_sa_html_report(report, report_html)
+        safe_print(f"Report HTML: {report_html}")
+        return
+
     if not args.models_file.exists():
         safe_print(f"[ERROR] Models file not found: {args.models_file}")
         sys.exit(1)
@@ -718,7 +1008,17 @@ def main() -> None:
     for i, entry in enumerate(models_to_run, 1):
         safe_print(f"\n[{i}/{len(models_to_run)}]")
         result = evaluate_model(
-            entry, output_dir, use_cache=args.use_cache, ep=args.ep, device=args.device
+            entry,
+            output_dir,
+            use_cache=args.use_cache,
+            ep=args.ep,
+            device=args.device,
+            run_perf=not args.no_perf,
+            perf_iterations=args.perf_iterations,
+            perf_warmup=args.perf_warmup,
+            run_quantize=not args.no_quantize,
+            quantize_precision=args.quantize_precision,
+            quantize_samples=args.quantize_samples,
         )
         if result:
             all_results.append(result)
diff --git a/scripts/e2e_eval/sa_comparison.py b/scripts/e2e_eval/sa_comparison.py
index a4803de31..d1e4b976f 100644
--- a/scripts/e2e_eval/sa_comparison.py
+++ b/scripts/e2e_eval/sa_comparison.py
@@ -61,9 +61,11 @@ def run_sa_with_info(
     classifications: dict[str, str] = {}
     info_items: list[dict] = []
 
+    ep_found = False
     for ep_result in result.output.results:
         if ep_result.ep_type != ep:
             continue
+        ep_found = True
         for level_enum, pid_list in ep_result.classification.items():
             level = level_enum.value.upper()
             for pid in pid_list:
@@ -78,6 +80,18 @@ def run_sa_with_info(
         )
         break
 
+    if not ep_found:
+        # No rule data for this EP/device — SA skipped the EP entirely.
+        # Return empty classifications so callers can proceed without SA-driven
+        # optimization (perf comparison across stages still works).
+        import logging
+
+        logging.getLogger(__name__).warning(
+            "SA produced no results for EP=%s — no runtime rule data available. "
+            "Returning empty classifications.",
+            ep,
+        )
+
     # Get optimization config from SA recommendations
     optim_config = dict(result.get_optimization_config(ep))
 
diff --git a/scripts/e2e_eval/sa_report.py b/scripts/e2e_eval/sa_report.py
index 0ad748145..2c42de333 100644
--- a/scripts/e2e_eval/sa_report.py
+++ b/scripts/e2e_eval/sa_report.py
@@ -9,8 +9,6 @@
 EPContext ground-truth accuracy, and per-model drill-down.
 """
 
-# ruff: noqa: E501
-
 from __future__ import annotations
 
 import json
@@ -102,6 +100,31 @@ def generate_sa_html_report(report_data: dict, output_path: Path) -> None:
                     for c in r.get("epcontext_diff_post", {}).get("comparison", [])
                     if c["verdict"] == "FP"
                 ],
+                # perf (mean ms per stage)
+                "perf_exported_mean": r.get("perf", {}).get("exported", {}).get("mean")
+                if r.get("perf", {}).get("exported")
+                else None,
+                "perf_exported_p90": r.get("perf", {}).get("exported", {}).get("p90")
+                if r.get("perf", {}).get("exported")
+                else None,
+                "perf_graph_opt_mean": r.get("perf", {}).get("graph_optimized", {}).get("mean")
+                if r.get("perf", {}).get("graph_optimized")
+                else None,
+                "perf_graph_opt_p90": r.get("perf", {}).get("graph_optimized", {}).get("p90")
+                if r.get("perf", {}).get("graph_optimized")
+                else None,
+                "perf_sa_opt_mean": r.get("perf", {}).get("sa_optimized", {}).get("mean")
+                if r.get("perf", {}).get("sa_optimized")
+                else None,
+                "perf_sa_opt_p90": r.get("perf", {}).get("sa_optimized", {}).get("p90")
+                if r.get("perf", {}).get("sa_optimized")
+                else None,
+                "perf_quantized_mean": r.get("perf", {}).get("quantized", {}).get("mean")
+                if r.get("perf", {}).get("quantized")
+                else None,
+                "perf_quantized_p90": r.get("perf", {}).get("quantized", {}).get("p90")
+                if r.get("perf", {}).get("quantized")
+                else None,
             }
         )
 
@@ -379,6 +402,78 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
   </div>`).join('');
 }}
 
+// ---- Perf helpers ----
+function fmtMs(v) {{
+  if (v == null) return '<span class="c-muted">—</span>';
+  return `<span style="font-family:monospace">${{v.toFixed(1)}}</span>`;
+}}
+
+function gainPct(baseline, current) {{
+  if (!baseline || current == null) return '';  // no baseline (or 0 ms): a ratio would be NaN/Infinity
+  const pct = ((current - baseline) / baseline * 100);
+  const cls = pct < -1 ? 'c-good' : pct > 1 ? 'c-bad' : 'c-muted';
+  return `<span class="${{cls}}" style="font-size:10px">${{pct>0?'+':''}}${{pct.toFixed(0)}}%</span>`;
+}}
+
+function singlePerfCell(mean, baseline) {{
+  if (mean == null) return '<span class="c-muted">—</span>';
+  const g = baseline != null ? gainPct(baseline, mean) : '';
+  return `<span style="font-size:12px;font-weight:600;white-space:nowrap">${{fmtMs(mean)}}ms</span>${{g ? ' ' + g : ''}}`;
+}}
+
+function buildPerfDetail(d) {{
+  // Build the per-model perf table; returns '' when no stage has a mean latency.
+  // Rows follow pipeline order and stages without a measurement are left out.
+  const rows = [
+    {{ stage: 'exported.onnx',        mean: d.perf_exported_mean,   p90: d.perf_exported_p90   }},
+    {{ stage: 'graph_optimized.onnx', mean: d.perf_graph_opt_mean,  p90: d.perf_graph_opt_p90  }},
+    {{ stage: 'sa_optimized.onnx',    mean: d.perf_sa_opt_mean,     p90: d.perf_sa_opt_p90     }},
+    {{ stage: 'quantized.onnx',       mean: d.perf_quantized_mean,  p90: d.perf_quantized_p90  }},
+  ].filter(r => r.mean != null);
+  if (!rows.length) return '';
+
+  let html = `
+  <div style="font-size:10px;text-transform:uppercase;letter-spacing:0.8px;color:var(--text2);font-weight:600;margin:14px 0 8px;padding-bottom:4px;border-bottom:1px solid var(--border)">
+    Perf Comparison (NPU, mean latency)
+  </div>
+  <div style="overflow-x:auto">
+  <table style="font-size:12px;border-collapse:collapse;width:auto;min-width:420px">
+    <thead>
+      <tr>
+        <th style="text-align:left;padding:4px 12px 4px 0;color:var(--text2);font-size:10px;text-transform:uppercase;font-weight:600">Stage</th>
+        <th style="text-align:right;padding:4px 12px;color:var(--text2);font-size:10px;text-transform:uppercase;font-weight:600">Mean (ms)</th>
+        <th style="text-align:right;padding:4px 12px;color:var(--text2);font-size:10px;text-transform:uppercase;font-weight:600">P90 (ms)</th>
+        <th style="text-align:right;padding:4px 12px;color:var(--text2);font-size:10px;text-transform:uppercase;font-weight:600">vs Prev Stage</th>
+        <th style="text-align:left;padding:4px 0 4px 12px;color:var(--text2);font-size:10px;text-transform:uppercase;font-weight:600">Bar</th>
+      </tr>
+    </thead>
+    <tbody>`;
+
+  const maxMean = Math.max(...rows.map(r => r.mean ?? 0));
+  let prevMean = null;
+  rows.forEach(r => {{
+    // A previous mean of 0 cannot be a ratio base; show such rows as "baseline".
+    const pctVsPrev = (prevMean && r.mean != null)
+      ? ((r.mean - prevMean) / prevMean * 100) : null;
+    const deltaHtml = pctVsPrev == null ? '<span class="c-muted">baseline</span>'
+      : `<span class="${{pctVsPrev < -1 ? 'c-good' : pctVsPrev > 1 ? 'c-bad' : 'c-muted'}}">${{pctVsPrev > 0 ? '+' : ''}}${{pctVsPrev.toFixed(1)}}%</span>`;
+    const barWidth = (maxMean > 0 && r.mean != null) ? (r.mean / maxMean * 100).toFixed(1) : 0;
+    const barColor = pctVsPrev == null ? '#8b8fa3' : pctVsPrev < -1 ? '#4ecdc4' : pctVsPrev > 1 ? '#ff6b9d' : '#8b8fa3';
+    html += `<tr style="border-bottom:1px solid var(--border)">
+      <td style="padding:5px 12px 5px 0;font-family:monospace;color:var(--text)">${{esc(r.stage)}}</td>
+      <td style="text-align:right;padding:5px 12px;font-weight:600">${{r.mean != null ? r.mean.toFixed(2) : '—'}}</td>
+      <td style="text-align:right;padding:5px 12px;color:var(--text2)">${{r.p90 != null ? r.p90.toFixed(2) : '—'}}</td>
+      <td style="text-align:right;padding:5px 12px">${{deltaHtml}}</td>
+      <td style="padding:5px 0 5px 12px;min-width:120px">
+        ${{r.mean != null ? `<div style="height:6px;width:${{barWidth}}%;background:${{barColor}};border-radius:3px;min-width:2px"></div>` : ''}}
+      </td>
+    </tr>`;
+    if (r.mean != null) prevMean = r.mean;
+  }});
+  html += `</tbody></table></div>`;
+  return html;
+}}
+
 // ---- Per-Model Table ----
 let sortKey = 'pre_supported_asc';
 let searchQuery = '';
@@ -398,6 +493,14 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
     case 'pre_epctx_asc':  return s.sort((a,b) => (a.pre_epctx_accuracy||0) - (b.pre_epctx_accuracy||0));
     case 'post_epctx_desc': return s.sort((a,b) => (b.post_epctx_accuracy||0) - (a.post_epctx_accuracy||0));
     case 'post_epctx_asc':  return s.sort((a,b) => (a.post_epctx_accuracy||0) - (b.post_epctx_accuracy||0));
+    case 'perf_exported_asc':   return s.sort((a,b) => (a.perf_exported_mean??Infinity) - (b.perf_exported_mean??Infinity));
+    case 'perf_exported_desc':  return s.sort((a,b) => (b.perf_exported_mean??-Infinity) - (a.perf_exported_mean??-Infinity));
+    case 'perf_graph_opt_asc':  return s.sort((a,b) => (a.perf_graph_opt_mean??Infinity) - (b.perf_graph_opt_mean??Infinity));
+    case 'perf_graph_opt_desc': return s.sort((a,b) => (b.perf_graph_opt_mean??-Infinity) - (a.perf_graph_opt_mean??-Infinity));
+    case 'perf_sa_opt_asc':  return s.sort((a,b) => (a.perf_sa_opt_mean??Infinity) - (b.perf_sa_opt_mean??Infinity));
+    case 'perf_sa_opt_desc': return s.sort((a,b) => (b.perf_sa_opt_mean??-Infinity) - (a.perf_sa_opt_mean??-Infinity));
+    case 'perf_quantized_asc':  return s.sort((a,b) => (a.perf_quantized_mean??Infinity) - (b.perf_quantized_mean??Infinity));
+    case 'perf_quantized_desc': return s.sort((a,b) => (b.perf_quantized_mean??-Infinity) - (a.perf_quantized_mean??-Infinity));
   }}
   return s;
 }}
@@ -411,12 +514,16 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
   let html = `<div class="table-wrap"><table><thead><tr>
     <th onclick="toggleSort('model')">Model${{arrow('model')}}</th>
     <th>Task</th>
+    <th onclick="toggleSort('perf_exported')" title="Perf after export (NPU mean latency)">Export (ms)${{arrow('perf_exported')}}</th>
+    <th onclick="toggleSort('perf_graph_opt')" title="Perf after graph normalize (NPU mean latency)">Normalize (ms)${{arrow('perf_graph_opt')}}</th>
     <th onclick="toggleSort('pre_supported')">Pre SA${{arrow('pre_supported')}}</th>
-    <th onclick="toggleSort('pre_epctx')">Pre EPCtx${{arrow('pre_epctx')}}</th>
-    <th>SA Opt Flags</th>
+    <th>Flags</th>
+    <th onclick="toggleSort('perf_sa_opt')" title="Perf after SA optimize (NPU mean latency)">Optimized (ms)${{arrow('perf_sa_opt')}}</th>
     <th onclick="toggleSort('post_supported')">Post SA${{arrow('post_supported')}}</th>
+    <th onclick="toggleSort('perf_quantized')" title="Perf after QDQ quantize (NPU mean latency)">Quantize (ms)${{arrow('perf_quantized')}}</th>
+    <th onclick="toggleSort('delta')">Delta${{arrow('delta')}}</th>
+    <th onclick="toggleSort('pre_epctx')">Pre EPCtx${{arrow('pre_epctx')}}</th>
     <th onclick="toggleSort('post_epctx')">Post EPCtx${{arrow('post_epctx')}}</th>
-    <th onclick="toggleSort('delta')">SA Delta${{arrow('delta')}}</th>
     <th onclick="toggleSort('unknown')">Unknown${{arrow('unknown')}}</th>
     <th>Time</th>
   </tr></thead><tbody>`;
@@ -435,16 +542,20 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
     html += `<tr onclick="toggleDetail(${{i}})" style="cursor:pointer">
       <td><a class="hf-link" href="https://huggingface.co/${{esc(d.model)}}" target="_blank" onclick="event.stopPropagation()">${{esc(d.model)}}</a></td>
       <td><span class="badge badge-task">${{esc(d.task||'-')}}</span></td>
+      <td>${{singlePerfCell(d.perf_exported_mean, null)}}</td>
+      <td>${{singlePerfCell(d.perf_graph_opt_mean, d.perf_exported_mean)}}</td>
       <td>${{levelBar(d.pre_supported,d.pre_partial,d.pre_unsupported,d.pre_unknown)}}</td>
-      <td>${{epctxAcc(d.pre_epctx_accuracy)}}</td>
       <td>${{flagsCell}}</td>
+      <td>${{singlePerfCell(d.perf_sa_opt_mean, d.perf_graph_opt_mean)}}</td>
       <td>${{levelBar(d.post_supported,d.post_partial,d.post_unsupported,d.post_unknown)}}</td>
-      <td>${{epctxAcc(d.post_epctx_accuracy)}}</td>
+      <td>${{singlePerfCell(d.perf_quantized_mean, d.perf_sa_opt_mean)}}</td>
       <td>${{deltaPct(d.supported_ratio_delta)}}</td>
+      <td>${{epctxAcc(d.pre_epctx_accuracy)}}</td>
+      <td>${{epctxAcc(d.post_epctx_accuracy)}}</td>
       <td>${{unknownBadge}}</td>
       <td style="color:var(--text2);font-size:12px">${{(d.elapsed||0).toFixed(1)}}s</td>
     </tr>
-    <tr id="detail-${{i}}" style="display:none"><td colspan="10">
+    <tr id="detail-${{i}}" style="display:none"><td colspan="14">
       <div class="detail-panel">
 
         <!-- PRE SA ROW -->
@@ -498,6 +609,9 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
           </div>
         </div>
 
+        <!-- PERF COMPARISON -->
+        ${{buildPerfDetail(d)}}
+
       </div>
     </td></tr>`;
   }});
@@ -609,5 +723,26 @@ def _fallback_css() -> str:
   -webkit-background-clip: text; -webkit-text-fill-color: transparent;
 }
 .header-stats { display: flex; gap: 20px; font-size: 13px; color: var(--text2); }
-.header-stats span { font-weight: 600; color: var(--accent); }
-"""
+.header-stats span { font-weight: 600; color: var(--accent); }"""
+
+
+if __name__ == "__main__":
+    # Report-only refresh: rebuild the HTML next to an existing sa_eval_report.json.
+    import argparse
+    import json
+    import sys
+
+    cli = argparse.ArgumentParser(
+        description="Regenerate SA eval HTML report from existing JSON."
+    )
+    cli.add_argument("output_dir", type=Path, help="Directory containing sa_eval_report.json")
+    report_dir: Path = cli.parse_args().output_dir
+
+    source_json = report_dir / "sa_eval_report.json"
+    target_html = report_dir / "sa_eval_report.html"
+    if not source_json.exists():
+        print(f"[ERROR] Not found: {source_json}", file=sys.stderr)
+        sys.exit(1)
+
+    generate_sa_html_report(json.loads(source_json.read_text(encoding="utf-8")), target_html)
+    print(f"Report regenerated: {target_html}")
diff --git a/src/winml/modelkit/commands/quantize.py b/src/winml/modelkit/commands/quantize.py
index 58101a9d5..dbc69f728 100644
--- a/src/winml/modelkit/commands/quantize.py
+++ b/src/winml/modelkit/commands/quantize.py
@@ -98,6 +98,12 @@
     default=None,
     help="Task for calibration dataset selection (e.g., 'image-classification').",
 )
+@click.option(
+    "--model-name",
+    type=str,
+    default=None,
+    help="HuggingFace model ID for task-aware calibration tokenizer/processor.",
+)
 @click.option(
     "--verbose",
     "-v",
@@ -119,6 +125,7 @@ def quantize(
     per_channel: bool,
     symmetric: bool,
     task: str | None,
+    model_name: str | None,
     verbose: bool,
     config_file: Path | None,
 ) -> None:
@@ -200,6 +207,7 @@ def quantize(
         per_channel=per_channel,
         symmetric=symmetric,
         task=task,
+        model_name=model_name,
     )
 
     # Display dataset info from config

From 0b899f38e2c53da7f979edb538da058cd6e35858 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Tue, 12 May 2026 16:19:42 +0800
Subject: [PATCH 2/4] feat(eval): add --cleanup flag to delete intermediate
 ONNX files after each model

---
 scripts/e2e_eval/run_sa_eval.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py
index ad603b2d8..480570858 100644
--- a/scripts/e2e_eval/run_sa_eval.py
+++ b/scripts/e2e_eval/run_sa_eval.py
@@ -184,6 +184,28 @@ def run_winml_perf(
         return None
 
 
+# ---------------------------------------------------------------------------
+# Cleanup
+# ---------------------------------------------------------------------------
+
+
+def cleanup_onnx_artifacts(model_dir: Path) -> None:
+    """Delete intermediate ONNX files after eval, keeping only JSON/log results.
+
+    Removes all ``*.onnx`` and ``*.onnx.data`` files (exported, graph_optimized,
+    sa_optimized, quantized, compiled EPContext). JSON result files and perf
+    logs are preserved so --report-only and --use-cache still work for the
+    JSON-driven stages.
+    """
+    freed = 0
+    for pattern in ("*.onnx", "*.onnx.data"):
+        for f in model_dir.glob(pattern):
+            size = f.stat().st_size
+            f.unlink()
+            freed += size
+    safe_print(f"  [cleanup] Freed {freed / 1024**2:.1f} MB of ONNX artifacts")
+
+
 # ---------------------------------------------------------------------------
 # Stage implementations
 # ---------------------------------------------------------------------------
@@ -555,6 +577,7 @@ def evaluate_model(
     run_quantize: bool = True,
     quantize_precision: str = "int8",
     quantize_samples: int = 10,
+    cleanup: bool = False,
 ) -> dict | None:
     """Run the 4+1+1 stage SA eval pipeline for a single model."""
     hf_id = model_entry["hf_id"]
@@ -748,6 +771,9 @@ def _fmt(p: dict | None) -> str:
     out_file.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
     safe_print(f"  Written: {out_file}")
 
+    if cleanup:
+        cleanup_onnx_artifacts(model_dir)
+
     return result
 
 
@@ -955,6 +981,12 @@ def main() -> None:
         default=10,
         help="Number of calibration samples for quantize (default: 10)",
     )
+    parser.add_argument(
+        "--cleanup",
+        action="store_true",
+        help="Delete intermediate ONNX files after each model completes to free disk space. "
+        "JSON result and perf files are preserved.",
+    )
     parser.add_argument(
         "--report-only",
         action="store_true",
@@ -1019,6 +1051,7 @@ def main() -> None:
             run_quantize=not args.no_quantize,
             quantize_precision=args.quantize_precision,
             quantize_samples=args.quantize_samples,
+            cleanup=args.cleanup,
         )
         if result:
             all_results.append(result)

From 0d1691243487673f0f0aa4544bab01b542469e02 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Wed, 13 May 2026 16:15:20 +0800
Subject: [PATCH 3/4] feat(e2e_eval): improve SA eval report UX
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Default sort by perf gain descending (Unlocked models float to top)
- Add perf gain summary cards: Avg Perf Gain, Faster Models, Unlocked count
- Reorder summary cards to show perf gain metrics first
- Unlocked badge: compact purple pill style '⚡ Unlocked · Xms'
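- Hide models without quantize perf from main table
- Move SA Delta, Pre/Post EPCtx and Unknown columns out of the main table
  into a new 'SA Comparison' tab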
- Add footer showing quantized vs total complete model counts
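- Rename report title to 'WinML CLI Component Analysis Report'
- stage5_compile_and_diff: use '<stem>_ctx.onnx' instead of
  '<stem>_qnn_ctx.onnx' for the context model file name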
- Remove Regressed, Avg UNKNOWN (Pre→Post) and All-SUPPORTED (Post) summary cards
---
 scripts/e2e_eval/run_sa_eval.py |   4 +-
 scripts/e2e_eval/sa_report.py   | 202 ++++++++++++++++++++++++++------
 2 files changed, 169 insertions(+), 37 deletions(-)

diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py
index 480570858..561354e03 100644
--- a/scripts/e2e_eval/run_sa_eval.py
+++ b/scripts/e2e_eval/run_sa_eval.py
@@ -540,7 +540,7 @@ def stage5_compile_and_diff(
     diff_pre = _compile_and_diff(
         "5a (pre)",
         graph_opt_path,
-        graph_opt_path.stem + "_qnn_ctx.onnx",
+        graph_opt_path.stem + "_ctx.onnx",
         sa_pre,
         model_dir,
         use_cache,
@@ -550,7 +550,7 @@ def stage5_compile_and_diff(
     diff_post = _compile_and_diff(
         "5b (post)",
         sa_opt_path,
-        sa_opt_path.stem + "_qnn_ctx.onnx",
+        sa_opt_path.stem + "_ctx.onnx",
         sa_post,
         model_dir,
         use_cache,
diff --git a/scripts/e2e_eval/sa_report.py b/scripts/e2e_eval/sa_report.py
index 2c42de333..3dc0e5617 100644
--- a/scripts/e2e_eval/sa_report.py
+++ b/scripts/e2e_eval/sa_report.py
@@ -155,11 +155,23 @@ def generate_sa_html_report(report_data: dict, output_path: Path) -> None:
     avg_post = post_opt.get("avg_supported_ratio", 0)
     avg_delta = effectiveness.get("avg_supported_ratio_delta", 0)
     n_improved = effectiveness.get("models_improved", 0)
-    n_regressed = effectiveness.get("models_regressed", 0)
-    avg_pre_unknown = pre_opt.get("avg_unknown_count", 0)
-    avg_post_unknown = post_opt.get("avg_unknown_count", 0)
     delta_cls = "c-good" if avg_delta > 0 else "c-muted"
-    regressed_cls = "c-bad" if n_regressed > 0 else "c-muted"
+
+    # Perf gain summary stats (from viewer_data)
+    n_unlocked = sum(
+        1
+        for d in viewer_data
+        if d["perf_exported_mean"] is None and d["perf_quantized_mean"] is not None
+    )
+    gain_vals = [
+        (d["perf_exported_mean"] - d["perf_quantized_mean"]) / d["perf_exported_mean"] * 100
+        for d in viewer_data
+        if d["perf_exported_mean"] is not None and d["perf_quantized_mean"] is not None
+    ]
+    avg_perf_gain = sum(gain_vals) / len(gain_vals) if gain_vals else None
+    n_with_gain = sum(1 for g in gain_vals if g > 1)
+    gain_cls = "c-good" if avg_perf_gain and avg_perf_gain > 1 else "c-muted"
+    avg_gain_str = f"{avg_perf_gain:+.1f}%" if avg_perf_gain is not None else "—"
 
     def _epctx_card(label: str, acc: float | None, n: int) -> str:
         if not n or acc is None:
@@ -182,7 +194,7 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
 <head>
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
-<title>SA Eval Report</title>
+<title>WinML CLI Component Analysis Report</title>
 <style>
 {base_css}
 .summary-row {{ display: flex; gap: 16px; flex-wrap: wrap; margin-bottom: 20px; }}
@@ -268,7 +280,7 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
 <body>
 
 <div class="header">
-  <h1>SA Eval Report &mdash; Pre / Post Optimizer</h1>
+  <h1>WinML CLI Component Analysis Report</h1>
   <div class="header-stats">
     <div>Generated: <span>{generated_at}</span></div>
     <div>Models: <span>{n_complete}</span> / {n_total} complete</div>
@@ -279,6 +291,21 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
 
   <!-- Summary cards -->
   <div class="summary-row">
+    <div class="summary-card">
+      <div class="label">Avg Perf Gain</div>
+      <div class="value {gain_cls}">{avg_gain_str}</div>
+      <div style="font-size:11px;color:var(--text2)">Export → Quantize</div>
+    </div>
+    <div class="summary-card">
+      <div class="label">Faster Models</div>
+      <div class="value c-good">{n_with_gain}</div>
+      <div style="font-size:11px;color:var(--text2)">of {len(gain_vals)} w/ baseline</div>
+    </div>
+    <div class="summary-card">
+      <div class="label">⚡ Unlocked</div>
+      <div class="value" style="color:#8b5cf6">{n_unlocked}</div>
+      <div style="font-size:11px;color:var(--text2)">NPU-enabled by QDQ</div>
+    </div>
     <div class="summary-card">
       <div class="label">Avg SUPPORTED (Pre)</div>
       <div class="value c-info">{avg_pre * 100:.1f}%</div>
@@ -290,31 +317,20 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
       <div style="font-size:11px;color:var(--text2)">After optimization</div>
     </div>
     <div class="summary-card">
-      <div class="label">Avg Delta</div>
+      <div class="label">Avg SA Delta</div>
       <div class="value {delta_cls}">{avg_delta * 100:+.1f}%</div>
     </div>
     <div class="summary-card">
-      <div class="label">Improved</div>
+      <div class="label">SA Improved</div>
       <div class="value c-good">{n_improved}</div>
     </div>
-    <div class="summary-card">
-      <div class="label">Regressed</div>
-      <div class="value {regressed_cls}">{n_regressed}</div>
-    </div>
-    <div class="summary-card">
-      <div class="label">Avg UNKNOWN (Pre→Post)</div>
-      <div class="value c-muted" style="font-size:18px">{avg_pre_unknown:.1f} → {avg_post_unknown:.1f}</div>
-    </div>
-    <div class="summary-card">
-      <div class="label">All-SUPPORTED (Post)</div>
-      <div class="value c-good">{post_opt.get("models_all_supported", 0)}</div>
-    </div>
     {epctx_section}
   </div>
 
   <!-- Tabs -->
   <div class="tabs">
     <div class="tab active" data-tab="models">Per-Model Results</div>
+    <div class="tab" data-tab="sa-comparison">SA Comparison</div>
     <div class="tab" data-tab="improved">Improved Patterns</div>
     <div class="tab" data-tab="unresolved">Unresolved PARTIAL/UNSUPPORTED</div>
     <div class="tab" data-tab="unknown">Unknown Ops</div>
@@ -324,6 +340,10 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
     <input class="search-box" id="searchBox" type="text" placeholder="Search models...">
     <div id="modelTable"></div>
   </div>
+  <div class="tab-content" id="tab-sa-comparison">
+    <input class="search-box" id="saSearchBox" type="text" placeholder="Search models...">
+    <div id="saComparisonTable"></div>
+  </div>
   <div class="tab-content" id="tab-improved"><div id="improvedList"></div></div>
   <div class="tab-content" id="tab-unresolved"><div id="unresolvedList"></div></div>
   <div class="tab-content" id="tab-unknown"><div id="unknownList"></div></div>
@@ -415,6 +435,16 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
   return `<span class="${{cls}}" style="font-size:10px">${{pct>0?'+':''}}${{pct.toFixed(0)}}%</span>`;
 }}
 
+function perfGainCell(exported, quantized) {{
+  if (quantized == null) return '<span class="c-muted">—</span>';
+  if (exported == null) {{
+    return `<span style="background:#4c1d95;color:#ddd8fe;font-size:11px;font-weight:600;padding:2px 6px;border-radius:4px;white-space:nowrap">⚡ Unlocked · ${{fmtMs(quantized)}}ms</span>`;
+  }}
+  const gain = (exported - quantized) / exported * 100;
+  const cls = gain > 1 ? 'c-good' : gain < -1 ? 'c-bad' : 'c-muted';
+  return `<span class="${{cls}}" style="font-size:13px;font-weight:700">${{gain > 0 ? '+' : ''}}${{gain.toFixed(1)}}%</span>`;
+}}
+
 function singlePerfCell(mean, baseline) {{
   if (mean == null) return '<span class="c-muted">—</span>';
   const g = baseline != null ? gainPct(baseline, mean) : '';
@@ -475,7 +505,7 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
 }}
 
 // ---- Per-Model Table ----
-let sortKey = 'pre_supported_asc';
+let sortKey = 'perf_gain_desc';
 let searchQuery = '';
 
 function sortData(arr) {{
@@ -485,8 +515,17 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
     case 'pre_supported_desc': return s.sort((a,b) => b.pre_supported_ratio - a.pre_supported_ratio);
     case 'post_supported_asc': return s.sort((a,b) => a.post_supported_ratio - b.post_supported_ratio);
     case 'post_supported_desc':return s.sort((a,b) => b.post_supported_ratio - a.post_supported_ratio);
-    case 'delta_desc': return s.sort((a,b) => b.supported_ratio_delta - a.supported_ratio_delta);
-    case 'delta_asc':  return s.sort((a,b) => a.supported_ratio_delta - b.supported_ratio_delta);
+    case 'perf_gain_desc': return s.sort((a,b) => {{
+      // Unlocked (exported=null, quantized!=null) → treated as +Infinity gain, sorts first
+      const aG = a._perfGain ?? (a.perf_exported_mean == null ? Infinity : -Infinity);
+      const bG = b._perfGain ?? (b.perf_exported_mean == null ? Infinity : -Infinity);
+      return bG - aG;
+    }});
+    case 'perf_gain_asc':  return s.sort((a,b) => {{
+      const aG = a._perfGain ?? (a.perf_exported_mean == null ? -Infinity : Infinity);
+      const bG = b._perfGain ?? (b.perf_exported_mean == null ? -Infinity : Infinity);
+      return aG - bG;
+    }});
     case 'model_asc':  return s.sort((a,b) => a.model.localeCompare(b.model));
     case 'unknown_desc': return s.sort((a,b) => (b.pre_unknown||0) - (a.pre_unknown||0));
     case 'pre_epctx_desc': return s.sort((a,b) => (b.pre_epctx_accuracy||0) - (a.pre_epctx_accuracy||0));
@@ -506,8 +545,14 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
 }}
 
 function renderModelTable() {{
-  let filtered = DATA;
-  if (searchQuery) filtered = DATA.filter(d => d.model.toLowerCase().includes(searchQuery) || (d.task||'').toLowerCase().includes(searchQuery));
+  let filtered = DATA.filter(d => d.perf_quantized_mean != null);
+  if (searchQuery) filtered = filtered.filter(d => d.model.toLowerCase().includes(searchQuery) || (d.task||'').toLowerCase().includes(searchQuery));
+  // Precompute perf gain for sorting: (exported - quantized) / exported * 100
+  filtered.forEach(d => {{
+    d._perfGain = (d.perf_exported_mean != null && d.perf_quantized_mean != null)
+      ? (d.perf_exported_mean - d.perf_quantized_mean) / d.perf_exported_mean * 100
+      : null;
+  }});
   filtered = sortData(filtered);
   const arrow = col => sortKey===col+'_asc' ? ' \u2191' : sortKey===col+'_desc' ? ' \u2193' : '';
 
@@ -521,10 +566,7 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
     <th onclick="toggleSort('perf_sa_opt')" title="Perf after SA optimize (NPU mean latency)">Optimized (ms)${{arrow('perf_sa_opt')}}</th>
     <th onclick="toggleSort('post_supported')">Post SA${{arrow('post_supported')}}</th>
     <th onclick="toggleSort('perf_quantized')" title="Perf after QDQ quantize (NPU mean latency)">Quantize (ms)${{arrow('perf_quantized')}}</th>
-    <th onclick="toggleSort('delta')">Delta${{arrow('delta')}}</th>
-    <th onclick="toggleSort('pre_epctx')">Pre EPCtx${{arrow('pre_epctx')}}</th>
-    <th onclick="toggleSort('post_epctx')">Post EPCtx${{arrow('post_epctx')}}</th>
-    <th onclick="toggleSort('unknown')">Unknown${{arrow('unknown')}}</th>
+    <th onclick="toggleSort('perf_gain')" title="Total perf gain: (Export - Quantize) / Export">Perf Gain${{arrow('perf_gain')}}</th>
     <th>Time</th>
   </tr></thead><tbody>`;
 
@@ -549,13 +591,10 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
       <td>${{singlePerfCell(d.perf_sa_opt_mean, d.perf_graph_opt_mean)}}</td>
       <td>${{levelBar(d.post_supported,d.post_partial,d.post_unsupported,d.post_unknown)}}</td>
       <td>${{singlePerfCell(d.perf_quantized_mean, d.perf_sa_opt_mean)}}</td>
-      <td>${{deltaPct(d.supported_ratio_delta)}}</td>
-      <td>${{epctxAcc(d.pre_epctx_accuracy)}}</td>
-      <td>${{epctxAcc(d.post_epctx_accuracy)}}</td>
-      <td>${{unknownBadge}}</td>
+      <td>${{perfGainCell(d.perf_exported_mean, d.perf_quantized_mean)}}</td>
       <td style="color:var(--text2);font-size:12px">${{(d.elapsed||0).toFixed(1)}}s</td>
     </tr>
-    <tr id="detail-${{i}}" style="display:none"><td colspan="14">
+    <tr id="detail-${{i}}" style="display:none"><td colspan="10">
       <div class="detail-panel">
 
         <!-- PRE SA ROW -->
@@ -616,7 +655,8 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
     </td></tr>`;
   }});
 
-  html += `</tbody></table></div><div style="font-size:12px;color:var(--text2);margin-top:8px">Showing ${{filtered.length}} of ${{DATA.length}} models</div>`;
+  const withQuant = DATA.filter(d => d.perf_quantized_mean != null).length;
+  html += `</tbody></table></div><div style="font-size:12px;color:var(--text2);margin-top:8px">Showing ${{filtered.length}} of ${{withQuant}} quantized models (${{DATA.length}} total complete)</div>`;
   document.getElementById('modelTable').innerHTML = html;
 }}
 
@@ -635,6 +675,97 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
   renderModelTable();
 }});
 
+// ---- SA Comparison Tab ----
+let saSearchQuery = '';
+let saSortKey = 'delta_desc';
+
+function saSortData(arr) {{
+  const s = [...arr];
+  switch(saSortKey) {{
+    case 'delta_desc': return s.sort((a,b) => b.supported_ratio_delta - a.supported_ratio_delta);
+    case 'delta_asc':  return s.sort((a,b) => a.supported_ratio_delta - b.supported_ratio_delta);
+    case 'pre_supported_desc': return s.sort((a,b) => b.pre_supported_ratio - a.pre_supported_ratio);
+    case 'pre_supported_asc':  return s.sort((a,b) => a.pre_supported_ratio - b.pre_supported_ratio);
+    case 'post_supported_desc': return s.sort((a,b) => b.post_supported_ratio - a.post_supported_ratio);
+    case 'post_supported_asc':  return s.sort((a,b) => a.post_supported_ratio - b.post_supported_ratio);
+    case 'pre_epctx_desc': return s.sort((a,b) => (b.pre_epctx_accuracy??-1) - (a.pre_epctx_accuracy??-1));
+    case 'pre_epctx_asc':  return s.sort((a,b) => (a.pre_epctx_accuracy??-1) - (b.pre_epctx_accuracy??-1));
+    case 'post_epctx_desc': return s.sort((a,b) => (b.post_epctx_accuracy??-1) - (a.post_epctx_accuracy??-1));
+    case 'post_epctx_asc':  return s.sort((a,b) => (a.post_epctx_accuracy??-1) - (b.post_epctx_accuracy??-1));
+    case 'unknown_desc': return s.sort((a,b) => (b.pre_unknown||0) - (a.pre_unknown||0));
+    case 'unknown_asc':  return s.sort((a,b) => (a.pre_unknown||0) - (b.pre_unknown||0));
+    case 'model_asc': return s.sort((a,b) => a.model.localeCompare(b.model));
+  }}
+  return s;
+}}
+
+function renderSAComparisonTable() {{
+  let filtered = DATA.filter(d => d.pre_supported != null);
+  if (saSearchQuery) filtered = filtered.filter(d => d.model.toLowerCase().includes(saSearchQuery) || (d.task||'').toLowerCase().includes(saSearchQuery));
+  filtered = saSortData(filtered);
+  const arrow = col => saSortKey===col+'_asc' ? ' \u2191' : saSortKey===col+'_desc' ? ' \u2193' : '';
+
+  function saToggleSort(col) {{
+    saSortKey = saSortKey === col+'_desc' ? col+'_asc' : col+'_desc';
+    renderSAComparisonTable();
+  }}
+  window._saToggleSort = saToggleSort;
+
+  function epctxBlock(accuracy, fn_ops, fp_ops) {{
+    if (accuracy == null) return '<span class="c-muted" style="font-size:11px">—</span>';
+    const cls = accuracy >= 0.9 ? 'c-good' : 'c-warn';
+    const falseNeg = (fn_ops||[]).slice(0,3).map(p => `<div style="font-size:10px;color:var(--text2);font-family:monospace">${{esc(p)}}</div>`).join('');
+    const falsePosN = (fp_ops||[]).length;
+    return `<div><span class="${{cls}}" style="font-size:13px;font-weight:700">${{pct(accuracy)}}</span>`
+      + (falseNeg ? `<div style="margin-top:3px"><span style="font-size:9px;color:#ff6b9d;text-transform:uppercase;letter-spacing:0.5px">FN</span>${{falseNeg}}</div>` : '')
+      + (falsePosN ? `<div style="font-size:10px;color:#ffd93d">FP: ${{falsePosN}} ops</div>` : '')
+      + '</div>';
+  }}
+
+  let html = `<div class="table-wrap"><table><thead><tr>
+    <th onclick="_saToggleSort('model')">Model${{arrow('model')}}</th>
+    <th>Task</th>
+    <th onclick="_saToggleSort('pre_supported')" title="SA analysis on graph_optimized.onnx">Pre SA${{arrow('pre_supported')}}</th>
+    <th>SA Opt Flags</th>
+    <th onclick="_saToggleSort('post_supported')" title="SA analysis on sa_optimized.onnx">Post SA${{arrow('post_supported')}}</th>
+    <th onclick="_saToggleSort('delta')" title="Supported ratio: Post - Pre">SA Delta${{arrow('delta')}}</th>
+    <th onclick="_saToggleSort('pre_epctx')" title="SA prediction accuracy vs compiled graph_optimized">Pre EPCtx${{arrow('pre_epctx')}}</th>
+    <th onclick="_saToggleSort('post_epctx')" title="SA prediction accuracy vs compiled sa_optimized">Post EPCtx${{arrow('post_epctx')}}</th>
+    <th onclick="_saToggleSort('unknown')">Unknown${{arrow('unknown')}}</th>
+    <th>Time</th>
+  </tr></thead><tbody>`;
+
+  filtered.forEach(d => {{
+    const flagsCell = (d.optim_flags||[]).length
+      ? d.optim_flags.map(f => `<div style="font-size:10px;font-family:monospace;white-space:nowrap;color:var(--accent2)">${{esc(f)}}</div>`).join('')
+      : '<span class="c-muted" style="font-size:11px">—</span>';
+    const delta = d.supported_ratio_delta;
+    const deltaHtml = delta == null
+      ? '<span class="c-muted">—</span>'
+      : `<span class="${{delta > 0.005 ? 'c-good' : delta < -0.005 ? 'c-bad' : 'c-muted'}}" style="font-size:13px;font-weight:700">${{delta > 0 ? '+' : ''}}${{(delta*100).toFixed(1)}}%</span>`;
+    html += `<tr>
+      <td><a class="hf-link" href="https://huggingface.co/${{esc(d.model)}}" target="_blank">${{esc(d.model)}}</a></td>
+      <td><span class="badge badge-task">${{esc(d.task||'-')}}</span></td>
+      <td>${{levelBar(d.pre_supported,d.pre_partial,d.pre_unsupported,d.pre_unknown)}}</td>
+      <td>${{flagsCell}}</td>
+      <td>${{levelBar(d.post_supported,d.post_partial,d.post_unsupported,d.post_unknown)}}</td>
+      <td>${{deltaHtml}}</td>
+      <td>${{epctxBlock(d.pre_epctx_accuracy, d.pre_epctx_fn_ops, d.pre_epctx_fp_ops)}}</td>
+      <td>${{epctxBlock(d.post_epctx_accuracy, d.post_epctx_fn_ops, d.post_epctx_fp_ops)}}</td>
+      <td>${{d.pre_unknown > 0 ? `<span class="badge badge-unknown">${{d.pre_unknown}}</span>` : '<span class="c-muted">0</span>'}}</td>
+      <td style="color:var(--text2);font-size:12px">${{(d.elapsed||0).toFixed(1)}}s</td>
+    </tr>`;
+  }});
+
+  html += `</tbody></table></div><div style="font-size:12px;color:var(--text2);margin-top:8px">Showing ${{filtered.length}} of ${{DATA.length}} models</div>`;
+  document.getElementById('saComparisonTable').innerHTML = html;
+}}
+
+document.getElementById('saSearchBox').addEventListener('input', e => {{
+  saSearchQuery = e.target.value.toLowerCase();
+  renderSAComparisonTable();
+}});
+
 // ---- Pattern list tabs (shared renderer) ----
 function renderPatternList(containerId, items, color, emptyMsg, subtitle) {{
   if (!items.length) {{
@@ -670,6 +801,7 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str:
 }}
 
 renderModelTable();
+renderSAComparisonTable();
 renderPatternList('improvedList', COMMON_IMPROVED, '#4ecdc4',
   COMMON_FUSED.length ? '' : 'No improvement data yet.',
   'Patterns that moved PARTIAL/UNSUPPORTED \u2192 SUPPORTED (explicit level change). See also "Fused Away" below.');

From 5e531af6b4ff7289d4c51e8d8ef2da0fa4831597 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Wed, 13 May 2026 16:22:32 +0800
Subject: [PATCH 4/4] refactor(e2e_eval): rewrite pipeline to use winml build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace 6 manual wmk stages with winml config + winml build:
- Stage 1: winml config → build_config.json (export/quant/compile settings)
- Stage 2: winml build → export.onnx, optimized.onnx, quantized.onnx,
                         compiled.onnx, winml_build_config.json
- Stage 3: SA pre-check on export.onnx (via ONNXStaticAnalyzer Python API)
- Stage 4: SA post-check on optimized.onnx
- Stage 5: EPContext diff on compiled.onnx (produced by build)

Read SA optimization flags from winml_build_config.json['optim'] instead
of computing them via SA API. Result schema is backward-compatible with
sa_report.py (perf.graph_optimized=None, perf.sa_optimized=optimized.onnx).

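Add --no-compile flag; replace run_wmk_export with a generic _run_cli
helper (now also used by run_winml_perf); skip perf when the ONNX file is
missing or empty; extend --cleanup to also delete *.bin files.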
---
 scripts/e2e_eval/run_sa_eval.py | 736 ++++++++++++--------------------
 1 file changed, 283 insertions(+), 453 deletions(-)

diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py
index 561354e03..3fdaf6c86 100644
--- a/scripts/e2e_eval/run_sa_eval.py
+++ b/scripts/e2e_eval/run_sa_eval.py
@@ -3,27 +3,26 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-"""SA accuracy evaluation — four-stage self-contained pipeline.
+"""WinML CLI component analysis evaluation pipeline.
 
-Reads models from model_with_acc.json, runs export + graph optimize,
-SA pre-check, capability-driven optimization, SA post-check, and optional
-EPContext diff against cached compiled ONNX.
+Reads models from models_with_acc.json, runs winml build to produce all
+pipeline artifacts, then runs SA pre/post analysis on the build outputs.
 
 Pipeline per model:
-  Stage 1: wmk export + Python optimize_onnx (default)
-           → graph_optimized.onnx
-  Stage 2: ONNXStaticAnalyzer (enable_information=True)
-           → sa_pre.json + optim_config
-  Stage 3: Python optimize_onnx(**optim_config) → sa_optimized.onnx
-  Stage 4: ONNXStaticAnalyzer → sa_post.json
-  Stage 5: EPContext diff (optional, uses ~/.cache/winml/artifacts/)
-           → epcontext comparison vs sa_post predictions
+  Stage 1: winml config  → build_config.json
+  Stage 2: winml build   → export.onnx, optimized.onnx, quantized.onnx,
+                           compiled.onnx, winml_build_config.json
+  Stage 3: SA pre-check  → ONNXStaticAnalyzer on export.onnx → sa_pre.json
+  Stage 4: SA post-check → ONNXStaticAnalyzer on optimized.onnx → sa_post.json
+  Stage 5: EPContext diff → compare_sa_vs_epcontext on compiled.onnx
+  Perf: run winml perf on export.onnx, optimized.onnx, quantized.onnx
 
 Usage:
     uv run python scripts/e2e_eval/run_sa_eval.py
-    uv run python scripts/e2e_eval/run_sa_eval.py --model ProsusAI/finbert
+    uv run python scripts/e2e_eval/run_sa_eval.py --model microsoft/resnet-50
     uv run python scripts/e2e_eval/run_sa_eval.py --output-dir sa_eval_results/2026-03-27
     uv run python scripts/e2e_eval/run_sa_eval.py --use-cache
+    uv run python scripts/e2e_eval/run_sa_eval.py --report-only --output-dir sa_eval_results/2026-05-12
 """
 
 from __future__ import annotations
@@ -82,30 +81,18 @@ def is_cached(path: Path) -> bool:
     return path.exists() and path.stat().st_size > 0
 
 
-def run_wmk_export(hf_id: str, task: str, output: Path) -> tuple[int, str]:
-    """Run wmk export via subprocess. Returns (rc, stderr_tail)."""
-    args = [
-        sys.executable,
-        "-m",
-        "winml.modelkit.cli",
-        "export",
-        "--model",
-        hf_id,
-        "--output",
-        str(output),
-        "--clean-onnx",
-    ]
-    if task:
-        args += ["--task", task]
+def _run_cli(args: list[str]) -> tuple[int, str]:
+    """Run a winml CLI command via subprocess. Returns (rc, combined_output)."""
     result = subprocess.run(  # noqa: S603
-        args,
+        [sys.executable, "-m", "winml.modelkit.cli", *args],
         capture_output=True,
         text=True,
         encoding="utf-8",
         errors="replace",
         env={**os.environ, "PYTHONIOENCODING": "utf-8"},
     )
-    return result.returncode, (result.stderr or "").strip()[-500:]
+    combined = ((result.stdout or "") + "\n" + (result.stderr or "")).strip()
+    return result.returncode, combined
 
 
 # Map full EP names to the short form accepted by `wmk perf --ep`
@@ -139,11 +126,12 @@ def run_winml_perf(
         except Exception:
             return None
 
+    if not is_cached(onnx_path):
+        safe_print(f"  [{label}] Skipping perf — {onnx_path.name} not found")
+        return None
+
     safe_print(f"  [{label}] Running perf on {onnx_path.name}...")
     cmd = [
-        sys.executable,
-        "-m",
-        "winml.modelkit.cli",
         "perf",
         "-m",
         str(onnx_path),
@@ -160,16 +148,9 @@ def run_winml_perf(
     if ep_arg:
         cmd += ["--ep", ep_arg]
 
-    result = subprocess.run(  # noqa: S603
-        cmd,
-        capture_output=True,
-        text=True,
-        encoding="utf-8",
-        errors="replace",
-        env={**os.environ, "PYTHONIOENCODING": "utf-8"},
-    )
-    if result.returncode != 0 or not is_cached(output_json):
-        safe_print(f"  [{label}] Perf failed (rc={result.returncode})")
+    rc, _ = _run_cli(cmd)
+    if rc != 0 or not is_cached(output_json):
+        safe_print(f"  [{label}] Perf failed (rc={rc})")
         return None
 
     try:
@@ -190,20 +171,19 @@ def run_winml_perf(
 
 
 def cleanup_onnx_artifacts(model_dir: Path) -> None:
-    """Delete intermediate ONNX files after eval, keeping only JSON/log results.
+    """Delete intermediate ONNX and binary artifacts after eval.
 
-    Removes all ``*.onnx`` and ``*.onnx.data`` files (exported, graph_optimized,
-    sa_optimized, quantized, compiled EPContext). JSON result files and perf
-    logs are preserved so --report-only and --use-cache still work for the
-    JSON-driven stages.
+    Removes ``*.onnx``, ``*.onnx.data``, and ``*.bin`` (QNN binary) files.
+    JSON result files and perf logs are preserved so --report-only and
+    --use-cache still work for the JSON-driven stages.
     """
     freed = 0
-    for pattern in ("*.onnx", "*.onnx.data"):
+    for pattern in ("*.onnx", "*.onnx.data", "*.bin"):
         for f in model_dir.glob(pattern):
             size = f.stat().st_size
             f.unlink()
             freed += size
-    safe_print(f"  [cleanup] Freed {freed / 1024**2:.1f} MB of ONNX artifacts")
+    safe_print(f"  [cleanup] Freed {freed / 1024**2:.1f} MB of artifacts")
 
 
 # ---------------------------------------------------------------------------
@@ -211,354 +191,230 @@ def cleanup_onnx_artifacts(model_dir: Path) -> None:
 # ---------------------------------------------------------------------------
 
 
-def stage1_export_optimize(
+def stage_build(
     hf_id: str,
     task: str,
     model_dir: Path,
     use_cache: bool,
-) -> tuple[Path | None, str | None]:
-    """Export HF model and apply baseline graph optimization.
+    precision: str = "int8",
+    device: str = "npu",
+    ep: str | None = None,
+    run_compile: bool = True,
+    run_quantize: bool = True,
+) -> str | None:
+    """Generate build config and run winml build.
+
+    Runs ``winml config`` then ``winml build`` to produce all pipeline
+    artifacts: export.onnx, optimized.onnx, quantized.onnx, compiled.onnx,
+    and winml_build_config.json.
 
-    Returns (graph_optimized_path, None) on success,
-    or (None, skip_reason) on failure.
+    Returns None on success, or a skip reason string on failure.
     """
-    from winml.modelkit.optim import optimize_onnx
+    config_path = model_dir / "build_config.json"
+    export_path = model_dir / "export.onnx"
+    optimized_path = model_dir / "optimized.onnx"
 
-    exported_path = model_dir / "exported.onnx"
-    graph_opt_path = model_dir / "graph_optimized.onnx"
+    # Skip entire build if key artifacts already exist and cache is enabled
+    if use_cache and is_cached(export_path) and is_cached(optimized_path):
+        safe_print("  [Build] Using cached artifacts (export.onnx, optimized.onnx)")
+        return None
+
+    # Stage 1: Generate config
+    safe_print(f"  [Build] Generating config for {hf_id}...")
+    config_args = [
+        "config",
+        "-m",
+        hf_id,
+        "-d",
+        device,
+        "-p",
+        precision,
+        "-o",
+        str(config_path),
+    ]
+    if task:
+        config_args += ["-t", task]
+    if run_compile:
+        config_args += ["--compile"]
+    if not run_quantize:
+        config_args += ["--no-quant"]
+
+    rc, output = _run_cli(config_args)
+    if rc != 0 or not is_cached(config_path):
+        safe_print(f"  [ERROR] Config generation failed (rc={rc}): {output[-300:]}")
+        return "SKIP_EXPORT"
+    safe_print(f"  [Build] Config written: {config_path.name}")
+
+    # Stage 2: Run winml build
+    safe_print("  [Build] Running winml build...")
+    build_args = [
+        "build",
+        "-c",
+        str(config_path),
+        "-m",
+        hf_id,
+        "-o",
+        str(model_dir),
+        "--ep",
+        _EP_TO_PERF_ARG.get(ep or "", ep or "qnn") if ep else "qnn",
+    ]
+    if not use_cache:
+        build_args += ["--rebuild"]
+    if not run_compile:
+        build_args += ["--no-compile"]
+    if not run_quantize:
+        build_args += ["--no-quant"]
+
+    rc, output = _run_cli(build_args)
+
+    # Check for required artifacts regardless of rc (partial builds are useful)
+    if not is_cached(export_path):
+        safe_print(f"  [ERROR] Build failed — no export.onnx produced: {output[-300:]}")
+        return "SKIP_EXPORT"
+
+    if not is_cached(optimized_path):
+        safe_print(f"  [ERROR] Build failed — no optimized.onnx produced: {output[-300:]}")
+        return "SKIP_OPTIM"
+
+    produced = [
+        p.name
+        for p in model_dir.iterdir()
+        if p.suffix in (".onnx", ".json") and not p.name.startswith("sa_")
+    ]
+    safe_print(f"  [Build] Complete: {', '.join(sorted(produced))}")
+    return None
 
-    # Stage 1a: Export (subprocess — HF download + tracing)
-    if use_cache and is_cached(exported_path):
-        safe_print("  [Stage 1a] Export (cached)")
-    else:
-        safe_print(f"  [Stage 1a] Exporting {hf_id}...")
-        rc, stderr = run_wmk_export(hf_id, task, exported_path)
-        if rc != 0 or not is_cached(exported_path):
-            safe_print(f"  [ERROR] Export failed (rc={rc}): {stderr}")
-            return None, "SKIP_EXPORT"
-        safe_print(f"  [Stage 1a] Exported: {exported_path.name}")
-
-    # Stage 1b: Baseline graph optimization (Python API)
-    if use_cache and is_cached(graph_opt_path):
-        safe_print("  [Stage 1b] Graph optimize (cached)")
-    else:
-        safe_print("  [Stage 1b] Applying baseline graph optimization...")
-        try:
-            optimize_onnx(str(exported_path), str(graph_opt_path))
-        except Exception as e:
-            safe_print(f"  [ERROR] Graph optimize failed: {e}")
-            return None, "SKIP_GRAPH_OPTIM"
-        if not is_cached(graph_opt_path):
-            safe_print("  [ERROR] Graph optimize produced no output")
-            return None, "SKIP_GRAPH_OPTIM"
-        safe_print(f"  [Stage 1b] Optimized: {graph_opt_path.name}")
 
-    return graph_opt_path, None
+def read_optim_flags(model_dir: Path) -> dict:
+    """Read optim flags written by winml build into winml_build_config.json."""
+    config_path = model_dir / "winml_build_config.json"
+    if not config_path.exists():
+        return {}
+    try:
+        cfg = json.loads(config_path.read_text(encoding="utf-8"))
+        return cfg.get("optim", {})
+    except Exception:
+        return {}
 
 
-def stage2_sa_pre(
+def stage_sa_pre(
     model_dir: Path,
-    graph_opt_path: Path,
     use_cache: bool,
     ep: str = "QNNExecutionProvider",
     device: str = "NPU",
-) -> tuple[dict[str, str], dict, list[dict]] | None:
-    """Run SA with information on graph_optimized.onnx.
+) -> tuple[dict[str, str], list[dict]] | None:
+    """Run SA with information on export.onnx (pre-optimization state).
 
-    Returns (classifications, optim_config, info_items) or None on failure.
+    Returns (classifications, info_items) or None on failure.
     """
+    export_path = model_dir / "export.onnx"
     sa_pre_path = model_dir / "sa_pre.json"
-    optim_record_path = model_dir / "optimization_flags.json"
 
-    if use_cache and is_cached(sa_pre_path) and is_cached(optim_record_path):
-        safe_print("  [Stage 2] SA pre-check (cached)")
+    if use_cache and is_cached(sa_pre_path):
+        safe_print("  [SA Pre] Cached")
         classifications = parse_sa_json(sa_pre_path, ep=ep)
-        optim_record = json.loads(optim_record_path.read_text(encoding="utf-8"))
-        optim_config = optim_record.get("optim_config", {})
-        info_items = optim_record.get("info_items", [])
-    else:
-        safe_print("  [Stage 2] Running SA pre-check (with recommendations)...")
-        try:
-            classifications, optim_config, info_items = run_sa_with_info(
-                graph_opt_path, sa_pre_path, ep=ep, device=device
-            )
-        except Exception as e:
-            safe_print(f"  [ERROR] SA pre-check failed: {e}")
-            return None
-        # Persist optim_config + info so cache works
-        optim_record_path.write_text(
-            json.dumps({"optim_config": optim_config, "info_items": info_items}, indent=2),
-            encoding="utf-8",
+        return classifications, []
+
+    if not is_cached(export_path):
+        safe_print("  [ERROR] SA pre: export.onnx not found")
+        return None
+
+    safe_print("  [SA Pre] Analyzing export.onnx...")
+    try:
+        classifications, _, info_items = run_sa_with_info(
+            export_path, sa_pre_path, ep=ep, device=device
         )
+    except Exception as e:
+        safe_print(f"  [ERROR] SA pre failed: {e}")
+        return None
 
     if not classifications:
         safe_print(
-            "  [WARN] SA pre-check: no classifications (no QNN rule data on this machine). "
-            "SA-driven optimization will be skipped; pipeline continues."
+            "  [WARN] SA pre: no classifications (no QNN rule data on this machine). "
+            "Pipeline continues without SA-driven optimization."
         )
     else:
         summary = get_sa_summary(classifications)
         safe_print(
-            f"  [Stage 2] Pre: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} "
+            f"  [SA Pre] SUPPORTED={summary['supported']} PARTIAL={summary['partial']} "
             f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} "
-            f"({summary['supported_ratio']:.0%} supported)  optim_flags={list(optim_config.keys())}"
+            f"({summary['supported_ratio']:.0%} supported)"
         )
-    return classifications, optim_config, info_items
-
-
-def stage3_capability_optimize(
-    model_dir: Path,
-    graph_opt_path: Path,
-    optim_config: dict,
-    use_cache: bool,
-) -> Path | None:
-    """Run capability-driven optimization using SA's recommended config.
-
-    Uses optimize_onnx Python API directly with WinMLOptimizationConfig kwargs.
-    Returns path to sa_optimized.onnx, or None on failure.
-    """
-    from winml.modelkit.optim import optimize_onnx
-
-    sa_opt_path = model_dir / "sa_optimized.onnx"
-
-    if use_cache and is_cached(sa_opt_path):
-        safe_print(f"  [Stage 3] Capability optimize (cached, config={optim_config})")
-        return sa_opt_path
-
-    safe_print(f"  [Stage 3] Capability optimization (config={optim_config})...")
-    try:
-        optimize_onnx(str(graph_opt_path), str(sa_opt_path), **optim_config)
-    except Exception as e:
-        safe_print(f"  [ERROR] Capability optimize failed: {e}")
-        return None
-
-    if not is_cached(sa_opt_path):
-        safe_print("  [ERROR] Capability optimize produced no output")
-        return None
-
-    safe_print(f"  [Stage 3] Optimized: {sa_opt_path.name}")
-    return sa_opt_path
+    return classifications, info_items
 
 
-def stage4_sa_post(
+def stage_sa_post(
     model_dir: Path,
-    sa_opt_path: Path,
     use_cache: bool,
     ep: str = "QNNExecutionProvider",
     device: str = "NPU",
 ) -> tuple[dict[str, str], list[dict]] | None:
-    """Run SA on sa_optimized.onnx.
+    """Run SA with information on optimized.onnx (post-optimization state).
 
     Returns (classifications, info_items) or None on failure.
     """
+    optimized_path = model_dir / "optimized.onnx"
     sa_post_path = model_dir / "sa_post.json"
 
     if use_cache and is_cached(sa_post_path):
-        safe_print("  [Stage 4] SA post-check (cached)")
+        safe_print("  [SA Post] Cached")
         classifications = parse_sa_json(sa_post_path, ep=ep)
-        info_items = []
-    else:
-        safe_print("  [Stage 4] Running SA post-check...")
-        try:
-            classifications, _, info_items = run_sa_with_info(
-                sa_opt_path, sa_post_path, ep=ep, device=device
-            )
-        except Exception as e:
-            safe_print(f"  [ERROR] SA post-check failed: {e}")
-            return None
+        return classifications, []
+
+    if not is_cached(optimized_path):
+        safe_print("  [ERROR] SA post: optimized.onnx not found")
+        return None
+
+    safe_print("  [SA Post] Analyzing optimized.onnx...")
+    try:
+        classifications, _, info_items = run_sa_with_info(
+            optimized_path, sa_post_path, ep=ep, device=device
+        )
+    except Exception as e:
+        safe_print(f"  [ERROR] SA post failed: {e}")
+        return None
 
     if not classifications:
-        safe_print("  [WARN] SA post-check: no classifications (no QNN rule data). Continuing.")
+        safe_print("  [WARN] SA post: no classifications. Continuing.")
     else:
         summary = get_sa_summary(classifications)
         safe_print(
-            f"  [Stage 4] Post: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} "
+            f"  [SA Post] SUPPORTED={summary['supported']} PARTIAL={summary['partial']} "
             f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} "
             f"({summary['supported_ratio']:.0%} supported)"
         )
     return classifications, info_items
 
 
-def _run_compile(
-    onnx_path: Path,
-    output_dir: Path,
-    device: str = "npu",
-    ep: str | None = None,
-) -> tuple[int, str]:
-    """Run wmk compile --device <device> --no-quantize. Returns (rc, stderr_tail)."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "winml.modelkit.cli",
-        "compile",
-        "--model",
-        str(onnx_path),
-        "--device",
-        device,
-        "--no-quantize",
-        "--output-dir",
-        str(output_dir),
-    ]
-    if ep:
-        cmd += ["--ep", ep]
-    result = subprocess.run(  # noqa: S603
-        cmd,
-        capture_output=True,
-        text=True,
-        encoding="utf-8",
-        errors="replace",
-        env={**os.environ, "PYTHONIOENCODING": "utf-8"},
-    )
-    return result.returncode, (result.stderr or "").strip()[-500:]
-
-
-def _compile_and_diff(
-    label: str,
-    onnx_path: Path,
-    compiled_name: str,
-    sa_predictions: dict[str, str],
+def stage_epctx_diff(
     model_dir: Path,
-    use_cache: bool,
-    device: str = "npu",
-    ep: str | None = None,
+    sa_post: dict[str, str],
 ) -> dict | None:
-    """Compile an ONNX and compare against SA predictions.
+    """EPContext diff: compare SA post predictions vs compiled.onnx.
 
-    Args:
-        label: Log prefix, e.g. "5a (pre)" or "5b (post)".
-        onnx_path: ONNX to compile (graph_optimized or sa_optimized).
-        compiled_name: Expected output filename, e.g. "graph_optimized_qnn_ctx.onnx".
-        sa_predictions: SA classifications to compare against the compilation result.
-        model_dir: Directory for artifacts.
-        use_cache: Skip compile if compiled_name already exists.
+    winml build produces compiled.onnx from the quantized model. Comparing
+    SA post predictions (on optimized.onnx) against the compiled graph
+    identifies false positives and negatives in the SA classifier.
 
-    Returns EPContext comparison dict or None on failure.
+    Returns comparison dict or None if compiled.onnx is not available.
     """
-    compiled_path = model_dir / compiled_name
-
-    if use_cache and is_cached(compiled_path):
-        safe_print(f"  [Stage {label}] Compile (cached): {compiled_path.name}")
-    else:
-        safe_print(f"  [Stage {label}] Compiling {onnx_path.name} → EPContext...")
-        rc, _ = _run_compile(onnx_path, model_dir, device=device, ep=ep)
-        if rc != 0 or not is_cached(compiled_path):
-            safe_print(f"  [Stage {label}] Compile failed (rc={rc}) — skipping diff")
-            return None
-        safe_print(f"  [Stage {label}] Compiled: {compiled_path.name}")
-
-    try:
-        result = compare_sa_vs_epcontext(sa_predictions, compiled_path)
-    except Exception as e:
-        safe_print(f"  [Stage {label}] EPContext diff failed: {e}")
+    compiled_path = model_dir / "compiled.onnx"
+    if not is_cached(compiled_path):
+        safe_print("  [EPCtx] No compiled.onnx — skipping diff")
         return None
 
-    s = result["summary"]
-    safe_print(
-        f"  [Stage {label}] TP={s['tp']} TN={s['tn']} FP={s['fp']} FN={s['fn']} "
-        f"accuracy={s['accuracy']:.0%}"
-    )
-    return result
-
-
-def stage6_quantize(
-    model_dir: Path,
-    sa_opt_path: Path,
-    hf_id: str,
-    task: str,
-    use_cache: bool,
-    precision: str = "int8",
-    samples: int = 10,
-) -> Path | None:
-    """Stage 6: QDQ-quantize sa_optimized.onnx → quantized.onnx.
-
-    Runs ``wmk quantize`` on the SA-optimized model. Skips if the output
-    already exists and use_cache is True.
-
-    Returns the path to the quantized ONNX on success, None on failure.
-    """
-    quantized_path = model_dir / "quantized.onnx"
-
-    if use_cache and is_cached(quantized_path):
-        safe_print(f"  [Stage 6] Quantize (cached): {quantized_path.name}")
-        return quantized_path
-
-    safe_print(f"  [Stage 6] Quantizing {sa_opt_path.name} → {quantized_path.name}...")
-    cmd = [
-        sys.executable,
-        "-m",
-        "winml.modelkit.cli",
-        "quantize",
-        "-m",
-        str(sa_opt_path),
-        "-o",
-        str(quantized_path),
-        "--precision",
-        precision,
-        "--samples",
-        str(samples),
-    ]
-    if task:
-        cmd += ["--task", task]
-    if hf_id:
-        cmd += ["--model-name", hf_id]
-
-    result = subprocess.run(  # noqa: S603
-        cmd,
-        capture_output=True,
-        text=True,
-        encoding="utf-8",
-        errors="replace",
-        env={**os.environ, "PYTHONIOENCODING": "utf-8"},
-    )
-    if result.returncode != 0 or not is_cached(quantized_path):
+    try:
+        result = compare_sa_vs_epcontext(sa_post, compiled_path)
+        s = result["summary"]
         safe_print(
-            f"  [Stage 6] Quantize failed (rc={result.returncode}): {(result.stderr or '').strip()[-300:]}"
+            f"  [EPCtx] TP={s['tp']} TN={s['tn']} FP={s['fp']} FN={s['fn']} "
+            f"accuracy={s['accuracy']:.0%}"
         )
+        return result
+    except Exception as e:
+        safe_print(f"  [EPCtx] Diff failed: {e}")
         return None
 
-    safe_print(f"  [Stage 6] Quantized: {quantized_path.name}")
-    return quantized_path
-
-
-def stage5_compile_and_diff(
-    model_dir: Path,
-    graph_opt_path: Path,
-    sa_opt_path: Path,
-    sa_pre: dict[str, str],
-    sa_post: dict[str, str],
-    use_cache: bool,
-    device: str = "npu",
-    ep: str | None = None,
-) -> tuple[dict | None, dict | None]:
-    """Stage 5: compile both graph_optimized and sa_optimized, diff each vs its SA.
-
-    - 5a: graph_optimized.onnx  → compiled → compare vs sa_pre predictions
-    - 5b: sa_optimized.onnx     → compiled → compare vs sa_post predictions
-
-    Returns (epcontext_diff_pre, epcontext_diff_post).
-    """
-    diff_pre = _compile_and_diff(
-        "5a (pre)",
-        graph_opt_path,
-        graph_opt_path.stem + "_ctx.onnx",
-        sa_pre,
-        model_dir,
-        use_cache,
-        device=device,
-        ep=ep,
-    )
-    diff_post = _compile_and_diff(
-        "5b (post)",
-        sa_opt_path,
-        sa_opt_path.stem + "_ctx.onnx",
-        sa_post,
-        model_dir,
-        use_cache,
-        device=device,
-        ep=ep,
-    )
-    return diff_pre, diff_post
-
 
 # ---------------------------------------------------------------------------
 # Per-model evaluation
@@ -576,10 +432,10 @@ def evaluate_model(
     perf_warmup: int = 5,
     run_quantize: bool = True,
     quantize_precision: str = "int8",
-    quantize_samples: int = 10,
+    run_compile: bool = True,
     cleanup: bool = False,
 ) -> dict | None:
-    """Run the 4+1+1 stage SA eval pipeline for a single model."""
+    """Run the winml build + SA analysis pipeline for a single model."""
     hf_id = model_entry["hf_id"]
     task = model_entry.get("task", "")
     model_type = model_entry.get("model_type", "")
@@ -594,57 +450,45 @@ def evaluate_model(
 
     t0 = time.monotonic()
 
-    # Stage 1
-    graph_opt_path, skip_reason = stage1_export_optimize(hf_id, task, model_dir, use_cache)
-    if graph_opt_path is None:
-        return _skip_result(hf_id, task, model_type, skip_reason or "SKIP_EXPORT", model_dir)
+    # Stage 1+2: Generate config + winml build
+    skip_reason = stage_build(
+        hf_id,
+        task,
+        model_dir,
+        use_cache,
+        precision=quantize_precision,
+        device=device.lower(),
+        ep=ep,
+        run_compile=run_compile,
+        run_quantize=run_quantize,
+    )
+    if skip_reason:
+        return _skip_result(hf_id, task, model_type, skip_reason, model_dir)
+
+    export_path = model_dir / "export.onnx"
+    optimized_path = model_dir / "optimized.onnx"
+    quantized_path = model_dir / "quantized.onnx"
 
-    # Perf after export and after graph optimize
+    # Perf: export and optimized
     perf_exported: dict | None = None
-    perf_graph_opt: dict | None = None
     perf_sa_opt: dict | None = None
+    perf_quantized: dict | None = None
 
     if run_perf:
-        exported_path = model_dir / "exported.onnx"
-        if is_cached(exported_path):
-            perf_exported = run_winml_perf(
-                "Perf (exported)",
-                exported_path,
-                model_dir / "exported_perf.json",
-                device=device,
-                ep=ep,
-                iterations=perf_iterations,
-                warmup=perf_warmup,
-                use_cache=use_cache,
-            )
-        perf_graph_opt = run_winml_perf(
-            "Perf (graph_opt)",
-            graph_opt_path,
-            model_dir / "graph_optimized_perf.json",
+        perf_exported = run_winml_perf(
+            "Perf (export)",
+            export_path,
+            model_dir / "exported_perf.json",
             device=device,
             ep=ep,
             iterations=perf_iterations,
             warmup=perf_warmup,
             use_cache=use_cache,
         )
-
-    # Stage 2
-    pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache, ep=ep, device=device)
-    if pre_result is None:
-        return _skip_result(hf_id, task, model_type, "SKIP_SA_PRE", model_dir)
-    sa_pre, optim_config, pre_info_items = pre_result
-
-    # Stage 3
-    sa_opt_path = stage3_capability_optimize(model_dir, graph_opt_path, optim_config, use_cache)
-    if sa_opt_path is None:
-        return _skip_result(hf_id, task, model_type, "SKIP_OPTIM", model_dir)
-
-    # Perf after SA capability optimization
-    if run_perf:
         perf_sa_opt = run_winml_perf(
-            "Perf (sa_opt)",
-            sa_opt_path,
-            model_dir / "sa_optimized_perf.json",
+            "Perf (optimized)",
+            optimized_path,
+            model_dir / "optimized_perf.json",
             device=device,
             ep=ep,
             iterations=perf_iterations,
@@ -652,48 +496,38 @@ def evaluate_model(
             use_cache=use_cache,
         )
 
-    # Stage 4
-    post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache, ep=ep, device=device)
+    # Stage 3: SA pre-check (on export.onnx)
+    pre_result = stage_sa_pre(model_dir, use_cache, ep=ep, device=device)
+    if pre_result is None:
+        return _skip_result(hf_id, task, model_type, "SKIP_SA_PRE", model_dir)
+    sa_pre, pre_info_items = pre_result
+
+    # Stage 4: SA post-check (on optimized.onnx)
+    post_result = stage_sa_post(model_dir, use_cache, ep=ep, device=device)
     if post_result is None:
         return _skip_result(hf_id, task, model_type, "SKIP_SA_POST", model_dir)
     sa_post, post_info_items = post_result
 
-    # Stage 5: compile both ONNXes → EPContext diff pre and post
-    epcontext_diff_pre, epcontext_diff_post = stage5_compile_and_diff(
-        model_dir,
-        graph_opt_path,
-        sa_opt_path,
-        sa_pre,
-        sa_post,
-        use_cache,
-        device=device.lower(),
-        ep=ep,
-    )
+    # Read optimization flags from winml_build_config.json
+    optim_config = read_optim_flags(model_dir)
+    if optim_config:
+        safe_print(f"  [Optim flags] {list(optim_config.keys())}")
 
-    # Stage 6: QDQ quantize
-    quantized_path: Path | None = None
-    perf_quantized: dict | None = None
-    if run_quantize:
-        quantized_path = stage6_quantize(
-            model_dir,
-            sa_opt_path,
-            hf_id,
-            task,
-            use_cache,
-            precision=quantize_precision,
-            samples=quantize_samples,
+    # Stage 5: EPContext diff (on compiled.onnx from build)
+    epctx_result = stage_epctx_diff(model_dir, sa_post)
+
+    # Perf: quantized
+    if run_perf and run_quantize:
+        perf_quantized = run_winml_perf(
+            "Perf (quantized)",
+            quantized_path,
+            model_dir / "quantized_perf.json",
+            device=device,
+            ep=ep,
+            iterations=perf_iterations,
+            warmup=perf_warmup,
+            use_cache=use_cache,
         )
-        if run_perf and quantized_path is not None:
-            perf_quantized = run_winml_perf(
-                "Perf (quantized)",
-                quantized_path,
-                model_dir / "quantized_perf.json",
-                device=device,
-                ep=ep,
-                iterations=perf_iterations,
-                warmup=perf_warmup,
-                use_cache=use_cache,
-            )
 
     elapsed = time.monotonic() - t0
     delta = compute_delta(sa_pre, sa_post)
@@ -712,26 +546,27 @@ def _fmt(p: dict | None) -> str:
             return f"{p['mean']:.2f}ms" if p else "N/A"
 
         safe_print(
-            f"  Perf (mean): exported={_fmt(perf_exported)} "
-            f"→ normalize={_fmt(perf_graph_opt)} "
-            f"→ sa_opt={_fmt(perf_sa_opt)} "
-            f"→ quantize={_fmt(perf_quantized)}"
+            f"  Perf (mean): export={_fmt(perf_exported)} "
+            f"→ optimized={_fmt(perf_sa_opt)} "
+            f"→ quantized={_fmt(perf_quantized)}"
         )
 
+    artifacts: dict = {
+        "exported_onnx": str(export_path),
+        "optimized_onnx": str(optimized_path),
+    }
+    if is_cached(quantized_path):
+        artifacts["quantized_onnx"] = str(quantized_path)
+
     result: dict = {
         "model": hf_id,
         "task": task,
         "model_type": model_type,
         "status": "COMPLETE",
         "elapsed": round(elapsed, 2),
-        "artifacts": {
-            "exported_onnx": str(model_dir / "exported.onnx"),
-            "graph_optimized_onnx": str(graph_opt_path),
-            "sa_optimized_onnx": str(sa_opt_path),
-            **({"quantized_onnx": str(quantized_path)} if quantized_path else {}),
-        },
+        "artifacts": artifacts,
         "sa_pre": {
-            "source_onnx": graph_opt_path.name,
+            "source_onnx": export_path.name,
             "classifications": sa_pre,
             "summary": get_sa_summary(sa_pre),
             "partial_patterns": get_level_patterns(sa_pre, "PARTIAL"),
@@ -743,7 +578,7 @@ def _fmt(p: dict | None) -> str:
             "optim_config": optim_config,
         },
         "sa_post": {
-            "source_onnx": sa_opt_path.name,
+            "source_onnx": optimized_path.name,
             "classifications": sa_post,
             "summary": get_sa_summary(sa_post),
             "partial_patterns": get_level_patterns(sa_post, "PARTIAL"),
@@ -752,20 +587,22 @@ def _fmt(p: dict | None) -> str:
             "info_items": post_info_items,
         },
         "delta": delta,
-    }
-
-    if epcontext_diff_pre:
-        result["epcontext_diff_pre"] = epcontext_diff_pre
-    if epcontext_diff_post:
-        result["epcontext_diff_post"] = epcontext_diff_post
-
-    if run_perf:
-        result["perf"] = {
+        # perf keys match sa_report.py expectations:
+        #   "exported"       → perf_exported_mean (Export column)
+        #   "graph_optimized"→ None (Normalize column — not a separate build stage)
+        #   "sa_optimized"   → perf of optimized.onnx (Optimized column)
+        #   "quantized"      → perf_quantized_mean (Quantize column)
+        "perf": {
             "exported": perf_exported,
-            "graph_optimized": perf_graph_opt,
+            "graph_optimized": None,
             "sa_optimized": perf_sa_opt,
             "quantized": perf_quantized,
-        }
+        },
+    }
+
+    # EPContext diff stored as "post" (compiled.onnx from build is post-SA)
+    if epctx_result:
+        result["epcontext_diff_post"] = epctx_result
 
     out_file = model_dir / "sa_eval_result.json"
     out_file.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
@@ -833,19 +670,15 @@ def build_aggregate_report(results: list[dict], models_input: Path) -> dict:
         for pid in r["sa_pre"].get("unknown_patterns", []):
             unknown_pre_counter[pid] += 1
 
-    # EPContext accuracy (where available) — track pre and post separately
-    epctx_pre = [r for r in complete if r.get("epcontext_diff_pre")]
+    # EPContext accuracy — only post (compiled.onnx from build is post-SA)
     epctx_post = [r for r in complete if r.get("epcontext_diff_post")]
     epctx_summary: dict = {
-        "models_with_pre_gt": len(epctx_pre),
+        "models_with_pre_gt": 0,
         "models_with_post_gt": len(epctx_post),
-        "avg_accuracy_pre": round(
-            statistics.mean(r["epcontext_diff_pre"]["summary"]["accuracy"] for r in epctx_pre), 4
-        )
-        if epctx_pre
-        else None,
+        "avg_accuracy_pre": None,
         "avg_accuracy_post": round(
-            statistics.mean(r["epcontext_diff_post"]["summary"]["accuracy"] for r in epctx_post), 4
+            statistics.mean(r["epcontext_diff_post"]["summary"]["accuracy"] for r in epctx_post),
+            4,
         )
         if epctx_post
         else None,
@@ -915,10 +748,8 @@ def build_aggregate_report(results: list[dict], models_input: Path) -> dict:
 
 
 def main() -> None:
-    """Run SA accuracy evaluation pipeline for all models in the registry."""
-    parser = argparse.ArgumentParser(
-        description="SA accuracy evaluation — 4-stage self-contained pipeline"
-    )
+    """Run WinML component analysis evaluation pipeline."""
+    parser = argparse.ArgumentParser(description="WinML CLI component analysis evaluation pipeline")
     parser.add_argument(
         "--models-file",
         type=Path,
@@ -968,23 +799,22 @@ def main() -> None:
     parser.add_argument(
         "--no-quantize",
         action="store_true",
-        help="Skip QDQ quantize step (stage 6)",
+        help="Skip QDQ quantize step",
     )
     parser.add_argument(
         "--quantize-precision",
         default="int8",
-        help="Quantization precision (default: int8)",
+        help="Quantization precision passed to winml config (default: int8)",
     )
     parser.add_argument(
-        "--quantize-samples",
-        type=int,
-        default=10,
-        help="Number of calibration samples for quantize (default: 10)",
+        "--no-compile",
+        action="store_true",
+        help="Skip compilation step (no compiled.onnx / EPContext diff)",
     )
     parser.add_argument(
         "--cleanup",
         action="store_true",
-        help="Delete intermediate ONNX files after each model completes to free disk space. "
+        help="Delete ONNX and binary artifacts after each model completes. "
         "JSON result and perf files are preserved.",
     )
     parser.add_argument(
@@ -1050,7 +880,7 @@ def main() -> None:
             perf_warmup=args.perf_warmup,
             run_quantize=not args.no_quantize,
             quantize_precision=args.quantize_precision,
-            quantize_samples=args.quantize_samples,
+            run_compile=not args.no_compile,
             cleanup=args.cleanup,
         )
         if result: