From 7f9bfdd3c91a5f1d2dedcef76e592a942d4c0bb7 Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Tue, 12 May 2026 16:16:16 +0800 Subject: [PATCH 1/4] feat(eval): add 3-stage perf tracking, quantize step, and workflow-first HTML report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - run_sa_eval.py: run wmk perf after export, graph-optimize, SA-optimize, and QDQ-quantize (stage 6); add --no-perf, --perf-iterations, --perf-warmup, --no-quantize, --quantize-precision, --quantize-samples, --report-only flags; fix output_dir to resolve to absolute path; downgrade empty SA classification from fatal error to warning - sa_comparison.py: warn instead of silently returning empty results when SA produces no EP results (missing parquet rule data) - sa_report.py: reorganize table columns in workflow order (Export → Normalize → Pre SA → Flags → Optimized → Post SA → Quantize → Delta); chain-normalize perf gain% against previous stage; add __main__ CLI entrypoint for report-only refresh - quantize.py: add --model-name CLI option so task-aware calibration can load the correct HuggingFace tokenizer/processor --- scripts/e2e_eval/run_sa_eval.py | 342 ++++++++++++++++++++++-- scripts/e2e_eval/sa_comparison.py | 14 + scripts/e2e_eval/sa_report.py | 155 ++++++++++- src/winml/modelkit/commands/quantize.py | 8 + 4 files changed, 488 insertions(+), 31 deletions(-) diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py index daa730654..ad603b2d8 100644 --- a/scripts/e2e_eval/run_sa_eval.py +++ b/scripts/e2e_eval/run_sa_eval.py @@ -108,6 +108,82 @@ def run_wmk_export(hf_id: str, task: str, output: Path) -> tuple[int, str]: return result.returncode, (result.stderr or "").strip()[-500:] +# Map full EP names to the short form accepted by `wmk perf --ep` +_EP_TO_PERF_ARG: dict[str, str] = { + "QNNExecutionProvider": "qnn", + "DmlExecutionProvider": "dml", + "CPUExecutionProvider": "cpu", + "MIGraphXExecutionProvider": "migraphx", + "OpenVINOExecutionProvider": "openvino", + "VitisAIExecutionProvider": "vitisai", + "NvTensorRTRTXExecutionProvider": "nv_tensorrt_rtx", +} + + +def run_winml_perf( + label: str, + onnx_path: Path, + output_json: Path, + device: str, + ep: str | None, + iterations: int, + warmup: int, + use_cache: bool, +) -> dict | None: + """Run wmk perf on onnx_path. Returns latency_ms dict or None on failure.""" + if use_cache and is_cached(output_json): + safe_print(f" [{label}] Perf (cached): {output_json.name}") + try: + data = json.loads(output_json.read_text(encoding="utf-8")) + return data.get("latency_ms") + except Exception: + return None + + safe_print(f" [{label}] Running perf on {onnx_path.name}...") + cmd = [ + sys.executable, + "-m", + "winml.modelkit.cli", + "perf", + "-m", + str(onnx_path), + "--device", + device.lower(), + "--iterations", + str(iterations), + "--warmup", + str(warmup), + "--output", + str(output_json), + ] + ep_arg = _EP_TO_PERF_ARG.get(ep, ep.lower() if ep else None) if ep else None + if ep_arg: + cmd += ["--ep", ep_arg] + + result = subprocess.run( # noqa: S603 + cmd, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + env={**os.environ, "PYTHONIOENCODING": "utf-8"}, + ) + if result.returncode != 0 or not is_cached(output_json): + safe_print(f" [{label}] Perf failed (rc={result.returncode})") + return None + + try: + data = json.loads(output_json.read_text(encoding="utf-8")) + latency = data.get("latency_ms", {}) + mean_ms = latency.get("mean", 0) + p90_ms = latency.get("p90", 0) + safe_print(f" [{label}] mean={mean_ms:.2f}ms p90={p90_ms:.2f}ms") + return latency + except Exception as e: + safe_print(f" [{label}] Could not parse perf result: {e}") + return None + + # --------------------------------------------------------------------------- # Stage implementations # --------------------------------------------------------------------------- @@ -194,15 +270,17 @@ def stage2_sa_pre( ) if not classifications: - safe_print(" [ERROR] SA pre-check returned no classifications") - return None - - summary = get_sa_summary(classifications) - safe_print( - f" [Stage 2] Pre: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} " - f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} " - f"({summary['supported_ratio']:.0%} supported) optim_flags={list(optim_config.keys())}" - ) + safe_print( + " [WARN] SA pre-check: no classifications (no QNN rule data on this machine). " + "SA-driven optimization will be skipped; pipeline continues." + ) + else: + summary = get_sa_summary(classifications) + safe_print( + f" [Stage 2] Pre: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} " + f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} " + f"({summary['supported_ratio']:.0%} supported) optim_flags={list(optim_config.keys())}" + ) return classifications, optim_config, info_items @@ -268,15 +346,14 @@ def stage4_sa_post( return None if not classifications: - safe_print(" [ERROR] SA post-check returned no classifications") - return None - - summary = get_sa_summary(classifications) - safe_print( - f" [Stage 4] Post: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} " - f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} " - f"({summary['supported_ratio']:.0%} supported)" - ) + safe_print(" [WARN] SA post-check: no classifications (no QNN rule data). Continuing.") + else: + summary = get_sa_summary(classifications) + safe_print( + f" [Stage 4] Post: SUPPORTED={summary['supported']} PARTIAL={summary['partial']} " + f"UNSUPPORTED={summary['unsupported']} UNKNOWN={summary['unknown']} " + f"({summary['supported_ratio']:.0%} supported)" + ) return classifications, info_items @@ -361,6 +438,66 @@ def _compile_and_diff( return result +def stage6_quantize( + model_dir: Path, + sa_opt_path: Path, + hf_id: str, + task: str, + use_cache: bool, + precision: str = "int8", + samples: int = 10, +) -> Path | None: + """Stage 6: QDQ-quantize sa_optimized.onnx → quantized.onnx. + + Runs ``wmk quantize`` on the SA-optimized model. Skips if the output + already exists and use_cache is True. + + Returns the path to the quantized ONNX on success, None on failure. + """ + quantized_path = model_dir / "quantized.onnx" + + if use_cache and is_cached(quantized_path): + safe_print(f" [Stage 6] Quantize (cached): {quantized_path.name}") + return quantized_path + + safe_print(f" [Stage 6] Quantizing {sa_opt_path.name} → {quantized_path.name}...") + cmd = [ + sys.executable, + "-m", + "winml.modelkit.cli", + "quantize", + "-m", + str(sa_opt_path), + "-o", + str(quantized_path), + "--precision", + precision, + "--samples", + str(samples), + ] + if task: + cmd += ["--task", task] + if hf_id: + cmd += ["--model-name", hf_id] + + result = subprocess.run( # noqa: S603 + cmd, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + env={**os.environ, "PYTHONIOENCODING": "utf-8"}, + ) + if result.returncode != 0 or not is_cached(quantized_path): + safe_print( + f" [Stage 6] Quantize failed (rc={result.returncode}): {(result.stderr or '').strip()[-300:]}" + ) + return None + + safe_print(f" [Stage 6] Quantized: {quantized_path.name}") + return quantized_path + + def stage5_compile_and_diff( model_dir: Path, graph_opt_path: Path, @@ -412,8 +549,14 @@ def evaluate_model( use_cache: bool, ep: str = "QNNExecutionProvider", device: str = "NPU", + run_perf: bool = True, + perf_iterations: int = 30, + perf_warmup: int = 5, + run_quantize: bool = True, + quantize_precision: str = "int8", + quantize_samples: int = 10, ) -> dict | None: - """Run the 4+1 stage SA eval pipeline for a single model.""" + """Run the 4+1+1 stage SA eval pipeline for a single model.""" hf_id = model_entry["hf_id"] task = model_entry.get("task", "") model_type = model_entry.get("model_type", "") @@ -433,6 +576,35 @@ def evaluate_model( if graph_opt_path is None: return _skip_result(hf_id, task, model_type, skip_reason or "SKIP_EXPORT", model_dir) + # Perf after export and after graph optimize + perf_exported: dict | None = None + perf_graph_opt: dict | None = None + perf_sa_opt: dict | None = None + + if run_perf: + exported_path = model_dir / "exported.onnx" + if is_cached(exported_path): + perf_exported = run_winml_perf( + "Perf (exported)", + exported_path, + model_dir / "exported_perf.json", + device=device, + ep=ep, + iterations=perf_iterations, + warmup=perf_warmup, + use_cache=use_cache, + ) + perf_graph_opt = run_winml_perf( + "Perf (graph_opt)", + graph_opt_path, + model_dir / "graph_optimized_perf.json", + device=device, + ep=ep, + iterations=perf_iterations, + warmup=perf_warmup, + use_cache=use_cache, + ) + # Stage 2 pre_result = stage2_sa_pre(model_dir, graph_opt_path, use_cache, ep=ep, device=device) if pre_result is None: @@ -444,6 +616,19 @@ def evaluate_model( if sa_opt_path is None: return _skip_result(hf_id, task, model_type, "SKIP_OPTIM", model_dir) + # Perf after SA capability optimization + if run_perf: + perf_sa_opt = run_winml_perf( + "Perf (sa_opt)", + sa_opt_path, + model_dir / "sa_optimized_perf.json", + device=device, + ep=ep, + iterations=perf_iterations, + warmup=perf_warmup, + use_cache=use_cache, + ) + # Stage 4 post_result = stage4_sa_post(model_dir, sa_opt_path, use_cache, ep=ep, device=device) if post_result is None: @@ -462,6 +647,31 @@ def evaluate_model( ep=ep, ) + # Stage 6: QDQ quantize + quantized_path: Path | None = None + perf_quantized: dict | None = None + if run_quantize: + quantized_path = stage6_quantize( + model_dir, + sa_opt_path, + hf_id, + task, + use_cache, + precision=quantize_precision, + samples=quantize_samples, + ) + if run_perf and quantized_path is not None: + perf_quantized = run_winml_perf( + "Perf (quantized)", + quantized_path, + model_dir / "quantized_perf.json", + device=device, + ep=ep, + iterations=perf_iterations, + warmup=perf_warmup, + use_cache=use_cache, + ) + elapsed = time.monotonic() - t0 delta = compute_delta(sa_pre, sa_post) @@ -473,6 +683,18 @@ def evaluate_model( f"({delta['supported_ratio_delta']:+.0%})" ) + if run_perf: + + def _fmt(p: dict | None) -> str: + return f"{p['mean']:.2f}ms" if p else "N/A" + + safe_print( + f" Perf (mean): exported={_fmt(perf_exported)} " + f"→ normalize={_fmt(perf_graph_opt)} " + f"→ sa_opt={_fmt(perf_sa_opt)} " + f"→ quantize={_fmt(perf_quantized)}" + ) + result: dict = { "model": hf_id, "task": task, @@ -483,6 +705,7 @@ def evaluate_model( "exported_onnx": str(model_dir / "exported.onnx"), "graph_optimized_onnx": str(graph_opt_path), "sa_optimized_onnx": str(sa_opt_path), + **({"quantized_onnx": str(quantized_path)} if quantized_path else {}), }, "sa_pre": { "source_onnx": graph_opt_path.name, @@ -513,6 +736,14 @@ def evaluate_model( if epcontext_diff_post: result["epcontext_diff_post"] = epcontext_diff_post + if run_perf: + result["perf"] = { + "exported": perf_exported, + "graph_optimized": perf_graph_opt, + "sa_optimized": perf_sa_opt, + "quantized": perf_quantized, + } + out_file = model_dir / "sa_eval_result.json" out_file.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8") safe_print(f" Written: {out_file}") @@ -691,12 +922,71 @@ def main() -> None: help="Execution provider (default: QNNExecutionProvider)", ) parser.add_argument("--device", default="NPU", help="Target device (default: NPU)") + parser.add_argument( + "--no-perf", + action="store_true", + help="Skip winml perf benchmarks after each stage", + ) + parser.add_argument( + "--perf-iterations", + type=int, + default=30, + help="Number of perf iterations per stage (default: 30)", + ) + parser.add_argument( + "--perf-warmup", + type=int, + default=5, + help="Number of perf warmup iterations per stage (default: 5)", + ) + parser.add_argument( + "--no-quantize", + action="store_true", + help="Skip QDQ quantize step (stage 6)", + ) + parser.add_argument( + "--quantize-precision", + default="int8", + help="Quantization precision (default: int8)", + ) + parser.add_argument( + "--quantize-samples", + type=int, + default=10, + help="Number of calibration samples for quantize (default: 10)", + ) + parser.add_argument( + "--report-only", + action="store_true", + help=( + "Skip all eval stages — collect existing sa_eval_result.json files from " + "models/ subdirectories and regenerate the HTML report only." + ), + ) args = parser.parse_args() - output_dir = args.output_dir or Path(f"sa_eval_results/{date.today().isoformat()}") + output_dir = (args.output_dir or Path(f"sa_eval_results/{date.today().isoformat()}")).resolve() output_dir.mkdir(parents=True, exist_ok=True) safe_print(f"Output: {output_dir}") + # --report-only: collect existing per-model JSONs and regenerate the report + if args.report_only: + models_dir = output_dir / "models" + result_files = sorted(models_dir.glob("*/sa_eval_result.json")) + if not result_files: + safe_print(f"[ERROR] No sa_eval_result.json files found under {models_dir}") + sys.exit(1) + all_results = [json.loads(f.read_text(encoding="utf-8")) for f in result_files] + safe_print(f"Collected {len(all_results)} model results from disk.") + report = build_aggregate_report(all_results, args.models_file) + report_json = output_dir / "sa_eval_report.json" + report_json.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8") + safe_print(f"Report JSON: {report_json}") + report_html = output_dir / "sa_eval_report.html" + generate_sa_html_report(report, report_html) + safe_print(f"Report HTML: {report_html}") + return + if not args.models_file.exists(): safe_print(f"[ERROR] Models file not found: {args.models_file}") sys.exit(1) @@ -718,7 +1008,17 @@ def main() -> None: for i, entry in enumerate(models_to_run, 1): safe_print(f"\n[{i}/{len(models_to_run)}]") result = evaluate_model( - entry, output_dir, use_cache=args.use_cache, ep=args.ep, device=args.device + entry, + output_dir, + use_cache=args.use_cache, + ep=args.ep, + device=args.device, + run_perf=not args.no_perf, + perf_iterations=args.perf_iterations, + perf_warmup=args.perf_warmup, + run_quantize=not args.no_quantize, + quantize_precision=args.quantize_precision, + quantize_samples=args.quantize_samples, ) if result: all_results.append(result) diff --git a/scripts/e2e_eval/sa_comparison.py b/scripts/e2e_eval/sa_comparison.py index a4803de31..d1e4b976f 100644 --- a/scripts/e2e_eval/sa_comparison.py +++ b/scripts/e2e_eval/sa_comparison.py @@ -61,9 +61,11 @@ def run_sa_with_info( classifications: dict[str, str] = {} info_items: list[dict] = [] + ep_found = False for ep_result in result.output.results: if ep_result.ep_type != ep: continue + ep_found = True for level_enum, pid_list in ep_result.classification.items(): level = level_enum.value.upper() for pid in pid_list: @@ -78,6 +80,18 @@ def run_sa_with_info( ) break + if not ep_found: + # No rule data for this EP/device — SA skipped the EP entirely. + # Return empty classifications so callers can proceed without SA-driven + # optimization (perf comparison across stages still works). + import logging + + logging.getLogger(__name__).warning( + "SA produced no results for EP=%s — no runtime rule data available. " + "Returning empty classifications.", + ep, + ) + # Get optimization config from SA recommendations optim_config = dict(result.get_optimization_config(ep)) diff --git a/scripts/e2e_eval/sa_report.py b/scripts/e2e_eval/sa_report.py index 0ad748145..2c42de333 100644 --- a/scripts/e2e_eval/sa_report.py +++ b/scripts/e2e_eval/sa_report.py @@ -9,8 +9,6 @@ EPContext ground-truth accuracy, and per-model drill-down. """ -# ruff: noqa: E501 - from __future__ import annotations import json @@ -102,6 +100,31 @@ def generate_sa_html_report(report_data: dict, output_path: Path) -> None: for c in r.get("epcontext_diff_post", {}).get("comparison", []) if c["verdict"] == "FP" ], + # perf (mean ms per stage) + "perf_exported_mean": r.get("perf", {}).get("exported", {}).get("mean") + if r.get("perf", {}).get("exported") + else None, + "perf_exported_p90": r.get("perf", {}).get("exported", {}).get("p90") + if r.get("perf", {}).get("exported") + else None, + "perf_graph_opt_mean": r.get("perf", {}).get("graph_optimized", {}).get("mean") + if r.get("perf", {}).get("graph_optimized") + else None, + "perf_graph_opt_p90": r.get("perf", {}).get("graph_optimized", {}).get("p90") + if r.get("perf", {}).get("graph_optimized") + else None, + "perf_sa_opt_mean": r.get("perf", {}).get("sa_optimized", {}).get("mean") + if r.get("perf", {}).get("sa_optimized") + else None, + "perf_sa_opt_p90": r.get("perf", {}).get("sa_optimized", {}).get("p90") + if r.get("perf", {}).get("sa_optimized") + else None, + "perf_quantized_mean": r.get("perf", {}).get("quantized", {}).get("mean") + if r.get("perf", {}).get("quantized") + else None, + "perf_quantized_p90": r.get("perf", {}).get("quantized", {}).get("p90") + if r.get("perf", {}).get("quantized") + else None, } ) @@ -379,6 +402,78 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str: `).join(''); }} +// ---- Perf helpers ---- +function fmtMs(v) {{ + if (v == null) return ''; + return `${{v.toFixed(1)}}`; +}} + +function gainPct(baseline, current) {{ + if (baseline == null || current == null) return ''; + const pct = ((current - baseline) / baseline * 100); + const cls = pct < -1 ? 'c-good' : pct > 1 ? 'c-bad' : 'c-muted'; + return `${{pct>0?'+':''}}${{pct.toFixed(0)}}%`; +}} + +function singlePerfCell(mean, baseline) {{ + if (mean == null) return ''; + const g = baseline != null ? gainPct(baseline, mean) : ''; + return `${{fmtMs(mean)}}ms${{g ? ' ' + g : ''}}`; +}} + +function buildPerfDetail(d) {{ + const hasPerfData = d.perf_exported_mean != null || d.perf_graph_opt_mean != null || d.perf_sa_opt_mean != null; + if (!hasPerfData) return ''; + + const rows = [ + {{ stage: 'exported.onnx', mean: d.perf_exported_mean, p90: d.perf_exported_p90 }}, + {{ stage: 'graph_optimized.onnx', mean: d.perf_graph_opt_mean, p90: d.perf_graph_opt_p90 }}, + {{ stage: 'sa_optimized.onnx', mean: d.perf_sa_opt_mean, p90: d.perf_sa_opt_p90 }}, + {{ stage: 'quantized.onnx', mean: d.perf_quantized_mean, p90: d.perf_quantized_p90 }}, + ].filter(r => r.mean != null); + + let html = ` +
+ Perf Comparison (NPU, mean latency) +
+
+ + + + + + + + + + + `; + + const maxMean = Math.max(...rows.map(r => r.mean ?? 0)); + let prevMean = null; + rows.forEach(r => {{ + const pctVsPrev = (prevMean != null && r.mean != null) + ? ((r.mean - prevMean) / prevMean * 100) + : null; + const deltaHtml = pctVsPrev == null ? 'baseline' + : `${{pctVsPrev > 0 ? '+' : ''}}${{pctVsPrev.toFixed(1)}}%`; + const barWidth = (maxMean > 0 && r.mean != null) ? (r.mean / maxMean * 100).toFixed(1) : 0; + const barColor = pctVsPrev == null ? '#8b8fa3' : pctVsPrev < -1 ? '#4ecdc4' : pctVsPrev > 1 ? '#ff6b9d' : '#8b8fa3'; + html += ` + + + + + + `; + if (r.mean != null) prevMean = r.mean; + }}); + html += `
StageMean (ms)P90 (ms)vs Prev StageBar
${{esc(r.stage)}}${{r.mean != null ? r.mean.toFixed(2) : '—'}}${{r.p90 != null ? r.p90.toFixed(2) : '—'}}${{deltaHtml}} + ${{r.mean != null ? `
` : ''}} +
`; + return html; +}} + // ---- Per-Model Table ---- let sortKey = 'pre_supported_asc'; let searchQuery = ''; @@ -398,6 +493,14 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str: case 'pre_epctx_asc': return s.sort((a,b) => (a.pre_epctx_accuracy||0) - (b.pre_epctx_accuracy||0)); case 'post_epctx_desc': return s.sort((a,b) => (b.post_epctx_accuracy||0) - (a.post_epctx_accuracy||0)); case 'post_epctx_asc': return s.sort((a,b) => (a.post_epctx_accuracy||0) - (b.post_epctx_accuracy||0)); + case 'perf_exported_asc': return s.sort((a,b) => (a.perf_exported_mean??Infinity) - (b.perf_exported_mean??Infinity)); + case 'perf_exported_desc': return s.sort((a,b) => (b.perf_exported_mean??-Infinity) - (a.perf_exported_mean??-Infinity)); + case 'perf_graph_opt_asc': return s.sort((a,b) => (a.perf_graph_opt_mean??Infinity) - (b.perf_graph_opt_mean??Infinity)); + case 'perf_graph_opt_desc': return s.sort((a,b) => (b.perf_graph_opt_mean??-Infinity) - (a.perf_graph_opt_mean??-Infinity)); + case 'perf_sa_opt_asc': return s.sort((a,b) => (a.perf_sa_opt_mean??Infinity) - (b.perf_sa_opt_mean??Infinity)); + case 'perf_sa_opt_desc': return s.sort((a,b) => (b.perf_sa_opt_mean??-Infinity) - (a.perf_sa_opt_mean??-Infinity)); + case 'perf_quantized_asc': return s.sort((a,b) => (a.perf_quantized_mean??Infinity) - (b.perf_quantized_mean??Infinity)); + case 'perf_quantized_desc': return s.sort((a,b) => (b.perf_quantized_mean??-Infinity) - (a.perf_quantized_mean??-Infinity)); }} return s; }} @@ -411,12 +514,16 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str: let html = `
+ + - - + + + + + - `; @@ -435,16 +542,20 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str: html += ` + + - + - + + + - `; }}); @@ -609,5 +723,26 @@ def _fallback_css() -> str: -webkit-background-clip: text; -webkit-text-fill-color: transparent; } .header-stats { display: flex; gap: 20px; font-size: 13px; color: var(--text2); } -.header-stats span { font-weight: 600; color: var(--accent); } -""" +.header-stats span { font-weight: 600; color: var(--accent); }""" + + +if __name__ == "__main__": + import argparse + import json + import sys + + parser = argparse.ArgumentParser( + description="Regenerate SA eval HTML report from existing JSON." + ) + parser.add_argument("output_dir", type=Path, help="Directory containing sa_eval_report.json") + args = parser.parse_args() + + json_path = args.output_dir / "sa_eval_report.json" + if not json_path.exists(): + print(f"[ERROR] Not found: {json_path}", file=sys.stderr) + sys.exit(1) + + report = json.loads(json_path.read_text(encoding="utf-8")) + html_path = args.output_dir / "sa_eval_report.html" + generate_sa_html_report(report, html_path) + print(f"Report regenerated: {html_path}") diff --git a/src/winml/modelkit/commands/quantize.py b/src/winml/modelkit/commands/quantize.py index 58101a9d5..dbc69f728 100644 --- a/src/winml/modelkit/commands/quantize.py +++ b/src/winml/modelkit/commands/quantize.py @@ -98,6 +98,12 @@ default=None, help="Task for calibration dataset selection (e.g., 'image-classification').", ) +@click.option( + "--model-name", + type=str, + default=None, + help="HuggingFace model ID for task-aware calibration tokenizer/processor.", +) @click.option( "--verbose", "-v", @@ -119,6 +125,7 @@ def quantize( per_channel: bool, symmetric: bool, task: str | None, + model_name: str | None, verbose: bool, config_file: Path | None, ) -> None: @@ -200,6 +207,7 @@ def quantize( per_channel=per_channel, symmetric=symmetric, task=task, + model_name=model_name, ) # Display dataset info from config From 0b899f38e2c53da7f979edb538da058cd6e35858 Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Tue, 12 May 2026 16:19:42 +0800 Subject: [PATCH 2/4] feat(eval): add --cleanup flag to delete intermediate ONNX files after each model --- scripts/e2e_eval/run_sa_eval.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py index ad603b2d8..480570858 100644 --- a/scripts/e2e_eval/run_sa_eval.py +++ b/scripts/e2e_eval/run_sa_eval.py @@ -184,6 +184,28 @@ def run_winml_perf( return None +# --------------------------------------------------------------------------- +# Cleanup +# --------------------------------------------------------------------------- + + +def cleanup_onnx_artifacts(model_dir: Path) -> None: + """Delete intermediate ONNX files after eval, keeping only JSON/log results. + + Removes all ``*.onnx`` and ``*.onnx.data`` files (exported, graph_optimized, + sa_optimized, quantized, compiled EPContext). JSON result files and perf + logs are preserved so --report-only and --use-cache still work for the + JSON-driven stages. + """ + freed = 0 + for pattern in ("*.onnx", "*.onnx.data"): + for f in model_dir.glob(pattern): + size = f.stat().st_size + f.unlink() + freed += size + safe_print(f" [cleanup] Freed {freed / 1024**2:.1f} MB of ONNX artifacts") + + # --------------------------------------------------------------------------- # Stage implementations # --------------------------------------------------------------------------- @@ -555,6 +577,7 @@ def evaluate_model( run_quantize: bool = True, quantize_precision: str = "int8", quantize_samples: int = 10, + cleanup: bool = False, ) -> dict | None: """Run the 4+1+1 stage SA eval pipeline for a single model.""" hf_id = model_entry["hf_id"] @@ -748,6 +771,9 @@ def _fmt(p: dict | None) -> str: out_file.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8") safe_print(f" Written: {out_file}") + if cleanup: + cleanup_onnx_artifacts(model_dir) + return result @@ -955,6 +981,12 @@ def main() -> None: default=10, help="Number of calibration samples for quantize (default: 10)", ) + parser.add_argument( + "--cleanup", + action="store_true", + help="Delete intermediate ONNX files after each model completes to free disk space. " + "JSON result and perf files are preserved.", + ) parser.add_argument( "--report-only", action="store_true", @@ -1019,6 +1051,7 @@ def main() -> None: run_quantize=not args.no_quantize, quantize_precision=args.quantize_precision, quantize_samples=args.quantize_samples, + cleanup=args.cleanup, ) if result: all_results.append(result) From 0d1691243487673f0f0aa4544bab01b542469e02 Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Wed, 13 May 2026 16:15:20 +0800 Subject: [PATCH 3/4] feat(e2e_eval): improve SA eval report UX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Default sort by perf gain descending (Unlocked models float to top) - Add perf gain summary cards: Avg Perf Gain, Faster Models, Unlocked count - Reorder summary cards to show perf gain metrics first - Unlocked badge: compact purple pill style '⚡ Unlocked · Xms' - Hide models without quantize perf from main table - Add footer showing quantized vs total complete model counts - Rename report title to 'WinML CLI Component Analysis Report' - Remove Regressed summary card --- scripts/e2e_eval/run_sa_eval.py | 4 +- scripts/e2e_eval/sa_report.py | 202 ++++++++++++++++++++++++++------ 2 files changed, 169 insertions(+), 37 deletions(-) diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py index 480570858..561354e03 100644 --- a/scripts/e2e_eval/run_sa_eval.py +++ b/scripts/e2e_eval/run_sa_eval.py @@ -540,7 +540,7 @@ def stage5_compile_and_diff( diff_pre = _compile_and_diff( "5a (pre)", graph_opt_path, - graph_opt_path.stem + "_qnn_ctx.onnx", + graph_opt_path.stem + "_ctx.onnx", sa_pre, model_dir, use_cache, @@ -550,7 +550,7 @@ def stage5_compile_and_diff( diff_post = _compile_and_diff( "5b (post)", sa_opt_path, - sa_opt_path.stem + "_qnn_ctx.onnx", + sa_opt_path.stem + "_ctx.onnx", sa_post, model_dir, use_cache, diff --git a/scripts/e2e_eval/sa_report.py b/scripts/e2e_eval/sa_report.py index 2c42de333..3dc0e5617 100644 --- a/scripts/e2e_eval/sa_report.py +++ b/scripts/e2e_eval/sa_report.py @@ -155,11 +155,23 @@ def generate_sa_html_report(report_data: dict, output_path: Path) -> None: avg_post = post_opt.get("avg_supported_ratio", 0) avg_delta = effectiveness.get("avg_supported_ratio_delta", 0) n_improved = effectiveness.get("models_improved", 0) - n_regressed = effectiveness.get("models_regressed", 0) - avg_pre_unknown = pre_opt.get("avg_unknown_count", 0) - avg_post_unknown = post_opt.get("avg_unknown_count", 0) delta_cls = "c-good" if avg_delta > 0 else "c-muted" - regressed_cls = "c-bad" if n_regressed > 0 else "c-muted" + + # Perf gain summary stats (from viewer_data) + n_unlocked = sum( + 1 + for d in viewer_data + if d["perf_exported_mean"] is None and d["perf_quantized_mean"] is not None + ) + gain_vals = [ + (d["perf_exported_mean"] - d["perf_quantized_mean"]) / d["perf_exported_mean"] * 100 + for d in viewer_data + if d["perf_exported_mean"] is not None and d["perf_quantized_mean"] is not None + ] + avg_perf_gain = sum(gain_vals) / len(gain_vals) if gain_vals else None + n_with_gain = sum(1 for g in gain_vals if g > 1) + gain_cls = "c-good" if avg_perf_gain and avg_perf_gain > 1 else "c-muted" + avg_gain_str = f"{avg_perf_gain:+.1f}%" if avg_perf_gain is not None else "—" def _epctx_card(label: str, acc: float | None, n: int) -> str: if not n or acc is None: @@ -182,7 +194,7 @@ def _epctx_card(label: str, acc: float | None, n: int) -> str: -SA Eval Report +WinML CLI Component Analysis Report
Model${{arrow('model')}} TaskExport (ms)${{arrow('perf_exported')}}Normalize (ms)${{arrow('perf_graph_opt')}} Pre SA${{arrow('pre_supported')}}Pre EPCtx${{arrow('pre_epctx')}}SA Opt FlagsFlagsOptimized (ms)${{arrow('perf_sa_opt')}} Post SA${{arrow('post_supported')}}Quantize (ms)${{arrow('perf_quantized')}}Delta${{arrow('delta')}}Pre EPCtx${{arrow('pre_epctx')}} Post EPCtx${{arrow('post_epctx')}}SA Delta${{arrow('delta')}} Unknown${{arrow('unknown')}} Time
${{esc(d.model)}} ${{esc(d.task||'-')}}${{singlePerfCell(d.perf_exported_mean, null)}}${{singlePerfCell(d.perf_graph_opt_mean, d.perf_exported_mean)}} ${{levelBar(d.pre_supported,d.pre_partial,d.pre_unsupported,d.pre_unknown)}}${{epctxAcc(d.pre_epctx_accuracy)}} ${{flagsCell}}${{singlePerfCell(d.perf_sa_opt_mean, d.perf_graph_opt_mean)}} ${{levelBar(d.post_supported,d.post_partial,d.post_unsupported,d.post_unknown)}}${{epctxAcc(d.post_epctx_accuracy)}}${{singlePerfCell(d.perf_quantized_mean, d.perf_sa_opt_mean)}} ${{deltaPct(d.supported_ratio_delta)}}${{epctxAcc(d.pre_epctx_accuracy)}}${{epctxAcc(d.post_epctx_accuracy)}} ${{unknownBadge}} ${{(d.elapsed||0).toFixed(1)}}s