diff --git a/.github/workflows/modelkit-ci.yml b/.github/workflows/modelkit-ci.yml index a3e7ecba1..e51676ffb 100644 --- a/.github/workflows/modelkit-ci.yml +++ b/.github/workflows/modelkit-ci.yml @@ -35,7 +35,7 @@ jobs: paths: >- tests/unit/core tests/unit/onnx tests/unit/cache tests/unit/utils tests/unit/sysinfo tests/unit/inspect - tests/unit/optracing tests/regression + tests/regression name: test (${{ matrix.group }}) diff --git a/README.md b/README.md index ee64a0de2..1546a6309 100644 --- a/README.md +++ b/README.md @@ -1,65 +1,454 @@ # ModelKit -Accelerate Model Deployment on WinML. +![Status](https://img.shields.io/badge/status-early%20access-blue) +![Python](https://img.shields.io/badge/python-3.10%2B-blue?logo=python&logoColor=white) +![License](https://img.shields.io/badge/license-MIT-green) -ModelKit is a Python toolkit for converting and optimizing PyTorch models to ONNX format, targeting deployment on the [Windows ML](https://learn.microsoft.com/en-us/windows/ai/windows-ml/) runtime. It supports multiple hardware backends including QNN (Qualcomm Neural Processing SDK) and OpenVINO. +**ModelKit** is a CLI toolkit to build **portable, performant, and high-quality** models for Windows ML. It covers the entire journey from pretrained model to on-device inference — export, optimization, quantization, compilation, and benchmarking — across **all execution providers**, regardless of silicon. 
-## Features +--- -- **Universal ONNX Export** — Convert PyTorch and Hugging Face models to ONNX with hierarchy preservation -- **Model Analysis** — Validate ONNX models for operator support, shape inference, and backend compatibility -- **Quantization** — INT8/INT16 quantization with calibration dataset support -- **Optimization** — Graph optimizations tailored for target execution providers -- **Performance Profiling** — Operation-level tracing and hardware monitoring -- **Multi-Backend Support** — QNN, OpenVINO, DirectML, and ONNX Runtime CPU/GPU +## :dart: ModelKit Is Right for You If -## Getting Started +- [x] You want to build models that run on **any Windows device** — Qualcomm, Intel, AMD, NVIDIA, or CPU +- [x] You want to benchmark a model with **one command** — latency, throughput, and live hardware utilization +- [x] You want to catch compatibility issues **ahead of time** — unsupported ops, shape mismatches, EP gaps +- [x] You want **deep insights** into your model — I/O shapes, task mapping, operator coverage per EP +- [x] You want a **repeatable and traceable** model building process — config-driven, inspectable at every stage +- [x] You want **AI agents** to build and profile models for you — agent-ready skills for coding assistants -### Prerequisites +--- -- Windows 10/11 -- Python 3.10 -- [uv](https://github.com/astral-sh/uv) package manager +## :desktop_computer: Supported Hardware -### Installation +| Execution Provider | Hardware | Status | EP Flag | Device Flag | +|:-------------------|:---------|:------:|:--------|:------------| +| **QNN** | Qualcomm NPU (Snapdragon X Elite) | 🟢 Ready | `--ep qnn` | `--device npu` | +| **OpenVINO** | Intel NPU (Meteor Lake / Lunar Lake) | 🟢 Ready | `--ep openvino` | `--device npu` | +| **VitisAI** | AMD NPU (Ryzen AI) | 🟢 Ready | `--ep vitisai` | `--device npu` | +| **TensorRT** | NVIDIA discrete GPUs | 🔶 Planned | `--ep tensorrt` | `--device gpu` | +| **MIGraphX** | AMD discrete GPUs | 🔶 Planned | `--ep 
migraphx` | `--device gpu` | +| **DirectML** | Hardware-agnostic GPU backend | 🔶 Planned | `--ep dml` | `--device gpu` | +| **CPU** | Cross-platform fallback | ⚪ Always available | `--ep cpu` | `--device cpu` | + +> **Tip:** Use `--device auto` and ModelKit picks the best available device — NPU first, then GPU, then CPU. + +--- + +## :clipboard: Prerequisites + +### Required Software + +| **Component** | **How to Get It** | +|-----------|--------------| +| **Windows 11** (x64 or ARM64) | Windows 11 24H2+ required for NPU support | +| **UV** | Install [UV](https://github.com/astral-sh/uv) | +| **Windows App SDK Runtime 1.8** | [Latest Windows App SDK downloads](https://learn.microsoft.com/en-us/windows/apps/windows-app-sdk/downloads) | +| **ModelKit** (Python wheel) | See release instructions | + +### Required Hardware + +**ModelKit targets NPU.** We recommend testing on one of the following NPU devices: + +| Device | EP | Flag | +|--------|-----|------| +| Snapdragon X Elite (Qualcomm) | QNN | `--ep qnn --device npu` | +| Intel AI Boost (Meteor Lake / Lunar Lake) | OpenVINO | `--ep openvino --device npu` | +| AMD Ryzen AI (Phoenix / Hawk Point / Strix) | VitisAI | `--ep vitisai --device npu` | + +**No NPU?** Use `--device auto` — ModelKit will fall back to the best available device (GPU → CPU). Note that `winml compile` requires NPU and cannot run without one. + +### Accepted Inputs + +- **HuggingFace model ID** (e.g., `microsoft/resnet-50`) — weights are downloaded on first run +- **Local ONNX file** (e.g., `model.onnx`) — from `winml export`, `winml build`, or any ONNX you already have + +### The Golden Rule: Inspect First + +Before running any pipeline command, always verify the model is supported: + +```bash +winml inspect -m <model-id-or-onnx-path> +``` + +If `inspect` prints an error or shows `Unsupported`, **skip that model**. Only models that pass inspect are valid inputs for export, analyze, build, perf, and eval. 
+ +--- + +## :package: Installation + +ModelKit requires **Python 3.10** and is distributed as a Python wheel. We recommend [uv](https://docs.astral.sh/uv/) for fast, reproducible environment setup. + +**1. Create a Python 3.10 environment** + +```bash +uv venv --python 3.10 +``` + +Activate it: + +```bash +# Windows (PowerShell) +.venv\Scripts\activate + +# Windows (Git Bash / WSL) +source .venv/Scripts/activate +``` + +**2. Install from wheel** + +```bash +uv pip install winml_modelkit-<version>-py3-none-any.whl +``` + +**3. Verify your environment** + +```bash +winml sys --list-device --list-ep +``` + +Confirm that your target device and EP appear in the output: + +- **Snapdragon X Elite** — look for `QNNExecutionProvider` +- **Intel AI Boost** — look for `OpenVINOExecutionProvider` +- **AMD Ryzen AI** — look for `VitisAIExecutionProvider` + +If no NPU is detected, you can still use ModelKit with `--device auto` for most commands. The only exception is `winml compile`, which requires an NPU device. + +--- + +## :wrench: Commands + +| Category | Commands | Purpose | +|:---------|:---------|:--------| +| **Primitives** | `inspect` `export` `optimize` `quantize` `compile` | Single-stage building blocks | +| **Pipeline** | `config` `build` `perf` `eval` `run`\* | End-to-end orchestration | +| **Insights** | `analyze` `debug`\* | Diagnostics and compatibility | +| **Utilities** | `hub` `cache`\* `doctor`\* `setting`\* `sys` | Catalog, cache, and environment | + +\* = coming soon + +
+Primitives — one stage at a time + +**`winml inspect`** — Discover model metadata. Prints the task, model class, input/output tensor names and shapes, and execution provider compatibility. No weights are loaded — this reads only the model configuration, making it fast and lightweight. Always run inspect first to verify a model is supported. + +**`winml export`** — Convert a source model to ONNX. Takes a Hugging Face model ID (or local checkpoint) and produces a standards-compliant ONNX file with hierarchy-preserving metadata. + +**`winml optimize`** — Fuse operators, simplify graphs, and prepare for target EPs. Takes an ONNX model and an optimization config (typically generated by `winml analyze`) and applies graph-level transformations: operator fusion, constant folding, shape inference, and EP-specific rewrites. + +**`winml quantize`** — Compress to low-bit precision. Reduces model size and inference latency by converting weights and activations from FP32 to INT8 (or other low-bit formats). After quantization, the model is portable — it can run on any ONNX Runtime backend. + +**`winml compile`** — Generate device-specific binaries. Takes a quantized ONNX model and produces EP-specific compiled artifacts (for example, QNN context binaries for Qualcomm NPU). This step locks the model to a specific device but delivers the lowest possible inference latency. + +
+ +
+Pipeline — orchestrated workflows + +**`winml config`** — Auto-detect optimal settings into a JSON config. Inspects the model and generates a complete build specification: task, I/O shapes, optimization flags, quantization parameters, and target EP settings. The config file is reviewable, editable, and version-controllable — the single source of truth for your build. + +**`winml build`** — Orchestrate the full pipeline. Takes a config file and executes every stage in sequence: export, analyze, optimize, quantize, and compile. Two commands (`config` + `build`) replace eight manual steps. + +**`winml perf`** — Benchmark latency, throughput, and hardware utilization. Runs inference on the target device and reports latency percentiles (p50, p90, p99), throughput (inferences per second), and optionally live hardware monitoring (CPU, RAM, NPU utilization) with the `--monitor` flag. Can accept a local ONNX file or a Hugging Face model ID. + +**`winml eval`** — Measure model accuracy against reference datasets. Compares the output of your optimized/quantized model against the original to quantify any accuracy loss introduced by the pipeline. + +**`winml run`** — End-to-end inference with pre/post processing. *(Coming soon.)* + +
+ +
+Insights — understand what is happening inside + +**`winml analyze`** — Lint operators, check EP compatibility, and generate optimization config. The analyzer has two components: the **Linter** (like ESLint for ONNX) checks every operator against target EPs and classifies each as supported, partial, or unsupported. **AutoConf** detects suboptimal patterns and generates the optimization config that the optimizer consumes. Together they form the analyze-optimize loop. + +**`winml debug`** — Interactive model debugging and layer-by-layer inspection. *(Coming soon.)* + +
+ +
+Utilities — catalog, cache, and environment + +**`winml hub`** — Browse the curated built-in model catalog. + +**`winml cache`** — Manage built model artifacts and pipeline outputs. View, clean, or selectively remove cached models and intermediate files. + +**`winml doctor`** — Diagnose environment issues. Checks runtimes, execution providers, and dependencies to identify configuration problems. + +**`winml setting`** — Configure ModelKit preferences. Set default EPs, output directories, and other global options. + +**`winml sys`** — System information and capability reporting. Prints detected hardware, available EPs, Python version, and installed package versions. + +
+ +--- + +## :rocket: Quick Start + +### Inspect a Model + +The fastest way to get started is to inspect a model. Let's look at ResNet-50: ```bash -git clone https://github.com/microsoft/ModelKit.git -cd ModelKit -uv python install 3.10 -uv sync +winml inspect -m microsoft/resnet-50 ``` -### Usage +This prints the model's metadata without downloading weights: + +- **Task**: `image-classification` — what the model does +- **Model class**: `ResNetForImageClassification` — the architecture +- **Input tensors**: names, data types, and shapes (e.g., `pixel_values: float32 [1, 3, 224, 224]`) +- **Output tensors**: names, data types, and shapes (e.g., `logits: float32 [1, 1000]`) + +If inspect succeeds, the model is supported and you can proceed with the rest of the pipeline. + +> **Golden rule: always inspect first.** Before running export, build, perf, or any other pipeline command, verify the model is supported with `winml inspect`. -ModelKit provides a CLI tool `winml`: +### Build with Primitive Commands + +This walkthrough builds **ConvNeXT** (`facebook/convnext-base-224`) step by step using primitive commands. ConvNeXT is a family of CNN models inspired by Vision Transformers, introduced by Meta in 2022 — it offers high accuracy while retaining the efficiency of CNNs. 
+ +#### Phase 1: Inspect + +```bash +winml inspect -m facebook/convnext-base-224 +``` + +#### Phase 2: Build a Portable Model + +**Export** from PyTorch to ONNX: ```bash -# Export a Hugging Face model to ONNX -uv run winml export --model microsoft/resnet-50 --output ./output +winml export -m facebook/convnext-base-224 -o convnext/model.onnx -v +``` -# Analyze an ONNX model -uv run winml analyze --model ./output/model.onnx +**Analyze** for EP compatibility: -# Quantize an ONNX model -uv run winml quantize --model ./output/model.onnx +```bash +winml analyze -m convnext/model.onnx --optim-config optim.json ``` -## Contributions and Feedback +**Optimize** the graph using the analyzer's config: + +```bash +winml optimize -m convnext/model.onnx -c optim.json -o convnext/model_opt.onnx +``` + +**Quantize** to INT8: + +```bash +winml quantize -m convnext/model_opt.onnx -o convnext/model_opt_int8.onnx +``` + +#### Phase 3: Benchmark on Device + +**Compile** for NPU (generates device-specific binaries): + +```bash +winml compile -m convnext/model_opt_int8.onnx --ep qnn -o convnext/model_compiled.onnx +``` + +**Benchmark on NPU** — note the latency: + +```bash +winml perf -m convnext/model_compiled.onnx --ep qnn --iterations 100 +``` + +**Benchmark on CPU** for comparison: + +```bash +winml perf -m convnext/model_opt.onnx --ep cpu --iterations 100 +``` + +Compare the two numbers to see the performance difference between NPU and CPU inference. + +### Build with Config + Build + +Same model, different approach. Instead of running each command manually, use the config-driven pipeline. Think of it like CMake: `config` generates a build plan, `build` executes it. + +**Generate the build config:** + +```bash +winml config -m facebook/convnext-base-224 -o convnext_config.json +``` + +This creates a JSON file containing all settings for every pipeline step — task, I/O shapes, optimization flags, quantization parameters — all auto-detected from the model. 
+ +**Build the model:** + +```bash +winml build -c convnext_config.json -m facebook/convnext-base-224 -o convnext_build/ +``` + +This orchestrates the full pipeline — export, analyze, optimize, quantize, compile — all in one go. Same result as the manual steps above, but in two commands. + +**Benchmark the result:** + +```bash +winml perf -m convnext_build/model.onnx --ep qnn --iterations 100 +``` + +The config file is the single source of truth for your build. Version-control it, share it with teammates, edit it to override settings, and replay builds deterministically on any machine. + +### Benchmark in One Command + +The simplest way to evaluate a model — one command, zero setup: + +```bash +winml perf -m facebook/convnext-base-224 --device npu --monitor +``` + +ModelKit handles everything behind the scenes: download the model from Hugging Face, export to ONNX, optimize the graph, and run the benchmark on your NPU. The `--monitor` flag enables live hardware monitoring — real-time CPU utilization, RAM usage, and NPU activity alongside the latency results. + +This is ideal for quick smoke tests: does the model run on this device, and how fast is it? + +--- + +## :arrows_counterclockwise: The BYOM Workflow + +The **Build Your Own Model** (BYOM) workflow is the philosophy behind ModelKit. It defines how a source model becomes a production-ready, device-optimized artifact. + +### The Pipeline + +``` +Source Model --> Export --> Analyze --> Optimize --> Quantize --> Compile --> Benchmark +``` + +![BYOM Workflow](docs/assets/workflow-only.svg) + +Each arrow is a ModelKit command. You can enter the pipeline at any stage (for example, start with a local ONNX file and skip export), exit early (stop after optimization if you do not need quantization), or loop back to repeat a stage with different settings. + +### Primitive Commands vs. 
Config-Driven Pipeline + +| | **Primitive Commands** | **Config-Driven Pipeline** | +|:--|:--|:--| +| **Steps** | One command **per stage** | Two steps: **config** + **build** | +| **Control** | Start from any stage; try different settings to fix errors or tweak performance | Repeatable, tweakable, version-controllable | +| **Best for** | **Flexible** workflow | Production-ready **delivery** | +| **When to use** | Exploring, debugging, prototyping | CI/CD, batch builds, team workflows | +| **Lifecycle** | "Coding" phase | Polish | + +--- + +## :clipboard: Built-in Models + +Run `winml hub` to browse the full catalog interactively. + +
+Click to expand the full model catalog + +| Model ID | Task | Architecture | +|:---------|:-----|:-------------| +| `microsoft/resnet-50` | image-classification | ResNet | +| `google/vit-base-patch16-224` | image-classification | ViT | +| `microsoft/swin-large-patch4-window7-224` | image-classification | Swin | +| `facebook/convnext-tiny-224` | image-classification | ConvNeXT | +| `rizvandwiki/gender-classification` | image-classification | ViT | +| `ProsusAI/finbert` | text-classification | BERT | +| `Intel/bert-base-uncased-mrpc` | text-classification | BERT | +| `cardiffnlp/twitter-roberta-base-sentiment-latest` | text-classification | RoBERTa | +| `dslim/bert-base-NER` | token-classification | BERT | +| `dbmdz/bert-large-cased-finetuned-conll03-english` | token-classification | BERT | +| `Babelscape/wikineural-multilingual-ner` | token-classification | BERT | +| `w11wo/indonesian-roberta-base-posp-tagger` | token-classification | RoBERTa | +| `microsoft/table-transformer-detection` | object-detection | Table Transformer | +| `mattmdjaga/segformer_b2_clothes` | image-segmentation | SegFormer | +| `nvidia/segformer-b1-finetuned-ade-512-512` | image-segmentation | SegFormer | +| `nvidia/segformer-b2-finetuned-ade-512-512` | image-segmentation | SegFormer | +| `nvidia/segformer-b5-finetuned-ade-640-640` | image-segmentation | SegFormer | + +
+ +These models are verified against ModelKit's full pipeline and serve as reliable starting points. You are not limited to this list — any Hugging Face model that passes `winml inspect` is a valid input. + +For models not in this table, run `winml inspect -m <model-id>` to verify support before proceeding. + +--- + +## :warning: Scope & Limitations + +### What ModelKit supports + +ModelKit targets **classic deep learning models** — CNNs, encoders, vision transformers, NLP classifiers, token classifiers, object detection models, and segmentation models. + +Supported tasks include: +- Image classification (ResNet, ViT, Swin, ConvNeXT) +- Text classification (BERT, RoBERTa) +- Token classification / NER (BERT, RoBERTa) +- Object detection (Table Transformer) +- Image segmentation (SegFormer) + +### What ModelKit does not support + +**LLMs and generative models are not in scope.** Do not use ModelKit with GPT, LLaMA, Phi, Mistral, Stable Diffusion, or any model with a decoder-only or sequence-to-sequence generative architecture. LLM support (with LoRA) is planned for Q3-Q4 2026. + +### Known constraints + +- `winml compile` requires an NPU device. If no NPU is available, skip the compile step and use `--device auto` for benchmarking. +- Some models may export successfully but fail during optimization or quantization due to unsupported operator patterns. The analyzer will flag these issues. +- Performance numbers vary by device, driver version, and EP version. Always benchmark on your target hardware. + +--- + +## :world_map: Roadmap + +| Milestone | Target | Highlights | +|:----------|:-------|:-----------| +| 🟡 **Kickoff** | Q4 2025 | Internal prototype, core primitive commands | +| 🟢 **Early Access** | Q1 2026 | First external testers, config + build pipeline, hub catalog | +| 🔵 **Public Beta** | Q2 2026 | Open source, agent skills, AI Toolkit integration | +| 🟣 **RC** | Q3-Q4 2026 | **LLM support** (with LoRA), broader device coverage, MLIR | + +
+Click to expand roadmap details + +**Q4 2025 — Kickoff** +- Primitive commands: `inspect`, `export`, `optimize`, `quantize`, `compile` +- QNN, OpenVINO, and VitisAI execution provider support +- Internal validation with ResNet, BERT, ViT, SegFormer families + +**Q1 2026 — Early Access** +- Pipeline commands: `config`, `build`, `perf`, `eval` +- Analyzer with auto-configuration loop +- Built-in model catalog (`winml hub`) +- Live hardware monitoring (`--monitor`) + +**Q2 2026 — Public Beta** +- Open source release +- Agent-ready skills for coding assistants (Claude Code, Cursor, Copilot) +- AI Toolkit for VS Code integration + +**Q3-Q4 2026 — Release Candidate** +- LLM support (decoder-only architectures with LoRA adapters) +- TensorRT, MIGraphX, and DirectML execution providers +- MLIR-based optimization backend +- Public SDK and framework APIs + +
+ +--- + +## :handshake: Contributions and Feedback We welcome contributions! Please see the [contribution guidelines](CONTRIBUTING.md). For feature requests or bug reports, please file a [GitHub Issue](https://github.com/microsoft/ModelKit/issues). +--- -## Code of Conduct +## :balance_scale: Code of Conduct See [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md). -## License +--- + +## :page_facing_up: License This project is licensed under the [MIT License](LICENSE.txt). +--- + ## Trademarks This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft diff --git a/docs/assets/workflow-only.svg b/docs/assets/workflow-only.svg new file mode 100644 index 000000000..c7ce33600 --- /dev/null +++ b/docs/assets/workflow-only.svg @@ -0,0 +1,608 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + diff --git a/docs/design/build/console_mockup.py b/docs/design/build/console_mockup.py new file mode 100644 index 000000000..0080d6d67 --- /dev/null +++ b/docs/design/build/console_mockup.py @@ -0,0 +1,715 @@ +# ruff: noqa +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Mock v5: Cascading Live — each stage is its own Live region. + +Run: uv run python temp/mock_build_output_v5.py [--2stage] [--3stage] [--onnx] [--qdq] [--reuse] [--error] [--all] + +Design v5: +- Cascading Live: each stage gets its own Live area +- When stage completes, Live stops → final text printed as static +- Next stage starts a new Live +- Model source detection: HuggingFace / ONNX / Local +- Two sections: 🔧 Setup — {source} + 🎯 Stages +- Artifact always last in each stage +""" + +from __future__ import annotations + +import sys +import time + +from rich.console import Console, Group, RenderableType +from rich.live import Live +from rich.text import Text + + +console = Console(stderr=True) + +HEAVY_SEP = "\u2550" * 60 +LIGHT_SEP = "\u2500" * 60 +MAX_BAR_WIDTH = 36 + +ICON_RUNNING = "\u23f3" +ICON_DONE = "\u2705" +ICON_ERROR = "\u274c" + + +def _fmt_size(mb: float) -> str: + return f"{mb / 1000:.1f} GB" if mb >= 1000 else f"{mb:.1f} MB" + + +def _build_bar(s: int, p: int, u: int) -> Text: + total = s + p + u + if total == 0: + return Text() + bar = Text() + s_w = max(1, round(s / total * MAX_BAR_WIDTH)) if s else 0 + p_w = max(1, round(p / total * MAX_BAR_WIDTH)) if p else 0 + u_w = max(1, round(u / total * MAX_BAR_WIDTH)) if u else 0 + used = s_w + p_w + u_w + if used > MAX_BAR_WIDTH: + s_w = max(1, s_w - (used - MAX_BAR_WIDTH)) + bar.append("\u2588" * s_w, style="green") + if p_w: + bar.append("\u2588" * p_w, style="yellow") + if u_w: + bar.append("\u2588" * u_w, style="red") + 
return bar + + +def _spu_text(s: int, p: int, u: int) -> Text: + t = Text() + t.append(str(s), style="bold green") + t.append("/", style="dim") + t.append(str(p), style="bold yellow" if p > 0 else "dim") + t.append("/", style="dim") + t.append(str(u), style="bold red" if u > 0 else "dim") + return t + + +# ══════════════════════════════════════════════════════════════════════════ +# STAGE LIVE — a single stage's animated region +# ══════════════════════════════════════════════════════════════════════════ + + +class StageLive: + """Live region for a single build stage. + + Usage: + with StageLive("export") as sl: + sl.set_status("Exporting to ONNX...") + ... + sl.set_done(12.3) + sl.detail("Task: fill-mask") + sl.artifact("output/export.onnx", 438.2) + After the `with` block exits, the final content is printed as static text. + """ + + def __init__(self, name: str) -> None: + self._name = name + self._lines: list[RenderableType] = [] + self._live: Live | None = None + self._status_line_idx: int = 0 + + def __enter__(self) -> StageLive: + # Start with a running status line + self._lines = [self._make_running_line()] + self._status_line_idx = 0 + self._live = Live( + self._render(), + console=console, + refresh_per_second=15, + transient=False, # Keep final frame — avoids flicker between stages + ) + self._live.start() + return self + + def __exit__(self, *_: object) -> None: + if self._live: + # Final update to ensure last state is rendered + self._live.update(self._render()) + self._live.stop() + self._live = None + # No re-print needed — Live's final frame stays on screen + + def _render(self) -> Group: + return Group(*self._lines) + + def _update(self) -> None: + if self._live: + self._live.update(self._render()) + + def _make_running_line(self, detail: str = "") -> Text: + line = Text() + line.append(f"{ICON_RUNNING} ") + line.append(self._name.capitalize(), style="bold yellow") + if detail: + line.append(f" {detail}", style="dim") + return line + + def 
_make_done_line(self, elapsed: float) -> Text: + line = Text() + line.append(f"{ICON_DONE} ") + line.append(f"{self._name.capitalize():<48}", style="green") + line.append(f"{elapsed:.1f}s", style="green") + return line + + def _make_error_line(self, error: str = "") -> Text: + line = Text() + line.append(f"{ICON_ERROR} ") + line.append(self._name.capitalize(), style="bold red") + if error: + line.append(f" {error}", style="red") + return line + + # ── Public API ──────────────────────────────────────────────── + + def set_status(self, detail: str) -> None: + """Update the running status text.""" + self._lines[self._status_line_idx] = self._make_running_line(detail) + self._update() + + def set_done(self, elapsed: float) -> None: + """Mark stage as done — replaces the status line.""" + self._lines[self._status_line_idx] = self._make_done_line(elapsed) + self._update() + + def set_error(self, error: str = "") -> None: + """Mark stage as failed.""" + self._lines[self._status_line_idx] = self._make_error_line(error) + self._update() + + def detail(self, markup: str) -> None: + """Add a detail line (indented under stage).""" + self._lines.append(Text.from_markup(f" {markup}")) + self._update() + + def kv(self, label: str, value: str) -> None: + """Add a key-value detail line with aligned columns.""" + self._lines.append(Text.from_markup(f" {label:<14}{value}")) + self._update() + + def artifact(self, path: str, size_mb: float) -> None: + """Add artifact line (always last).""" + label = "\U0001f4e6 Artifact:" + self._lines.append( + Text.from_markup(f" {label:<14}[dim]{path}[/dim] ({_fmt_size(size_mb)})") + ) + self._update() + + def blank(self) -> None: + self._lines.append(Text("")) + self._update() + + def ep_bar_add(self, ep_name: str) -> int: + """Add a placeholder EP bar line, return its index.""" + idx = len(self._lines) + self._lines.append(Text(f" - {ep_name:<24}...", style="dim")) + self._update() + return idx + + def ep_bar_update(self, idx: int, ep_name: str, 
s: int, p: int, u: int) -> None: + """Update an EP bar line by index.""" + line = Text() + line.append(" - ") + line.append(f"{ep_name:<24}", style="cyan") + line.append_text(_spu_text(s, p, u)) + line.append(" ") + line.append_text(_build_bar(s, p, u)) + self._lines[idx] = line + self._update() + + def io_input(self, name: str, shape: str, dtype: str, first: bool = True) -> None: + label = "Input: " if first else " " + self._lines.append( + Text.from_markup(f" {label}[cyan]{name:<18}[/cyan] {shape:<14} [dim]{dtype}[/dim]") + ) + self._update() + + def io_output(self, name: str, shape: str, dtype: str, first: bool = True) -> None: + label = "Output: " if first else " " + self._lines.append( + Text.from_markup(f" {label}[cyan]{name:<18}[/cyan] {shape:<14} [dim]{dtype}[/dim]") + ) + self._update() + + +# ══════════════════════════════════════════════════════════════════════════ +# STATIC STAGE (for skipped stages — no animation needed) +# ══════════════════════════════════════════════════════════════════════════ + + +def print_stage_skip(name: str, reason: str = "") -> None: + """Print a skipped stage as static text.""" + line = Text() + line.append("\u23f8\ufe0f ") + line.append(name.capitalize(), style="dim") + if reason: + line.append(f" {reason}", style="dim italic") + console.print(line) + console.print() + + +# ══════════════════════════════════════════════════════════════════════════ +# HEADER / FOOTER +# ══════════════════════════════════════════════════════════════════════════ + + +def print_setup( + model: str, + config: str, + output: str, + source: str = "HuggingFace", +) -> None: + console.print() + console.print(HEAVY_SEP) + console.print(f"[bold]\U0001f527 Setup \u2014 {source}[/bold]") + console.print(HEAVY_SEP) + console.print(f" \U0001f4e6 [bold]{'Model:':<10}[/bold] [cyan]{model}[/cyan]") + console.print(f" \U0001f4c1 [bold]{'Config:':<10}[/bold] [cyan]{config}[/cyan]") + console.print(f" \U0001f4c2 [bold]{'Output:':<10}[/bold] 
[cyan]{output}[/cyan]") + console.print() + + +def print_stages_header() -> None: + console.print(HEAVY_SEP) + console.print("[bold]\U0001f3af Stages[/bold]") + console.print(HEAVY_SEP) + + +def print_final( + elapsed: float, + artifact: str, + stage_timings: list[tuple[str, float | None]] | None = None, +) -> None: + """Print final summary section with stage timing breakdown. + + stage_timings: list of (stage_name, elapsed_seconds | None for skipped) + """ + console.print() + console.print(HEAVY_SEP) + console.print("[bold]\U0001f4ca Summary[/bold]") + console.print(HEAVY_SEP) + console.print(f"{ICON_DONE} [bold green]Build complete in {elapsed:.1f}s[/bold green]") + if stage_timings: + for name, t in stage_timings: + if t is not None: + console.print(f" {name:<12} [green]{t:.1f}s[/green]") + else: + console.print(f" {name:<12} [dim]skipped[/dim]") + console.print(f"\U0001f4e6 Final artifact: [bold]{artifact}[/bold]") + console.print() + + +# ══════════════════════════════════════════════════════════════════════════ +# ANIMATE HELPER +# ══════════════════════════════════════════════════════════════════════════ + + +def animate_ep(sl: StageLive, ep_name: str, s: int, p: int) -> None: + """Animate a single EP bar from 0 to final counts.""" + idx = sl.ep_bar_add(ep_name) + steps = 15 + for i in range(1, steps + 1): + frac = i / steps + cur_s = min(int(s * frac), s) + cur_p = min(int(p * frac), p) + sl.ep_bar_update(idx, ep_name, cur_s, cur_p, 0) + time.sleep(0.15) # Realistic: each node check takes ~0.1-0.2s + sl.ep_bar_update(idx, ep_name, s, p, 0) + + +# ══════════════════════════════════════════════════════════════════════════ +# SCENARIOS +# ══════════════════════════════════════════════════════════════════════════ + + +def demo_full_4stage() -> None: + """Full: export → optimize → quantize → compile (HuggingFace).""" + print_setup( + model="bert-base-uncased [dim](pretrained)[/dim]", + config="config.json", + output="output/", + source="HuggingFace", + ) + 
print_stages_header() + + # ── Export ──────────────────────────────────────────────────── + with StageLive("export") as sl: + sl.set_status("Exporting to ONNX...") + # Meta info known before export (from loader/config resolution) + sl.kv("Model class:", "[cyan]BertForMaskedLM[/cyan] [dim](auto-detected)[/dim]") + sl.kv("Task:", "[cyan]fill-mask[/cyan] [dim](auto-detected)[/dim]") + sl.io_input("input_ids", "[1, 128]", "int64") + sl.io_input("attention_mask", "[1, 128]", "int64", first=False) + sl.io_input("token_type_ids", "[1, 128]", "int64", first=False) + sl.io_output("logits", "[1, 30522]", "float32") + time.sleep(10.0) # Realistic: export takes ~10s + sl.set_done(12.3) + sl.artifact("output/export.onnx", 438.2) + sl.blank() + + # ── Optimize ────────────────────────────────────────────────── + with StageLive("optimize") as sl: + sl.set_status("Optimizing ONNX graph...") + time.sleep(2.0) # Realistic: initial optimize ~2s + + # Autoconf iter 1 + sl.detail("[bold]Analyzing[/bold] [dim](iter 1/3)[/dim]") + animate_ep(sl, "QnnExecutionProvider", 325, 15) + animate_ep(sl, "OpenVINOProvider", 340, 0) + + sl.detail("[bold]Patterns[/bold]") + sl.detail(" [yellow]Gelu[/yellow] [dim]\u2192 disable_gelu_fusion[/dim]") + time.sleep(0.5) + + sl.detail("[bold]Optimizing[/bold] [dim](applying autoconf)[/dim]") + sl.detail(" [dim]{disable_gelu_fusion: true}[/dim]") + time.sleep(2.0) # Realistic: re-optimize ~2s + + # Autoconf iter 2 + sl.detail("[bold]Analyzing[/bold] [dim](iter 2/3)[/dim]") + animate_ep(sl, "QnnExecutionProvider", 340, 0) + animate_ep(sl, "OpenVINOProvider", 340, 0) + + sl.detail("[dim]Autoconf converged after 2 iteration(s)[/dim]") + sl.set_done(3.1) + sl.artifact("output/optimized.onnx", 412.5) + sl.blank() + time.sleep(0.2) + + # ── Quantize ────────────────────────────────────────────────── + with StageLive("quantize") as sl: + sl.set_status("Quantizing (uint8)...") + sl.kv("Dataset:", "[cyan]timm/imagenet-1k-wds[/cyan] [dim](test)[/dim]") + 
sl.kv("Calibration:", "[cyan]10[/cyan] samples [dim](minmax)[/dim]") + time.sleep(8.0) # Realistic: quantize ~8s + sl.set_done(8.7) + sl.kv("Precision:", "[cyan]uint8/uint8[/cyan] [dim](weight/activation)[/dim]") + sl.artifact("output/quantized.onnx", 112.8) + sl.blank() + + # ── Compile ─────────────────────────────────────────────────── + with StageLive("compile") as sl: + sl.set_status("Compiling for QNN...") + time.sleep(3.0) # Realistic: compile ~3s + sl.set_done(2.1) + sl.detail( + "[bold]Graph:[/bold] [cyan]EPContext[/cyan] (1), " + "[cyan]Conv[/cyan] (8), " + "[cyan]MatMul[/cyan] (12), " + "[cyan]Add[/cyan] (15), " + "[cyan]Relu[/cyan] (8)" + ) + sl.artifact("output/compiled.onnx", 112.8) + time.sleep(0.1) + + print_final( + 26.2, + "output/model.onnx", + stage_timings=[ + ("Export", 12.3), + ("Optimize", 3.1), + ("Quantize", 8.7), + ("Compile", 2.1), + ], + ) + + +def demo_2stage() -> None: + """Export + optimize only (HuggingFace).""" + print_setup( + model="bert-base-uncased [dim](pretrained)[/dim]", + config="config_portable.json", + output="output/", + source="HuggingFace", + ) + print_stages_header() + + with StageLive("export") as sl: + sl.set_status("Exporting to ONNX...") + sl.kv("Model class:", "[cyan]BertForMaskedLM[/cyan] [dim](auto-detected)[/dim]") + sl.kv("Task:", "[cyan]fill-mask[/cyan] [dim](auto-detected)[/dim]") + sl.io_input("input_ids", "[1, 128]", "int64") + sl.io_input("attention_mask", "[1, 128]", "int64", first=False) + sl.io_input("token_type_ids", "[1, 128]", "int64", first=False) + sl.io_output("logits", "[1, 30522]", "float32") + time.sleep(5.0) + sl.set_done(12.3) + sl.artifact("output/export.onnx", 438.2) + sl.blank() + + with StageLive("optimize") as sl: + sl.set_status("Optimizing...") + time.sleep(1.5) + sl.detail("[bold]Analyzing[/bold] [dim](iter 1/3)[/dim]") + animate_ep(sl, "QnnExecutionProvider", 325, 15) + sl.detail("[dim]Autoconf converged after 1 iteration(s)[/dim]") + sl.set_done(3.1) + 
def demo_3stage() -> None:
    """Scenario: export, optimize and quantize a HuggingFace model; no compile stage."""
    print_setup(
        model="microsoft/resnet-50 [dim](pretrained)[/dim]",
        config="config_npu_noc.json",
        output="output/",
        source="HuggingFace",
    )
    print_stages_header()

    # Reported per-stage durations; collected here for the final summary.
    timings: list[tuple[str, float | None]] = []

    # Export: metadata and I/O rows are shown before the simulated work.
    with StageLive("export") as export_stage:
        export_stage.set_status("Exporting to ONNX...")
        for key, value in (
            (
                "Model class:",
                "[cyan]ResNetForImageClassification[/cyan] [dim](auto-detected)[/dim]",
            ),
            ("Task:", "[cyan]image-classification[/cyan] [dim](auto-detected)[/dim]"),
        ):
            export_stage.kv(key, value)
        export_stage.io_input("pixel_values", "[1, 3, 224, 224]", "float32")
        export_stage.io_output("logits", "[1, 1000]", "float32")
        time.sleep(4.0)
        export_stage.set_done(5.1)
        export_stage.artifact("output/export.onnx", 97.3)
        export_stage.blank()
    timings.append(("Export", 5.1))

    # Optimize: a single analysis pass with one animated EP bar.
    with StageLive("optimize") as optimize_stage:
        optimize_stage.set_status("Optimizing...")
        time.sleep(1.5)
        optimize_stage.detail("[bold]Analyzing[/bold] [dim](iter 1/3)[/dim]")
        animate_ep(optimize_stage, "QnnExecutionProvider", 127, 5)
        optimize_stage.detail("[dim]Autoconf converged after 1 iteration(s)[/dim]")
        optimize_stage.set_done(1.8)
        optimize_stage.artifact("output/optimized.onnx", 89.4)
        optimize_stage.blank()
    timings.append(("Optimize", 1.8))

    # Quantize: last stage, so no trailing blank line is emitted.
    with StageLive("quantize") as quantize_stage:
        quantize_stage.set_status("Quantizing (uint8)...")
        for key, value in (
            ("Dataset:", "[cyan]timm/imagenet-1k-wds[/cyan] [dim](test)[/dim]"),
            ("Calibration:", "[cyan]10[/cyan] samples [dim](minmax)[/dim]"),
        ):
            quantize_stage.kv(key, value)
        time.sleep(4.0)
        quantize_stage.set_done(4.2)
        quantize_stage.kv(
            "Precision:", "[cyan]uint8/uint8[/cyan] [dim](weight/activation)[/dim]"
        )
        quantize_stage.artifact("output/quantized.onnx", 25.1)
    timings.append(("Quantize", 4.2))

    print_final(11.1, "output/model.onnx", stage_timings=timings)
I/O under optimize.""" + print_setup( + model="model.onnx [dim](438.2 MB)[/dim]", + config="config.json", + output="output/", + source="ONNX", + ) + print_stages_header() + + with StageLive("optimize") as sl: + sl.set_status("Optimizing...") + time.sleep(0.3) + sl.detail("[bold]Analyzing[/bold] [dim](iter 1/3)[/dim]") + animate_ep(sl, "QnnExecutionProvider", 340, 0) + sl.detail("[dim]Autoconf converged after 1 iteration(s)[/dim]") + sl.io_input("pixel_values", "[1, 3, 224, 224]", "float32") + sl.io_output("logits", "[1, 1000]", "float32") + sl.set_done(3.1) + sl.artifact("output/model_optimized.onnx", 412.5) + sl.blank() + + with StageLive("quantize") as sl: + sl.set_status("Quantizing (uint8)...") + sl.kv("Dataset:", "[cyan]timm/imagenet-1k-wds[/cyan] [dim](test)[/dim]") + sl.kv("Calibration:", "[cyan]10[/cyan] samples [dim](minmax)[/dim]") + time.sleep(0.5) + sl.set_done(8.7) + sl.kv("Precision:", "[cyan]uint8/uint8[/cyan] [dim](weight/activation)[/dim]") + sl.artifact("output/model_quantized.onnx", 112.8) + sl.blank() + + with StageLive("compile") as sl: + sl.set_status("Compiling for QNN...") + time.sleep(0.4) + sl.set_done(2.1) + sl.detail( + "[bold]Graph:[/bold] [cyan]EPContext[/cyan] (1), " + "[cyan]Conv[/cyan] (8), [cyan]Relu[/cyan] (16), [cyan]Add[/cyan] (8)" + ) + sl.artifact("output/model_compiled.onnx", 112.8) + + print_final( + 13.9, + "output/model.onnx", + stage_timings=[ + ("Optimize", 3.1), + ("Quantize", 8.7), + ("Compile", 2.1), + ], + ) + + +def demo_qdq_skip() -> None: + """Quantize in config but auto-skipped (QDQ detected).""" + print_setup( + model="prequantized-model [dim](pretrained)[/dim]", + config="config.json", + output="output/", + source="HuggingFace", + ) + print_stages_header() + + with StageLive("export") as sl: + sl.set_status("Exporting to ONNX...") + sl.kv("Model class:", "[cyan]BertForMaskedLM[/cyan]") + sl.kv("Task:", "[cyan]fill-mask[/cyan]") + sl.io_input("input_ids", "[1, 128]", "int64") + sl.io_output("logits", "[1, 
def demo_reuse() -> None:
    """Scenario: a previous build's artifact already exists, so no stage runs."""
    print_setup(
        model="bert-base-uncased",
        config="config.json",
        output="output/",
        source="HuggingFace",
    )
    print_stages_header()

    # Notice plus hint, framed by one blank line above and below.
    notice_lines = (
        " \u267b\ufe0f [bold cyan]Existing artifact found:[/bold cyan] output/model.onnx",
        " \U0001f4a1 [dim]Use --rebuild to force rebuild.[/dim]",
    )
    console.print()
    for text in notice_lines:
        console.print(text)
    console.print()
if __name__ == "__main__":
    # Scenario registry: flag -> (help text, banner label used by --all, demo).
    # Dict order is the --all run order and also the precedence when several
    # scenario flags are passed at once (first match wins, as before).
    default_help = "Full 4-stage (HuggingFace)"
    flagged = {
        "--2stage": ("Export + Optimize only", "2-stage: export + optimize", demo_2stage),
        "--3stage": ("Export + Optimize + Quantize", "3-stage: no compile", demo_3stage),
        "--onnx": ("ONNX input", "ONNX input", demo_onnx),
        "--qdq": ("QDQ auto-skip", "QDQ auto-skip", demo_qdq_skip),
        "--reuse": ("Existing artifact", "Existing artifact", demo_reuse),
        "--error": ("Build failure", "Build failure", demo_error),
    }
    known = {"--help", "-h", "--all", *flagged}
    args = set(sys.argv[1:])

    if "--help" in args or "-h" in args:
        console.print("[bold]Usage:[/bold] uv run python temp/mock_build_output_v5.py [OPTIONS]")
        console.print()
        console.print(f" [dim]{'(no flags)':<12}[/dim]{default_help}")
        for flag, (help_text, _label, _fn) in flagged.items():
            console.print(f" {flag:<12}{help_text}")
        console.print(f" {'--all':<12}All scenarios")
        sys.exit(0)

    # A misspelled flag used to fall through silently to the long full demo;
    # fail fast instead. markup=False keeps user-supplied text literal.
    unknown = sorted(args - known)
    if unknown:
        console.print(
            f"Unknown option(s): {', '.join(unknown)}", style="bold red", markup=False
        )
        console.print("Run with --help to list the available scenarios.")
        sys.exit(2)

    if "--all" in args:
        everything = [
            (default_help, demo_full_4stage),
            *((label, fn) for _help, label, fn in flagged.values()),
        ]
        for label, fn in everything:
            console.print()
            console.print(f"[bold yellow]{'=' * 60}[/bold yellow]")
            console.print(f"[bold yellow]\u25b6 Scenario: {label}[/bold yellow]")
            console.print(f"[bold yellow]{'=' * 60}[/bold yellow]")
            fn()
        sys.exit(0)

    # No scenario flag means the full 4-stage demo.
    selected = next(
        (fn for flag, (_help, _label, fn) in flagged.items() if flag in args),
        demo_full_4stage,
    )
    selected()
**Extensible**: New commands inherit shared options; test validator catches omissions +5. **Clean slate**: No backward-compatibility constraints (pre-1.0 internal tool) + +--- + +## Architectural Decision Records + +### ADR-1: Breaking Change Strategy + +**Context**: CLI args grew organically with inconsistencies across 11 subcommands. + +| Option | Description | +|--------|-------------| +| A. Non-breaking | Add missing flags, deprecate old names with warnings | +| B. Breaking + deprecation | Rename flags, one release cycle of warnings | +| **C. Clean slate** | **Redesign freely, no backward compat** | + +**Decision**: C — wmk is internal/pre-1.0, no external consumers depend on exact flag names. + +--- + +### ADR-2: Verbosity Scope + +**Context**: Should `-v`/`-q` be global-only or per-command? + +| Option | Description | +|--------|-------------| +| **A. Global only** | **Root `wmk` level, inherited via `ctx.obj`** | +| B. Both levels, merged | Root + subcommand, values add up | + +**Decision**: A — Pythonic convention (pip, ruff, uv, black all do this). Click's `ctx.obj` was designed for this pattern. No use case for per-command verbosity in wmk. + +**Supersedes**: The PRD (`1_prd.md`) lists `--verbose`/`-v` as a standard subcommand option. This ADR overrides that — verbosity is global only. + +**Migration impact**: All 11 current subcommands define their own `-v`/`--verbose` as a boolean flag. These must all be removed, and each command refactored to read `ctx.obj["verbose"]` instead of a local `verbose` parameter. This is the single largest migration item in this spec. + +--- + +### ADR-3: Output Flag Convention + +**Context**: `-o` means "file" in most commands but "directory" in `build`. + +| Option | Description | +|--------|-------------| +| A. Always `-o`/`--output` | Semantic varies, documented in help | +| **B. Split** | **`-o`/`--output` for files, `--output-dir` for directories** | +| C. 
Always `--output-dir` | Everything writes to directory | + +**Decision**: B — explicit about what you're getting. `compile` produces a single file (with embedded context), so it uses `-o`/`--output`. Only `build` uses `--output-dir`. + +--- + +### ADR-4: Device Input Casing + +**Context**: Mixed casing across commands and internal code. + +| Option | Description | +|--------|-------------| +| A. Lowercase in, uppercase display | User types `npu`, display shows `NPU` | +| **B. Case-insensitive** | **Accept any casing, normalize to uppercase internally** | + +**Decision**: B — Click supports `case_sensitive=False` natively. Most forgiving for users. + +--- + +### ADR-5: Device Choice Set + +**Context**: Different commands had different choice orders and sets. + +**Decision**: Canonical set is `auto | cpu | gpu | npu` (alphabetical after auto). Same on all 8 device-aware commands. + +**Breaking change**: Current commands use `auto|npu|gpu|cpu` order. This is an intentional reordering to follow alphabetical convention. + +**Future**: If new devices are added (dsp, fpga), they join the canonical set in `_options.py` — one place to update. + +--- + +### ADR-6: `--device` and `--ep` Scope + +**Context**: Inconsistent command coverage for device/EP options. + +**Decision**: Same scope — both present on: build, compile, config, perf, eval, analyze, optimize, quantize. Absent from: export, inspect, sys. + +**Rationale**: If a command targets a device, it might also need to override the EP. + +--- + +### ADR-7: Model Option Scope + +**Context**: `-m`/`--model` is the backbone of wmk — nearly every command needs a model. + +**Decision**: Present on all commands except `sys`. Always `-m`/`--model`. + +`required=True` by default. Commands with info-only modes (e.g., `--list`, `--list-tasks`, `--list-capabilities`) set `required=False` and handle validation internally — these modes don't need a model. 
+ +| Command | Required | Exception | +|---------|----------|-----------| +| export | Yes | — | +| quantize | Yes | — | +| build | Yes | — | +| optimize | No | `--list-capabilities` needs no model | +| compile | No | `--list` needs no model | +| config | No | `--model-type` can work without model | +| inspect | No | `--list-tasks` needs no model | +| perf | No | Can read model from config | +| eval | No | Can read model from config | +| analyze | No | Can read model from config | + +--- + +### ADR-8: Precision as String (not Choice) + +**Context**: Precision values include simple (`fp32`, `int8`) and compound (`w8a16`, `w4a16`). The set grows as backends evolve. + +| Option | Description | +|--------|-------------| +| A. Fixed Choice | Hard-coded set, rejects unknowns | +| **B. String + validator** | **Accept any string, validate against known set, warn on unknown** | + +**Decision**: B — extensible. A callback validator checks against known values and warns (not errors) on unknown, so new precision formats don't require CLI code changes. + +--- + +### ADR-9: Implementation Approach + +**Context**: How to standardize options across commands. + +| Option | Description | +|--------|-------------| +| A. Shared decorators | `_options.py` with reusable Click decorators | +| B. Class-based commands | `WmkCommand`, `DeviceAwareCommand` classes | +| **C. Shared decorators + test validator** | **Decorators for DRY + test suite for enforcement** | + +**Decision**: C — simplicity of decorators, enforcement through tests. Fits project's existing patterns (Click decorators + strong test discipline). Test validator introspects the Click command tree at runtime. + +--- + +### ADR-10: Negation Convention + +**Context**: Mix of `--no-X` single flags and `--X/--no-X` pairs. + +| Option | Description | +|--------|-------------| +| **A. `--no-X` only** | **Single negative flag, default is "do it"** | +| B. `--X/--no-X` pairs | Click flag pairs | + +**Decision**: A — simpler. 
Default behavior is always "enabled." Users opt out with `--no-quant`, `--no-compile`, etc. + +**Note**: This supersedes `compile`'s existing `--quantize/--no-quantize` and `--validate/--no-validate` pairs. Under the new convention, compile uses `--no-quant` and `--no-validate` (single negative flags). + +--- + +## 1. Global Options + +These live on the root `wmk` group only. Subcommands inherit them via `ctx.obj` — they never define their own `-v` or `-q`. + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--verbose` | `-v` | count | 0 | Increase verbosity (-v=INFO, -vv=DEBUG) | +| `--quiet` | `-q` | count | 0 | Decrease verbosity (-q=less, -qq=summary only) | +| `--debug` | — | flag | False | Alias for -vv (hidden) | +| `--version` | — | — | — | Show version and exit | +| `--help` | `-h` | — | — | Show help and exit | + +### Rules + +- `-v` and `-q` are mutually exclusive (error if both given) +- `--debug` sets `verbose=2` and is hidden from help +- Subcommands access via `ctx.obj["verbose"]` and `ctx.obj["quiet"]` +- No subcommand may define its own `-v`, `-q`, `--verbose`, or `--quiet` + +### Verbosity Matrix + +| Level | Quiet | Verbose | Behavior | +|-------|-------|---------|----------| +| Normal | 0 | 0 | Default output — stage progress, results | +| Quiet | 1 | 0 | Hide detail lines, keep stage status | +| Silent | 2 | 0 | Summary only — no stage output | +| Verbose | 0 | 1 | Step-by-step, INFO logging | +| Debug | 0 | 2 | Full DEBUG logging | + +### Context Object Contract + +Root `wmk` group stores in `ctx.obj`: + +```python +ctx.obj = { + "verbose": int, # 0, 1, or 2 + "quiet": int, # 0, 1, or 2 + "debug": bool, # True if --debug or -vv +} +``` + +Subcommands read — never write — these values. + +--- + +## 2. Shared Options + +Reusable Click option decorators defined in `modelkit/commands/_options.py`. Each command imports and composes the decorators it needs. + +### 2a. 
Model Option (`-m`/`--model`) + +| Flag | Short | Type | Default Required | Description | +|------|-------|------|------------------|-------------| +| `--model` | `-m` | string | Yes (see ADR-7 for exceptions) | HuggingFace model ID, local path, or .onnx file | + +**Scope**: All commands except `sys`. See ADR-7 for the per-command required/optional matrix — commands with info-only modes (e.g., `--list`, `--list-tasks`) set `required=False`. + +### 2b. Device & EP Options + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--device` | `-d` | Choice(`auto\|cpu\|gpu\|npu`, case_sensitive=False) | `auto` | Target device | +| `--ep` | — | string | None | Force specific execution provider (overrides `--device`) | + +**Scope**: build, compile, config, perf, eval, analyze, optimize, quantize. + +- Device choices are case-insensitive; normalized to uppercase internally. +- Default is `auto` for all commands. +- `--ep` has no short form — it's an advanced override. + +### 2c. Output Options + +| Flag | Short | Type | Description | +|------|-------|------|-------------| +| `--output` | `-o` | Path | Output file (single artifact) | +| `--output-dir` | — | Path | Output directory (multi-artifact) | + +**Scope by command:** + +| Command | Flag | Reason | +|---------|------|--------| +| export | `-o`/`--output` | Single .onnx | +| config | `-o`/`--output` | Single .json | +| perf | `-o`/`--output` | Single .json | +| eval | `-o`/`--output` | Single .json | +| quantize | `-o`/`--output` | Single .onnx | +| optimize | `-o`/`--output` | Single .onnx | +| analyze | `-o`/`--output` | Single .json | +| compile | `-o`/`--output` | Single .onnx (embedded context) | +| build | `--output-dir` | Directory of artifacts | +| inspect | — | Stdout only | +| sys | — | Stdout only | + +### 2d. 
Task Option + +| Flag | Short | Type | Description | +|------|-------|------|-------------| +| `--task` | `-t` | string | Override auto-detected task (e.g., image-classification) | + +**Scope**: export, config, inspect, perf, eval, quantize, analyze. + +**Not on**: build (reads from config), compile (task-agnostic), optimize (task-agnostic), sys. + +### 2e. Precision Option + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--precision` | `-p` | string | `auto` | Target precision | + +**Scope**: config, perf, quantize. + +**Validation**: Callback validates against known values. Known set today: + +- Simple: `auto`, `fp32`, `fp16`, `int8`, `int16` +- Compound: `w8a8`, `w8a16`, `w4a16` + +Unknown values produce a warning (not error) for forward-compatibility. + +--- + +## 3. Command-Specific Options + +Beyond shared options, each command has its own specialized flags. Shared options (Section 2) are implied and not repeated here. + +### build + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--config` | `-c` | Path | — | WinMLBuildConfig JSON file | +| `--output-dir` | — | Path | — | Output directory for artifacts | +| `--rebuild` | — | flag | False | Overwrite existing artifacts | +| `--no-quant` | — | flag | False | Skip quantization stage | +| `--no-compile` | — | flag | False | Skip compilation stage | +| `--no-analyze` | — | flag | False | Skip analyzer loop | +| `--max-analyze-iterations` | — | int | 3 | Max analyzer iterations | +| `--use-cache` | — | flag | False | Use global cache (~/.cache/winml/) | + +### compile + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--compiler` | — | Choice(`ort\|qairt`) | `ort` | Compiler backend | +| `--embed` | — | flag | False | Embed EP context in ONNX file | +| `--no-quant` | — | flag | False | Skip quantization before compilation | +| `--no-validate` | — | flag | 
False | Skip compiled model validation | +| `--qnn-sdk-root` | — | Path | None | Path to QAIRT SDK root | +| `--list` | `-l` | flag | False | List available compilers and exit | + +**Migration note**: Replaces the old `--quantize/--no-quantize` and `--validate/--no-validate` flag pairs (ADR-10). + +### config + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--config` | `-c` | Path | — | JSON config with overrides | +| `--model-class` | — | string | None | Override auto-detected model class | +| `--model-type` | — | string | None | Override auto-detected model type | +| `--module` | — | string | None | Submodule class name filter | +| `--shape-config` | — | Path | None | JSON with shape overrides | +| `--library` | — | string | `transformers` | Source library | +| `--no-quant` | — | flag | False | Exclude quantization from config | +| `--no-compile` | — | flag | False | Exclude compilation from config | +| `--trust-remote-code` | — | flag | False | Allow custom code from model repo | + +### export + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--with-report` | — | flag | False | Generate export reports (md + json) | +| `--no-hierarchy` | — | flag | False | Skip hierarchy_tag metadata | +| `--dynamo` | — | flag | False | Enable dynamo export | +| `--torch-module` | — | string | None | torch.nn modules to include (comma-sep) | +| `--input-specs` | — | Path | None | JSON input specifications | +| `--export-config` | — | Path | None | ONNX export config JSON | + +### inspect + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--format` | `-f` | Choice(`table\|json`) | `table` | Output format | +| `--hierarchy` | `-H` | flag | False | Show HF module hierarchy | +| `--list-tasks` | — | flag | False | List all known tasks and exit | +| `--model-type` | — | string | None | Override model type | +| `--model-class` 
| — | string | None | Override model class | + +### perf + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--iterations` | `-n` | int | 100 | Benchmark iterations | +| `--warmup` | — | int | 10 | Warmup iterations | +| `--batch-size` | — | int | 1 | Batch size | +| `--no-quant` | — | flag | False | Skip quantization during model build | +| `--module` | — | string | None | Per-module benchmarking | +| `--monitor` | — | flag | False | Live NPU utilization chart | +| `--op-tracing` | — | Choice(`basic\|detail`) | None | Operator-level profiling | + +Also inherits shared options: `-m`, `-d`/`--device`, `--ep`, `-o`/`--output`, `-t`/`--task`, `-p`/`--precision` (see Section 2). + +### quantize + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--samples` | `-n` | int | 10 | Calibration samples | +| `--method` | — | Choice(`minmax\|entropy\|percentile`) | `minmax` | Calibration method | +| `--weight-type` | — | string | None | Weight quantization type | +| `--activation-type` | — | string | None | Activation quantization type | +| `--per-channel` | — | flag | False | Per-channel quantization | +| `--symmetric` | — | flag | False | Symmetric quantization | + +### optimize + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--config` | `-c` | Path | None | Config file (YAML/JSON) | +| `--list-capabilities` | `-l` | flag | False | List capabilities and exit | +| *dynamic* | — | — | — | Auto-generated `--enable-X`/`--disable-X` from capability registry | + +### analyze + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--format` | `-f` | Choice(`table\|json`) | `table` | Output format | +| `--information` | — | flag | True | Show compatibility info | +| `--htp-metadata` | — | Path | None | HTP metadata for advanced analysis | +| `--no-run-unknown-op` | — | flag | 
False | Skip unknown op runtime checking | +| `--optim-config` | — | Path | None | Optimization config for analysis | + +**Note on `--format` choices**: inspect and analyze use `table|json` (they render Rich tables). sys uses `text|json|compact` (plain text output). The choice sets intentionally differ to match each command's output style. + +### eval + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--dataset` | — | string | None | HF dataset path | +| `--dataset-name` | — | string | None | Dataset config name | +| `--samples` | `-n` | int | 100 | Dataset samples | +| `--split` | — | string | `validation` | Dataset split | +| `--shuffle` | — | flag | False | Shuffle before sampling | +| `--streaming` | — | flag | False | Stream dataset | +| `--column` | — | string (multiple) | — | Column mapping key=value | +| `--label-mapping` | — | string | None | Label mapping JSON | +| `--model-id` | — | string | None | HF model ID when -m is .onnx | + +### sys + +| Flag | Short | Type | Default | Description | +|------|-------|------|---------|-------------| +| `--format` | `-f` | Choice(`text\|json\|compact`) | `text` | Output format | +| `--list-device` | — | flag | False | List available devices | +| `--list-ep` | — | flag | False | List available EPs | + +--- + +## 4. Short Flag Registry + +One source of truth to prevent collisions across the entire CLI. 
+ +| Short | Long | Scope | Notes | +|-------|------|-------|-------| +| `-h` | `--help` | global | All commands (Click built-in) | +| `-v` | `--verbose` | global | Count flag | +| `-q` | `--quiet` | global | Count flag | +| `-m` | `--model` | shared | All except sys | +| `-d` | `--device` | shared | 8 commands | +| `-o` | `--output` | shared | File output commands | +| `-t` | `--task` | shared | 7 commands | +| `-p` | `--precision` | shared | config, perf, quantize | +| `-c` | `--config` | command | build, config, optimize | +| `-f` | `--format` | command | inspect, analyze, sys | +| `-n` | `--iterations`/`--samples` | command | perf (`--iterations`), eval (`--samples`), quantize (`--samples`) | +| `-l` | `--list`/`--list-capabilities` | command | compile (`--list`), optimize (`--list-capabilities`) | +| `-H` | `--hierarchy` | command | inspect only | + +### Rules + +- **Global shorts** (`-h`, `-v`, `-q`) are reserved — no subcommand may redefine them. +- **Shared shorts** (`-m`, `-d`, `-o`, `-t`, `-p`) must always map to the same long form. +- **Command shorts** (`-c`, `-f`, `-n`, `-l`, `-H`) may map to different longs per command, but the semantic should be similar (e.g., `-n` always means "count of something"). +- **Available** for future use: `-a`, `-b`, `-e`, `-g`, `-i`, `-j`, `-k`, `-r`, `-s`, `-u`, `-w`, `-x`, `-y`, `-z`, `-P`. + +--- + +## 5. Negation Convention + +Use `--no-X` single flags. Default behavior is always "enabled" — flags opt out. + +| Pattern | Example | Meaning | +|---------|---------|---------| +| `--no-quant` | `wmk build --no-quant` | Skip quantization | +| `--no-compile` | `wmk build --no-compile` | Skip compilation | +| `--no-analyze` | `wmk build --no-analyze` | Skip analyzer | +| `--no-validate` | `wmk compile --no-validate` | Skip validation | +| `--no-hierarchy` | `wmk export --no-hierarchy` | Skip hierarchy tags | + +**Removed**: the `--quantize/--no-quantize` (was deprecated) and `--validate/--no-validate` pairs from compile. Clean slate — compile keeps only the single negative flags `--no-quant` and `--no-validate` (see ADR-10). 
+ +--- + +## 6. Implementation Architecture + +### 6a. Shared Decorator Module + +`modelkit/commands/_options.py` — underscore prefix means it's importable but not auto-discovered as a command. + +``` +modelkit/commands/ +├── _options.py # Shared option decorators +├── build.py +├── compile.py +├── config.py +├── export.py +├── inspect.py +├── perf.py +├── quantize.py +├── optimize.py +├── analyze.py +├── eval.py +└── sys.py +``` + +Decorator pattern: + +```python +def model_option(required: bool = True): + """Model identifier: HF model ID, local path, or .onnx file.""" + return click.option( + "-m", "--model", + required=required, + type=str, + help="HuggingFace model ID, local path, or .onnx file", + ) + +def device_option(): + """Target device for inference/compilation.""" + return click.option( + "-d", "--device", + default="auto", + type=click.Choice(["auto", "cpu", "gpu", "npu"], case_sensitive=False), + callback=_normalize_uppercase, + help="Target device", + ) + +def ep_option(): + """Force specific execution provider (overrides --device).""" + return click.option( + "--ep", + default=None, + type=str, + help="Force specific execution provider (overrides --device)", + ) +``` + +Commands compose: + +```python +@click.command() +@model_option() +@device_option() +@ep_option() +@output_file_option() +def compile(...): + ... +``` + +### 6b. Test Validator + +`tests/test_cli_spec.py` — enforces this spec in CI. 
+ +| Test | What it checks | +|------|----------------| +| `test_all_commands_have_model` | Every command except `sys` has `-m`/`--model` | +| `test_device_commands_have_device_and_ep` | 8 commands have both `--device` and `--ep` | +| `test_no_subcommand_defines_verbose_or_quiet` | No subcommand has `-v`, `-q`, `--verbose`, or `--quiet` | +| `test_short_flag_no_collisions` | No two options on the same command share a short flag | +| `test_output_flag_consistency` | Commands with `-o` use `--output` (not `--output-dir`) | +| `test_device_choice_values` | All `--device` options have exactly `auto\|cpu\|gpu\|npu` | +| `test_device_case_insensitive` | All `--device` choices use `case_sensitive=False` | +| `test_global_flags_not_shadowed` | No subcommand redefines `-h`, `-v`, `-q` | + +Tests introspect the Click command tree at runtime — no manual lists to keep in sync. + +--- + +## References + +- [CLI Framework PRD](1_prd.md) — Framework-level spec (discovery, global debug, error handling). **Note**: ADR-2 in this spec supersedes the PRD's recommendation of per-command `--verbose`. +- [CLI Core Loop](2_coreloop.md) — Implementation patterns (lazy imports, debug inheritance) +- [CLI Testing Strategy](testing-strategy.md) — Testing approach +- [Click Documentation](https://click.palletsprojects.com/) — CLI framework diff --git a/docs/design/cli/4_cli_args_plan.md b/docs/design/cli/4_cli_args_plan.md new file mode 100644 index 000000000..93b64ac52 --- /dev/null +++ b/docs/design/cli/4_cli_args_plan.md @@ -0,0 +1,726 @@ +# CLI Arguments Refactor — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Standardize CLI arguments across all 11 `wmk` subcommands per the spec at `docs/design/cli/3_cli_args_spec.md`. 
+ +**Architecture:** Create a shared decorator module (`_options.py`) for reusable Click options, migrate all commands to use shared decorators + global-only verbosity via `ctx.obj`, and enforce the spec with a test validator suite. + +**Tech Stack:** Python 3.10+, Click, Rich, pytest + +**Spec Reference:** `docs/design/cli/3_cli_args_spec.md` — all ADRs and flag tables live there. + +--- + +## File Structure + +| Action | File | Responsibility | +|--------|------|----------------| +| **Create** | `modelkit/commands/_options.py` | Shared Click option decorators | +| **Create** | `tests/test_cli_spec.py` | Spec enforcement test validator | +| **Modify** | `modelkit/cli.py` | Root group: rename ctx key `verbosity`→`verbose`, add `-h` | +| **Modify** | `modelkit/commands/build.py` | Remove local `-v`/`-q`, use shared decorators | +| **Modify** | `modelkit/commands/compile.py` | Remove local `-v`, use shared decorators, `--no-quant`/`--no-validate` | +| **Modify** | `modelkit/commands/config.py` | Add `@click.pass_context`, remove local `-v`, use shared decorators | +| **Modify** | `modelkit/commands/export.py` | Remove local `-v`, use shared decorators | +| **Modify** | `modelkit/commands/inspect.py` | Remove local `-v`, use shared decorators | +| **Modify** | `modelkit/commands/perf.py` | Remove local `-v`, use shared decorators, add `-n`/`-t` shorts | +| **Modify** | `modelkit/commands/quantize.py` | Remove local `-v`, use shared decorators, add `-d`/`--ep`/`-n`/`-t` | +| **Modify** | `modelkit/commands/optimize.py` | Remove local `-v`, use shared decorators, add `-d`/`--ep` | +| **Modify** | `modelkit/commands/analyze.py` | Remove local `-v`, use shared decorators | +| **Modify** | `modelkit/commands/eval.py` | Remove local `-v`, use shared decorators, add `--ep`/`-t` | +| **Modify** | `modelkit/commands/sys.py` | Remove local `-v` | + +--- + +## Task 1: Create `_options.py` — Shared Decorator Module + +**Files:** +- Create: `modelkit/commands/_options.py` +- Test: 
`tests/test_cli_spec.py` (partial — decorator unit tests) + +- [ ] **Step 1: Write failing test for shared decorators** + +Create `tests/test_cli_spec.py` with basic import and decorator tests: + +```python +"""CLI spec enforcement tests — validates argument conventions.""" + +import click +import pytest + + +def test_options_module_importable(): + """_options.py must be importable.""" + from modelkit.commands._options import ( + model_option, + device_option, + ep_option, + output_file_option, + output_dir_option, + task_option, + precision_option, + ) + + +def test_model_option_creates_click_option(): + """model_option() must return a Click decorator.""" + from modelkit.commands._options import model_option + + @click.command() + @model_option() + def dummy(model): + pass + + param_names = [p.name for p in dummy.params] + assert "model" in param_names + + +def test_model_option_required_default(): + """model_option() is required=True by default.""" + from modelkit.commands._options import model_option + + @click.command() + @model_option() + def dummy(model): + pass + + model_param = next(p for p in dummy.params if p.name == "model") + assert model_param.required is True + + +def test_model_option_required_false(): + """model_option(required=False) makes it optional.""" + from modelkit.commands._options import model_option + + @click.command() + @model_option(required=False) + def dummy(model): + pass + + model_param = next(p for p in dummy.params if p.name == "model") + assert model_param.required is False + + +def test_device_option_choices(): + """device_option() must have auto|cpu|gpu|npu choices, case-insensitive.""" + from modelkit.commands._options import device_option + + @click.command() + @device_option() + def dummy(device): + pass + + device_param = next(p for p in dummy.params if p.name == "device") + assert isinstance(device_param.type, click.Choice) + assert set(device_param.type.choices) == {"auto", "cpu", "gpu", "npu"} + assert 
device_param.type.case_sensitive is False + + +def test_device_option_default_auto(): + """device_option() defaults to 'auto'.""" + from modelkit.commands._options import device_option + + @click.command() + @device_option() + def dummy(device): + pass + + device_param = next(p for p in dummy.params if p.name == "device") + assert device_param.default == "auto" + + +def test_precision_option_is_string_not_choice(): + """precision_option() must be string type (ADR-8), not Choice.""" + from modelkit.commands._options import precision_option + + @click.command() + @precision_option() + def dummy(precision): + pass + + precision_param = next(p for p in dummy.params if p.name == "precision") + # Must NOT be Choice — ADR-8 says string + validator + assert not isinstance(precision_param.type, click.Choice) +``` + +- [ ] **Step 2: Run test to verify it fails** + +```bash +uv run pytest tests/test_cli_spec.py -v +``` + +Expected: ImportError — `_options` module doesn't exist yet. + +- [ ] **Step 3: Create `_options.py` with all shared decorators** + +Create `modelkit/commands/_options.py`: + +```python +"""Shared CLI option decorators. + +One source of truth for options used across multiple wmk subcommands. +See docs/design/cli/3_cli_args_spec.md for the full specification. 
+""" + +from __future__ import annotations + +import click + + +def _normalize_uppercase( + ctx: click.Context, param: click.Parameter, value: str | None, +) -> str | None: + """Normalize value to uppercase (for device choices).""" + return value.upper() if value else value + + +# Known precision values — warn (don't error) on unknown for forward-compat +_KNOWN_PRECISIONS = { + "auto", "fp32", "fp16", "int8", "int16", + "w8a8", "w8a16", "w4a16", +} + + +def _validate_precision( + ctx: click.Context, param: click.Parameter, value: str | None, +) -> str | None: + """Warn on unknown precision values but don't reject them.""" + if value and value.lower() not in _KNOWN_PRECISIONS: + click.echo( + f"Warning: unknown precision '{value}'. " + f"Known values: {', '.join(sorted(_KNOWN_PRECISIONS))}", + err=True, + ) + return value + + +# ── Shared option decorators ──────────────────────────────────── + + +def model_option(required: bool = True): + """Model identifier: HF model ID, local path, or .onnx file.""" + return click.option( + "-m", "--model", + required=required, + type=str, + help="HuggingFace model ID, local path, or .onnx file", + ) + + +def device_option(): + """Target device for inference/compilation.""" + return click.option( + "-d", "--device", + default="auto", + type=click.Choice( + ["auto", "cpu", "gpu", "npu"], + case_sensitive=False, + ), + callback=_normalize_uppercase, + expose_value=True, + is_eager=False, + help="Target device", + ) + + +def ep_option(): + """Force specific execution provider (overrides --device).""" + return click.option( + "--ep", + default=None, + type=str, + help="Force specific execution provider (overrides --device)", + ) + + +def output_file_option(required: bool = False): + """Output file path (single artifact).""" + from pathlib import Path + + return click.option( + "-o", "--output", + required=required, + type=click.Path(path_type=Path), + help="Output file path", + ) + + +def output_dir_option(required: bool = False): + 
"""Output directory (multi-artifact builds).""" + from pathlib import Path + + return click.option( + "--output-dir", + required=required, + type=click.Path(path_type=Path), + help="Output directory for artifacts", + ) + + +def task_option(): + """Override auto-detected task.""" + return click.option( + "-t", "--task", + default=None, + type=str, + help="Override auto-detected task (e.g., image-classification)", + ) + + +def precision_option(): + """Target precision (string + validator per ADR-8).""" + return click.option( + "-p", "--precision", + default="auto", + type=str, + callback=_validate_precision, + help="Target precision (e.g., fp32, fp16, int8, w8a16)", + ) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +uv run pytest tests/test_cli_spec.py -v +``` + +Expected: All 7 tests PASS. + +- [ ] **Step 5: Lint** + +```bash +uv run ruff check modelkit/commands/_options.py tests/test_cli_spec.py +``` + +- [ ] **Step 6: Commit** + +```bash +git add modelkit/commands/_options.py tests/test_cli_spec.py +git commit -m "feat(cli): add shared option decorators (_options.py) and spec tests" +``` + +--- + +## Task 2: Fix Root Group (`cli.py`) + +**Files:** +- Modify: `modelkit/cli.py:118-153` + +- [ ] **Step 1: Write failing test for root context contract** + +Append to `tests/test_cli_spec.py`: + +```python +from click.testing import CliRunner +from modelkit.cli import main + + +def test_root_context_stores_verbose_key(): + """Root group must store ctx.obj['verbose'] (not 'verbosity').""" + runner = CliRunner() + result = runner.invoke(main, ["-v", "sys", "--help"]) + # If it runs without error, the context was set up + assert result.exit_code == 0 + + +def test_root_help_short_flag(): + """wmk -h must work (not just --help).""" + runner = CliRunner() + result = runner.invoke(main, ["-h"]) + assert result.exit_code == 0 + assert "WML ModelKit" in result.output + + +def test_root_vq_mutually_exclusive(): + """wmk -v -q must error.""" + runner = 
CliRunner() + result = runner.invoke(main, ["-v", "-q", "sys", "--help"]) + assert result.exit_code != 0 +``` + +- [ ] **Step 2: Run tests to see failures** + +```bash +uv run pytest tests/test_cli_spec.py::test_root_context_stores_verbose_key tests/test_cli_spec.py::test_root_help_short_flag tests/test_cli_spec.py::test_root_vq_mutually_exclusive -v +``` + +- [ ] **Step 3: Update `cli.py`** + +Changes to `modelkit/cli.py`: + +1. **Line 116**: `context_settings={"help_option_names": ["-h", "--help"]}` — already present in unstaged changes. +2. **Line 147**: Add mutual exclusion check: + ```python + if verbose and quiet: + raise click.UsageError("Cannot use --verbose and --quiet together.") + ``` +3. **Line 152**: Rename key `"verbosity"` → `"verbose"`: + ```python + ctx.obj["verbose"] = verbose + ``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +uv run pytest tests/test_cli_spec.py -v +``` + +- [ ] **Step 5: Lint and commit** + +```bash +uv run ruff check modelkit/cli.py +git add modelkit/cli.py tests/test_cli_spec.py +git commit -m "fix(cli): rename ctx.obj key verbosity→verbose, add -v/-q mutual exclusion" +``` + +--- + +## Task 3: Write Spec Validator Tests + +**Files:** +- Modify: `tests/test_cli_spec.py` + +These tests introspect the Click command tree and enforce the spec. They will fail initially (proving they catch violations), then pass as commands are migrated. + +- [ ] **Step 1: Write the validator tests** + +Append to `tests/test_cli_spec.py`: + +```python +# ── Spec enforcement tests ────────────────────────────────────── +# These introspect the Click command tree to enforce the spec. +# They will be the last to pass, after all commands are migrated. 
+ +# Commands that must have -m/--model +MODEL_COMMANDS = { + "build", "compile", "config", "export", "inspect", + "perf", "quantize", "optimize", "analyze", "eval", +} + +# Commands that must have --device and --ep +DEVICE_COMMANDS = { + "build", "compile", "config", "perf", + "eval", "analyze", "optimize", "quantize", +} + +# Global shorts that subcommands must NOT define +RESERVED_SHORTS = {"-v", "-q", "-h"} + + +def _get_command(name: str) -> click.Command: + """Get a subcommand by name from the root group.""" + cmd = main.commands.get(name) + assert cmd is not None, f"Command '{name}' not found" + return cmd + + +def _param_names(cmd: click.Command) -> set[str]: + """Get all parameter names for a command.""" + return {p.name for p in cmd.params} + + +def _param_shorts(cmd: click.Command) -> list[str]: + """Get all short flags for a command.""" + shorts = [] + for p in cmd.params: + if hasattr(p, "opts"): + for opt in p.opts: + if opt.startswith("-") and not opt.startswith("--"): + shorts.append(opt) + return shorts + + +def test_spec_all_commands_have_model(): + """Every command in MODEL_COMMANDS must have a 'model' parameter.""" + for name in MODEL_COMMANDS: + cmd = _get_command(name) + assert "model" in _param_names(cmd), ( + f"Command '{name}' is missing -m/--model (spec Section 2a)" + ) + # NOTE: after migration, all commands use "model" (not "model_id") + + +def test_spec_sys_has_no_model(): + """sys command must NOT have -m/--model.""" + cmd = _get_command("sys") + names = _param_names(cmd) + assert "model" not in names, "sys should not have --model" + + +def test_spec_device_commands_have_device_and_ep(): + """Commands in DEVICE_COMMANDS must have both 'device' and 'ep'.""" + for name in DEVICE_COMMANDS: + cmd = _get_command(name) + names = _param_names(cmd) + assert "device" in names, f"'{name}' missing --device (spec Section 2b)" + assert "ep" in names, f"'{name}' missing --ep (spec Section 2b)" + + +def 
test_spec_no_subcommand_defines_verbose_or_quiet(): + """No subcommand may define --verbose or --quiet parameters (ADR-2). + + Short flags -v, -q, -h are checked by test_spec_global_flags_not_shadowed. + """ + for name in main.commands: + cmd = _get_command(name) + names = _param_names(cmd) + assert "verbose" not in names, ( + f"'{name}' defines --verbose — must use ctx.obj (ADR-2)" + ) + assert "quiet" not in names, ( + f"'{name}' defines --quiet — must use ctx.obj (ADR-2)" + ) + + +def test_spec_device_choice_values(): + """All --device options must have exactly auto|cpu|gpu|npu.""" + expected = {"auto", "cpu", "gpu", "npu"} + for name in DEVICE_COMMANDS: + cmd = _get_command(name) + device_param = next( + (p for p in cmd.params if p.name == "device"), None + ) + assert device_param is not None, f"'{name}' missing --device" + assert isinstance(device_param.type, click.Choice), ( + f"'{name}' --device must be Choice type" + ) + assert set(device_param.type.choices) == expected, ( + f"'{name}' --device choices are {device_param.type.choices}, " + f"expected {expected}" + ) + + +def test_spec_device_case_insensitive(): + """All --device options must use case_sensitive=False.""" + for name in DEVICE_COMMANDS: + cmd = _get_command(name) + device_param = next( + (p for p in cmd.params if p.name == "device"), None + ) + assert device_param is not None + assert device_param.type.case_sensitive is False, ( + f"'{name}' --device must be case_sensitive=False (ADR-4)" + ) + + +def test_spec_global_flags_not_shadowed(): + """No subcommand may redefine -h, -v, or -q (reserved globals).""" + for name in main.commands: + cmd = _get_command(name) + shorts = _param_shorts(cmd) + for reserved in RESERVED_SHORTS: + assert reserved not in shorts, ( + f"'{name}' redefines {reserved} — reserved global short" + ) + + +def test_spec_output_flag_consistency(): + """Commands with -o must map it to --output (not --output-dir). + + build uses --output-dir WITHOUT -o short (ADR-3). 
+ """ + for name in main.commands: + cmd = _get_command(name) + for p in cmd.params: + if not hasattr(p, "opts"): + continue + if "-o" in p.opts: + assert "--output" in p.opts, ( + f"'{name}' maps -o to {p.opts} — must be --output (ADR-3)" + ) + # build must have --output-dir WITHOUT -o + build_cmd = _get_command("build") + for p in build_cmd.params: + if hasattr(p, "opts") and "--output-dir" in p.opts: + assert "-o" not in p.opts, ( + "build --output-dir must not have -o short (ADR-3)" + ) + + +def test_spec_short_flag_no_collisions(): + """No two options on the same command may share a short flag.""" + for name in main.commands: + cmd = _get_command(name) + shorts = _param_shorts(cmd) + dupes = [s for s in shorts if shorts.count(s) > 1] + assert not dupes, ( + f"'{name}' has short flag collisions: {set(dupes)}" + ) +``` + +- [ ] **Step 2: Run to see which commands violate the spec** + +```bash +uv run pytest tests/test_cli_spec.py -v -k "test_spec" 2>&1 | head -60 +``` + +Expected: Multiple failures — this proves the validators work. The exact failures become the migration checklist. + +- [ ] **Step 3: Commit the validator tests (they will fail until migration is complete)** + +```bash +git add tests/test_cli_spec.py +git commit -m "test(cli): add spec enforcement validators (will fail until migration)" +``` + +--- + +## Task 4: Migrate Commands — Remove Local `-v`/`-q`, Use `ctx.obj` + +This is the largest task. Each command file needs: +1. Remove `@click.option("--verbose", "-v", ...)` decorator +2. Remove `verbose` from function signature +3. Add `@click.pass_context` if missing (config) +4. Replace `verbose` variable reads with `ctx.obj["verbose"]` +5. Replace `if ctx.obj.get("debug"): verbose = True` pattern (no longer needed) + +**Files:** All 11 command files. + +The migration is mechanical per command. Do them in dependency order — start with simpler commands, end with build (most complex). 
+ +- [ ] **Step 1: Migrate `sys.py`** (simplest command) + +Remove `-v`/`--verbose` option. Read verbosity from `ctx.obj["verbose"]`. + +- [ ] **Step 2: Migrate `inspect.py`** + +Remove `-v`/`--verbose`. Remove `if ctx.obj.get("debug"): verbose = True`. Read `ctx.obj["verbose"]` where needed. + +- [ ] **Step 3: Migrate `config.py`** + +Add `@click.pass_context` (currently missing). Remove `-v`/`--verbose`. Read from `ctx.obj`. Replace `--precision` from `click.Choice(["auto","fp32","fp16","int8","int16"])` to shared `precision_option()` (string + validator per ADR-8). Device choices will reorder from `auto|npu|gpu|cpu` to `auto|cpu|gpu|npu` automatically via shared `device_option()` decorator (ADR-5). + +- [ ] **Step 4: Migrate `export.py`** + +Remove `-v`/`--verbose`. Remove debug override pattern. + +- [ ] **Step 5: Migrate `quantize.py`** + +Remove `-v`/`--verbose`. Add `-d`/`--device`, `--ep`, `-t`/`--task` short flag, `-n` for `--samples` using shared decorators. + +- [ ] **Step 6: Migrate `optimize.py`** + +Remove `-v`/`--verbose`. Remove `-p`/`--preset` (per user decision). Add `-d`/`--device`, `--ep` using shared decorators. + +- [ ] **Step 7: Migrate `eval.py`** + +Remove `-v`/`--verbose`. Add `--ep`, `-t`/`--task` short flag. + +- [ ] **Step 8: Migrate `analyze.py`** + +Remove `-v`/`--verbose`. Ensure `--device`/`--ep` use shared decorators. + +- [ ] **Step 9: Migrate `compile.py`** + +Remove `-v`/`--verbose`. Replace `--quantize/--no-quantize` pair with `--no-quant`. Replace `--validate/--no-validate` pair with `--no-validate` (single flag). Change `--output-dir` to `-o`/`--output`. Change `--device` default from `npu` to `auto`. Change `--ep` from `click.Choice(VALID_EPS)` to `type=str` (using shared `ep_option()` decorator). Use shared decorators for model, device, ep, output. + +- [ ] **Step 10: Migrate `perf.py`** + +Remove `-v`/`--verbose`. Add `-n` for `--iterations`, `-t` for `--task`. 
Rename `--no-quantize` to `--no-quant` (Section 5 negation convention). Change `--precision` from `click.Choice` to the shared `precision_option()` (string + validator per ADR-8). Remove `--compare-devices` (not in spec, never implemented). Use shared device/ep decorators. + +- [ ] **Step 11: Migrate `build.py`** + +Remove local `-v`/`--verbose` and `-q`/`--quiet`. Make `-m`/`--model` required. Remove `-o` short flag from `--output-dir` — per ADR-3, `-o` is reserved for `--output` (file output); `build` uses `--output-dir` (no short form). Use shared decorators for model, device, ep. This is the most complex migration — it has quiet plumbing through `_run_single_build`, `StageLive`, etc. The `quiet` value now comes from `ctx.obj["quiet"]` instead of a local parameter. + +- [ ] **Step 12: Run spec validator tests after each migration** + +After each command migration: +```bash +uv run pytest tests/test_cli_spec.py -v -k "test_spec" +``` + +Watch the failure count decrease with each command migrated. + +- [ ] **Step 13: Lint all modified files** + +```bash +uv run ruff check modelkit/commands/ modelkit/cli.py +``` + +- [ ] **Step 14: Commit** + +```bash +git add modelkit/commands/ modelkit/cli.py +git commit -m "refactor(cli): migrate all commands to shared decorators + global verbosity + +Removes per-command -v/--verbose in favor of global ctx.obj['verbose']. +Standardizes --device, --ep, -m/--model via _options.py shared decorators. +Replaces compile flag pairs with --no-quant/--no-validate (ADR-10). +" +``` + +--- + +## Task 5: Run Full Spec Validation + +- [ ] **Step 1: Run all spec validator tests** + +```bash +uv run pytest tests/test_cli_spec.py -v +``` + +Expected: ALL PASS. + +- [ ] **Step 2: Run existing test suite to catch regressions** + +```bash +uv run pytest tests/ -v --ignore=tests/integration 2>&1 | tail -20 +``` + +Fix any regressions from the migration. 
+ +- [ ] **Step 3: Smoke test key commands** + +```bash +uv run wmk -h +uv run wmk -vv sys +uv run wmk -q build --help +uv run wmk export --help +uv run wmk compile --help +``` + +Verify help text looks correct and global flags propagate. + +- [ ] **Step 4: Final lint** + +```bash +uv run ruff check modelkit/ tests/test_cli_spec.py +``` + +- [ ] **Step 5: Commit any fixes** + +```bash +git add modelkit/ tests/test_cli_spec.py +git commit -m "fix(cli): address regression fixes from args migration" +``` + +--- + +## Task 6: Final Commit and Cleanup + +- [ ] **Step 1: Verify git log is clean** + +```bash +git log --oneline -10 +``` + +- [ ] **Step 2: Verify no leftover `verbose` parameters in command signatures** + +```bash +grep -rn "def .*(.*verbose.*)" modelkit/commands/ +``` + +Expected: No matches (all removed). + +- [ ] **Step 3: Verify no leftover `-v` in command decorators** + +```bash +grep -rn '"-v"' modelkit/commands/ +``` + +Expected: No matches. + +- [ ] **Step 4: Done** + +All spec validator tests pass. All existing tests pass (or pre-existing failures documented). CLI arguments are standardized per `docs/design/cli/3_cli_args_spec.md`. diff --git a/docs/design/config/config-command-changes.md b/docs/design/config/config-command-changes.md new file mode 100644 index 000000000..c8b6882aa --- /dev/null +++ b/docs/design/config/config-command-changes.md @@ -0,0 +1,328 @@ +# Config Command Console Output — Change Explanation + +Every diff block in `modelkit/commands/config.py` explained. + +--- + +## Block 1: Import replacement + +```diff +-from rich.console import Console ++from ..utils.console import ( ++ get_console, ++ print_command_header, ++ print_error, ++ print_io_specs_detail, ++ print_io_specs_na, ++ print_kv, ++ print_success, ++) + +-console = Console(stderr=True) ++console = get_console() +``` + +**Why**: Instead of creating `Console(stderr=True)` directly, we import shared formatting functions from `modelkit/utils/console.py`. 
This ensures consistent output style across all commands (config, build, analyze). `get_console()` returns the same `Console(stderr=True)` — just centralized. + +**Functions imported**: +- `print_command_header` — `═══` separator + title (used by analyze too) +- `print_kv` — key-value line like `📦 Model: bert-base-uncased` +- `print_io_specs_detail` — aligned I/O tensor table (Input/Output with name, shape, dtype) +- `print_io_specs_na` — "N/A" line for ONNX mode (no I/O specs available) +- `print_error` — `❌` error line with `💡` hint +- `print_success` — `✅` success line + +--- + +## Block 2: Error display before validation error + +```diff + if hf_model is None and model_type is None and model_class is None: ++ print_command_header(console, "\U0001f4cb CONFIG GENERATION") ++ print_error( ++ console, ++ "Missing required input", ++ hint="Provide one of: -m/--model, --model-type, or --model-class", ++ ) ++ console.print() + raise click.UsageError(...) +``` + +**Why**: Before, the error was just a plain click error message. Now we show the styled header + a hint line before raising, so the user sees: +``` +════════════════════════════ +📋 CONFIG GENERATION +════════════════════════════ + ❌ Missing required input + 💡 Provide one of: -m/--model, --model-type, or --model-class +``` +The `click.UsageError` still raises for proper exit code handling. + +--- + +## Block 3: Store override filenames for later display + +```diff + override = None ++ _override_file: str | None = None ++ _shape_config_file: str | None = None +``` + +```diff +- console.print(f"[dim]Loaded overrides from {config_path.name}[/dim]") ++ _override_file = config_path.name +``` + +```diff +- console.print(f"[dim]Loaded I/O config from {shape_config_path.name}[/dim]") ++ _shape_config_file = shape_config_path.name +``` + +**Why**: The old code printed "Loaded overrides from X" immediately when the file was parsed. This broke the visual flow — the message appeared before the command header. 
+ +Now we store the filename and display it later in the proper location (after the header, under "Override files" section). The variables are initialized to `None` at the top so they're always defined even when no override file is provided. + +--- + +## Block 4: Remove inline "Generating..." messages + +```diff +- console.print(f"[dim]Generating ONNX build config for {hf_model}...[/dim]") ++ # Header printed after all config generation completes (below) +``` + +```diff +- label = hf_model or model_type +- console.print(f"[dim]Generating config for {label}...[/dim]") ++ _is_onnx_mode = False +``` + +**Why**: The old code printed "Generating config for X..." before calling `generate_build_config()`. This is a transient message that provides no lasting value — the operation is fast (<3s typically). Instead, we now print a complete summary AFTER generation completes, which is more informative. + +--- + +## Block 5: Collect metadata instead of printing inline + +```diff +- console.print("[green]Generated ONNX build config (export=None)[/green]") + output_data = config_obj.to_dict() ++ _is_onnx_mode = True ++ _resolved_task = None ++ _resolved_model_class = None ++ _export_cfg = None ++ _n_modules = 0 +``` + +```diff + if module: + configs = result +- # Apply --no-quant / --no-compile overrides to each config + for cfg in configs: + _apply_stage_overrides(cfg, ...) +- console.print(f"[green]Found {len(configs)} submodules matching '{module}'[/green]") + output_data = [cfg.to_dict() for cfg in configs] ++ _n_modules = len(configs) ++ config_obj = configs[0] if configs else None +``` + +```diff + else: + config_obj = result +- # Apply --no-quant / --no-compile overrides + _apply_stage_overrides(config_obj, ...) 
+- if not task and not module: +- auto_task = config_obj.loader.task +- source = model_type or hf_model +- console.print(f"[dim]Auto-selected task: {auto_task} (from '{source}')[/dim]") +- console.print(f"[green]Generated config for task '{config_obj.loader.task}'[/green]") +- output_data = config_obj.to_dict() ++ output_data = config_obj.to_dict() ++ _n_modules = 0 ++ ++ _resolved_task = config_obj.loader.task if config_obj else None ++ _resolved_model_class = config_obj.loader.model_class if config_obj else None ++ _export_cfg = config_obj.export if config_obj else None +``` + +**Why**: The old code printed messages scattered throughout the generation logic — "Generated ONNX build config", "Found N submodules", "Auto-selected task: X", "Generated config for task X". Each was a separate `console.print()` call with inconsistent formatting. + +Now we collect all metadata into variables (`_is_onnx_mode`, `_resolved_task`, `_resolved_model_class`, `_export_cfg`, `_n_modules`) and print everything together in a structured block below. This separation of "compute" from "display" makes the code easier to follow. + +--- + +## Block 6: Structured Rich console output + +This is the main new section — replaces all the scattered `console.print()` calls. + +### 6a: Command header + +```python +subtitle = "ONNX mode" if _is_onnx_mode else ("module mode" if module else None) +print_command_header(console, "📋 CONFIG GENERATION", subtitle) +``` + +Produces: `════════ 📋 CONFIG GENERATION (ONNX mode) ════════` + +### 6b: Model identity + +```python +model_label = hf_model or model_type or model_class or "?" +print_kv(console, "Model:", model_label, icon="📦") +``` + +Shows the primary model identifier — whatever the user provided. 
+ +### 6c: Model class + Task (or ONNX mode) + +```python +if _is_onnx_mode: + print_kv(console, "Mode:", "Direct ONNX", note="export=None", icon="🔧") +else: + # Model class before Task (matches build mock convention) + if module: + print_kv(console, "Module:", module, icon="🧩") + elif _resolved_model_class: + mc_note = None if model_class else "auto-detected" + print_kv(console, "Model class:", _resolved_model_class, note=mc_note, icon="🧩") + if _resolved_task: + task_note = None if task else "auto-detected" + print_kv(console, "Task:", _resolved_task, note=task_note, icon="🏷️") +``` + +**Key design**: The `(auto-detected)` suffix only appears when the user did NOT provide `--task` or `--model-class`. When the user explicitly provided it, no suffix — they already know. This is determined by checking whether the original CLI args (`task`, `model_class`) were supplied, not by comparing them against the resolved values. + +### 6d: Override files + +```python +if config_file: + console.print(f" 📁 Overrides: {_override_file} ✓") +if shape_config_file: + console.print(f" 📁 Shape config: {_shape_config_file} ✓") +``` + +Only shown when override files were actually provided. Uses the filenames stored in Block 3. + +### 6e: I/O specs (always full detail) + +```python +if _is_onnx_mode: + print_io_specs_na(console) +elif _export_cfg is not None: + print_io_specs_detail(console, _export_cfg) +``` + +- ONNX mode: shows "N/A — inferred from ONNX graph at build time" +- HF mode: shows aligned columns with each input/output tensor name, shape, dtype + +`print_io_specs_detail` reads `export_config.input_tensors` and `export_config.output_tensors` directly — these are populated by `generate_build_config()` during Optimum OnnxConfig resolution. 
+ +### 6f: Resolution (from config object, no hardcoding) + +```python +if _ref_config is not None: + _quant = _ref_config.quant + _compile = _ref_config.compile + + if _quant is not None or _compile is not None: + console.print(" ⚙️ Resolution:") + + if _compile and hasattr(_compile, "ep_config") and _compile.ep_config: + _provider = _compile.ep_config.provider + from ..utils.constants import normalize_ep_name + _ep_full = normalize_ep_name(_provider) or _provider + console.print(f" EP: {_ep_full}") + + if _quant: + console.print(f" Quant: {_quant.weight_type}/{_quant.activation_type}") +``` + +**Critical design decision**: This section reads directly from the config object — `config.compile.ep_config.provider` and `config.quant.weight_type/activation_type`. No reverse mapping, no hardcoded strings, no inference. + +- EP display name uses `normalize_ep_name()` from `modelkit/utils/constants.py` (existing API, also used by `wmk analyze`) +- Quant types are displayed exactly as stored in the config +- The section only shows when quant or compile is configured (not shown for default CPU/fp32 builds) + +### 6g: Submodule list + +```python +if module and not _is_onnx_mode and _n_modules > 0: + console.print(f" 🧩 Submodules: {_n_modules} matching '{module}'") +``` + +Only shown in module mode when submodules were found. + +--- + +## Block 7: Output line (single line) + +```diff +- console.print(f"[green]Config saved to:[/green] {output}") ++ suffix = f" [dim]({_n_modules} submodules)[/dim]" if _n_modules else "" ++ print_success(console, f"Config saved to: [bold]{output}[/bold]{suffix}") + else: ++ print_success(console, "Config written to stdout") + # Print to stdout (not stderr where console prints) + print(config_json) ++ ++ console.print() +``` + +**Why**: Merged the success indicator + save location into a single line with `✅`. 
Two variants: +- File output: `✅ Config saved to: output.json` +- Stdout: `✅ Config written to stdout` (then JSON follows on stdout) + +Module mode appends the count: `✅ Config saved to: output.json (3 submodules)` + +--- + +## Data Flow Summary + +``` +CLI args (hf_model, task, model_class, device, precision, ep) + │ + ▼ +generate_build_config() ← blocking, resolves everything + │ + ▼ +WinMLBuildConfig ← contains all resolved values + │ + ├── .loader.task → "Task: fill-mask (auto-detected)" + ├── .loader.model_class → "Model class: BertForMaskedLM (auto-detected)" + ├── .export.input_tensors → "Input: input_ids [1, 128] int64" + ├── .export.output_tensors → "Output: logits ? ?" (see Known Limitations #1) + ├── .compile.ep_config.provider → normalize_ep_name() → "EP: QNNExecutionProvider" + └── .quant.weight_type/activation_type → "Quant: uint8/uint8" +``` + +Every displayed value comes from the config object or existing APIs. No hardcoded model-specific logic or mapping tables. Display-only strings like `"auto-detected"`, `"Direct ONNX"`, `"module mode"` are UI labels, not model logic. + +--- + +## Known Limitations + +### 1. OutputTensorSpec lacks shape and dtype + +`OutputTensorSpec` (from `modelkit/onnx/io.py`) only carries `name` — no `shape` or `dtype`. This is because output shapes are model-dependent and not always known until export time. The display code uses `getattr(t, "shape", None)` with a `"?"` fallback, so output tensors will show: + +``` + Output: logits ? ? +``` + +This is accurate — the config genuinely doesn't know output shapes at generation time. + +### 2. Resolution section hidden when quant=None and compile=None + +When the user runs `wmk config -m some-model` and the model has no registered build config with quant/compile defaults, the Resolution section is not shown at all. This is intentional — there's nothing to resolve. The generated JSON config will have `quant: null` and `compile: null`. + +### 3. 
ONNX + --module is rejected + +`--module` requires a HuggingFace model for submodule discovery via torchinfo. ONNX files don't have a PyTorch module tree. An explicit `UsageError` is raised if both are provided. + +### 4. `print_resolution()` removed from console.py + +An earlier version of `console.py` had a `print_resolution()` function with hardcoded device/precision display logic (inferred from EP mappings). This was removed because: +- It duplicated `_EP_TO_DEVICE` and `_WEIGHT_TYPE` from `precision.py` +- The config command now reads resolution directly from the config object +- No other command imports it + +If the build command needs a resolution display in the future, it should also read from the config object directly. diff --git a/docs/design/config/console_mockup.py b/docs/design/config/console_mockup.py new file mode 100644 index 000000000..928b3f0f0 --- /dev/null +++ b/docs/design/config/console_mockup.py @@ -0,0 +1,349 @@ +# ruff: noqa +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Mock: Proposed wmk config console output. 
+ +Run: uv run python temp/mock_config_output.py [--onnx] [--verbose] [--module] [--error] + +Demonstrates the redesigned config command output with: +- Command header with model identity +- Auto-detected values labeled +- Resolution summary (device, EP, precision) +- Verbose resolution chain (-v) +- Module mode output +- Error output with actionable hints +""" + +from __future__ import annotations + +import sys + +from rich.console import Console +from rich.table import Table + + +console = Console(stderr=True) + +# ── Shared styling constants (would live in modelkit/utils/console.py) ──── + +HEAVY_SEP = "═" * 60 +LIGHT_SEP = "─" * 60 + + +def print_command_header( + title: str, + subtitle: str | None = None, +) -> None: + """Print a command header block matching analyze style.""" + console.print() + console.print(HEAVY_SEP) + label = f"[bold]{title}[/bold]" + if subtitle: + label += f" [dim]({subtitle})[/dim]" + console.print(label) + console.print(HEAVY_SEP) + + +def print_kv( + label: str, + value: str, + note: str | None = None, + icon: str = "", +) -> None: + """Print a key-value line with optional note.""" + line = f" {icon} [bold]{label:<14}[/bold] [cyan]{value}[/cyan]" + if note: + line += f" [dim]({note})[/dim]" + console.print(line) + + +def print_success(message: str) -> None: + console.print(f" [green]✅ {message}[/green]") + + +def print_error(message: str, hint: str | None = None) -> None: + console.print(f" [red]❌ {message}[/red]") + if hint: + console.print(f" [dim]💡 {hint}[/dim]") + + +# ── Scenario: Normal HF model ──────────────────────────────────────────── + + +def print_io_specs( + inputs: list[tuple[str, str, str]], + output_names: list[str], +) -> None: + """Print resolved I/O specs. 
+ + Args: + inputs: list of (name, shape_str, dtype) for input tensors + output_names: list of output tensor names (no shape/dtype available) + """ + for i, (name, shape, dtype) in enumerate(inputs): + label = "Input: " if i == 0 else " " + console.print(f" {label}[cyan]{name:<18}[/cyan] {shape:<14} [dim]{dtype}[/dim]") + # Fix #3: Output tensors have name only (OutputTensorSpec lacks shape/dtype) + for i, name in enumerate(output_names): + label = "Output: " if i == 0 else " " + console.print(f" {label}[cyan]{name}[/cyan]") + + +# Example I/O data for demos +_BERT_INPUTS = [ + ("input_ids", "[1, 128]", "int64"), + ("attention_mask", "[1, 128]", "int64"), + ("token_type_ids", "[1, 128]", "int64"), +] +_BERT_OUTPUTS = ["logits"] + +_RESNET_INPUTS = [ + ("pixel_values", "[1, 3, 224, 224]", "float32"), +] +_RESNET_OUTPUTS = ["logits"] + + +def demo_normal(verbose: bool = False) -> None: + """Simulate: wmk config -m bert-base-uncased.""" + print_command_header("📋 CONFIG GENERATION") + + # Fix #1: Model class before Task. 
Fix #2: no trailing space on 🏷️ + print_kv("Model:", "bert-base-uncased", icon="📦") + print_kv("Model class:", "BertForMaskedLM", note="auto-detected", icon="🧩") + print_kv("Task:", "fill-mask", note="auto-detected", icon="🏷️") + + console.print() + + # Fix #3: Output name only (no shape/dtype) + print_io_specs(_BERT_INPUTS, _BERT_OUTPUTS) + + console.print() + + console.print(" ⚙️ [bold]Resolution:[/bold]") + console.print(" Device: [cyan]NPU[/cyan]") + console.print(" Quant: [cyan]uint8/uint8[/cyan] [dim](weight/activation)[/dim]") + + console.print() + print_success("Config saved to: [bold]output/config.json[/bold]") + console.print() + + +# ── Scenario: ONNX file input ──────────────────────────────────────────── + + +def demo_onnx() -> None: + """Simulate: wmk config -m model.onnx.""" + print_command_header("📋 CONFIG GENERATION", subtitle="ONNX mode") + + print_kv("Model:", "model.onnx", icon="📦") + print_kv("Mode:", "Direct ONNX", note="export=None", icon="🔧") + + console.print() + console.print( + " 📐 [bold]I/O specs:[/bold] [dim]N/A — inferred from ONNX graph at build time[/dim]" + ) + + console.print() + console.print(" ⚙️ [bold]Resolution:[/bold]") + console.print(" Device: [cyan]NPU[/cyan]") + console.print(" Quant: [cyan]uint8/uint8[/cyan] [dim](weight/activation)[/dim]") + + console.print() + print_success("Config saved to: [bold]output/config.json[/bold]") + console.print() + + +# ── Scenario: Module mode ──────────────────────────────────────────────── + + +def demo_module() -> None: + """Simulate: wmk config -m microsoft/resnet-50 --module ResNetConvLayer.""" + print_command_header("📋 CONFIG GENERATION", subtitle="module mode") + + print_kv("Model:", "microsoft/resnet-50", icon="📦") + print_kv("Module:", "ResNetConvLayer", icon="🧩") + print_kv("Task:", "image-classification", note="auto-detected", icon="🏷️") + + console.print() + print_io_specs(_RESNET_INPUTS, _RESNET_OUTPUTS) + + console.print() + console.print(" ⚙️ [bold]Resolution:[/bold]") + 
console.print(" Device: [cyan]NPU[/cyan]") + console.print(" Quant: [cyan]uint8/uint8[/cyan] [dim](weight/activation)[/dim]") + + console.print() + + # Module discovery results + console.print( + " 🧩 [bold]Submodules found:[/bold] [green]3[/green] matching 'ResNetConvLayer'" + ) + console.print() + + table = Table( + show_header=True, + header_style="bold", + box=None, + padding=(0, 1), + expand=False, + ) + table.add_column("#", width=4, justify="right") + table.add_column("Module path", width=30) + table.add_column("Class", width=20) + + table.add_row("[dim]1[/dim]", "encoder.stages.0.layers.0.conv", "ResNetConvLayer") + table.add_row("[dim]2[/dim]", "encoder.stages.1.layers.0.conv", "ResNetConvLayer") + table.add_row("[dim]3[/dim]", "encoder.stages.2.layers.0.conv", "ResNetConvLayer") + + console.print(table) + + console.print() + print_success("Config saved to: [bold]output/config.json[/bold] [dim](3 submodules)[/dim]") + console.print() + + +# ── Scenario: Override files ───────────────────────────────────────────── + + +def demo_overrides() -> None: + """Simulate: wmk config -m bert-base-uncased -c overrides.json --shape-config shapes.json.""" + print_command_header("📋 CONFIG GENERATION") + + print_kv("Model:", "bert-base-uncased", icon="📦") + print_kv("Model class:", "BertForMaskedLM", note="auto-detected", icon="🧩") + print_kv("Task:", "fill-mask", note="auto-detected", icon="🏷️") + + console.print() + + # Override files + console.print(" 📁 [bold]Overrides:[/bold] overrides.json [green]✓[/green]") + console.print(" 📁 [bold]Shape config:[/bold] shapes.json [green]✓[/green]") + + console.print() + print_io_specs(_BERT_INPUTS, _BERT_OUTPUTS) + + console.print() + console.print(" ⚙️ [bold]Resolution:[/bold]") + console.print(" Device: [cyan]NPU[/cyan]") + console.print(" Quant: [cyan]uint8/uint8[/cyan] [dim](weight/activation)[/dim]") + + console.print() + print_success("Config written to stdout") + console.print() + + +# ── Scenario: Error — resolution failure 
───────────────────────────────── + + +def demo_error() -> None: + """Simulate: wmk config -m unknown-model --task custom-task.""" + print_command_header("📋 CONFIG GENERATION") + + print_kv("Model:", "unknown-model", icon="📦") + print_kv("Task:", "custom-task", note="user-provided", icon="🏷️") + + console.print() + print_error( + "I/O spec resolution failed:", + hint="Try: --model-class MyModelClass or --shape-config shapes.json", + ) + console.print( + " [dim]Could not find OnnxConfig for model_type='unknown', task='custom-task'[/dim]" + ) + console.print() + + +# ── Scenario: Error — missing input ────────────────────────────────────── + + +def demo_missing_input() -> None: + """Simulate: wmk config (no arguments).""" + print_command_header("📋 CONFIG GENERATION") + + print_error( + "Missing required input", + hint="Provide one of: -m/--model, --model-type, or --model-class", + ) + console.print() + + +# ── Scenario: Auto device (default, no --device) ──────────────────────── + + +def demo_auto_device() -> None: + """Simulate: wmk config -m bert-base-uncased (no device/precision flags).""" + print_command_header("📋 CONFIG GENERATION") + + print_kv("Model:", "bert-base-uncased", icon="📦") + print_kv("Model class:", "BertForMaskedLM", note="auto-detected", icon="🧩") + print_kv("Task:", "fill-mask", note="auto-detected", icon="🏷️") + + console.print() + print_io_specs(_BERT_INPUTS, _BERT_OUTPUTS) + + console.print() + console.print(" ⚙️ [bold]Resolution:[/bold]") + console.print(" Device: [cyan]NPU[/cyan]") + console.print(" Quant: [dim]none[/dim]") + + console.print() + print_success("Config written to stdout") + console.print() + + +# ── Main ───────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + args = set(sys.argv[1:]) + + if "--help" in args or "-h" in args: + console.print("[bold]Usage:[/bold] uv run python temp/mock_config_output.py [OPTIONS]") + console.print() + console.print(" [dim](no flags)[/dim] Normal HF model 
with NPU/int8") + console.print(" --auto Default device (CPU/fp32, no --device)") + console.print(" --onnx ONNX file input") + console.print(" --module Module mode (submodule discovery)") + console.print(" --overrides With override files") + console.print(" --verbose Verbose resolution chain") + console.print(" --error Resolution failure") + console.print(" --missing Missing required input") + console.print(" --all Run all scenarios") + sys.exit(0) + + if "--all" in args: + scenarios = [ + ("Normal HF model (--device npu --precision int8)", demo_normal), + ("Default device (CPU/fp32)", demo_auto_device), + ("ONNX file input", demo_onnx), + ("Module mode", demo_module), + ("With override files", demo_overrides), + ("Verbose resolution chain", lambda: demo_normal(verbose=True)), + ("Error: resolution failure", demo_error), + ("Error: missing input", demo_missing_input), + ] + for label, fn in scenarios: + console.print() + console.print(f"[bold yellow]▶ Scenario: {label}[/bold yellow]") + fn() + console.print() + sys.exit(0) + + if "--onnx" in args: + demo_onnx() + elif "--module" in args: + demo_module() + elif "--overrides" in args: + demo_overrides() + elif "--verbose" in args: + demo_normal(verbose=True) + elif "--error" in args: + demo_error() + elif "--missing" in args: + demo_missing_input() + elif "--auto" in args: + demo_auto_device() + else: + demo_normal() diff --git a/docs/design/e2e_eval/mockup.py b/docs/design/e2e_eval/mockup.py new file mode 100644 index 000000000..62a5f6f56 --- /dev/null +++ b/docs/design/e2e_eval/mockup.py @@ -0,0 +1,163 @@ +# ruff: noqa +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Mockup script for wmk eval CLI output. + +Renders the proposed eval command UI using Rich, with simulated +progress and fake metrics. 
Run to see what the real command will look like. + +Usage: + uv run python temp/eval_mockup.py + uv run python temp/eval_mockup.py --quiet +""" + +from __future__ import annotations + +import time + +import click +from rich.console import Console +from rich.table import Table + + +console = Console(stderr=True) + + +def _render_header( + model: str, + device: str, + task: str, + dataset: str, + split: str, + samples: int, +) -> None: + console.print() + console.print("[bold]" + "=" * 80 + "[/bold]") + console.print("[bold]EVALUATION[/bold]") + console.print("[bold]" + "=" * 80 + "[/bold]") + console.print(f" Model: [bold cyan]{model}[/bold cyan]") + console.print(f" Device: [green]{device}[/green]") + console.print(f" Task: {task}") + console.print(f" Dataset: {dataset} ({split})") + console.print(f" Samples: {samples:,}") + console.print() + + +def _render_progress(total: int) -> None: + """Simulate live progress bar.""" + from rich.live import Live + from rich.text import Text + + console.print("-" * 80) + + with Live(console=console, refresh_per_second=15, transient=True) as live: + for i in range(1, total + 1): + pct = i / total + bar_len = int(pct * 40) + bar = f"[{'=' * bar_len}{' ' * (40 - bar_len)}]" + + # Simulated metrics + latency = 8.32 + (0.5 if i % 7 == 0 else 0) + throughput = 1000.0 / latency + eta = (total - i) * latency / 1000.0 + + line = Text.from_markup( + f" Evaluating... 
{bar} {i}/{total} {pct:.0%}\n" + f" Latency: {latency:.2f} ms | ~{throughput:.0f} smp/s" + f" | ETA: {eta:.1f}s" + ) + live.update(line) + time.sleep(0.01) + + console.print("-" * 80) + console.print() + + +def _render_results( + task: str, + accuracy: float, + f1: float | None, + samples: int, + total_time: float, +) -> None: + console.print("[bold]" + "=" * 80 + "[/bold]") + console.print("[bold]RESULTS[/bold]") + console.print("[bold]" + "=" * 80 + "[/bold]") + console.print() + + # Metrics + console.print(f" [bold]Accuracy:[/bold] {accuracy:.2f}%") + if f1 is not None: + console.print(f" [bold]F1:[/bold] {f1:.2f}%") + console.print() + + # Latency table + console.print(" [bold]Latency (ms)[/bold]") + table = Table(show_header=True, header_style="bold cyan", padding=(0, 1)) + for col in ["Avg", "P50", "P90", "P95", "P99", "Std"]: + table.add_column(col, justify="right") + table.add_row("8.32", "7.95", "10.21", "11.44", "14.02", "1.87") + console.print(table) + console.print() + + # Throughput + throughput = samples / total_time + console.print(f" [bold]Throughput:[/bold] {throughput:.2f} samples/sec") + console.print(f" [bold]Total time:[/bold] {total_time:.2f}s ({samples:,} samples)") + console.print() + + +def _render_hardware() -> None: + console.print(" [bold]Hardware[/bold]") + console.print(" NPU: 67.3% avg, 89.1% peak | CPU: 12.4% avg") + console.print(" Device Mem: 245/128 MB (local/shared) | Sys Mem: 8,432 MB") + console.print() + + +@click.command() +@click.option("--quiet", "-q", is_flag=True, help="JSON-only output") +@click.option("--task", default="image-classification") +@click.option("--no-monitor", is_flag=True, help="Skip hardware section") +def main(quiet: bool, task: str, no_monitor: bool) -> None: + """Render eval command mockup.""" + model = "microsoft/resnet-50" + dataset = "timm/mini-imagenet" + samples = 1000 + + if quiet: + import json + + print( + json.dumps( + { + "model": model, + "task": task, + "device": "npu", + "dataset": 
dataset, + "samples": samples, + "metrics": {"accuracy": 0.7842}, + "latency_ms": {"mean": 8.32, "p50": 7.95, "p90": 10.21}, + "throughput_sps": 120.19, + } + ) + ) + return + + f1 = 91.05 if task == "text-classification" else None + accuracy = 91.28 if task == "text-classification" else 78.42 + + _render_header(model, "npu", task, dataset, "test", samples) + _render_progress(samples) + _render_results(task, accuracy, f1, samples, total_time=8.32) + if not no_monitor: + _render_hardware() + + console.print(f" Results saved to: temp/eval_{model.split('/')[-1]}_20260322.json") + console.print() + + +if __name__ == "__main__": + main() diff --git a/docs/design/importtime/1_plan.md b/docs/design/importtime/1_plan.md new file mode 100644 index 000000000..569370056 --- /dev/null +++ b/docs/design/importtime/1_plan.md @@ -0,0 +1,221 @@ +# Import Time Optimization — Implementation Plan + +**Prereq**: Read `1_prd.md` for problem analysis and measurements. + +## Approach: Lazy Loading via PEP 562 + +Three changes addressing the three heavy import chains identified in the PRD. +**Steps 1 and 3 must ship together** (Step 3 guards registrations broken by Step 1). +Step 2 is independent and low-risk. + +## Step 1: Lazy `modelkit/__init__.py` + +**Addresses**: All three chains — the entire 30s root cost. + +**What**: Replace eager top-level imports with PEP 562 `__getattr__`: + +```python +# modelkit/__init__.py — AFTER + +from importlib.metadata import PackageNotFoundError, version + +from . 
import _warnings # Must stay eager: warning filters before any heavy import + +try: + __version__ = version("winml-modelkit") +except PackageNotFoundError: + __version__ = "0.0.1.dev0" + +__all__ = [ + "WinMLAutoModel", + "WinMLBuildConfig", + "WinMLModelForImageClassification", + "WinMLPreTrainedModel", + "__version__", +] + +def __getattr__(name: str): + if name == "WinMLBuildConfig": + from .config import WinMLBuildConfig + globals()["WinMLBuildConfig"] = WinMLBuildConfig # Cache after first access + return WinMLBuildConfig + + if name in ("WinMLAutoModel", "WinMLPreTrainedModel", "WinMLModelForImageClassification"): + from .models import ( + WinMLAutoModel, + WinMLModelForImageClassification, + WinMLPreTrainedModel, + ) + globals().update({ + "WinMLAutoModel": WinMLAutoModel, + "WinMLPreTrainedModel": WinMLPreTrainedModel, + "WinMLModelForImageClassification": WinMLModelForImageClassification, + }) + return globals()[name] + + raise AttributeError(f"module 'modelkit' has no attribute {name!r}") + +def __dir__(): + return __all__ +``` + +**Key mechanism**: `cli.py` does `from . import __version__` which executes +`modelkit/__init__.py` — but with the lazy version, only `_warnings` + version +detection run at module scope. The heavy imports in `__getattr__` are never triggered +by the CLI path. This is the **primary mechanism** that makes the CLI fast. + +**Why `globals()` caching**: Without it, `__getattr__` fires on every attribute access. +Caching into `globals()` means the import only happens once. + +**Why `__dir__`**: Without it, `dir(modelkit)` would not include lazy attributes. +Required for debugger/IPython tab-completion. + +**Verification**: +```bash +# Should NOT show transformers/torch/optimum in the trace +uv run python -X importtime -c "from modelkit.cli import main" 2>&1 | grep -E "torch|transformers|optimum" +``` + +## Step 2: Lazy CLI Command Discovery + +**Addresses**: `_discover_commands()` importing all command modules at load time. 
+ +**What**: Replace `_discover_commands()` + `@click.group()` with a `LazyGroup` that +only imports command modules when a specific command is invoked: + +```python +class LazyGroup(click.Group): + """Click group that defers command imports until invoked.""" + + _commands_dir = Path(__file__).parent / "commands" + + def list_commands(self, ctx: click.Context) -> list[str]: + """Return command names from filesystem — no imports.""" + if not self._commands_dir.exists(): + return [] + return sorted( + p.stem for p in self._commands_dir.glob("*.py") + if not p.name.startswith("_") + ) + + def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None: + """Import command module only when the command is actually invoked.""" + try: + module = import_module(f".commands.{cmd_name}", package=__package__) + except ImportError as e: + logger.warning("Failed to import command module %s: %s", cmd_name, e) + return None + except Exception as e: + logger.error("Error loading command %s: %s", cmd_name, e) + return None + + # Find Click command in module (prefer Group over Command) + discovered = None + for attr_name in dir(module): + attr = getattr(module, attr_name) + if isinstance(attr, click.Group): + return attr + if isinstance(attr, click.Command) and discovered is None: + discovered = attr + return discovered + +@click.group(cls=LazyGroup) +@click.version_option(version=__version__, prog_name="wmk") +# ... rest of main() unchanged +``` + +Remove the old `_discover_commands()` function and the `_discover_commands()` call +at module scope. + +**Error handling**: Both `ImportError` and generic `Exception` are caught with +appropriate log levels, matching the current `_discover_commands()` behavior +(lines 121-124 of `cli.py`). 
+ +**Verification**: +```bash +time uv run wmk --help +# Should list all commands; note that Click's help formatter calls get_command() for each listed command to read its short help, so --help still imports every command module (only running one specific command skips the others) +``` + +## Step 3: ONNX Config Registration Guard + +**Addresses**: Lazy loading breaks `@register_onnx_overwrite` decorator side effects. + +**MUST ship with Step 1** — without this guard, `wmk export` would fail to find +custom ONNX configs for models like BERT, CLIP, DETR, SAM. + +**Problem**: With lazy loading, HF model files are never imported until explicitly +accessed. Their `@register_onnx_overwrite` decorators never fire. These files also +have direct optimum imports (e.g., `bert.py:15-19` imports `BertOnnxConfig`), so +the entire `models/hf/` package must be treated as a unit. + +**What**: Add an idempotent trigger in `export/io.py` (near TasksManager usage): + +```python +# modelkit/export/io.py + +_hf_models_registered = False + +def ensure_hf_models_registered() -> None: + """Trigger HF model ONNX config registrations. Idempotent.""" + global _hf_models_registered + if _hf_models_registered: + return + from modelkit.models import hf as _hf # noqa: F401 — triggers decorators + _hf_models_registered = True +``` + +**Call sites** — only the genuinely unguarded path needs the explicit guard: +- `export/io.py` → `_get_onnx_config_constructor()` — **REQUIRED**, this is the only + path that reaches `TasksManager.get_exporter_config_constructor()` without going + through `models/__init__.py` first.
+ +The following paths are **already safe** through transitive imports but get the guard +defensively: +- `config/build.py` → `generate_build_config()` — already imports + `MODEL_BUILD_CONFIGS` at line 413 which triggers `models/hf/__init__.py` +- `inspect/resolver.py` — imports from `..models` at top level (lines 16-21) + which triggers `models/__init__.py` → `models/hf/__init__.py` + +**Verification**: +```bash +# Export must still find custom ONNX configs +uv run wmk export -m prajjwal1/bert-tiny -o temp/test_export +# Config generation must still work +uv run wmk config -m microsoft/resnet-50 --device npu --precision int8 +# Inspect must still work +uv run wmk inspect -m google-bert/bert-base-uncased +``` + +## Implementation Order + +``` +Step 2 (independent, low risk) → measure + ↓ +Steps 1 + 3 (atomic pair) → measure → full test suite +``` + +Steps 1 and 3 MUST ship together. Step 2 is truly independent and can be done first. + +## Risks & Mitigations + +| Risk | Mitigation | +|------|------------| +| Circular imports from changed load order | Test incrementally; existing workarounds documented in PRD | +| `@register_onnx_overwrite` not called | Step 3 — explicit guard; verify with `wmk export` | +| `_warnings.py` runs too late | Kept as eager import in `__init__.py` | +| `export/__init__.py` eagerly imports `io.py` → optimum | Verify no CLI path reaches `export/__init__.py`; make lazy if needed | +| `dir(modelkit)` incomplete | `__dir__` override returns `__all__` | +| Broken command modules crash CLI | `LazyGroup.get_command()` catches both ImportError and Exception | +| Future dev re-introduces eager import | Add regression test (see verification criteria) | + +## Verification Criteria + +1. `wmk --help` < 2s +2. `wmk sys --format compact` < 3s +3. `from modelkit import WinMLAutoModel` still works +4. `wmk export` finds custom ONNX configs (BERT, CLIP, DETR, SAM) +5. 
`wmk config -m microsoft/resnet-50 --device npu --precision int8` produces correct config +6. No test regressions +7. `wmk build`, `wmk inspect`, `wmk perf` work normally +8. Automated regression test: `import modelkit` does NOT pull torch/transformers into `sys.modules` diff --git a/docs/design/importtime/1_prd.md b/docs/design/importtime/1_prd.md new file mode 100644 index 000000000..9dec81eed --- /dev/null +++ b/docs/design/importtime/1_prd.md @@ -0,0 +1,162 @@ +# Import Time Optimization — PRD + +**Status**: Analysis complete, implementation pending +**Date**: 2026-03-18 +**Branch**: `mvp` + +## Problem Statement + +`wmk --help` takes ~30 seconds. Every CLI invocation — including lightweight commands +like `wmk sys`, `wmk --version`, and `wmk --help` — pays the full import cost of +torch, transformers, optimum, diffusers, and sklearn, even though these libraries are +only needed for model export/build/inference. + +This is unacceptable UX for a CLI tool. Users expect sub-second response for help and +system info commands. + +## Current State (Baseline Measurements) + +### Per-Package Import Cost + +Measured in isolated subprocesses (`uv run python -c "import X"` per package): + +| Rank | Package | Import Time | Used Directly? | +|------|---------|-------------|----------------| +| 1 | `optimum.exporters.tasks` | **6.38s** | Yes (TasksManager) | +| 2 | `diffusers` | **3.34s** | No (pulled in by optimum) | +| 3 | `transformers` | **3.01s** | Yes (loader, export) | +| 4 | `torchvision` | **2.67s** | No (pulled in by transformers) | +| 5 | `torch` | **1.51s** | Yes (inference, export) | +| 6 | `sklearn` | **1.28s** | No (pulled in by transformers) | +| 7 | `onnx` | **0.22s** | Yes (session, compiler) | +| 8 | `onnxruntime` | **0.19s** | Yes (session) | +| 9 | `scipy` | **0.11s** | No (pulled in by sklearn) | +| 10 | `numpy` | **0.08s** | Yes (everywhere) | + +**Total deferrable cost**: ~18.2s from top 6 packages. 
+Packages 7-10 are fast enough (~0.6s combined) — not worth optimizing. + +### Import Chain (Why Everything Loads) + +``` +modelkit/__init__.py (~30s) + │ + ├── _warnings.py (~0s, fast) + │ + ├── from .config import WinMLBuildConfig (~8s) ─── CHAIN B + │ └── config/build.py + │ └── export/config.py → compiler/configs.py → ... + │ + └── from .models import WinMLAutoModel (~22s) ─── CHAIN A + C + └── models/__init__.py + │ + ├── models/hf/__init__.py CHAIN A (22s) + │ └── bert.py, clip.py, detr.py, sam.py + │ └── export/io.py line 33: + │ from optimum.exporters.tasks import TasksManager + │ (6.4s self + transitively loads everything above) + │ + └── models/winml/__init__.py CHAIN C (2s) + └── winml/base.py + ├── import torch (1.5s) + ├── import numpy (0.1s) + └── session/session.py + ├── import onnx (0.2s) + └── import onnxruntime (0.2s) +``` + +**Two import pathways into optimum from HF model files**: +1. `export/io.py` line 33 imports `TasksManager` at module scope because line 54 + needs it immediately: `register_onnx_overwrite = TasksManager.create_register(...)`. + HF model files import `register_onnx_overwrite` for their decorators. +2. HF model files also **directly** import from `optimum` at top level: + - `bert.py:15-19`: `from optimum.exporters.onnx.model_configs import BertOnnxConfig, ...` + - `clip.py:26-30`: `from optimum.exporters.onnx.model_configs import CLIPOnnxConfig, ...` + - `detr.py:22-23`: `from optimum.exporters.onnx.model_configs import ...` + - `sam.py:36-41`: `from optimum.exporters.onnx import OnnxConfig, ...` + +Even if `export/io.py` were deferred, the HF model files themselves pull in optimum. +The entire `models/hf/` package must be deferred as a unit. + +### Additional: CLI Command Discovery + +`cli.py` line 128 calls `_discover_commands()` at module load time, which imports +every command module. 
Some commands have heavy top-level imports: +- `commands/optimize.py`: `import onnx` (0.2s) +- `commands/perf.py`: `import numpy` (0.1s) +- `commands/sys.py`: `from ..sysinfo import OS` (hardware detection) + +Note: command discovery cost overlaps with `__init__.py` cost when measured in the +same process. The ~2-4s estimate may be inflated due to double-counting. With a lazy +`__init__.py`, command discovery's incremental cost is likely ~0.5-1s. + +## Requirements + +### R1: Lightweight CLI Commands Must Be Fast + +| Command | Current | Target | +|---------|---------|--------| +| `wmk --help` | ~30s | < 2s | +| `wmk --version` | ~30s | < 1s | +| `wmk sys --format compact` | ~30s | < 3s | + +### R2: Heavy Commands Are Unaffected + +`wmk export`, `wmk build`, `wmk inspect`, `wmk perf`, `wmk config` — these need +torch/transformers/optimum and will continue to pay the import cost. No regression. + +### R3: Library API Unchanged + +All public exports must still work via lazy loading on first access: +- `from modelkit import WinMLAutoModel` +- `from modelkit import WinMLBuildConfig` +- `from modelkit import WinMLPreTrainedModel` +- `from modelkit import WinMLModelForImageClassification` + +Acceptable since library users need the heavy deps anyway. + +### R4: Zero Behavior Change + +- No test regressions (baseline: 808 passed, 1 pre-existing failure in static_analyzer) +- ONNX config registrations (`@register_onnx_overwrite`) still work +- Warning filters (`_warnings.py`) still apply before heavy imports +- Existing circular import workarounds remain valid + +### R5: No New Dependencies + +Use Python stdlib mechanisms only (PEP 562 `__getattr__`). No third-party lazy +import libraries. 
+ +## Constraints + +- torch, transformers, optimum imports **cannot be removed** — they are required + for core functionality (export, build, inference) +- `export/io.py` line 54 (`register_onnx_overwrite = TasksManager.create_register(...)`) + requires `TasksManager` at module scope — this is the registration factory +- HF model files use `@register_onnx_overwrite` decorators at top level — these + registrations must happen before any `TasksManager.get_exporter_config_constructor()` + call +- HF model files also **directly import from optimum** at top level (model configs, + normalized configs, dummy generators) — deferring only `export/io.py` is insufficient; + the entire `models/hf/` package must be deferred as a unit +- `export/__init__.py` line 16 eagerly imports `from .io import resolve_io_specs` — + any `from modelkit.export import ...` triggers the full optimum chain +- `_warnings.py` must execute before any heavy package import to suppress noisy + warnings from transformers/torch/diffusers + +## Known Circular Import Workarounds (Must Survive) + +| Location | Pattern | Cycle | +|----------|---------|-------| +| `models/__init__.py:50-56` | `__getattr__` for WinMLAutoModel | models → loader → models | +| `config/build.py:413` | Lazy import of MODEL_BUILD_CONFIGS inside `generate_build_config()` | config → models.hf → config | +| `loader/task.py:154,238,314,383,482` | TasksManager imported inside functions (5 call sites) | loader → optimum → heavy deps | + +## Out of Scope + +- Optimizing upstream package import times (torch, transformers, etc.) +- Changing the public API surface (what users import from `modelkit`) +- Deferring onnx/onnxruntime/numpy imports (fast enough at <0.3s each) + +Note: internal module restructuring (e.g., making `__init__.py` files lazy) IS in +scope — only the user-facing import API is preserved unchanged. 
diff --git a/docs/design/importtime/2_roadmap.md b/docs/design/importtime/2_roadmap.md new file mode 100644 index 000000000..b8ba5cf4d --- /dev/null +++ b/docs/design/importtime/2_roadmap.md @@ -0,0 +1,122 @@ +# Import Time Optimization — Roadmap + +**Prereq**: `1_prd.md` (problem/measurements), `1_plan.md` (approach/design) + +## Phase 1: Lazy CLI Command Discovery (independent, low risk) + +### 1.1 Implement `LazyGroup` in `cli.py` +- [ ] Add `LazyGroup(click.Group)` class with `list_commands()` and `get_command()` +- [ ] `list_commands()`: return command names from filesystem (glob `*.py`, skip `_` prefix) +- [ ] `get_command()`: `import_module()` only when command is invoked +- [ ] `get_command()`: error handling — catch `ImportError` (warning) + `Exception` (error) +- [ ] `get_command()`: prefer `click.Group` over `click.Command` (match current behavior) +- [ ] Change `@click.group()` to `@click.group(cls=LazyGroup)` +- [ ] Remove `_discover_commands()` function and its module-level call + +### 1.2 Verify Phase 1 +- [ ] `wmk --help` lists all commands correctly +- [ ] `wmk sys --format compact` works +- [ ] `wmk build --help` works (command imported on demand) +- [ ] Broken/missing command module → graceful warning, not crash +- [ ] Measure: `time uv run wmk --help` (expect only a small improvement while `__init__.py` is still eager; the full gain arrives with Phase 2) + +### 1.3 Commit Phase 1 +- [ ] Ruff lint `modelkit/cli.py` +- [ ] Run `uv run pytest tests/ -x -q -o "required_plugins="` +- [ ] Commit: `perf: lazy CLI command discovery via LazyGroup` + +--- + +## Phase 2: Lazy `modelkit/__init__.py` + Registration Guard (atomic pair) + +### 2.1 Make `modelkit/__init__.py` lazy +- [ ] Remove eager imports: `from .config import WinMLBuildConfig` (line 30) +- [ ] Remove eager imports: `from .models import ...` (lines 31-35) +- [ ] Add `__getattr__(name)` with lazy imports + `globals()` caching +- [ ] Add `__dir__()` returning `__all__` (for debugger/IPython compatibility) +- [ ] Keep `from . 
import _warnings` as eager (line 29) +- [ ] Keep `__version__` assignment as eager +- [ ] Keep `__all__` listing all public names + +### 2.2 Verify `__init__.py` in isolation +- [ ] `from modelkit.cli import main` does NOT trigger torch/transformers/optimum + ```bash + uv run python -X importtime -c "from modelkit.cli import main" 2>&1 | grep -cE "torch|transformers|optimum" + # Should output: 0 + ``` +- [ ] `from modelkit import __version__` works and is fast +- [ ] `from modelkit import WinMLAutoModel` works (triggers lazy load) +- [ ] `from modelkit import WinMLBuildConfig` works +- [ ] `from modelkit import WinMLPreTrainedModel` works +- [ ] `from modelkit import WinMLModelForImageClassification` works +- [ ] `dir(modelkit)` includes all `__all__` names + +### 2.3 Add ONNX config registration guard +- [ ] Add `ensure_hf_models_registered()` in `modelkit/export/io.py` +- [ ] Call in `_get_onnx_config_constructor()` (before `TasksManager.get_exporter_config_constructor()`) +- [ ] Defensively call in `config/build.py:generate_build_config()` (already safe, belt-and-suspenders) +- [ ] Verify: `wmk export` finds custom ONNX configs (BERT, CLIP, DETR, SAM) +- [ ] Verify: `wmk config -m microsoft/resnet-50 --device npu --precision int8` +- [ ] Verify: `wmk inspect -m google-bert/bert-base-uncased` + +### 2.4 Check `export/__init__.py` exposure +- [ ] Verify no CLI path reaches `modelkit.export` package import (only `modelkit.export.config`, `modelkit.export.io` submodules) +- [ ] If exposed: make `export/__init__.py` lazy too +- [ ] If not exposed: document as known landmine for future devs + +### 2.5 Verify Phase 2 end-to-end +- [ ] `wmk --help` < 2s +- [ ] `wmk --version` < 1s +- [ ] `wmk sys --format compact` < 3s +- [ ] `wmk export -m prajjwal1/bert-tiny -o temp/test_export` works +- [ ] `wmk config -m microsoft/resnet-50 --device npu --precision int8` works +- [ ] `wmk inspect -m google-bert/bert-base-uncased` works +- [ ] `wmk build` works (with existing test 
configs) +- [ ] `wmk perf` works + +### 2.6 Run full test suite +- [ ] `uv run pytest tests/ -x -q -o "required_plugins="` +- [ ] No regressions vs baseline (808 passed, 1 pre-existing failure) + +### 2.7 Commit Phase 2 +- [ ] Ruff lint all changed files +- [ ] Commit: `perf: lazy __init__.py + ONNX config registration guard` + +--- + +## Phase 3: Regression Prevention & Cleanup + +### 3.1 Add import time regression test +- [ ] Add `tests/test_import_time.py`: + ```python + def test_cli_import_no_heavy_deps(): + """Importing the CLI must not pull in torch/transformers/optimum.""" + result = subprocess.run( + [sys.executable, "-c", + "import sys; from modelkit.cli import main; " + "heavy = [m for m in sys.modules if m.startswith(('torch', 'transformers', 'optimum'))]; " + "assert not heavy, f'Heavy modules loaded: {heavy}'"], + capture_output=True, text=True, + ) + assert result.returncode == 0, result.stderr + ``` +- [ ] Verify test passes + +### 3.2 Final measurements +- [ ] Record before/after timing table +- [ ] Update `1_prd.md` status to "Complete" with final measurements + +### 3.3 Commit Phase 3 +- [ ] Commit: `test: add import time regression test` + +--- + +## Summary + +| Phase | Files Modified | Risk | Dependency | +|-------|---------------|------|------------| +| 1: LazyGroup | `cli.py` | Low | None | +| 2: Lazy init + guard | `__init__.py`, `export/io.py`, maybe `export/__init__.py` | Medium | Phase 1 recommended first | +| 3: Regression test | `tests/test_import_time.py` | None | Phase 2 | + +**Total files changed**: 3-4 (+ 1 new test file) diff --git a/docs/design/inspect/2026-03-20-inspect-config-improvement-plan.md b/docs/design/inspect/2026-03-20-inspect-config-improvement-plan.md new file mode 100644 index 000000000..0898eec4c --- /dev/null +++ b/docs/design/inspect/2026-03-20-inspect-config-improvement-plan.md @@ -0,0 +1,945 @@ +# Inspect & Config Command Improvement Plan + +> **For agentic workers:** REQUIRED: Use 
superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Fix all known bugs in `wmk inspect` and `wmk config`, remove MUST-rule violations from resolver.py, consolidate duplicated I/O extraction logic, and add missing features (`--list-tasks`, local ONNX support). + +**Architecture:** The inspect command's resolver.py is refactored to reuse config's battle-tested `export/io.py:resolve_io_specs()` for I/O extraction (eliminating ~100 lines of duplicated code). The config command's default EP is changed from hardcoded `"qnn"` to hardware-detected. All 5 MUST-rule violations (D-1 through D-5) in resolver.py are eliminated by replacing hardcoded patterns with data-driven approaches. + +**Tech Stack:** Python 3.10+, pytest, click, rich, transformers, optimum, huggingface_hub + +**GitHub Issues Addressed:** #247 (MUST-rule violations), #412 (config device bug), #354 (partial — ONNX inspect groundwork) + +--- + +## File Structure + +| Action | File | Responsibility | +|--------|------|---------------| +| Modify | `modelkit/inspect/resolver.py` | Remove `_extract_tensor_specs_from_onnx_config()`, reuse `resolve_io_specs()`. Fix D-1..D-5. Fix processor resolution. | +| Modify | `modelkit/inspect/types.py` | Add `value_range` field to `TensorInfo`. Extend `IOConfigInfo` for `hidden_sizes`. | +| Modify | `modelkit/inspect/__init__.py` | Pass `model_id` through to resolver for image size resolution. | +| Modify | `modelkit/inspect/formatter.py` | Display value_range. Handle `hidden_sizes` list. | +| Modify | `modelkit/commands/inspect.py` | Add `--list-tasks` flag. | +| Modify | `modelkit/compiler/configs.py` | Change `EPConfig.provider` default from `"qnn"` to `None`. | +| Modify | `modelkit/config/build.py` | Always call `resolve_device()` to populate compile config. 
| +| Modify | `modelkit/config/precision.py` | Handle `compile_provider=None` in no-op path. | +| Create | `tests/inspect/test_resolver.py` | Unit tests for all resolver functions. | +| Modify | `tests/commands/test_inspect_cli.py` | Add `--list-tasks` CLI test. | +| Modify | `tests/commands/test_config_cli.py` | Add device detection test. | + +--- + +## Chunk 1: Fix I/O Extraction (Consolidation + Image Size Bug) + +### Task 1: Add `value_range` to TensorInfo and extend IOConfigInfo + +**Files:** +- Modify: `modelkit/inspect/types.py:18-26` (TensorInfo) +- Modify: `modelkit/inspect/types.py:92-108` (IOConfigInfo) +- Test: `tests/inspect/test_resolver.py` (new) + +- [ ] **Step 1: Write failing test for TensorInfo.value_range** + +```python +# tests/inspect/test_resolver.py +"""Tests for inspect resolver module.""" +from modelkit.inspect.types import TensorInfo, IOConfigInfo + + +class TestTensorInfo: + def test_value_range_field_exists(self): + t = TensorInfo(name="pixel_values", dtype="float32", value_range=(0.0, 1.0)) + assert t.value_range == (0.0, 1.0) + + def test_value_range_default_none(self): + t = TensorInfo(name="x") + assert t.value_range is None + + +class TestIOConfigInfo: + def test_hidden_sizes_field(self): + io = IOConfigInfo(hidden_sizes=[256, 512, 1024, 2048]) + assert io.hidden_sizes == [256, 512, 1024, 2048] + + def test_hidden_sizes_default_none(self): + io = IOConfigInfo() + assert io.hidden_sizes is None +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run pytest tests/inspect/test_resolver.py::TestTensorInfo -v` +Expected: FAIL — `TensorInfo.__init__() got an unexpected keyword argument 'value_range'` + +- [ ] **Step 3: Add value_range to TensorInfo and hidden_sizes to IOConfigInfo** + +In `modelkit/inspect/types.py`, add to `TensorInfo`: +```python +@dataclass +class TensorInfo: + """Information about a tensor.""" + name: str + dtype: str | None = None + shape: tuple[int, ...] 
| None = None + shape_desc: str | None = None + dynamic_axes: dict[int, str] | None = None + value_range: tuple[float, float] | None = None # ADD THIS +``` + +In `modelkit/inspect/types.py`, add to `IOConfigInfo`: +```python +@dataclass +class IOConfigInfo: + # ... existing fields ... + hidden_size: int | None = None + hidden_sizes: list[int] | None = None # ADD THIS (for ResNet-like models) +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `uv run pytest tests/inspect/test_resolver.py -v` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add modelkit/inspect/types.py tests/inspect/test_resolver.py +git commit -m "feat(inspect): add value_range to TensorInfo and hidden_sizes to IOConfigInfo" +``` + +### Task 2: Replace `_extract_tensor_specs_from_onnx_config` with `resolve_io_specs` + +This is the core consolidation. Inspect's duplicated I/O extraction (~100 lines) is replaced by calling config's battle-tested `export/io.py:resolve_io_specs()`. + +**Files:** +- Modify: `modelkit/inspect/resolver.py:175-277` (delete _extract_tensor_specs_from_onnx_config) +- Modify: `modelkit/inspect/resolver.py:280-384` (update resolve_exporter) +- Modify: `modelkit/inspect/__init__.py:67-195` (pass model_id) +- Test: `tests/inspect/test_resolver.py` + +- [ ] **Step 1: Write failing test for resolve_exporter with correct image size** + +```python +# Add to tests/inspect/test_resolver.py +from unittest.mock import patch, MagicMock +from modelkit.inspect.resolver import resolve_exporter + + +class TestResolveExporter: + def test_resnet_gets_224_not_64(self): + """Verify ResNet inspect shows 224x224, not Optimum's 64x64 fallback.""" + from transformers import AutoConfig + hf_config = AutoConfig.from_pretrained("microsoft/resnet-50") + info = resolve_exporter( + "resnet", "image-classification", + hf_config=hf_config, + model_id="microsoft/resnet-50", + ) + # Should have pixel_values input + assert len(info.input_tensors) > 0 + pv = info.input_tensors[0] + assert 
pv.name == "pixel_values" + # Shape should contain 224, NOT 64 + assert "224" in (pv.shape_desc or ""), ( + f"Expected 224 in shape_desc, got {pv.shape_desc}" + ) + + def test_resnet_input_has_value_range(self): + """Verify value_range is captured for vision models.""" + from transformers import AutoConfig + hf_config = AutoConfig.from_pretrained("microsoft/resnet-50") + info = resolve_exporter( + "resnet", "image-classification", + hf_config=hf_config, + model_id="microsoft/resnet-50", + ) + pv = info.input_tensors[0] + assert pv.value_range is not None, "value_range should be captured" + + def test_output_tensors_have_dtype(self): + """Verify output tensors get dtype from dummy forward pass.""" + from transformers import AutoConfig + hf_config = AutoConfig.from_pretrained("microsoft/resnet-50") + info = resolve_exporter( + "resnet", "image-classification", + hf_config=hf_config, + model_id="microsoft/resnet-50", + ) + # logits output should exist + assert len(info.output_tensors) > 0 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run pytest tests/inspect/test_resolver.py::TestResolveExporter::test_resnet_gets_224_not_64 -v` +Expected: FAIL — `resolve_exporter() got an unexpected keyword argument 'model_id'` (old signature) + +- [ ] **Step 3: Rewrite resolve_exporter to use resolve_io_specs** + +In `modelkit/inspect/resolver.py`: + +1. **Delete** the entire `_extract_tensor_specs_from_onnx_config()` function (lines 175-277). + +2. **Add** a new internal helper `_build_tensor_infos_from_io_specs()`: + +```python +def _build_tensor_infos_from_io_specs( + io_specs: dict, +) -> tuple[list[TensorInfo], list[TensorInfo]]: + """Convert resolve_io_specs() output to TensorInfo lists. 
+ + Args: + io_specs: Dict from export/io.py resolve_io_specs() + + Returns: + Tuple of (input_tensors, output_tensors) + """ + input_tensors: list[TensorInfo] = [] + output_tensors: list[TensorInfo] = [] + + input_names = io_specs.get("input_names", []) + input_shapes = io_specs.get("input_shapes", []) + input_dtypes = io_specs.get("input_dtypes", []) + inputs_axes = io_specs.get("inputs", {}) + value_ranges = io_specs.get("value_ranges", {}) + + for i, name in enumerate(input_names): + shape = input_shapes[i] if i < len(input_shapes) else None + dtype = input_dtypes[i] if i < len(input_dtypes) else None + axes = inputs_axes.get(name, {}) + vr = value_ranges.get(name) + + shape_desc = _shape_to_desc(shape, axes) if shape else None + + input_tensors.append( + TensorInfo( + name=name, + dtype=dtype, + shape=shape, + shape_desc=shape_desc, + dynamic_axes=dict(axes) if axes else None, + value_range=vr, + ) + ) + + output_names = io_specs.get("output_names", []) + outputs_axes = io_specs.get("outputs", {}) + + for name in output_names: + axes = outputs_axes.get(name, {}) + shape_desc = _shape_to_desc(None, axes) if axes else None + output_tensors.append( + TensorInfo( + name=name, + shape_desc=shape_desc, + dynamic_axes=dict(axes) if axes else None, + ) + ) + + return input_tensors, output_tensors + + +def _shape_to_desc( + shape: tuple | list | None, dynamic_axes: dict[int, str] +) -> str: + """Convert tensor shape to human-readable string with dynamic markers. + + When shape is None, lists the dynamic axis names in axis-index order. + Otherwise prints each dimension's value, with batch axes shown as "B". 
+ """ + if shape is None: + parts = [] + for _idx, axis_name in sorted(dynamic_axes.items()): + parts.append(axis_name) + return f"[{', '.join(parts)}]" if parts else "[]" + + parts = [] + for i, dim in enumerate(shape): + if i in dynamic_axes: + axis_name = dynamic_axes[i] + # Use the axis name for truly dynamic dims (batch), + # use actual value for spatial dims + if "batch" in axis_name.lower(): + parts.append("B") + else: + parts.append(str(dim)) + else: + parts.append(str(dim)) + return f"[{', '.join(parts)}]" +``` + +3. **Update** `resolve_exporter()` signature and TasksManager path: + +```python +def resolve_exporter( + model_type: str, + task: str, + hf_config: PretrainedConfig | None = None, + *, + model_id: str | None = None, +) -> ExporterInfo: + """Resolve exporter configuration for a model. + + Uses MODEL_BUILD_CONFIGS registry, then falls back to + export/io.py resolve_io_specs() for I/O extraction. + """ + model_type_normalized = model_type.lower().replace("_", "-") + + # Check MODEL_BUILD_CONFIGS for predefined config (unchanged) + if model_type_normalized in MODEL_BUILD_CONFIGS: + # ... existing MODEL_BUILD_CONFIGS path (unchanged) ... 
+ + # Check if TasksManager supports this model_type + try: + import optimum.exporters.onnx.model_configs # noqa: F401 + from optimum.exporters.tasks import TasksManager + + onnx_config_cls = TasksManager.get_exporter_config_constructor( + exporter="onnx", + model_type=model_type, + task=task, + library_name="transformers", + ) + if onnx_config_cls: + import functools + config_name = ( + onnx_config_cls.func.__name__ + if isinstance(onnx_config_cls, functools.partial) + else onnx_config_cls.__name__ + ) + + # NEW: Use resolve_io_specs instead of _extract_tensor_specs_from_onnx_config + input_tensors: list[TensorInfo] = [] + output_tensors: list[TensorInfo] = [] + + if hf_config is not None: + try: + from ..export.io import resolve_io_specs + + io_specs = resolve_io_specs( + model_type=model_type, + task=task, + hf_config=hf_config, + model_id=model_id, + ) + input_tensors, output_tensors = _build_tensor_infos_from_io_specs( + io_specs + ) + except Exception as e: + logger.debug("resolve_io_specs failed: %s", e) + + return ExporterInfo( + onnx_config_class=config_name, + onnx_config_source="TasksManager", + support_level=SupportLevel.DEFAULT, + input_tensors=input_tensors, + output_tensors=output_tensors, + opset_version=17, + ) + except Exception as e: + logger.debug("TasksManager lookup failed for %s/%s: %s", model_type, task, e) + + # Unsupported (unchanged) + return ExporterInfo(...) +``` + +4. 
**Update** `inspect_model()` in `__init__.py` to pass `model_id`: + +```python +# Step 4: Resolve exporter configuration (pass model_id for image size) +exporter_info = resolve_exporter(model_type, task, hf_config=hf_config, model_id=model_id) +``` + +- [ ] **Step 4: Run tests to verify** + +Run: `uv run pytest tests/inspect/test_resolver.py -v` +Expected: ALL PASS — ResNet shows 224, value_range captured + +- [ ] **Step 5: Run full test suite to check no regressions** + +Run: `uv run pytest tests/inspect/ tests/commands/test_inspect_cli.py -v` + +- [ ] **Step 6: Lint** + +Run: `uv run ruff check modelkit/inspect/ --fix` + +- [ ] **Step 7: Commit** + +```bash +git add modelkit/inspect/resolver.py modelkit/inspect/__init__.py tests/inspect/test_resolver.py +git commit -m "refactor(inspect): consolidate I/O extraction via resolve_io_specs + +Replaces _extract_tensor_specs_from_onnx_config (~100 lines) with +export/io.py resolve_io_specs(). Fixes 64x64 image size bug for +ResNet (now correctly reads preprocessor_config.json for 224x224). +Adds value_range capture to TensorInfo." 
+``` + +### Task 3: Fix IOConfigInfo to handle hidden_sizes and image_size from preprocessor + +**Files:** +- Modify: `modelkit/inspect/resolver.py:605-673` (resolve_io_config) +- Modify: `modelkit/inspect/formatter.py:81-134` (_output_io_config_table) +- Test: `tests/inspect/test_resolver.py` + +- [ ] **Step 1: Write failing test** + +```python +class TestResolveIOConfig: + def test_resnet_hidden_sizes(self): + """ResNet has hidden_sizes (list), not hidden_size (scalar).""" + from transformers import AutoConfig + from modelkit.inspect.resolver import resolve_io_config + + config = AutoConfig.from_pretrained("microsoft/resnet-50") + io = resolve_io_config(config) + assert io.hidden_sizes == [256, 512, 1024, 2048] + + def test_resnet_image_size_from_preprocessor(self): + """ResNet config lacks image_size; should get it from preprocessor.""" + from transformers import AutoConfig + from modelkit.inspect.resolver import resolve_io_config + + config = AutoConfig.from_pretrained("microsoft/resnet-50") + io = resolve_io_config(config, model_id="microsoft/resnet-50") + assert io.image_size == 224 or io.image_size == (224, 224) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Expected: FAIL — `resolve_io_config() got an unexpected keyword argument 'model_id'` + +- [ ] **Step 3: Implement fixes** + +Update `resolve_io_config()` in resolver.py: + +```python +def resolve_io_config( + config: PretrainedConfig, + *, + model_id: str | None = None, +) -> IOConfigInfo: + """Extract IO configuration from HuggingFace config. + + For vision models where image_size is missing from config (e.g., ResNet), + falls back to reading preprocessor_config.json via model_id. + """ + # ... existing get_config_attr helper (unchanged) ... 
+ + # Existing lookups (unchanged) + max_position_embeddings = get_config_attr("max_position_embeddings", ["text_config"]) + vocab_size = get_config_attr("vocab_size", ["text_config"]) + image_size = get_config_attr("image_size", ["vision_config"]) + patch_size = get_config_attr("patch_size", ["vision_config"]) + num_channels = get_config_attr("num_channels", ["vision_config"]) + sampling_rate = get_config_attr("sampling_rate", ["audio_config"]) + hidden_size = get_config_attr("hidden_size", ["text_config", "vision_config"]) + + # NEW: hidden_sizes (for ResNet-like models with per-stage hidden dims) + hidden_sizes = get_config_attr("hidden_sizes") + + # NEW: Fallback to preprocessor_config.json for image_size + if image_size is None and model_id is not None: + try: + from ..export.io import _populate_image_size_from_preprocessor + shape_kwargs: dict = {} + _populate_image_size_from_preprocessor(model_id, shape_kwargs) + if "height" in shape_kwargs: + h, w = shape_kwargs["height"], shape_kwargs["width"] + image_size = h if h == w else (h, w) + except Exception as e: + logger.debug("Failed to get image_size from preprocessor: %s", e) + + return IOConfigInfo( + max_position_embeddings=max_position_embeddings, + vocab_size=vocab_size, + image_size=image_size, + patch_size=patch_size, + num_channels=num_channels, + sampling_rate=sampling_rate, + hidden_size=hidden_size, + hidden_sizes=hidden_sizes, + ) +``` + +Update `inspect_model()` in `__init__.py`: +```python +# Step 10: Extract IO config from HF config +io_config_info = resolve_io_config(hf_config, model_id=model_id) +``` + +Update `_output_io_config_table()` in formatter.py to display hidden_sizes: +```python +# After the hidden_size block, add: +if io_config.hidden_sizes is not None: + sizes_str = " → ".join(str(s) for s in io_config.hidden_sizes) + io_table.add_row("Hidden Sizes", sizes_str) + has_content = True +``` + +Update `output_json()` in formatter.py to include hidden_sizes: +```python +# In the 
io_config dict: +"hidden_sizes": io_config.hidden_sizes, +``` + +- [ ] **Step 4: Run tests** + +Run: `uv run pytest tests/inspect/test_resolver.py -v` + +- [ ] **Step 5: Lint and commit** + +```bash +uv run ruff check modelkit/inspect/ --fix +git add modelkit/inspect/resolver.py modelkit/inspect/__init__.py modelkit/inspect/types.py modelkit/inspect/formatter.py tests/inspect/test_resolver.py +git commit -m "fix(inspect): resolve image_size from preprocessor, add hidden_sizes" +``` + +--- + +## Chunk 2: Fix Config Device Detection Bug (#412) + +### Task 4: Change EPConfig default from "qnn" to None + +**Files:** +- Modify: `modelkit/compiler/configs.py` (EPConfig.provider default) +- Modify: `modelkit/config/build.py` (always resolve device) +- Test: `tests/config/test_device_default.py` (new) + +- [ ] **Step 1: Write failing test** + +```python +# tests/config/test_device_default.py +"""Tests for config device detection defaults.""" + + +class TestConfigDeviceDefault: + def test_default_config_no_qnn_without_hardware(self): + """Config should not default to QNN without NPU hardware.""" + from modelkit.compiler.configs import EPConfig + ep = EPConfig() + # Default should be None (detect from hardware), not "qnn" + assert ep.provider is None or ep.provider != "qnn", ( + "EPConfig should not default to 'qnn' without hardware detection" + ) + + def test_generate_config_detects_device(self): + """generate_build_config should detect hardware even with device=auto.""" + from unittest.mock import patch + from modelkit.config import generate_build_config + + # Mock resolve_device to return cpu (no NPU). NOTE(review): this patch target needs a module-level `resolve_device` in build.py; if it is imported inside the function, patch `modelkit.sysinfo.resolve_device` instead — confirm. 
+ with patch("modelkit.config.build.resolve_device", return_value=("cpu", ["cpu"])): + config = generate_build_config("microsoft/resnet-50") + # compile config should NOT have qnn + if config.compile and config.compile.ep_config: + assert config.compile.ep_config.provider != "qnn" +``` + +- [ ] **Step 2: Run test to verify it fails** + 
+Expected: FAIL — `EPConfig().provider` is `"qnn"` + +- [ ] **Step 3: Fix EPConfig default** + +In `modelkit/compiler/configs.py`, change: +```python +@dataclass +class EPConfig: + provider: str | None = None # Changed from "qnn" — detected from hardware + # ... rest unchanged +``` + +- [ ] **Step 4: Update generate_build_config to always detect device** + +In `modelkit/config/build.py`, change the device detection block (around line 466-479): + +```python +# STEP 4.5: Apply device/precision policy (affects quant + compile only) +from .precision import resolve_precision +from ..sysinfo import resolve_device + +# ALWAYS detect hardware — don't skip when both are "auto" +resolved_device, available_devices = resolve_device(device=device) +logger.info( + "Device resolved: %s (available: %s)", + resolved_device, ", ".join(available_devices), +) + +policy = resolve_precision( + device=resolved_device, + precision=precision, + ep=ep, + available_devices=available_devices, + task=parent_config.loader.task, +) + +# Apply policy — always set compile provider from detected hardware +if policy.compile_provider is not None: + parent_config.compile = WinMLCompileConfig.for_provider( + policy.compile_provider, + ) + +if policy.weight_type is not None: + if parent_config.quant is None: + parent_config.quant = WinMLQuantizationConfig() + parent_config.quant.weight_type = policy.weight_type + parent_config.quant.activation_type = policy.activation_type +elif policy.device != "auto" and policy.weight_type is None: + parent_config.quant = None +``` + +- [ ] **Step 5: Update resolve_precision for auto+auto to still return device-based provider** + +In `modelkit/config/precision.py`, update the both-auto path: + +```python +# When both are auto, still return the hardware-detected provider +if device == "auto" and resolved_precision == "auto": + return PrecisionPolicy( + device="auto", + precision="auto", + weight_type=None, + activation_type=None, + compile_provider=None, # Let caller 
handle — device already resolved + ) +``` + +**Decision:** do not apply the `precision.py` change above; it is shown only as the alternative that was considered. The cleaner fix is in `build.py` only: always call `resolve_device()` and use the resolved device to set the compile provider when the policy's `compile_provider` is `None`. The precision module stays the same. + +- [ ] **Step 6: Run tests** + +Run: `uv run pytest tests/config/test_device_default.py -v` +Run: `uv run pytest tests/ -k "config" --ignore=tests/integration --ignore=tests/e2e -v` + +- [ ] **Step 7: Lint and commit** + +```bash +uv run ruff check modelkit/compiler/ modelkit/config/ --fix +git add modelkit/compiler/configs.py modelkit/config/build.py tests/config/test_device_default.py +git commit -m "fix(config): detect hardware instead of defaulting to QNN (#412) + +EPConfig.provider now defaults to None instead of 'qnn'. +generate_build_config() always calls resolve_device() to detect +available hardware, even when device='auto'." +``` + +--- + +## Chunk 3: Fix MUST-Rule Violations (#247) + +### Task 5: Fix D-1 — Remove hardcoded dtype inference from name patterns + +The current `infer_dtype()` hardcodes `"ids"→int64`, `"mask"→int64`. With the consolidation in Task 2, this function is deleted (resolve_io_specs captures actual dtypes from dummy inputs). **This is already fixed by Task 2.** Verify only. 
+ +- [ ] **Step 1: Write verification test** + +```python +class TestNoHardcodedDtypeInference: + def test_bert_dtypes_from_dummy_inputs(self): + """Dtypes should come from actual tensors, not name pattern matching.""" + from transformers import AutoConfig + from modelkit.inspect.resolver import resolve_exporter + + hf_config = AutoConfig.from_pretrained("google-bert/bert-base-uncased") + info = resolve_exporter("bert", "fill-mask", hf_config=hf_config) + for t in info.input_tensors: + assert t.dtype is not None, f"Tensor {t.name} missing dtype" +``` + +- [ ] **Step 2: Run test — should PASS (already fixed by Task 2)** + +Run: `uv run pytest tests/inspect/test_resolver.py::TestNoHardcodedDtypeInference -v` + +- [ ] **Step 3: Commit verification test** + +```bash +git add tests/inspect/test_resolver.py +git commit -m "test(inspect): verify D-1 fix — dtypes from dummy inputs, not name patterns" +``` + +### Task 6: Fix D-2 — Remove hardcoded nested config names + +`resolve_io_config()` hardcodes `["text_config"]`, `["vision_config"]`, `["audio_config"]`. Replace with dynamic discovery of all nested `PretrainedConfig` objects. 
+ +**Files:** +- Modify: `modelkit/inspect/resolver.py:605-673` +- Test: `tests/inspect/test_resolver.py` + +- [ ] **Step 1: Write failing test** + +```python +class TestNoHardcodedNestedConfigs: + def test_discovers_nested_configs_dynamically(self): + """Should find nested configs without hardcoding names.""" + from transformers import AutoConfig + from modelkit.inspect.resolver import resolve_io_config + + # CLIP has text_config and vision_config + config = AutoConfig.from_pretrained("openai/clip-vit-base-patch32") + io = resolve_io_config(config) + # Should find vocab_size from text_config + assert io.vocab_size is not None + # Should find image_size from vision_config + assert io.image_size is not None +``` + +- [ ] **Step 2: Implement dynamic nested config discovery** + +Replace the hardcoded nested config names with: + +```python +def _find_nested_configs(config: PretrainedConfig) -> list[PretrainedConfig]: + """Discover all nested PretrainedConfig objects dynamically.""" + from transformers import PretrainedConfig as _PC + nested = [] + for attr_name in dir(config): + if attr_name.startswith("_"): + continue + try: + val = getattr(config, attr_name) + if isinstance(val, _PC): + nested.append(val) + except Exception: + continue + return nested +``` + +Then update `get_config_attr` to use this instead of hardcoded names: + +```python +def get_config_attr(attr_name: str) -> int | tuple[int, int] | list | None: + value = getattr(config, attr_name, None) + if value is not None: + return value + for nested in nested_configs: + value = getattr(nested, attr_name, None) + if value is not None: + return value + return None + +nested_configs = _find_nested_configs(config) +``` + +- [ ] **Step 3: Run tests** +- [ ] **Step 4: Commit** + +```bash +git commit -m "fix(inspect): D-2 — dynamic nested config discovery, no hardcoded names" +``` + +### Task 7: Fix D-3 — Remove hardcoded axis abbreviations from shape_to_desc + +Already partially fixed in Task 2's 
`_shape_to_desc()` — it only hardcodes "B" for batch. The old version hardcoded "S" for sequence, special-cased "height"/"width". The new version uses axis names directly from OnnxConfig. + +- [ ] **Step 1: Write verification test** + +```python +class TestShapeToDesc: + def test_uses_axis_names_not_abbreviations(self): + """shape_to_desc should not hardcode 'S' for sequence.""" + from modelkit.inspect.resolver import _shape_to_desc + + # Dynamic axes with sequence + axes = {0: "batch_size", 1: "sequence_length"} + desc = _shape_to_desc((1, 128), axes) + assert desc == "[B, 128]" + # batch → B, sequence uses actual value +``` + +- [ ] **Step 2: Run test — should PASS** +- [ ] **Step 3: Commit** + +### Task 8: Fix D-4 — Remove hardcoded JSON keys from processor resolution + +`_resolve_processor_from_hub_configs()` hardcodes JSON keys like `"processor_class"`, `"image_processor_type"`, `"feature_extractor_type"`, `"tokenizer_class"`. These are standard HF config keys — they are part of HF's API contract, NOT model-specific hardcoding. **D-4 is a false positive** — these keys are universal HF conventions, not model-specific patterns. + +However, the processor resolution has a real bug: ResNet shows `ConvNextImageProcessorFast`. This is because `preprocessor_config.json` says `"feature_extractor_type": "ConvNextFeatureExtractor"` and `AutoProcessor` returns `ConvNextImageProcessorFast`. **This is actually correct HF behavior** — ResNet's processor config on HuggingFace Hub genuinely points to ConvNext processors (they share the same preprocessing pipeline). + +- [ ] **Step 1: Document D-4 as false positive** + +Add a comment in resolver.py explaining this is universal HF API, not model-specific: + +```python +# NOTE: These JSON keys (processor_class, image_processor_type, etc.) are +# standard HuggingFace config conventions, not model-specific hardcoding. 
+# See: https://huggingface.co/docs/transformers/preprocessing +``` + +- [ ] **Step 2: Add model_type to ProcessorInfo for transparency** + +The real improvement is showing WHERE the processor class comes from, so users understand why ResNet says ConvNext. This is a formatter improvement, not a logic fix. + +- [ ] **Step 3: Commit** + +### Task 9: Fix D-5 — Remove modality assumptions in attribute grouping + +`resolve_io_config()` groups attributes by modality (text, vision, audio). After Task 6's dynamic discovery, the grouping is implicit — `get_config_attr()` searches all nested configs without assuming modality. **D-5 is resolved by Task 6.** + +- [ ] **Step 1: Verify with test** +- [ ] **Step 2: Commit verification** + +--- + +## Chunk 4: Missing Features (M-1, B-1) + +### Task 10: Add --list-tasks flag (M-1) + +**Files:** +- Modify: `modelkit/commands/inspect.py` +- Modify: `modelkit/inspect/resolver.py` (expose _get_known_tasks) +- Test: `tests/commands/test_inspect_cli.py` + +- [ ] **Step 1: Write failing test** + +```python +class TestListTasksFlag: + def test_list_tasks_outputs_tasks(self, runner): + from modelkit.commands.inspect import inspect + result = runner.invoke(inspect, ["--list-tasks"], obj={}) + assert result.exit_code == 0 + assert "image-classification" in result.output + assert "fill-mask" in result.output +``` + +- [ ] **Step 2: Add --list-tasks flag to CLI** + +```python +@click.option( + "--list-tasks", + is_flag=True, + default=False, + help="List all known tasks and exit", +) +``` + +In the command body, before the main logic: +```python +if list_tasks: + from ..inspect.resolver import get_known_tasks + tasks = sorted(get_known_tasks()) + for t in tasks: + click.echo(t) + return +``` + +Rename `_get_known_tasks` to `get_known_tasks` (make public). 
+ +- [ ] **Step 3: Run tests** +- [ ] **Step 4: Commit** + +```bash +git commit -m "feat(inspect): add --list-tasks flag (M-1 from #247)" +``` + +### Task 11: Support local ONNX file input (B-1 groundwork) + +This is groundwork for #354 — full ONNX inspect is a separate feature. For now, add basic detection and a helpful error message. + +**Files:** +- Modify: `modelkit/commands/inspect.py` +- Test: `tests/commands/test_inspect_cli.py` + +- [ ] **Step 1: Write test** + +```python +class TestOnnxInput: + def test_onnx_file_gives_helpful_message(self, runner, tmp_path): + from modelkit.commands.inspect import inspect + onnx_file = tmp_path / "model.onnx" + onnx_file.write_bytes(b"fake") + result = runner.invoke(inspect, ["-m", str(onnx_file)], obj={}) + assert "ONNX" in result.output +``` + +- [ ] **Step 2: Add ONNX detection** + +```python +# Before the main try block: +if model.endswith(".onnx") and Path(model).exists(): + raise click.ClickException( + "ONNX file inspection is not yet supported. " + "Use 'wmk config -m model.onnx' for ONNX build config. " + "See issue #354 for progress on ONNX inspect." + ) +``` + +- [ ] **Step 3: Commit** + +```bash +git commit -m "feat(inspect): B-1 groundwork — helpful message for ONNX file input" +``` + +--- + +## Chunk 5: Processor Bug Investigation & Fix + +### Task 12: Investigate and fix processor identification + +ResNet-50 showing `ConvNextImageProcessorFast` is actually **correct HF behavior** — the Hub repo genuinely has `ConvNextFeatureExtractor` in its config. However, inspect should be more transparent about this. 
+ +**Files:** +- Modify: `modelkit/inspect/formatter.py` (_output_processor_table) + +- [ ] **Step 1: Add source attribution to processor display** + +Show the source of each processor class so users understand WHY: + +```python +# In _output_processor_table, for each processor class: +# Show: "ConvNextImageProcessorFast (from preprocessor_config.json)" +``` + +This requires passing source info through ProcessorInfo. Add optional source fields: + +```python +@dataclass +class ProcessorInfo: + processor_class: str | None = None + tokenizer_class: str | None = None + image_processor_class: str | None = None + feature_extractor_class: str | None = None + # Source tracking + processor_source: str | None = None # "hub_config" | "auto_class" + image_processor_source: str | None = None +``` + +- [ ] **Step 2: Update resolver to track sources** +- [ ] **Step 3: Update formatter to display sources** +- [ ] **Step 4: Test and commit** + +```bash +git commit -m "fix(inspect): add source attribution to processor identification" +``` + +--- + +## Chunk 6: Final Verification & Cleanup + +### Task 13: End-to-end verification + +- [ ] **Step 1: Run `wmk inspect -m microsoft/resnet-50` and verify output** + +Expected changes: +- Input shape: `[B, 3, 224, 224]` (was 64x64) +- Output logits: should show dtype +- IO Config: shows Image Size 224, Hidden Sizes 256→512→1024→2048, Channels 3 +- Value range shown for pixel_values + +- [ ] **Step 2: Run `wmk config -m microsoft/resnet-50` and verify output** + +Expected changes: +- `execution_provider` should reflect actual hardware (not hardcoded "qnn") + +- [ ] **Step 3: Run `wmk inspect --list-tasks` and verify output** + +- [ ] **Step 4: Run full test suite** + +Run: `uv run pytest tests/inspect/ tests/commands/test_inspect_cli.py tests/commands/test_config_cli.py -v` + +- [ ] **Step 5: Lint all modified files** + +Run: `uv run ruff check modelkit/inspect/ modelkit/compiler/ modelkit/config/ --fix` + +- [ ] **Step 6: Final commit with 
any remaining fixes** diff --git a/docs/design/inspect/2026-03-21-inspect-consolidation-design.md b/docs/design/inspect/2026-03-21-inspect-consolidation-design.md new file mode 100644 index 000000000..8d58b3aa9 --- /dev/null +++ b/docs/design/inspect/2026-03-21-inspect-consolidation-design.md @@ -0,0 +1,259 @@ +# Inspect Command Consolidation Design + +> **Status:** Draft — pending user approval before implementation +> **Prerequisite:** Phase 1 changes (I/O extraction fix, device detection, dynamic IO config) — done in this session +> **GitHub Issues:** #247 (MUST-rule violations), #412 (config device bug), #354 (ONNX inspect groundwork) + +## Problem Statement + +`inspect/resolver.py` contains thin wrapper functions that re-call existing module APIs +with different names. Five functions (`detect_task`, `validate_task`, `resolve_loader`, +`resolve_exporter`, `get_build_config`) are <10 lines of real logic — they call +`loader/task.py` or `export/io.py` functions and wrap results with source labels. + +Meanwhile, `loader/config.py:resolve_loader_config()` already does Steps 1-3 (load HF config, +detect task, resolve model class) in a single call that inspect doesn't use. + +## What Inspect Currently Does (10 steps) + +``` +inspect_model(model_id) + 1. AutoConfig.from_pretrained() ← duplicates resolve_loader_config Step 1 + 2. detect_task() ← wraps loader/task._detect_task_from_config() + 3. resolve_loader() ← wraps loader/task._get_custom_model_class() + 4. resolve_exporter() ← wraps export/io.resolve_io_specs() + 5. resolve_winml() ← reads models/winml dicts + 6. compile_support_status() ← aggregates support levels + 7. get_build_config() ← MODEL_BUILD_CONFIGS.get() + 8. resolve_cache() ← cache module + 9. resolve_processor() ← HF hub + Auto classes + 10. resolve_io_config() ← dynamic OnnxConfig attr discovery +``` + +## Target Architecture + +``` +inspect_model(model_id, task, model_type, model_class) + 1. resolve_loader_config() ← SHARED with config command + 2. 
resolve_io_specs() ← SHARED with config command + 3. get_winml_class() ← from models/winml (existing) + 4. resolve_processor() ← inspect-only (stays) + 5. resolve_io_config() ← inspect-only (stays) + 6. resolve_cache() ← inspect-only (stays) + 7. Derive display metadata ← source labels, support levels + 8. Format and display +``` + +## Profiling (why not call generate_build_config directly) + +| Step | Time | Inspect needs? | +|------|------|----------------| +| resolve_loader_config() | 8.55s | Yes | +| MODEL_BUILD_CONFIGS.get() | 0.00s | Yes | +| _resolve_export_config_from_specs() | 0.25s | No — inspect calls resolve_io_specs() directly | +| _assemble_config() | 0.00s | No | +| resolve_device() | **4.83s** | **No — pure overhead** | +| resolve_precision() | 0.00s | No | + +Inspect needs **loader config + I/O specs**, not the full build config. +Calling `generate_build_config()` adds 4.83s of device detection overhead for zero benefit. + +## Functions to DELETE from resolver.py + +| Function | Lines | Why delete | +|----------|-------|------------| +| `detect_task()` | 101-130 | `resolve_loader_config()` detects task | +| `validate_task()` | 84-98 | Inline validation into `inspect_model()` or CLI | +| `resolve_loader()` | 133-172 | `resolve_loader_config()` returns model_class | +| `resolve_exporter()` | 271-393 | Call `resolve_io_specs()` directly + `_build_tensor_infos_from_io_specs()` | +| `get_build_config()` | 481-496 | Inline `MODEL_BUILD_CONFIGS.get()` | + +**~250 lines deleted.** + +## Functions to KEEP in resolver.py + +| Function | Lines | Why keep | +|----------|-------|----------| +| `get_known_tasks()` | 55-81 | Aggregates 3 sources for --list-tasks UI | +| `_shape_to_desc()` | 175-208 | Display helper | +| `_build_tensor_infos_from_io_specs()` | 211-268 | Converts export types → inspect display types | +| `resolve_winml()` | 396-435 | Reads models/winml dicts, returns display metadata | +| `compile_support_status()` | 438-478 | Aggregates support 
levels for display | +| `resolve_cache()` | 499-611 | Inspect-only, manifest + filename scanning | +| `_find_nested_configs()` | 614-639 | Dynamic nested config discovery | +| `_discover_io_attrs_from_onnx_config()` | 642-697 | Dynamic IO attr discovery from NormalizedConfig | +| `resolve_io_config()` | 700-807 | Model config attrs for display | +| `resolve_processor()` + helpers | 810-1063 | 3-strategy processor resolution | + +### Note on resolve_winml() + +`get_winml_class()` in `models/winml/` does the same 3-level lookup but returns the +actual class type for instantiation. `resolve_winml()` returns `WinMLInfo` (class name +string + source label + support level) for display. Different return types, same dicts. + +**Future consideration:** Could call `get_winml_class()` and derive metadata post-hoc, +but current implementation is straightforward and not a duplication of logic — it's a +different consumption of the same data. + +## Changes to inspect_model() + +### Before (current) +```python +def inspect_model(model_id, include_hierarchy=False, task_override=None): + hf_config = AutoConfig.from_pretrained(model_id) + model_type = hf_config.model_type + task, task_source = detect_task(hf_config) + loader_info = resolve_loader(model_type, task) + exporter_info = resolve_exporter(model_type, task, hf_config, model_id=model_id) + ... 
+``` + +### After (proposed) +```python +def inspect_model( + model_id=None, include_hierarchy=False, task_override=None, + model_type=None, model_class=None, +): + # Step 1: Shared loader resolution (same as config command) + loader_config, hf_config, resolved_class = resolve_loader_config( + model_id, task=task_override, model_type=model_type, model_class=model_class, + ) + model_type = loader_config.model_type + task = loader_config.task + + # Step 2: I/O specs via shared path + io_specs = resolve_io_specs(model_type, task, hf_config, model_id=model_id) + input_tensors, output_tensors = _build_tensor_infos_from_io_specs(io_specs) + + # Step 3: Derive display metadata (source labels, support levels) + loader_info = _derive_loader_info(model_type, task, loader_config) + exporter_info = _derive_exporter_info(model_type, task, input_tensors, output_tensors) + + # Step 4-8: Inspect-only enrichment (unchanged) + winml_info = resolve_winml(model_type, task) + processor_info = resolve_processor(model_id, model_type=model_type) + io_config_info = resolve_io_config(hf_config, model_id=model_id, model_type=model_type, task=task) + cache_info = resolve_cache(model_id) + ... 
+``` + +### Display metadata derivation + +```python +def _derive_loader_info(model_type, task, loader_config): + """Derive LoaderInfo display metadata from resolve_loader_config results.""" + mt = model_type.lower().replace("_", "-") + if (mt, task) in HF_MODEL_CLASS_MAPPING: + source, level = "MODEL_CLASS_MAPPING", SupportLevel.SUPPORTED + elif task in HF_TASK_DEFAULTS: + source, level = "HF_TASK_DEFAULTS", SupportLevel.DEFAULT + else: + source, level = "TasksManager", SupportLevel.DEFAULT + return LoaderInfo( + hf_model_class=loader_config.model_class or "Auto (TasksManager)", + hf_model_class_source=source, + support_level=level, + ) + +def _derive_task_source(model_type, task): + """Derive task detection source label for display.""" + mt = model_type.lower().replace("_", "-") + if (mt, task) in HF_MODEL_CLASS_MAPPING: + return "HF_MODEL_CLASS_MAPPING" + return "TasksManager" +``` + +## Known Output Changes + +### Loader class display string +**Before:** `"Auto (TasksManager)"` for models without explicit registry entry +**After:** `"AutoModelForImageClassification"` (actual class name from resolve_loader_config) + +This is intentionally more informative — users see the actual class that will be used. +If the old string must be preserved, add: `if source == "TasksManager": display = "Auto (TasksManager)"`. + +## CLI Changes + +Extend `wmk inspect` to match `wmk config` flags: + +``` +wmk inspect -m microsoft/resnet-50 # existing +wmk inspect --model-type bert # NEW: model_type without model_id +wmk inspect --model-type bert --task fill-mask # NEW: model_type + task +wmk inspect -m custom-model --model-class BertForCTC # NEW: model_class override +``` + +**Note:** When `model_id` is None (e.g., `--model-type` only), `resolve_cache()` and +`resolve_processor()` must be skipped since they require a model_id. These sections +will show as empty in the output — same as how `wmk config --model-type bert` works +without a model_id. 
+ +## Error Handling + +Current `inspect_model()` wraps `AutoConfig.from_pretrained()` with `ModelNotFoundError` +and `NetworkError`. After switching to `resolve_loader_config()`, which raises generic +`ValueError`, inspect must catch and re-wrap: + +```python +try: + loader_config, hf_config, resolved_class = resolve_loader_config(...) +except ValueError as e: + if "not found" in str(e).lower() or "404" in str(e): + raise ModelNotFoundError(str(e)) from e + raise InspectError(str(e)) from e +except OSError as e: + raise NetworkError(str(e)) from e +``` + +## Multimodal Sub-Config Note + +After `resolve_loader_config()`, `hf_config` may be a sub-config (e.g., `CLIPTextConfig` +instead of `CLIPConfig`) and `model_type` may be the sub-model type (e.g., `clip_text_model`). +This is correct for I/O spec resolution — `resolve_io_specs()` needs the narrowed config. +But `resolve_io_config()` (which shows model-level attrs like vocab_size) should receive the +**parent config** for multimodal models. Implementation must preserve the parent config before +calling `resolve_loader_config()` if the parent is needed downstream. + +## Dependency on Phase 1 (this session's changes) + +This design builds on the Phase 1 changes already implemented: + +- [x] `resolve_io_specs()` consolidation (deleted `_extract_tensor_specs_from_onnx_config`) +- [x] Dynamic IO config discovery from NormalizedConfig +- [x] Dynamic nested config discovery (`_find_nested_configs`) +- [x] Processor source attribution + HF registry lookup +- [x] `--list-tasks` flag +- [x] ONNX file detection +- [x] Config device detection fix (#412) + +## Implementation Order + +1. **Add `--model-type` and `--model-class` to inspect CLI** +2. **Replace Steps 1-3 in `inspect_model()` with `resolve_loader_config()` call** + - Wrap with `ModelNotFoundError`/`NetworkError` + - Derive `task_source` via `_derive_task_source()` +3. 
**Handle MODEL_BUILD_CONFIGS registry path:** + - Check `MODEL_BUILD_CONFIGS.get(model_type)` before calling `resolve_io_specs()` + - If registered config has `input_tensors`, build `TensorInfo` from those directly + - Otherwise fall through to `resolve_io_specs()` +4. **Replace `resolve_exporter()` with direct `resolve_io_specs()` + `_build_tensor_infos_from_io_specs()`** +5. **Add `_derive_loader_info()`, `_derive_task_source()`, `_derive_exporter_info()` helpers** +6. **Make `resolve_cache()` and `resolve_processor()` conditional on `model_id is not None`** +7. **Delete `detect_task()`, `validate_task()`, `resolve_loader()`, `resolve_exporter()`, `get_build_config()`** +8. **Update tests** +9. **Verify `wmk inspect -m microsoft/resnet-50` output — expect loader class string change** + +## Resolved Questions + +### Q: MODEL_BUILD_CONFIGS registry path +**Answer:** Concrete step added (Step 3). Check registry first. If registered config has +`input_tensors`, use them. Otherwise call `resolve_io_specs()`. Same priority as +`generate_build_config()` Step 2-3. + +### Q: Should resolve_loader_config return source metadata? +**Answer:** No. Source labels are a display concern. Inspect derives them post-hoc by +checking `HF_MODEL_CLASS_MAPPING` and `HF_TASK_DEFAULTS`. Config doesn't need this. + +### Q: task_source required field +**Answer:** `_derive_task_source()` added. Checks same dicts as old `detect_task()` +to produce the source label. diff --git a/docs/design/logging/1_research.md b/docs/design/logging/1_research.md new file mode 100644 index 000000000..f96ddfae7 --- /dev/null +++ b/docs/design/logging/1_research.md @@ -0,0 +1,639 @@ +# Logging System Design Research + +**Module**: `modelkit` CLI (`wmk`) +**Date**: 2026-03-16 +**Status**: Research + +--- + +## 1. Problem Statement + +The current `wmk` CLI uses a binary `--debug` flag that toggles between `INFO` and `DEBUG`. 
This is non-standard, lacks granularity, and doesn't follow established Python CLI conventions. We need a proper verbosity system that: + +- Follows POSIX and Python CLI conventions (`-v`, `-vv`, `-q`) +- Integrates cleanly with Python's `logging` module +- Works well with Click +- Scales across all `modelkit` submodules + +### Current State (cli.py) + +```python +@click.option("--debug", is_flag=True, default=False, help="Enable debug logging") +def main(ctx, debug): + log_level = logging.DEBUG if debug else logging.INFO + logging.basicConfig( + level=log_level, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) +``` + +**Problems**: Default is `INFO` (too noisy for normal use), no `-v`/`-q` convention, no intermediate verbosity, debug format used at all levels. + +--- + +## 2. Standard Verbosity Conventions + +### 2.1 De Facto Standard Mapping + +The universally accepted convention for CLI tools maps repeated `-v` flags to Python logging levels: + +| Flags | Verbosity | Logging Level | Numeric | Purpose | +|-------|-----------|---------------|---------|---------| +| `-q` / `--quiet` | -1 | `ERROR` | 40 | Errors only | +| *(default)* | 0 | `WARNING` | 30 | Warnings and errors | +| `-v` | 1 | `INFO` | 20 | Informational progress | +| `-vv` | 2 | `DEBUG` | 10 | Full debug output | + +**The formula**: `log_level = 30 - (verbosity * 10)`, clamped to `[10, 40]`. + +This is derived from Python's logging level numbering (DEBUG=10, INFO=20, WARNING=30, ERROR=40, CRITICAL=50) and the fact that they are spaced exactly 10 apart. + +### 2.2 Default Level: WARNING, Not INFO + +The default (no flags) should be `WARNING`. This is the Python `logging` module's own default and the POSIX expectation: a well-behaved CLI should be silent on success, reporting only warnings and errors. Users who want progress feedback explicitly opt in with `-v`. + +### 2.3 Extended Verbosity (Optional) + +Some tools support three or more `-v` levels. 
Two approaches: + +**Approach A: Custom VERBOSE level (pip)** +pip defines a custom `VERBOSE = 15` level between DEBUG and INFO: + +| Flags | Level | +|-------|-------| +| *(default)* | WARNING (30) | +| `-v` | VERBOSE (15) | +| `-vv` | DEBUG (10) | + +**Approach B: Ansible's 5-level system** +Ansible supports `-v` through `-vvvvv`, each level revealing more detail (task results, input params, connection details, SSH protocol dumps). + +**Recommendation**: For ModelKit, the standard 4-level system (`-q`, default, `-v`, `-vv`) is sufficient. Two verbose levels plus a quiet mode covers all practical needs. Avoid custom log levels unless there is a demonstrated need. + +--- + +## 3. How Popular Python CLIs Handle This + +### 3.1 pip + +- **Flags**: `-v` (additive, up to 3x), `-q` (additive, up to 3x) +- **Levels**: Custom `VERBOSE=15` between DEBUG and INFO +- `-v` shows subprocess output; `-vv` shows full DEBUG; `-vvv` same as `-vv` (capped) +- `-q` reduces to ERROR; `-qq` to CRITICAL; `-qqq` to silent +- **Takeaway**: Additive quiet is a nice touch but adds complexity. Custom levels are worth it only at pip's scale. + +### 3.2 Ruff + +- **Flags**: `--verbose` / `-v`, `--quiet` / `-q`, `--silent` / `-s` +- Three explicit tiers: verbose, quiet (diagnostics only), silent (no output) +- No counting/additive flags +- **Takeaway**: Simple three-tier approach. Clean for tools where "how much debug info" is less important than "show diagnostics or not." + +### 3.3 Ansible + +- **Flags**: `-v` through `-vvvvv` (5 levels) +- Level 0: task names/status; Level 1: return values; Level 2: input params; Level 3: connection details; Level 4: SSH protocol dumps +- **Takeaway**: Fine-grained verbosity makes sense for complex orchestration tools. Overkill for most CLIs. 
+ +### 3.4 HTTPie + +- **Flag**: `--verbose` / `-v` (boolean, not counting) +- Toggles display of request headers/body alongside response +- Not a logging-level control but a content-display toggle +- **Takeaway**: Some tools use `-v` as a feature toggle rather than log-level control. Keep these concepts separate. + +### 3.5 pytest + +- **Flags**: `-v` (additive), `-q` (additive), `--no-header` +- `-v` increases test output detail; `-vv` shows full assertion diffs +- Separate `--log-cli-level` for actual Python logging control +- **Takeaway**: Distinguishes between "output verbosity" and "log level." Worth considering but adds complexity. + +### 3.6 uvicorn + +- **Flag**: `--log-level` with explicit choices (critical, error, warning, info, debug, trace) +- No `-v`/`-q` shorthand +- Adds custom `TRACE` level below DEBUG +- **Takeaway**: Explicit `--log-level` is useful for server-type tools where users need precise control. + +### Summary Table + +| Tool | `-v` counting | `-q` flag | Custom levels | Default | +|------|:---:|:---:|:---:|---------| +| pip | Yes (3x) | Yes (3x) | VERBOSE=15 | WARNING | +| ruff | No (boolean) | Yes | No | WARNING | +| ansible | Yes (5x) | No | No | Level 0 | +| httpie | No (boolean) | No | No | Normal | +| pytest | Yes (2x) | Yes (2x) | No | WARNING | +| uvicorn | No | No | TRACE | INFO | + +--- + +## 4. 
Quiet Flag Design + +### 4.1 `-q` and `-v` Interaction + +The cleanest pattern shares a single `verbosity` destination: + +```python +# argparse version (conceptual) +parser.add_argument('-v', '--verbose', action='count', default=0, dest='verbosity') +parser.add_argument('-q', '--quiet', action='store_const', const=-1, dest='verbosity') +``` + +With Click, this requires a small amount of extra logic in the command body since `count=True` doesn't natively support negative values: + +```python +@click.option('-v', '--verbose', count=True, help="Increase verbosity (-v for info, -vv for debug)") +@click.option('-q', '--quiet', is_flag=True, help="Suppress all output except errors") +def main(verbose, quiet): + if quiet: + verbosity = -1 + else: + verbosity = verbose +``` + +### 4.2 Mutual Exclusivity + +`-v` and `-q` should be mutually exclusive. If both are passed, there are three possible strategies: + +1. **Last wins** (complex to implement in Click) +2. **`-q` always wins** (simpler, safer -- errors should never be suppressed by accident) +3. **Error out** (strictest, most explicit) + +**Recommendation**: `-q` overrides `-v`. Simple, predictable, safe. + +--- + +## 5. Python Logging Best Practices + +### 5.1 Module-Level Loggers + +Every module should create its own logger: + +```python +# modelkit/export/io.py +import logging + +logger = logging.getLogger(__name__) +# __name__ == "modelkit.export.io" +``` + +**Why**: The logging module builds a hierarchy using dot notation. `getLogger("modelkit.export.io")` is a child of `"modelkit.export"`, which is a child of `"modelkit"`. Setting the level on `"modelkit"` propagates to all children. This is the single most important logging pattern. 
+ +**Rules**: +- Use `logger = logging.getLogger(__name__)` at module top level +- Never call `logging.basicConfig()` in library/module code -- only in the CLI entry point +- Never use `print()` for diagnostic output; use `logger.info()` / `logger.debug()` +- Use lazy formatting: `logger.debug("Loading %s", model_name)` not `logger.debug(f"Loading {model_name}")` + +### 5.2 Configure Only at Entry Point + +All logging configuration happens once, in the CLI entry point (`cli.py`): + +```python +def _configure_logging(verbosity: int) -> None: + """Configure root logger based on CLI verbosity.""" + base_level = logging.WARNING # 30 + level = max(logging.DEBUG, base_level - (verbosity * 10)) + + logging.basicConfig( + level=level, + format="%(levelname)s: %(message)s", # Simple for WARNING default + stream=sys.stderr, + ) +``` + +### 5.3 Lazy String Formatting + +```python +# GOOD - string formatting only happens if the message will be emitted +logger.debug("Exported %d nodes from %s", count, model_path) + +# BAD - f-string is always evaluated, even at WARNING level +logger.debug(f"Exported {count} nodes from {model_path}") + +# BAD - .format() is always evaluated +logger.debug("Exported {} nodes from {}".format(count, model_path)) +``` + +This matters for debug messages in hot paths where the formatting cost is non-trivial (e.g., stringifying large objects). + +--- + +## 6. stderr vs stdout + +### 6.1 The Rule + +| Stream | Content | +|--------|---------| +| **stdout** | Program output (data, results, generated configs) | +| **stderr** | Everything else (logs, progress, warnings, errors) | + +**Why**: Users pipe stdout to files or other programs. 
Mixing logs into stdout breaks pipelines: + +```bash +# This must work cleanly: +wmk config --model bert-base-uncased > config.yaml + +# Logs go to stderr, config YAML goes to stdout +# User can redirect independently: +wmk export --model bert 2>export.log +``` + +### 6.2 Implementation + +Python's `logging.basicConfig()` defaults to `stderr` -- no configuration needed. But be explicit: + +```python +logging.basicConfig( + level=level, + format=fmt, + stream=sys.stderr, # Explicit is better than implicit +) +``` + +`click.echo()` (used for program output) writes to stdout by default. Use `click.echo(..., err=True)` for diagnostic messages outside the logging system. + +--- + +## 7. Log Format Best Practices + +### 7.1 Format by Level + +Different verbosity levels warrant different formats: + +```python +def _configure_logging(verbosity: int) -> None: + level = max(logging.DEBUG, logging.WARNING - (verbosity * 10)) + + if verbosity >= 2: # DEBUG + fmt = "%(asctime)s %(name)s %(levelname)s %(message)s" + datefmt = "%H:%M:%S" + elif verbosity >= 1: # INFO + fmt = "%(levelname)s: %(message)s" + datefmt = None + else: # WARNING (default) and QUIET (ERROR) + fmt = "%(levelname)s: %(message)s" + datefmt = None + + logging.basicConfig(level=level, format=fmt, datefmt=datefmt, stream=sys.stderr) +``` + +**Rationale**: +- At WARNING/ERROR: Users want terse output. No timestamps, no module names. +- At INFO: Level prefix helps distinguish info from warnings. +- At DEBUG: Full context (timestamp, module name) is essential for diagnosis. + +### 7.2 Format Considerations + +- **Timestamps**: Only at DEBUG level. Users running `-vv` are diagnosing timing issues. +- **Module names**: Only at DEBUG level. `modelkit.export.io` tells developers where the message originates. +- **Level names**: Always include. Distinguishes warnings from info from errors. +- **Color**: Consider using `click.style()` or a custom formatter for colored level names in terminal output. 
Not essential for v1. + +--- + +## 8. Click Integration Patterns + +### 8.1 Pattern: count=True with Callback + +The most Pythonic Click pattern for verbosity: + +```python +import logging +import sys + +import click + + +def _configure_logging(verbosity: int) -> None: + """Set up root logger from CLI verbosity level. + + Mapping: + -q -> ERROR (40) errors only + default -> WARNING (30) warnings + errors + -v -> INFO (20) progress messages + -vv -> DEBUG (10) full diagnostics + """ + level = max(logging.DEBUG, logging.WARNING - (verbosity * 10)) + + if verbosity >= 2: + fmt = "%(asctime)s %(name)s %(levelname)s %(message)s" + datefmt = "%H:%M:%S" + else: + fmt = "%(levelname)s: %(message)s" + datefmt = None + + logging.basicConfig( + level=level, + format=fmt, + datefmt=datefmt, + stream=sys.stderr, + ) + + +@click.group() +@click.version_option(version=__version__, prog_name="wmk") +@click.option("-v", "--verbose", count=True, help="Increase verbosity (-v info, -vv debug).") +@click.option("-q", "--quiet", is_flag=True, help="Only show errors.") +@click.pass_context +def main(ctx: click.Context, verbose: int, quiet: bool) -> None: + """WML ModelKit - Accelerate Model Deployment on WinML.""" + verbosity = -1 if quiet else min(verbose, 2) + _configure_logging(verbosity) + + ctx.ensure_object(dict) + ctx.obj["verbosity"] = verbosity +``` + +### 8.2 Pattern: Environment Variable Override + +Support a `WMK_LOG_LEVEL` environment variable for CI/scripting: + +```python +import os + +def _configure_logging(verbosity: int) -> None: + env_level = os.environ.get("WMK_LOG_LEVEL") + if env_level is not None: + level = getattr(logging, env_level.upper(), None) + if level is None: + raise click.BadParameter(f"Invalid WMK_LOG_LEVEL: {env_level}") + else: + level = max(logging.DEBUG, logging.WARNING - (verbosity * 10)) + # ... 
rest of configuration +``` + +### 8.3 Pattern: Reusable Decorator + +For projects with many Click groups, extract the options into a reusable decorator: + +```python +import functools + + +def verbosity_options(func): + """Add standard -v/-q verbosity options to a Click command.""" + @click.option("-v", "--verbose", count=True, help="Increase verbosity.") + @click.option("-q", "--quiet", is_flag=True, help="Only show errors.") + @functools.wraps(func) + def wrapper(*args, verbose, quiet, **kwargs): + verbosity = -1 if quiet else min(verbose, 2) + _configure_logging(verbosity) + return func(*args, **kwargs) + return wrapper +``` + +### 8.4 Anti-Patterns to Avoid + +```python +# BAD: Don't use click-log or click-logging libraries +# They are unmaintained (last release 2018/2020), add unnecessary +# dependency, and the stdlib pattern is simple enough. + +# BAD: Don't use --log-level with free-text input +@click.option("--log-level", type=click.Choice(["DEBUG", "INFO", ...])) +# This is verbose and unfamiliar to most CLI users. + +# BAD: Don't configure logging per-subcommand +# Configure once in the group, propagate via context. + +# BAD: Don't use logging.root directly +logging.root.setLevel(...) # Use logging.basicConfig() instead +``` + +--- + +## 9. 
Structured Logging Considerations + +### 9.1 When to Use JSON Logging + +JSON/structured logging (via `structlog` or stdlib's `logging.handlers`) is appropriate for: +- **Server applications** where logs are consumed by aggregators (ELK, Datadog) +- **Long-running services** where machine-parseable output enables alerting +- **Multi-service architectures** where correlated log analysis is needed + +It is **not appropriate** as a default for CLI tools because: +- CLI output is read by humans in a terminal +- JSON is unreadable without `jq` or similar +- CLI invocations are short-lived, not aggregated + +### 9.2 If We Ever Need It + +Offer a `--log-format json` flag that swaps the formatter: + +```python +if log_format == "json": + import json + + class JsonFormatter(logging.Formatter): + def format(self, record): + return json.dumps({ + "ts": self.formatTime(record), + "level": record.levelname, + "logger": record.name, + "msg": record.getMessage(), + }) + + handler = logging.StreamHandler(sys.stderr) + handler.setFormatter(JsonFormatter()) + logging.root.addHandler(handler) + logging.root.setLevel(level) +``` + +**Recommendation**: Do not implement JSON logging now. It adds complexity with no current use case. Revisit if ModelKit is used in CI/CD pipelines that need machine-parseable logs. + +--- + +## 10. Third-Party Library Noise + +### 10.1 The Problem + +At DEBUG level, third-party libraries (transformers, urllib3, onnxruntime) flood stderr with their own debug messages. This makes our debug output unusable. 
+ +### 10.2 The Solution + +After `basicConfig()`, raise the level for noisy third-party loggers: + +```python +def _configure_logging(verbosity: int) -> None: + level = max(logging.DEBUG, logging.WARNING - (verbosity * 10)) + + logging.basicConfig(level=level, format=fmt, stream=sys.stderr) + + # Silence noisy third-party loggers even at DEBUG + for noisy in ("urllib3", "transformers", "onnxruntime", "filelock"): + logging.getLogger(noisy).setLevel(max(level, logging.WARNING)) +``` + +This keeps our `modelkit.*` loggers at the requested level while preventing third-party noise. Users who truly need transformers debug output can use `WMK_LOG_LEVEL` or configure those loggers separately. + +--- + +## 11. Recommendation: Specific Pattern for ModelKit + +### 11.1 Verbosity Flags + +``` +wmk [global options] [command options] + +Global options: + -v, --verbose Increase verbosity (use -vv for maximum detail) + -q, --quiet Only show errors + --version Show version + --help Show help +``` + +| Invocation | Verbosity | Level | What the user sees | +|------------|-----------|-------|--------------------| +| `wmk -q export ...` | -1 | ERROR | Only errors | +| `wmk export ...` | 0 | WARNING | Warnings and errors | +| `wmk -v export ...` | 1 | INFO | Progress: "Exporting model...", "Config generated" | +| `wmk -vv export ...` | 2 | DEBUG | Full diagnostics: shapes, paths, timing | + +### 11.2 Recommended Implementation + +```python +"""modelkit/cli.py -- CLI entry point.""" +from __future__ import annotations + +import logging +import os +import sys +from importlib import import_module +from pathlib import Path + +import click + +from . import __version__ + +logger = logging.getLogger(__name__) + +_NOISY_LOGGERS = ("urllib3", "transformers", "onnxruntime", "filelock", "PIL") + + +def _configure_logging(verbosity: int) -> None: + """Configure root logger based on CLI verbosity. 
+ + Verbosity mapping: + -1 (quiet) -> ERROR (40) + 0 (default) -> WARNING (30) + 1 (-v) -> INFO (20) + 2 (-vv) -> DEBUG (10) + """ + # Environment variable override for CI / scripting + env_level = os.environ.get("WMK_LOG_LEVEL") + if env_level is not None: + level = getattr(logging, env_level.upper(), None) + if level is None: + click.echo( + f"WARNING: Invalid WMK_LOG_LEVEL={env_level!r}, ignoring.", + err=True, + ) + level = logging.WARNING + else: + level = max(logging.DEBUG, logging.WARNING - (verbosity * 10)) + + # Format: terse for normal use, detailed for debug + if level <= logging.DEBUG: + fmt = "%(asctime)s %(name)s %(levelname)s %(message)s" + datefmt = "%H:%M:%S" + else: + fmt = "%(levelname)s: %(message)s" + datefmt = None + + logging.basicConfig( + level=level, + format=fmt, + datefmt=datefmt, + stream=sys.stderr, + ) + + # Suppress third-party noise at DEBUG level + if level <= logging.DEBUG: + for name in _NOISY_LOGGERS: + logging.getLogger(name).setLevel(logging.WARNING) + + +@click.group() +@click.version_option(version=__version__, prog_name="wmk") +@click.option( + "-v", + "--verbose", + count=True, + help="Increase verbosity (-v for info, -vv for debug).", +) +@click.option( + "-q", + "--quiet", + is_flag=True, + default=False, + help="Only show errors.", +) +@click.pass_context +def main(ctx: click.Context, verbose: int, quiet: bool) -> None: + """WML ModelKit - Accelerate Model Deployment on WinML. + + Universal ONNX export with QNN and OpenVINO backend support. 
+ """ + verbosity = -1 if quiet else min(verbose, 2) + _configure_logging(verbosity) + + ctx.ensure_object(dict) + ctx.obj["verbosity"] = verbosity +``` + +### 11.3 Module Logger Convention + +Every module in the project should follow this pattern: + +```python +# Top of every .py file in modelkit/ +import logging + +logger = logging.getLogger(__name__) + +# Usage: +logger.debug("Shape resolved: %s -> %s", input_name, shape) +logger.info("Exported model to %s", output_path) +logger.warning("Preprocessor config not found, using defaults") +logger.error("Failed to load model: %s", exc) +``` + +### 11.4 Migration Checklist + +1. Replace `--debug` flag with `-v`/`-q` in `cli.py` +2. Change default level from `INFO` to `WARNING` +3. Add `_configure_logging()` helper +4. Add `WMK_LOG_LEVEL` environment variable support +5. Add third-party logger suppression +6. Audit all `print()` calls in modelkit -- convert diagnostics to `logger.*` +7. Audit all `click.echo()` calls -- ensure program output goes to stdout, diagnostics to stderr +8. Add `logger = logging.getLogger(__name__)` to any module that lacks it + +### 11.5 What NOT to Do + +- **No custom log levels**. The standard 5 levels (DEBUG, INFO, WARNING, ERROR, CRITICAL) are sufficient. +- **No `structlog` or `loguru`**. Zero dependencies for logging; the stdlib is enough. +- **No `click-log` or `click-logging`**. Unmaintained, unnecessary abstraction. +- **No `--log-level` option**. The `-v`/`-q` pattern is simpler and more conventional. The `WMK_LOG_LEVEL` env var covers the power-user case. +- **No JSON output** (for now). Revisit if CI pipeline integration demands it. +- **No per-subcommand logging configuration**. Configure once in the group, propagate via context. 
+ +--- + +## Sources + +- [Configuring CLI output verbosity with logging and argparse](https://xahteiwi.eu/resources/hints-and-kinks/python-cli-logging-options/) +- [How to Set Logging Levels via Command Line in Python](https://signoz.io/guides/how-to-set-logging-level-from-command-line/) +- [Click Documentation: Options (count=True)](https://click.palletsprojects.com/en/stable/options/) +- [pip PR #9450: Add VERBOSE log level for -v](https://github.com/pypa/pip/pull/9450) +- [pip CLI Documentation](https://pip.pypa.io/en/stable/cli/pip/) +- [Logging HOWTO -- Python 3.14 documentation](https://docs.python.org/3/howto/logging.html) +- [Logging -- The Hitchhiker's Guide to Python](https://docs.python-guide.org/writing/logging/) +- [How and when to use stdout and stderr?](https://julienharbulot.com/python-cli-streams.html) +- [structlog Logging Best Practices](https://www.structlog.org/en/stable/logging-best-practices.html) +- [Verbosity In Ansible](https://www.builddevops.com/post/verbosity-in-ansible) +- [Ruff Configuration](https://docs.astral.sh/ruff/configuration/) +- [click-log Documentation](https://click-log.readthedocs.io/en/stable/) diff --git a/docs/design/session/monitor/1_prd.md b/docs/design/session/monitor/1_prd.md new file mode 100644 index 000000000..80c2785bc --- /dev/null +++ b/docs/design/session/monitor/1_prd.md @@ -0,0 +1,295 @@ +# Op-Tracing Refactor — Product Requirements Document + +**Version**: 2.2 +**Date**: 2026-04-19 +**Status**: Draft +**Module**: session/monitor +**Supersedes**: `docs/design/optracing/1_req.md` v1.0 (consolidated into this PRD per `docs/standards/design-doc-spec.md`) +**Depends-On**: `docs/standards/design-doc-spec.md` + +--- + +## Table of Contents + +- [1. Executive Summary](#1-executive-summary) +- [2. Scope](#2-scope) +- [3. User Stories](#3-user-stories) +- [4. Functional Requirements](#4-functional-requirements) +- [5. Non-Functional Requirements](#5-non-functional-requirements) +- [6. 
Technical Design (high-level)](#6-technical-design-high-level) +- [7. Design Constraints](#7-design-constraints) +- [8. Risks and Mitigations](#8-risks-and-mitigations) +- [9. Open Questions](#9-open-questions) +- [10. Appendix](#10-appendix) + - [10.1 Glossary](#101-glossary) + - [10.2 References](#102-references) + - [10.3 Document History](#103-document-history) + - [10.4 Migration Footprint](#104-migration-footprint) + - [10.5 Test Migration Footprint](#105-test-migration-footprint) + +--- + +## 1. Executive Summary + +### 1.1 Purpose + +Replace the current `QNNProfiler` / `OpTracer` hierarchy with an extended `EPMonitor` design so that per-operator profiling works against both `onnxruntime-qnn` and `onnxruntime-windowsml`, eliminates duplicated ORT session-creation logic, and exposes a single per-EP hierarchy for all vendor-specific observation. + +### 1.2 Problem Statement + +Two defects motivate this refactor: + +**D-1. `QNNProfiler` is broken with `onnxruntime-windowsml`.** The profiler creates its ORT session via the explicit-providers API, which searches for the QNN DLL in the pip package's `capi/` directory. `onnxruntime-windowsml` does not bundle that DLL — it lives under `C:\Program Files\WindowsApps\...` and is registered via WinML. The profiler's session silently falls back to CPU; no profiling data is produced. + +**D-2. `QNNProfiler` duplicates `WinMLSession`.** It creates its own `ort.InferenceSession`, duplicating device-policy resolution, EPContext handling, and EP-discovery logic that `WinMLSession` already owns correctly (via `add_provider_for_devices`). + +The codebase also carries two parallel per-EP hierarchies — `EPMonitor` in `session/monitor/` and `OpTracer` in `optracing/` — each with one QNN class. This duplication is untenable as more per-EP monitors land. 
+ +### 1.3 Success Metrics + +- **SC-1** `wmk perf -m <model> --device npu --op-tracing basic` produces a valid CSV and per-op cycle report when run against a QNN NPU under `onnxruntime-windowsml`. Currently fails (silent CPU fallback). +- **SC-2** `QNNProfiler`, `OpTracer`, `optracing/base.py`, `optracing/registry.py` are removed from the codebase. No remaining references via grep. +- **SC-3** `QNNMonitor.is_available()` returns `True` on any machine where `wmk perf --device npu` currently runs on QNN, regardless of which ORT distribution is installed. +- **SC-4** The standalone-profile idiom specified in §4.7 works end-to-end in a one-off script without introducing helper classes. +- **SC-5** All `tests/` pass. New tests cover the behaviors in NFR-7. +- **SC-6** `display_op_trace_report` and `write_op_trace_json` consume `OpTraceResult` (not dict) and are not modified by this refactor. `OpTraceResult.to_dict()` — which already exists at `optracing/result.py:79-95` — is preserved in its current nested schema and extended with additive top-level `status` and `error` keys. + +--- + +## 2. Scope + +### 2.1 In Scope + +- Deletion of `optracing/` package (except post-processing helpers, which move under `session/monitor/qnn/`). +- Extension of `EPMonitor` ABC with two optional hooks. +- Rewrite of `QNNMonitor` from placeholder to full implementation. +- Extension of `WinMLSession.perf()` to accept an EP monitor and yield a `PerfContext`. +- Collapse of the separate op-tracing block in `commands/perf.py` into the main benchmark loop. +- Extend the existing `OpTraceResult.to_dict()` method (at `optracing/result.py:79-95`) with additive top-level `status` and `error` keys; the existing nested schema is preserved. +- Extract WinML EP registry initializer from `WinMLSession._init_winml_eps_once()` to a module-level function to remove reverse coupling. + +### 2.2 Out of Scope + +- **OOS-1** QNNMonitor without a session (pure xrt-smi-style external telemetry). Possible future work. 
+- **OOS-2** New EPMonitor implementations for DML, OpenVINO, or TensorRT. This refactor reshapes the base class and reworks QNN only. +- **OOS-3** Changes to `HWMonitor` internals or PDH polling behavior. +- **OOS-4** Modifying `display_op_trace_report` / `write_op_trace_json` report writers. They continue to consume `OpTraceResult`. +- **OOS-5** Changes to the `wmk perf` CLI flags. `--op-tracing {basic|detail}` semantics preserved. +- **OOS-6** Multiple simultaneous EP monitors on one session. The `monitor=` parameter is singular. HWMonitor and EPMonitor coexist as orthogonal context managers. +- **OOS-7** Input generation utilities. No `generate_dummy_inputs` is added. Callers are responsible for their own input tensors. + +--- + +## 3. User Stories + +- **US-1** As a CLI user on a Qualcomm NPU running `onnxruntime-windowsml`, I run `wmk perf --op-tracing basic` and get a per-operator cycle report — without needing to install `onnxruntime-qnn`. +- **US-2** As a ModelKit developer, I add a new per-EP monitor (DML, OpenVINO) by subclassing `EPMonitor` — without duplicating ORT session-creation logic. +- **US-3** As a CI regression-check author, I capture per-operator metrics from a short Python script using only `WinMLSession` and `QNNMonitor` primitives. +- **US-4** As a library consumer, I attach EP-specific observation to an existing `WinMLSession` via `session.perf(monitor=...)` — without learning a second hierarchy. +- **US-5** As a QNN developer, I profile my actual benchmark workload rather than a synthetic profiling pass, so latency numbers reflect realistic inputs. + +--- + +## 4. Functional Requirements + +### 4.1 FR-1 — Op-tracing MUST work with both ORT distributions + +The refactor MUST produce valid profiling artifacts regardless of whether the user has `onnxruntime-qnn` (bundled QNN DLL) or `onnxruntime-windowsml` (WinML-registered QNN DLL) installed. 
The implementation MUST use `SessionOptions.add_provider_for_devices([ep_device], options)` after WinML EP registration. + +### 4.2 FR-2 — Op-tracing MUST attach via `session.perf(monitor=...)` + +The user-facing entry point MUST be a monitor attached via `session.perf(warmup, monitor=QNNMonitor(level=...))`. The separate `QNNProfiler.run(...)` entry point is deleted. The monitor contributes session options AND provider options to compile; parses output artifacts on exit. + +### 4.3 FR-3 — `EPMonitor` MUST be the single per-EP hierarchy + +The separate `OpTracer` hierarchy (`optracing/base.py`, `optracing/registry.py`) MUST be deleted. All per-EP observation and configuration is expressed through `EPMonitor` subclasses. + +### 4.4 FR-4 — `QNNMonitor` MUST replace `QNNProfiler` + +The current placeholder `QNNMonitor` MUST become the real implementation. It encodes all QNN-specific knowledge: CSV format, QHAS processing, backend DLL selection, `profiling_level` options, and ORT session teardown for CSV flush. `QNNProfiler` MUST be deleted. + +### 4.5 FR-5 — Two profiling levels MUST be exposed + +- `QNNMonitor(level="basic")` → `profiling_level="detailed"` → CSV with per-op cycle counts. +- `QNNMonitor(level="detail")` → `profiling_level="optrace"` → QHAS post-processing via QNN SDK viewer. If the SDK viewer is unavailable, the monitor MUST fall back to basic CSV parsing with a `WARNING` log and `status="basic_fallback"` in the result. + +### 4.6 FR-6 — `QNNMonitor` MUST produce an `OpTraceResult` + +`QNNMonitor.result` MUST expose an `OpTraceResult` object (the existing dataclass from `optracing/result.py`, relocated to `session/monitor/op_metrics.py`). `QNNMonitor.to_dict()` MUST delegate to `OpTraceResult.to_dict()`. 
The existing `OpTraceResult.to_dict()` method (at `optracing/result.py:79-95`) MUST be preserved in its current nested schema (`{metadata: {...}, summary, operators, statistics, artifacts}`) to keep `display_op_trace_report` and `write_op_trace_json` consumers unchanged per OOS-4. The refactor MAY extend `OpTraceResult` and its `to_dict()` output with new top-level keys `status` and `error` for failure reporting (see FR-5 and NFR-2); existing keys MUST NOT be renamed, removed, or restructured. The `model` field on `OpTraceResult` MAY be relaxed from `str` to `str | None` to support cases where the source model path is unknown (e.g., standalone programmatic profiling); this change is additive (`None` serialises cleanly to JSON `null`). + +### 4.7 FR-7 — Standalone profiling MUST work via primitive composition + +Callers without a benchmarking harness MUST be able to produce op-trace data using only `WinMLSession` + `QNNMonitor` primitives: + +```python +session = WinMLSession("model.onnx", device="npu") +with session.perf(monitor=QNNMonitor(level="basic")) as ctx: + for _ in range(N): + session.run(my_inputs) # caller provides inputs +print(ctx.monitor.to_dict()) +``` + +No `generate_dummy_inputs` utility is added. No helper class wraps the loop. If the caller lacks inputs, the caller MUST generate them. + +### 4.8 FR-8 — Availability reporting MUST align with actual usability + +`QNNMonitor.is_available()` MUST return `True` iff QNN EP is either bundled (`onnxruntime-qnn`) OR registered via WinML (`onnxruntime-windowsml`). The current single-path check (`ort.get_available_providers()` only) is insufficient. The implementation MUST call a module-level registry initializer (extracted from `WinMLSession._init_winml_eps_once`) and check `ort.get_ep_devices()`. + +### 4.9 FR-9 — `HWMonitor` MUST remain orthogonal + +`HWMonitor` is NOT migrated under `session.perf()`. It remains a standalone context manager, usable with or without a `WinMLSession`. 
`HWMonitor` and `EPMonitor` are independent context managers; they MAY be combined by the caller in a single `with` statement. + +### 4.10 FR-10 — `EPMonitor` MUST gain two optional hooks + +The `EPMonitor` base class MUST gain two optional hooks with defaults on the ABC itself (no Protocol, no Mixin): + +- `get_session_options(self) -> dict[str, str]` — default `{}`. Contributions to `SessionOptions.add_session_config_entry()` (e.g., `"session.disable_cpu_ep_fallback"`, `"ep.context_enable"`). +- `get_provider_options(self) -> dict[str, str]` — default `{}`. Contributions to `add_provider_for_devices([ep], opts)` (e.g., `"profiling_level"`, `"backend_path"`). + +`WinMLSession` MUST merge both into the respective ORT surfaces during compile. VitisAI / OpenVINO monitors inherit defaults and are unchanged. + +### 4.11 FR-11 — Monitor instantiation MUST NOT require a factory / registry + +`commands/perf.py` MUST resolve the correct `EPMonitor` class via explicit dispatch based on `--ep` and `--op-tracing` flags. No abstract factory, no registry module, no dynamic plugin loading. If op-tracing is requested against an EP that has no matching monitor, the command MUST fail hard with a descriptive error (no silent fallback). + +### 4.12 FR-12 — Monitor MUST NOT mutate process-global state + +`QNNMonitor` MUST NOT call `os.chdir()` or otherwise mutate the process's working directory. Output paths (CSV, schematic, QHAS) MUST be controlled via absolute paths in configuration. If a QNN SDK artifact (e.g., `*_schematic.bin`) cannot be redirected via configuration, the monitor MUST either (a) locate the artifact post-hoc from a known fallback location, or (b) document the limitation explicitly in its docstring and skip the artifact with a `WARNING` log. + +--- + +## 5. Non-Functional Requirements + +### 5.1 Performance + +- **NFR-1** CLI-level ergonomics MUST NOT regress. 
The benchmark command path MUST collapse the current three context managers (stats + hw + ep) to two (perf-with-monitor + hw) and run in comparable wall time. + +### 5.2 Reliability + +- **NFR-2** No silent failures. If QNN EP cannot be loaded, the session MUST raise a descriptive `CompilationError`. If the CSV is absent or parsing fails, `to_dict()` MUST return `status="no_data"` or `status="parse_failed"` — never an empty structure masquerading as success. +- **NFR-3** Auto-reset MUST be observable. When `session.perf(monitor=...)` auto-resets a previously compiled session to apply the monitor's options, the event MUST log at `WARNING` level: `"auto-resetting compiled session to apply monitor session/provider options"`. Silent mutation is forbidden. +- **NFR-4** Idempotency. `EPMonitor.get_session_options()` and `EPMonitor.get_provider_options()` MUST return the same dict on repeated calls within one monitor instance's lifetime. File paths MUST be produced at `__init__`, not on each call. + +### 5.3 Usability + +- **NFR-5** Exception transparency. Monitor `__exit__` MUST NOT suppress exceptions raised from the `with` body. Parse failures inside `__exit__` are logged and reflected in `to_dict()`, but any active exception from the `with` body MUST propagate normally. + +### 5.4 Compatibility + +- **NFR-6** No process-global state. `EPMonitor` instances MUST be stateless between uses (no module-level caches). Importing `QNNMonitor` MUST NOT trigger EP probes, DLL loads, or network activity. `os.chdir` and equivalent global-state mutations are forbidden (see FR-12). +- **NFR-7** Test coverage. All existing tests in `tests/` MUST pass after the refactor. New tests MUST cover: the availability check on both ORT distributions, CSV parsing, session/provider-option merging rules, auto-reset behavior, load-bearing teardown ordering in `perf().__exit__`, and the double-entry guard on `EPMonitor.__enter__`. + +--- + +## 6. 
Technical Design (high-level) + +Detail lives in `2_coreloop.md`. Headline decisions: + +- **Architectural pattern**: Hook-based Plugin + Template Method + Observer. `WinMLSession.compile()` is the template method; `EPMonitor` plugs into two hook points (`get_session_options`, `get_provider_options`); the monitor observes inference via context-manager lifecycle. +- **Session-owned ORT creation**: `WinMLSession` is the sole owner of `ort.InferenceSession` construction. Monitors never create ORT sessions. +- **Singular monitor**: `session.perf(monitor=EPMonitor|None)`. No `monitors=[...]` multi-monitor support. +- **Monitor factory by explicit dispatch**: `commands/perf.py` contains a 10-line dispatch function. No plugin registry. +- **Report consumers unchanged**: `OpTraceResult.to_dict()` is extended (not replaced); the existing nested schema is preserved; report writers continue to consume `OpTraceResult`. + +--- + +## 7. Design Constraints + +- **C-1** `WinMLSession` is the sole owner of ORT session creation. Monitors never create `ort.InferenceSession` instances directly. +- **C-2** Teardown ordering inside `perf().__exit__` is load-bearing: **session reset first, monitor `__exit__` second**. Reversing or parallelizing breaks QNN CSV parsing because QNN EP flushes CSV only on ORT session destruction. +- **C-3** `profiling_level` and `profiling_file_path` are NOT user-overridable via `extra_provider_options`. The monitor owns these keys; user overrides MUST be rejected by construction (enforced via explicit key assignment after merge, not via duplicate-key dict literals). +- **C-4** Only one `EPMonitor` per session. The `monitor=` parameter is singular. No multi-monitor support today or planned. +- **C-5** `EPMonitor` instances do not mutate process-global state. No `os.chdir()`, no env-var mutation, no module caches. 
+- **C-6** `requires_session_teardown: ClassVar[bool]` is an ORT-specific hint to the session that this monitor's data flush requires `ort.InferenceSession` destruction. It is the only place on the base ABC where an ORT implementation detail leaks in; this is accepted as a pragmatic tradeoff (YAGNI vs the architect's proposed `prepare_for_exit` callback). + +--- + +## 8. Risks and Mitigations + +- **R-1 / M-1**: Load-bearing teardown ordering regressed by a future contributor. → Integration test asserts `session._session is None` during `monitor.__exit__`; test lives in `tests/unit/session/test_perf_monitor_integration.py`. +- **R-2 / M-2**: Windows file-handle lag after `ort.InferenceSession` destruction may leave CSV partially written when the monitor tries to parse it. → Call `gc.collect()` after `session.reset()` inside `perf().__exit__` to force handle release. Add fallback retry (one retry with 50ms delay) if CSV parse fails on first attempt. +- **R-3 / M-3**: Exception propagation through `perf().__exit__` silently swallowed if `session.reset()` raises while a caller exception is active. → Use `contextlib.ExitStack` or a `try/finally` chain that preserves the active exception per NFR-5. +- **R-4 / M-4**: QNN SDK `schematic.bin` file emitted to a location we cannot control (if absolute paths not supported). → Document as a known limitation; locate via glob post-hoc OR skip the artifact with a `WARNING` log (no `os.chdir`). +- **R-5 / M-5**: Auto-reset surprises a user debugging compile times. → `WARNING`-level log message; documented in `session.perf()` docstring. +- **R-6 / M-6**: Concurrent `WinMLSession` instances in one process both attempting op-tracing would race on the CSV output path (if default temp dirs collide). → QNNMonitor generates a unique output dir at `__init__` (`tempfile.mkdtemp(prefix="qnn_profile_")`) to eliminate collisions. + +--- + +## 9. 
Open Questions + +- **OQ-1** Does the QNN SDK accept an absolute path for `*_schematic.bin` output, enabling full elimination of `os.chdir`-style workarounds? If not, which fallback strategy (glob-locate post-hoc vs skip-with-warning) should be canonical? Resolve during implementation by empirical check against QNN SDK 2.42. +- **OQ-2** Should `OpTraceResult.to_dict()` include a schema version field for forward compatibility with future report formats? Currently leaning no (YAGNI), but decide before merge. + +--- + +## 10. Appendix + +### 10.1 Glossary + +| Term | Meaning | +|------|---------| +| **ORT** | ONNX Runtime | +| **EP** | Execution Provider — ORT's plugin for a specific backend (QNN, DML, TensorRT, ...) | +| **QNN** | Qualcomm Neural Network — AI runtime for Qualcomm NPUs | +| **QHAS** | QNN Hardware Analyzer Schematic — detailed per-op roofline / DMA traffic data | +| **EPContext** | ORT feature that persists a JIT-compiled model for fast reload | +| **PDH** | Windows Performance Data Helper — OS counters used by `HWMonitor` | +| **WinML EP registration** | `ort.register_execution_provider_library(name, dll_path)` populated from the Windows App SDK's `ExecutionProviderCatalog` | +| **HTP** | Hexagon Tensor Processor — Qualcomm NPU backend within QNN | +| **Op-tracing** | Per-operator profiling: capturing per-op execution cycles during inference | + +### 10.2 References + +- `docs/standards/design-doc-spec.md` — the spec this PRD conforms to. +- `docs/design/session/monitor/2_coreloop.md` — companion core-loop design. +- `docs/design/session/monitor/iterations/01.md` through `11.md` — brainstorming trail. +- `D:\BYOM\ModelKit_PRs\232\docs\design\perf\qnn_ep_profiling_investigation.md` — original QNN EP profiling investigation (three ORT APIs, five tests, `add_provider_for_devices` solution). +- `D:\BYOM\ModelKit_PRs\232\temp\prove_qnn_ep_profiling.py` — proof script validating the fix. 
+ +### 10.3 Document History + +| Version | Date | Change | +|---------|------|--------| +| 1.0 | 2026-04-17 | Initial `1_req.md` (deleted). Captured requirements from iterations 01-11. | +| 2.0 | 2026-04-19 | Consolidated into `1_prd.md` per `docs/standards/design-doc-spec.md` v1.0. The prior `1_req.md` was deleted from disk (not deprecated-in-place) because its content is fully subsumed here; the `Supersedes` field preserves the historical link. Incorporated user directives (dual `get_session_options` + `get_provider_options` hooks; extend existing `OpTraceResult.to_dict()` — not replace; no `generate_dummy_inputs`; no `os.chdir`; no multi-monitor; factory dispatch; reorganized test migration). Incorporated critic and architect review findings. | +| 2.1 | 2026-04-19 | Post-audit fixes: added Table of Contents; renumbered Appendix to match spec §4.1 (Document History at §10.3); clarified that `OpTraceResult.to_dict()` already exists and the refactor preserves its nested schema, only adding `status`/`error` keys; clarified that `ep_registry.py` already exists and only gains a new `ensure_initialized()` function; added `fixtures/` to test migration; documented `commands/perf.py` import-path redirects. | +| 2.2 | 2026-04-24 | Relocated from docs/design/optracing/ to docs/design/session/monitor/ per spec §1.5.1 transitional commitment (implementation complete). Removed Transitional Location note. 
| + +### 10.4 Migration Footprint + +| Action | Paths | +|--------|-------| +| Delete | `src/winml/modelkit/optracing/base.py`, `src/winml/modelkit/optracing/registry.py`, `src/winml/modelkit/optracing/__init__.py`, `src/winml/modelkit/optracing/qnn/profiler.py` | +| Delete (entire directory after moves) | `src/winml/modelkit/optracing/` | +| Move | `optracing/qnn/csv_parser.py` → `session/monitor/qnn/csv_parser.py` | +| Move | `optracing/qnn/qhas_parser.py` → `session/monitor/qnn/qhas_parser.py` | +| Move | `optracing/qnn/viewer.py` → `session/monitor/qnn/viewer.py` | +| Move | `optracing/result.py` (`OpTraceResult`, `OperatorMetrics`) → `session/monitor/op_metrics.py` | +| Move | `optracing/report.py` (`display_op_trace_report`, `write_op_trace_json`) → `session/monitor/report.py` | +| Extend | Existing `OpTraceResult.to_dict()` in the relocated `op_metrics.py`: preserve nested schema; add top-level `status` and `error` keys. Add optional `status` / `error` dataclass fields (defaulting to `"ok"` and `None`, respectively). | +| Relax | `OpTraceResult.model: str` → `str \| None` for cases where source path is unknown. | +| Modify | `session/monitor/ep_monitor.py` — add `get_session_options`, `get_provider_options`, `requires_session_teardown` with defaults | +| Rewrite | `session/monitor/qnn_monitor.py` — from placeholder to full implementation | +| Modify | `session/session.py` — `perf()` gains `monitor=` parameter, returns `PerfContext`; compile-time hook integration; `_init_winml_eps_once` extracted to the module-level function described below | +| Modify | `session/ep_registry.py` — existing file gains a new module-level `ensure_initialized()` function that wraps `WinMLEPRegistry.get_instance().register_to_ort()`. The existing class-based API remains. | +| Modify | `commands/perf.py` — collapse separate op-tracing block; add `_resolve_ep_monitor()` dispatch helper. 
Import paths for `OpTraceResult`, `display_op_trace_report`, `write_op_trace_json` redirect from `..optracing` to `..session.monitor.report` / `..session.monitor.op_metrics`. Remove import of `is_qnn_profiling_available`, `get_tracer` (both deleted). | + +### 10.5 Test Migration Footprint + +| Existing test file | New location / action | +|---------------------|-----------------------| +| `tests/unit/optracing/test_csv_parser.py` | Move to `tests/unit/session/monitor/qnn/test_csv_parser.py` | +| `tests/unit/optracing/test_qhas_parser.py` | Move to `tests/unit/session/monitor/qnn/test_qhas_parser.py` | +| `tests/unit/optracing/test_detection.py` | Rewrite as `tests/unit/session/monitor/test_qnn_monitor_availability.py` | +| `tests/unit/optracing/test_integration.py` | Rewrite as `tests/unit/session/test_perf_monitor_integration.py` | +| `tests/unit/optracing/test_perf_optracing_cli.py` | Move to `tests/unit/commands/test_perf_optracing.py` | +| `tests/unit/optracing/test_qnn_profiler.py` | **Delete**; replaced by `tests/unit/session/monitor/test_qnn_monitor.py` (new) | +| `tests/unit/optracing/test_registry.py` | **Delete**; registry is removed. | +| `tests/unit/optracing/test_report.py` | Move to `tests/unit/session/monitor/test_report.py` | +| `tests/unit/optracing/test_result.py` | Move to `tests/unit/session/monitor/test_op_metrics.py`; add tests for the extended `OpTraceResult.to_dict()` with `status` / `error`. | +| `tests/unit/optracing/fixtures/` (directory) | Move to `tests/unit/session/monitor/qnn/fixtures/` — parsers use these (`optrace_resnet50.csv`, `qhas_resnet50.json`). | +| `tests/unit/optracing/` (directory) | Delete after all files moved / deleted. | +| — | **New**: `tests/unit/session/test_perf_monitor_integration.py` — asserts load-bearing teardown ordering (session `_session is None` during monitor `__exit__`). 
| +| — | **New**: `tests/unit/session/test_perf_auto_reset.py` — asserts `WARNING` log on auto-reset and that provider options are re-merged. | +| — | **New**: `tests/unit/session/monitor/test_ep_monitor_base.py` — asserts defaults of `get_session_options`, `get_provider_options`, `requires_session_teardown`, and double-entry guard. | +| — | **New**: `tests/unit/session/test_ep_registry.py` — asserts `ensure_initialized()` is idempotent and logs only on first call. | diff --git a/docs/design/session/monitor/2_coreloop.md b/docs/design/session/monitor/2_coreloop.md new file mode 100644 index 000000000..2ace2a21f --- /dev/null +++ b/docs/design/session/monitor/2_coreloop.md @@ -0,0 +1,873 @@ +# Op-Tracing Refactor — Core Loop Design + +**Version**: 2.2 +**Date**: 2026-04-19 +**Status**: Draft +**Module**: session/monitor +**Supersedes**: `docs/design/optracing/2_coreloop.md` v1.0 (consolidated per `docs/standards/design-doc-spec.md`) +**Depends-On**: `docs/design/session/monitor/1_prd.md`, `docs/standards/design-doc-spec.md` + +--- + +## Table of Contents + +- [0. Related Documents](#0-related-documents) +- [0.5 I/O Dependencies](#05-io-dependencies) +- [1. Design Philosophy](#1-design-philosophy) +- [2. Module Structure](#2-module-structure) +- [3. Core Loop Implementation](#3-core-loop-implementation) +- [4. API Design](#4-api-design) + - [4.1 EPMonitor — revised ABC](#41-epmonitor--revised-abc) + - [4.2 NullEPMonitor](#42-nullepmonitor) + - [4.3 QNNMonitor](#43-qnnmonitor) + - [4.4 PerfContext](#44-perfcontext) + - [4.5 WinMLSession.perf() — revised](#45-winmlsessionperf--revised) + - [4.6 OpTraceResult — extend existing to_dict()](#46-optraceresult--extend-existing-to_dict) + - [4.7 Factory helper in commands/perf.py](#47-factory-helper-in-commandsperfpy) +- [5. CLI Integration](#5-cli-integration) +- [6. Configuration / Data Structures](#6-configuration--data-structures) +- [7. Error Handling](#7-error-handling) +- [8. 
Testing Strategy](#8-testing-strategy) +- [9. Integration Points](#9-integration-points) +- [10. Future Work](#10-future-work) +- [11. Revision History](#11-revision-history) + +--- + +## 0. Related Documents + +| Document | Path | Purpose | +|----------|------|---------| +| PRD | `./1_prd.md` | Requirements, scope, constraints, migration footprint | +| Spec | `../../standards/design-doc-spec.md` | Normative doc standard this design conforms to | +| Iterations | `./iterations/01.md` – `11.md` | Brainstorming record (informational) | +| Upstream | `../../../src/winml/modelkit/session/session.py` | `WinMLSession` — existing code extended here | +| Upstream | `../../../src/winml/modelkit/session/monitor/ep_monitor.py` | `EPMonitor` ABC — existing code extended here | +| Upstream | `../../../src/winml/modelkit/commands/perf.py` | CLI benchmark entry point — existing code modified here | +| Deleted | `../../../src/winml/modelkit/optracing/qnn/profiler.py` | `QNNProfiler` — deleted by this refactor | + +## 0.5 I/O Dependencies + +This refactor orchestrates four subsystems. Data dependencies MUST be understood before reading the core loop. 
+ +### 0.5.1 Key actors + +| Actor | Role | Location | +|-------|------|----------| +| `WinMLSession` | Owns `ort.InferenceSession` lifecycle; exposes `perf()` | `session/session.py` | +| `EPMonitor` (ABC) | Per-EP observer with two optional config hooks | `session/monitor/ep_monitor.py` | +| `QNNMonitor` | Concrete EPMonitor for Qualcomm NPU | `session/monitor/qnn_monitor.py` | +| `PerfContext` | Dataclass yielded by `session.perf()` | `session/session.py` (new) | +| `OpTraceResult` | Structured profiling output | `session/monitor/op_metrics.py` (relocated) | +| `ort.InferenceSession` | Actual ORT session; writes profiling CSV | ORT runtime | +| `ep_registry.ensure_initialized()` | Registers WinML EPs into ORT | `session/ep_registry.py` — **new module-level function added to the existing file**; wraps `WinMLEPRegistry.get_instance().register_to_ort()`. Replaces direct use of `WinMLSession._init_winml_eps_once`. | + +### 0.5.2 Data dependency graph + +``` +┌───────────────────────────────────────────────────────────────────────┐ +│ caller │ +│ session = WinMLSession("model.onnx", device="npu") │ +│ mon = QNNMonitor(level="basic", output_dir=Path(...)) │ +│ │ │ +│ ▼ │ +│ with session.perf(warmup=5, monitor=mon) as ctx: │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ perf.__enter__: │ │ +│ │ extra_sess = mon.get_session_options() ← HOOK 1 │ │ +│ │ extra_prov = mon.get_provider_options() ← HOOK 2 │ │ +│ │ if (extra_sess or extra_prov) and compiled: │ │ +│ │ logger.warning("auto-reset..."); self.reset() │ │ +│ │ merge into self._active_session_option_entries │ │ +│ │ merge into self._provider_options │ │ +│ │ mon.__enter__() │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ session.run(inputs) ── triggers lazy compile ──▶ uses merged opts │ +│ │ │ +│ │ ort.InferenceSession created with profiling options │ +│ │ CSV being written in background (inside the run) │ +│ │ │ +│ ▼ │ +│ 
┌─────────────────────────────────────────────────────────┐ │ +│ │ perf.__exit__: │ │ +│ │ self._perf_stats = None │ │ +│ │ if mon.requires_session_teardown: │ │ +│ │ self.reset() # drop ort.InferenceSession│ │ +│ │ gc.collect() # release Windows handles │ │ +│ │ mon.__exit__(exc_info) # parse CSV → OpTraceResult│ │ +│ │ restore saved options │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ ctx.monitor.to_dict() → delegates to OpTraceResult.to_dict() │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +### 0.5.3 Module responsibility summary + +- **`WinMLSession`**: template-method owner. Merges monitor hook contributions at compile; creates `ort.InferenceSession`; runs inference; handles teardown ordering at `perf().__exit__`. +- **`EPMonitor` (ABC)**: contract definition. Two optional config hooks with base-class defaults. `is_available` classmethod. Mandatory `__enter__`/`__exit__`/`to_dict`. +- **`QNNMonitor`**: concrete implementation. Declares session+provider options; parses CSV/QHAS in `__exit__`; produces `OpTraceResult`. +- **`commands/perf.py`**: CLI dispatcher. Resolves the right monitor class by explicit `if/elif` on `--ep` and `--op-tracing` flags; constructs it with the appropriate `level` and `output_dir`. +- **`session/ep_registry.py`**: module-level `ensure_initialized()`. Single shared entry point for WinML EP registration. Eliminates the reverse-coupling `QNNMonitor → WinMLSession._init_winml_eps_once`. + +--- + +## 1. Design Philosophy + +### 1.1 Purpose + +Collapse the dual per-EP hierarchy (`EPMonitor` + `OpTracer`) into one; fix the broken `onnxruntime-windowsml` session-creation path; eliminate code duplication by routing all ORT session construction through `WinMLSession`. + +### 1.2 Core Principles + +- **P1 — Session owns the session; monitor informs the session.** `WinMLSession.compile()` is the sole owner of `ort.InferenceSession` construction. 
Monitors contribute configuration via two hooks but never create ORT sessions directly. +- **P2 — Delete > refactor.** Where two abstractions exist for the same concept, delete one. `QNNProfiler` and `OpTracer` are deleted rather than patched. +- **P3 — Good primitives > bespoke facades.** A clean pair (`WinMLSession`, `QNNMonitor`) composes cleanly into any caller shape. We do not add helper classes or wrapper utilities. +- **P4 — Extension by hook, not by new abstraction.** New EP monitors are added by subclassing `EPMonitor` and overriding the two hooks. No registry, no factory, no plugin loader. +- **P5 — Explicit over implicit.** No silent fallbacks. No silent session mutations (auto-reset logs at `WARNING`). No silent "ep unsupported" errors (hard-fail at dispatch time). + +### 1.3 Design Pattern + +**Hook-based Plugin + Template Method + Observer.** + +- `WinMLSession.compile()` is the template method: it owns the algorithm (resolve device → build session options → find EP device → merge provider options → create ORT session). +- It calls the monitor at two hook points: `get_session_options()` (add_session_config_entry contributions) and `get_provider_options()` (add_provider_for_devices contributions). +- The `EPMonitor` itself is a context-managed observer: `__enter__` prepares for observation, `__exit__` finalizes. +- The monitor never replaces session behavior — only augments specific steps. + +--- + +## 2. 
Module Structure + +### 2.1 File layout after refactor + +``` +src/winml/modelkit/ +├── session/ +│ ├── session.py # modified (see §4.5) +│ ├── ep_registry.py # modified — existing file; adds ensure_initialized() (see §4.3) +│ └── monitor/ +│ ├── ep_monitor.py # modified (see §4.1) +│ ├── hw_monitor.py # unchanged +│ ├── qnn_monitor.py # REWRITTEN (see §4.3) +│ ├── vitisai_monitor.py # unchanged +│ ├── openvino_monitor.py # unchanged (inherits new defaults) +│ ├── op_metrics.py # NEW — moved from optracing/result.py + .to_dict() +│ ├── report.py # NEW — moved from optracing/report.py +│ └── qnn/ +│ ├── csv_parser.py # moved from optracing/qnn/ +│ ├── qhas_parser.py # moved from optracing/qnn/ +│ └── viewer.py # moved from optracing/qnn/ +├── commands/ +│ └── perf.py # modified (see §5) +└── optracing/ # DELETED ENTIRELY +``` + +### 2.2 Key dependencies + +- `WinMLSession.compile()` calls `mon.get_session_options()` and `mon.get_provider_options()` on the active monitor. +- `QNNMonitor.is_available()` calls `session/ep_registry.py::ensure_initialized()` (NOT `WinMLSession._init_winml_eps_once`, which is deleted). +- `QNNMonitor.__exit__` reads the CSV written by QNN EP during `session.run()` and produces an `OpTraceResult`. +- `commands/perf.py` imports `QNNMonitor` and `VitisAIMonitor` directly; no registry. + +--- + +## 3. Core Loop Implementation + +### 3.1 High-level flow + +The canonical flow is the CLI benchmark with op-tracing enabled. It exercises every hook point. + +``` +wmk perf -m resnet50 --device npu --op-tracing basic + │ + ▼ +commands/perf.py + │ monitor = _resolve_ep_monitor(ep="qnn", op_tracing="basic", output_dir=...) + │ → QNNMonitor(level="basic", output_dir=...) 
+ │ + ▼ +with session.perf(warmup=warmup, monitor=monitor) as ctx, HWMonitor() as hw: + │ ▲ + │ │ orthogonal, + │ │ process-wide counters + │ + │ perf.__enter__: + │ extra_sess = monitor.get_session_options() + │ → {"session.disable_cpu_ep_fallback": "1", + │ "ep.context_enable": "1", + │ "ep.context_embed_mode": "0"} + │ extra_prov = monitor.get_provider_options() + │ → {"backend_path": "QnnHtp.dll", ..., + │ "profiling_level": "detailed", + │ "profiling_file_path": "{output_dir}/profiling_output.csv"} + │ if (extra_sess or extra_prov) and self._session is not None: + │ logger.warning("auto-resetting compiled session ...") + │ self.reset() + │ merge into session state + │ monitor.__enter__() # sets _entered flag + │ yield PerfContext(stats=PerfStats(...), monitor=monitor) + │ + │ for _ in range(iterations): + │ session.run(inputs) + │ → first call triggers lazy compile: + │ - SessionOptions.add_session_config_entry(k, v) for each extra_sess + │ - add_provider_for_devices([qnn_ep_dev], merged_provider_opts) + │ - ort.InferenceSession(...) 
created + │ → subsequent calls run; QNN EP appends to profiling CSV + │ + ▼ + perf.__exit__: + self._perf_stats = None + exc_info = sys.exc_info() # may be (None,None,None) + try: + if monitor.requires_session_teardown: # QNN: True + self.reset() # drops ort.InferenceSession → flushes CSV + gc.collect() # release Windows file handles + finally: + try: + monitor.__exit__(*exc_info) # parses CSV → OpTraceResult + finally: + restore saved session/provider options + │ + ▼ +# After the `with` block +if op_tracing: + display_op_trace_report(ctx.monitor.result, console) # OpTraceResult (not dict) + write_op_trace_json(ctx.monitor.result, json_path) +``` + +### 3.2 Lifecycle walkthrough — benchmark-only (no EP monitor) + +```python +with session.perf(warmup=10) as ctx: # monitor=None → NullEPMonitor + for _ in range(100): + session.run(inputs) +print(ctx.stats.mean_ms) +``` + +- `NullEPMonitor.get_session_options()` → `{}` +- `NullEPMonitor.get_provider_options()` → `{}` +- `needs_recompile = False` → no auto-reset +- `NullEPMonitor.requires_session_teardown = False` → no reset on exit +- `ctx.monitor.to_dict()` → `{}`, `ctx.monitor.result` is `None` + +Zero behavior change from today's `session.perf(warmup=10) as stats` — just one extra level of indirection via `ctx.stats`. + +### 3.3 Lifecycle walkthrough — benchmark with VitisAI proof-of-execution + +```python +with session.perf(warmup=10, monitor=VitisAIMonitor()) as ctx, HWMonitor() as hw: + session.run(inputs) +``` + +- `VitisAIMonitor.get_session_options()` → `{}` (inherits default) +- `VitisAIMonitor.get_provider_options()` → `{}` (inherits default) +- `needs_recompile = False` +- `VitisAIMonitor.__enter__` takes xrt-smi snapshot +- `VitisAIMonitor.__exit__` takes xrt-smi snapshot; `ctx.monitor.npu_proven` is True/False +- `VitisAIMonitor.requires_session_teardown = False` → no reset on exit + +Same API, different monitor. No QNN-specific code paths activated. 
+ +### 3.4 Lifecycle walkthrough — standalone profile (no CLI) + +```python +session = WinMLSession("model.onnx", device="npu") +with session.perf(monitor=QNNMonitor(level="basic")) as ctx: + for _ in range(10): + session.run(my_inputs) # caller provides inputs +print(ctx.monitor.to_dict()) +``` + +- No helper class. No `generate_dummy_inputs`. The caller generates inputs. +- 6 lines excluding the import. + +### 3.5 Teardown ordering — load-bearing invariant + +Inside `session.perf().__exit__`, the following order is **load-bearing**: + +``` +1. Stop perf timing (self._perf_stats = None) +2. Capture sys.exc_info() (for propagation) +3. IF monitor.requires_session_teardown: + self.reset() ← drops ort.InferenceSession; QNN flushes CSV + gc.collect() ← Windows: release file handle on CSV +4. monitor.__exit__(*exc_info) ← parses CSV → OpTraceResult +5. Restore saved session_options + provider_options +``` + +Reversing step 3 and step 4 produces an empty CSV file. Running them concurrently produces a race. Explicitly forbidden by **C-2** in the PRD. + +An integration test (§8.3) asserts `session._session is None` during `monitor.__exit__` to lock this invariant. + +--- + +## 4. API Design + +### 4.1 `EPMonitor` — revised ABC + +```python +# session/monitor/ep_monitor.py + +class EPMonitor(ABC): + """Per-EP observer attached to a WinMLSession for an inference window.""" + + # ---- Optional hooks: defaults provided; subclasses override as needed ---- + + # ORT-specific hint: does this monitor's data flush require ort.InferenceSession destruction? + # True for QNNMonitor (CSV flushes on session delete). False for passive monitors. + requires_session_teardown: ClassVar[bool] = False + + def get_session_options(self) -> dict[str, str]: + """Entries to pass to SessionOptions.add_session_config_entry(). Default: none.""" + return {} + + def get_provider_options(self) -> dict[str, str]: + """Options to merge into add_provider_for_devices(). 
Default: none.""" + return {} + + # ---- Mandatory contract ---- + + @classmethod + @abstractmethod + def is_available(cls) -> bool: + """Whether this monitor's EP and infrastructure are usable on this system.""" + + @abstractmethod + def __enter__(self) -> Self: ... + + @abstractmethod + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """MUST NOT suppress exceptions from the `with` body.""" + + @abstractmethod + def to_dict(self) -> dict[str, Any]: + """JSON-serializable summary. MUST include `ep` key.""" +``` + +Invariants: + +- `get_session_options()` and `get_provider_options()` MUST be idempotent (NFR-4). +- `__enter__` MUST raise `RuntimeError("{ClassName} already entered")` (e.g. `"QNNMonitor already entered"`) if called twice without intervening `__exit__`. +- `__exit__` MUST NOT return `True` (which would suppress exceptions). + +### 4.2 `NullEPMonitor` + +Unchanged from current `ep_monitor.py:62-88`. Inherits new default `get_session_options()` / `get_provider_options()` (both return `{}`) and `requires_session_teardown = False`. No edit needed; behavior automatic. + +### 4.3 `QNNMonitor` + +```python +# session/monitor/qnn_monitor.py + +class QNNMonitor(EPMonitor): + """Qualcomm NPU per-op profiler via ORT's QNN EP. + + Produces an OpTraceResult with per-operator cycle counts (level="basic") + or full QHAS roofline / DMA traffic (level="detail"). + """ + + requires_session_teardown: ClassVar[bool] = True + # QNN EP flushes the profiling CSV only on ort.InferenceSession destruction. 
+ + def __init__( + self, + level: Literal["basic", "detail"] = "basic", + output_dir: Path | None = None, + extra_provider_options: Mapping[str, str] | None = None, + ) -> None: + if level not in ("basic", "detail"): + raise ValueError(f"level must be 'basic' or 'detail', got {level!r}") + self._level = level + # Idempotency: paths produced at __init__, not per-call + self._output_dir = Path(output_dir) if output_dir else Path( + tempfile.mkdtemp(prefix="qnn_profile_") + ) + self._output_dir.mkdir(parents=True, exist_ok=True) + self._csv_path = (self._output_dir / "profiling_output.csv").resolve() + self._extra = dict(extra_provider_options or {}) + self._entered = False + self._result: OpTraceResult | None = None + + @classmethod + def is_available(cls) -> bool: + import onnxruntime as ort + if "QNNExecutionProvider" in ort.get_available_providers(): + return True + # WinML-registered path. `ensure_initialized` is a NEW module-level function + # added to the existing `session/ep_registry.py`; it wraps the existing + # `WinMLEPRegistry.get_instance().register_to_ort()` as an idempotent entry point. + from ..ep_registry import ensure_initialized + ensure_initialized() + return any(d.ep_name == "QNNExecutionProvider" for d in ort.get_ep_devices()) + + def get_session_options(self) -> dict[str, str]: + return { + "session.disable_cpu_ep_fallback": "1", + "ep.context_enable": "1", + "ep.context_embed_mode": "0", + } + + def get_provider_options(self) -> dict[str, str]: + # Build in layers; last writer wins. Owner-enforced keys applied LAST. 
+ opts: dict[str, str] = { + "backend_path": "QnnHtp.dll", + "htp_performance_mode": "high_performance", + "htp_graph_finalization_optimization_mode": "3", + "enable_htp_fp16_precision": "1", + } + opts.update(self._extra) + # C-3: these two keys are NEVER user-overridable + opts["profiling_level"] = "optrace" if self._level == "detail" else "detailed" + opts["profiling_file_path"] = str(self._csv_path) + return opts + + def __enter__(self) -> Self: + if self._entered: + raise RuntimeError("QNNMonitor already entered") + self._entered = True + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + # Parse whatever artifacts are on disk. Never suppress caller exceptions. + try: + self._result = self._parse_artifacts() + except Exception as e: + logger.warning("QNNMonitor: artifact parse failed: %s", e) + self._result = OpTraceResult( + model=None, device="npu", tracing_level=self._level, + ep="QNNExecutionProvider", tracing_backend="qnn", + operators=[], summary={}, artifacts={"csv": str(self._csv_path)}, + status="parse_failed", error=str(e), + ) + # Do not return True → does not suppress caller exception. + + def to_dict(self) -> dict[str, Any]: + if self._result is None: + return {"ep": "QNN", "device": "NPU", "status": "not_run"} + return self._result.to_dict() + + @property + def result(self) -> OpTraceResult | None: + """Structured result object. Preferred by report writers.""" + return self._result + + def _parse_artifacts(self) -> OpTraceResult: + """Parse CSV (always) and QHAS (detail mode).""" + # ... details: try CSV → fall back to no-data; if detail, try QHAS viewer + # On Windows file-handle lag: retry once with 50ms delay (R-2 mitigation) + ... +``` + +**On CWD / `*_schematic.bin`**: Per **C-5** and **FR-12**, `QNNMonitor` does NOT call `os.chdir`. 
If the QNN SDK emits `*_schematic.bin` to the process's CWD rather than to `profiling_file_path`'s directory, `_parse_artifacts` locates it via `glob` from the expected fallback locations and logs a `WARNING` if not found. The `detail`-mode path degrades gracefully to basic CSV parsing in that case (FR-5). + +### 4.4 `PerfContext` + +```python +# session/session.py + +@dataclass(frozen=True) +class PerfContext: + """Yielded by WinMLSession.perf(). Aggregates perf stats and the attached EP monitor.""" + stats: PerfStats + monitor: EPMonitor # NullEPMonitor when caller passed monitor=None +``` + +Frozen to prevent accidental mutation during the `with` block. Not a replacement for `PerfStats` — both `stats` and `monitor` are addressable by attribute. + +### 4.5 `WinMLSession.perf()` — revised + +```python +# session/session.py + +@contextmanager +def perf( + self, + warmup: int = 0, + monitor: EPMonitor | None = None, +) -> Generator[PerfContext, None, None]: + """Run a scoped performance window. + + Yields: + PerfContext with `stats: PerfStats` and `monitor: EPMonitor`. + + Behavior: + - If `monitor` contributes session_options or provider_options and this + session is already compiled, the ORT session is auto-reset with a + WARNING log. Future runs within the `with` body trigger recompile + with the merged options. + - If `monitor.requires_session_teardown`, `self.reset()` is called at + exit BEFORE `monitor.__exit__`, so the monitor sees the fully-flushed + artifacts (e.g., QNN CSV). + - Nested perf() is forbidden — raises RuntimeError on re-entry. 
+ """ + if self._perf_stats is not None: + raise RuntimeError("session.perf() already active (nested perf is forbidden)") + + mon = monitor or NullEPMonitor() + + # Collect hook contributions + extra_sess = mon.get_session_options() + extra_prov = mon.get_provider_options() + needs_recompile = (extra_sess or extra_prov) and self._session is not None + if needs_recompile: + logger.warning( + "session.perf(): auto-resetting compiled session to apply monitor " + "session/provider options (monitor=%s)", type(mon).__name__ + ) + self.reset() + + # Save + merge + saved_sess_entries = dict(self._active_session_option_entries) + saved_prov = dict(self._provider_options) + self._active_session_option_entries = {**saved_sess_entries, **extra_sess} + self._provider_options = {**saved_prov, **extra_prov} + + stats = PerfStats(warmup=warmup) + self._perf_stats = stats + mon.__enter__() + + try: + yield PerfContext(stats=stats, monitor=mon) + finally: + self._perf_stats = None + exc_info = sys.exc_info() # propagates caller exception to monitor.__exit__ + try: + if mon.requires_session_teardown: + self.reset() + gc.collect() # Windows: release CSV file handle (R-2) + finally: + try: + mon.__exit__(*exc_info) + finally: + self._active_session_option_entries = saved_sess_entries + self._provider_options = saved_prov +``` + +Also in `WinMLSession.__init__`: + +```python +self._active_session_option_entries: dict[str, str] = {} # NEW state +``` + +And in `_build_session_options(self, device)`, add the application of monitor contributions: + +```python +def _build_session_options(self, device: str) -> ort.SessionOptions: + ... + # Apply monitor-contributed session config entries (if perf() context is active) + for key, value in self._active_session_option_entries.items(): + opts.add_session_config_entry(key, value) + ... +``` + +### 4.6 `OpTraceResult` — extend existing `to_dict()` + +`OpTraceResult.to_dict()` **already exists** at `optracing/result.py:79-95`. 
The refactor **preserves its nested schema exactly** (required by OOS-4: report writers are not modified). Three backward-compatible changes: + +1. Two new dataclass fields (`status`, `error`) with defaults that keep existing callers working. +2. Two new top-level keys in `to_dict()`, placed alongside the existing keys — the existing `metadata`, `summary`, `operators`, `statistics`, `artifacts` structure is untouched. +3. `model` field relaxed from `str` to `str | None` to support standalone programmatic profiling where the source path is unknown. + +```python +# session/monitor/op_metrics.py (moved from optracing/result.py, with additive changes) +# OperatorMetrics is co-located here (moved from optracing/result.py) and its +# existing `to_dict()` method is unchanged; no edits required. + +@dataclass +class OpTraceResult: + # ---- Required (unchanged, except `model` type relax to allow None) ---- + model: str | None # was: str — relaxed to accept None + device: str + tracing_level: str # "basic" or "detail" + + # ---- Defaulted fields (defaults preserved verbatim from current source) ---- + operators: list[OperatorMetrics] = field(default_factory=list) + ep: str = "" # default preserved — do NOT remove + tracing_backend: str = "" # default preserved — do NOT remove + timestamp: str = field( + default_factory=lambda: datetime.now(timezone.utc).isoformat() + ) + num_samples: int = 0 + summary: dict[str, Any] = field(default_factory=dict) + statistics: dict[str, dict[str, float]] = field(default_factory=dict) + artifacts: dict[str, str] = field(default_factory=dict) + + # ---- NEW fields (additive; defaults keep existing construction compatible) ---- + status: str = "ok" # "ok" | "no_data" | "parse_failed" | "basic_fallback" + error: str | None = None # populated when status == "parse_failed" + + def to_dict(self) -> dict[str, Any]: + """Existing nested schema preserved. 
New `status` / `error` keys added alongside.""" + return { + "metadata": { + "model": self.model, + "device": self.device, + "ep": self.ep, + "tracing_level": self.tracing_level, + "tracing_backend": self.tracing_backend, + "timestamp": self.timestamp, + "num_samples": self.num_samples, + }, + "summary": self.summary, + "operators": [op.to_dict() for op in self.operators], + "statistics": self.statistics, # PRESERVED — consumers depend on this + "artifacts": self.artifacts, + # ---- Additive keys ---- + "status": self.status, + "error": self.error, + } + + def to_json(self, indent: int = 2) -> str: # unchanged — preserved as-is + return json.dumps(self.to_dict(), indent=indent) +``` + +Report consumers (`display_op_trace_report`, `write_op_trace_json`) continue to accept `OpTraceResult` (not dict). `ctx.monitor.result` exposes it directly. New consumers that care about `status` read it at the top level; existing consumers that read `metadata["model"]`, `summary`, `operators`, `statistics`, `artifacts` are unaffected. + +### 4.7 Factory helper in `commands/perf.py` + +```python +# commands/perf.py (new helper, ~15 lines) + +def _resolve_ep_monitor( + ep: str, + op_tracing: str | None, + output_dir: Path, +) -> EPMonitor: + """Pick the EPMonitor for the requested EP and optional op-tracing level. + + Raises RuntimeError with a descriptive message when op-tracing is requested + against an EP that has no op-tracing monitor. + """ + if op_tracing: + if ep == "qnn" and QNNMonitor.is_available(): + return QNNMonitor(level=op_tracing, output_dir=output_dir) + raise RuntimeError( + f"Op-tracing not available for EP '{ep}'. Supported: 'qnn'." + ) + # Proof-of-execution monitors (no op-tracing) + if ep == "vitisai" and VitisAIMonitor.is_available(): + return VitisAIMonitor() + return NullEPMonitor() +``` + +No registry. No abstract factory. One function, `if/elif` dispatch on two string args. Extension by adding branches. + +--- + +## 5. 
CLI Integration + +### 5.1 Current code (being replaced) + +`commands/perf.py:1334-1386` — a separate op-tracing block after benchmark. Invokes `QNNProfiler` with its own iteration count and dummy-input generation. + +### 5.2 Replacement code + +The op-tracing block is **deleted**. Op-tracing is integrated into the benchmark's existing `session.perf()` call. + +```python +# inside the main benchmark loop in commands/perf.py + +output_dir = output.parent if output else Path.cwd() + +try: + monitor = _resolve_ep_monitor(ep=config.ep, op_tracing=op_tracing, output_dir=output_dir) +except RuntimeError as e: + console.print(f"[red]Error:[/red] {e}") + raise SystemExit(1) from None + +with ( + session.perf(warmup=config.warmup, monitor=monitor) as ctx, + hw_monitor as hw, +): + _run_monitored_loop(session, inputs, ctx.stats, hw, + total_iterations=total_iterations, ...) + if hw: + self._hw_metrics = hw.to_dict() + +# Post-benchmark: report +if op_tracing: + result = ctx.monitor.result # OpTraceResult (not dict) + if result is None or result.status == "no_data": + console.print("[yellow]Warning:[/yellow] No profiling data produced.") + else: + display_op_trace_report(result, console) + json_path = output_dir / f"{model_slug}_op_trace.json" + write_op_trace_json(result, json_path) + console.print(f"[green]Op-trace saved to:[/green] {json_path}") +``` + +Semantic change: op-tracing now observes the user's actual benchmark iterations rather than a separate synthetic profiling pass. Per **US-5**, this is the preferred behavior. + +### 5.3 Hard-fail on unsupported op-tracing + +If `--op-tracing basic` is requested against `--ep dml` (which has no op-tracing monitor), `_resolve_ep_monitor` raises `RuntimeError` with a descriptive message and the CLI exits with code 1. **No silent fallback** per NFR-2. + +--- + +## 6. 
Configuration / Data Structures + +### 6.1 Session options merge order + +Inside `WinMLSession.perf().__enter__`, session config entries are merged into `self._active_session_option_entries` in this order: + +1. **User-configured** (via `WinMLSession(...)` ctor, not mentioned in current code): base. +2. **Monitor contribution** via `mon.get_session_options()`: overrides #1. + +Applied in `_build_session_options()` via `opts.add_session_config_entry(k, v)` for each (k, v) pair. + +### 6.2 Provider options merge order + +Inside `WinMLSession.perf().__enter__`, provider options are merged into `self._provider_options` in this order: + +1. **User-configured** (via `WinMLSession(..., ep_config=EPConfig(provider_options=...))`): base. +2. **Monitor contribution** via `mon.get_provider_options()`: overrides #1. +3. **Monitor-enforced keys** (inside `QNNMonitor.get_provider_options` after all merges): specifically `profiling_level` and `profiling_file_path` — owned by the monitor, never user-overridable. + +Implementation detail: `QNNMonitor.get_provider_options()` builds the dict in layers with explicit key assignment at the end (not a duplicate-key dict literal, which would trigger Ruff `F601`): + +```python +opts = {...defaults...} +opts.update(self._extra) +opts["profiling_level"] = ... # LAST — cannot be overridden via extra +opts["profiling_file_path"] = ... +return opts +``` + +### 6.3 Restoration on exit + +Both `self._active_session_option_entries` and `self._provider_options` are saved at `perf().__enter__` and restored at `perf().__exit__`, regardless of whether the caller body raised an exception. + +--- + +## 7. 
Error Handling + +### 7.1 Exception types + +| Exception | Raised when | Caught by | +|-----------|-------------|-----------| +| `WinMLSession.CompilationError` | ORT session creation fails (existing behavior) | `commands/perf.py` CLI wrapper | +| `RuntimeError("QNNMonitor already entered")` | `monitor.__enter__` called twice | propagates; it's a programmer bug | +| `RuntimeError("session.perf() already active ...")` | Nested `perf()` | propagates; programmer bug | +| `RuntimeError("Op-tracing not available for EP '{ep}'. Supported: 'qnn'.")` | `_resolve_ep_monitor` called with unsupported `(ep, op_tracing)` pair | CLI exits with code 1 | + +### 7.2 Failure paths + +| Failure | Detected where | Behavior | +|---------|-----------------|----------| +| QNN EP not available (neither ORT variant has it) | `QNNMonitor.is_available()` → False | `_resolve_ep_monitor` raises descriptive `RuntimeError`. CLI exits 1. | +| Session compile fails with QNN options | `ort.InferenceSession(...)` raises inside `compile()` | Translated to `CompilationError` per existing `session.py:303-314`. Monitor `__exit__` still runs, sees no CSV, produces `status="no_data"`. | +| CSV missing after teardown | `QNNMonitor._parse_artifacts()` | `OpTraceResult(status="no_data", artifacts={})`. Logged at WARNING. Not an exception. | +| CSV parse error | `_parse_artifacts()` raises | Caught in `__exit__`; produces `OpTraceResult(status="parse_failed", error=msg)`. Logged at WARNING. Does not suppress caller exception. | +| QHAS viewer not found (detail mode) | `run_qhas_viewer()` raises / returns None | Fall back to basic CSV parsing. `OpTraceResult.status = "basic_fallback"`. Logged at WARNING. | +| Auto-reset fires | `perf().__enter__` | `logger.warning("auto-resetting ...")` (per NFR-3). Proceeds normally. | +| `__enter__` twice | `QNNMonitor.__enter__` | `RuntimeError("QNNMonitor already entered")`. | +| Windows CSV file-handle lag | `_parse_artifacts()` on first attempt | Retry once with `time.sleep(0.05)`. 
If it still fails → `status="parse_failed"`. | + +### 7.3 Exception transparency (NFR-5) + +`perf().__exit__` uses a nested `try/finally` pattern that: + +1. Captures `sys.exc_info()` at entry (may be a live caller exception). +2. Performs session teardown in an inner `try/finally`. +3. Calls `monitor.__exit__(*exc_info)` so the monitor knows about any active exception. +4. Never returns `True` → never suppresses the caller's exception. +5. Restores saved options even if `monitor.__exit__` raises. + +--- + +## 8. Testing Strategy + +### 8.1 Test file migration + +See PRD §10.5 for the full migration table. Summary: + +- `tests/unit/optracing/*.py` → migrate files to `tests/unit/session/monitor/` and `tests/unit/commands/`. +- `tests/unit/optracing/fixtures/` (contains `optrace_resnet50.csv`, `qhas_resnet50.json`) → move to `tests/unit/session/monitor/qnn/fixtures/` so parsers' unit tests remain functional. +- Delete: `test_registry.py` (registry is removed), `test_qnn_profiler.py` (class is deleted; replaced by `test_qnn_monitor.py`). +- Redirect test imports: from `winml.modelkit.optracing.*` → `winml.modelkit.session.monitor.*` / `winml.modelkit.session.monitor.qnn.*`. + +### 8.2 Unit tests (per class) + +| Test | Asserts | +|------|---------| +| `test_ep_monitor_base.py::test_defaults` | `EPMonitor` subclasses with no overrides get `get_session_options() == {}`, `get_provider_options() == {}`, `requires_session_teardown == False`. | +| `test_ep_monitor_base.py::test_double_entry_guard` | Calling `monitor.__enter__()` twice on a concrete `QNNMonitor` raises `RuntimeError`. | +| `test_qnn_monitor.py::test_is_available_bundled` | Mocked `onnxruntime-qnn` → `is_available()` returns True. | +| `test_qnn_monitor.py::test_is_available_winml` | Mocked WinML registration + `get_ep_devices` → True. | +| `test_qnn_monitor.py::test_is_available_neither` | Both paths miss → False. | +| `test_qnn_monitor.py::test_get_provider_options_idempotent` | Two calls return equal dicts. 
| +| `test_qnn_monitor.py::test_profiling_keys_not_overridable` | `extra_provider_options={"profiling_level":"off"}` ignored; owner-enforced value wins. | +| `test_qnn_monitor.py::test_exit_no_csv` | `__exit__` with no CSV produces `OpTraceResult.status == "no_data"`. | +| `test_qnn_monitor.py::test_exit_parse_failure` | Corrupt CSV → `status == "parse_failed"`, `error` populated. | +| `test_op_metrics.py::test_to_dict_schema` | `OpTraceResult.to_dict()` has required keys (`ep`, `device`, `operators`, `summary`, `artifacts`, `num_samples`, `status`). | +| `test_ep_registry.py::test_ensure_initialized_idempotent` | Calling twice no-ops; logs on first call only. | + +### 8.3 Integration tests + +| Test | Asserts | +|------|---------| +| `test_perf_monitor_integration.py::test_teardown_ordering` | With a `FakeMonitor(requires_session_teardown=True)`, during `monitor.__exit__`, `session._session is None`. | +| `test_perf_monitor_integration.py::test_null_monitor_no_reset` | `session.perf()` with no monitor does NOT reset a compiled session. | +| `test_perf_auto_reset.py::test_auto_reset_fires_on_option_diff` | With a compiled session and a monitor that contributes options, `__enter__` logs WARNING and `session._session` becomes None. | +| `test_perf_auto_reset.py::test_auto_reset_restores_on_exit` | After `perf()` exit, `self._provider_options` is restored to pre-entry state. | +| `test_perf_monitor_integration.py::test_exception_transparency` | Caller exception in `with` body propagates; `monitor.__exit__` called with correct `exc_info`. | +| `test_perf_monitor_integration.py::test_nested_perf_forbidden` | Second `session.perf()` inside first raises `RuntimeError`. | + +### 8.4 CLI / E2E tests (hardware-gated) + +- `test_perf_optracing.py::test_cli_op_tracing_basic_on_qnn` (skip if no QNN NPU): runs `wmk perf -m resnet50 --device npu --op-tracing basic`, asserts CSV produced, `*_op_trace.json` written, at least one operator entry. 
+- `test_perf_optracing.py::test_cli_op_tracing_unsupported_ep` (no hardware needed): `--ep dml --op-tracing basic` exits with code 1 and descriptive message. + +--- + +## 9. Integration Points + +### 9.1 Downstream consumers + +- `commands/perf.py` — uses `_resolve_ep_monitor` + `session.perf(monitor=...)`. +- `display_op_trace_report(result: OpTraceResult, console)` — unchanged; consumes `ctx.monitor.result`. +- `write_op_trace_json(result: OpTraceResult, path)` — unchanged; consumes `ctx.monitor.result`. + +### 9.2 Upstream dependencies + +- `session/ep_registry.py::ensure_initialized()` — **new module-level function** added to existing `ep_registry.py`; wraps `WinMLEPRegistry.get_instance().register_to_ort()` behind an idempotent single-call entry point. Called by `QNNMonitor.is_available()` AND by `WinMLSession.__init__`. Replaces the existing classmethod `WinMLSession._init_winml_eps_once`, which is deleted. +- `WinMLSession.reset()` — called by `perf().__exit__` for `requires_session_teardown` monitors. +- Import redirect in `commands/perf.py`: `from ..optracing import display_op_trace_report, write_op_trace_json, OpTraceResult` → `from ..session.monitor.report import display_op_trace_report, write_op_trace_json` and `from ..session.monitor.op_metrics import OpTraceResult`. Remove imports of `is_qnn_profiling_available` and `get_tracer` (both deleted). + +### 9.3 How future EP monitors plug in + +Adding DMLMonitor (hypothetical): + +1. Create `session/monitor/dml_monitor.py` with `class DMLMonitor(EPMonitor)`. +2. Override `is_available()` to check for DML EP. +3. Override `get_provider_options()` if DML profiling requires config (e.g., `"dml_profiling_enabled": "1"`). +4. Override `requires_session_teardown` if DML's profiling data flush needs it. +5. Add one `elif` branch in `commands/perf.py::_resolve_ep_monitor`. + +No registry changes. No cross-file wiring. + +--- + +## 10. 
Future Work + +- **FW-1** Extract the auto-reset behavior to a reusable policy object when a second monitor type needs different reset semantics (YAGNI now). +- **FW-2** Investigate QNN SDK's support for absolute `*_schematic.bin` paths; if supported, eliminate the glob-fallback path in `_parse_artifacts` (OQ-1 in PRD). +- **FW-3** Multi-monitor support (`monitors=[...]`). Requires redesigning teardown ordering (see architect review). Out of scope per C-4. +- **FW-4** Schema versioning on `OpTraceResult.to_dict()` output. Consider if report writers need forward compatibility (OQ-2 in PRD). + +--- + +## 11. Revision History + +| Version | Date | Change | +|---------|------|--------| +| 1.0 | 2026-04-17 | Initial `2_coreloop.md`. Captured architecture from iterations 01-11. | +| 2.0 | 2026-04-19 | Restructured per `docs/standards/design-doc-spec.md` v1.0. Added metadata header, §0 Related Documents, §0.5 I/O Dependencies. Applied user directives: dual `get_session_options` + `get_provider_options` hooks; preserve `OpTraceResult.to_dict()` (not plain dict); `os.chdir` removed (use absolute paths + glob fallback); `generate_dummy_inputs` removed entirely; singular `monitor=`; factory dispatch replaces registry. Applied critic/architect findings: no duplicate dict keys (explicit pop-then-set); add `ep_registry.ensure_initialized` to remove reverse coupling; auto-reset at WARNING (not INFO); `gc.collect` + retry for Windows file-handle lag; exception transparency via `sys.exc_info()` capture; load-bearing teardown ordering made explicit with integration test. 
| +| 2.1 | 2026-04-19 | Post-audit fixes: added Table of Contents; corrected §4.6 to acknowledge `OpTraceResult.to_dict()` already exists at `optracing/result.py:79-95` and the refactor preserves its nested schema (adds `status`/`error` as additive top-level keys, keeps `metadata`/`summary`/`operators`/`statistics`/`artifacts`); clarified in §0.5.1 and §4.3 that `ensure_initialized()` is a NEW function added to the existing `ep_registry.py`; added `fixtures/` migration to §8.1; documented `commands/perf.py` import-path redirects in §9.2. | +| 2.2 | 2026-04-24 | Relocated from docs/design/optracing/ to docs/design/session/monitor/ per spec §1.5.1 transitional commitment (implementation complete). Removed Transitional Location note. | diff --git a/docs/design/session/monitor/iterations/01.md b/docs/design/session/monitor/iterations/01.md new file mode 100644 index 000000000..15296fab7 --- /dev/null +++ b/docs/design/session/monitor/iterations/01.md @@ -0,0 +1,53 @@ +# Iteration 01 — Problem: QNNProfiler fails with onnxruntime-windowsml + +## Entry point + +Learn from the companion investigation on branch `232`: +- `docs/design/perf/qnn_ep_profiling_investigation.md` +- `temp/prove_qnn_ep_profiling.py` + +Goal: eliminate the `onnxruntime-qnn` dependency for op-tracing while keeping per-operator cycle counts working. + +## The bug + +`QNNProfiler.run()` (src/winml/modelkit/optracing/qnn/profiler.py:129-134) creates its ORT session via: + +```python +session = ort.InferenceSession( + str(self.onnx_path), + providers=["QNNExecutionProvider"], + provider_options=provider_options, +) +``` + +With `onnxruntime-windowsml`, the QNN DLL lives at `C:\Program Files\WindowsApps\...` — registered via WinML, not bundled in `capi/`. The explicit-providers path calls `ProviderLibrary::Load()` → `GetRuntimePath() + "onnxruntime_providers_qnn.dll"`, finds nothing, and **silently falls back to CPU**. No profiling data is produced. 
+ +The availability check (optracing/__init__.py:20) also checks `ort.get_available_providers()` only, which lists bundled EPs — so `is_qnn_profiling_available()` returns True for `onnxruntime-qnn` but False for `onnxruntime-windowsml`, despite QNN being usable via WinML in the latter. + +## The proven fix + +ORT exposes three APIs. The `prove_qnn_ep_profiling.py` script validates all five permutations; only one passes provider options to a WinML-registered QNN EP: + +```python +WinMLSession._init_winml_eps_once() # register QNN EP via WinML catalog +qnn_dev = next(d for d in ort.get_ep_devices() if d.ep_name == "QNNExecutionProvider") +opts = ort.SessionOptions() +opts.add_provider_for_devices([qnn_dev], {"profiling_level": "detailed", "profiling_file_path": csv_path}) +ort.InferenceSession(model_path, sess_options=opts) # QNN EP loads + profiling CSV emitted +``` + +This path is already in `WinMLSession._build_session_options()` (session.py:426-438) when `self._ep` is set — but `QNNProfiler` bypasses `WinMLSession` entirely, so it never benefits. + +## Quirk captured + +`add_provider_for_devices([qnn_dev], {})` with empty options causes QNN EP to fail with error 1003. Passing the profiling options makes it succeed. The profiler must always pass a non-empty options dict. + +## Initial proposed changes (3) + +1. Fix `QNNProfiler.is_available()` to check `get_ep_devices()` after WinML init. +2. Switch session creation in `run()` to `add_provider_for_devices` path (with fallback to explicit providers for `onnxruntime-qnn`). +3. Return `dict` instead of `list[dict]` from `_build_provider_options()` to match the new API signature. + +## Seed for later iterations + +The fix-in-place approach is **minimally invasive**. It does not address a deeper architectural duplication noticed here: `WinMLSession` and `QNNProfiler` both create ORT sessions, yet only `WinMLSession` knows about device policy, EPContext caching, and the `add_provider_for_devices` pattern. 
That observation seeded iteration 02. diff --git a/docs/design/session/monitor/iterations/02.md b/docs/design/session/monitor/iterations/02.md new file mode 100644 index 000000000..c677af12c --- /dev/null +++ b/docs/design/session/monitor/iterations/02.md @@ -0,0 +1,47 @@ +# Iteration 02 — EPMonitor hierarchy and unification question + +## Entry point + +Before committing to the minimal fix from iteration 01, step back: the codebase has `EPMonitor` (session/monitor/) with placeholder `QNNMonitor`, and a separate `OpTracer` hierarchy (optracing/) with `QNNProfiler`. Two QNN-specific classes doing vendor-specific profiling work. Is this duplication? + +## Observations + +Two parallel per-EP hierarchies exist: + +| Hierarchy | Base class | Purpose today | QNN impl | +|-----------|------------|---------------|----------| +| `session/monitor/` | `EPMonitor` (ABC) | Proof-of-execution (xrt-smi, driver telemetry) | `QNNMonitor` is a stub — `is_available()` returns False | +| `optracing/` | `OpTracer` (ABC) | Per-op profiling (CSV parsing, QHAS) | `QNNProfiler` is the real implementation | + +The `QNNMonitor` placeholder docstring literally anticipates this merger: + +> *"Future: Will wrap QAIRT profiling via qnn-profile-viewer.exe for Qualcomm-specific metrics (device execution time, queue wait, per-op traces)."* + +## The asymmetry discovered + +`VitisAIMonitor` (AMD's concrete `EPMonitor`) and `QNNProfiler` operate at fundamentally different lifecycles: + +| Dimension | VitisAIMonitor (observer) | QNNProfiler (configurator) | +|-----------|---------------------------|------------------------------| +| Timing | Wraps an existing session | Must be set **before** session creation | +| Data source | External CLI (`xrt-smi`) | The EP itself (writes CSV) | +| Mechanism | OS counters / subprocess | `provider_options["profiling_level"]` | +| Coexists with normal inference? 
| Yes | No — different EP compile path | +| Session used | Existing WinMLSession | Creates its own (bypasses WinMLSession) | + +**The core insight**: VitisAIMonitor *watches* what the hardware did. QNNProfiler *tells* the hardware to emit data. One is passive, one is active. + +## Three role options posed + +- **(A) Pure observer** — EPMonitor only watches; OpTracer stays as a separate active hierarchy. +- **(B) Vendor integration point** — EPMonitor is the per-EP behavior bag; OpTracer folds in; monitors can contribute `provider_options`. +- **(C) Coexist via shared "EP plugin" interface** — both stay, united by a third abstraction. + +## Decision lean + +Option **B** was recommended because: +1. Two QNN-specific classes for the same vendor is duplication. +2. The `QNNMonitor` docstring already anticipates it. +3. "Monitor" can mean "can contribute session configuration" as well as "can observe." + +But this requires extending `EPMonitor`'s contract — which is the question opened in iteration 03. diff --git a/docs/design/session/monitor/iterations/03.md b/docs/design/session/monitor/iterations/03.md new file mode 100644 index 000000000..ec551a5e9 --- /dev/null +++ b/docs/design/session/monitor/iterations/03.md @@ -0,0 +1,59 @@ +# Iteration 03 — Is OpTracer a kind of op-level monitor? + +## Entry point + +> *"OpTracer can be seen as one kind of 'monitor' right? monitoring on the op-level?"* + +## Side-by-side re-examined + +The passive/active asymmetry (iteration 02) is real, but the **output shapes** of the two hierarchies are similar — both produce *"here's what the EP did during my inference"*: + +- `VitisAIMonitor.to_dict()` → `{"command_submissions": N, ...}` +- `QNNProfiler.run()` → `{"operators": [...], "summary": {...}}` + +Both are "post-inference telemetry for a specific EP." What differs is lifecycle (passive observer vs active configurator) — not the output contract. + +## Can EPMonitor accommodate both lifecycles? 
+ +Yes — by extending the interface with **one** method: + +```python +class EPMonitor(ABC): + def get_provider_options(self) -> dict[str, str]: # NEW + return {} + def __enter__(self) -> Self: ... + def __exit__(self, ...) -> None: ... + def to_dict(self) -> dict: ... + @classmethod + def is_available(cls) -> bool: ... +``` + +VitisAIMonitor returns empty options (unchanged behavior). QNNMonitor returns `{"profiling_level": ..., "profiling_file_path": ...}`. + +## Proposed flow + +```python +ep_monitor = QNNMonitor(level="basic") +session = WinMLSession(..., ep_config=EPConfig(provider_options=ep_monitor.get_provider_options())) + +with session.perf() as stats, ep_monitor as mon: + session.run(inputs) + +print(mon.to_dict()) # per-op cycle counts +``` + +## What this unifies + +- `QNNMonitor(level="basic")` → cycle counts (profiling_level=detailed) +- `QNNMonitor(level="detail")` → QHAS (profiling_level=optrace) +- `QNNMonitor()` (no level) → xrt-smi-equivalent telemetry (future) + +And the entire `optracing/` package could collapse into post-processing helpers (CSV parser, QHAS parser, report writer) — or get absorbed into `session/monitor/qnn/`. + +## Resolution + +OpTracer conceptually IS a kind of monitor — op-level — **if** we accept that "monitor" includes "can contribute session configuration." The interface extension is minimal (one method). Agreed to proceed with this interpretation; moved to how the session integration should look (iteration 04). + +## Open seam + +How does the monitor's `provider_options` get plumbed into the session at the right moment? This is iteration 04's concern. diff --git a/docs/design/session/monitor/iterations/04.md b/docs/design/session/monitor/iterations/04.md new file mode 100644 index 000000000..c5bee74b3 --- /dev/null +++ b/docs/design/session/monitor/iterations/04.md @@ -0,0 +1,69 @@ +# Iteration 04 — Session integration: put monitor under the session? 
+ +## Entry point + +> *"To let monitor impact session, like customize session options/provider options, we need to put monitor method into session, right? Like session.perf()? Can we just reuse perf to init monitoring?"* + +The `session.perf(warmup)` context manager yields `PerfStats`. It's a method on `WinMLSession` because `PerfStats` needs to be reached from inside `session.run()` to time each iteration. The pattern is: **observers that need session internals belong as session methods**. + +## The clean test proposed + +An observer that needs to influence session internals → belongs on the session. +An observer that is purely external → stays standalone. + +Applying this test: + +| Observer | Needs session internals? | Belongs where | +|----------|--------------------------|---------------| +| `PerfStats` | Yes (wraps `run()`) | `session.perf()` | +| `EPMonitor` (generally) | Maybe (QNN needs provider_options; VitisAI doesn't) | Session method | +| `HWMonitor` | No (PDH counters, process-wide) | Standalone | + +## Why EPMonitor needs session integration + +For QNN op-tracing, `provider_options["profiling_level"]` MUST be set **before** `compile()` creates the ORT session. That's an internal integration point. Same reason `PerfStats` is on the session. + +For VitisAI, no integration is needed — but giving it a session-bound entry makes the API uniform across monitors. VitisAI's `get_provider_options()` just returns `{}`. + +## Why HWMonitor stays separate + +| Property | `PerfStats` / `EPMonitor` | `HWMonitor` | +|----------|---------------------------|-------------| +| Scope | Per session, per inference window | Process-wide | +| Used without a session? | No | Yes (`wmk perf --monitor` with no model) | +| Needs session internals? | Yes | No — pure PDH counters | +| Vendor-specific? | Yes (EPMonitor) | No | + +Forcing HWMonitor into `session.hw_monitor()` would be artificial coupling and break the existing standalone use case. 
+ +## Proposed API (first draft) + +```python +@contextmanager +def monitor(self, ep_monitor: EPMonitor): + extra_opts = ep_monitor.get_provider_options() + if extra_opts and self._session is not None: + raise RuntimeError("Call session.monitor() before first run().") + saved = dict(self._provider_options) + self._provider_options = {**saved, **extra_opts} + with ep_monitor: + try: + yield ep_monitor + finally: + self._provider_options = saved +``` + +And benchmark code becomes: + +```python +with ( + session.perf(warmup=...) as stats, + session.monitor(ep_monitor) as ep_mon, # ← session-bound + hw_monitor as hw, +): + _run_monitored_loop(session, ...) +``` + +## Seam for iteration 05 + +Two session-bound context managers (`perf` and `monitor`) plus one standalone (`HWMonitor`). They are always used together in practice. Can we merge `perf` and `monitor` into one? diff --git a/docs/design/session/monitor/iterations/05.md b/docs/design/session/monitor/iterations/05.md new file mode 100644 index 000000000..b4e06e198 --- /dev/null +++ b/docs/design/session/monitor/iterations/05.md @@ -0,0 +1,60 @@ +# Iteration 05 — Merge `session.monitor()` into `session.perf()` + +## Entry point + +> *"Can we merge monitor and perf function? Just use perf and pass monitor config into it?"* + +Observation: in the actual benchmark code (`perf.py:417-421`), `session.perf()` and the EP monitor always appear together in the same `with` block. When two context managers always coexist, merging them reduces cognitive load. + +## Proposed signature + +```python +session.perf(warmup: int = 0, monitor: EPMonitor | None = None) -> PerfContext +``` + +## Three yield-shape options considered + +- **α — `PerfContext` dataclass**: `@dataclass class PerfContext: stats: PerfStats; monitor: EPMonitor`. Each observer retains its own identity. +- **β — Fold monitor into `PerfStats`**: backward-compatible (every `perf() as stats` still works), but conflates timing with hardware telemetry. 
+- **γ — Tuple `(stats, monitor)`**: rejected — positional unpacking awkward with `None`. + +## Decision: **Option α (PerfContext)** + +- Explicit aggregator; each observer is a separate named field. +- Extensible — future `ctx.hw_monitor`, `ctx.qhas_report`, etc. without bloating `PerfStats`. +- `monitor=None` defaults to `NullEPMonitor()` so the `with` block remains uniform. + +Rejected **β** because `PerfStats` is about *latency samples*. Bolting EP telemetry onto it muddies that abstraction. + +## HWMonitor decision + +HWMonitor stays **standalone** (confirmed from iteration 04). Benchmark code shrinks from three to two `with` blocks, each with a clear axis: + +```python +# Before (three with-blocks) +with ( + session.perf(warmup=...) as stats, + hw_monitor as hw, + ep_monitor as ep_mon, +): + ... + +# After (two with-blocks) +with ( + session.perf(warmup=..., monitor=ep_monitor) as ctx, + hw_monitor as hw, +): + ... +``` + +`ctx.stats` replaces `stats`. `ctx.monitor` replaces `ep_mon`. HWMonitor unchanged. + +## Implication for QNNProfiler + +With this merge, the entire `optracing/` package has no reason to exist as a parallel hierarchy. The `QNNProfiler` class folds into a real `QNNMonitor`. The CSV parser, QHAS parser, report writer become post-processing helpers under `session/monitor/qnn/` (or remain as `optracing/` helpers imported by the monitor). + +## Open questions after this iteration + +1. What are the full semantics of `QNNMonitor` in this unified design? — iteration 06 +2. Who creates the ORT session now, and what does the monitor contribute? — iteration 07 +3. What is EPMonitor's contract, precisely? 
— iteration 08 diff --git a/docs/design/session/monitor/iterations/06.md b/docs/design/session/monitor/iterations/06.md new file mode 100644 index 000000000..61daea166 --- /dev/null +++ b/docs/design/session/monitor/iterations/06.md @@ -0,0 +1,77 @@ +# Iteration 06 — QNNMonitor key points (27 decisions) + +## Entry point + +> *"Can you try design the new QNNMonitor? List all key points first."* + +## Role and responsibilities (1-3) + +1. **Single per-EP class for all QNN-specific behavior.** Replaces `QNNProfiler`. Lives at `session/monitor/qnn_monitor.py`. +2. **Two concerns**: *before* session compile — contribute `provider_options`; *after* session teardown — parse CSV/QHAS. +3. **Observer, not session owner.** Attached to an existing `WinMLSession` rather than creating its own `ort.InferenceSession`. + +## Interface extension on EPMonitor (4-5) + +4. Add `get_provider_options(self) -> dict[str, str]` to `EPMonitor`; default `{}`. +5. Add `requires_session_teardown: ClassVar[bool] = False`. QNN sets True (CSV only flushes on `ort.InferenceSession` destruction). + +## Configuration (6-9) + +6. Constructor takes `level: Literal["basic", "detail"]` (default `"basic"`). Basic = `profiling_level="detailed"` (CSV). Detail = `profiling_level="optrace"` (QHAS). +7. `output_dir: Path | None` — destination for CSV/schematic/QHAS; `None` → temp dir. +8. `extra_provider_options: dict | None` — user overrides. +9. Merge order: defaults (monitor) → user `session(provider_options=...)` → `extra_provider_options` (highest). **But `profiling_level` and `profiling_file_path` are never user-overridable** — the monitor owns them. + +## Session integration (10-13) + +10. Attached via `session.perf(monitor=QNNMonitor(...))` (per iteration 05). +11. Compile-timing guard: raise `RuntimeError` if session already compiled when non-empty options arrive. (**Revised in iteration 09** to auto-reset instead.) +12. 
CWD handling: change CWD to `output_dir` on `__enter__` for `schematic.bin` output; restore on `__exit__`. +13. EPContext interaction: CSV still works on cached `_ctx.onnx`; `schematic.bin` requires fresh compile for `detail` mode. Document as a limitation. + +## Lifecycle (14-16) + +14. `__enter__`: change CWD; options injected earlier via `get_provider_options()`. +15. `__exit__`: restore CWD, parse CSV (and QHAS if `detail`), cache parsed data for `to_dict()`. +16. Teardown ordering inside `session.perf().__exit__`: stop PerfStats → (if `requires_session_teardown`) `self.reset()` to flush CSV → `monitor.__exit__()` to parse → restore options. + +## Output (17-18) + +17. `to_dict()` schema: + + ```python + { + "ep": "QNN", "device": "NPU", "level": "basic" | "detail", + "operators": [...], "summary": {...}, "artifacts": {...}, "num_samples": N, + } + ``` + +18. `OpTraceResult` / `OperatorMetrics` dataclasses preserved in a post-processing helper module — zero disruption for `display_op_trace_report` / `write_op_trace_json`. + +## Availability (19) + +19. `is_available()` accepts both ORT distributions: + + - `onnxruntime-qnn`: `"QNNExecutionProvider" in ort.get_available_providers()` + - `onnxruntime-windowsml`: after `WinMLSession._init_winml_eps_once()`, check `ort.get_ep_devices()` + +## Error handling (20-22) + +20. No CSV after teardown → `to_dict()` returns `status="no_data"`, warns; does not crash. +21. `detail` mode with no QNN SDK viewer → fall back to CSV parsing with warning. +22. `add_provider_for_devices` path not available → raise at `session.perf()` entry, not deep inside inference. + +## File layout (23-25) + +23. Delete: `optracing/qnn/profiler.py` (QNNProfiler), `optracing/base.py` (OpTracer), `optracing/registry.py` (tracer registry). +24. Move: `optracing/qnn/{csv_parser,qhas_parser,viewer}.py` → `session/monitor/qnn/`; `optracing/result.py` → `session/monitor/op_metrics.py`; `optracing/report.py` → `session/monitor/report.py`. +25. 
Callers stay: `perf.py` op-tracing path simplified — pass `QNNMonitor(level=op_tracing)` to `session.perf(monitor=...)`. + +## Documentation cleanup (26-27) + +26. Rewrite `QNNMonitor` docstring (remove "placeholder" language). +27. Remove `EPMonitor` / `OpTracer` dual-ABC confusion by deleting `OpTracer`. + +## Outcome + +27 concrete decisions recorded. Iterations 07-10 then stress-tested these against architectural scrutiny (pattern names, session ownership, simplification, critic review). diff --git a/docs/design/session/monitor/iterations/07.md b/docs/design/session/monitor/iterations/07.md new file mode 100644 index 000000000..b6114aed1 --- /dev/null +++ b/docs/design/session/monitor/iterations/07.md @@ -0,0 +1,71 @@ +# Iteration 07 — Why on the base class? And what design pattern is this? + +## Entry points + +1. *"Why `get_provider_options(self) -> dict[str, str]` on EPMonitor — what for? Why on the base class?"* +2. *"How does `WinMLSession.perf` allow monitor to decorate/adapt the ort session? This is a design pattern, right — what's the name?"* + +## Q1 — Placement of `get_provider_options()` + +### The landscape + +| Monitor | Needs session config? | Returns | +|---------|------------------------|---------| +| `VitisAIMonitor` | No — xrt-smi is external | `{}` | +| `OpenVinoMonitor` | No (stub) | `{}` | +| `QNNMonitor` | Yes — `profiling_level`, `profiling_file_path` | real dict | + +Putting it on the ABC means 2 of 3 classes return `{}`. Is that justified? + +### Two lenses + +- **Interface-first**: EPMonitor's job is "per-EP observation behavior" — and *part* of that is "what does this EP need configured to yield data?" Default `{}` is fine — it's exercising a capability the monitor doesn't need. +- **ISP-first** (Interface Segregation Principle): clients (VitisAI) shouldn't depend on methods they don't use. Extract a `ProviderOptionsContributor` Protocol; only monitors that need it implement it. 
+ +### Three concrete design options + +- **(a) On the ABC, default `return {}`** — one method, two no-op classes, all monitors look alike. +- **(b) Separate Protocol (PEP 544)** — `isinstance(mon, ProviderOptionsContributor)` (requires `@runtime_checkable` on the Protocol); discoverable via grep. +- **(c) Duck typing with `getattr`** — fragile, invisible to type checkers; rejected. + +### Initial recommendation + +**Option (b) Protocol** — makes the capability explicit and searchable; mirrors the standard library's capability protocols (e.g. `typing.SupportsIndex`, or `io.Reader`/`io.Writer` in Python 3.14+). Protocols are also more extensible (new capability = new Protocol, not bloat on the base ABC). + +**Later revised to (a) in iteration 10** after critic review — see iteration 10. + +## Q2 — Design pattern name + +### Patterns that could apply + +| Piece | Pattern | Why | +|-------|---------|-----| +| Session has a fixed algorithm with call-out points | **Template Method** | Session owns skeleton (build opts → compile → run → teardown); hook points call monitor. | +| Monitor contributes behavior via an interface | **Strategy** (loosely) | Monitor is the "profiling strategy"; swap monitors = swap strategy. | +| Monitor augments rather than replaces session behavior | **Hook / Plugin** | Most accurate name in modern usage (Django, Flask, pytest). | +| Session doesn't know concrete monitor types | **Dependency Injection** | Monitor injected via `perf(monitor=...)`. | +| Monitor watches inference events | **Observer** | Context-managed observer with `__enter__`/`__exit__`. | + +### Single best name + +**Hook-based Plugin pattern with Observer semantics.** + +In practice: + +- Python community: "pluggable context manager" or "plugin with hooks" +- GoF community: "Template Method with Strategy-attached Observer" +- Frameworks: "hooks" (Django signals, pytest plugins, Rails callbacks) + +### What this is NOT + +- **Not Decorator** — we aren't wrapping the ORT session to add behavior. +- **Not Adapter** — not converting interfaces. 
+ +### Why the name matters + +Naming the pattern correctly makes the contract explicit. "Hook/Plugin" tells readers: (1) session stays in control of the algorithm, (2) monitor is optional, (3) monitors can't break session invariants, (4) multiple monitors could chain later. Calling it "decorator" would imply the monitor wraps the session, which isn't what this design is. + +## Outcome + +- Initially recommended Protocol for `get_provider_options()` — revised later. +- Design pattern labeled as **Hook-based Plugin + Template Method + Observer** — this framing is kept. diff --git a/docs/design/session/monitor/iterations/08.md b/docs/design/session/monitor/iterations/08.md new file mode 100644 index 000000000..f492f79b6 --- /dev/null +++ b/docs/design/session/monitor/iterations/08.md @@ -0,0 +1,65 @@ +# Iteration 08 — Who creates the ORT session? + +## Entry point + +> *"If WinMLSession manages all ort sessions, what are the EPMonitor responsibilities? ... Who do you think should be responsible to create the ort session?"* + +## Four candidates + +| Option | Who creates `ort.InferenceSession` | Where monitor config goes | +|--------|-------------------------------------|-------------------------------| +| **A** | `WinMLSession.compile()` | Monitor contributes via hook; session merges and creates | +| **B** | `QNNMonitor` creates its own (status quo, QNNProfiler) | Monitor owns everything; bypasses WinMLSession | +| **C** | Shared `SessionBuilder` factory | Separate "how to build" from "how to use" | +| **D** | WinMLSession owns it, delegates to internal helper | A with internal refactor | + +## Decision: Option A — WinMLSession owns ORT session creation + +### The responsibility test + +WinMLSession is **already** the owner of: + +- Device policy resolution (`PREFER_NPU` → concrete EP) +- EPContext lifecycle (cache lookup, compile, persist) +- Input validation and dtype enforcement +- Error translation into `WinMLSessionError` subclasses +- Instance-level 
`SessionOptions` management + +None of this belongs in a per-EP monitor. If the monitor creates the session, it either duplicates all of the above or skips it — which is exactly how today's `QNNProfiler` ended up with a broken EP-discovery path. + +### The principle + +*"The session owns the session; the monitor informs the session."* + +### Template Method connection + +Template Method + Hook = the owner of the algorithm owns the skeleton. WinMLSession owns the compile→run→teardown skeleton. Therefore WinMLSession MUST create the ORT session. Otherwise the pattern inverts — monitor owns the algorithm, session becomes a thin post-creation wrapper. That would be architecturally worse. + +### Flow + +``` +WinMLSession.compile(): # template method + 1. Resolve device (auto → qnn) ← session's own concern + 2. Build SessionOptions ← session's own concern + 3. Find OrtEpDevice ← session's own concern + 4. extra = monitor.get_provider_options() ← HOOK + merged_opts = user_opts ∪ extra + 5. opts.add_provider_for_devices([dev], merged_opts) + 6. ort.InferenceSession(...) ← session creates ORT + 7. (on exit) if monitor.requires_session_teardown: + self.reset() ← HOOK +``` + +Steps 4 and 7 are the ONLY places where monitor influences the algorithm. + +## Why not option C (SessionBuilder)? + +YAGNI: after the refactor, nobody else creates ORT sessions in the modelkit. Extracting `SessionBuilder` is infrastructure for a second caller that doesn't exist. If a real need emerges later (e.g., `wmk inspect` wanting a custom policy), extract then. + +## The awkward case raised (resolved in iteration 09) + +Can the same `WinMLSession` be used for both a normal benchmark and a profiling pass? First answer: "call `session.reset()` between passes." User pushed back in iteration 09 — see there for resolution. + +## Outcome + +WinMLSession owns ORT session creation. Monitors never create ORT sessions. 
Monitors contribute via two hooks: `get_provider_options()` (pre-compile) and `requires_session_teardown` (exit). Everything else is session's domain. diff --git a/docs/design/session/monitor/iterations/09.md b/docs/design/session/monitor/iterations/09.md new file mode 100644 index 000000000..1a4f81e54 --- /dev/null +++ b/docs/design/session/monitor/iterations/09.md @@ -0,0 +1,79 @@ +# Iteration 09 — Auto-reset and the six responsibilities + +## Entry points + +1. *"Why need explicit `session.reset()`? Why can ctx.__exit__ not auto-restore session?"* +2. *"If WinMLSession manages all ort sessions, what are the EPMonitor responsibilities? We need a clear definition."* + +## Q1 — Auto-reset replaces the explicit rule + +Iteration 08 proposed: "pass 1 done → user calls `session.reset()` → pass 2 with monitor works." User pushed back: why not auto? + +### Corrected lifecycle + +``` +session.perf(monitor=mon).__enter__: + extra = mon.get_provider_options() + if extra and session is compiled: + self.reset() # ← AUTO-RESET, no RuntimeError + self._provider_options = {**user_opts, **extra} + mon.__enter__() + +session.perf(monitor=mon).__exit__: + mon.__exit__() + if mon.requires_session_teardown: + self.reset() # flush CSV, drop ORT session + restore saved options +``` + +### Why auto-reset is correct, not just convenient + +1. No useful information is lost — on-disk EPContext cache survives; only the in-memory `ort.InferenceSession` is dropped. Recompile can be near-instant when cache hits. +2. The two passes are genuinely incompatible — pass 2's provider_options differ from pass 1's. Preserving pass 1's ORT session accomplishes nothing. +3. Forcing explicit `reset()` leaks an implementation detail. +4. New `perf()` = "start fresh observations." Auto-reset aligns with that intent. + +### Symmetry + +- `__enter__` auto-resets *if needed for the monitor's options to apply*. +- `__exit__` auto-resets *if the monitor requires teardown to flush data*. 
+ +Both are internal housekeeping triggered by the monitor's declared needs. User never thinks about ORT sessions. + +## Q2 — EPMonitor's six responsibilities (first draft) + +### Mandatory + +1. **Observe inference lifecycle** — implement `__enter__` / `__exit__`. Examples: VitisAI snapshots xrt-smi; QNN changes CWD. +2. **Advertise availability** — `is_available()` classmethod. Returns True only when EP is discoverable AND monitor's infra is usable. +3. **Expose structured results** — `to_dict() -> dict[str, Any]`. + +### Optional + +4. **Contribute session configuration** — via Protocol `ProviderOptionsContributor`, `get_provider_options() -> dict`. Only QNN today. +5. **Require session teardown** — `requires_session_teardown: ClassVar[bool] = True`. Only QNN today. + +### Behavioral + +6. **Remain EP-specific** — encode vendor knowledge (CSV, xrt-smi, QHAS). + +### What EPMonitor is NOT responsible for + +| Concern | Lives where | +|---------|-------------| +| Create / own `ort.InferenceSession` | WinMLSession | +| Run inference, handle I/O dtypes | WinMLSession | +| Measure inference latency | PerfStats | +| System-wide CPU/RAM/NPU% | HWMonitor | +| Display reports / write JSON | report layer | +| Decide when to attach | Caller (perf command, user code) | +| Know about other EPMonitors | — (no cross-EP coupling) | + +### One-liner mental model + +> An `EPMonitor` is a per-EP observer attached to a `WinMLSession` for the duration of an inference window. It optionally contributes provider options needed for its own telemetry, and optionally requires session teardown to flush its data. Its output is a vendor-specific structured summary of what the EP did during inference. + +## Outcome + +- Auto-reset accepted; explicit `session.reset()` removed from user-facing requirements. +- Six responsibilities enumerated — **but flagged for critic review** (next iteration). 
diff --git a/docs/design/session/monitor/iterations/10.md b/docs/design/session/monitor/iterations/10.md new file mode 100644 index 000000000..54c3764ac --- /dev/null +++ b/docs/design/session/monitor/iterations/10.md @@ -0,0 +1,94 @@ +# Iteration 10 — Critic agent review of the six responsibilities + +## Entry point + +> *"Leverage agent, review above design, criticize the 'EPMonitor: the six responsibilities'"* + +Dispatched `oh-my-claudecode:critic` with a detailed brief covering the current codebase, the problem, the proposed design, and 10 specific questions. + +## Verdict: REVISE + +### Critical findings (must address) + +**1. Monitor becomes a session-invalidating actor.** The proposal says monitors are NOT responsible for session ownership, then gives `get_provider_options()` the power to trigger `session.reset()` inside `perf().__enter__`. This is session ownership by proxy. Today `WinMLSession.compile()` is idempotent (session.py:231-233); the proposal breaks that invariant. The "no RuntimeError" decision is particularly dangerous — silent reset is harder to debug than loud refusal. + +**2. QNNProfiler is not decomposable into EPMonitor without losing value.** `QNNProfiler.run()` (profiler.py:108-150) owns its own dummy inputs, iteration count, and inference loop. `session.perf(monitor=)` hands those to the caller. This flips observation vs. orchestration. + +### Major findings + +**3. "#6 Remain EP-specific" is a design constraint, not a responsibility.** Inflates the count. Should be separated. + +**4. Protocol + ClassVar inconsistency is unjustified.** `get_provider_options()` as Protocol but `requires_session_teardown` as ClassVar is two different mechanisms for "optional." The ClassVar path uses `getattr(...)` — invisible to type checkers. + +### Minor findings + +5. `to_dict()` has no schema contract — "ep" key is implicit. +6. `is_available()` is classmethod on EPMonitor but instance-level on OpTracer. +7. Double `__enter__` unguarded. 
+ +### Missing contracts + +- Idempotency for `get_provider_options()` (must not produce fresh temp paths per call). +- Error semantics for CSV parsing in `__exit__` (log-and-swallow vs re-raise). +- Lifecycle ordering in `perf().__exit__` (load-bearing: reset first, parse second). +- Migration path for `perf.py:1334-1386` (currently runs profiler as a separate step after benchmark). + +### Open question + +What if two monitors need to contribute provider options? Singular `monitor=` signature is already too narrow for that future. + +## Response and revisions + +### Accepted + +- **#3 and #6** ("EP-specific") — moved to a separate "Design constraints" section. Responsibility list becomes **4 mandatory + 2 optional = 6 explicit members** (not 7). +- **#4 Protocol+ClassVar** — unified. **Both defaults on the ABC**: + + ```python + class EPMonitor(ABC): + requires_session_teardown: ClassVar[bool] = False + def get_provider_options(self) -> dict[str, str]: return {} + # + mandatory abstractmethods + ``` + + Drop the Protocol. Every monitor has the same interface. +- **#5 Schema contract** — define `TypedDict` or document required keys (`ep`, `device`, monitor-specific fields). +- **#6 `is_available()` inconsistency** — keep classmethod everywhere. +- **Missing contracts** — explicit in the final spec: + - `get_provider_options()` must be idempotent (produce paths at `__init__`, not at call time) + - Failed CSV parsing sets `status="parse_failed"` in `to_dict()`, does not suppress caller exceptions + - `perf().__exit__` order is **load-bearing and documented**: reset → parse → restore + +### Partial pushback + +- **#1 Auto-reset** — accept the risk, but not the critic's fix. Keep auto-reset; add `logger.info("session.perf(): auto-resetting compiled session to apply monitor provider_options")` so it's audible. The rule is documented in the `perf()` docstring. +- **#2 QNNProfiler merge** — the critic framed this as "losing value." 
It's actually a **semantic change** from "profile synthetic inputs" to "profile YOUR workload" — arguably strictly better. (User agreed in iteration 11.) + +### Deferred + +- **Multiple monitors** — YAGNI today; flag as future extensibility. The `monitor=` singular signature can later become `monitors=[...]`. + +## Revised responsibilities list (4 mandatory + 2 optional) + +**Mandatory**: + +1. Observe inference lifecycle (`__enter__` / `__exit__`, guarded against double-entry) +2. Advertise availability (`is_available()` classmethod) +3. Expose results (`to_dict()` conforming to schema) +4. Encode vendor-specific parsing (CSV, xrt-smi, QHAS) + +**Optional** (defaults on base ABC): + +5. Contribute session config (`get_provider_options()` — default `{}`) +6. Declare teardown requirement (`requires_session_teardown` — default `False`) + +**Design constraints** (not responsibilities): + +- Stay stateless between uses +- Stay importable without side effects +- `get_provider_options()` idempotent +- Do not suppress exceptions in `__exit__` + +## Outcome + +Design tightened. Inconsistencies resolved. One remaining question (whether to keep QNNProfiler as a thin helper) answered in iteration 11. diff --git a/docs/design/session/monitor/iterations/11.md b/docs/design/session/monitor/iterations/11.md new file mode 100644 index 000000000..e30a3ab24 --- /dev/null +++ b/docs/design/session/monitor/iterations/11.md @@ -0,0 +1,84 @@ +# Iteration 11 — Delete QNNProfiler entirely, no replacement + +## Entry point + +In iteration 10, to address the critic's "use case lost" concern (standalone profiling without a benchmark), proposed keeping a thin `QNNProfiler.run()` helper on top of `WinMLSession + QNNMonitor`. User pushed back: + +> *"I don't get this part, why keep QNNProfiler?"* + +## Examining today's QNNProfiler.run() + +10 steps in `profiler.py:108-150`. After the refactor: + +| Step | What | After refactor, lives in | +|------|------|--------------------------| +| 1. 
Build SessionOptions | disable CPU fallback, EPContext | `WinMLSession.compile()` | +| 2. Build provider_options | profiling_level, backend_path | `QNNMonitor.get_provider_options()` | +| 3. Change CWD | schematic.bin output | `QNNMonitor.__enter__` | +| 4. Create ort.InferenceSession | via `add_provider_for_devices` | `WinMLSession.compile()` | +| 5. Generate dummy inputs | random tensors from session I/O | **??? — not QNN-specific** | +| 6. Run inference loop | warmup + measured | **User-owned** | +| 7. Teardown session | flush CSV | `session.perf()` exit | +| 8. Parse CSV | operator cycles | `QNNMonitor.__exit__` | +| 9. Run QHAS viewer | detail mode | `QNNMonitor.__exit__` | +| 10. Return `OpTraceResult` | structured data | `QNNMonitor.to_dict()` | + +### The only homeless piece: step 5 — dummy input generation + +`QNNProfiler._generate_inputs()` (profiler.py:191-199) is 8 lines of numpy reflecting on session inputs. **It has nothing to do with QNN.** It's a generic "make random tensors matching the session's I/O schema" utility. + +## The honest answer: delete `QNNProfiler` entirely + +If step 5 is extracted to a generic utility (where it belongs), then the standalone profiling use case becomes an 8-line idiom: + +```python +from winml.modelkit.session import WinMLSession, QNNMonitor +from winml.modelkit.session.inputs import generate_dummy_inputs # generic utility + +session = WinMLSession("model.onnx", device="npu") +inputs = generate_dummy_inputs(session.io_config) + +with session.perf(monitor=QNNMonitor(level="basic")) as ctx: + for _ in range(10): + session.run(inputs) + +print(ctx.monitor.to_dict()) +``` + +**No `QNNProfiler` class. No thin-helper file. Every line is a composable primitive.** + +### Why this is better than keeping a helper + +Bespoke classes with a single `.run()` method that does 10 things are convenient for exactly one use case and awkward for everything adjacent. 
Clean primitives compose into that use case AND any others: profile against specific inputs, inside a training loop, A/B two models, etc. + +The 8-line "facade" lives in the caller, so the caller controls it. This is **good primitives > bespoke facades**. + +## Final migration + +### Delete + +- `optracing/qnn/profiler.py` (QNNProfiler) +- `optracing/base.py` (OpTracer) +- `optracing/registry.py` + +### Move or keep + +- `optracing/qnn/csv_parser.py` → `session/monitor/qnn/csv_parser.py` +- `optracing/qnn/qhas_parser.py` → `session/monitor/qnn/qhas_parser.py` +- `optracing/qnn/viewer.py` → `session/monitor/qnn/viewer.py` +- `optracing/result.py` (`OpTraceResult`, `OperatorMetrics`) → `session/monitor/op_metrics.py` +- `optracing/report.py` (display, JSON writer) → `session/monitor/report.py` + +### Extract + +- `generate_dummy_inputs(io_config) -> dict[str, np.ndarray]` → `session/inputs.py` (new). Reused by `perf.py`'s existing benchmark flow. + +### Simplify + +- `perf.py:1334-1386` (op-tracing block) collapses into a single `monitor=QNNMonitor(level=op_tracing)` argument on the existing `session.perf()` call. + +## Outcome + +Delete `QNNProfiler` entirely. No thin helper. The 8-line idiom using clean primitives handles every known use case (including the "standalone profiling for CI regression checks" case that motivated keeping a helper). + +This closes the brainstorming. Ready for consolidation into `1_req.md` and `2_coreloop.md`. 
diff --git a/docs/design/static_analyzer/analyzer-perf-investigation.md b/docs/design/static_analyzer/analyzer-perf-investigation.md new file mode 100644 index 000000000..83e659db0 --- /dev/null +++ b/docs/design/static_analyzer/analyzer-perf-investigation.md @@ -0,0 +1,64 @@ +# Analyzer Performance Investigation + +**Date**: 2026-03-23 +**Branch**: `mvp/analyzer` + +## Problem + +bert-base-uncased (442 nodes) takes 2x longer to analyze than DETR (724 nodes): + +| Model | Nodes | QNN | All 3 EPs | +|-------|-------|-----|-----------| +| bert | 442 | 21s | 44s | +| DETR | 724 | 8.5s | 23s | + +## Root Cause + +**`make_hashable()` (defined in `utils/model_utils.py`, called from `_sanitize_df` in `runtime_checker_query.py`) is called 29 million times**, consuming 39s (75% of total time). + +### Profile (bert, QNN, run_unknown_op=False) + +``` +28,982,211 calls 20.7s make_hashable() — recursive tuple conversion +87,399,578 calls 10.5s isinstance() — called inside make_hashable + 288,946 calls 0.2s from pandas.apply — _sanitize_df table loading + 9,380 calls 0.01s from dictcomp line 481 — per-node conditions +``` + +### Call Chain + +1. `_sanitize_df()` runs `df[col].apply(make_hashable)` on every column of every operator's rule table +2. Rule tables are loaded from 627 MB JSON files (compressed to 9 MB in zip) +3. Tables are loaded **per EP** — 3 EPs × same data = 3x cost +4. `make_hashable` recurses into deeply nested list/dict structures in rule data + +### Why bert is slower than DETR + +Not about node count — about which ops are used: +- bert has `Gather`, `Where`, `Erf`, `Cast` — these have larger/more complex rule tables +- DETR has `Conv`, `Relu` — simpler rules, faster lookup + +## Attempted Fix + +Skipping `make_hashable(arr)` for weight initializers (line 406) — **no improvement** because the 29M calls come from `_sanitize_df` on rule tables, not from weight data. The initializer fix saved only 9,380 calls out of 29M. 
+ +## Proposed Fixes (separate PR) + +### Option A: Cache tables across EPs (quick win) +Same `LazyDomainTables` data can be shared when EPs use the same opset tables. Currently each `RuntimeCheckerQuery` creates new instances. + +### Option B: Pre-sanitize at build time (permanent fix) +Run `_sanitize_df` once during rule generation (offline), save hashable tables to disk. Runtime loads pre-sanitized data — no `make_hashable` at query time. + +### Option C: Skip unnecessary columns (medium effort) +Only sanitize columns that appear in the query conditions. Most cells are already hashable (strings, ints, bools). Only list/dict cells need conversion. + +### Option D: Replace pandas with dict lookup (architectural) +The DataFrame + `query_table_exact_match` pattern is slow. A nested dict keyed by condition values would be O(1) lookup instead of O(n) DataFrame scan. + +## Files + +- `modelkit/static_analyzer/core/runtime_checker_query.py:71-79` — `_sanitize_df` +- `modelkit/static_analyzer/core/runtime_checker_query.py:82-103` — `LazyDomainTables` +- `modelkit/static_analyzer/utils/model_utils.py:205-222` — `make_hashable` +- `modelkit/static_analyzer/rules/runtime_check_rules/*.zip` — 627 MB JSON rule tables diff --git a/docs/design/static_analyzer/console-output-redesign.md b/docs/design/static_analyzer/console-output-redesign.md new file mode 100644 index 000000000..b4888e389 --- /dev/null +++ b/docs/design/static_analyzer/console-output-redesign.md @@ -0,0 +1,356 @@ +> Note: This design doc was written before implementation. Some details (file names, column labels) may differ from the final implementation in modelkit/commands/analyze.py. + +# Static Analyzer Console Output Redesign + +> Date: 2026-03-18 | Branch: `mvp/analyzer` +> Status: Draft +> Mockup: `docs/design/static_analyzer/console_mockup.py` + +--- + +## 1. Problem Statement + +The static analyzer's console output has several limitations: + +1. 
**No real-time progress** — analysis can take 30+ seconds with no visual feedback +2. **Per-instance data lost** — `RuntimeChecker` checks each node individually, but + `OutputAggregator` collapses results to per-type flat lists, losing instance counts +3. **Verbose flag is dead code** — `console_writer.py` accepts `verbose` but never uses it +4. **Default log level hides progress** — WARNING (30) suppresses INFO progress messages +5. **NullHandler missing** — library consumers get "no handler found" warnings + +## 2. Scope + +### In Scope + +- Logging infrastructure fixes (default level, NullHandler, verbosity plumbing) +- New per-instance classification field on `EPSupport` +- Callback API on `analyze()` for streaming per-node results +- New stacked bar console visualization with `rich.live.Live` +- CLI integration wiring callback to Rich Live display + +### Out of Scope + +- Changes to `RuntimeChecker` core logic (already per-node) +- Changes to JSON output format (backward compat) +- New CLI flags beyond existing `-v`/`-q` +- Pattern detection or information engine changes + +## 3. Design + +### 3.1 Logging Infrastructure + +#### 3.1.1 Default Level: WARNING -> INFO + +**File**: `modelkit/utils/logging.py` + +Change the base level so progress messages show by default: + +``` +-q → ERROR (40) errors only (quiet / scripting) +(default)→ INFO (20) progress messages visible +-v → DEBUG (10) developer tracing +``` + +Formula change: `max(DEBUG, INFO - verbosity * 10)`. Quiet override stays `ERROR`. + +Since `-v` and `-vv` both clamp to DEBUG with this formula, we effectively have +3 useful levels (ERROR, INFO, DEBUG). This is sufficient — the gap between INFO +and DEBUG can be addressed later with VERBOSE=15 if needed. 
+ +#### 3.1.2 Fix Verbosity Plumbing + +**File**: `modelkit/commands/static-analyzer.py` + +```python +# Before (broken — hits deprecated bool compat path): +configure_logging(verbose=verbose, quiet=quiet) + +# After (uses int verbosity correctly): +configure_logging(verbosity=verbose, quiet=quiet) +``` + +Same fix needed in `compile.py` and `quantize.py`. Note: these commands currently use +`--verbose` as a `bool` flag (`is_flag=True`), not a count. To get the full verbosity +range, their `--verbose` option must also be changed to `count=True` to match +`static-analyzer.py`. Until then, `verbosity=True` coerces to `verbosity=1` (DEBUG under the new formula), +which is acceptable as a first step. + +#### 3.1.3 NullHandler (Done) + +Already added to `modelkit/__init__.py:31`. All child loggers covered. + +#### 3.1.4 Third-Party Logger Suppression + +At DEBUG level, suppress noisy third-party loggers: + +```python +if log_level <= logging.DEBUG: + for name in ("onnx", "onnxruntime", "transformers", "urllib3"): + logging.getLogger(name).setLevel(logging.WARNING) +``` + +### 3.2 Data Model — Per-Instance Counts + +**File**: `modelkit/static_analyzer/models/output.py` + +Add new field to `EPSupport`: + +```python +class EPSupport(BaseModel): + # ... existing fields ... + + # NEW: per-instance classification counts + instance_counts: dict[str, dict[str, int]] = Field( + default_factory=dict, + description=( + "Per-operator-type instance counts by support level. " + "e.g., {'Conv': {'white': 53}, 'Add': {'white': 12, 'gray': 5, 'black': 1}}" + ), + ) +``` + +Uses string keys (not SupportLevel enum) for JSON serialization simplicity. + +**File**: `modelkit/static_analyzer/core/output_aggregator.py` + +In `build_ep_support()`, count per-instance results. + +Operator `pattern_id` format is always `OP/{domain}/{op_name}` (three segments, +e.g., `OP/ai.onnx/Conv`). 
Extract display name via a shared helper to avoid +duplicating the parse logic: + +```python +def _display_name(pattern_id: str) -> str: + """Extract operator display name from pattern_id ('OP/ai.onnx/Conv' -> 'Conv').""" + return pattern_id.split("/")[-1] + +instance_counts: dict[str, dict[str, int]] = {} +for pattern_runtime in check_results: + display_name = _display_name(pattern_runtime.pattern_id) + level = pattern_runtime.result.classification.value # e.g., "white" + + if display_name not in instance_counts: + instance_counts[display_name] = {} + instance_counts[display_name][level] = ( + instance_counts[display_name].get(level, 0) + 1 + ) +``` + +Existing `classification` field preserved unchanged for backward compat. + +**Scope limitation**: `instance_counts` covers operator-level nodes only, not subgraph +patterns. Subgraph results (`subgraph_runtime_check_result`) are passed to the +information engine but not included in `check_results` flowing to `aggregate()`. +This is intentional — subgraph patterns are displayed in the information/actions +section, not the operator bar chart. + +### 3.3 Callback API for Streaming Results + +Two callbacks provide lifecycle hooks for the analysis pipeline. All are +optional (`None` by default) — existing callers are unaffected. 
+ +```python +def analyze( + self, + model_path: str, + ep: str | None = None, + ..., + on_node_result=None, # (PatternRuntime) -> None + on_ep_start=None, # (ep_name: str, operator_counts: dict[str, int]) -> None +) -> AnalysisResult: +``` + +#### Callback signatures and lifecycle + +``` +analyze_from_proto() + │ + ├─ Step 1: PatternExtractor.summary() + │ → extracts operator_counts: {"Conv": 53, "Relu": 49, ...} + │ → this is MODEL-LEVEL, same for all EPs + │ + ├─ Step 2: For each EP: + │ ├─ on_ep_start(ep_name, operator_counts) ← EP begins + │ │ The command uses operator_counts to build pending rows + │ │ (dim table with all ops listed, ░░░ placeholder bars) + │ │ + │ ├─ RuntimeChecker.op_support() loop: + │ │ for node in graph.node: + │ │ result = query.run_for_node(node) + │ │ on_node_result(result) ← per-node result + │ │ The command updates the row for this op type + │ │ + │ └─ (information engine runs, no callbacks) + │ + └─ Step 3: OutputAggregator → return AnalysisResult +``` + +#### Expected Live display behavior + +1. **on_ep_start fires**: Table appears with ALL op types as dim/pending rows + (we know the ops from `operator_counts`). Each row shows `░░░` bars. +2. **on_node_result fires per-node**: As each node is checked, its op type row + accumulates counts. When the row's count reaches the total for that type, + the row transitions from dim to colored (icon + S/P/U + stacked bar). +3. **All nodes checked**: Table shows "Complete" with all rows filled in. +4. **Next EP**: Reset counts, show new pending table, repeat. + +#### Threading through the API + +**`analyzer.py`**: `analyze()` → `analyze_from_proto()` → forwards both callbacks. + +**`analyze_from_proto()`**: Calls `on_ep_start(current_ep, operator_counts)` before +each `RuntimeChecker.summary()` call, where `operator_counts` comes from +`extraction_result["summary"].operator_counts`. + +**`runtime_checker.py`**: `summary()` → `op_support()` → invokes `on_node_result` +per-node. 
tqdm is disabled when `on_node_result` is provided. + +#### Design constraints +- Callbacks are `None` by default — zero impact on existing callers +- Callbacks receive plain data (no UI dependency) +- Analyzer API has no Rich dependency — stays a pure library +- `on_node_result` replaces tqdm's role for progress visualization +- `operator_counts` is passed on every `on_ep_start` call (same dict, no cost) + +### 3.4 Console Writer — Stacked Bar Visualization + +**File**: `modelkit/static_analyzer/console_writer.py` + +New visualization replacing the current operator table + classification sections. + +#### Layout + +``` + 📊 ONNX Static Analysis — QNNExecutionProvider ✅ Complete + Op Type Analyze + 🟢 Conv (53) 53/0/0 ████████████████████████████████████████ + 🟢 Relu (53) 53/0/0 ████████████████████████████████████████ + 🟡 MatMul (25) 20/5/0 ███████████████████ + 🔴 Add (18) 12/5/1 ██████████████ + 🔵 Reshape (10) 8/0/0/2 ████████ + 🟡 Erf (8) 0/8/0 ██████ + 🔴 Resize (3) 0/0/3 ██ + TOTAL (231) 203/19/4/5 ████████████████████████████████████████ +``` + +#### Column Specification + +| Column | Content | Width | +|--------|---------|-------| +| Op Type | `{icon} {op_name} ({total})` | 28 chars | +| Analyze | `W/G/B[/U]` colored counts | 14 chars | +| (untitled) | Stacked bar, variable width | Proportional to count | + +#### Icon Semantics (Worst-Case Indicator) + +| Icon | Condition | Meaning | +|------|-----------|---------| +| 🟢 | All instances WHITE | Fully supported | +| 🟡 | Any GRAY (no BLACK) | Partial support | +| 🔴 | Any BLACK | Unsupported instances | +| 🔵 | Any UNKNOWN (no GRAY/BLACK) | Unknown support | + +#### Bar Rendering + +- Width proportional to `op_total / max_op_total * MAX_BAR_WIDTH` +- Segments colored: green (white), yellow (gray), red (black), dim gray (unknown) +- Each segment gets at least 1 char if count > 0. 
To guarantee this, the total + bar width must be `max(bar_width, num_nonzero_segments)` so no segment is + clamped to 0 by the proportional width calculation. +- Built with `rich.text.Text` per-segment styling + +#### Analyze Column Format + +`W/G/B` with colored digits. Fourth value `/U` appended only when unknown > 0. + +``` +53/0/0 → green "53", dim "0", dim "0" +12/5/1 → green "12", yellow "5", red "1" +8/0/0/2 → green "8", dim "0", dim "0", gray "2" +``` + +### 3.5 CLI Integration + +**File**: `modelkit/commands/static-analyzer.py` + +```python +from rich.live import Live + +# Build initial empty table +console = Console(stderr=True) + +# Wire callback to Live display +counts = {} +# build_analysis_table must handle empty data gracefully (no crash on max() of empty seq) +with Live(build_analysis_table(counts), console=console, refresh_per_second=4) as live: + def on_result(pattern_runtime): + # Accumulate per-instance counts + op = pattern_runtime.pattern_id.split("/")[-1] + level = pattern_runtime.result.classification.value + counts.setdefault(op, {}).setdefault(level, 0) + counts[op][level] += 1 + live.update(build_analysis_table(counts)) + + result = analyzer.analyze( + model_path=model, + ep=ep_normalized, + on_node_result=on_result, + ... + ) + +# After Live exits: show remaining sections +# (header, pattern summary, information items, footer) +display_post_analysis(result.output) +``` + +**Quiet mode** (`-q`): Skip Live display entirely, pass `on_node_result=None`. + +### 3.6 Separation of Concerns + +``` +┌─────────────────────────────────┐ +│ analyzer.py (LIBRARY) │ +│ - logging.getLogger(__name__) │ +│ - callback(PatternRuntime) │ +│ - No Rich, no UI │ +└──────────┬──────────────────────┘ + │ callback with plain data +┌──────────▼──────────────────────┐ +│ static-analyzer.py (command) │ +│ - configure_logging() │ +│ - Rich Live display │ +│ - Wires callback to UI │ +└─────────────────────────────────┘ +``` + +## 4. 
Testing Strategy + +- **Unit tests**: `build_stacked_bar()`, `worst_level_icon()`, `build_analyzed_text()` — pure functions, no Rich Live needed +- **Data model tests**: Verify `instance_counts` populated correctly by OutputAggregator +- **Callback tests**: Mock callback, verify it receives correct PatternRuntime per node +- **Integration test**: Full `analyze()` with callback, verify counts match final result +- **Console writer**: Capture Rich output to string buffer, verify table structure + +## 5. Migration / Backward Compatibility + +- Existing `EPSupport.classification` field unchanged — JSON output stays the same +- `instance_counts` defaults to empty dict — old serialized data still valid +- `on_node_result=None` and `on_ep_start=None` defaults — all existing callers work without changes +- Default log level change (WARNING → INFO) affects all commands — intentional, aligns with pip convention + +## 6. Files Changed + +| File | Change | +|------|--------| +| `modelkit/utils/logging.py` | Default level WARNING→INFO, third-party suppression | +| `modelkit/commands/static-analyzer.py` | Fix verbosity plumbing, Rich Live integration | +| `modelkit/commands/compile.py` | Fix verbosity plumbing (`verbosity=verbose`) | +| `modelkit/commands/quantize.py` | Fix verbosity plumbing (`verbosity=verbose`) | +| `modelkit/static_analyzer/models/output.py` | Add `instance_counts` field to `EPSupport` | +| `modelkit/static_analyzer/core/output_aggregator.py` | Populate `instance_counts` | +| `modelkit/static_analyzer/analyzer.py` | Add `on_node_result` and `on_ep_start` callback parameters | +| `modelkit/static_analyzer/core/runtime_checker.py` | Invoke callback per-node | +| `modelkit/static_analyzer/console_writer.py` | New stacked bar table, remove old sections | +| `modelkit/__init__.py` | NullHandler (already done, no further changes needed) | diff --git a/docs/design/static_analyzer/console_mockup.py b/docs/design/static_analyzer/console_mockup.py new file mode 100644 index 000000000..9d1387911 --- /dev/null 
+++ b/docs/design/static_analyzer/console_mockup.py @@ -0,0 +1,467 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +# ruff: noqa: D103 + +"""Console writer mockup — full analyze command output. + +Demonstrates: +1. Per-EP stacked bar tables with progressive animation +2. Per-EP pattern support (detected globally, checked per-EP) +3. Overall analysis summary with detailed op/pattern listings + +Run: uv run python docs/design/static_analyzer/console_mockup.py + +Data Structure Convention (shared with modelkit/commands/analyze.py) +==================================================================== + +All operator and pattern data uses SupportLevel enum value strings as keys: + + Operator instance counts: + dict[str, dict[str, int]] + e.g., {"Conv": {"white": 53, "gray": 0, "black": 0, "unknown": 0}} + + Keys: + "white" → Supported (green, 🟢) + "gray" → Partial (yellow, 🟡) + "black" → Unsupported (red, 🔴) + "unknown" → Unknown (dim, 🔵) + + Pattern support: + dict[str, dict] + e.g., {"SUBGRAPH/GELU_Erf": {"count": 8, "status": "gray"}} + status uses the same SupportLevel value strings. + + Model-level operator counts: + dict[str, int] + e.g., {"Conv": 53, "Relu": 49, ...} + Keys are plain op names (not prefixed). 
+""" + +from __future__ import annotations + +import time + +from rich.console import Console +from rich.live import Live +from rich.table import Table +from rich.text import Text + + +# ── Shared constants (same as modelkit/commands/analyze.py) ─────────────── + +COLORS = { + "white": "green", + "gray": "yellow", + "black": "red", + "unknown": "bright_black", +} + +STATUS_ICONS = {"white": "🟢", "gray": "🟡", "black": "🔴", "unknown": "🔵"} + +PATTERN_STATUS_LABELS = { + "white": "supported", + "gray": "partial", + "black": "unsupported", + "unknown": "unknown", +} + +MAX_BAR_WIDTH = 40 + + +# ── Fake data ───────────────────────────────────────────────────────────── + +# Model-level operator counts (same for all EPs) +ALL_OPS: dict[str, int] = { + "Conv": 53, + "Relu": 53, + "BatchNormalization": 45, + "MatMul": 25, + "Add": 18, + "Reshape": 10, + "Transpose": 6, + "Erf": 8, + "Resize": 3, + "Softmax": 4, + "Gather": 3, + "LayerNorm": 3, +} + +# Per-EP operator instance counts +EP_DATA: dict[str, dict[str, dict[str, int]]] = { + "QNNExecutionProvider": { + "Conv": {"white": 53, "gray": 0, "black": 0, "unknown": 0}, + "Relu": {"white": 53, "gray": 0, "black": 0, "unknown": 0}, + "BatchNormalization": {"white": 45, "gray": 0, "black": 0, "unknown": 0}, + "MatMul": {"white": 20, "gray": 5, "black": 0, "unknown": 0}, + "Add": {"white": 12, "gray": 5, "black": 1, "unknown": 0}, + "Reshape": {"white": 8, "gray": 0, "black": 0, "unknown": 2}, + "Transpose": {"white": 6, "gray": 0, "black": 0, "unknown": 0}, + "Erf": {"white": 0, "gray": 8, "black": 0, "unknown": 0}, + "Resize": {"white": 0, "gray": 0, "black": 3, "unknown": 0}, + "Softmax": {"white": 4, "gray": 0, "black": 0, "unknown": 0}, + "Gather": {"white": 0, "gray": 0, "black": 0, "unknown": 3}, + "LayerNorm": {"white": 2, "gray": 1, "black": 0, "unknown": 0}, + }, + "OpenVINOExecutionProvider": { + "Conv": {"white": 53, "gray": 0, "black": 0, "unknown": 0}, + "Relu": {"white": 53, "gray": 0, "black": 0, "unknown": 
0}, + "BatchNormalization": {"white": 45, "gray": 0, "black": 0, "unknown": 0}, + "MatMul": {"white": 25, "gray": 0, "black": 0, "unknown": 0}, + "Add": {"white": 18, "gray": 0, "black": 0, "unknown": 0}, + "Reshape": {"white": 10, "gray": 0, "black": 0, "unknown": 0}, + "Transpose": {"white": 6, "gray": 0, "black": 0, "unknown": 0}, + "Erf": {"white": 8, "gray": 0, "black": 0, "unknown": 0}, + "Resize": {"white": 3, "gray": 0, "black": 0, "unknown": 0}, + "Softmax": {"white": 4, "gray": 0, "black": 0, "unknown": 0}, + "Gather": {"white": 3, "gray": 0, "black": 0, "unknown": 0}, + "LayerNorm": {"white": 3, "gray": 0, "black": 0, "unknown": 0}, + }, + "VitisAIExecutionProvider": { + "Conv": {"white": 0, "gray": 0, "black": 0, "unknown": 53}, + "Relu": {"white": 0, "gray": 0, "black": 0, "unknown": 53}, + "BatchNormalization": {"white": 0, "gray": 0, "black": 0, "unknown": 45}, + "MatMul": {"white": 0, "gray": 0, "black": 0, "unknown": 25}, + "Add": {"white": 0, "gray": 0, "black": 0, "unknown": 18}, + "Reshape": {"white": 0, "gray": 0, "black": 0, "unknown": 10}, + "Transpose": {"white": 0, "gray": 0, "black": 0, "unknown": 6}, + "Erf": {"white": 0, "gray": 0, "black": 0, "unknown": 8}, + "Resize": {"white": 0, "gray": 0, "black": 0, "unknown": 3}, + "Softmax": {"white": 0, "gray": 0, "black": 0, "unknown": 4}, + "Gather": {"white": 0, "gray": 0, "black": 0, "unknown": 3}, + "LayerNorm": {"white": 0, "gray": 0, "black": 0, "unknown": 3}, + }, +} + +# Per-EP pattern support +EP_PATTERNS: dict[str, dict[str, dict]] = { + "QNNExecutionProvider": { + "SUBGRAPH/GELU_Erf": {"count": 8, "status": "gray"}, + "SUBGRAPH/LayerNorm": {"count": 4, "status": "white"}, + "SUBGRAPH/Attention": {"count": 2, "status": "white"}, + }, + "OpenVINOExecutionProvider": { + "SUBGRAPH/GELU_Erf": {"count": 8, "status": "white"}, + "SUBGRAPH/LayerNorm": {"count": 4, "status": "white"}, + "SUBGRAPH/Attention": {"count": 2, "status": "white"}, + }, + "VitisAIExecutionProvider": { + 
"SUBGRAPH/GELU_Erf": {"count": 8, "status": "unknown"}, + "SUBGRAPH/LayerNorm": {"count": 4, "status": "unknown"}, + "SUBGRAPH/Attention": {"count": 2, "status": "unknown"}, + }, +} + + +# ── Rendering helpers ───────────────────────────────────────────────────── + + +def build_stacked_bar(counts: dict[str, int], max_count: int) -> Text: + total = sum(counts.values()) + if total == 0: + return Text() + bar_width = max(1, round(total / max_count * MAX_BAR_WIDTH)) + nonzero = sum(1 for v in counts.values() if v > 0) + bar_width = max(bar_width, nonzero) + bar = Text() + chars_used = 0 + for level in ("white", "gray", "black", "unknown"): + count = counts.get(level, 0) + if count == 0: + continue + width = max(1, round(count / total * bar_width)) + width = min(width, bar_width - chars_used) + bar.append("█" * width, style=COLORS[level]) + chars_used += width + return bar + + +def worst_level_icon(counts: dict[str, int]) -> str: + if counts.get("black", 0) > 0: + return "🔴" + if counts.get("gray", 0) > 0: + return "🟡" + if counts.get("unknown", 0) > 0: + return "🔵" + return "🟢" + + +def build_spu_text(counts: dict[str, int]) -> Text: + w = counts.get("white", 0) + g = counts.get("gray", 0) + b = counts.get("black", 0) + u = counts.get("unknown", 0) + text = Text() + text.append(str(w), style="bold green") + text.append("/", style="dim") + text.append(str(g), style="bold yellow" if g > 0 else "dim") + text.append("/", style="dim") + text.append(str(b), style="bold red" if b > 0 else "dim") + if u > 0: + text.append("/", style="dim") + text.append(str(u), style="bold bright_black") + return text + + +def build_table( + data: dict[str, dict[str, int]], + ep_name: str = "", + complete: bool = False, + all_ops: dict[str, int] | None = None, +) -> Table: + """Build per-EP analysis table. + + Shows incremental progress: ops with data show colored bars (partial or + complete), ops without data show dim pending rows with placeholder bars. 
+ """ + if all_ops: + display_order = sorted(all_ops, key=lambda x: all_ops[x], reverse=True) + else: + display_order = sorted(data, key=lambda x: sum(data[x].values()), reverse=True) + + # Stable max_count anchored to all_ops (no shifting during animation) + if all_ops: + max_count = max(all_ops.values()) + else: + vals = [data[op] for op in display_order if data.get(op)] + max_count = max((sum(c.values()) for c in vals), default=1) + + title = "📊 ONNX Static Analysis" + if ep_name: + title += f" — [bold cyan]{ep_name}[/bold cyan]" + if complete: + title += " [bold green]✅ Complete[/bold green]" + + table = Table( + title=title, + show_header=True, + header_style="bold", + box=None, + padding=(0, 1), + expand=False, + ) + table.add_column("Op Type", width=28, no_wrap=True) + table.add_column("S/P/U", width=14, no_wrap=True) + table.add_column("", no_wrap=True) + + agg: dict[str, int] = {"white": 0, "gray": 0, "black": 0, "unknown": 0} + + for op_type in display_order: + total = all_ops.get(op_type, 0) if all_ops else sum(data.get(op_type, {}).values()) + counts = data.get(op_type) + + if not counts: + # Pending — no data yet + bar_width = max(1, round(total / max_count * MAX_BAR_WIDTH)) + table.add_row( + Text(f" {op_type} ({total})", style="dim"), + Text("...", style="dim"), + Text("░" * bar_width, style="dim"), + ) + else: + # Has data — show progress + analyzed = sum(counts.values()) + for level in agg: + agg[level] += counts.get(level, 0) + + icon = worst_level_icon(counts) + op_label = Text() + op_label.append(f"{icon} ") + op_label.append(op_type, style="cyan") + if analyzed < total: + op_label.append(f" ({analyzed}/{total})", style="dim") + else: + op_label.append(f" ({total})", style="dim") + + # Colored portion + dim remainder + bar = build_stacked_bar(counts, max_count) + remaining = total - analyzed + if remaining > 0: + remaining_width = max(1, round(remaining / max_count * MAX_BAR_WIDTH)) + bar.append("░" * remaining_width, style="dim") + + 
table.add_row(op_label, build_spu_text(counts), bar) + + # TOTAL row + table.add_section() + total_ops = sum(all_ops.values()) if all_ops else sum(agg.values()) + analyzed_count = sum(agg.values()) + total_label = Text() + total_label.append("TOTAL", style="bold") + if analyzed_count < total_ops: + total_label.append(f" ({analyzed_count}/{total_ops})", style="dim") + else: + total_label.append(f" ({total_ops})", style="dim") + + # TOTAL bar: colored portion + dim remainder (same as per-op) + total_bar = build_stacked_bar(agg, max(total_ops, 1)) + total_remaining = total_ops - analyzed_count + if total_remaining > 0: + total_remaining_width = max(1, round(total_remaining / max(total_ops, 1) * MAX_BAR_WIDTH)) + total_bar.append("░" * total_remaining_width, style="dim") + + table.add_row(total_label, build_spu_text(agg), total_bar) + + return table + + +# ── Demo ────────────────────────────────────────────────────────────────── + + +def demo_full() -> None: + console = Console(width=95) + + # ── Model Info Header ── + console.print() + console.print("═" * 80) + console.print("📊 [bold]OP CHECK[/bold]") + console.print("═" * 80) + console.print(" 📦 Model: [bold cyan]convnext-tiny-224.onnx[/bold cyan]") + console.print(" 🔧 Opset: [green]17[/green] Producer: [green]pytorch v2.1.0[/green]") + console.print( + f" 📋 Operators: [cyan]{sum(ALL_OPS.values())}[/cyan] total, " + f"[cyan]{len(ALL_OPS)}[/cyan] unique types" + ) + console.print() + + # ── Per-EP tables with Live animation ── + for ep_idx, (ep_name, op_data) in enumerate(EP_DATA.items()): + sorted_op_names = sorted(ALL_OPS, key=lambda x: ALL_OPS[x], reverse=True) + + ep_num = ep_idx + 1 + total_eps = len(EP_DATA) + console.print("─" * 80) + console.print(f"💻 [bold]EP {ep_num}/{total_eps}[/bold]: [bold cyan]{ep_name}[/bold cyan]") + console.print("─" * 80) + + # Simulate incremental per-node analysis + results: dict[str, dict[str, int]] = {} + + with Live( + build_table(results, ep_name=ep_name, all_ops=ALL_OPS), + 
console=console, + refresh_per_second=8, + ) as live: + for op_type in sorted_op_names: + results[op_type] = op_data[op_type] + live.update(build_table(results, ep_name=ep_name, all_ops=ALL_OPS)) + time.sleep(0.15) + + # Final complete + live.update( + build_table( + results, + ep_name=ep_name, + all_ops=ALL_OPS, + complete=True, + ) + ) + + console.print() + + # ── Pattern Matching ── + console.print("═" * 80) + console.print("🔍 [bold]PATTERN MATCHING[/bold]") + console.print("═" * 80) + + for ep_name in EP_DATA: + patterns = EP_PATTERNS.get(ep_name, {}) + if not patterns: + continue + + # EP sub-header + console.print(f" 💻 [bold cyan]{ep_name}[/bold cyan]") + + for pat_id, pat_info in sorted(patterns.items(), key=lambda x: x[1]["count"], reverse=True): + status = pat_info["status"] + count = pat_info["count"] + icon = STATUS_ICONS.get(status, "❓") + label = PATTERN_STATUS_LABELS.get(status, "unknown") + console.print( + f" {icon} [cyan]{pat_id}[/cyan] [dim]({count} instances)[/dim] — {label}" + ) + + console.print() + + # ── Analysis Summary ── + console.print("═" * 80) + console.print("📈 [bold]ANALYSIS SUMMARY[/bold]") + console.print("═" * 80) + + for ep_name, op_data in EP_DATA.items(): + patterns = EP_PATTERNS.get(ep_name, {}) + + agg: dict[str, int] = {"white": 0, "gray": 0, "black": 0, "unknown": 0} + for counts in op_data.values(): + for level in agg: + agg[level] += counts.get(level, 0) + + icon = worst_level_icon(agg) + if agg["black"] > 0: + ep_style = "bold red" + elif agg["gray"] > 0: + ep_style = "bold yellow" + elif agg["unknown"] > 0 and agg["white"] == 0: + ep_style = "bold bright_black" + else: + ep_style = "bold green" + + spu = build_spu_text(agg) + console.print(f" {icon} [{ep_style}]{ep_name}[/{ep_style}]: ", end="") + console.print(spu) + + # List ops with issues (worst level takes priority) + black_ops = [op for op, c in op_data.items() if c.get("black", 0) > 0] + gray_ops = [ + op for op, c in op_data.items() if c.get("gray", 0) > 0 and 
c.get("black", 0) == 0 + ] + unknown_ops = [ + op + for op, c in op_data.items() + if c.get("unknown", 0) > 0 and c.get("black", 0) == 0 and c.get("gray", 0) == 0 + ] + + if black_ops: + console.print(" [red]⛔ Unsupported:[/red]") + for op in black_ops: + console.print(f" • [dim]OP/ai.onnx/{op}[/dim]") + if gray_ops: + console.print(" [yellow]⚠️ Partial:[/yellow]") + for op in gray_ops: + console.print(f" • [dim]OP/ai.onnx/{op}[/dim]") + if unknown_ops: + console.print(" [bright_black]❓ Unknown:[/bright_black]") + for op in unknown_ops: + console.print(f" • [dim]OP/ai.onnx/{op}[/dim]") + + bad_patterns = {pid: p for pid, p in patterns.items() if p["status"] != "white"} + if bad_patterns: + console.print(" [dim]Patterns:[/dim]") + for pid, p in sorted(bad_patterns.items(), key=lambda x: x[1]["count"], reverse=True): + status = p["status"] + icon_p = STATUS_ICONS.get(status, "❓") + label = PATTERN_STATUS_LABELS.get(status, "unknown") + console.print( + f" {icon_p} [dim]{pid}[/dim] ({p['count']} instances, {label})" + ) + + has_issues = black_ops or gray_ops or unknown_ops or bad_patterns + if not has_issues: + console.print(" [green]Ready to deploy[/green]") + + console.print() + + # ── Legend ── + console.print( + " [dim]S/P/U = Supported/Partial/Unsupported[/dim]" + " [green]██[/green] supported" + " [yellow]██[/yellow] partial" + " [red]██[/red] unsupported" + " [bright_black]██[/bright_black] unknown" + ) + console.print() + + +if __name__ == "__main__": + demo_full() diff --git a/docs/getting-started/04.customer-intro/.omc/state/agent-replay-866056b6-6304-4966-8249-722504474ade.jsonl b/docs/getting-started/04.customer-intro/.omc/state/agent-replay-866056b6-6304-4966-8249-722504474ade.jsonl new file mode 100644 index 000000000..40d272c1e --- /dev/null +++ b/docs/getting-started/04.customer-intro/.omc/state/agent-replay-866056b6-6304-4966-8249-722504474ade.jsonl @@ -0,0 +1,4 @@ 
+{"t":0,"agent":"a381754","agent_type":"code-reviewer","event":"agent_start","parent_mode":"none"} +{"t":0,"agent":"a381754","agent_type":"code-reviewer","event":"agent_stop","success":true,"duration_ms":63186} +{"t":0,"agent":"a1ea2ba","agent_type":"code-reviewer","event":"agent_start","parent_mode":"none"} +{"t":0,"agent":"a1ea2ba","agent_type":"code-reviewer","event":"agent_stop","success":true,"duration_ms":57041} diff --git a/docs/getting-started/04.customer-intro/.omc/state/idle-notif-cooldown.json b/docs/getting-started/04.customer-intro/.omc/state/idle-notif-cooldown.json new file mode 100644 index 000000000..da09cea22 --- /dev/null +++ b/docs/getting-started/04.customer-intro/.omc/state/idle-notif-cooldown.json @@ -0,0 +1,3 @@ +{ + "lastSentAt": "2026-04-13T07:00:46.988Z" +} diff --git a/docs/getting-started/04.customer-intro/.omc/state/last-tool-error.json b/docs/getting-started/04.customer-intro/.omc/state/last-tool-error.json new file mode 100644 index 000000000..c244c5788 --- /dev/null +++ b/docs/getting-started/04.customer-intro/.omc/state/last-tool-error.json @@ -0,0 +1,7 @@ +{ + "tool_name": "Read", + "tool_input_preview": "{\"file_path\":\"D:\\\\BYOM\\\\ModelKit_PRs\\\\mvp\\\\docs\\\\getting-started\\\\04.customer-intro\\\\bug-bash-guide.md\"}", + "error": "File content (12685 tokens) exceeds maximum allowed tokens (10000). 
Use offset and limit parameters to read specific portions of the file, or search for specific content instead of reading the whole file.", + "timestamp": "2026-04-13T06:58:06.118Z", + "retry_count": 1 +} diff --git a/docs/getting-started/04.customer-intro/.omc/state/mission-state.json b/docs/getting-started/04.customer-intro/.omc/state/mission-state.json new file mode 100644 index 000000000..b05f140ee --- /dev/null +++ b/docs/getting-started/04.customer-intro/.omc/state/mission-state.json @@ -0,0 +1,79 @@ +{ + "updatedAt": "2026-04-13T06:58:57.168Z", + "missions": [ + { + "id": "session:866056b6-6304-4966-8249-722504474ade:none", + "source": "session", + "name": "none", + "objective": "Session mission", + "createdAt": "2026-04-13T06:23:25.031Z", + "updatedAt": "2026-04-13T06:58:57.168Z", + "status": "done", + "workerCount": 2, + "taskCounts": { + "total": 2, + "pending": 0, + "blocked": 0, + "inProgress": 0, + "completed": 2, + "failed": 0 + }, + "agents": [ + { + "name": "code-reviewer:a381754", + "role": "code-reviewer", + "ownership": "a38175408c606e3a2", + "status": "done", + "currentStep": null, + "latestUpdate": "completed", + "completedSummary": null, + "updatedAt": "2026-04-13T06:24:28.217Z" + }, + { + "name": "code-reviewer:a1ea2ba", + "role": "code-reviewer", + "ownership": "a1ea2bae2c2cb6edf", + "status": "done", + "currentStep": null, + "latestUpdate": "completed", + "completedSummary": null, + "updatedAt": "2026-04-13T06:58:57.168Z" + } + ], + "timeline": [ + { + "id": "session-start:a38175408c606e3a2:2026-04-13T06:23:25.031Z", + "at": "2026-04-13T06:23:25.031Z", + "kind": "update", + "agent": "code-reviewer:a381754", + "detail": "started code-reviewer:a381754", + "sourceKey": "session-start:a38175408c606e3a2" + }, + { + "id": "session-stop:a38175408c606e3a2:2026-04-13T06:24:28.217Z", + "at": "2026-04-13T06:24:28.217Z", + "kind": "completion", + "agent": "code-reviewer:a381754", + "detail": "completed", + "sourceKey": "session-stop:a38175408c606e3a2" + 
}, + { + "id": "session-start:a1ea2bae2c2cb6edf:2026-04-13T06:58:00.127Z", + "at": "2026-04-13T06:58:00.127Z", + "kind": "update", + "agent": "code-reviewer:a1ea2ba", + "detail": "started code-reviewer:a1ea2ba", + "sourceKey": "session-start:a1ea2bae2c2cb6edf" + }, + { + "id": "session-stop:a1ea2bae2c2cb6edf:2026-04-13T06:58:57.168Z", + "at": "2026-04-13T06:58:57.168Z", + "kind": "completion", + "agent": "code-reviewer:a1ea2ba", + "detail": "completed", + "sourceKey": "session-stop:a1ea2bae2c2cb6edf" + } + ] + } + ] +} diff --git a/docs/getting-started/04.customer-intro/.omc/state/subagent-tracking.json b/docs/getting-started/04.customer-intro/.omc/state/subagent-tracking.json new file mode 100644 index 000000000..edb38fddf --- /dev/null +++ b/docs/getting-started/04.customer-intro/.omc/state/subagent-tracking.json @@ -0,0 +1,26 @@ +{ + "agents": [ + { + "agent_id": "a38175408c606e3a2", + "agent_type": "oh-my-claudecode:code-reviewer", + "started_at": "2026-04-13T06:23:25.031Z", + "parent_mode": "none", + "status": "completed", + "completed_at": "2026-04-13T06:24:28.217Z", + "duration_ms": 63186 + }, + { + "agent_id": "a1ea2bae2c2cb6edf", + "agent_type": "oh-my-claudecode:code-reviewer", + "started_at": "2026-04-13T06:58:00.127Z", + "parent_mode": "none", + "status": "completed", + "completed_at": "2026-04-13T06:58:57.168Z", + "duration_ms": 57041 + } + ], + "total_spawned": 2, + "total_completed": 2, + "total_failed": 0, + "last_updated": "2026-04-13T06:58:57.276Z" +} diff --git a/docs/getting-started/04.customer-intro/MVP.transcripts.md b/docs/getting-started/04.customer-intro/MVP.transcripts.md new file mode 100644 index 000000000..0e569c69f --- /dev/null +++ b/docs/getting-started/04.customer-intro/MVP.transcripts.md @@ -0,0 +1,201 @@ +# MVP + +## Opening + +In the next 20 minutes, I'll introduce ModelKit — a new product we've built over the past few months. + +## What is ModelKit? + +So, what is ModelKit? 
*ModelKit is a CLI toolkit to build portable, performant and high-quality models for Windows ML.* + +**Goals** + +With ModelKit, you can build a model once and run anywhere, using either built-in pipelines or composing your own from primitive commands. + +You can also drill down into model details with ModelKit, pinpoint errors, or performance bottlenecks. + +And ModelKit is AI-ready — we provide built-in skills that work with all mainstream coding agents. + +**Promises** + +ModelKit promises you an out-of-box user experience — one toolkit covers all the EPs, as well as full repeatability and traceability throughout both commands and pipelines. + +And we build quality gates into ModelKit to catch compatibility issues and suggest fixes automatically. + +## ModelKit — Command List + +Let's quickly go through the commands. Basically, we bucketize the commands into four categories. + +- The primitive commands — you can either use them individually or compose them into workflows. +- The pipeline commands that help you build and benchmark models end-to-end. +- And insight commands that enable model analysis and debugging. +- And a few utilities that support daily usage. + +Now let's see how to use ModelKit in practice + +## Background — BYOM Workflow + +Some background info, Before we jump into the demos, let me explain the workflow behind ModelKit. + +Here is a typical pipeline your model goes through. It is quite straightforward. Given a source model, the workflow will export, analyze, optimize, quantize if needed, and evaluate before shipping. + +Here, I want to highlight are three commands that serve as quality gates. + +- Analyze for portability, checking whether your model runs on the target EP +- Optimize, help improve the graph performance +- Quantize which affects the model accuracy + +These three steps define the quality of your output. And ModelKit gives you full control over each one. 
+ +## Three Ways to Build ConvNeXt with ModelKit + +I'm going to show you three ways to build models with ModelKit. And for easy comparison, three demos will all use ConvNeXT. + +- First, I'll go with primitive commands. You'll see how to craft a model step by step with ModelKit. +- Then, I'll build ConvNeXT again with the config-driven pipeline, with only two commands. +- And last, I'll show you how to quick-bench a model with ModelKit — in one command + +## Build ConvNeXt with Primitive Commands + +In this demo, I'll walk you through these primitive commands. Each one handles a single stage of the pipeline. + +#### Demo 1: Build ConvNeXT with Primitive Commands + +Let's start with ConvNeXT. First, `inspect` + +ConvNeXt is a family of CNN model inspired by Vision Transformers, introduced by Facebook in 2022. + +It adopts several design choices from Transformers, and offers high accuracy while retaining the efficiency of CNNs, therefore it is widely adopted for tasks such as image classification, detection, and segmentation. + +this tells us everything about the model. Task, model class, I/O shapes. No weights loaded, just metadata. + +`wmk inspect -m facebook/convnext-base-224` + +Now we export from PyTorch to ONNX. + +`wmk export -m facebook/convnext-base-224 -o convnext/model.onnx -v` + +Let's run the analyzer right away. It checks every operator against EPs — tells you what's supported, what's partial, what needs fixing. And it generates an optimization config automatically. + +`wmk analyze -m convnext/model.onnx --optim-config optim.json` + +We apply the optimizer with that config. The analyzer told us what to fix, the optimizer fixes it. + +`wmk optimize -m convnext/model.onnx -c optim_config.json -o convnext/model_opt.onnx` + +Now quantize — compress the optimized model to INT8. At this point, we have a portable model. It can run on any ONNX Runtime backend. 
+ +`wmk quantize -m convnext/model_opt.onnx -o convnext/model_opt_int8.onnx` + +Now let's compile for QNN — this generates device-specific binaries for the NPU. + +`wmk compile -m convnext/model_opt_int8.onnx --ep qnn -o convnext/model_compiled.onnx` + +And benchmark on NPU. Look at the latency — let's keep this number in mind. + +`wmk perf -m convnext/model_compiled.onnx --ep qnn --iterations 100` + +Now the same optimized model on CPU for comparison. See the difference? That's roughly a 25x speedup — the quantized model on NPU versus the original on CPU. Same model, same accuracy, completely different performance. + +`wmk perf -m convnext/model_opt.onnx --ep cpu --iterations 100` + +OK, let me recap. In demo one, we used primitive commands to bring ConvNeXT to Windows ML — step by step, in three phases. + +- Inspect the model. +- Build a portable ONNX through export, analyze, optimize, quantize. +- Then benchmark on device with compile, perf, and eval. + +This gives you full control — you can jump into any stage, try different settings to fix errors or tweak performance. + +## Analyze ConvNeXt for EP Compatibility + +Let me go deeper on the analyzer, because it is the key to building portable ONNX models. + +**The analyzer is made of two parts — Linter and AutoConf.** + +The linter is like ESLint, but for ONNX. As you saw, it checks operators compatibility and classifies them — green for supported, gray for partial, red for unsupported. + +AutoConf detects suboptimal patterns and generates the config for the optimizer. + +Together they form the analyze-optimize loop + +## Build ConvNeXt with Config-Driven Pipeline + +#### Demo2 + +Same model, different approach. Instead of running each command manually, let's use `config` and `build`. + +`wmk config` generates a JSON config. Let me show you what's inside. This is the config — it contains all settings for each pipeline step. Task, I/O shapes, optimization flags, quantization parameters, all auto-detected. 
You can review it, revise it, or pass it directly to the build command. + +`wmk config -m facebook/convnext-base-224 -o convnext_config.json` + +`wmk build` takes that config and runs the full pipeline. Export, analyze, optimize, quantize, compile — all in one go. + +`wmk build -c convnext_config.json -m facebook/convnext-base-224 -o convnext_build/` + +And let's benchmark the result. Same model, same quality — but two commands instead of eight. + +`wmk perf -m convnext_build/model.onnx --ep qnn --iterations 100` + +In demo two, we used config and build. Two commands instead of five. + +- Config command generates the build config — auto-detects everything. +- Build command orchestrates the full pipeline. + +Same result, repeatable and tweakable. Think CMake for models. + +## **Primitives Commands vs. Config-Driven Pipeline** + +So, when do you use which? + +Primitive commands are for flexible workflows — you can start from any stage, try different settings, fix errors, do experiments. Great for exploring and debugging, just like the coding phase in a development lifecycle + +Config-driven pipeline is for delivery — repeatable, scriptable, easy to share with your team. Same quality, but reproducible. + +Both approaches produce the same portable ONNX. It's about where you are and how much control you need. + +## Benchmark ConvNeXt in One Command + +> +> +> +> And the simplest way — one command. `wmk perf` with a model ID. It handles everything: load, export, optimize, benchmark. Live hardware monitoring included. +> +> `wmk perf -m facebook/convnext-base-224 --ep qnn --iterations 100 --monitor` +> +> Same ConvNeXT, three different approaches. Full control, automated pipeline, or one command. Pick what fits your workflow. +> + +And the third way — the easiest. Say someone hands you a production-ready model. You just want a quick smoke test — does it run, how fast is it? One command. `wmk perf` with a model ID. 
+ +Load, export, optimize, benchmark — all behind the scenes. Think of it as a sanity check in the QA process. + +## Why ModelKit? + +OK, that's all for the demos. + +Same ConvNeXT, three different approaches. Full control, automated pipeline, or one command. Pick what fits your best. + +> Three ways to build model with ModelKit +> +> - Primitive commands for development — iterate, debug, experiment. +> - Config-driven pipeline for polish and hand over. +> - And one command for QA — validate, benchmark, deliver. + +Now — if you want any of these + +- build models for Windows ML, +- quick-bench a model +- catch compatibility issues ahead of time +- troubleshoot errors or performance bottlenecks +- or you just want AI to do the heavy lifting + +**Please reach out to us for early access. Your feedback is the most valuable thing to us.** + +**Roadmap** + +And regarding the roadmap + +- ModelKit is ready for early access by the end of this month +- We'll release the public beta in Q2, with coding agent skills and AITK integration. +- After that, we'll continue bringing more into ModelKit — LLM support, MLIR, and broader device coverage. diff --git a/docs/getting-started/04.customer-intro/README.md b/docs/getting-started/04.customer-intro/README.md new file mode 100644 index 000000000..518e15403 --- /dev/null +++ b/docs/getting-started/04.customer-intro/README.md @@ -0,0 +1,432 @@ +# ModelKit + +**CLI toolkit to build portable, performant and high-quality models for Windows ML.** + +![Status](https://img.shields.io/badge/status-early%20access-blue) +![Python](https://img.shields.io/badge/python-3.10%2B-blue?logo=python&logoColor=white) +![License](https://img.shields.io/badge/license-MIT-green) + +**ModelKit** is a CLI toolkit to build **portable, performant, and high-quality** models for Windows ML. 
It covers the entire journey from pretrained model to on-device inference — export, optimization, quantization, compilation, and benchmarking — across **all execution providers**, regardless of silicon. + +--- + +## :dart: ModelKit Is Right for You If + +- [x] You want to build models that run on **any Windows device** — Qualcomm, Intel, AMD, NVIDIA, or CPU +- [x] You want to benchmark a model with **one command** — latency, throughput, and live hardware utilization +- [x] You want to catch compatibility issues **ahead of time** — unsupported ops, shape mismatches, EP gaps +- [x] You want **deep insights** into your model — I/O shapes, task mapping, operator coverage per EP +- [x] You want a **repeatable and traceable** model building process — config-driven, inspectable at every stage +- [x] You want **AI agents** to build and profile models for you — agent-ready skills for coding assistants + +--- + +## :desktop_computer: Supported Hardware + +| Execution Provider | Hardware | Status | EP Flag | Device Flag | +|:-------------------|:---------|:------:|:--------|:------------| +| **QNN** | Qualcomm NPU (Snapdragon X Elite) | :green_circle: Ready | `--ep qnn` | `--device npu` | +| **OpenVINO** | Intel NPU (Meteor Lake / Lunar Lake) | :green_circle: Ready | `--ep openvino` | `--device npu` | +| **VitisAI** | AMD NPU (Ryzen AI) | :green_circle: Ready | `--ep vitisai` | `--device npu` | +| **TensorRT** | NVIDIA discrete GPUs | :large_orange_diamond: Planned | `--ep tensorrt` | `--device gpu` | +| **MIGraphX** | AMD discrete GPUs | :large_orange_diamond: Planned | `--ep migraphx` | `--device gpu` | +| **DirectML** | Hardware-agnostic GPU backend | :large_orange_diamond: Planned | `--ep dml` | `--device gpu` | +| **CPU** | Cross-platform fallback | :white_circle: Always available | `--ep cpu` | `--device cpu` | + +> **Tip:** Use `--device auto` and ModelKit picks the best available device — NPU first, then GPU, then CPU. 
+ +--- + +## :clipboard: Prerequisites + +### Required Software + +| **Component** | **How to Get It** | +|-----------|--------------| +| **Windows 11** (x64 or ARM64) | Windows 11 24H2+ required for NPU support | +| **UV** | Install [UV](https://github.com/astral-sh/uv) | +| **Windows App SDK Runtime 1.8** | [Latest Windows App SDK downloads](https://learn.microsoft.com/en-us/windows/apps/windows-app-sdk/downloads) | +| **ModelKit** (Python wheel) | Download [winml_modelkit-0.0.1.dev1-py3-none-any.whl](https://microsoft.sharepoint-df.com/:u:/r/teams/WinPD/Shared%20Documents/Forms/Gallery.aspx?id=%2Fteams%2FWinPD%2FShared%20Documents%2FModelKit%2Fwinml%5Fmodelkit%2D0%2E0%2E1%2Edev1%2Dpy3%2Dnone%2Dany%2Ewhl&parent=%2Fteams%2FWinPD%2FShared%20Documents%2FModelKit&p=true&share=cQqnvDjbLu18QZ%5FhHSiX%2D2f3EgUCQzr1M%2DQKvecLbJEsxiAn7g) | + +### Required Hardware + +**ModelKit targets NPU.** We recommend testing on one of the following NPU devices: + +| Device | EP | Flag | +|--------|-----|------| +| Snapdragon X Elite (Qualcomm) | QNN | `--ep qnn --device npu` | +| Intel AI Boost (Meteor Lake / Lunar Lake) | OpenVINO | `--ep openvino --device npu` | +| AMD Ryzen AI (Phoenix / Hawk Point / Strix) | VitisAI | `--ep vitisai --device npu` | + +**No NPU?** Use `--device auto` — ModelKit will fall back to the best available device (GPU → CPU). Note that `winml compile` requires NPU and cannot run without one. + +### Accepted Inputs + +- **HuggingFace model ID** (e.g., `microsoft/resnet-50`) — weights are downloaded on first run +- **Local ONNX file** (e.g., `model.onnx`) — from `winml export`, `winml build`, or any ONNX you already have + +### The Golden Rule: Inspect First + +Before running any pipeline command, always verify the model is supported: + +```bash +winml inspect -m +``` + +If `inspect` prints an error or shows `Unsupported`, **skip that model**. Only models that pass inspect are valid inputs for export, analyze, build, perf, and eval. 
+ +--- + +## :package: Installation + +ModelKit requires **Python 3.10** and is distributed as a Python wheel. We recommend [uv](https://docs.astral.sh/uv/) for fast, reproducible environment setup. + +**1. Create a Python 3.10 environment** + +```bash +uv venv --python 3.10 +``` + +Activate it: + +```bash +# Windows (PowerShell) +.venv\Scripts\activate + +# Windows (Git Bash / WSL) +source .venv/Scripts/activate +``` + +**2. Install from wheel** + +```bash +uv pip install winml_modelkit-<version>-py3-none-any.whl +``` + +**3. Verify your environment** + +```bash +winml sys --list-device --list-ep +``` + +Confirm that your target device and EP appear in the output: + +- **Snapdragon X Elite** — look for `QNNExecutionProvider` +- **Intel AI Boost** — look for `OpenVINOExecutionProvider` +- **AMD Ryzen AI** — look for `VitisAIExecutionProvider` + +If no NPU is detected, you can still use ModelKit with `--device auto` for most commands. The only exception is `winml compile`, which requires an NPU device. + +--- + +## :wrench: Commands + +| Category | Commands | Purpose | +|:---------|:---------|:--------| +| **Primitives** | `inspect` `export` `optimize` `quantize` `compile` | Single-stage building blocks | +| **Pipeline** | `config` `build` `perf` `eval` `run`\* | End-to-end orchestration | +| **Insights** | `analyze` `debug`\* | Diagnostics and compatibility | +| **Utilities** | `hub` `cache` `doctor` `setting` `sys` | Catalog, cache, and environment | + +\* = coming soon + +
+Primitives — one stage at a time + +**`winml inspect`** — Discover model metadata. Prints the task, model class, input/output tensor names and shapes, and execution provider compatibility. No weights are loaded — this reads only the model configuration, making it fast and lightweight. Always run inspect first to verify a model is supported. + +**`winml export`** — Convert a source model to ONNX. Takes a Hugging Face model ID (or local checkpoint) and produces a standards-compliant ONNX file with hierarchy-preserving metadata. + +**`winml optimize`** — Fuse operators, simplify graphs, and prepare for target EPs. Takes an ONNX model and an optimization config (typically generated by `winml analyze`) and applies graph-level transformations: operator fusion, constant folding, shape inference, and EP-specific rewrites. + +**`winml quantize`** — Compress to low-bit precision. Reduces model size and inference latency by converting weights and activations from FP32 to INT8 (or other low-bit formats). After quantization, the model is portable — it can run on any ONNX Runtime backend. + +**`winml compile`** — Generate device-specific binaries. Takes a quantized ONNX model and produces EP-specific compiled artifacts (for example, QNN context binaries for Qualcomm NPU). This step locks the model to a specific device but delivers the lowest possible inference latency. + +
+ +
+Pipeline — orchestrated workflows + +**`winml config`** — Auto-detect optimal settings into a JSON config. Inspects the model and generates a complete build specification: task, I/O shapes, optimization flags, quantization parameters, and target EP settings. The config file is reviewable, editable, and version-controllable — the single source of truth for your build. + +**`winml build`** — Orchestrate the full pipeline. Takes a config file and executes every stage in sequence: export, analyze, optimize, quantize, and compile. Two commands (`config` + `build`) replace eight manual steps. + +**`winml perf`** — Benchmark latency, throughput, and hardware utilization. Runs inference on the target device and reports latency percentiles (p50, p90, p99), throughput (inferences per second), and optionally live hardware monitoring (CPU, RAM, NPU utilization) with the `--monitor` flag. Can accept a local ONNX file or a Hugging Face model ID. + +**`winml eval`** — Measure model accuracy against reference datasets. Compares the output of your optimized/quantized model against the original to quantify any accuracy loss introduced by the pipeline. + +**`winml run`** — End-to-end inference with pre/post processing. *(Coming soon.)* + +
+ +
+Insights — understand what is happening inside + +**`winml analyze`** — Lint operators, check EP compatibility, and generate optimization config. The analyzer has two components: the **Linter** (like ESLint for ONNX) checks every operator against target EPs and classifies each as supported, partial, or unsupported. **AutoConf** detects suboptimal patterns and generates the optimization config that the optimizer consumes. Together they form the analyze-optimize loop. + +**`winml debug`** — Interactive model debugging and layer-by-layer inspection. *(Coming soon.)* + +
+ +
+Utilities — catalog, cache, and environment + +**`winml hub`** — Browse the curated built-in model catalog. + +**`winml cache`** — Manage built model artifacts and pipeline outputs. View, clean, or selectively remove cached models and intermediate files. + +**`winml doctor`** — Diagnose environment issues. Checks runtimes, execution providers, and dependencies to identify configuration problems. + +**`winml setting`** — Configure ModelKit preferences. Set default EPs, output directories, and other global options. + +**`winml sys`** — System information and capability reporting. Prints detected hardware, available EPs, Python version, and installed package versions. + +
+ +--- + +## :rocket: Quick Start + +### Inspect a Model + +The fastest way to get started is to inspect a model. Let's look at ResNet-50: + +```bash +winml inspect -m microsoft/resnet-50 +``` + +This prints the model's metadata without downloading weights: + +- **Task**: `image-classification` — what the model does +- **Model class**: `ResNetForImageClassification` — the architecture +- **Input tensors**: names, data types, and shapes (e.g., `pixel_values: float32 [1, 3, 224, 224]`) +- **Output tensors**: names, data types, and shapes (e.g., `logits: float32 [1, 1000]`) + +If inspect succeeds, the model is supported and you can proceed with the rest of the pipeline. + +> **Golden rule: always inspect first.** Before running export, build, perf, or any other pipeline command, verify the model is supported with `winml inspect`. + +### Build with Primitive Commands + +This walkthrough builds **ConvNeXT** (`facebook/convnext-base-224`) step by step using primitive commands. ConvNeXT is a family of CNN models inspired by Vision Transformers, introduced by Meta in 2022 — it offers high accuracy while retaining the efficiency of CNNs. 
+ +#### Phase 1: Inspect + +```bash +winml inspect -m facebook/convnext-base-224 +``` + +#### Phase 2: Build a Portable Model + +**Export** from PyTorch to ONNX: + +```bash +winml export -m facebook/convnext-base-224 -o convnext/model.onnx -v +``` + +**Analyze** for EP compatibility: + +```bash +winml analyze -m convnext/model.onnx --optim-config optim.json +``` + +**Optimize** the graph using the analyzer's config: + +```bash +winml optimize -m convnext/model.onnx -c optim.json -o convnext/model_opt.onnx +``` + +**Quantize** to INT8: + +```bash +winml quantize -m convnext/model_opt.onnx -o convnext/model_opt_int8.onnx +``` + +#### Phase 3: Benchmark on Device + +**Compile** for NPU (generates device-specific binaries): + +```bash +winml compile -m convnext/model_opt_int8.onnx --ep qnn -o convnext/model_compiled.onnx +``` + +**Benchmark on NPU** — note the latency: + +```bash +winml perf -m convnext/model_compiled.onnx --ep qnn --iterations 100 +``` + +**Benchmark on CPU** for comparison: + +```bash +winml perf -m convnext/model_opt.onnx --ep cpu --iterations 100 +``` + +Compare the two numbers. You should see roughly a **25x speedup** — the quantized, compiled model on NPU versus the optimized, unquantized model on CPU. Same architecture, comparable accuracy (confirm with `winml eval`), completely different performance. + +### Build with Config + Build + +Same model, different approach. Instead of running each command manually, use the config-driven pipeline. Think of it like CMake: `config` generates a build plan, `build` executes it. + +**Generate the build config:** + +```bash +winml config -m facebook/convnext-base-224 -o convnext_config.json +``` + +This creates a JSON file containing all settings for every pipeline step — task, I/O shapes, optimization flags, quantization parameters — all auto-detected from the model. 
+ +**Build the model:** + +```bash +winml build -c convnext_config.json -m facebook/convnext-base-224 -o convnext_build/ +``` + +This orchestrates the full pipeline — export, analyze, optimize, quantize, compile — all in one go. Same result as the manual steps above, but in two commands. + +**Benchmark the result:** + +```bash +winml perf -m convnext_build/model.onnx --ep qnn --iterations 100 +``` + +The config file is the single source of truth for your build. Version-control it, share it with teammates, edit it to override settings, and replay builds deterministically on any machine. + +### Benchmark in One Command + +The simplest way to evaluate a model — one command, zero setup: + +```bash +winml perf -m facebook/convnext-base-224 --device npu --monitor +``` + +ModelKit handles everything behind the scenes: download the model from Hugging Face, export to ONNX, optimize the graph, and run the benchmark on your NPU. The `--monitor` flag enables live hardware monitoring — real-time CPU utilization, RAM usage, and NPU activity alongside the latency results. + +This is ideal for quick smoke tests: does the model run on this device, and how fast is it? + +--- + +## :arrows_counterclockwise: The BYOM Workflow + +The **Build Your Own Model** (BYOM) workflow is the philosophy behind ModelKit. It defines how a source model becomes a production-ready, device-optimized artifact. + +### The Pipeline + +``` +Source Model --> Export --> Analyze --> Optimize --> Quantize --> Compile --> Benchmark +``` + +![BYOM Workflow](workflow-only.svg) + +Each arrow is a ModelKit command. You can enter the pipeline at any stage (for example, start with a local ONNX file and skip export), exit early (stop after optimization if you do not need quantization), or loop back to repeat a stage with different settings. + +--- + +## :clipboard: Built-in Models + +Run `winml hub` to browse the full catalog interactively. + +
+Click to expand the full model catalog + +| Model ID | Task | Architecture | +|:---------|:-----|:-------------| +| `microsoft/resnet-50` | image-classification | ResNet | +| `google/vit-base-patch16-224` | image-classification | ViT | +| `microsoft/swin-large-patch4-window7-224` | image-classification | Swin | +| `facebook/convnext-tiny-224` | image-classification | ConvNeXT | +| `rizvandwiki/gender-classification` | image-classification | ViT | +| `ProsusAI/finbert` | text-classification | BERT | +| `Intel/bert-base-uncased-mrpc` | text-classification | BERT | +| `cardiffnlp/twitter-roberta-base-sentiment-latest` | text-classification | RoBERTa | +| `dslim/bert-base-NER` | token-classification | BERT | +| `dbmdz/bert-large-cased-finetuned-conll03-english` | token-classification | BERT | +| `Babelscape/wikineural-multilingual-ner` | token-classification | BERT | +| `w11wo/indonesian-roberta-base-posp-tagger` | token-classification | RoBERTa | +| `microsoft/table-transformer-detection` | object-detection | Table Transformer | +| `mattmdjaga/segformer_b2_clothes` | image-segmentation | SegFormer | +| `nvidia/segformer-b1-finetuned-ade-512-512` | image-segmentation | SegFormer | +| `nvidia/segformer-b2-finetuned-ade-512-512` | image-segmentation | SegFormer | +| `nvidia/segformer-b5-finetuned-ade-640-640` | image-segmentation | SegFormer | + +
+ +These models are verified against ModelKit's full pipeline and serve as reliable starting points. You are not limited to this list — any Hugging Face model that passes `winml inspect` is a valid input. + +For models not in this table, run `winml inspect -m <model-id>` to verify support before proceeding. + +--- + +## :warning: Scope & Limitations + +### What ModelKit supports + +ModelKit targets **classic deep learning models** — CNNs, encoders, vision transformers, NLP classifiers, token classifiers, object detection models, and segmentation models. + +Supported tasks include: +- Image classification (ResNet, ViT, Swin, ConvNeXT) +- Text classification (BERT, RoBERTa) +- Token classification / NER (BERT, RoBERTa) +- Object detection (Table Transformer) +- Image segmentation (SegFormer) + +### What ModelKit does not support + +**LLMs and generative models are not in scope.** Do not use ModelKit with GPT, LLaMA, Phi, Mistral, Stable Diffusion, or any other generative model (decoder-only, sequence-to-sequence, or diffusion architectures). LLM support (with LoRA) is planned for Q3-Q4 2026. + +### Known constraints + +- `winml compile` requires an NPU device. If no NPU is available, skip the compile step and use `--device auto` for benchmarking. +- Some models may export successfully but fail during optimization or quantization due to unsupported operator patterns. The analyzer will flag these issues. +- Performance numbers vary by device, driver version, and EP version. Always benchmark on your target hardware. 
+ +--- + +## :world_map: Roadmap + +| Milestone | Target | Highlights | +|:----------|:-------|:-----------| +| :yellow_circle: **Kickoff** | Q4 2025 | Internal prototype, core primitive commands | +| :green_circle: **Early Access** | Q1 2026 | First external testers, config + build pipeline, hub catalog | +| :blue_circle: **Public Beta** | Q2 2026 | Open source, agent skills, AI Toolkit integration | +| :purple_circle: **RC** | Q3-Q4 2026 | **LLM support** (with LoRA), broader device coverage, MLIR | + +
+Click to expand roadmap details + +**Q4 2025 — Kickoff** +- Primitive commands: `inspect`, `export`, `optimize`, `quantize`, `compile` +- QNN, OpenVINO, and VitisAI execution provider support +- Internal validation with ResNet, BERT, ViT, SegFormer families + +**Q1 2026 — Early Access** +- Pipeline commands: `config`, `build`, `perf`, `eval` +- Analyzer with auto-configuration loop +- Built-in model catalog (`winml hub`) +- Live hardware monitoring (`--monitor`) + +**Q2 2026 — Public Beta** +- Open source release +- Agent-ready skills for coding assistants (Claude Code, Cursor, Copilot) +- AI Toolkit for VS Code integration + +**Q3-Q4 2026 — Release Candidate** +- LLM support (decoder-only architectures with LoRA adapters) +- TensorRT, MIGraphX, and DirectML execution providers +- MLIR-based optimization backend +- Public SDK and framework APIs + +
+ +--- + +## :handshake: Contributing + +*Coming soon.* We are working on contribution guidelines and will open the process during Public Beta. + +--- + +## :page_facing_up: License + +[MIT](../../LICENSE) diff --git a/docs/getting-started/04.customer-intro/blog/.omc/state/agent-replay-866056b6-6304-4966-8249-722504474ade.jsonl b/docs/getting-started/04.customer-intro/blog/.omc/state/agent-replay-866056b6-6304-4966-8249-722504474ade.jsonl new file mode 100644 index 000000000..3b21579bc --- /dev/null +++ b/docs/getting-started/04.customer-intro/blog/.omc/state/agent-replay-866056b6-6304-4966-8249-722504474ade.jsonl @@ -0,0 +1,10 @@ +{"t":0,"agent":"a1ec069","agent_type":"general-purpose","event":"agent_start","parent_mode":"none"} +{"t":0,"agent":"af8175a","agent_type":"general-purpose","event":"agent_start","parent_mode":"none"} +{"t":0,"agent":"a9eae2b","agent_type":"general-purpose","event":"agent_start","parent_mode":"none"} +{"t":0,"agent":"a1ec069","agent_type":"general-purpose","event":"agent_stop","success":true,"duration_ms":51012} +{"t":0,"agent":"af8175a","agent_type":"general-purpose","event":"agent_stop","success":true,"duration_ms":164633} +{"t":0,"agent":"a9eae2b","agent_type":"general-purpose","event":"agent_stop","success":true,"duration_ms":249207} +{"t":0,"agent":"a55e324","agent_type":"code-reviewer","event":"agent_start","parent_mode":"none"} +{"t":0,"agent":"a55e324","agent_type":"code-reviewer","event":"agent_stop","success":true,"duration_ms":145160} +{"t":0,"agent":"af28f9e","agent_type":"general-purpose","event":"agent_start","parent_mode":"none"} +{"t":0,"agent":"af28f9e","agent_type":"general-purpose","event":"agent_stop","success":true,"duration_ms":167223} diff --git a/docs/getting-started/04.customer-intro/blog/.omc/state/idle-notif-cooldown.json b/docs/getting-started/04.customer-intro/blog/.omc/state/idle-notif-cooldown.json new file mode 100644 index 000000000..9ae6dacd7 --- /dev/null +++ 
b/docs/getting-started/04.customer-intro/blog/.omc/state/idle-notif-cooldown.json @@ -0,0 +1,3 @@ +{ + "lastSentAt": "2026-04-13T06:19:41.820Z" +} diff --git a/docs/getting-started/04.customer-intro/blog/.omc/state/last-tool-error.json b/docs/getting-started/04.customer-intro/blog/.omc/state/last-tool-error.json new file mode 100644 index 000000000..0f551521c --- /dev/null +++ b/docs/getting-started/04.customer-intro/blog/.omc/state/last-tool-error.json @@ -0,0 +1,7 @@ +{ + "tool_name": "Bash", + "tool_input_preview": "{\"command\":\"ls \\\"D:\\\\BYOM\\\\ModelKit_PRs\\\\mvp\\\\src\\\\winml\\\\modelkit\\\\\\\" 2>/dev/null || ls \\\"D:\\\\BYOM\\\\ModelKit_PRs\\\\mvp\\\\src\\\\\\\" 2>/dev/null\",\"description\":\"List source directory structure\"}", + "error": "Exit code 2", + "timestamp": "2026-04-13T06:00:39.844Z", + "retry_count": 1 +} diff --git a/docs/getting-started/04.customer-intro/blog/.omc/state/mission-state.json b/docs/getting-started/04.customer-intro/blog/.omc/state/mission-state.json new file mode 100644 index 000000000..8388bc4b9 --- /dev/null +++ b/docs/getting-started/04.customer-intro/blog/.omc/state/mission-state.json @@ -0,0 +1,109 @@ +{ + "updatedAt": "2026-04-13T06:19:32.086Z", + "missions": [ + { + "id": "session:866056b6-6304-4966-8249-722504474ade:none", + "source": "session", + "name": "none", + "objective": "Session mission", + "createdAt": "2026-04-13T06:00:02.040Z", + "updatedAt": "2026-04-13T06:19:32.086Z", + "status": "done", + "workerCount": 5, + "taskCounts": { + "total": 5, + "pending": 0, + "blocked": 0, + "inProgress": 0, + "completed": 5, + "failed": 0 + }, + "agents": [ + { + "name": "general-purpose:a1ec069", + "role": "general-purpose", + "ownership": "a1ec069748f98639c", + "status": "done", + "currentStep": null, + "latestUpdate": "completed", + "completedSummary": null, + "updatedAt": "2026-04-13T06:00:53.052Z" + }, + { + "name": "general-purpose:af8175a", + "role": "general-purpose", + "ownership": "af8175aa34d0d64d5", + 
"status": "done", + "currentStep": null, + "latestUpdate": "completed", + "completedSummary": null, + "updatedAt": "2026-04-13T06:03:05.613Z" + }, + { + "name": "general-purpose:a9eae2b", + "role": "general-purpose", + "ownership": "a9eae2baaf11f14ae", + "status": "done", + "currentStep": null, + "latestUpdate": "completed", + "completedSummary": null, + "updatedAt": "2026-04-13T06:04:41.940Z" + }, + { + "name": "code-reviewer:a55e324", + "role": "code-reviewer", + "ownership": "a55e324ef83001c20", + "status": "done", + "currentStep": null, + "latestUpdate": "completed", + "completedSummary": null, + "updatedAt": "2026-04-13T06:07:24.518Z" + }, + { + "name": "general-purpose:af28f9e", + "role": "general-purpose", + "ownership": "af28f9e48f3e173eb", + "status": "done", + "currentStep": null, + "latestUpdate": "completed", + "completedSummary": null, + "updatedAt": "2026-04-13T06:19:32.086Z" + } + ], + "timeline": [ + { + "id": "session-start:a55e324ef83001c20:2026-04-13T06:04:59.358Z", + "at": "2026-04-13T06:04:59.358Z", + "kind": "update", + "agent": "code-reviewer:a55e324", + "detail": "started code-reviewer:a55e324", + "sourceKey": "session-start:a55e324ef83001c20" + }, + { + "id": "session-stop:a55e324ef83001c20:2026-04-13T06:07:24.518Z", + "at": "2026-04-13T06:07:24.518Z", + "kind": "completion", + "agent": "code-reviewer:a55e324", + "detail": "completed", + "sourceKey": "session-stop:a55e324ef83001c20" + }, + { + "id": "session-start:af28f9e48f3e173eb:2026-04-13T06:16:44.863Z", + "at": "2026-04-13T06:16:44.863Z", + "kind": "update", + "agent": "general-purpose:af28f9e", + "detail": "started general-purpose:af28f9e", + "sourceKey": "session-start:af28f9e48f3e173eb" + }, + { + "id": "session-stop:af28f9e48f3e173eb:2026-04-13T06:19:32.086Z", + "at": "2026-04-13T06:19:32.086Z", + "kind": "completion", + "agent": "general-purpose:af28f9e", + "detail": "completed", + "sourceKey": "session-stop:af28f9e48f3e173eb" + } + ] + } + ] +} diff --git 
a/docs/getting-started/04.customer-intro/blog/.omc/state/subagent-tracking.json b/docs/getting-started/04.customer-intro/blog/.omc/state/subagent-tracking.json new file mode 100644 index 000000000..736d2a7b1 --- /dev/null +++ b/docs/getting-started/04.customer-intro/blog/.omc/state/subagent-tracking.json @@ -0,0 +1,53 @@ +{ + "agents": [ + { + "agent_id": "a1ec069748f98639c", + "agent_type": "general-purpose", + "started_at": "2026-04-13T06:00:02.040Z", + "parent_mode": "none", + "status": "completed", + "completed_at": "2026-04-13T06:00:53.052Z", + "duration_ms": 51012 + }, + { + "agent_id": "af8175aa34d0d64d5", + "agent_type": "general-purpose", + "started_at": "2026-04-13T06:00:20.980Z", + "parent_mode": "none", + "status": "completed", + "completed_at": "2026-04-13T06:03:05.613Z", + "duration_ms": 164633 + }, + { + "agent_id": "a9eae2baaf11f14ae", + "agent_type": "general-purpose", + "started_at": "2026-04-13T06:00:32.733Z", + "parent_mode": "none", + "status": "completed", + "completed_at": "2026-04-13T06:04:41.940Z", + "duration_ms": 249207 + }, + { + "agent_id": "a55e324ef83001c20", + "agent_type": "oh-my-claudecode:code-reviewer", + "started_at": "2026-04-13T06:04:59.358Z", + "parent_mode": "none", + "status": "completed", + "completed_at": "2026-04-13T06:07:24.518Z", + "duration_ms": 145160 + }, + { + "agent_id": "af28f9e48f3e173eb", + "agent_type": "general-purpose", + "started_at": "2026-04-13T06:16:44.863Z", + "parent_mode": "none", + "status": "completed", + "completed_at": "2026-04-13T06:19:32.086Z", + "duration_ms": 167223 + } + ], + "total_spawned": 5, + "total_completed": 5, + "total_failed": 0, + "last_updated": "2026-04-13T06:19:32.200Z" +} diff --git a/docs/getting-started/04.customer-intro/blog/01a-why-modelkit.md b/docs/getting-started/04.customer-intro/blog/01a-why-modelkit.md new file mode 100644 index 000000000..57a38f5ad --- /dev/null +++ b/docs/getting-started/04.customer-intro/blog/01a-why-modelkit.md @@ -0,0 +1,32 @@ +# ModelKit — Swiss 
Knife for Windows ML Model + +## What Is ModelKit? + +ModelKit is a CLI toolkit to build portable, performant, and high-quality models for Windows ML — bridging the gap between pretrained models and on-device inference. + +One toolkit covers everything — export, optimization, quantization, compilation, and benchmarking — across all execution providers, regardless of silicon. + +## Goals + +ModelKit is built around four guiding principles: + +- **Portable Models** — **Build once, run anywhere**. A model prepared through ModelKit targets the Windows ML runtime and runs on **any supported execution provider** without per-device rework. +- **Flexible Pipeline** — **Compose your own pipeline** from independent primitives. Each stage (export, optimize, quantize, compile) can be mixed, matched, and reordered to build any model. +- **Human-in-the-Loop** — **Nothing is a black box**. Step into any stage for refinement or debugging. Every intermediate artifact is inspectable, editable, and reproducible. +- **AI Agent Ready** — ModelKit exposes built-in skills that AI-augmented workflows can consume. **Coding agents can drive the entire build pipeline** programmatically. 
+ +## ModelKit Is Right for You If + +- You want to build models that run on **any Windows device** +- You want to benchmark a model with **one command** +- You want to catch compatibility issues **ahead of time** +- You want **deep insights** into your model +- You want a **repeatable and traceable** model building process +- You want **AI agents** to build and profile models for you + +## Roadmap + +- **Q4 2025** — Project kickoff +- **Q1 2026** — Early access and feedback +- **Q2 2026** — Public beta: open source, coding agent skills, AI Toolkit integration +- **Q3–Q4 2026** — Release candidate: LLM support with LoRA, more devices (GPU & NPU), MLIR diff --git a/docs/getting-started/04.customer-intro/blog/01b-what-modelkit-offers.md b/docs/getting-started/04.customer-intro/blog/01b-what-modelkit-offers.md new file mode 100644 index 000000000..37b37a9ed --- /dev/null +++ b/docs/getting-started/04.customer-intro/blog/01b-what-modelkit-offers.md @@ -0,0 +1,57 @@ +# ModelKit — Core Promises & Command Reference + +## Promises + +- **Out-of-Box Experience** — **Build models for Windows ML with minimal setup**. Every command auto-detects optimal settings for the given model and **executes accordingly** — no manual configuration needed. +- **One Toolkit Covers All EPs** — **No separate vendor toolchain** per silicon. QNN, OpenVINO, VitisAI, DirectML, TensorRT, MIGraphX, and CPU — **seven EPs, one roof**. +- **Full Control** — **Reproducible and traceable** — whether through built-in pipelines or individual commands. **Step into any stage** for refinement or debugging. Every intermediate artifact is **inspectable and editable**. +- **Build-Time Quality Gates** — Catch compatibility problems, suboptimal operators, and quantization regressions — and **suggest fixes automatically**, before the model ever reaches a device. 
**Three quality pillars**: + - **Analyze** ensures portability across target EPs + - **Optimize** improves graph performance + - **Evaluate** guards accuracy after quantization + +## Command Overview + +ModelKit organizes its commands into four categories: + +**Primitives** — The building blocks. Each command handles one stage of the model preparation pipeline. + +- `inspect` — Discover model metadata, task, I/O shapes, and EP support +- `export` — Convert source model to ONNX with hierarchy-preserving metadata +- `optimize` — Fuse operators, simplify graphs, prepare for target EP +- `quantize` — Compress model to low-bit precision for smaller footprint and faster inference +- `compile` — Generate device-specific binaries (e.g., QNN context binaries) + +**Pipeline** — Orchestration commands that chain primitives together. + +- `config` — Auto-detect task, I/O shapes, and optimal settings into a JSON config +- `build` — Orchestrate the full pipeline — export, analyze, optimize, quantize, compile +- `perf` — Benchmark latency, throughput, and hardware utilization +- `eval` — Evaluate model accuracy against reference datasets +- `run` — End-to-end inference with pre/post processing *(coming soon)* + +**Insights** — Diagnostic tools for understanding what is happening inside a model. + +- `analyze` — Lint operators, check EP compatibility, generate optimization config +- `debug` — Interactive model debugging and layer-by-layer inspection *(coming soon)* + +**Utilities** — Housekeeping and environment management. 
+ +- `cache` — Manage built model artifacts and pipeline outputs +- `doctor` — Diagnose environment issues (runtimes, providers, dependencies) +- `setting` — Configure ModelKit preferences +- `sys` — System information and capability reporting + +## Execution Provider Coverage + +ModelKit supports the major execution providers in the Windows ML ecosystem: + +| Provider | Hardware | Status | +|----------|----------|--------| +| **QNN** | Qualcomm GPU and NPU | 🟢 Ready | +| **OpenVINO** | Intel CPU, iGPU, dGPU, and NPU | 🟢 Ready | +| **VitisAI** | AMD NPU | 🟢 Ready | +| **TensorRT** | NVIDIA discrete GPUs | 🔶 Planned | +| **MIGraphX** | AMD discrete GPUs | 🔶 Planned | +| **DirectML** | Hardware-agnostic GPU backend | 🔶 Planned | +| **CPU** | Cross-platform fallback | ⚪ Always | diff --git a/docs/getting-started/04.customer-intro/blog/02-primitive-commands-demo.md b/docs/getting-started/04.customer-intro/blog/02-primitive-commands-demo.md new file mode 100644 index 000000000..093516ca5 --- /dev/null +++ b/docs/getting-started/04.customer-intro/blog/02-primitive-commands-demo.md @@ -0,0 +1,144 @@ +# ModelKit 101 — Building Models Step by Step with Primitive Commands + +## The Coding Phase + +Every model goes through a development lifecycle. Before anything is polished, repeatable, or automated, there is a **coding phase** — where developers explore the model, experiment with settings, debug compatibility issues, and iterate until the output looks right. + +ModelKit's **primitive commands** are built for exactly this phase. Each command handles a single stage of the pipeline, and developers can run them in any order, skip stages, or repeat them with different parameters. Full control, no magic. + +This walkthrough builds a ConvNeXT model from scratch using primitives, one step at a time. + +## The Model: ConvNeXT + +ConvNeXT is **a family of CNN models inspired by Vision Transformers**, introduced by Meta (Facebook) in 2022. 
It borrows several design choices from Transformers — such as larger kernel sizes and modernized training recipes — while retaining the efficiency and simplicity of convolutional architectures. The result is a model that delivers high accuracy at competitive speed, which is why ConvNeXT is widely adopted for tasks such as **image classification, object detection, and segmentation**. + +For this walkthrough, the specific variant is `facebook/convnext-base-224`. + +## The BYOM Workflow + +Before diving into commands, it helps to understand the workflow behind ModelKit. Given a source model, the pipeline will export, analyze, optimize, quantize, and evaluate before shipping. + +Three commands in this pipeline serve as **quality gates**: + +- **Analyze** — checks **portability**: will this model run on the target execution provider? +- **Optimize** — improves **performance**: restructures the graph for faster inference. +- **Quantize** — controls **fidelity**: compresses the model while preserving accuracy. + +These three steps define the quality of the output. ModelKit gives developers full control over each one. + +## Step-by-Step Walkthrough + +### 1. Inspect the Model + +The first step is always to understand what you are working with. `wmk inspect` reads model metadata — task, model class, I/O shapes — without loading weights. + +```bash +wmk inspect -m facebook/convnext-base-224 +``` + +This tells you everything about the model before any heavy computation begins. + +### 2. Export from PyTorch to ONNX + +Next, export the model from its PyTorch source into ONNX format. + +```bash +wmk export -m facebook/convnext-base-224 -o convnext/model.onnx -v +``` + +The `-v` flag enables verbose output so you can see exactly what the exporter is doing. + +### 3. Analyze for EP Compatibility + +With the ONNX model in hand, run the analyzer. It checks every operator against execution providers — reporting what is supported, what is partial, and what needs fixing. 
It also generates an optimization config automatically. + +```bash +wmk analyze -m convnext/model.onnx --optim-config optim.json +``` + +The analyzer produces `optim.json`, which captures exactly what the optimizer needs to fix. More on the analyzer below. + +### 4. Optimize the Graph + +Apply the optimizer using the config the analyzer just generated. The analyzer identified what to fix; the optimizer fixes it. + +```bash +wmk optimize -m convnext/model.onnx -c optim.json -o convnext/model_opt.onnx +``` + +### 5. Quantize to INT8 + +Compress the optimized model to INT8. After this step, the model is portable — it can run on any ONNX Runtime backend. + +```bash +wmk quantize -m convnext/model_opt.onnx -o convnext/model_opt_int8.onnx +``` + +### 6. Compile for NPU + +Generate device-specific binaries for the NPU via the QNN execution provider. + +```bash +wmk compile -m convnext/model_opt_int8.onnx --ep qnn -o convnext/model_compiled.onnx +``` + +### 7. Benchmark on NPU + +Run the compiled model on the NPU and record the latency. + +```bash +wmk perf -m convnext/model_compiled.onnx --ep qnn --iterations 100 +``` + +Keep this number in mind — the next step puts it in context. + +### NPU vs. CPU: The 25x Speedup + +Now benchmark the optimized (pre-quantization) model on CPU for comparison. + +```bash +wmk perf -m convnext/model_opt.onnx --ep cpu --iterations 100 +``` + +The difference is dramatic: the quantized, compiled model on NPU runs roughly **25x faster** than the optimized, unquantized model on CPU. Same architecture, comparable accuracy (confirm with `wmk eval`), completely different performance. This is what the full pipeline — analyze, optimize, quantize, compile — delivers. + +## Deep Dive: The Analyzer + +The analyzer deserves a closer look, because it is the key to building portable ONNX models. It is made of two parts: **Linter** and **AutoConf**. 
+ +### Linter — ESLint for ONNX + +The Linter checks every operator in the model graph against target execution providers and classifies them with a simple color scheme: + +- **Green** — fully supported +- **Gray** — partially supported (runs, but may fall back to CPU for some configurations) +- **Red** — unsupported (will not run on the target EP) + +Think of it as **ESLint, but for ONNX models**. It gives developers an immediate, visual read on whether the model is portable. + +### AutoConf — Automatic Optimization Config + +AutoConf goes one step further. It **detects suboptimal patterns in the graph and generates the configuration for the optimizer**. Instead of manually figuring out which optimization passes to apply, the analyzer does the detective work and writes `optim.json` for you. + +Together, Linter and AutoConf form the **analyze-optimize loop**: the analyzer diagnoses, the optimizer treats. + +## Recap: Three Phases + +The primitive workflow breaks down into three phases: + +1. **Inspect** — understand the model (task, architecture, I/O shapes) +2. **Build a Portable Model** — export, analyze, optimize, quantize +3. **On-Device Benchmarking** — compile, perf, eval + +Each command is independent. Developers can jump into any stage, swap parameters, retry with different settings, or skip steps entirely. That is the point of primitives — they give you the building blocks and get out of the way. + +## When to Use Primitives + +Primitive commands are the right tool when: + +- You are **exploring a new model** and want to understand its structure +- You need to **debug a compatibility issue** at a specific pipeline stage +- You want to **experiment with different optimization or quantization settings** +- You are building a **custom workflow** that does not follow the standard pipeline order + +In short, primitives are for the coding phase — flexible, transparent, and fully under your control. 
When the workflow is nailed down and it is time to automate, ModelKit offers a config-driven pipeline that collapses these steps into two commands. But that is a story for the next article. diff --git a/docs/getting-started/04.customer-intro/blog/03-config-build-demo.md b/docs/getting-started/04.customer-intro/blog/03-config-build-demo.md new file mode 100644 index 000000000..a7174e7dd --- /dev/null +++ b/docs/getting-started/04.customer-intro/blog/03-config-build-demo.md @@ -0,0 +1,79 @@ +# From Seven Commands to Two — ModelKit's Config-Driven Pipeline + +## Same Model, Different Approach + +The previous article walked through building ConvNeXT with primitive commands — seven individual steps from inspect to benchmark. That workflow is great for exploration and debugging, but once the recipe is known, running each command by hand gets repetitive. + +ModelKit's **config-driven pipeline** offers a different approach: automation over manual steps. Same model (`facebook/convnext-base-224`), same output quality, but collapsed into two commands. + +## The CMake Analogy + +If the concept sounds familiar, it should. The config-driven pipeline follows the same pattern as CMake: + +- **`wmk config`** is like **`cmake configure`** — it inspects the source, detects settings, and writes a build configuration. +- **`wmk build`** is like **`cmake --build`** — it reads that configuration and executes the full pipeline. + +Separate the *what* from the *how*. Configure once, build repeatably. + +## Step 1: Generate the Config + +```bash +wmk config -m facebook/convnext-base-224 -o convnext_config.json +``` + +This single command auto-detects everything the pipeline needs: + +- **Task** — image classification +- **I/O shapes** — input dimensions, output labels +- **Optimization flags** — which passes to apply for the target EP +- **Quantization parameters** — compression settings for INT8 + +The result is a JSON file that captures the entire build recipe. 
It is **reviewable** — open it in any editor to see exactly what will happen. It is **editable** — override any setting before building. And it is **version-controllable** — check it into source control alongside the model, so every build is traceable and reproducible. + +## Step 2: Build the Model + +```bash +wmk build -c convnext_config.json -m facebook/convnext-base-224 -o convnext_build/ +``` + +`wmk build` takes the config and runs the full pipeline in one go: **export, analyze, optimize, quantize, compile**. Every stage that was a separate command in the primitive workflow now executes automatically, in the right order, with the right parameters. + +## Benchmark the Result + +```bash +wmk perf -m convnext_build/model.onnx --ep qnn --iterations 100 +``` + +Same model, same quality — **two commands instead of seven**. The output is identical to what the primitive workflow produces, because the underlying pipeline stages are the same. The config-driven approach simply orchestrates them. + +## Primitive Commands vs. Config-Driven Pipeline + +| | **Primitive Commands** | **Config-Driven Pipeline** | +|---|---|---| +| **Lifecycle analogy** | Coding phase | Polish and delivery | +| **Best for** | Exploring, debugging, experimenting | Repeatable builds, team handoff, CI/CD | +| **Control** | Full — run any stage independently | Guided — configure once, build automatically | +| **Steps** | 7+ individual commands | 2 commands (config + build) | +| **When to use** | You need to iterate on a specific stage | The recipe is known and needs to be reproducible | + +Both approaches produce the same portable ONNX model. The difference is where you are in the development lifecycle and how much manual control you need. + +## One-Command Benchmark + +There is an even simpler path. 
When someone hands over a model and the only question is *"does it run, and how fast?"*, ModelKit can handle everything in a single command: + +```bash +wmk perf -m facebook/convnext-base-224 --ep qnn --iterations 100 --monitor +``` + +`wmk perf` with a model ID loads the model, exports it, optimizes it, and benchmarks it — all behind the scenes, with live hardware monitoring included. Think of it as a **sanity check for QA**: no config files, no build steps, just a quick smoke test. + +## Recap: Three Ways to Build + +ModelKit offers three approaches to building models, each tuned to a different stage of the development lifecycle: + +1. **Primitive commands** for development — iterate, debug, experiment. Full control over every stage. +2. **Config + Build** for delivery — repeatable, scriptable, easy to share with the team. Two commands. +3. **One command** for QA — validate, benchmark, and deliver. A quick smoke test when the model is ready. + +Same ConvNeXT, three different workflows. Pick what fits. diff --git a/docs/getting-started/04.customer-intro/bug-bash-guide.md b/docs/getting-started/04.customer-intro/bug-bash-guide.md new file mode 100644 index 000000000..e6e1ce7b8 --- /dev/null +++ b/docs/getting-started/04.customer-intro/bug-bash-guide.md @@ -0,0 +1,816 @@ +🐛 ModelKit Bug Bash Guide +======================= + +[Reference: Modelkit BugBash](https://github.com/microsoft/ModelKit/blob/qiowu/bugbash/bug-bash-guide.md#modelkit-bug-bash-guide) + +**Release**: v0.0.1.dev1 | **Date**: 2026-04-02 + +___ + +## Welcome! +Welcome to the **ModelKit Bug Bash** ! ModelKit is an **open‑source CLI on GitHub** that converts and optimizes PyTorch and Hugging Face models into high‑quality ONNX models for Windows ML. The current scope focuses on **classic deep learning models** (e.g., CNNs, vision transformers, NLP classifiers, segmentation). 
**LLMs and generative models are out of scope**—do not test GPT, LLaMA, Phi, Mistral, Stable Diffusion, or other decoder‑only or seq2seq generative architectures. + +## 📋 Prerequisites  +### Required Software +| **Component** | **How to Get It** | +|-----------|--------------| +| **Windows 11** (x64 or ARM64) | Windows 11 24H2+ required for NPU support | +| **UV**|Install [UV](https://github.com/astral-sh/uv)| +|**Windows APP SDK Runtime 1.8**| [Latest Windows App SDK downloads - Windows apps](https://learn.microsoft.com/en-us/windows/apps/windows-app-sdk/downloads)| +| **Modelkit (python wheel)** | Download [winml_modelkit-0.0.1.dev1-py3-none-any.whl](https://microsoft.sharepoint-df.com/:u:/r/teams/WinPD/Shared%20Documents/Forms/Gallery.aspx?id=%2Fteams%2FWinPD%2FShared%20Documents%2FModelKit%2Fwinml%5Fmodelkit%2D0%2E0%2E1%2Edev1%2Dpy3%2Dnone%2Dany%2Ewhl&parent=%2Fteams%2FWinPD%2FShared%20Documents%2FModelKit&p=true&share=cQqnvDjbLu18QZ%5FhHSiX%2D2f3EgUCQzr1M%2DQKvecLbJEsxiAn7g)| + +### Required Hardware +**This bug bash targets NPU only.** We recommend testing on one of the following NPU devices: +| Device | EP | Flag | +| --- | --- | --- | +| Snapdragon X Elite (Qualcomm) | QNN | `--ep qnn --device npu` | +| Intel AI Boost (Meteor Lake / Lunar Lake) | OpenVINO | `--ep openvino --device npu` | +| AMD Ryzen AI (Phoenix / Hawk Point / Strix) | VitisAI | `--ep vitisai --device npu` | +**No NPU?** Use `--device auto` — ModelKit will fall back to the best available device (GPU → CPU). Note that `winml compile` requires NPU and cannot run without one. 
+ +--- + +## ⚡ Reporting Bugs +[Modelkit and Local Model Agent Skills Bugs.loop](https://microsoft.sharepoint.com/:fl:/s/b35b0ac6-fbb0-43c1-abcc-4f55dd436ab2/IQCb2-g6XPRRTbcEz5TGPcqaATQAIt_nFDWhOLj5BRXwISY?e=kmgnzR&nav=cz0lMkZzaXRlcyUyRmIzNWIwYWM2LWZiYjAtNDNjMS1hYmNjLTRmNTVkZDQzNmFiMiZkPWIlMjFSWno0enY0YzhFdVR1eXFBZlpoeFVDRmJ4NTkxelZKSG93TUM5X1NqLUpTQS1BUEpPQ2RaU2JCZzlVckV2YUkwJmY9MDE0VUhFUUlFMzNQVURVWEhVS0ZHM09CR1BTVEREM1NVMiZjPSUyRiZhPUxvb3BBcHAmcD0lNDBmbHVpZHglMkZsb29wLXBhZ2UtY29udGFpbmVy) + +--- +## 🧩 ModelKit Setup + +Download the wheel file shared for this bug bash: `winml_modelkit-0.0.1.dev1-py3-none-any.whl` + +```bash +# Create a Python 3.10 virtual environment +uv venv --python 3.10 +.\.venv\Scripts\activate + +# Install ModelKit from wheel +uv pip install '.\\winml_modelkit-0.0.1.dev1-py3-none-any.whl' + +# Sanity check — verify NPU device and EP are available +winml sys --list-device --list-ep +``` + +> **Dependency**: ModelKit depends on onnxruntime-windowsml 1.23.x. Please ensure the required Execution Providers (EPs) are installed. + +> **NPU required for E2E tests.** Run `winml sys --list-device --list-ep` to confirm your NPU and EP are detected: +> - Snapdragon X Elite → `QNNExecutionProvider` listed +> - Intel AI Boost → `OpenVINOExecutionProvider` listed +> - AMD Ryzen AI → `VitisAIExecutionProvider` listed +> +> If no NPU is available, use `--device auto` for perf, config, build, and eval — ModelKit will fall back to the best available device. `winml compile` requires NPU and should be marked SKIP if no NPU is present. + +--- + +## 🤖 Quick Start and Core Feature Tests + +### 📝 Path A: Run with Claude Code or GitHub Copilot +Paste this prompt into your coding agent (Claude Code, Cursor, Copilot, etc.) to run everything automatically: + +#### Prompt A — PERF: HuggingFace end-to-end only + +Use this for a quick smoke test of the auto-pipeline perf path. 
+ +```text +I have winml-modelkit installed from wheel (`uv pip install winml_modelkit-0.0.1.dev1-py3-none-any.whl`) in the current venv. + +Before starting, run `winml sys --list-device --list-ep` to identify your NPU type and note +the corresponding EP flag: + - Snapdragon X Elite → --ep qnn --device npu + - Intel AI Boost → --ep openvino --device npu + - AMD Ryzen AI → --ep vitisai --device npu + +If no NPU is detected, use `--device auto` throughout. + +MODEL SELECTION: + Run `winml hub` to get the built-in model catalog. Randomly pick: + - MODEL_A: one image-classification model + - MODEL_B: one token-classification model + Run `winml inspect -m MODEL_A` and `winml inspect -m MODEL_B`. + If either fails, pick a different model. + +Important: run all `winml` commands directly — do not pipe output (e.g. no `| tail`, `| head`, `| tee`). +The EP requires an unpiped process; piping causes a crash that looks like a real failure. + +Run the following tests, capture output, and report pass/fail. +Stop and flag any failure before continuing. + +1. SYSTEM INFO + Run: winml sys --list-device --list-ep + Pass: your NPU device and its EP are listed. + +2. INSPECT + Run: winml inspect -m MODEL_A + Run: winml inspect -m MODEL_B + Pass: model task, loader/exporter/inference class, and support status all printed. + +3. PERF — HuggingFace end-to-end + Run: winml perf -m MODEL_A --device npu --iterations 100 + Run: winml perf -m MODEL_B --device npu --iterations 100 + Pass: auto-runs full pipeline for each model; reports NPU latency and throughput. + +4. PERF — live hardware monitor + Run: winml perf -m MODEL_A --device npu --monitor --iterations 1000 + Pass: live NPU utilization chart shown during run; final latency table printed. + +After all tests, produce a summary table: + | # | Test | Model | Status | Notes | + showing PASS / FAIL / SKIP for each item, + including which MODEL_A, MODEL_B, and EP were used. 
+``` + +--- + +#### Prompt B — Config + Build + Perf with ONNX + +Use this to test the config/build pipeline and then benchmark both the compiled ONNX and the HuggingFace auto-pipeline. + +```text +I have winml-modelkit installed from wheel (`uv pip install winml_modelkit-0.0.1.dev1-py3-none-any.whl`) in the current venv. + +Before starting, run `winml sys --list-device --list-ep` to identify your NPU type and note +the corresponding EP flag: + - Snapdragon X Elite → --ep qnn --device npu + - Intel AI Boost → --ep openvino --device npu + - AMD Ryzen AI → --ep vitisai --device npu + +If no NPU is detected, use `--device auto` for all commands that accept a device flag. +`winml compile` (inside `winml build`) requires NPU; mark build steps SKIP if no NPU is present. + +MODEL SELECTION: + Run `winml hub` to get the built-in model catalog. Randomly pick: + - MODEL_A: one image-classification model + - MODEL_B: one token-classification model + Run `winml inspect -m MODEL_A` and `winml inspect -m MODEL_B`. + If either fails, pick a different model. Substitute EP_FLAGS with your device's EP flag. + +Important: run all `winml` commands directly — do not pipe output (e.g. no `| tail`, `| head`, `| tee`). +The EP requires an unpiped process; piping causes a crash that looks like a real failure. + +Run the following tests, capture output, and report pass/fail. +Stop and flag any failure before continuing. + +1. SYSTEM INFO + Run: winml sys --list-device --list-ep + Pass: your NPU device and its EP are listed. + +2. INSPECT + Run: winml inspect -m MODEL_A + Run: winml inspect -m MODEL_B --verbose + Pass: model task, loader/exporter/inference class, and support status all printed. + +3. CONFIG + BUILD — MODEL_A + Run: winml config -m MODEL_A --device npu --precision int8 -o model_a_config/config.json + Run: winml build -c model_a_config/config.json -m MODEL_A -o model_a_config/ + Pass: config.json generated; build completes all stages (export→optimize→quantize→compile) targeting NPU. + +4. 
CONFIG + BUILD — MODEL_B + Run: winml config -m MODEL_B --device npu --precision int8 -o model_b_config/config.json + Run: winml build -c model_b_config/config.json -m MODEL_B -o model_b_config/ + Pass: config.json generated; build completes all stages targeting NPU. + +5. PERF — direct ONNX (built artifact) + Run: winml perf -m model_a_config/{model_name}.onnx --device npu --iterations 100 + Run: winml perf -m model_b_config/{model_name}.onnx --device npu --iterations 100 + Pass: reports P50/P90/Avg latency and throughput on NPU for each model. + +6. PERF — HuggingFace end-to-end + Run: winml perf -m MODEL_A --device npu --iterations 100 + Run: winml perf -m MODEL_B --device npu --iterations 100 + Pass: auto-runs full pipeline for each model; reports NPU latency and throughput. + +After all tests, produce a summary table: + | # | Test | Model | Status | Notes | + showing PASS / FAIL / SKIP for each item, + including which MODEL_A, MODEL_B, and EP_FLAGS were used, + followed by any commands that need investigation. +``` + +--- + +#### Prompt C — Full pipeline (all features) + +Use this for a complete end-to-end bug bash covering all commands. + +```text +I have winml-modelkit installed from wheel (`uv pip install winml_modelkit-0.0.1.dev1-py3-none-any.whl`) in the current venv. +All tests target NPU. Before starting, run `winml sys --list-device --list-ep` to identify your NPU type and use +the corresponding EP flag throughout: + - Snapdragon X Elite → --ep qnn --device npu + - Intel AI Boost → --ep openvino --device npu + - AMD Ryzen AI → --ep vitisai --device npu + +If no NPU is detected, use `--device auto` for all commands that accept a device flag — ModelKit will fall back +to the best available device (GPU → CPU). Only `winml compile` (step 8) requires NPU; mark that step SKIP if +no NPU is available. + +Important: run all `winml` commands directly — do not pipe output (e.g. no `| tail`, `| head`, `| tee`). 
+The EP requires an unpiped process; piping causes a crash that looks like a real failure. + +Run through the following ModelKit core feature tests in order, executing each command, +capturing its output, and reporting pass/fail with a brief summary. Stop and +flag any failure before continuing — do not skip errors silently. + +MODEL SELECTION (do this before starting): + Run `winml hub` to get the full built-in model catalog. From the output, + randomly pick: + - MODEL_A: one image-classification model + - MODEL_B: one token-classification model (avoid text-classification — eval is broken, see known issue #216) + Then run `winml inspect -m MODEL_A` and `winml inspect -m MODEL_B`. + If either fails, pick a different model. Use these two models throughout + all tests below (substitute wherever you see MODEL_A / MODEL_B). + Also substitute EP_FLAGS with the EP flag for your device (e.g., --ep qnn --device npu). + +1. SYSTEM INFO + Run: winml sys + Run: winml sys --list-device --list-ep + Pass: your NPU device and its EP are listed. + +2. INSPECT + Run: winml inspect -m MODEL_A + Run: winml inspect -m MODEL_B --verbose + Pass: model task, loader/exporter/inference class, and support status all printed. + +3. HUB + Run: winml hub + Run: winml hub --task image-classification + Run: winml hub --model MODEL_A + Pass: catalog table shown; per-model detail includes accuracy info. + +4. EXPORT + Run: winml export -m MODEL_A -o model_a/model.onnx + Pass: ONNX file produced at the specified path; no errors. + +5. ANALYZE + Note: winml analyze requires --device NPU (uppercase). Replace the --device portion of EP_FLAGS with NPU, and replace {ep} with your EP name (qnn, openvino, or vitisai). + Run: winml analyze --model model_a/model.onnx --ep {ep} --device NPU + Run: winml analyze --model model_a/model.onnx --ep {ep} --device NPU --information + Run: winml analyze --model model_a/model.onnx --ep {ep} --device NPU --run-unknown-op + Pass: operator compatibility report shown for your NPU EP; --run-unknown-op runs without crash. + +6. 
OPTIMIZE + Run: winml optimize --list-capabilities + Run: winml optimize --list-rewrites + Run: winml optimize -m model_a/model.onnx -o model_a/model_opt.onnx + Pass: optimized ONNX produced; file size equal or smaller than input. + +--- Steps 7–14: use --device npu if NPU is available, otherwise --device auto (except compile, which requires NPU). --- + +7. QUANTIZE + Run: winml quantize -m model_a/model_opt.onnx --precision int8 -o model_a/model_int8.onnx + Run: winml quantize -m model_a/model_opt.onnx --weight-type int8 --activation-type uint16 -o model_a/model_w8a16.onnx + Pass: each completes; output ONNX contains QDQ nodes. + Note: re-running with the same -o path will crash with FileExistsError (known issue #185) — delete old output first. + +8. COMPILE + Run: winml compile --list + Run: winml compile -m model_a/model_int8.onnx --output-dir model_a/compiled/ EP_FLAGS + Run: winml compile -m model_a/model_int8.onnx --output-dir model_a/compiled_noquant/ EP_FLAGS --no-quantize + Pass: compiled ONNX produced in output dir. + +9. PERF — direct ONNX + Run: winml perf -m model_a/compiled/{model_name}.onnx --device npu --iterations 100 + Pass: reports P50/P90/Avg latency and throughput on NPU. + +10. PERF — HuggingFace end-to-end + Run: winml perf -m MODEL_A --device npu --iterations 100 + Run: winml perf -m MODEL_B --device npu --iterations 100 + Pass: auto-runs full pipeline for each model; reports NPU latency and throughput. + +11. PERF — live hardware monitor + Run: winml perf -m MODEL_A --device npu --monitor --iterations 1000 + Pass: live NPU utilization chart shown during run; final latency table printed. + +12. CONFIG + BUILD — MODEL_A + Run: winml config -m MODEL_A --device npu --precision int8 -o model_a_config/config.json + Run: winml build -c model_a_config/config.json -m MODEL_A -o model_a_config/ + Pass: build completes all stages (export→optimize→quantize→compile) targeting NPU. + +13. 
CONFIG + BUILD — MODEL_B + Run: winml config -m MODEL_B --device npu --precision int8 -o model_b_config/config.json + Run: winml build -c model_b_config/config.json -m MODEL_B -o model_b_config/ + Pass: config.json generated; build completes all stages targeting NPU. + +14. EVAL + Note: Only the following models support a built-in default eval dataset: + image-classification: microsoft/resnet-50, facebook/convnext-tiny-224 + text-classification: Intel/bert-base-uncased-mrpc + token-classification: dslim/bert-base-NER, dbmdz/bert-large-cased-finetuned-conll03-english, + Babelscape/wikineural-multilingual-ner + Other models require a custom --dataset config. Use one of the above as MODEL_A/MODEL_B, + or skip this step if neither model is in the list above. + Run: winml eval -m MODEL_A --device npu --samples 100 + Run: winml eval -m MODEL_B --device npu --samples 100 + Pass: accuracy metric reported without error (uses auto-detected dataset). + +After all tests, produce a summary table: + | # | Test | Model | Status | Notes | + showing PASS / FAIL / SKIP for each item, + including which MODEL_A, MODEL_B, and EP_FLAGS were used, + followed by any commands that need investigation. +``` + +### 📝 Path B: Run CLI in your terminal + +Run through each section below and note any failures. Report issues with the exact command, output, and machine spec. +#### Accepted inputs + +- **HuggingFace model ID** (e.g., `microsoft/resnet-50`) — model weights are downloaded on first run. +- **Local ONNX file** (e.g., `model.onnx`) — produced by `winml export` or `winml build`, or any ONNX file you already have on hand. + +#### The golden rule: inspect first + +Before running any pipeline command on a model, always verify it is supported: + +```bash +winml inspect -m {model} +``` + +If `inspect` prints an error or shows `Unsupported`, **stop and skip that model**. Only models that pass inspect are valid inputs for export, analyze, perf, build, and eval. 
+ +#### Recommended test models + +Use models from the built-in catalog (`winml hub`) or from the following. + +**Built-in hub models** (`winml hub` to list all): + +| Model ID | Task | Architecture | +|----------|------|--------------| +| `microsoft/resnet-50` | image-classification | resnet | +| `google/vit-base-patch16-224` | image-classification | vit | +| `microsoft/swin-large-patch4-window7-224` | image-classification | swin | +| `facebook/convnext-tiny-224` | image-classification | convnext | +| `rizvandwiki/gender-classification` | image-classification | vit | +| `ProsusAI/finbert` | text-classification | bert | +| `Intel/bert-base-uncased-mrpc` | text-classification | bert | +| `cardiffnlp/twitter-roberta-base-sentiment-latest` | text-classification | roberta | +| `dslim/bert-base-NER` | token-classification | bert | +| `dbmdz/bert-large-cased-finetuned-conll03-english` | token-classification | bert | +| `Babelscape/wikineural-multilingual-ner` | token-classification | bert | +| `w11wo/indonesian-roberta-base-posp-tagger` | token-classification | roberta | +| `microsoft/table-transformer-detection` | object-detection | table-transformer | +| `mattmdjaga/segformer_b2_clothes` | image-segmentation | segformer | +| `nvidia/segformer-b1-finetuned-ade-512-512` | image-segmentation | segformer | +| `nvidia/segformer-b2-finetuned-ade-512-512` | image-segmentation | segformer | +| `nvidia/segformer-b5-finetuned-ade-640-640` | image-segmentation | segformer | + + +--- + +### 💡 Using one command to build models + +### Perf (`winml perf`) + +```bash + +# HuggingFace end-to-end (auto-pipeline) on NPU +winml perf -m microsoft/resnet-50 --device npu --iterations 100 + +# Live NPU utilization monitor +winml perf -m microsoft/resnet-50 --device npu --monitor --iterations 1000 +``` + +**Options**: +- `-m/--model` — HuggingFace model ID or local `.onnx` file +- `--task` — explicit task (auto-detected if not specified) +- `--iterations` — benchmark iterations (default: 100) 
+- `--warmup` — warmup iterations excluded from stats (default: 10) +- `--device` — target device; always specify `npu` in this bug bash +- `--precision` — `auto`, `int8`, `int16`, or `w{x}a{y}` (default: auto) +- `--ep` — force specific EP (use `qnn` for NPU) +- `-o/--output` — output JSON file path +- `--batch-size` — input batch size (default: 1) +- `--shape-config` — JSON file with shape overrides +- `--no-quantize` — skip quantization during auto build +- `--rebuild` — force rebuild of cached artifacts +- `--ignore-cache` — build in temp folder, discard after run +- `--monitor` — live NPU utilization chart during benchmark +- `--op-tracing [basic|detail]` — operator-level profiling (requires `onnxruntime-qnn`; see known issue #217 — may crash or produce empty trace) +- `-v/--verbose` + +**Pass criteria**: +- All variants report P50/P90/Avg latency and throughput on NPU +- `--monitor` shows live NPU utilization chart during run + +--- + +### 💡 Using pipeline to build models + +### System Info (`winml sys`) + +```bash +winml sys +winml sys --list-device --list-ep +``` + +**Options**: +- `-f/--format [text|json|compact]` — output format (default: text) +- `-v/--verbose` — additional diagnostic information +- `--list-device` — list available devices in priority order +- `--list-ep` — list available execution providers + +**Pass criteria**: Your NPU device and its EP (QNN / OpenVINO / VitisAI) are listed. + +--- + +### Hub (`winml hub`) + +```bash +winml hub +winml hub --model-type bert +winml hub --task image-classification +winml hub --model microsoft/resnet-50 +winml hub --output catalog.json +``` + +**Options**: +- `-t/--model-type` — filter by architecture (e.g., `bert`, `vit`) +- `-k/--task` — filter by task (e.g., `text-classification`) +- `-m/--model` — show detail for a specific model +- `-o/--output` — save results to JSON file + +**Pass criteria**: Catalog table displayed; per-model detail includes accuracy verdict. 
+ +--- + +### Inspect (`winml inspect`) + +> Run this before testing any model. If inspect fails, skip the model entirely. + +```bash +winml inspect -m microsoft/resnet-50 +winml inspect -m microsoft/resnet-50 --verbose +winml inspect -m microsoft/resnet-50 --hierarchy +``` + +**Options**: +- `-m/--model` — HuggingFace model ID or local ONNX file path (required) +- `-f/--format [table|json]` — output format (default: table) +- `-v/--verbose` — show full configuration details +- `-t/--task` — override auto-detected task +- `-H/--hierarchy` — show HF module hierarchy (random weights, no download) + +**Pass criteria**: Model task, loader/exporter class, inference class, and support status all printed without error. + +--- + +### Config + Build (`winml config` + `winml build`) + +```bash +# Generate NPU config and build +winml config -m microsoft/resnet-50 --device npu --precision int8 -o resnet_config/config.json +winml build -c resnet_config/config.json -m microsoft/resnet-50 -o resnet_config/ + +# Text model on NPU +winml config -m dslim/bert-base-NER --device npu --precision int8 -o bert_config/config.json +winml build -c bert_config/config.json -m dslim/bert-base-NER -o bert_config/ + +# Use global cache +winml build -c resnet_config/config.json -m microsoft/resnet-50 --use-cache +``` + +**`winml config` options**: +- `-m/--model` — HuggingFace model ID or `.onnx` file +- `-t/--task` — override auto-detected task +- `-d/--device npu` — always use `npu` in this bug bash +- `--ep [qnn|openvino|vitisai]` — EP matching your device +- `-p/--precision` — `int8`, `int16`, or `w{x}a{y}` recommended for NPU +- `-o/--output` — output JSON file (default: stdout) +- `--no-quant` — exclude quantization from config +- `--no-compile` — exclude compilation from config +- `--shape-config` — JSON with shape overrides + +**`winml build` options**: +- `-c/--config` — WinMLBuildConfig JSON (required) +- `-m/--model` — HuggingFace model ID or `.onnx` file (required) +- 
`-o/--output-dir` — output directory +- `--use-cache` — use global cache `~/.cache/winml/` +- `--rebuild` — force rebuild +- `--no-quant`, `--no-compile`, `--no-optimize` — skip stages +- `--no-analyze` — skip analyzer loop +- `--max-optim-iterations` — max autoconf re-optimization rounds (default: 3) + +**Pass criteria**: Config JSON generated; build completes all stages (export → optimize → quantize → compile) targeting NPU. + +--- + +### Perf (`winml perf`) + +```bash +# Direct ONNX benchmark on NPU +winml perf -m resnet_compiled/resnet_quant_int8_qnn_ctx.onnx --device npu --iterations 100 + +# HuggingFace end-to-end (auto-pipeline) on NPU +winml perf -m microsoft/resnet-50 --device npu --iterations 100 + +# Live NPU utilization monitor +winml perf -m microsoft/resnet-50 --device npu --monitor --iterations 1000 +``` + +**Options**: +- `-m/--model` — HuggingFace model ID or local `.onnx` file +- `--task` — explicit task (auto-detected if not specified) +- `--iterations` — benchmark iterations (default: 100) +- `--warmup` — warmup iterations excluded from stats (default: 10) +- `--device` — target device; always specify `npu` in this bug bash +- `--precision` — `auto`, `int8`, `int16`, or `w{x}a{y}` (default: auto) +- `--ep` — force specific EP (use `qnn` for NPU) +- `-o/--output` — output JSON file path +- `--batch-size` — input batch size (default: 1) +- `--shape-config` — JSON file with shape overrides +- `--no-quantize` — skip quantization during auto build +- `--rebuild` — force rebuild of cached artifacts +- `--ignore-cache` — build in temp folder, discard after run +- `--monitor` — live NPU utilization chart during benchmark +- `--op-tracing [basic|detail]` — operator-level profiling (requires `onnxruntime-qnn`; see known issue #217 — may crash or produce empty trace) +- `-v/--verbose` + +**Pass criteria**: +- All variants report P50/P90/Avg latency and throughput on NPU +- `--monitor` shows live NPU utilization chart during run + +--- + +### Eval (`winml 
eval`) + +```bash +winml eval -m microsoft/resnet-50 --device npu --samples 100 +winml eval -m dslim/bert-base-NER --device npu --samples 100 +winml eval --task image-classification --schema +``` + +**Options**: +- `-m/--model` — HuggingFace model ID or `.onnx` file +- `--model-id` — HuggingFace model ID when `-m` points to an `.onnx` file +- `--dataset` — HF dataset path (auto-selected per task if omitted) +- `--task` — override auto-detected task +- `--device npu` — always use `npu` in this bug bash +- `--samples` — number of samples (default: 100) +- `--split` — dataset split (default: validation) +- `--shuffle/--no-shuffle` +- `--streaming` — stream dataset instead of downloading fully +- `-o/--output` — output JSON file path +- `--schema` — print expected dataset schema for the given task and exit + +**Models with built-in default dataset** (no `--dataset` needed): +- image-classification: `microsoft/resnet-50`, `facebook/convnext-tiny-224` +- text-classification: `Intel/bert-base-uncased-mrpc` *(but see known issue #216 — avoid for eval)* +- token-classification: `dslim/bert-base-NER`, `dbmdz/bert-large-cased-finetuned-conll03-english`, `Babelscape/wikineural-multilingual-ner` + +Other models require `--dataset {dataset}`. Skip this command if the model has no default dataset. + +**Pass criteria**: Accuracy metric reported without error on NPU. 
+ +--- + +### 💡 Using primitive commands to build a model step by step + +### Export (`winml export`) + +```bash +winml export -m microsoft/resnet-50 -o resnet_onnx/model.onnx +``` + +**Options**: +- `-m/--model` — HuggingFace model ID or local path (required) +- `-o/--output` — output ONNX file path (required) +- `-v/--verbose` — verbose 8-step output (crashes on Windows cp1252 terminals; see known issue #214) +- `--with-report` — generate markdown + JSON reports alongside ONNX +- `--clean-onnx` — produce a clean ONNX without embedded metadata +- `--no-hierarchy` — skip embedding hierarchy metadata +- `--dynamo` — use PyTorch 2.0+ dynamo export +- `--torch-module` — include specific `torch.nn` modules in hierarchy (comma-separated) +- `-t/--task` — override auto-detected task +- `--input-specs` — JSON file with custom input specifications +- `--export-config` — ONNX export configuration JSON +- `--shape-config` — JSON with shape overrides + +**Pass criteria**: ONNX file produced at the specified path; no errors. 
+ +--- + +### Analyze (`winml analyze`) + +Use the EP matching your device: + +```bash +# Snapdragon X Elite (QNN) +winml analyze --model resnet_onnx/model.onnx --ep qnn --device NPU --information +winml analyze --model resnet_onnx/model.onnx --ep qnn --device NPU --run-unknown-op + +# Intel AI Boost (OpenVINO) +winml analyze --model resnet_onnx/model.onnx --ep openvino --device NPU --information + +# AMD Ryzen AI (VitisAI) +winml analyze --model resnet_onnx/model.onnx --ep vitisai --device NPU --information + +# Save output to file +winml analyze --model resnet_onnx/model.onnx --ep qnn --device NPU --output results.json +``` + +**Options**: +- `--model` — path to ONNX model (required) +- `--ep` — target EP: use `qnn` for NPU +- `--device [CPU|GPU|NPU]` — target device; always use `NPU` (uppercase) in this bug bash +- `-v/--verbose` / `-q/--quiet` +- `--output` — save JSON output to file +- `--information/--no-information` — include detailed recommendations (default: enabled) +- `--run-unknown-op/--no-run-unknown-op` — run unknown ops on local machine (default: enabled) +- `--save-node [partial|unsupported]` — save specific node types for further analysis +- `--htp-metadata` — path to HTP metadata JSON for enhanced pattern extraction + +**Pass criteria**: EP compatibility report shown for your NPU; `--run-unknown-op` runs without crash. + +> **Known issue #194**: Exit code is 1 when any EP has unknown operators, even if QNN NPU reports 100% support. Check the report content, not just the exit code. 
+ +--- + +### Optimize (`winml optimize`) + +```bash +winml optimize --list-capabilities +winml optimize --list-rewrites +winml optimize -m resnet_onnx/model.onnx -o resnet_optimized.onnx +``` + +**Options**: +- `-l/--list-capabilities` — list all registered capabilities and exit +- `--list-rewrites` — list available pattern rewrite families and exit +- `-m/--model` — input ONNX model file +- `-o/--output` — output path (default: `{input}_opt.onnx`) +- `-c/--config` — YAML/JSON config file +- `-v/--verbose` +- `--enable-*/--disable-*` — toggle individual capabilities (see `--list-capabilities`) + +**Pass criteria**: Optimized ONNX produced; file size equal or smaller than input. + +--- + +### Quantize (`winml quantize`) + +```bash +winml quantize -m resnet_optimized.onnx --precision int8 -o resnet_quant_int8.onnx +winml quantize -m resnet_optimized.onnx --weight-type int8 --activation-type uint16 -o resnet_quant_w8a16.onnx +``` + +**Options**: +- `-m/--model` — input ONNX model file (required) +- `-o/--output` — output path (default: `{input}_qdq.onnx`) +- `-p/--precision [int8|int16|w{x}a{y}]` — precision shorthand +- `--weight-type [uint8|int8|uint16|int16]` — explicit weight type (overrides `--precision`) +- `--activation-type [uint8|int8|uint16|int16]` — explicit activation type (overrides `--precision`) +- `--samples` — calibration samples (default: 10) +- `--method [minmax|entropy|percentile]` — calibration method (default: minmax) +- `--per-channel` — per-channel quantization +- `--symmetric` — symmetric quantization +- `-v/--verbose` + +**Pass criteria**: Each command completes; output ONNX contains QDQ nodes. + +> **Known issue #185 (P1)**: Re-running with the same `-o` path crashes with `FileExistsError` if a `.onnx.data` sidecar already exists. Delete the old output file and its `.data` sidecar before re-running. + +> **Known issue #193 (P1)**: Output from standalone `winml quantize` may fail NPU compilation (QNN MaxPool NHWC layout error). 
Use `winml build` for the full NPU pipeline. + +--- + +### Compile (`winml compile`) + +```bash +# List available compilers +winml compile --list + +# Snapdragon X Elite (QNN) +winml compile -m resnet_quant_int8.onnx --output-dir resnet_compiled/ --ep qnn --device npu +winml compile -m resnet_quant_int8.onnx --output-dir resnet_compiled/ --ep qnn --device npu --no-quantize + +# Intel AI Boost (OpenVINO) +winml compile -m resnet_quant_int8.onnx --output-dir resnet_compiled/ --ep openvino --device npu + +# AMD Ryzen AI (VitisAI) +winml compile -m resnet_quant_int8.onnx --output-dir resnet_compiled/ --ep vitisai --device npu +``` + +**Options**: +- `-m/--model` — input ONNX model (required unless `--list`) +- `--output-dir` — output directory (default: same as input) +- `-d/--device` — target device; always `npu` in this bug bash (default: npu) +- `--ep` — use `qnn` for NPU +- `--quantize/--no-quantize` — enable/disable internal quantization (default: enabled) +- `--validate/--no-validate` — validate compiled model (default: enabled) +- `-v/--verbose` +- `--compiler [ort|qairt]` — compiler backend (default: ort) +- `--qnn-sdk-root` — path to QAIRT SDK root +- `--embed` — embed EP context in ONNX (default: external `.bin` file) +- `--list` — list available compilers for selected device and exit + +**Pass criteria**: Compiled ONNX produced in output directory. + +> **Known issue #186 (P1)**: `--ep qnn` silently falls back to OpenVINO when QNN SDK is not available. Verify your EP is correctly listed in `winml sys --list-ep` before compiling. 
+ +--- + +### Perf (`winml perf`) + +```bash +# Direct ONNX benchmark on NPU +winml perf -m resnet_compiled/resnet_quant_int8_qnn_ctx.onnx --device npu --iterations 100 + +# HuggingFace end-to-end (auto-pipeline) on NPU +winml perf -m microsoft/resnet-50 --device npu --iterations 100 + +# Live NPU utilization monitor +winml perf -m microsoft/resnet-50 --device npu --monitor --iterations 1000 +``` + +**Options**: +- `-m/--model` — HuggingFace model ID or local `.onnx` file +- `--task` — explicit task (auto-detected if not specified) +- `--iterations` — benchmark iterations (default: 100) +- `--warmup` — warmup iterations excluded from stats (default: 10) +- `--device` — target device; always specify `npu` in this bug bash +- `--precision` — `auto`, `int8`, `int16`, or `w{x}a{y}` (default: auto) +- `--ep` — force specific EP (use `qnn` for NPU) +- `-o/--output` — output JSON file path +- `--batch-size` — input batch size (default: 1) +- `--shape-config` — JSON file with shape overrides +- `--no-quantize` — skip quantization during auto build +- `--rebuild` — force rebuild of cached artifacts +- `--ignore-cache` — build in temp folder, discard after run +- `--monitor` — live NPU utilization chart during benchmark +- `--op-tracing [basic|detail]` — operator-level profiling (requires `onnxruntime-qnn`; see known issue #217 — may crash or produce empty trace) +- `-v/--verbose` + +**Pass criteria**: +- All variants report P50/P90/Avg latency and throughput on NPU +- `--monitor` shows live NPU utilization chart during run + +--- + +### Eval (`winml eval`) + +```bash +winml eval -m microsoft/resnet-50 --device npu --samples 100 +winml eval -m dslim/bert-base-NER --device npu --samples 100 +winml eval --task image-classification --schema +``` + +**Options**: +- `-m/--model` — HuggingFace model ID or `.onnx` file +- `--model-id` — HuggingFace model ID when `-m` points to an `.onnx` file +- `--dataset` — HF dataset path (auto-selected per task if omitted) +- `--task` — 
override auto-detected task +- `--device npu` — always use `npu` in this bug bash +- `--samples` — number of samples (default: 100) +- `--split` — dataset split (default: validation) +- `--shuffle/--no-shuffle` +- `--streaming` — stream dataset instead of downloading fully +- `-o/--output` — output JSON file path +- `--schema` — print expected dataset schema for the given task and exit + +**Models with built-in default dataset** (no `--dataset` needed): +- image-classification: `microsoft/resnet-50`, `facebook/convnext-tiny-224` +- text-classification: `Intel/bert-base-uncased-mrpc` *(but see known issue #216 — avoid for eval)* +- token-classification: `dslim/bert-base-NER`, `dbmdz/bert-large-cased-finetuned-conll03-english`, `Babelscape/wikineural-multilingual-ner` + +Other models require `--dataset `. Skip this command if the model has no default dataset. + +**Pass criteria**: Accuracy metric reported without error on NPU. + +--- + +## Known Issues (found during bug bash) + +| # | Issue | Severity | Area | Description | Workaround | +|---|-------|----------|------|-------------|------------| +| 1 | [#192](https://github.com/microsoft/ModelKit/issues/192) | P1 | `winml perf --module` | `AttributeError: ResNetModel has no attribute 'resnet'` — module path construction bug in `perf.py` when using `--module` on ResNet | Avoid `--module` on ResNet; try BERT (`BertAttention`) instead | +| 2 | [#193](https://github.com/microsoft/ModelKit/issues/193) | P1 | `winml quantize` → NPU | ONNX produced by standalone `winml quantize` fails NPU compilation (QNN MaxPool NHWC layout error); `winml build` pipeline is unaffected | Use `winml build` for NPU targets | +| 3 | [#185](https://github.com/microsoft/ModelKit/issues/185) | P1 | `winml quantize` re-run | Crashes with `FileExistsError` when output `.onnx.data` sidecar from a previous run already exists | Delete the old `.onnx` and `.onnx.data` files before re-running | +| 4 | 
[#186](https://github.com/microsoft/ModelKit/issues/186) | P1 | `winml compile --ep qnn` | Silently falls back to OpenVINO when QNN SDK is not installed; output file is still named `*_qnn_ctx.onnx`; downstream `winml perf` crashes | Verify QNN SDK is present via `winml sys` before compiling | +| 5 | [#194](https://github.com/microsoft/ModelKit/issues/194) | P2 | `winml analyze` exit code | Exits with code 1 when any EP has unknown operators, even when QNN NPU reports 100% support — may break CI pipelines | Check report content, not just exit code | +| 6 | [#195](https://github.com/microsoft/ModelKit/issues/195) | P2 | `winml perf --module` | `--module` expects a **class name** (e.g., `BertAttention`), not a module path; unclear error when wrong format is used | Run `winml inspect -m --hierarchy` to discover valid class names | +| 7 | [#175](https://github.com/microsoft/ModelKit/issues/175) | P2 | `winml perf` vs `winml build` | `winml perf -m ` and `winml config` + `winml build` can produce different export results for the same model | Use `winml build` output for production; file a repro if you observe discrepancies | +| 8 | [#182](https://github.com/microsoft/ModelKit/issues/182) | P2 | `winml analyze --run-unknown-op` | Static analyzer still creates single-node models for unknown QDQ ops even when `--run-unknown-op` is enabled | No workaround; may report incorrect support status for those ops | +| 9 | [#214](https://github.com/microsoft/ModelKit/issues/214) | P1 | `winml export --verbose` | `UnicodeEncodeError: 'charmap' codec can't encode character` — emoji in verbose output crashes on Windows (cp1252 encoding) | Omit `--verbose` flag; #208 fix incomplete for export command | +| 10 | [#215](https://github.com/microsoft/ModelKit/issues/215) | P2 | `winml analyze --device` | `--device` only accepts uppercase (`NPU`/`GPU`/`CPU`); all other commands accept lowercase — causes confusing errors | Use uppercase: `--device NPU` | +| 11 | 
[#216](https://github.com/microsoft/ModelKit/issues/216) | P1 | `winml eval` text-classification | `RuntimeError: Label alignment failed` — auto-selected dataset labels don't match model's label set (e.g. finbert positive/negative/neutral vs GLUE) | Avoid text-classification models for eval; image-classification and token-classification are unaffected | +| 12 | [#217](https://github.com/microsoft/ModelKit/issues/217) | P2 | `winml perf --op-tracing` | `--op-tracing` may crash or produce an empty trace; requires `onnxruntime-qnn` to be installed separately | Omit `--op-tracing` unless specifically testing this feature | + +--- + +## Quick Reference + +| Command | Purpose | +|---------|---------| +| `winml sys` | System info + device/EP inventory | +| `winml inspect -m {model}` | Verify model is supported (**run this first**) | +| `winml hub` | Browse built-in validated model catalog | +| `winml export -m {model} -o dir/` | Export to ONNX | +| `winml analyze --model {model.onnx} --ep qnn --device NPU` | Analyze QNN NPU compatibility | +| `winml optimize -m {model.onnx} -o out.onnx` | Apply graph optimizations | +| `winml quantize -m {model.onnx} --precision int8` | Insert QDQ quantization nodes | +| `winml compile -m {model.onnx} --ep qnn --device npu` | Compile for NPU | +| `winml perf -m {model} --device npu` | Benchmark on NPU | +| `winml perf -m {model} --device npu --monitor` | Benchmark with live NPU chart | +| `winml config -m {model} --device npu -o config.json` | Generate NPU build config | +| `winml build -c config.json -m {model}` | Build all stages for NPU | +| `winml eval -m {model} --device npu` | Evaluate accuracy on NPU | +| `winml --help` / `winml {command} --help` | Command reference | diff --git a/docs/getting-started/04.customer-intro/customer-intro-v5.md b/docs/getting-started/04.customer-intro/customer-intro-v5.md new file mode 100644 index 000000000..6c424b32a --- /dev/null +++ b/docs/getting-started/04.customer-intro/customer-intro-v5.md @@ -0,0 +1,319 @@ +# ModelKit Customer Introduction Deck — V5 + +**Session**: 20-minute introduction for software vendors 
+**Audience**: Developers and engineering leads new to ModelKit, interested in bringing models to WinML +**Author**: Zheng Te +**Date**: March 2026 +**Classification**: MVP Summit + +--- + +## Flow Overview + +``` +Opening → Slide 1-2 (Intro) → Slide 3-4 (Demo Setup) → Live Demos → Slide 5-9 (Recap) → Slide 10 (Close) +``` + +| Phase | Content | Time | +|-------|---------|------| +| Opening + Intro | Opening → Slide 1 → Slide 2 | ~4 min | +| Demo Setup | Slide 3 (Workflow) → Slide 4 (Three Demos) | ~2 min | +| Live Demos | Demo 1-3 in terminal | ~8 min | +| Recap | Slides 5-9 | ~4 min | +| Close | Slide 10: Why ModelKit? | ~2 min | +| **Total** | | **~20 min** | + +--- + +## Opening + +> Hello everyone, I'm Zheng from the WinPD team, welcome to this session. In the next 20 minutes, I'll introduce ModelKit — a new toolkit we've built over the past few months. + +--- + +## Part 1: Intro + +--- + +### Slide 1: What is ModelKit? + +#### Definition + +_ModelKit is a CLI toolkit to build portable, performant and high-quality models for Windows ML._ + +#### Goals + +1. **Portable Models** — Build once, run anywhere +2. **Flexible Pipeline** — Compose pipeline to build any model +3. **Human-in-the-Loop** — Step into any stage for model refinement and error debugging +4. **AI Agent Ready** — Skills for AI-augmented model building workflows + +#### Promises + +1. **Out-of-box Experience** — supported models work with minimal setup +2. **One Toolkit Covers All EPs** — no separate toolchain per vendor +3. **Repeatability and Traceability** — config-driven, predictable builds +4. **Build-Time Quality Gates** — catch issues ahead of time, with auto-fix + +#### Speaker Transcript + +> So, what is ModelKit? +> +> ModelKit is a CLI toolkit to build portable, performant and high-quality models for Windows ML. +> +> With ModelKit, you can build a model once and run anywhere — using built-in pipelines or composing your own from primitive commands. 
You can drill down into model details, pinpoint errors or performance bottlenecks at any stage. And ModelKit is AI-ready — we provide built-in skills that work with your favorite coding agent. +> +> ModelKit promises you an out-of-box experience — one toolkit covers all EPs, as well as full repeatability and traceability throughout both commands and pipelines. And we build quality gates into ModelKit to catch compatibility issues and suggest fixes automatically. + +--- + +### Slide 2: ModelKit — Command List + +#### Command List + +| Primitives | Pipeline | Insights | Utilities | +|-----------|----------|----------|-----------| +| inspect | config | analyze | cache | +| export | build | debug* | doctor | +| optimize | perf | | setting | +| quantize | eval | | sys | +| compile | run* | | | + +*\* coming soon* + +#### Speaker Transcript + +> OK, let's quickly go through the commands. We bucketize the commands into four categories. +> +> Primitive commands — you can use them individually or compose them into workflows. Pipeline commands that help you build and benchmark models end-to-end. Insight commands that enable model analysis and debugging. And a few utilities that support daily usage. + +--- + +## Part 2: ModelKit in Practice + +--- + +### Slide 3: Background — BYOM Workflow + +#### One-liner + +_Enhanced workflow with three commands as quality gates_ + +#### Three Pillar Steps + +| Command | Pillar | What it guards | +|---------|--------|----------------| +| **Analyze** | Portability | Does it run on the target EP? | +| **Optimize** | Performance | Is the graph efficient enough? | +| **Quantize** | Fidelity | Is the model still accurate? | + +#### Speaker Transcript + +> Before we jump into the demos, let me show you the workflow behind ModelKit. +> +> This is the pipeline your model goes through. Given a source model, we export to ONNX, analyze, optimize, quantize if needed, and evaluate before shipping. 
+> +> Here I want to highlight three commands that serve as quality gates. Analyze for portability — checking whether your model runs on the target EP. Optimize for performance — making sure the graph is efficient enough. And quantize for fidelity — understanding the impact on model accuracy after compression. +> +> These three steps directly impact the quality of your output. And ModelKit gives you full control over each one. + +--- + +### Slide 4: Three Ways to Build ConvNeXT with ModelKit + +#### Demo Preview + +All three demos use the **same model — ConvNeXT** — but in different ways: + +1. **Build with Primitive Commands** + - Full control, step by step + - Iterate, debug, experiment + +2. **Build with Config-Driven Pipeline** + - Two commands, automated + - Reproduce, polish, hand over + +3. **Benchmark in One Command** + - Zero setup, instant results + - Validate, benchmark, deliver + +#### Speaker Transcript + +> I'm going to show you three ways to build models with ModelKit. All with the same model — ConvNeXT — for easy comparison. +> +> First, I'll go with primitive commands. You'll see how to craft a model step by step with ModelKit. Then, I'll build ConvNeXT again with the config-driven pipeline — only two commands. And last, I'll show you how to quick-bench a model with ModelKit — in one command. +> +> Let's start. + +--- + +## Part 3: Recap Slides + +*After demos, switch back to slides.* + +--- + +### Slide 5: Build ConvNeXT with Primitive Commands + +#### Two Phases + +**📦 Build Portable Model** +- `export` — PyTorch to ONNX conversion +- `analyze` — EP compatibility and performance gap detection +- `optimize` — graph optimizations (shape inference, fusion, rewrite) +- `quantize` — low-bit model compression for fast inference + +**⚡ On-Device Benchmarking** +- `compile` — target a specific EP +- `perf` & `eval` — measure latency, throughput, and accuracy on device + +#### Speaker Transcript + +> OK, let me recap. 
In demo one, we used primitive commands to bring ConvNeXT to Windows ML — step by step. +> +> Two phases. Build a portable ONNX through export, analyze, optimize, quantize. Then benchmark on device with compile, perf, and eval. +> +> This gives you full control — you can jump into any stage, try different settings to fix errors or tweak performance. + +--- + +### Slide 6: Analyze ConvNeXT for EP Compatibility + +#### Key Points + +- Tagline: "Analyzer is the key to building portable ONNX models" +- **🚦 Linter**: ESLint for ONNX — Supported / Partial / Unsupported +- **⚙️ AutoConf**: GNU AutoConf for ONNX — tests capabilities, detects patterns, suggests fixes + +#### Speaker Transcript + +> Let me go deeper on the analyzer — it's the key to building portable ONNX models. +> +> The analyzer is made of two parts — Linter and AutoConf. +> +> The linter is like ESLint, but for ONNX. As you saw, it checks operator compatibility and classifies — green for supported, gray for partial, red for unsupported. +> +> AutoConf detects suboptimal patterns and generates the config for the optimizer. Together they form the analyze-optimize loop — which is what makes the models portable. + +--- + +### Slide 7: Build ConvNeXT with Config-Driven Pipeline + +#### Key Points + +- Two steps: `wmk config` → `wmk build` — same pattern as CMake +- Config is reviewable, editable, version-controllable + +#### Speaker Transcript + +> In demo two, we used `config` and `build`. Two commands instead of eight. +> +> `wmk config` generates the build config — auto-detects everything. `wmk build` orchestrates the full pipeline. Same result, repeatable and scriptable. Think CMake for models. + +--- + +### Slide 8: Primitive Commands vs. 
Config-Driven Pipeline + +#### Comparison + +| | Primitive Commands | Config-Driven Pipeline | +|---|---|---| +| **Lifecycle Analogy** | Coding | Polish | +| **Best for** | Flexible workflow | Production-ready delivery | +| **Control** | Start from any stage, try different settings to fix errors or tweak performance | Repeatable, scriptable, version-controllable | +| **Steps** | One command per stage | Two commands: config + build | +| **When to use** | Exploring, debugging, prototyping | CI/CD, batch builds, team workflows | + +#### Speaker Transcript + +> So when do you use which? Think of it like a development lifecycle. +> +> Primitive commands are the coding phase — you explore, debug, try different settings until the model works the way you want. Full flexibility. +> +> Config-driven pipeline is the polish phase — you've figured out what works, now you make it repeatable, scriptable, and easy to hand over to your team. +> +> Both produce the same portable ONNX. It's about where you are in your workflow and how much control you need. + +--- + +### Slide 9: Benchmark ConvNeXT in One Command + +#### Key Points + +- `wmk perf -m facebook/convnext-base-224 --ep qnn --iterations 100 --monitor` +- One command: load, export, optimize, benchmark +- Live NPU/CPU monitoring, latency percentiles, throughput + +#### Speaker Transcript + +> And the third way — the easiest. Say someone hands you a production-ready model. You just want a quick smoke test — does it run, how fast is it? One command. `wmk perf` with a model ID. Load, export, optimize, benchmark — all behind the scenes. Think of it as a sanity check before you commit to a full integration. +> +> That's three ways to build. Primitive commands for development — iterate, debug, experiment. Config-driven pipeline for release — reproduce, polish, hand over. And one command for QA — validate, benchmark, deliver. + +--- + +## Part 4: Close + +--- + +### Slide 10: Why ModelKit? 
+ +#### 🎯 ModelKit is Right for You If + +1. You want to build models that run on **any Windows device** +2. You want to benchmark a model with **one command** +3. You want to catch compatibility issues **ahead of time** +4. You want **deep insights** into your model +5. You want a **repeatable and traceable** model building process +6. You want **AI agents** to build and profile models for you + +#### 🗺️ Roadmap + +- **Q4 2025**: Project Kickoff +- **Q1 2026**: Early Access & Feedback +- **Q2 2026**: Public Beta + - Open Source + - Coding Agent Skills + - AITK Integration +- **Q3–Q4 2026**: Release Candidate + - LLM Support (with LoRA) + - More Devices — GPU & NPU + - MLIR + +#### Speaker Transcript + +> OK, that's all for the demos. Now — if you want to build models for Windows ML, quickly benchmark a model, catch compatibility issues ahead of time, troubleshoot errors or performance bottlenecks, or you just want AI to do the heavy lifting — please reach out to us for early access. Your feedback is the most valuable thing to us. +> +> And the roadmap — ModelKit is ready for early access now. We'll release the public beta in Q2, with coding agent skills available and AITK integration. After that, we'll continue bringing more into ModelKit — LLM support, MLIR, and broader device coverage. +> +> Thank you. Happy to take any questions. + +--- + +## Appendix + +### Session Flow + +| Phase | Content | Time | +|-------|---------|------| +| Opening + Intro | Opening → Slide 1 → Slide 2 | ~4 min | +| Demo Setup | Slide 3 (Workflow) → Slide 4 (Three Demos) | ~2 min | +| Live Demos | Demo 1: ConvNeXT primitives | ~4 min | +| | Demo 2: ConvNeXT config+build | ~2.5 min | +| | Demo 3: ConvNeXT one command | ~1.5 min | +| Recap | Slides 5-9 | ~4 min | +| Close | Slide 10: Why ModelKit? 
| ~2 min | +| **Total** | | **~20 min** | + +### EP Coverage Reference + +| EP | Hardware | Priority | +|----|----------|----------| +| QNN | Qualcomm NPU | P1 | +| OpenVINO | Intel NPU/GPU/CPU | P1 | +| VitisAI | AMD NPU | P1 | +| QNN | Qualcomm GPU | P2 | +| MIGraphX | AMD GPU | P2 | +| TensorRT | NVIDIA GPU | P2 | +| DML | DirectML (any GPU) | P2 | +| CPU | Fallback | P3 | diff --git a/docs/getting-started/04.customer-intro/demo-script-v5.md b/docs/getting-started/04.customer-intro/demo-script-v5.md new file mode 100644 index 000000000..669b2692f --- /dev/null +++ b/docs/getting-started/04.customer-intro/demo-script-v5.md @@ -0,0 +1,73 @@ +# ModelKit Demo Script + +**Model**: `facebook/convnext-base-224` (all three demos) + +--- + +## Demo 1: Build ConvNeXT with Primitive Commands + +Let's start with ConvNeXT. First, `inspect`. + +ConvNeXT is a family of CNN models inspired by Vision Transformers, introduced by Facebook in 2022. + +It adopts several design choices from Transformers and offers high accuracy while retaining the efficiency of CNNs, so it is widely adopted for tasks such as image classification, detection, and segmentation. + +This tells us everything about the model. Task, model class, I/O shapes. No weights loaded, just metadata. + +`wmk inspect -m facebook/convnext-base-224` + +Now we export from PyTorch to ONNX. + +`wmk export -m facebook/convnext-base-224 -o convnext/model.onnx -v` + +Let's run the analyzer right away. It checks every operator against EPs — tells you what's supported, what's partial, what needs fixing. And it generates an optimization config automatically. + +`wmk analyze -m convnext/model.onnx --optim-config optim.json` + +We apply the optimizer with that config. The analyzer told us what to fix; the optimizer fixes it. + +`wmk optimize -m convnext/model.onnx -c optim.json -o convnext/model_opt.onnx` + +Now quantize — compress the optimized model to INT8. At this point, we have a portable model. 
It can run on any ONNX Runtime backend. + +`wmk quantize -m convnext/model_opt.onnx -o convnext/model_opt_int8.onnx` + +Now let's compile for QNN — this generates device-specific binaries for the NPU. + +`wmk compile -m convnext/model_opt_int8.onnx --ep qnn -o convnext/model_compiled.onnx` + +And benchmark on NPU. Look at the latency — let's keep this number in mind. + +`wmk perf -m convnext/model_compiled.onnx --ep qnn --iterations 100` + +Now the same optimized model on CPU for comparison. See the difference? That's roughly a 25x speedup — the quantized model on NPU versus the original on CPU. Same model, same accuracy, completely different performance. + +`wmk perf -m convnext/model_opt.onnx --ep cpu --iterations 100` + +--- + +## Demo 2: Build ConvNeXT with Config + Build + +Same model, different approach. Instead of running each command manually, let's use `config` and `build`. + +`wmk config` generates a JSON config. Let me show you what's inside. This is the config — it contains all settings for each pipeline step. Task, I/O shapes, optimization flags, quantization parameters, all auto-detected. You can review it, revise it, or pass it directly to the build command. + +`wmk config -m facebook/convnext-base-224 -o convnext_config.json` + +`wmk build` takes that config and runs the full pipeline. Export, analyze, optimize, quantize, compile — all in one go. + +`wmk build -c convnext_config.json -m facebook/convnext-base-224 -o convnext_build/` + +And let's benchmark the result. Same model, same quality — but two commands instead of eight. + +`wmk perf -m convnext_build/model.onnx --ep qnn --iterations 100` + +--- + +## Demo 3: Benchmark ConvNeXT in One Command + +And the simplest way — one command. `wmk perf` with a model ID. It handles everything: load, export, optimize, benchmark. Live hardware monitoring included. + +`wmk perf -m facebook/convnext-base-224 --ep qnn --iterations 100 --monitor` + +Same ConvNeXT, three different approaches. 
Full control, automated pipeline, or one command. Pick what fits your workflow. diff --git a/docs/getting-started/04.customer-intro/guides/01-intro.md b/docs/getting-started/04.customer-intro/guides/01-intro.md new file mode 100644 index 000000000..613150852 --- /dev/null +++ b/docs/getting-started/04.customer-intro/guides/01-intro.md @@ -0,0 +1,60 @@ +# Getting Started with ModelKit — Swiss Army Knife for Windows ML Models + +## What is ModelKit? + +ModelKit is a CLI toolkit that builds **portable, performant, and high-quality** models for Windows ML. It bridges the gap between pretrained models and on-device inference — you bring a model from Hugging Face (or your own checkpoint), and ModelKit takes it all the way to optimized, device-ready ONNX. + +## Goals + +**Portable Models** — Build a model once and run it anywhere. ModelKit produces ONNX models that work across every ONNX Runtime backend. + +**Flexible Pipeline** — Use built-in pipelines for end-to-end builds, or compose your own workflows from primitive commands. You decide how much control you need. + +**Human-in-the-Loop** — Drill down into model details, pinpoint errors, and identify performance bottlenecks. ModelKit keeps you in the driver's seat at every stage. + +**AI Agent Ready** — ModelKit provides built-in skills that work with all mainstream coding agents, so you can automate model-building workflows with AI assistance. + +## Promises + +**Out-of-Box Experience** — Install ModelKit and start building immediately. No boilerplate, no scaffolding, no manual dependency wrangling. + +**One Toolkit Covers All EPs** — A single CLI handles every supported execution provider. You do not need separate tools for QNN, OpenVINO, VitisAI, or any other backend. + +**Repeatability and Traceability** — Every command and pipeline produces deterministic, reproducible results. Configs capture the full build specification so you can replay, share, and audit builds. 
+ +**Build-Time Quality Gates** — ModelKit catches compatibility issues and suggests fixes automatically. The analyzer checks operator support before you deploy, not after. + +## Command Overview + +ModelKit organizes its commands into four categories: + +| Category | Commands | Purpose | +|---|---|---| +| **Primitives** | `inspect`, `export`, `analyze`, `optimize`, `quantize`, `compile` | Individual pipeline stages you can run standalone or compose into custom workflows | +| **Pipeline** | `config`, `build` | Config-driven end-to-end builds that orchestrate primitives automatically | +| **Insights** | `perf`, `eval` | Benchmarking, evaluation, and hardware monitoring | +| **Utilities** | `env`, `cache` | Environment setup and cache management | + +## ModelKit Is Right for You If + +- You want to build optimized models for Windows ML without stitching together separate tools +- You want to quick-bench a model on NPU, GPU, or CPU with a single command +- You need to catch EP compatibility issues before deployment, not after +- You want repeatable, config-driven builds you can share with your team and check into source control +- You need to troubleshoot errors or performance bottlenecks in your ONNX pipeline +- You want AI agents to handle model optimization while you focus on your application + +## Execution Provider Coverage + +ModelKit supports a broad set of execution providers across priority tiers: + +| Priority | Execution Provider | Status | +|---|---|---| +| P1 | QNN (NPU) | Supported | +| P1 | OpenVINO | Supported | +| P1 | VitisAI | Supported | +| P2 | QNN GPU | Planned | +| P2 | MIGraphX | Planned | +| P2 | TensorRT | Planned | +| P2 | DirectML | Planned | +| P3 | CPU | Planned | diff --git a/docs/getting-started/04.customer-intro/guides/02-primitive-commands-101.md b/docs/getting-started/04.customer-intro/guides/02-primitive-commands-101.md new file mode 100644 index 000000000..2a77f7533 --- /dev/null +++ 
b/docs/getting-started/04.customer-intro/guides/02-primitive-commands-101.md @@ -0,0 +1,113 @@ +# ModelKit 101 — Build Your First Model with Primitive Commands + +## What Are Primitive Commands? + +Primitive commands are the individual building blocks of ModelKit. Each one handles a single stage of the model-building pipeline: inspect, export, analyze, optimize, quantize, or compile. You can run them standalone, reorder them, or compose them into custom workflows. + +Think of this as the **coding phase** — you are iterating, experimenting, and debugging your way to a production-ready model. + +## The Model: ConvNeXT + +In this guide you will build **ConvNeXT** (`facebook/convnext-base-224`) — a family of CNN models inspired by Vision Transformers, introduced by Meta in 2022. ConvNeXT adopts several design choices from Transformers and offers high accuracy while retaining the efficiency of CNNs. It is widely adopted for image classification, detection, and segmentation. + +## The BYOM Workflow + +The Build Your Own Model (BYOM) workflow takes a source model through a straightforward pipeline. Along the way, three commands serve as **quality gates**: + +| Quality Gate | Command | What It Checks | +|---|---|---| +| Portability | `analyze` | Can your model run on the target EP? | +| Performance | `optimize` | Is the graph structured for efficient inference? | +| Fidelity | `quantize` | Does compression preserve acceptable accuracy? | + +These three steps define the quality of your output. ModelKit gives you full control over each one. + +## Step by Step + +### 1. Inspect the Model + +Start by inspecting the model metadata. This tells you the task, model class, and I/O shapes — no weights loaded, just metadata. + +```bash +wmk inspect -m facebook/convnext-base-224 +``` + +### 2. Export to ONNX + +Export the model from PyTorch to ONNX format. + +```bash +wmk export -m facebook/convnext-base-224 -o convnext/model.onnx -v +``` + +### 3. 
Analyze for EP Compatibility + +Run the analyzer to check every operator against the target EPs. It tells you what is supported, what is partial, and what needs fixing. It also generates an optimization config automatically. + +```bash +wmk analyze -m convnext/model.onnx --optim-config optim.json +``` + +### 4. Optimize the Graph + +Apply the optimizer using the config the analyzer generated. The analyzer told you what to fix; the optimizer fixes it. + +```bash +wmk optimize -m convnext/model.onnx -c optim.json -o convnext/model_opt.onnx +``` + +### 5. Quantize to INT8 + +Compress the optimized model to INT8. After this step you have a portable model that can run on any ONNX Runtime backend. + +```bash +wmk quantize -m convnext/model_opt.onnx -o convnext/model_opt_int8.onnx +``` + +### 6. Compile for NPU + +Compile the quantized model for QNN — this generates device-specific binaries for the NPU. + +```bash +wmk compile -m convnext/model_opt_int8.onnx --ep qnn -o convnext/model_compiled.onnx +``` + +### 7. Benchmark on NPU + +Run the benchmark on NPU. Take note of the latency number. + +```bash +wmk perf -m convnext/model_compiled.onnx --ep qnn --iterations 100 +``` + +### 8. Benchmark on CPU for Comparison + +Run the optimized, unquantized model on CPU. You should see roughly a **25x speedup** — the quantized, compiled model on NPU versus the unquantized model on CPU. Same model, same accuracy, completely different performance. + +```bash +wmk perf -m convnext/model_opt.onnx --ep cpu --iterations 100 +``` + +## Analyzer Deep Dive + +The analyzer is the key to building portable ONNX models. It is made of two parts: + +**Linter** — Like ESLint, but for ONNX. It checks operator compatibility and classifies each operator: green for supported, gray for partial, red for unsupported. + +**AutoConf** — Detects suboptimal patterns in the graph and generates the optimization config automatically. 
+ +Together they form the **analyze-optimize loop**: the linter finds the issues, AutoConf writes the fix config, and the optimizer applies it. + +## Three-Phase Recap + +In this guide you used primitive commands to bring ConvNeXT to Windows ML in three phases: + +1. **Inspect** — Understand the model (task, class, I/O shapes) +2. **Build portable ONNX** — Export, analyze, optimize, quantize +3. **Benchmark on device** — Compile, perf on NPU vs CPU + +This workflow gives you full control. You can jump into any stage, try different settings, fix errors, or tweak performance. + +## Next Steps + +You have seen how to build a model step by step with primitive commands. Next, try the config-driven pipeline to achieve the same result with just two commands. See [Two Commands to Production](03-config-build-pipeline.md). diff --git a/docs/getting-started/04.customer-intro/guides/03-config-build-pipeline.md b/docs/getting-started/04.customer-intro/guides/03-config-build-pipeline.md new file mode 100644 index 000000000..d218db921 --- /dev/null +++ b/docs/getting-started/04.customer-intro/guides/03-config-build-pipeline.md @@ -0,0 +1,79 @@ +# Two Commands to Production — ModelKit's Config-Driven Pipeline + +## From Coding to Polish + +In the previous guide you built ConvNeXT step by step with primitive commands — the coding phase. Now you will build the same model with just two commands using the config-driven pipeline. Think of this as the **polish phase**: repeatable, scriptable, and ready to hand off to your team. + +## Step by Step + +### 1. Generate the Build Config + +Run `wmk config` to generate a JSON config file. This config contains all settings for every pipeline step — task, I/O shapes, optimization flags, quantization parameters — all auto-detected from the model. + +```bash +wmk config -m facebook/convnext-base-224 -o convnext_config.json +``` + +You can review it, edit it, or pass it directly to the build command. 
The config is a plain JSON file — open it in any editor and you will see every knob ModelKit exposes. + +### 2. Build the Model + +Run `wmk build` with that config. It orchestrates the full pipeline — export, analyze, optimize, quantize, compile — all in one go. + +```bash +wmk build -c convnext_config.json -m facebook/convnext-base-224 -o convnext_build/ +``` + +### 3. Benchmark the Result + +Run the benchmark to confirm. Same model, same quality — but two commands instead of eight. + +```bash +wmk perf -m convnext_build/model.onnx --ep qnn --iterations 100 +``` + +## The CMake Analogy + +If you have used CMake, the pattern will feel familiar. `wmk config` is like `cmake -B build` — it inspects the project and generates a build specification. `wmk build` is like `cmake --build build` — it executes that specification. The config file sits in the middle: reviewable, editable, and version-controllable. + +## Config Is Your Build Specification + +The generated config file is the single source of truth for your build. You can: + +- **Review** it before building to verify auto-detected settings +- **Edit** it to override optimization flags, quantization parameters, or target EPs +- **Version-control** it alongside your application code +- **Share** it with teammates so everyone produces identical builds +- **Replay** builds deterministically on any machine + +## Primitive Commands vs. 
Config-Driven Pipeline + +| | Primitive Commands | Config-Driven Pipeline | +|---|---|---| +| **Lifecycle analogy** | Coding phase | Polish phase | +| **Best for** | Exploring, debugging, experimenting | Delivery, handoff, CI/CD | +| **Control** | Full — enter at any stage, tweak any setting | Config-level — edit the JSON, then build | +| **Steps** | One command per stage (~8 commands) | Two commands (`config` + `build`) | +| **When to use** | You need to iterate on a specific stage or diagnose an issue | You want repeatable, scriptable builds | + +Both approaches produce the same portable ONNX model. The difference is where you are in the development lifecycle and how much manual control you need. + +## One-Command Benchmark + +There is an even simpler option. If you have a model and just want a quick smoke test — does it run, how fast is it — use `wmk perf` with a model ID directly: + +```bash +wmk perf -m facebook/convnext-base-224 --ep qnn --iterations 100 --monitor +``` + +This handles everything behind the scenes: load, export, optimize, and benchmark. Live hardware monitoring is included with the `--monitor` flag. Think of it as a sanity check in the QA process. + +## Recap: Three Ways to Build + +You have now seen all three ways to build a model with ModelKit: + +1. **Primitive commands** for development — iterate, debug, experiment +2. **Config-driven pipeline** for polish and handoff — repeatable, scriptable, two commands +3. **One command** for QA — validate, benchmark, deliver + +Same ConvNeXT, three different approaches. Pick what fits your workflow. 
diff --git a/docs/getting-started/04.customer-intro/readme-review.md b/docs/getting-started/04.customer-intro/readme-review.md new file mode 100644 index 000000000..fe0cc95fb --- /dev/null +++ b/docs/getting-started/04.customer-intro/readme-review.md @@ -0,0 +1,183 @@ +# README.md — Three-Version Review + +## Versions + +| Version | Style | Lines | File | +|---------|-------|-------|------| +| V1 | Concise & minimal | ~160 | `readme-v1.md` | +| V2 | Detailed & educational | ~432 | `readme-v2.md` | +| V3 | Visual & GitHub-optimized | ~380 | `readme-v3.md` | + +--- + +## Scores + +| Criteria | V1 (Concise) | V2 (Detailed) | V3 (Visual) | +|----------|:---:|:---:|:---:| +| Accuracy | 4 | **5** | 2 | +| Completeness | 3 | **5** | 4 | +| Clarity | 4 | 4 | **5** | +| Readability | **5** | 3 | 4 | +| GitHub rendering | 3 | 4 | **5** | +| Consistency | 4 | 3 | 3 | +| Tone | **5** | 4 | 4 | +| **Total** | **28** | **28** | **27** | + +--- + +## Section-by-Section Comparison + +### 1. Title + Tagline + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | One-liner subtitle | Bold subtitle + expanding paragraph | Bold subtitle + badges + paragraph | +| Accuracy | Matches source | Matches source | Matches source | +| Readability | Clean, fast | Paragraph is long (4 lines) | Badges add visual anchoring | +| **Best for this section** | Minimalism | Detail | **GitHub presence** | + +### 2. ModelKit Is Right for You If + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | 6 bullet points, one line each | 6 paragraphs, 2-4 sentences each | 6 checkbox bullets with bold + dashed explanations | +| Accuracy | All 6 match source | All 6 match, expansions accurate | All 6 match | +| Readability | Fastest to scan | Long, slows scanning | Good balance | +| **Best for this section** | Quick scan | Deep evaluation | **Best balance** | + +### 3. 
Supported Hardware + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | 7-row table + `--device auto` note | 7-row table with "Device Flag" column + auto note | 7-row table with "Device Flag" column + tip | +| Accuracy | Uses only `--device` | Shows both `--ep` and `--device` (safest) | Uses only `--device` | +| **Best for this section** | Simplicity | **Completeness (both flags)** | Visual polish | + +### 4. Installation + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | Single code block, 3 steps | 3 numbered sections + PowerShell/Git Bash variants | 3 bold numbered steps | +| Accuracy | `.venv/Scripts/activate` (forward slash — broken in PowerShell) | Shows both shell variants (most accurate) | `.venv\Scripts\activate` (backslash — correct for PowerShell) | +| **Best for this section** | Experienced devs | **Newcomers** | Typical README | + +### 5. Commands + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | Summary table only | Summary table + paragraph descriptions | Summary table + collapsible details | +| Accuracy | **Missing `hub`** | **Missing `hub`** | Includes `hub` ✅ | +| **Best for this section** | Overview only | Documentation depth | **GitHub UX (collapsible + hub)** | + +### 6. Quick Start + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | 4 subsections, minimal | 4 subsections, full walkthrough with 25x narrative | 4 subsections with extra flag variants | +| Accuracy | Correct | Closest to source | **5 HIGH issues — fabricated flags** | +| **Best for this section** | Brevity | **Accuracy** | DANGEROUS — unattested flags | + +### 7. 
BYOM Workflow + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | ASCII pipeline + bullet list | ASCII pipeline + titled paragraphs + analyze-optimize loop | ASCII box diagram + table + numbered steps | +| Accuracy | Matches source | Matches source + adds loop explanation | Matches source | +| **Best for this section** | Quick reference | **Learning the philosophy** | Visual impact | + +### 8. Built-in Models + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | 10 models | 17 models (full catalog) | 17 models in collapsible | +| Architecture casing | lowercase (resnet) | Proper case (ResNet) ✅ | lowercase (resnet) | +| **Best for this section** | Incomplete | **Completeness + casing** | Collapsible UX | + +### 9. Scope & Limitations + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | 3 bullet points | 4 sub-sections | Two-column table (✅/❌) | +| Accuracy | Correct | Most thorough | Adds DeiT, ESRGAN (not confirmed for MVP) | +| **Best for this section** | Minimal | **Completeness** | Visual scanning | + +### 10. Roadmap + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Content | 4-row table | 4 titled paragraphs | Table + collapsible detail | +| Readability | Quick scan | Slower (paragraphs) | Best of both | +| **Best for this section** | Minimal | Narrative | **Table + collapsible** | + +### 11. Contributing & 12. 
License + +| Aspect | V1 | V2 | V3 | +|--------|-----|-----|-----| +| Contributing | "Coming soon" | Mentions WinPD team | "Coming soon" + timeline | +| License | "MIT" | "MIT" | Link to LICENSE file ✅ | + +--- + +## Issues Found + +### V1 Issues + +| Line | Issue | Severity | +|------|-------|----------| +| 33 | `.venv/Scripts/activate` forward slashes — broken in PowerShell | MEDIUM | +| 46 | Utilities row missing `hub` command | MEDIUM | +| 114-126 | Model catalog only 10 of 17 models | MEDIUM | + +### V2 Issues + +| Line | Issue | Severity | +|------|-------|----------| +| 49 | "GPU and CPU providers are coming" — contradicts table (CPU = "Always available") | MEDIUM | +| 117 | Utilities row missing `hub` command | MEDIUM | +| 305-309 | Mixed `--device` vs `--ep` conventions across Quick Start sections | MEDIUM | + +### V3 Issues + +| Line | Issue | Severity | +|------|-------|----------| +| 142-153 | `--format json`, `--list-tasks`, `--task fill-mask` — **flags not in source, may not exist** | **HIGH** | +| 175 | `--precision w16a16` — **not attested** | **HIGH** | +| 204-209 | `--model-type`, `--precision w8a16`, `--no-compile` — **not attested** | **HIGH** | +| 212 | `--no-quant`, `--no-compile` for build — **not attested** | **HIGH** | +| 233-239 | `--precision w8a16`, `--batch-size 4`, `--warmup 20` — **not attested** | **HIGH** | +| 319 | DeiT not in source or model catalog | LOW | +| 327 | ESRGAN is feature branch, not confirmed in MVP | MEDIUM | + +--- + +## Verdict + +**Use V2 as base, cherry-pick V3's visual structure.** + +### What to take from each version + +| Section | Take from | Reason | +|---------|-----------|--------| +| Title + badges | V3 | GitHub presence | +| "Right for You" format | V3 | Checkbox style, best balance | +| Hardware table | V2 | Shows both `--ep` and `--device` | +| Installation | V2 content, V3 density | Shell variants matter, but trim prose | +| Commands | V3 collapsibles + V2 descriptions | Add `hub`, use collapsible UX 
| +| Quick Start | **V2** | No fabricated flags | +| BYOM Workflow | V2 content, V3 visual | Keep loop explanation, add diagram | +| Built-in Models | V3 collapsible + V2 casing | ResNet not resnet | +| Scope | V2 | No unattested claims | +| Roadmap | V3 | Table + collapsible | +| License | V3 | Linked format | + +### Do NOT take from V3 + +- Any unattested CLI flags (`--format json`, `--list-tasks`, `--model-type`, `--precision`, `--no-compile`, `--no-quant`, `--batch-size`, `--warmup`) +- DeiT and ESRGAN scope claims +- `w{x}a{y}` quantization syntax + +### Fix in V2 before merging + +- Line 49: CPU is "Always available", not "coming" +- Line 117: Add `hub` to Utilities +- Lines 305-309: Standardize `--device` vs `--ep` usage diff --git a/docs/getting-started/04.customer-intro/readme-toc.md b/docs/getting-started/04.customer-intro/readme-toc.md new file mode 100644 index 000000000..6d4a3a4a2 --- /dev/null +++ b/docs/getting-started/04.customer-intro/readme-toc.md @@ -0,0 +1,48 @@ +# README.md — Approved TOC + +## Structure + +``` +# ModelKit +> One-liner: CLI toolkit to build portable, performant and high-quality models for Windows ML + +## ModelKit Is Right for You If +(6-item checklist) + +## Supported Hardware +(EP table with device/status badges) + +## Installation +(uv venv, pip install, sanity check with winml sys) + +## Commands +(Four-category table: Primitives, Pipeline, Insights, Utilities) + +## Quick Start +### Inspect a Model +### Build with Primitive Commands +### Build with Config + Build +### Benchmark in One Command + +## The BYOM Workflow +(Pipeline diagram, three quality gates: Analyze/Optimize/Evaluate) + +## Built-in Models +(Hub table with model ID, task, architecture) + +## Scope & Limitations +(Classic DL only, no LLMs, known constraints) + +## Roadmap + +## Contributing + +## License +``` + +## Source Material + +- `MVP.transcripts.md` — presentation transcript +- Bug bash guide from `microsoft/ModelKit` repo (`qiowu/bugbash` branch) +- 
`blog/01a-why-modelkit.md` — goals, right-for-you +- `blog/01b-what-modelkit-offers.md` — promises, commands, EP coverage diff --git a/docs/getting-started/04.customer-intro/readme-v1.md b/docs/getting-started/04.customer-intro/readme-v1.md new file mode 100644 index 000000000..b6ec0269c --- /dev/null +++ b/docs/getting-started/04.customer-intro/readme-v1.md @@ -0,0 +1,150 @@ +# ModelKit + +CLI toolkit to build portable, performant and high-quality models for Windows ML — bridging the gap between pretrained models and on-device inference. + +## ModelKit Is Right for You If + +- You want to build models that run on **any Windows device** +- You want to benchmark a model with **one command** +- You want to catch compatibility issues **ahead of time** +- You want **deep insights** into your model +- You want a **repeatable and traceable** model building process +- You want **AI agents** to build and profile models for you + +## Supported Hardware + +| Provider | Hardware | Status | +|----------|----------|--------| +| QNN | Qualcomm GPU and NPU | :green_circle: Ready | +| OpenVINO | Intel CPU, iGPU, dGPU, and NPU | :green_circle: Ready | +| VitisAI | AMD NPU | :green_circle: Ready | +| TensorRT | NVIDIA discrete GPUs | :large_orange_diamond: Planned | +| MIGraphX | AMD discrete GPUs | :large_orange_diamond: Planned | +| DirectML | Hardware-agnostic GPU backend | :large_orange_diamond: Planned | +| CPU | Cross-platform fallback | :white_circle: Always | + +> No NPU? Use `--device auto` — ModelKit falls back to GPU, then CPU. 
+ +## Installation + +```bash +# Create a Python 3.10 virtual environment +uv venv --python 3.10 +.venv/Scripts/activate + +# Install from wheel +uv pip install winml_modelkit-<version>-py3-none-any.whl + +# Sanity check — verify devices and execution providers +winml sys --list-device --list-ep +``` + +## Commands + +| Category | Commands | Purpose | +|----------|----------|---------| +| Primitives | `inspect`, `export`, `optimize`, `quantize`, `compile` | Single-stage operations | +| Pipeline | `config`, `build`, `perf`, `eval`, `run`* | End-to-end workflows | +| Insights | `analyze`, `debug`* | Analysis and debugging | +| Utilities | `cache`, `doctor`, `setting`, `sys` | Environment management | + +\* = coming soon + +## Quick Start + +### Inspect a Model + +```bash +winml inspect -m microsoft/resnet-50 +``` + +### Build with Primitive Commands + +```bash +# Export HuggingFace model to ONNX +winml export -m facebook/convnext-base-224 + +# Analyze portability +winml analyze -m model.onnx + +# Optimize graph +winml optimize -m model.onnx + +# Quantize for NPU +winml quantize -m model.onnx --device npu + +# Benchmark +winml perf -m model.onnx --device npu --iterations 100 +``` + +### Build with Config + Build + +Like CMake: `config` generates a build plan, `build` executes it. 
+ +```bash +# Generate a build config +winml config -m facebook/convnext-base-224 --device npu + +# Execute the build +winml build +``` + +### Benchmark in One Command + +```bash +winml perf -m facebook/convnext-base-224 --device npu --iterations 100 --monitor +``` + +## The BYOM Workflow + +``` +Source Model → Export → Analyze → Optimize → Quantize → Compile → Benchmark +``` + +Three quality gates guard the pipeline: + +- **Analyze** — portability: catches unsupported ops and shape issues before they reach hardware +- **Optimize** — performance: graph transformations that reduce latency +- **Evaluate** — fidelity: measures accuracy loss from quantization and compilation + +## Built-in Models + +Run `winml hub` to see the full catalog. + +| Model ID | Task | Architecture | +|----------|------|--------------| +| `microsoft/resnet-50` | image-classification | resnet | +| `google/vit-base-patch16-224` | image-classification | vit | +| `microsoft/swin-large-patch4-window7-224` | image-classification | swin | +| `facebook/convnext-tiny-224` | image-classification | convnext | +| `rizvandwiki/gender-classification` | image-classification | vit | +| `ProsusAI/finbert` | text-classification | bert | +| `dslim/bert-base-NER` | token-classification | bert | +| `microsoft/table-transformer-detection` | object-detection | table-transformer | +| `mattmdjaga/segformer_b2_clothes` | image-segmentation | segformer | +| `nvidia/segformer-b1-finetuned-ade-512-512` | image-segmentation | segformer | + +> Golden rule: always run `winml inspect -m <model-id>` before any pipeline command. 
+ +## Scope & Limitations + +- **Supported**: classic deep learning models — CNNs, vision transformers, NLP classifiers, token classifiers, object detection, segmentation +- **Not supported**: LLMs (GPT, LLaMA, Phi, Mistral), diffusion models (Stable Diffusion), or any decoder-only / seq2seq generative architecture +- LLM support is on the roadmap + +## Roadmap + +| Milestone | Target | Highlights | +|-----------|--------|------------| +| Kickoff | Q4 2025 | Internal prototype | +| Early Access | Q1 2026 | First external testers | +| Public Beta | Q2 2026 | Open source, agent skills, AITK integration | +| RC | Q3-Q4 2026 | LLM + LoRA, more devices, MLIR backend | + +## Contributing + +*Coming soon.* + +## License + +MIT diff --git a/docs/getting-started/04.customer-intro/readme-v2.md b/docs/getting-started/04.customer-intro/readme-v2.md new file mode 100644 index 000000000..6287cacf8 --- /dev/null +++ b/docs/getting-started/04.customer-intro/readme-v2.md @@ -0,0 +1,432 @@ +# ModelKit + +**CLI toolkit to build portable, performant and high-quality models for Windows ML — bridging the gap between pretrained models and on-device inference.** + +ModelKit takes a pretrained model from Hugging Face (or a local ONNX file) and prepares it for on-device inference through a complete pipeline: export to ONNX, optimize the graph, quantize to low-bit precision, compile for target hardware, and benchmark on device. One toolkit covers every execution provider — QNN, OpenVINO, VitisAI, TensorRT, MIGraphX, DirectML, and CPU — so you never need a separate vendor toolchain per silicon. 
+ +--- + +## Table of Contents + +- [ModelKit Is Right for You If](#modelkit-is-right-for-you-if) +- [Supported Hardware](#supported-hardware) +- [Installation](#installation) +- [Commands](#commands) +- [Quick Start](#quick-start) +- [The BYOM Workflow](#the-byom-workflow) +- [Built-in Models](#built-in-models) +- [Scope & Limitations](#scope--limitations) +- [Roadmap](#roadmap) +- [Contributing](#contributing) +- [License](#license) + +--- + +## ModelKit Is Right for You If + +**You want to build models that run on any Windows device.** +ModelKit produces ONNX models that target the Windows ML runtime. A model prepared through ModelKit runs on any supported execution provider — Qualcomm NPU, Intel NPU, AMD NPU, NVIDIA GPU, or CPU — without per-device rework. Build once, run anywhere. + +**You want to benchmark a model with one command.** +Run `winml perf -m <model-id> --device npu` and ModelKit handles everything behind the scenes — download, export, optimize, and benchmark. You get latency percentiles, throughput numbers, and live hardware utilization in seconds. + +**You want to catch compatibility issues ahead of time.** +The built-in analyzer checks every operator against your target execution provider before you deploy, not after. It classifies operators as supported, partial, or unsupported — and generates an optimization config that fixes the issues automatically. + +**You want deep insights into your model.** +`winml inspect` reveals task, model class, I/O tensor shapes, and EP compatibility without loading weights. `winml analyze` goes deeper — linting operators, detecting suboptimal patterns, and identifying performance bottlenecks in the graph. + +**You want a repeatable and traceable model building process.** +`winml config` captures every build setting — task, I/O shapes, optimization flags, quantization parameters — into a single JSON file. Check it into source control, share it with your team, and replay builds deterministically on any machine. 
Think CMake for models. + +**You want AI agents to build and profile models for you.** +ModelKit provides built-in skills that coding agents (Claude Code, Cursor, Copilot, and others) can consume. Agents can drive the entire build pipeline programmatically — from model selection through optimization to benchmarking. + +--- + +## Supported Hardware + +ModelKit supports the major execution providers in the Windows ML ecosystem. The three NPU providers are fully supported today, the CPU provider is always available as a fallback, and GPU providers are planned for future releases. + +| Execution Provider | Hardware | Status | Device Flag | +|---|---|---|---| +| **QNN** | Qualcomm NPU (Snapdragon X Elite) | 🟢 Ready | `--ep qnn --device npu` | +| **OpenVINO** | Intel NPU (Meteor Lake / Lunar Lake) | 🟢 Ready | `--ep openvino --device npu` | +| **VitisAI** | AMD NPU (Ryzen AI) | 🟢 Ready | `--ep vitisai --device npu` | +| **TensorRT** | NVIDIA discrete GPUs | 🔶 Planned | `--ep tensorrt --device gpu` | +| **MIGraphX** | AMD discrete GPUs | 🔶 Planned | `--ep migraphx --device gpu` | +| **DirectML** | Hardware-agnostic GPU backend | 🔶 Planned | `--ep dml --device gpu` | +| **CPU** | Cross-platform fallback | ⚪ Always available | `--ep cpu --device cpu` | + +**Automatic device selection.** If you are unsure which EP to use, pass `--device auto`. ModelKit will detect the best available device on your machine and select the appropriate execution provider automatically, falling back through NPU, GPU, and finally CPU. + +--- + +## Installation + +ModelKit requires **Python 3.10** and is distributed as a Python wheel. We recommend using [uv](https://docs.astral.sh/uv/) for fast, reproducible environment setup. + +### Step 1: Create a virtual environment + +```bash +uv venv --python 3.10 +``` + +This creates a `.venv` directory with an isolated Python 3.10 environment. 
Activate it: + +```bash +# Windows (PowerShell) +.venv\Scripts\activate + +# Windows (Git Bash / WSL) +source .venv/Scripts/activate +``` + +### Step 2: Install ModelKit + +```bash +uv pip install winml_modelkit-0.0.1.dev1-py3-none-any.whl +``` + +This installs the `winml` CLI and all required dependencies (ONNX Runtime, Hugging Face Transformers, optimization libraries, and more). + +### Step 3: Sanity check + +Verify that your system is ready: + +```bash +winml sys --list-device --list-ep +``` + +This command prints your system information, detected hardware devices, and available execution providers. Confirm that your target device and EP appear in the output: + +- **Snapdragon X Elite** — look for `QNNExecutionProvider` +- **Intel AI Boost** — look for `OpenVINOExecutionProvider` +- **AMD Ryzen AI** — look for `VitisAIExecutionProvider` + +If no NPU is detected, you can still use ModelKit with `--device auto` for most commands. The only exception is `winml compile`, which requires an NPU device to generate device-specific binaries. + +--- + +## Commands + +ModelKit organizes its commands into four categories: **Primitives** for individual pipeline stages, **Pipeline** for orchestrated workflows, **Insights** for diagnostics, and **Utilities** for housekeeping. + +### Summary + +| Category | Commands | Purpose | +|---|---|---| +| **Primitives** | `inspect`, `export`, `optimize`, `quantize`, `compile` | Individual building blocks — one command per pipeline stage | +| **Pipeline** | `config`, `build`, `perf`, `eval`, `run`* | Orchestration, benchmarking, and evaluation | +| **Insights** | `analyze`, `debug`* | Diagnostics, compatibility checking, and debugging | +| **Utilities** | `cache`, `doctor`, `setting`, `sys` | Environment management and housekeeping | + +*\* Coming soon* + +### Primitives + +These are the individual building blocks of the model preparation pipeline. Each command handles exactly one stage. 
You can run them standalone, reorder them, or compose them into custom workflows. + +**`winml inspect`** — Discover model metadata. Prints the task, model class, input/output tensor names and shapes, and execution provider compatibility. No weights are loaded — this reads only the model configuration, making it fast and lightweight. Always run inspect first to verify a model is supported before feeding it into the pipeline. + +**`winml export`** — Convert a source model to ONNX. Takes a Hugging Face model ID (or local checkpoint) and produces a standards-compliant ONNX file with hierarchy-preserving metadata. This is the entry point for any model that starts as a PyTorch checkpoint. + +**`winml optimize`** — Fuse operators, simplify graphs, and prepare for target EPs. Takes an ONNX model and an optimization config (typically generated by `winml analyze`) and applies graph-level transformations: operator fusion, constant folding, shape inference, and EP-specific rewrites. + +**`winml quantize`** — Compress to low-bit precision. Reduces model size and inference latency by converting weights and activations from FP32 to INT8 (or other low-bit formats). After quantization, the model is portable — it can run on any ONNX Runtime backend. + +**`winml compile`** — Generate device-specific binaries. Takes a quantized ONNX model and produces EP-specific compiled artifacts (for example, QNN context binaries for Qualcomm NPU). This step locks the model to a specific device but delivers the lowest possible inference latency. + +### Pipeline + +Orchestration commands that chain primitives together, plus benchmarking and evaluation tools. + +**`winml config`** — Auto-detect optimal settings into a JSON config. Inspects the model and generates a complete build specification: task, I/O shapes, optimization flags, quantization parameters, and target EP settings. The config file is reviewable, editable, and version-controllable — it is the single source of truth for your build. 
+ +**`winml build`** — Orchestrate the full pipeline. Takes a config file and executes every stage in sequence: export, analyze, optimize, quantize, and compile. Two commands (`config` + `build`) replace eight manual steps. + +**`winml perf`** — Benchmark latency, throughput, and hardware utilization. Runs inference on the target device and reports latency percentiles (p50, p90, p99), throughput (inferences per second), and optionally live hardware monitoring (CPU, RAM, NPU utilization) with the `--monitor` flag. Can accept either a local ONNX file or a Hugging Face model ID — in the latter case, it handles the full pipeline automatically. + +**`winml eval`** — Measure model accuracy against reference datasets. Compares the output of your optimized/quantized model against the original to quantify any accuracy loss introduced by the pipeline. + +**`winml run`** — End-to-end inference with pre/post processing. *(Coming soon.)* + +### Insights + +Diagnostic tools for understanding what is happening inside your model. + +**`winml analyze`** — Lint operators, check EP compatibility, and generate optimization config. The analyzer has two components. The **Linter** is like ESLint for ONNX — it checks every operator against target EPs and classifies each as supported (green), partial (gray), or unsupported (red). **AutoConf** detects suboptimal patterns in the graph and generates the optimization config that the optimizer consumes. Together they form the analyze-optimize loop. + +**`winml debug`** — Interactive model debugging and layer-by-layer inspection. *(Coming soon.)* + +### Utilities + +Housekeeping and environment management commands. + +**`winml cache`** — Manage built model artifacts and pipeline outputs. View, clean, or selectively remove cached models and intermediate files. + +**`winml doctor`** — Diagnose environment issues. Checks runtimes, execution providers, and dependencies to identify configuration problems. 
+ +**`winml setting`** — Configure ModelKit preferences. Set default EPs, output directories, and other global options. + +**`winml sys`** — System information and capability reporting. Prints detected hardware, available EPs, Python version, and installed package versions. The first command to run after installation. + +--- + +## Quick Start + +This section walks you through ModelKit from simplest to most detailed, using real commands and real models. + +### Inspect a Model + +The fastest way to get started is to inspect a model. Let's look at ResNet-50: + +```bash +winml inspect -m microsoft/resnet-50 +``` + +This prints the model's metadata without downloading weights: + +- **Task**: `image-classification` — what the model does +- **Model class**: `ResNetForImageClassification` — the architecture +- **Input tensors**: names, data types, and shapes (e.g., `pixel_values: float32 [1, 3, 224, 224]`) +- **Output tensors**: names, data types, and shapes (e.g., `logits: float32 [1, 1000]`) + +If inspect succeeds, the model is supported and you can proceed with the rest of the pipeline. If it prints an error or `Unsupported`, skip that model — it is not yet compatible with ModelKit. + +**Golden rule: always inspect first.** Before running export, build, perf, or any other pipeline command, verify the model is supported with `winml inspect`. + +### Build with Primitive Commands + +This walkthrough builds **ConvNeXT** (`facebook/convnext-base-224`) step by step using primitive commands. ConvNeXT is a family of CNN models inspired by Vision Transformers, introduced by Meta in 2022 — it offers high accuracy while retaining the efficiency of CNNs. + +The workflow has three phases: **Inspect**, **Build a Portable Model**, and **Benchmark on Device**. + +#### Phase 1: Inspect + +```bash +winml inspect -m facebook/convnext-base-224 +``` + +This tells you everything about the model — task, model class, I/O shapes — without loading weights. 
+ +#### Phase 2: Build a Portable Model + +**Export** from PyTorch to ONNX: + +```bash +winml export -m facebook/convnext-base-224 -o convnext/model.onnx -v +``` + +The `-v` flag enables verbose output so you can see the export progress and any warnings. + +**Analyze** for EP compatibility: + +```bash +winml analyze -m convnext/model.onnx --optim-config optim.json +``` + +The analyzer checks every operator against the target EPs. It tells you what is supported, what is partial, what needs fixing — and it writes an optimization config (`optim.json`) that captures the recommended fixes. + +**Optimize** the graph using the analyzer's config: + +```bash +winml optimize -m convnext/model.onnx -c optim.json -o convnext/model_opt.onnx +``` + +The analyzer told you what to fix; the optimizer fixes it. This applies operator fusion, constant folding, and EP-specific rewrites. + +**Quantize** to INT8: + +```bash +winml quantize -m convnext/model_opt.onnx -o convnext/model_opt_int8.onnx +``` + +After this step you have a portable model — it can run on any ONNX Runtime backend. + +#### Phase 3: Benchmark on Device + +**Compile** for NPU (generates device-specific binaries): + +```bash +winml compile -m convnext/model_opt_int8.onnx --ep qnn -o convnext/model_compiled.onnx +``` + +**Benchmark on NPU** — note the latency: + +```bash +winml perf -m convnext/model_compiled.onnx --ep qnn --iterations 100 +``` + +**Benchmark on CPU** for comparison: + +```bash +winml perf -m convnext/model_opt.onnx --ep cpu --iterations 100 +``` + +Compare the two numbers. You should see roughly a **25x speedup** — the quantized, compiled model on NPU versus the unquantized, optimized model on CPU. Same model, same accuracy, completely different performance. + +### Build with Config + Build + +Same model, different approach. Instead of running each command manually, use the config-driven pipeline. 
+ +**Generate the build config:** + +```bash +winml config -m facebook/convnext-base-224 -o convnext_config.json +``` + +This creates a JSON file containing all settings for every pipeline step — task, I/O shapes, optimization flags, quantization parameters — all auto-detected from the model. Open it in any editor to review or adjust. The config follows the same pattern as CMake: `winml config` is like `cmake -B build` (generate the specification), and `winml build` is like `cmake --build build` (execute it). + +**Build the model:** + +```bash +winml build -c convnext_config.json -m facebook/convnext-base-224 -o convnext_build/ +``` + +This orchestrates the full pipeline — export, analyze, optimize, quantize, compile — all in one go. Same result as the eight manual steps above, but in two commands. + +**Benchmark the result:** + +```bash +winml perf -m convnext_build/model.onnx --ep qnn --iterations 100 +``` + +Same model, same quality — two commands instead of eight. + +The config file is the single source of truth for your build. You can version-control it, share it with teammates, edit it to override settings, and replay builds deterministically on any machine. + +### Benchmark in One Command + +The simplest way to evaluate a model — one command, zero setup: + +```bash +winml perf -m facebook/convnext-base-224 --device npu --monitor +``` + +ModelKit handles everything behind the scenes: download the model from Hugging Face, export to ONNX, optimize the graph, and run the benchmark on your NPU. The `--monitor` flag enables live hardware monitoring — you will see real-time CPU utilization, RAM usage, and NPU activity alongside the latency results. + +This is ideal for quick smoke tests: does the model run on this device, and how fast is it? Think of it as the QA step — validate, benchmark, deliver. + +--- + +## The BYOM Workflow + +The **Build Your Own Model** (BYOM) workflow is the philosophy behind ModelKit. 
It defines how a source model becomes a production-ready, device-optimized artifact. + +### The Pipeline + +``` +Source Model → Export → Analyze → Optimize → Quantize → Compile → Benchmark +``` + +Each arrow is a ModelKit command. You can enter the pipeline at any stage (for example, start with a local ONNX file and skip export), exit early (stop after optimization if you do not need quantization), or loop back to repeat a stage with different settings. + +### Three Quality Gates + +The pipeline embeds three quality gates — checkpoints where ModelKit validates the model before proceeding. These three steps define the quality of your output. + +**Analyze — Portability Gate.** +Does the model run on the target EP? The analyzer lints every operator and checks compatibility against your target execution providers. If an operator is unsupported, you find out here — before you spend time on optimization and quantization. The analyzer also generates the optimization config that feeds into the next stage. + +**Optimize — Performance Gate.** +Is the graph efficient enough? The optimizer applies graph-level transformations (fusion, constant folding, shape inference, EP-specific rewrites) to produce a model that runs efficiently on the target hardware. Compare perf numbers before and after optimization to measure the improvement. + +**Evaluate — Fidelity Gate.** +Is the model still accurate after quantization? Compressing from FP32 to INT8 reduces size and improves latency, but it can degrade accuracy. The eval command measures the difference so you can make an informed decision about the quality-performance tradeoff. + +### The Analyze-Optimize Loop + +The analyzer and optimizer work as a pair. The analyzer's **Linter** (like ESLint for ONNX) identifies compatibility issues and classifies operators. The analyzer's **AutoConf** (like GNU AutoConf for ONNX) detects suboptimal patterns and generates a fix config. 
The optimizer consumes that config and applies the transformations. + +If the first pass does not achieve full compatibility, you can iterate: analyze again, review the updated config, optimize again. This loop is what makes models portable across execution providers. + +--- + +## Built-in Models + +ModelKit ships with a curated catalog of tested models. Run `winml hub` to list all available models. + +| Model ID | Task | Architecture | +|---|---|---| +| `microsoft/resnet-50` | image-classification | ResNet | +| `google/vit-base-patch16-224` | image-classification | ViT | +| `microsoft/swin-large-patch4-window7-224` | image-classification | Swin | +| `facebook/convnext-tiny-224` | image-classification | ConvNeXT | +| `rizvandwiki/gender-classification` | image-classification | ViT | +| `ProsusAI/finbert` | text-classification | BERT | +| `Intel/bert-base-uncased-mrpc` | text-classification | BERT | +| `cardiffnlp/twitter-roberta-base-sentiment-latest` | text-classification | RoBERTa | +| `dslim/bert-base-NER` | token-classification | BERT | +| `dbmdz/bert-large-cased-finetuned-conll03-english` | token-classification | BERT | +| `Babelscape/wikineural-multilingual-ner` | token-classification | BERT | +| `w11wo/indonesian-roberta-base-posp-tagger` | token-classification | RoBERTa | +| `microsoft/table-transformer-detection` | object-detection | Table Transformer | +| `mattmdjaga/segformer_b2_clothes` | image-segmentation | SegFormer | +| `nvidia/segformer-b1-finetuned-ade-512-512` | image-segmentation | SegFormer | +| `nvidia/segformer-b2-finetuned-ade-512-512` | image-segmentation | SegFormer | +| `nvidia/segformer-b5-finetuned-ade-640-640` | image-segmentation | SegFormer | + +These models are verified against ModelKit's full pipeline and serve as reliable starting points for testing and experimentation. You are not limited to this list — any Hugging Face model that passes `winml inspect` is a valid input. 
+ +**Golden rule: inspect first.** Before running any pipeline command on a model not in this table, run `winml inspect -m <model-id>` to verify it is supported. + +--- + +## Scope & Limitations + +### What ModelKit supports + +ModelKit targets **classic deep learning models** — CNNs, encoders, vision transformers, NLP classifiers, token classifiers, object detection models, and segmentation models. These are the architectures that run well on the ONNX Runtime execution providers available today. + +Supported tasks include: +- Image classification (ResNet, ViT, Swin, ConvNeXT) +- Text classification (BERT, RoBERTa) +- Token classification / NER (BERT, RoBERTa) +- Object detection (Table Transformer) +- Image segmentation (SegFormer) + +### What ModelKit does not support + +**LLMs and generative models are not in scope.** Do not use ModelKit with GPT, LLaMA, Phi, Mistral, Stable Diffusion, or any model with a decoder-only or sequence-to-sequence generative architecture. LLM support (with LoRA) is planned for Q3-Q4 2026. + +### Accepted inputs + +ModelKit accepts two types of input: + +- **Hugging Face model ID** (e.g., `microsoft/resnet-50`) — model weights are downloaded automatically on first use and cached locally. +- **Local ONNX file** (e.g., `model.onnx`) — produced by `winml export`, `winml build`, or any other ONNX exporter. + +### Known constraints + +- `winml compile` requires an NPU device. It cannot run without one. If no NPU is available, skip the compile step and use `--device auto` for benchmarking. +- Some models may export successfully but fail during optimization or quantization due to unsupported operator patterns. The analyzer will flag these issues. +- Performance numbers vary by device, driver version, and EP version. Always benchmark on your target hardware. + +--- + +## Roadmap + +**Q4 2025 — Project Kickoff.** +Initial development of the ModelKit CLI, core pipeline commands, and execution provider integrations. 
+ +**Q1 2026 — Early Access & Feedback.** +Internal release to partner teams. Validation across QNN, OpenVINO, and VitisAI execution providers. Bug bash and usability testing with real-world models. + +**Q2 2026 — Public Beta.** +Open source release. Coding agent skills for Claude Code, Cursor, Copilot, and other AI-assisted development tools. Integration with AI Toolkit (AITK) for Visual Studio Code. + +**Q3-Q4 2026 — Release Candidate.** +LLM support with LoRA adapters. Expanded device coverage for GPU and additional NPU platforms. MLIR integration for next-generation compiler backends. + +--- + +## Contributing + +Contributing guidelines are coming soon. If you are interested in contributing to ModelKit, please reach out to the WinPD team for early access and collaboration opportunities. + +--- + +## License + +MIT diff --git a/docs/getting-started/04.customer-intro/readme-v3.md b/docs/getting-started/04.customer-intro/readme-v3.md new file mode 100644 index 000000000..47347ba33 --- /dev/null +++ b/docs/getting-started/04.customer-intro/readme-v3.md @@ -0,0 +1,380 @@ +# ModelKit + +**CLI toolkit to build portable, performant and high-quality models for Windows ML.** + +![Status](https://img.shields.io/badge/status-early%20access-blue) +![Python](https://img.shields.io/badge/python-3.10%2B-blue?logo=python&logoColor=white) +![License](https://img.shields.io/badge/license-MIT-green) + +ModelKit bridges the gap between pretrained models and on-device inference. +Export from HuggingFace, optimize graphs, quantize weights, compile to device-specific binaries, +and benchmark — all from a single CLI. No separate vendor toolchain per silicon. +Built-in quality gates catch compatibility problems, suboptimal operators, and quantization +regressions — and suggest fixes automatically — before the model ever reaches a device. 
+ +--- + +## :dart: ModelKit Is Right for You If + +- [x] You want to build models that run on **any Windows device** — Qualcomm, Intel, AMD, NVIDIA, or CPU +- [x] You want to benchmark a model with **one command** — latency, throughput, and live hardware utilization +- [x] You want to catch compatibility issues **ahead of time** — unsupported ops, shape mismatches, EP gaps +- [x] You want **deep insights** into your model — I/O shapes, task mapping, operator coverage per EP +- [x] You want a **repeatable and traceable** model building process — config-driven, inspectable at every stage +- [x] You want **AI agents** to build and profile models for you — agent-ready skills and structured JSON output + +--- + +## :desktop_computer: Supported Hardware + +| Provider | Hardware | Device Flag | Status | +|:---------|:---------|:------------|:------:| +| **QNN** | Qualcomm GPU and NPU | `--device npu` | :green_circle: Ready | +| **OpenVINO** | Intel CPU, iGPU, dGPU, and NPU | `--device npu` | :green_circle: Ready | +| **VitisAI** | AMD NPU | `--device npu` | :green_circle: Ready | +| **TensorRT** | NVIDIA discrete GPUs | `--device gpu` | :large_orange_diamond: Planned | +| **MIGraphX** | AMD discrete GPUs | `--device gpu` | :large_orange_diamond: Planned | +| **DirectML** | Hardware-agnostic GPU backend | `--device gpu` | :large_orange_diamond: Planned | +| **CPU** | Cross-platform fallback | `--device cpu` | :white_circle: Always | + +> **Tip:** Use `--device auto` and ModelKit picks the best available device — NPU first, then GPU, then CPU. + +--- + +## :package: Installation + +**1. Create a Python 3.10 environment** + +```bash +uv venv --python 3.10 +.venv\Scripts\activate # Windows +``` + +**2. Install from wheel** + +```bash +uv pip install winml_modelkit-<version>-py3-none-any.whl +``` + +**3. 
Verify your environment** + +```bash +winml sys --list-device --list-ep +``` + +This prints detected devices, available execution providers, and library versions — a quick sanity check before you start building. + +--- + +## :wrench: Commands + +| Category | Commands | Purpose | +|:---------|:---------|:--------| +| **Primitives** | `inspect` `export` `optimize` `quantize` `compile` | Single-stage building blocks | +| **Pipeline** | `config` `build` `perf` `eval` `run`\* | End-to-end orchestration | +| **Insights** | `analyze` `debug`\* | Diagnostics and compatibility | +| **Utilities** | `hub` `cache` `doctor` `setting` `sys` | Catalog, cache, and environment | + +\* = coming soon + +
+Primitives — one stage at a time + +| Command | Description | +|:--------|:------------| +| `winml inspect` | Discover model metadata, task, I/O shapes, and EP support | +| `winml export` | Convert a HuggingFace model to ONNX with hierarchy-preserving metadata | +| `winml optimize` | Fuse operators, simplify graphs, prepare for target EP | +| `winml quantize` | Compress to low-bit precision (int8, int16, mixed `w{x}a{y}`) with calibration | +| `winml compile` | Generate device-specific binaries (e.g., QNN context binaries) | + +
+ +
+Pipeline — orchestrated workflows + +| Command | Description | +|:--------|:------------| +| `winml config` | Auto-detect task, I/O shapes, and optimal settings into a JSON build config | +| `winml build` | Execute the full pipeline: export, analyze, optimize, quantize, compile | +| `winml perf` | Benchmark latency, throughput, and hardware utilization with `--monitor` | +| `winml eval` | Evaluate model accuracy against reference datasets (ImageNet, GLUE, etc.) | +| `winml run`\* | End-to-end inference with pre/post processing | + +
+ +
+Insights — understand what is happening inside + +| Command | Description | +|:--------|:------------| +| `winml analyze` | Lint operators, check EP compatibility, generate optimization config | +| `winml debug`\* | Interactive model debugging and layer-by-layer inspection | + +
+ +
+Utilities — environment and catalog + +| Command | Description | +|:--------|:------------| +| `winml hub` | Browse the curated built-in model catalog with accuracy verdicts | +| `winml cache` | Manage built model artifacts and pipeline outputs | +| `winml doctor` | Diagnose environment issues (runtimes, providers, dependencies) | +| `winml setting` | Configure ModelKit preferences | +| `winml sys` | System information, device list, and EP capability reporting | + +
+ +--- + +## :rocket: Quick Start + +### :mag: Inspect a Model + +Before building anything, ask ModelKit what it knows about your model: + +```bash +winml inspect -m microsoft/resnet-50 +``` + +ModelKit resolves the task, model class, I/O tensor shapes, and the export/quantize/compile +strategy — everything the build pipeline will use. Add `--format json` for machine-readable +output that agents and scripts can consume directly. + +```bash +# JSON output for automation +winml inspect -m microsoft/resnet-50 --format json + +# List all supported tasks +winml inspect --list-tasks + +# Inspect with a specific task override +winml inspect -m google-bert/bert-base-uncased --task fill-mask +``` + +> **Golden rule:** always run `winml inspect -m <model-id>` before any pipeline command. + +--- + +### :package: Build with Primitive Commands + +Use individual commands for fine-grained control. Here is a ConvNeXT walkthrough: + +```bash +# 1. Export HuggingFace model to ONNX +winml export -m facebook/convnext-base-224 -o convnext.onnx + +# 2. Check EP compatibility +winml analyze -m convnext.onnx --device npu + +# 3. Optimize the graph +winml optimize -m convnext.onnx -o convnext_opt.onnx + +# 4. Quantize for NPU (int16 weights + int16 activations) +winml quantize -m convnext_opt.onnx --device npu --precision w16a16 + +# 5. Compile to device binary +winml compile -m convnext_opt_qdq.onnx --device npu + +# 6. Benchmark +winml perf -m convnext_opt_qdq.onnx --device npu --iterations 100 +``` + +Each step produces an inspectable artifact — you can stop, examine, tweak, and resume at any point. 
+ +--- + +### :gear: Build with Config + Build + +Think of it like CMake: **`config` generates a build plan, `build` executes it.** + +```bash +# Generate a build config (auto-detects task, shapes, quant settings) +winml config -m facebook/convnext-base-224 --device npu -o build_config.json + +# Execute the full pipeline in one shot +winml build -c build_config.json -m facebook/convnext-base-224 -o output/ +``` + +The config file is a plain JSON document — edit it to override precision, skip stages, or target +a different EP. You can also generate a config without downloading weights: + +```bash +# Config from model type alone (no download required) +winml config --model-type bert --device npu -o bert_config.json + +# Config with custom precision and no compilation stage +winml config -m microsoft/resnet-50 --precision w8a16 --no-compile -o resnet_config.json +``` + +The `build` command reads this config and orchestrates the full pipeline — export, analyze, +optimize, quantize, and compile — with a single invocation. Use `--no-quant` or `--no-compile` +to skip stages on the fly without editing the config file. + +--- + +### :zap: Benchmark in One Command + +Point `perf` at any model — HuggingFace ID or local `.onnx` file — and get latency stats instantly: + +```bash +winml perf -m microsoft/resnet-50 --device npu --iterations 100 + +# With live hardware utilization chart +winml perf -m microsoft/resnet-50 --device npu --iterations 100 --monitor +``` + +The `--monitor` flag renders a live terminal chart showing NPU/GPU utilization, CPU%, and memory +during the benchmark run. 
+ +```bash +# Benchmark with custom precision and batch size +winml perf -m model.onnx --device npu --precision w8a16 --batch-size 4 + +# Force a specific execution provider +winml perf -m model.onnx --ep qnn --iterations 200 --warmup 20 + +# Save results to JSON for CI/CD integration +winml perf -m model.onnx --device npu -o results/perf_report.json +``` + +`perf` reports median latency, P90/P95/P99 percentiles, throughput (inferences/sec), and +memory usage. JSON output makes it easy to track regressions across builds. + +--- + +## :arrows_counterclockwise: The BYOM Workflow + +``` + Source Model + | + v + +---------+ +---------+ +----------+ +----------+ +---------+ + | Export | --> | Analyze | --> | Optimize | --> | Quantize | --> | Compile | + +---------+ +---------+ +----------+ +----------+ +---------+ + | | | + v v v + Portability Performance Benchmark + Report Report & Evaluate +``` + +Three quality gates guard the pipeline: + +| Gate | Pillar | What It Catches | +|:-----|:-------|:----------------| +| :shield: **Analyze** | Portability | Unsupported ops, shape mismatches, EP compatibility gaps | +| :zap: **Optimize** | Performance | Suboptimal operator patterns, fusion opportunities, graph simplifications | +| :bar_chart: **Evaluate** | Fidelity | Accuracy regressions from quantization and compilation | + +**How it works in practice:** + +1. **Export** converts a HuggingFace (or custom) model into ONNX format with rich metadata +2. **Analyze** scans every operator against the target EP's capability matrix and flags issues +3. **Optimize** applies graph transformations — operator fusion, constant folding, layout optimization +4. **Quantize** compresses weights and activations to int8/int16 with calibration data +5. **Compile** produces device-specific binaries (e.g., QNN context binaries for Qualcomm NPUs) + +At each stage, artifacts are saved to disk and can be inspected or edited before continuing. 
+The `build` command chains all five stages together, with the analyzer running in a loop to +auto-configure optimizations for maximum EP coverage. + +--- + +## :clipboard: Built-in Models + +Run `winml hub` to browse the full catalog interactively. Use `winml hub --model <model-id>` for per-model details. + +
+Click to expand the full model catalog + +| Model ID | Architecture | Task | +|:---------|:-------------|:-----| +| `ProsusAI/finbert` | bert | text-classification | +| `Intel/bert-base-uncased-mrpc` | bert | text-classification | +| `dslim/bert-base-NER` | bert | token-classification | +| `dbmdz/bert-large-cased-finetuned-conll03-english` | bert | token-classification | +| `Babelscape/wikineural-multilingual-ner` | bert | token-classification | +| `cardiffnlp/twitter-roberta-base-sentiment-latest` | roberta | text-classification | +| `w11wo/indonesian-roberta-base-posp-tagger` | roberta | token-classification | +| `google/vit-base-patch16-224` | vit | image-classification | +| `rizvandwiki/gender-classification` | vit | image-classification | +| `microsoft/swin-large-patch4-window7-224` | swin | image-classification | +| `facebook/convnext-tiny-224` | convnext | image-classification | +| `microsoft/resnet-50` | resnet | image-classification | +| `microsoft/table-transformer-detection` | table-transformer | object-detection | +| `mattmdjaga/segformer_b2_clothes` | segformer | image-segmentation | +| `nvidia/segformer-b1-finetuned-ade-512-512` | segformer | image-segmentation | +| `nvidia/segformer-b2-finetuned-ade-512-512` | segformer | image-segmentation | +| `nvidia/segformer-b5-finetuned-ade-640-640` | segformer | image-segmentation | + +
+ +> Every model in the catalog has been validated end-to-end: export, quantize, device deployment, and accuracy verification. + +--- + +## :warning: Scope & Limitations + +| :white_check_mark: Supported | :x: Not Supported | +|:------------------------------|:-------------------| +| CNNs (ResNet, ConvNeXT) | LLMs (GPT, LLaMA, Phi, Mistral) | +| Vision Transformers (ViT, DeiT, Swin) | Diffusion models (Stable Diffusion, SDXL) | +| NLP classifiers (BERT, RoBERTa) | Decoder-only / seq2seq generative models | +| Token classifiers (NER, POS tagging) | Multi-modal generative models | +| Object detection (DETR, Table Transformer) | Models requiring custom CUDA kernels | +| Semantic segmentation (SegFormer) | Training or fine-tuning workflows | +| Image super-resolution (Swin2SR, ESRGAN) | | + +> LLM and generative model support is on the roadmap. + +--- + +## :world_map: Roadmap + +| Milestone | Target | Highlights | +|:----------|:-------|:-----------| +| :yellow_circle: **Kickoff** | Q4 2025 | Internal prototype, core primitive commands | +| :green_circle: **Early Access** | Q1 2026 | First external testers, config + build pipeline, hub catalog | +| :blue_circle: **Public Beta** | Q2 2026 | Open source, agent skills, AI Toolkit integration | +| :purple_circle: **RC** | Q3-Q4 2026 | LLM + LoRA support, GPU and NPU expansion, MLIR backend | + +
+Click to expand roadmap details + +**Q4 2025 — Kickoff** +- Primitive commands: `inspect`, `export`, `optimize`, `quantize`, `compile` +- QNN, OpenVINO, and VitisAI execution provider support +- Internal validation with ResNet, BERT, ViT, SegFormer families + +**Q1 2026 — Early Access** +- Pipeline commands: `config`, `build`, `perf`, `eval` +- Analyzer with auto-configuration loop +- Built-in model catalog (`winml hub`) with accuracy verdicts +- Live hardware monitoring (`--monitor`) + +**Q2 2026 — Public Beta** +- Open source release +- Agent-ready skills for coding assistants +- AI Toolkit for VS Code integration +- Expanded model catalog: depth estimation, super-resolution, CLIP + +**Q3-Q4 2026 — Release Candidate** +- LLM support (decoder-only architectures with LoRA adapters) +- TensorRT, MIGraphX, and DirectML execution providers +- MLIR-based optimization backend +- Public SDK and framework APIs + +
+ +--- + +## :handshake: Contributing + +*Coming soon.* We are working on contribution guidelines and will open the process during Public Beta. + +--- + +## :page_facing_up: License + +[MIT](../../LICENSE) diff --git a/docs/getting-started/04.customer-intro/workflow-only.png b/docs/getting-started/04.customer-intro/workflow-only.png new file mode 100644 index 000000000..9cb01b542 Binary files /dev/null and b/docs/getting-started/04.customer-intro/workflow-only.png differ diff --git a/docs/getting-started/04.customer-intro/workflow-only.typ b/docs/getting-started/04.customer-intro/workflow-only.typ new file mode 100644 index 000000000..2cb6a7322 --- /dev/null +++ b/docs/getting-started/04.customer-intro/workflow-only.typ @@ -0,0 +1,67 @@ +#import "@preview/fletcher:0.5.7": diagram, node, edge + +#set page(width: auto, height: auto, margin: 12pt, fill: rgb("#EEEEEE")) +#set text(fill: rgb("#1a2332")) + +#text(size: 0.75em)[ + #diagram( + spacing: (22pt, 25pt), + node-stroke: 0.8pt + luma(120), + node-fill: white, + node-corner-radius: 4pt, + edge-stroke: 0.8pt + luma(100), + mark-scale: 60%, + + // Artifacts (circles) + node((0, 0), text(fill: rgb("#1a2332"))[Source\ Model], fill: rgb("#f5e6d3"), stroke: 1pt + rgb("#c9a87c"), shape: circle, name: ), + node((7, 0), text(fill: rgb("#1a2332"))[Portable\ ONNX], fill: rgb("#f5e6d3"), stroke: 1pt + rgb("#c9a87c"), shape: circle, name: ), + + // Export + node((1.5, 0), text(fill: rgb("#1a2332"))[Load &\ Export], fill: rgb("#c8e6c9"), stroke: 1pt + rgb("#66bb6a"), name: ), + + // Analyzer box with Lint and Conf + node((3, 0), text(fill: rgb("#1a2332"))[Lint], fill: rgb("#e3f2fd"), name: ), + node((4.2, 0), text(fill: rgb("#1a2332"))[Conf], fill: rgb("#f3e5f5"), name: ), + node( + enclose: (, ), + fill: rgb("#c8e6c9"), + stroke: 1pt + luma(180), + inset: 12pt, + snap: -1, + name: , + ), + node((3.6, 0.7), [Analyzer], stroke: none, fill: none), + + // Optimizer (above analyzer) + node((3.6, -2.3), text(fill: 
rgb("#1a2332"))[Optimize], fill: rgb("#c8e6c9"), stroke: 1pt + rgb("#66bb6a"), name: ), + + // Quantize (below main line - optional path) + node((5.5, 1), text(fill: rgb("#1a2332"))[Quantize], fill: rgb("#c8e6c9"), stroke: 1pt + rgb("#66bb6a"), name: ), + + // Evaluate, Profile, and Deploy + node((9.5, 0), text(fill: rgb("#1a2332"))[Evaluate], fill: rgb("#c8e6c9"), stroke: 1pt + rgb("#66bb6a"), name: ), + node((9.5, 1), text(fill: rgb("#1a2332"))[Profile], fill: rgb("#fff9c4"), stroke: 1pt + rgb("#fbc02d"), name: ), + node((11, 0), text(fill: rgb("#1a2332"))[Deploy], fill: rgb("#e1bee7"), stroke: (dash: "dashed", paint: rgb("#7b1fa2")), shape: circle, name: ), + + // Main flow + edge(, , "-|>"), + edge(, , "-|>"), + edge(, , "-|>"), + + // Optimizer feedback loop + edge(, , "-|>", bend: -25deg, label: [_config_], label-side: right), + edge(, , "-|>", bend: -25deg, label: [_optimized_], label-side: right), + + // Main path (fp16 - skip quantize) + edge(, , "-|>", label: [_fp16_], label-side: left), + + // Optional quantize path (n-bit) + edge(, , "-|>", bend: -25deg, label: [_n-bit_], label-side: left), + edge(, , "-|>", bend: -25deg, label: [_qdq_], label-side: left), + + // Continue to eval and output + edge(, , "-|>", label: [_compile_]), + edge(, , "--|>", stroke: (dash: "dashed")), + edge(, , "-|>"), + ) +] diff --git a/docs/standards/design-doc-spec.md b/docs/standards/design-doc-spec.md new file mode 100644 index 000000000..df73c0621 --- /dev/null +++ b/docs/standards/design-doc-spec.md @@ -0,0 +1,450 @@ +# ModelKit Design Document Standard + +**Version**: 1.1 +**Date**: 2026-04-19 +**Status**: Active +**Supersedes**: `docs/design/optracing/learnings_from_232.md` (survey; promoted to normative spec) +**Owner**: `docs/` CODEOWNERS (repository maintainers) + +--- + +## 1. Scope and Applicability + +### 1.1 Purpose + +This document defines the mandatory structure, content, and lifecycle of all design documents checked into the ModelKit repository. 
It is normative: a pull request whose design docs violate this standard MAY be rejected on that basis alone. + +### 1.2 RFC 2119 Terminology + +The key words **MUST**, **MUST NOT**, **SHOULD**, **SHOULD NOT**, and **MAY** in this document are to be interpreted as described in RFC 2119. In brief: + +- **MUST** / **MUST NOT** — absolute requirement / prohibition +- **SHOULD** / **SHOULD NOT** — strong recommendation; exceptions permitted with written rationale in the doc +- **MAY** — optional; author's discretion + +### 1.3 When a Design Doc Is Required + +A design doc **MUST** be authored before implementation when any of the following applies: + +- Adding a new module under `src/winml/modelkit/` (a new top-level package directory) +- Introducing a new public API surface (new `__init__.py` exports) +- Changing the behavior of an existing public API in a backward-incompatible way +- Refactoring that spans three or more modules or deletes an existing package +- Introducing a new CLI command under `wmk` +- Changing a persisted on-disk format (cache layout, config schema, output file format) + +A design doc **MAY** be skipped for: + +- Bug fixes that preserve public API +- Documentation-only changes +- Test additions that do not change production code +- Dependency bumps that do not alter behavior +- Refactoring confined to a single file + +### 1.4 Document Types + +Three document types are defined. A feature **MUST** have at least `1_prd.md`. It **SHOULD** have `2_coreloop.md` when the implementation is non-trivial (>1 file or >200 lines of code). It **MAY** have one or more `3_design_<component>.md` files for detailed component specifications. 
+ +| File | Role | Required for | Audience | +|------|------|---------------|----------| +| `1_prd.md` | Product Requirements Document — WHAT the system does and WHY | Every feature | PMs, architects, reviewers | +| `2_coreloop.md` | Core Loop Design — HOW to build it (architecture, API, flow) | Every non-trivial feature | Implementers, reviewers | +| `3_design_<topic>.md` | Design Detail — internal APIs, call graphs, test strategy for one component | Optional, split from `2_coreloop.md` when a component exceeds ~300 lines of spec | Implementers, reviewers | + +### 1.5 Path Conventions + +Design docs **MUST** live under `docs/design/<module>/` where `<module>` matches the primary source directory name under `src/winml/modelkit/`. + +``` +docs/design/<module>/ +├── 1_prd.md (required) +├── 2_coreloop.md (required if non-trivial) +├── 3_design_<topic>.md (optional; split from coreloop when needed) +└── iterations/ (optional; brainstorming record) + ├── 01.md + └── 02.md +``` + +The `iterations/` subdirectory **MAY** be used to record the brainstorming history leading to the final design. Iteration files are informational, not normative, and are not subject to this spec's structural rules. + +Cross-module features (affecting multiple `<module>` directories) **SHOULD** be documented in the directory of the primary module, with references from other affected modules' docs. + +### 1.5.1 Transitional Locations (refactor exception) + +A refactor that renames or relocates the primary source directory **MAY** keep its design docs at the original directory name for the duration of one release cycle, provided the docs include a **Transitional Location** note immediately after the metadata header. 
The note **MUST** contain: + +- The current doc directory (legacy name) +- The target `<module>` value declared in the `Module` metadata field +- A commitment to relocate the docs under `docs/design/<module>/` in a named follow-up PR or within a stated timeframe + +Example note: + +```markdown +**Transitional Location**: This doc lives at `docs/design/<legacy-module>/` while its `Module` field declares the post-refactor target `<module>`. Relocation to `docs/design/<module>/` is scheduled for PR #<N> (tracked in issue #<M>). +``` + +The exception expires when the refactor implementation lands. At that point the docs **MUST** be moved to comply with §1.5 proper. + +--- + +## 2. Metadata Header (MANDATORY) + +Every design doc **MUST** begin with a metadata header immediately after the H1 title. The header **MUST** use bold key-value syntax and include all required fields. + +### 2.1 Required Fields + +```markdown +# <Document Title> + +**Version**: <semver, start at 1.0> +**Date**: <YYYY-MM-DD> +**Status**: <one of: Draft | Active | Implemented | Deprecated> +**Module**: <primary module name, matches docs/design/<module>/> +``` + +### 2.2 Optional Fields + +```markdown +**Supersedes**: <path to previous doc> +**Depends-On**: <comma-separated paths to upstream docs> +**Author**: <name or handle> +**Related Documents**: see §<N> (when a table appears later) +``` + +### 2.3 Status Values — Semantics + +| Status | Meaning | +|--------|---------| +| **Draft** | Under active authoring; not yet reviewed. Implementation **MUST NOT** begin. | +| **Active** | Reviewed and approved. Implementation in progress. | +| **Implemented** | Feature is shipped; doc reflects current reality. | +| **Deprecated** | Feature removed or superseded. Doc retained for history. **MUST** include `Supersedes` pointing to the replacement (or `Removed-In: <commit hash>`). 
| + +### 2.4 Example + +```markdown +# WinMLSession — Core Loop Design + +**Version**: 2.1 +**Date**: 2026-04-19 +**Status**: Active +**Module**: session +**Supersedes**: docs/design/session/archive/2_coreloop_v2.md +**Depends-On**: docs/design/compiler/1_prd.md, docs/design/session/monitor/2_coreloop.md +``` + +--- + +## 3. Structural Conventions + +### 3.1 Section Numbering + +Every section at H2 and H3 levels **MUST** be numbered. Deeper nesting **MAY** omit numbers. + +```markdown +## 1. Overview ✓ MUST +### 1.1 Purpose ✓ MUST +#### Why this matters ✓ MAY (unnumbered OK at H4+) +``` + +### 3.2 Table of Contents + +Docs longer than 200 lines **MUST** include a Table of Contents as the first `## ` section (before §1). TOC entries **MUST** be anchor-linked. Docs shorter than 200 lines **MAY** omit the TOC. + +### 3.3 Related Documents Table + +`2_coreloop.md` and `3_design_*.md` **MUST** include a "Related Documents" section as §0 (before §1), formatted as a table: + +```markdown +## 0. Related Documents + +| Document | Path | Purpose | +|----------|------|---------| +| PRD | `./1_prd.md` | Feature requirements | +| Upstream | `../foo/2_coreloop.md` | Provides X API | +| Parallel | `./3_design_bar.md` | Details of the Bar component | +``` + +`1_prd.md` **MAY** include a Related Documents table but it is not required. 
+ +### 3.4 Requirement ID Prefixes + +Requirements, stories, constraints, and criteria **MUST** use the following ID prefixes where applicable: + +| Prefix | Meaning | Scope | +|--------|---------|-------| +| `US-N` | User Story | `1_prd.md` §3 | +| `FR-N` | Functional Requirement | `1_prd.md` §4 | +| `NFR-N` | Non-Functional Requirement | `1_prd.md` §5 | +| `C-N` | Design Constraint | `1_prd.md` §7 | +| `R-N` / `M-N` | Risk / Mitigation pair | `1_prd.md` §8 | +| `SC-N` | Success Criterion | `1_prd.md` §1.3 or §10 | +| `FP-N` | Forbidden Pattern | this doc §8 only | + +IDs **MUST** be unique within a doc and **SHOULD** be stable across versions (do not renumber unless you renumber the entire set). + +### 3.5 Versioning Suffix Rule + +Architectural revisions that change the document's direction (not just fixes) **MUST** bump the Version field. When a revision is large enough that cross-references to the old version would be confusing, the author **MAY** create `<N>_<type>_v2.md` as a new file and set `Supersedes` on the new file. In that case, the old file's Status **MUST** be updated to `Deprecated` and **MUST** include `Supersedes` pointing to the new file. + +Minor edits (typos, clarifications, link fixes) **SHOULD NOT** bump the version. + +--- + +## 4. PRD Skeleton (`1_prd.md`) + +A PRD **MUST** include the mandatory sections below, in order. A PRD's substance is its Requirements (§4 and §5) — earlier sections provide context, later sections capture risk and history. A lightweight PRD for an internal refactor **MAY** have a one-paragraph Executive Summary but **MUST NOT** omit the Requirements sections. + +### 4.1 Mandatory Sections + +``` +## 1. Executive Summary + 1.1 Purpose + 1.2 Problem Statement + 1.3 Success Metrics (SC-N) + +## 2. Scope + 2.1 In Scope + 2.2 Out of Scope + +## 4. Functional Requirements (FR-N) + +## 5. Non-Functional Requirements (NFR-N) + 5.1 Performance + 5.2 Reliability + 5.3 Usability + 5.4 Compatibility + +## 10. 
Appendix + 10.1 Glossary + 10.2 References + 10.3 Document History +``` + +### 4.2 Conditional Sections + +Include these only when applicable. Omission is permitted but **SHOULD** be justified in the Open Questions (§9) if the reader might expect them. + +``` +## 3. User Stories (US-N) — when there is a PM / end-user audience +## 6. Technical Design (high-level) — when architectural overview aids comprehension +## 7. Design Constraints (C-N) — when external constraints bind the design +## 8. Risks and Mitigations (R-N/M-N) — when non-obvious failure modes exist +## 9. Open Questions — when the design has unresolved points +``` + +### 4.3 Exemplar + +See §7.1 of this spec for the canonical PRD reference. + +--- + +## 5. Coreloop Skeleton (`2_coreloop.md`) + +`2_coreloop.md` describes the architecture and core execution flow. It **MUST** reference `1_prd.md` via the Related Documents table (§3.3 of this spec). + +### 5.1 Mandatory Sections + +``` +## 0. Related Documents (MUST, per §3.3) + +## 1. Design Philosophy + 1.1 Purpose + 1.2 Core Principles + 1.3 Design Pattern (if applicable) + +## 2. Module Structure + 2.1 Component Diagram or File Layout + 2.2 Key Dependencies + +## 3. Core Loop Implementation + 3.1 High-Level Flow + 3.2 Call Sequence / Data Flow + +## 4. API Design + 4.1 Public Functions / Classes + 4.2 Function Signatures + 4.3 Return Types / Data Structures + +## 7. Error Handling + +## 8. Testing Strategy + 8.1 Unit Tests + 8.2 Integration Tests + +## 11. Revision History +``` + +### 5.2 Conditional Sections + +``` +## 0.5 I/O Dependencies — when the module orchestrates 3+ upstream modules (see config/2_coreloop.md §0 for a canonical example) +## 5. CLI Design / Integration — when the feature ships a CLI command +## 6. Configuration / Data Structures — when configuration format is non-trivial +## 9. Integration Points — when downstream modules consume this one +## 10. 
Future Work — when forward-looking notes matter +``` + +### 5.3 Data Flow Diagrams + +Large coreloops **SHOULD** include at least one data flow diagram. Format **MAY** be ASCII box art or Mermaid. A doc **MUST NOT** mix formats within a single diagram. + +--- + +## 6. Design Detail Skeleton (`3_design_<topic>.md`) + +### 6.1 When to Create + +A `3_design_<topic>.md` **SHOULD** be created when a single component's detailed design would push `2_coreloop.md` above ~900 lines, or when the component has a well-bounded internal API worth documenting separately (e.g., `export/3_design_io.md`, `loader/3_design_task.md`). + +### 6.2 Naming + +The filename **MUST** be `3_design_<topic>.md` where `<topic>` is a snake_case noun phrase describing the component (e.g., `3_design_qnn_monitor.md`, `3_design_io.md`, `3_design_task.md`). + +### 6.3 Mandatory Sections + +``` +## 0. Related Documents (MUST, per §3.3) + +## 1. Purpose + 1.1 What problem does this solve? + 1.2 Scope + +## 2. Public API + 2.1 Classes / Functions + 2.2 Signatures and contracts + +## 3. Internal Implementation + 3.1 Internal functions (prefix `_`) + 3.2 Call graph + 3.3 Design rationale + +## 4. Resolution / Scenario Flows (when multiple code paths exist) + +## 5. Integration / Override Mechanism (when extensible by users) + +## 7. Test Strategy + (test cases tied to specific API functions — see FP-3 in §8 of this spec) + +## 8. Future Extensions +``` + +--- + +## 7. Exemplars + +The following documents are the canonical style references. If a future doc of the same type conflicts with this spec, the spec wins; but within the freedoms the spec permits, these are the style anchors. + +**Branch note**: the exemplars below live on branch `232` (see `D:\BYOM\ModelKit_PRs\232\docs\design\`). They are not present on `feat/mvp`. Authors on `feat/mvp` **SHOULD** consult them via the `232` branch checkout. When `232` merges into `feat/mvp` (or into `main`), this section's paths become relative. 
+ +### 7.1 Best PRD + +`docs/design/build/1_prd.md` (branch `232`) — paired user stories (US-1 to US-6), explicit two-step workflow scoping, detailed output directory contract. + +### 7.2 Best Coreloop + +`docs/design/config/2_coreloop.md` (branch `232`) — §0 I/O Dependencies section (upstream-first), four-tier priority system, call sequence diagrams, scenario-driven flows. + +### 7.3 Best Design Detail + +`docs/design/export/3_design_io.md` (branch `232`) — public vs internal function split, current-vs-proposed call graph, test strategy tied to specific API functions. + +### 7.4 First compliant exemplar on `feat/mvp` + +`docs/design/session/monitor/1_prd.md` + `docs/design/session/monitor/2_coreloop.md` — the first doc pair authored against v1.0 of this spec. Use the 232 exemplars above for depth and pattern; use this `session/monitor` pair to see how the spec's rules apply in practice on this branch. + +--- + +## 8. Forbidden Patterns (MUST NOT) + +### FP-1. Multiple approaches without designating canonical + +A doc **MUST NOT** present multiple implementation approaches as equally viable. One **MUST** be designated canonical; others **MUST** be labeled `Rejected Alternative` with rationale. Violation example: `module/1_prd.md` on branch `232` presents three approaches without sequencing. + +### FP-2. Circular cross-references + +Doc A referencing Doc B referencing Doc C referencing Doc A **MUST NOT** occur. Cross-references **MUST** flow downward in the dependency graph: PRD → Coreloop → Design Detail. Upstream docs **MAY** reference downstream docs only via the Related Documents table, never inline. + +### FP-3. Test strategy disconnected from API + +A Testing Strategy section **MUST NOT** list generic test categories. It **MUST** map specific test files or test cases to specific API functions or classes. Example of compliant form: *"`tests/session/test_perf.py::test_auto_reset_fires_when_options_differ` validates `WinMLSession.perf().__enter__` in §4.5."* + +### FP-4. 
Missing success criteria + +A PRD **MUST NOT** ship without at least one `SC-N` success criterion. Features without measurable completion criteria cannot be verified as done. + +### FP-5. Silent supersession + +A doc **MUST NOT** replace another doc without adding a `Supersedes` field to the new doc and updating the old doc's Status to `Deprecated`. Silent replacement breaks the doc history chain. + +### FP-6. Undocumented abbreviations + +Module-specific acronyms (QNN, HTP, QDQ, EPContext, PDH, QHAS, etc.) **MUST** be defined in §10.1 (Appendix: Glossary) of the PRD. A doc **MUST NOT** use such acronyms without either a glossary entry or an inline expansion on first use. + +### FP-7. Mixed markdown diagram formats + +A single diagram **MUST NOT** mix ASCII and Mermaid. A document **MAY** use both formats for different diagrams, but each diagram **MUST** be internally consistent. + +--- + +## 9. Deprecation and Lifecycle + +### 9.1 Marking a Doc as Deprecated + +When a feature is removed or replaced: + +1. Set the old doc's `Status: Deprecated` in the metadata header. +2. Add a `Supersedes: <path to replacement>` OR `Removed-In: <commit hash / PR number>` field. +3. Add a prominent H2 section immediately after the header: + ```markdown + ## ⚠ Deprecated + This document describes a superseded design. See the replacement at <path>. Retained for historical context only. + ``` + +### 9.2 Superseding a Doc + +The new doc **MUST**: + +1. Include `Supersedes: <path to old doc>` in its metadata header. +2. State in its §1.2 Problem Statement how it differs from the predecessor. +3. Migrate any still-relevant cross-references (e.g., other docs pointing at the old file) in the same PR. + +### 9.3 Archiving + +A deprecated doc **SHOULD** be retained in place for 6 months after supersession, then **MAY** be moved to `docs/design/<module>/archive/`. Archive files **MUST** keep their metadata header. They **MUST NOT** be deleted from git history. + +--- + +## 10. 
Spec Governance + +### 10.1 Owner + +Changes to this spec are approved by the `docs/` CODEOWNERS (currently the repository maintainers). In the absence of an explicit CODEOWNERS file, the reviewer of the latest merged PR touching `docs/` is de-facto owner. + +### 10.2 Change Process + +Amendments to this spec **MUST** be made via pull request and **MUST**: + +1. Update the `Version` field (semver: MAJOR = breaking rule change; MINOR = new rule; PATCH = clarification). +2. Update the `Date` field. +3. Append an entry to the Revision History (§11). +4. Receive at least one approval from someone who has authored a design doc under this spec. + +### 10.3 Review Cadence + +This spec **SHOULD** be reviewed annually and **SHOULD** be revisited after the first three new modules adopt it, to incorporate real-world feedback. + +### 10.4 Enforcement + +Reviewers **MAY** reject a PR whose design docs violate this spec, citing the violated section. Authors **MAY** appeal by proposing an amendment to the spec itself. + +Exceptions to **SHOULD**-level rules **MUST** be justified inline in the doc (e.g., "This doc omits §5 because no CLI integration exists; see §9 Open Questions"). Exceptions to **MUST**-level rules **MUST** be resolved either by fixing the doc or amending this spec. + +--- + +## 11. Revision History + +| Version | Date | Change | +|---------|------|--------| +| 1.0 | 2026-04-19 | Initial version. Promoted from `docs/design/optracing/learnings_from_232.md` (descriptive survey) to normative spec. Added RFC 2119 vocabulary, metadata header requirement, `3_design_*.md` skeleton, Forbidden Patterns, Deprecation protocol, and Governance section. 
| +| 1.1 | 2026-04-19 | Post-audit amendments: (a) added §1.5.1 "Transitional Locations" as a principled exception mechanism for refactors that rename or relocate their primary source directory — addresses the first compliant doc pair's need to sit at its legacy path until implementation lands; (b) §7 now explicitly notes that the canonical exemplars live on branch `232` (not on `feat/mvp`), with cross-branch consultation guidance and a local `feat/mvp` reference to the first compliant doc pair. | diff --git a/docs/superpowers/plans/2026-04-23-op-tracing-refactor.md b/docs/superpowers/plans/2026-04-23-op-tracing-refactor.md new file mode 100644 index 000000000..9d4ffa162 --- /dev/null +++ b/docs/superpowers/plans/2026-04-23-op-tracing-refactor.md @@ -0,0 +1,2182 @@ +# Op-Tracing Refactor Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Unify the `EPMonitor` (`session/monitor/`) and `OpTracer` (`optracing/`) hierarchies into one; fix QNN op-tracing to work with both `onnxruntime-qnn` AND `onnxruntime-windowsml`; delete the `optracing/` package. + +**Architecture:** Hook-based Plugin + Template Method + Observer. `WinMLSession.perf(warmup, monitor=...)` yields a `PerfContext(stats, monitor)`. Monitors contribute `get_session_options()` + `get_provider_options()` at compile time. `QNNMonitor` replaces `QNNProfiler` and owns all QNN-specific knowledge. See `docs/design/session/monitor/2_coreloop.md` v2.2. + +**Tech Stack:** Python 3.11+, ONNX Runtime (three variants: `onnxruntime`, `onnxruntime-qnn`, `onnxruntime-windowsml`), pytest, ruff. Testing via `uv run pytest`. 
+ +**Related docs:** +- `docs/design/session/monitor/1_prd.md` v2.2 (requirements) +- `docs/design/session/monitor/2_coreloop.md` v2.2 (core design) +- `docs/standards/design-doc-spec.md` v1.1 (doc standard) + +**Sequencing strategy:** Relocate helpers FIRST as backward-compatible shims (old imports still work); build new monitor on top; flip callers; delete the old package last. Each task leaves `uv run pytest tests/` green and `uv run ruff check src/ tests/` clean. + +--- + +## Task 0: Prep — create branch, verify starting state + +**Files:** none yet (orientation only). + +- [ ] **Step 1: Ensure clean working tree** + +Run: +```bash +git status +git rev-parse --abbrev-ref HEAD +``` +Expected: current branch is `feat/mvp` or a branch off of it; no uncommitted changes (or only docs changes from this session). + +- [ ] **Step 2: Create implementation branch** + +Run: +```bash +git checkout -b feat/op-tracing-refactor +``` +Expected: branch created from current HEAD. + +- [ ] **Step 3: Verify current test baseline** + +Run: +```bash +uv run pytest tests/unit/optracing/ -v --tb=short +``` +Expected: existing optracing tests PASS (or whatever the baseline is — record the exact count and any pre-existing failures unrelated to our work; those stay pre-existing). + +Record the baseline output in a scratch note; we will reuse it as regression checks. + +- [ ] **Step 4: Verify ruff is clean on touched modules** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/ src/winml/modelkit/optracing/ src/winml/modelkit/commands/perf.py +``` +Expected: No findings, OR a short list of pre-existing findings (record them; they stay pre-existing). + +--- + +## Task 1: Add `ensure_initialized()` module function to `ep_registry.py` + +**Rationale:** Break the reverse-coupling where `QNNMonitor.is_available()` would otherwise need to import `WinMLSession`. A thin module-level wrapper gives us an import-cycle-safe entry point. 
+ +**Files:** +- Modify: `src/winml/modelkit/session/ep_registry.py` (existing; add ~10 lines) +- Test: `tests/unit/session/test_ep_registry.py` (new) + +- [ ] **Step 1: Create the test directory if missing** + +Run: +```bash +mkdir -p tests/unit/session +test -f tests/unit/session/__init__.py || touch tests/unit/session/__init__.py +``` + +- [ ] **Step 2: Write the failing test** + +Create `tests/unit/session/test_ep_registry.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for ep_registry module-level helpers.""" + +from __future__ import annotations + +from unittest.mock import patch + +from winml.modelkit.session.ep_registry import ensure_initialized + + +def test_ensure_initialized_calls_registry_once(): + """ensure_initialized() calls register_to_ort() once regardless of call count.""" + with patch( + "winml.modelkit.session.ep_registry.WinMLEPRegistry" + ) as mock_registry_cls: + instance = mock_registry_cls.get_instance.return_value + instance.winml_available = True + + ensure_initialized() + ensure_initialized() + ensure_initialized() + + # Singleton should be fetched, register_to_ort called each time (idempotent inside registry) + assert mock_registry_cls.get_instance.call_count >= 1 + # Multiple calls must not raise +``` + +- [ ] **Step 3: Run test — expect ImportError (function not defined)** + +Run: +```bash +uv run pytest tests/unit/session/test_ep_registry.py -v +``` +Expected: FAIL with `ImportError: cannot import name 'ensure_initialized' from 'winml.modelkit.session.ep_registry'`. + +- [ ] **Step 4: Add the function to `ep_registry.py`** + +Modify `src/winml/modelkit/session/ep_registry.py`. 
After the `get_ort_available_providers` function (around line 199), append: + +```python +def ensure_initialized() -> None: + """Idempotent module-level entry point for WinML EP registration. + + Wraps ``WinMLEPRegistry.get_instance().register_to_ort()`` so callers + (e.g. ``QNNMonitor.is_available``) can trigger EP registration without + importing ``WinMLSession`` — breaks a latent import cycle. + + Safe to call multiple times. No-op if WinML is unavailable on this system. + """ + try: + registry = WinMLEPRegistry.get_instance() + if registry.winml_available: + registry.register_to_ort() + except Exception as exc: # noqa: BLE001 — log-and-continue is intentional + logger.debug("ensure_initialized: WinML EP registration skipped: %s", exc) +``` + +- [ ] **Step 5: Run test — expect PASS** + +Run: +```bash +uv run pytest tests/unit/session/test_ep_registry.py -v +``` +Expected: PASS. + +- [ ] **Step 6: Run ruff** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/ep_registry.py tests/unit/session/test_ep_registry.py --fix +``` +Expected: no findings. + +- [ ] **Step 7: Full pytest sanity** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: All tests pass (same as baseline). + +- [ ] **Step 8: Commit** + +Run: +```bash +git add src/winml/modelkit/session/ep_registry.py tests/unit/session/test_ep_registry.py tests/unit/session/__init__.py +git commit -m "feat(session): add ensure_initialized() module function to ep_registry + +Breaks reverse-coupling for QNNMonitor.is_available() by providing a +module-level entry point that wraps the WinMLEPRegistry singleton. +Idempotent; safe to call multiple times." +``` + +--- + +## Task 2: Extend `EPMonitor` ABC with optional default hooks + +**Rationale:** Add the two hook methods (`get_session_options`, `get_provider_options`) and the `requires_session_teardown` class attribute that the new design relies on. 
All three have defaults so existing `VitisAIMonitor`, `NullEPMonitor`, `OpenVinoMonitor`, `QNNMonitor` (placeholder) continue to work unchanged. + +**Files:** +- Modify: `src/winml/modelkit/session/monitor/ep_monitor.py` +- Test: `tests/unit/session/monitor/test_ep_monitor_base.py` (new) + +- [ ] **Step 1: Create test directory** + +Run: +```bash +mkdir -p tests/unit/session/monitor +touch tests/unit/session/monitor/__init__.py +``` + +- [ ] **Step 2: Write failing tests** + +Create `tests/unit/session/monitor/test_ep_monitor_base.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for EPMonitor ABC default hook behavior and NullEPMonitor inheritance.""" + +from __future__ import annotations + +import pytest + +from winml.modelkit.session.monitor.ep_monitor import EPMonitor, NullEPMonitor + + +def test_null_monitor_default_get_session_options(): + """NullEPMonitor inherits empty session-options default.""" + assert NullEPMonitor().get_session_options() == {} + + +def test_null_monitor_default_get_provider_options(): + """NullEPMonitor inherits empty provider-options default.""" + assert NullEPMonitor().get_provider_options() == {} + + +def test_null_monitor_default_requires_teardown(): + """NullEPMonitor.requires_session_teardown is False by default.""" + assert NullEPMonitor.requires_session_teardown is False + + +def test_ep_monitor_is_abstract(): + """EPMonitor cannot be instantiated directly (still abstract).""" + with pytest.raises(TypeError): + EPMonitor() # type: ignore[abstract] + + +def test_hooks_return_fresh_dicts(): + """get_*_options returns a fresh dict each call (not a shared mutable).""" + m = NullEPMonitor() + d1 = m.get_session_options() + d1["injected"] = "1" + d2 = m.get_session_options() + assert "injected" not in d2 +``` + 
+- [ ] **Step 3: Run tests — expect failures** + +Run: +```bash +uv run pytest tests/unit/session/monitor/test_ep_monitor_base.py -v +``` +Expected: multiple FAILs with `AttributeError: ... has no attribute 'get_session_options'` etc. + +- [ ] **Step 4: Add defaults to the ABC** + +Modify `src/winml/modelkit/session/monitor/ep_monitor.py`. Change the class definition to add three members. Replace the current class body's top with: + +```python +from typing import Any, ClassVar + +class EPMonitor(ABC): + """Base class for EP-specific hardware performance monitoring. + + Used as a context manager alongside ``PerfStats`` to collect + hardware utilization metrics during inference. + + Example:: + + with session.perf(warmup=10, monitor=SomeEPMonitor()) as ctx: + for _ in range(110): + session.run(inputs) + + print(ctx.stats.mean_ms) # inference timing + print(ctx.monitor.to_dict()) # proof-of-execution data + """ + + # ---- Optional hooks: defaults provided; subclasses override as needed ---- + + #: ORT-specific hint: does this monitor's data flush require + #: ``ort.InferenceSession`` destruction? Example: QNN flushes CSV + #: only on session destroy. Default: False (no teardown needed). + requires_session_teardown: ClassVar[bool] = False + + def get_session_options(self) -> dict[str, str]: + """Entries to pass to ``SessionOptions.add_session_config_entry()``. + + Default: empty dict. Override in subclasses that need e.g. + ``"session.disable_cpu_ep_fallback": "1"``. + """ + return {} + + def get_provider_options(self) -> dict[str, str]: + """Options to merge into ``add_provider_for_devices([ep], opts)``. + + Default: empty dict. Override in subclasses that need e.g. + ``"profiling_level": "detailed"``. + """ + return {} + + # ---- Mandatory contract ---- +``` + +Keep the existing `@abstractmethod` methods (`__enter__`, `__exit__`, `to_dict`, `is_available`) unchanged below. 
+ +- [ ] **Step 5: Run tests — expect PASS** + +Run: +```bash +uv run pytest tests/unit/session/monitor/test_ep_monitor_base.py -v +``` +Expected: all PASS. + +- [ ] **Step 6: Verify no regression to VitisAI / OpenVINO / QNN placeholder** + +Run: +```bash +uv run pytest tests/ -k "monitor" -v +``` +Expected: all existing monitor tests still pass. + +- [ ] **Step 7: Ruff** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/monitor/ep_monitor.py tests/unit/session/monitor/ --fix +``` + +- [ ] **Step 8: Commit** + +Run: +```bash +git add src/winml/modelkit/session/monitor/ep_monitor.py tests/unit/session/monitor/ +git commit -m "feat(monitor): add default hooks on EPMonitor ABC + +Adds three optional members with safe defaults: +- requires_session_teardown: ClassVar[bool] = False +- get_session_options() -> {} +- get_provider_options() -> {} + +Existing subclasses (VitisAI, OpenVINO, QNN placeholder, NullEPMonitor) +inherit defaults unchanged." +``` + +--- + +## Task 3: Relocate `OpTraceResult` / `OperatorMetrics` → `session/monitor/op_metrics.py` with additive `status` / `error` fields + +**Rationale:** Content move + additive extension. We move the file, keep `optracing/result.py` as a temporary re-export shim so old callers keep working during the transition, then delete the shim in Task 14. + +**Files:** +- Create: `src/winml/modelkit/session/monitor/op_metrics.py` +- Modify: `src/winml/modelkit/optracing/result.py` → re-export shim +- Test: `tests/unit/session/monitor/test_op_metrics.py` (new) + +- [ ] **Step 1: Write failing tests for the new location + new fields** + +Create `tests/unit/session/monitor/test_op_metrics.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +"""Tests for the relocated OpTraceResult + new status/error fields.""" + +from __future__ import annotations + +import json + +from winml.modelkit.session.monitor.op_metrics import ( + OperatorMetrics, + OpTraceResult, +) + + +def test_model_field_accepts_none(): + """model: str | None — passing None must not raise.""" + r = OpTraceResult(model=None, device="npu", tracing_level="basic") + assert r.model is None + + +def test_status_default_is_ok(): + """New status field defaults to 'ok' for backward compat with existing construction.""" + r = OpTraceResult(model="x", device="npu", tracing_level="basic") + assert r.status == "ok" + assert r.error is None + + +def test_status_can_be_set(): + r = OpTraceResult( + model="x", device="npu", tracing_level="basic", + status="parse_failed", error="corrupt CSV", + ) + assert r.status == "parse_failed" + assert r.error == "corrupt CSV" + + +def test_to_dict_preserves_nested_schema(): + """Existing nested schema (metadata / summary / operators / statistics / artifacts) preserved.""" + r = OpTraceResult(model="m.onnx", device="npu", tracing_level="basic", ep="QNN") + d = r.to_dict() + # Existing keys — must still exist exactly as before + assert "metadata" in d + assert d["metadata"]["model"] == "m.onnx" + assert d["metadata"]["device"] == "npu" + assert d["metadata"]["tracing_level"] == "basic" + assert d["metadata"]["ep"] == "QNN" + assert "summary" in d + assert "operators" in d + assert "statistics" in d + assert "artifacts" in d + + +def test_to_dict_adds_status_and_error_at_top_level(): + """New fields appear as additive top-level keys, not replacing anything.""" + r = OpTraceResult( + model="x", device="npu", tracing_level="basic", + status="no_data", error=None, + ) + d = r.to_dict() + assert d["status"] == "no_data" + assert d["error"] is None + + +def test_to_json_round_trip(): + """to_json must produce valid JSON containing both old and 
new fields.""" + r = OpTraceResult(model="x", device="npu", tracing_level="basic", status="ok") + parsed = json.loads(r.to_json()) + assert parsed["metadata"]["model"] == "x" + assert parsed["status"] == "ok" + + +def test_operator_metrics_to_dict_preserved(): + op = OperatorMetrics(name="Conv", op_path="/conv_1", duration_us=12.5, percent_of_total=5.0) + d = op.to_dict() + assert d["name"] == "Conv" + assert d["duration_us"] == 12.5 +``` + +- [ ] **Step 2: Run tests — expect ImportError** + +Run: +```bash +uv run pytest tests/unit/session/monitor/test_op_metrics.py -v +``` +Expected: `ImportError: No module named 'winml.modelkit.session.monitor.op_metrics'`. + +- [ ] **Step 3: Create the new file by copying current `optracing/result.py`, then extend** + +Copy the content of `src/winml/modelkit/optracing/result.py` into a new file `src/winml/modelkit/session/monitor/op_metrics.py`, and apply these changes: + +1. Change the `OpTraceResult.model` field from `model: str` to `model: str | None`. +2. Add two new fields after `artifacts` (at the end of the existing field list, BEFORE `to_dict`): + ```python + # Status of the trace — "ok" | "no_data" | "parse_failed" | "basic_fallback" + status: str = "ok" + # Populated when status == "parse_failed" + error: str | None = None + ``` +3. Modify `to_dict()` to include the new keys additively at top level (existing keys untouched): + ```python + def to_dict(self) -> dict[str, Any]: + """Serialize to structured dict. 
Preserves existing nested schema; + adds top-level ``status`` and ``error`` keys additively.""" + return { + "metadata": { + "model": self.model, + "device": self.device, + "ep": self.ep, + "tracing_level": self.tracing_level, + "tracing_backend": self.tracing_backend, + "timestamp": self.timestamp, + "num_samples": self.num_samples, + }, + "summary": self.summary, + "operators": [op.to_dict() for op in self.operators], + "statistics": self.statistics, + "artifacts": self.artifacts, + # ---- Additive ---- + "status": self.status, + "error": self.error, + } + ``` + +Module docstring at top should be updated to: +```python +"""OpTraceResult + OperatorMetrics — structured profiling output. + +Relocated from ``optracing/result.py`` as part of the op-tracing refactor. +Extended with ``status`` / ``error`` fields for failure reporting. +""" +``` + +Required imports at top: `from __future__ import annotations`, `import json`, `from dataclasses import dataclass, field, asdict`, `from datetime import datetime, timezone`, `from typing import Any`. + +- [ ] **Step 4: Replace `optracing/result.py` with a re-export shim** + +Overwrite `src/winml/modelkit/optracing/result.py` with: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Backward-compatibility shim. + +``OpTraceResult`` and ``OperatorMetrics`` moved to +``winml.modelkit.session.monitor.op_metrics``. This shim keeps old imports +working during the op-tracing refactor; removed once all callers are updated. 
+""" + +from __future__ import annotations + +from ..session.monitor.op_metrics import OperatorMetrics, OpTraceResult + + +__all__ = ["OperatorMetrics", "OpTraceResult"] +``` + +- [ ] **Step 5: Run new tests — expect PASS** + +Run: +```bash +uv run pytest tests/unit/session/monitor/test_op_metrics.py -v +``` +Expected: all 7 PASS. + +- [ ] **Step 6: Run existing tests via old import path — expect PASS (shim works)** + +Run: +```bash +uv run pytest tests/unit/optracing/test_result.py -v +``` +Expected: existing tests still PASS (shim re-exports). + +- [ ] **Step 7: Full test sanity** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: no regressions. + +- [ ] **Step 8: Ruff** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/monitor/op_metrics.py src/winml/modelkit/optracing/result.py tests/unit/session/monitor/test_op_metrics.py --fix +``` + +- [ ] **Step 9: Commit** + +Run: +```bash +git add src/winml/modelkit/session/monitor/op_metrics.py src/winml/modelkit/optracing/result.py tests/unit/session/monitor/test_op_metrics.py +git commit -m "feat(monitor): relocate OpTraceResult → session/monitor/op_metrics + +Move dataclasses from optracing/result.py. Additive changes: +- model: str -> str | None (allows None for standalone profiling) +- New fields: status (default 'ok'), error (default None) +- to_dict() preserves nested schema; adds top-level status/error keys + +Old import path retained as re-export shim; removed in later task." +``` + +--- + +## Task 4: Relocate report helpers → `session/monitor/report.py` + +**Rationale:** `display_op_trace_report` and `write_op_trace_json` move verbatim. Old `optracing/report.py` becomes a shim. + +**Files:** +- Create: `src/winml/modelkit/session/monitor/report.py` +- Modify: `src/winml/modelkit/optracing/report.py` → shim +- Test: `tests/unit/session/monitor/test_report.py` (move existing) + +- [ ] **Step 1: Copy content verbatim** + +Read `src/winml/modelkit/optracing/report.py`. 
Create `src/winml/modelkit/session/monitor/report.py` with identical content, but update: +- Module docstring: `"""Report helpers — display / write JSON for op-trace results.\n\nRelocated from optracing/report.py."""`. +- Any internal imports of `.result` → `.op_metrics` (since OpTraceResult now lives there). +- Import path for `OpTraceResult`: `from .op_metrics import OpTraceResult, OperatorMetrics` (replace the old `from .result import ...`). + +- [ ] **Step 2: Replace `optracing/report.py` with a shim** + +Overwrite `src/winml/modelkit/optracing/report.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Backward-compatibility shim. + +Report helpers moved to ``winml.modelkit.session.monitor.report``. +""" + +from __future__ import annotations + +from ..session.monitor.report import ( + display_op_trace_report, + write_op_trace_json, +) + + +__all__ = ["display_op_trace_report", "write_op_trace_json"] +``` + +- [ ] **Step 3: Move the existing test file** + +Run: +```bash +git mv tests/unit/optracing/test_report.py tests/unit/session/monitor/test_report.py +``` + +Then update imports in the moved file: replace `from winml.modelkit.optracing.report import` with `from winml.modelkit.session.monitor.report import`, and any `from winml.modelkit.optracing.result import` with `from winml.modelkit.session.monitor.op_metrics import`. + +- [ ] **Step 4: Run tests** + +Run: +```bash +uv run pytest tests/unit/session/monitor/test_report.py tests/unit/optracing/ -v +``` +Expected: all PASS. The shim ensures old-path imports still resolve. 
+ +- [ ] **Step 5: Ruff** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/monitor/report.py src/winml/modelkit/optracing/report.py tests/unit/session/monitor/test_report.py --fix +``` + +- [ ] **Step 6: Commit** + +Run: +```bash +git add src/winml/modelkit/session/monitor/report.py src/winml/modelkit/optracing/report.py tests/unit/session/monitor/test_report.py tests/unit/optracing/ +git commit -m "feat(monitor): relocate report helpers → session/monitor/report + +Moves display_op_trace_report + write_op_trace_json. Old path retained +as re-export shim." +``` + +--- + +## Task 5: Relocate QNN helpers → `session/monitor/qnn/` + +**Rationale:** Move the three QNN-specific helper modules (`csv_parser.py`, `qhas_parser.py`, `viewer.py`) and fixtures. Old `optracing/qnn/` keeps `profiler.py` alive via shims until Task 12. + +**Files:** +- Create: `src/winml/modelkit/session/monitor/qnn/__init__.py` +- Create: `src/winml/modelkit/session/monitor/qnn/{csv_parser.py, qhas_parser.py, viewer.py}` +- Modify: `src/winml/modelkit/optracing/qnn/{csv_parser.py, qhas_parser.py, viewer.py}` → shims +- Test: move fixtures + test files + +- [ ] **Step 1: Create new package directory** + +Run: +```bash +mkdir -p src/winml/modelkit/session/monitor/qnn +``` + +Create `src/winml/modelkit/session/monitor/qnn/__init__.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +"""QNN-specific helpers for QNNMonitor: CSV parser, QHAS parser, viewer shell-out.""" +``` + +- [ ] **Step 2: Move three helper files via git mv** + +Run (one file at a time so git tracks rename; then restore the shim): +```bash +git mv src/winml/modelkit/optracing/qnn/csv_parser.py src/winml/modelkit/session/monitor/qnn/csv_parser.py +git mv src/winml/modelkit/optracing/qnn/qhas_parser.py src/winml/modelkit/session/monitor/qnn/qhas_parser.py +git mv src/winml/modelkit/optracing/qnn/viewer.py src/winml/modelkit/session/monitor/qnn/viewer.py +``` + +- [ ] **Step 3: Add shims back at old paths** + +Create `src/winml/modelkit/optracing/qnn/csv_parser.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Backward-compatibility shim. Moved to session/monitor/qnn/csv_parser.py.""" + +from __future__ import annotations + +from ...session.monitor.qnn.csv_parser import * # noqa: F401,F403 +from ...session.monitor.qnn.csv_parser import parse_qnn_profiling_csv + + +__all__ = ["parse_qnn_profiling_csv"] +``` + +Repeat identical pattern for `qhas_parser.py` (exporting `parse_qhas`) and `viewer.py` (exporting `find_qnn_sdk`, `run_qhas_viewer`). 
+ +- [ ] **Step 4: Update moved files' internal imports** + +If any of the moved files import from `..result` or `..report`, redirect: +- `from ..result import` → `from ..op_metrics import` +- `from ..report import` → no text change needed: after the move, `..report` resolves to `session/monitor/report.py` (created in Task 4) instead of `optracing/report.py` + +- [ ] **Step 5: Move test files + fixtures** + +Run: +```bash +mkdir -p tests/unit/session/monitor/qnn +touch tests/unit/session/monitor/qnn/__init__.py +git mv tests/unit/optracing/test_csv_parser.py tests/unit/session/monitor/qnn/test_csv_parser.py +git mv tests/unit/optracing/test_qhas_parser.py tests/unit/session/monitor/qnn/test_qhas_parser.py +git mv tests/unit/optracing/fixtures tests/unit/session/monitor/qnn/fixtures +``` + +Update imports in the moved test files: +- `from winml.modelkit.optracing.qnn.csv_parser import` → `from winml.modelkit.session.monitor.qnn.csv_parser import` +- Likewise for `qhas_parser`. + +Update fixture paths if any tests load them via relative paths. + +- [ ] **Step 6: Run tests** + +Run: +```bash +uv run pytest tests/unit/session/monitor/qnn/ tests/unit/optracing/ -v +``` +Expected: all PASS (new path + shim-based old path both work). + +- [ ] **Step 7: Ruff** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/monitor/qnn/ src/winml/modelkit/optracing/qnn/ tests/unit/session/monitor/qnn/ --fix +``` + +- [ ] **Step 8: Commit** + +Run: +```bash +git add src/winml/modelkit/session/monitor/qnn/ src/winml/modelkit/optracing/qnn/ tests/unit/session/monitor/qnn/ tests/unit/optracing/ +git commit -m "feat(monitor): relocate QNN helpers → session/monitor/qnn/ + +Moves csv_parser.py, qhas_parser.py, viewer.py + fixtures. +Old paths retained as shims." +``` + +--- + +## Task 6: Add `PerfContext` dataclass to `session/session.py` + +**Rationale:** `session.perf()` will yield this dataclass instead of a raw `PerfStats`. Introducing it now lets Task 8 extend `perf()` without churn. 
+ +**Files:** +- Modify: `src/winml/modelkit/session/session.py` + +- [ ] **Step 1: Add the dataclass near other session types** + +In `src/winml/modelkit/session/session.py`, after the existing `SessionState` enum (around line 58-64), add: + +```python +@dataclass(frozen=True) +class PerfContext: + """Yielded by ``WinMLSession.perf()``. + + Aggregates perf statistics and the optional attached EP monitor. + Frozen: mutation is not a supported pattern — update the underlying + objects instead. + """ + stats: PerfStats + monitor: EPMonitor # NullEPMonitor when no monitor was passed +``` + +Ensure imports at top of file include: +- `from dataclasses import dataclass` +- `from .monitor.ep_monitor import EPMonitor, NullEPMonitor` +- Existing `from .stats import PerfStats` + +- [ ] **Step 2: Verify the import doesn't cause a circular dependency** + +Run: +```bash +uv run python -c "from winml.modelkit.session.session import WinMLSession, PerfContext; print('OK')" +``` +Expected: `OK`. + +- [ ] **Step 3: No test yet — dataclass is glue. Defer to Task 8's perf() test.** + +- [ ] **Step 4: Ruff** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/session.py --fix +``` + +- [ ] **Step 5: Commit** + +Run: +```bash +git add src/winml/modelkit/session/session.py +git commit -m "feat(session): add PerfContext dataclass + +Frozen dataclass aggregating PerfStats + EPMonitor. Prep for +session.perf(monitor=...) signature change." +``` + +--- + +## Task 7: Add `_active_session_option_entries` state + merge-in-_build_session_options + +**Rationale:** New instance attribute tracks monitor-contributed session-level config entries. `_build_session_options` applies them. Safe to land before `perf()` sets them because the dict is empty by default. 
+ +**Files:** +- Modify: `src/winml/modelkit/session/session.py` + +- [ ] **Step 1: Initialize state in `__init__`** + +In `WinMLSession.__init__` (around line 165-220), after `self._provider_options = ep_config.provider_options if ep_config else {}` (~line 202), add: +```python +# Monitor-contributed session config entries (populated by session.perf(monitor=...)) +self._active_session_option_entries: dict[str, str] = {} +``` + +- [ ] **Step 2: Apply entries in `_build_session_options`** + +In the `_build_session_options` method (around line 415-452), after the method obtains `opts` (either the policy path or the explicit-EP path), and BEFORE returning `opts`, insert: +```python +# Apply monitor-contributed session config entries (active during session.perf(monitor=...)) +for key, value in self._active_session_option_entries.items(): + opts.add_session_config_entry(key, value) +``` + +Do this in BOTH branches (explicit-EP path and policy path) so the entries apply regardless. + +- [ ] **Step 3: Write a unit test verifying entries are applied** + +In `tests/unit/session/test_perf_monitor_integration.py` (new file), start a file we'll fill in more across subsequent tasks: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Integration tests for WinMLSession.perf(monitor=...) 
— teardown ordering, +auto-reset, session/provider option merging, exception transparency.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import onnxruntime as ort +import pytest + + +def test_active_session_option_entries_applied(): + """_build_session_options applies monitor-contributed entries.""" + from winml.modelkit.session.session import WinMLSession + + # Use a tiny real ONNX model or mock the ORT parts + # For unit isolation: mock _find_ep_device + InferenceSession entirely + with patch.object(WinMLSession, "_find_ep_device", return_value=None): + session = WinMLSession.__new__(WinMLSession) + session._device = "cpu" + session._ep = None + session._session_options = ort.SessionOptions() + session._provider_options = {} + session._active_session_option_entries = {"session.disable_cpu_ep_fallback": "1"} + + opts = session._build_session_options("cpu") + # ORT does not expose a clean read-back API, so at minimum verify no exception + # and the dict was consumed + assert isinstance(opts, ort.SessionOptions) +``` + +- [ ] **Step 4: Run test** + +Run: +```bash +uv run pytest tests/unit/session/test_perf_monitor_integration.py::test_active_session_option_entries_applied -v +``` +Expected: PASS. + +- [ ] **Step 5: Full test sanity** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: no regressions (the new dict is empty during normal runs). + +- [ ] **Step 6: Ruff + commit** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/session.py tests/unit/session/test_perf_monitor_integration.py --fix +git add src/winml/modelkit/session/session.py tests/unit/session/test_perf_monitor_integration.py +git commit -m "feat(session): add _active_session_option_entries state + +Infrastructure for session.perf(monitor=...) to contribute session-level +config entries via add_session_config_entry. Empty by default; populated +transiently during perf() context." 
+``` + +--- + +## Task 8: Extend `WinMLSession.perf()` to accept `monitor=` + full lifecycle + +**Rationale:** The central change. Implements hook invocation, auto-reset on option conflict, teardown ordering (reset → monitor.__exit__), exception transparency via `sys.exc_info()`, gc.collect for Windows file handles, nested-perf guard. + +**Files:** +- Modify: `src/winml/modelkit/session/session.py` +- Test: `tests/unit/session/test_perf_monitor_integration.py` (extend) +- Test: `tests/unit/session/test_perf_auto_reset.py` (new) + +- [ ] **Step 1: Write failing integration tests** + +Append to `tests/unit/session/test_perf_monitor_integration.py`: +```python +def test_perf_monitor_none_backward_compatible(tmp_path): + """perf() with no monitor works as before — yields PerfContext with NullEPMonitor.""" + from winml.modelkit.session.session import WinMLSession, PerfContext + from winml.modelkit.session.monitor.ep_monitor import NullEPMonitor + + # Minimal model — use existing test fixture or skip if not available + # [Use a pre-existing fixture path — reference whatever the project uses] + model_path = _get_minimal_onnx_fixture() # helper; may need to import from tests/ + session = WinMLSession(model_path, device="cpu") + with session.perf(warmup=0) as ctx: + assert isinstance(ctx, PerfContext) + assert isinstance(ctx.monitor, NullEPMonitor) + + +def test_nested_perf_raises(): + """Entering perf() while another perf() is active raises RuntimeError.""" + from winml.modelkit.session.session import WinMLSession + model_path = _get_minimal_onnx_fixture() + session = WinMLSession(model_path, device="cpu") + with session.perf(): + with pytest.raises(RuntimeError, match="already active"): + with session.perf(): + pass + + +def test_teardown_ordering_reset_before_monitor_exit(): + """Monitor.requires_session_teardown=True → self.reset() fires BEFORE monitor.__exit__.""" + from winml.modelkit.session.session import WinMLSession + from winml.modelkit.session.monitor.ep_monitor 
import EPMonitor + + observations = [] + + class _TeardownMonitor(EPMonitor): + requires_session_teardown = True + @classmethod + def is_available(cls): + return True + def __enter__(self): + return self + def __exit__(self, exc_type, exc_val, exc_tb): + # The session._session attribute should have been cleared by now + observations.append(("exit", getattr(self, "_session_at_exit", "MISSING"))) + def to_dict(self): + return {"ep": "test"} + + model_path = _get_minimal_onnx_fixture() + session = WinMLSession(model_path, device="cpu") + mon = _TeardownMonitor() + + with session.perf(monitor=mon) as ctx: + session.run({_get_first_input_name(model_path): _get_zero_input(model_path)}) + + # After exit, capture state the monitor observed at exit time + # Arrangement: add a __setattr__ trick OR check that _session is None post-exit + assert session._session is None # reset happened + + +def _get_minimal_onnx_fixture(): + """Return path to a trivially runnable ONNX model fixture.""" + # Use whatever fixture the project's session tests use; if none exists, + # create one in tests/unit/session/fixtures/. 
For plan purposes, delegate: + from tests._helpers import get_minimal_onnx_model_path + return get_minimal_onnx_model_path() + + +def _get_first_input_name(model_path): + import onnx + m = onnx.load(str(model_path)) + return m.graph.input[0].name + + +def _get_zero_input(model_path): + import numpy as np + import onnx + m = onnx.load(str(model_path)) + inp = m.graph.input[0] + shape = [d.dim_value if d.dim_value > 0 else 1 for d in inp.type.tensor_type.shape.dim] + return np.zeros(shape, dtype=np.float32) +``` + +If `tests/_helpers.py::get_minimal_onnx_model_path` doesn't exist, create it: +```python +# tests/_helpers.py +from pathlib import Path + +def get_minimal_onnx_model_path() -> Path: + """Return path to a tiny ONNX Identity model used by WinMLSession tests.""" + import onnx + from onnx import helper, TensorProto + fixture = Path(__file__).parent / "_fixtures" / "identity.onnx" + if not fixture.exists(): + fixture.parent.mkdir(exist_ok=True) + inp = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 4]) + out = helper.make_tensor_value_info("output", TensorProto.FLOAT, [1, 4]) + node = helper.make_node("Identity", ["input"], ["output"]) + graph = helper.make_graph([node], "identity", [inp], [out]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)]) + model.ir_version = 8 + onnx.save(model, fixture) + return fixture +``` + +Create `tests/unit/session/test_perf_auto_reset.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +"""Tests for session.perf() auto-reset behavior when a monitor contributes options.""" + +from __future__ import annotations + +import logging +from tests._helpers import get_minimal_onnx_model_path + + +def test_auto_reset_fires_when_options_contributed(caplog): + """If session is already compiled AND monitor contributes provider options, + session.perf().__enter__ auto-resets with a WARNING log.""" + from winml.modelkit.session.session import WinMLSession + from winml.modelkit.session.monitor.ep_monitor import EPMonitor + + class _ContributingMonitor(EPMonitor): + @classmethod + def is_available(cls): return True + def __enter__(self): return self + def __exit__(self, *a): pass + def to_dict(self): return {"ep": "test"} + def get_provider_options(self): + return {"some_provider_option": "1"} + + session = WinMLSession(get_minimal_onnx_model_path(), device="cpu") + # Force compile + session.compile() + assert session._session is not None + pre_compile_obj = session._session + + with caplog.at_level(logging.WARNING): + with session.perf(monitor=_ContributingMonitor()): + pass # reset should happen on enter + + assert any("auto-reset" in rec.message.lower() for rec in caplog.records) + # After perf exits (and perf's exit restores options), session may or may not + # be compiled. The key is that the pre-compile object was dropped. + assert session._session is None or session._session is not pre_compile_obj +``` + +- [ ] **Step 2: Run tests — expect failures** + +Run: +```bash +uv run pytest tests/unit/session/test_perf_monitor_integration.py tests/unit/session/test_perf_auto_reset.py -v +``` +Expected: FAILS because `perf()` doesn't accept `monitor=` yet. 
+ +- [ ] **Step 3: Rewrite `WinMLSession.perf()`** + +In `src/winml/modelkit/session/session.py`, replace the existing `perf` method (around lines 583-603) with: + +```python +@contextmanager +def perf( + self, + warmup: int = 0, + monitor: EPMonitor | None = None, +) -> Generator[PerfContext, None, None]: + """Run a scoped performance window yielding a PerfContext. + + Args: + warmup: Number of initial samples to exclude from statistics. + monitor: Optional EPMonitor. Contributes session/provider options at + compile time (auto-resets the session if already compiled with + different options — logs WARNING). Parses artifacts on exit. + + Yields: + PerfContext(stats=PerfStats, monitor=EPMonitor | NullEPMonitor) + + Raises: + RuntimeError: If another perf() context is already active on this session. + """ + if self._perf_stats is not None: + raise RuntimeError( + "session.perf() already active; nested perf is forbidden" + ) + + mon: EPMonitor = monitor if monitor is not None else NullEPMonitor() + + # Collect hook contributions — must be idempotent per EPMonitor contract + extra_sess = mon.get_session_options() + extra_prov = mon.get_provider_options() + + # Auto-reset if options to apply AND session is already compiled + if (extra_sess or extra_prov) and self._session is not None: + logger.warning( + "session.perf(): auto-resetting compiled session to apply monitor " + "session/provider options (monitor=%s)", + type(mon).__name__, + ) + self.reset() + + # Save + merge + saved_sess = dict(self._active_session_option_entries) + saved_prov = dict(self._provider_options) + self._active_session_option_entries = {**saved_sess, **extra_sess} + self._provider_options = {**saved_prov, **extra_prov} + + stats = PerfStats(warmup=warmup) + self._perf_stats = stats + mon.__enter__() + + try: + yield PerfContext(stats=stats, monitor=mon) + finally: + self._perf_stats = None + exc_info = sys.exc_info() + try: + if mon.requires_session_teardown: + self.reset() + gc.collect() # 
Windows: release CSV file handle + finally: + try: + mon.__exit__(*exc_info) + finally: + self._active_session_option_entries = saved_sess + self._provider_options = saved_prov +``` + +Required imports at top of `session.py`: +```python +import gc +import sys +``` + +And near existing `@contextmanager` import, confirm `from contextlib import contextmanager` is imported. + +- [ ] **Step 4: Run tests — expect PASS** + +Run: +```bash +uv run pytest tests/unit/session/test_perf_monitor_integration.py tests/unit/session/test_perf_auto_reset.py -v +``` +Expected: all PASS. + +- [ ] **Step 5: Backward-compat check: existing `session.perf()` users** + +Run: +```bash +uv run pytest tests/ -k "perf" -v --tb=short +``` +Expected: failures at existing call sites. `perf()` now yields a `PerfContext` rather than a `PerfStats`, so callers written as `with session.perf(warmup=10) as stats:` that read attributes such as `stats.mean_ms` directly will raise `AttributeError`. Step 6 migrates these callers to `ctx.stats`. + +- [ ] **Step 6: Fix: audit existing callers** + +Run: +```bash +uv run grep -rn "session.perf(" src/ tests/ --include="*.py" +``` + +For each call site, if it uses `as stats:` and treats the yielded object as a `PerfStats`, update it to `as ctx:` and use `ctx.stats`. Primary callers: +- `src/winml/modelkit/commands/perf.py` (benchmark loop) +- Any benchmark helper in `src/winml/modelkit/session/perf_benchmark.py` if present + +Update each — example for `commands/perf.py` benchmark loop: +```python +# Before +with session.perf(warmup=...) as stats: + ... + stats.mean_ms # etc. + +# After +with session.perf(warmup=...) as ctx: + stats = ctx.stats + ... + stats.mean_ms # etc. +``` + +This keeps the minimal delta. Task 11 will add `monitor=` to these calls. + +- [ ] **Step 7: Rerun full tests** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: all pass. 
+ +- [ ] **Step 8: Ruff + commit** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/session.py src/winml/modelkit/commands/perf.py tests/unit/session/ --fix +git add src/winml/modelkit/session/session.py src/winml/modelkit/commands/perf.py tests/unit/session/ tests/_helpers.py tests/_fixtures/ +git commit -m "feat(session): extend perf() with monitor= yielding PerfContext + +- perf(warmup, monitor=None) yields PerfContext(stats, monitor) +- Auto-reset on option conflict (WARNING log) +- Teardown ordering: reset → gc.collect → monitor.__exit__ → restore +- Exception transparency via sys.exc_info() +- Nested perf() raises RuntimeError + +Migrate existing callers to use ctx.stats." +``` + +--- + +## Task 9: Rewrite `QNNMonitor` from placeholder to full implementation + +**Rationale:** The new monitor. Uses the relocated helpers, the new `OpTraceResult` with `status`/`error`, the new base-class hooks, and `ensure_initialized()`. + +**Files:** +- Modify: `src/winml/modelkit/session/monitor/qnn_monitor.py` +- Test: `tests/unit/session/monitor/test_qnn_monitor.py` (new) + +- [ ] **Step 1: Write failing tests** + +Create `tests/unit/session/monitor/test_qnn_monitor.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +"""Tests for QNNMonitor — the QNN EP op-tracing monitor.""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import patch + +import pytest + + +def test_ctor_defaults(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + m = QNNMonitor() + assert m._level == "basic" + assert m._output_dir.exists() # tempdir created + assert m._csv_path.is_absolute() + + +def test_ctor_rejects_invalid_level(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + with pytest.raises(ValueError, match="level"): + QNNMonitor(level="invalid") # type: ignore[arg-type] + + +def test_get_session_options(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + m = QNNMonitor() + opts = m.get_session_options() + assert opts["session.disable_cpu_ep_fallback"] == "1" + assert opts["ep.context_enable"] == "1" + assert opts["ep.context_embed_mode"] == "0" + + +def test_get_provider_options_basic(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + m = QNNMonitor(level="basic") + opts = m.get_provider_options() + assert opts["profiling_level"] == "detailed" + assert opts["backend_path"] == "QnnHtp.dll" + assert "profiling_file_path" in opts + + +def test_get_provider_options_detail(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + m = QNNMonitor(level="detail") + opts = m.get_provider_options() + assert opts["profiling_level"] == "optrace" + + +def test_profiling_keys_not_user_overridable(): + """User extras cannot override profiling_level or profiling_file_path.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + m = QNNMonitor( + level="basic", + extra_provider_options={ + "profiling_level": "off", + "profiling_file_path": "/attacker/path", + "htp_performance_mode": "balanced", + }, + ) + opts = m.get_provider_options() + assert opts["profiling_level"] == "detailed" # 
monitor-owned + assert opts["profiling_file_path"] != "/attacker/path" + assert opts["htp_performance_mode"] == "balanced" # user extra honored + + +def test_get_provider_options_idempotent(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + m = QNNMonitor(level="basic") + assert m.get_provider_options() == m.get_provider_options() + + +def test_requires_session_teardown_true(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + assert QNNMonitor.requires_session_teardown is True + + +def test_double_enter_raises(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + m = QNNMonitor() + m.__enter__() + with pytest.raises(RuntimeError, match="already entered"): + m.__enter__() + + +def test_exit_with_no_csv_reports_no_data(tmp_path): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + m = QNNMonitor(output_dir=tmp_path) + m.__enter__() + m.__exit__(None, None, None) + d = m.to_dict() + assert d["status"] == "no_data" + + +def test_is_available_via_bundled(): + """When QNN EP is in get_available_providers(), is_available() returns True.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + with patch( + "onnxruntime.get_available_providers", + return_value=["QNNExecutionProvider", "CPUExecutionProvider"], + ): + assert QNNMonitor.is_available() is True + + +def test_is_available_via_winml(tmp_path): + """When QNN EP is registered via WinML (in get_ep_devices), is_available returns True.""" + from unittest.mock import MagicMock + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + fake_ep = MagicMock() + fake_ep.ep_name = "QNNExecutionProvider" + with patch("onnxruntime.get_available_providers", return_value=["CPUExecutionProvider"]), \ + patch("onnxruntime.get_ep_devices", return_value=[fake_ep]), \ + patch("winml.modelkit.session.ep_registry.ensure_initialized"): + assert QNNMonitor.is_available() is True + + +def test_is_available_neither(): + from 
winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + with patch("onnxruntime.get_available_providers", return_value=["CPUExecutionProvider"]), \ + patch("onnxruntime.get_ep_devices", return_value=[]), \ + patch("winml.modelkit.session.ep_registry.ensure_initialized"): + assert QNNMonitor.is_available() is False +``` + +- [ ] **Step 2: Run tests — expect failures** + +Run: +```bash +uv run pytest tests/unit/session/monitor/test_qnn_monitor.py -v +``` +Expected: most fail (placeholder doesn't have these methods). + +- [ ] **Step 3: Rewrite `qnn_monitor.py`** + +Overwrite `src/winml/modelkit/session/monitor/qnn_monitor.py`: +```python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""QNNMonitor — Qualcomm NPU per-operator profiling via ORT's QNN EP. + +Produces an OpTraceResult with per-op cycle counts (level="basic") or full +QHAS roofline / DMA traffic data (level="detail"). Attached to a +WinMLSession via ``session.perf(monitor=QNNMonitor(...))``. +""" +from __future__ import annotations + +import json +import logging +import tempfile +import time +from pathlib import Path +from typing import Any, ClassVar, Literal, Mapping, TYPE_CHECKING + +from .ep_monitor import EPMonitor +from .op_metrics import OperatorMetrics, OpTraceResult +from .qnn.csv_parser import parse_qnn_profiling_csv + + +if TYPE_CHECKING: + from typing_extensions import Self + + +logger = logging.getLogger(__name__) + +# Level → QNN profiling_level value +_LEVEL_TO_PROFILING: dict[str, str] = { + "basic": "detailed", + "detail": "optrace", +} + + +class QNNMonitor(EPMonitor): + """Qualcomm NPU per-op profiler via ORT's QNN EP. + + Level modes: + - ``"basic"``: CSV with per-op cycle counts (fast; covers most use cases). 
+ - ``"detail"``: QHAS via QNN SDK viewer (roofline + DMA traffic; + requires QNN SDK installed; falls back to CSV with a warning if not). + + Example:: + + with session.perf(monitor=QNNMonitor(level="basic")) as ctx: + for _ in range(10): + session.run(inputs) + print(ctx.monitor.to_dict()) + """ + + requires_session_teardown: ClassVar[bool] = True + + def __init__( + self, + level: Literal["basic", "detail"] = "basic", + output_dir: Path | None = None, + extra_provider_options: Mapping[str, str] | None = None, + ) -> None: + if level not in _LEVEL_TO_PROFILING: + raise ValueError( + f"level must be 'basic' or 'detail', got {level!r}" + ) + self._level = level + # Idempotency: resolve all paths at __init__, not per-call + self._output_dir = ( + Path(output_dir) + if output_dir is not None + else Path(tempfile.mkdtemp(prefix="qnn_profile_")) + ) + self._output_dir.mkdir(parents=True, exist_ok=True) + self._csv_path = (self._output_dir / "profiling_output.csv").resolve() + self._extra = dict(extra_provider_options or {}) + self._entered = False + self._result: OpTraceResult | None = None + + # ---- EPMonitor contract ---- + + @classmethod + def is_available(cls) -> bool: + """True iff QNN EP is usable via bundled DLL or WinML registration.""" + try: + import onnxruntime as ort + except ImportError: + return False + if "QNNExecutionProvider" in ort.get_available_providers(): + return True + # WinML path + try: + from ..ep_registry import ensure_initialized + ensure_initialized() + return any( + d.ep_name == "QNNExecutionProvider" + for d in ort.get_ep_devices() + ) + except Exception as exc: # noqa: BLE001 + logger.debug("QNNMonitor.is_available: WinML path failed: %s", exc) + return False + + def get_session_options(self) -> dict[str, str]: + return { + "session.disable_cpu_ep_fallback": "1", + "ep.context_enable": "1", + "ep.context_embed_mode": "0", + } + + def get_provider_options(self) -> dict[str, str]: + # Build in layers; owner-enforced keys last so they 
can't be overridden. + opts: dict[str, str] = { + "backend_path": "QnnHtp.dll", + "htp_performance_mode": "high_performance", + "htp_graph_finalization_optimization_mode": "3", + "enable_htp_fp16_precision": "1", + } + opts.update(self._extra) + # C-3: these two keys are NEVER user-overridable. + opts["profiling_level"] = _LEVEL_TO_PROFILING[self._level] + opts["profiling_file_path"] = str(self._csv_path) + return opts + + def __enter__(self) -> Self: + if self._entered: + raise RuntimeError("QNNMonitor already entered") + self._entered = True + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: # noqa: D401 + # Must not suppress caller exceptions — return None implicitly. + try: + self._result = self._parse_artifacts() + except Exception as e: # noqa: BLE001 + logger.warning("QNNMonitor: artifact parse failed: %s", e) + self._result = self._make_failure_result("parse_failed", str(e)) + + def to_dict(self) -> dict[str, Any]: + if self._result is None: + return {"ep": "QNN", "device": "NPU", "status": "not_run"} + return self._result.to_dict() + + @property + def result(self) -> OpTraceResult | None: + """Structured result — consumed by display_op_trace_report / write_op_trace_json.""" + return self._result + + # ---- Internals ---- + + def _parse_artifacts(self) -> OpTraceResult: + """Parse CSV (and QHAS if detail). 
Retry once for Windows file-handle lag.""" + if not self._csv_path.exists(): + # Retry — Windows may lag on file-handle release + time.sleep(0.05) + if not self._csv_path.exists(): + logger.warning( + "QNNMonitor: no CSV at %s — nothing to parse", self._csv_path + ) + return self._make_failure_result("no_data", None) + + parsed = parse_qnn_profiling_csv(self._csv_path) + meta = parsed["metadata"] + total_cycles = meta.get("accel_execute_cycles", 0) + accel_us = meta.get("accel_execute_us", 0) + cycle_to_us = accel_us / total_cycles if total_cycles > 0 else 0.0 + + operators = [ + OperatorMetrics( + name=op["name"], + op_path=op["name"], + op_id=op["op_id"], + duration_us=op["cycles"] * cycle_to_us, + percent_of_total=( + op["cycles"] / total_cycles * 100 if total_cycles > 0 else 0 + ), + ) + for op in parsed["operators"] + ] + + artifacts: dict[str, str] = {"csv": str(self._csv_path)} + qnn_log = Path(str(self._csv_path) + "_qnn.log") + if qnn_log.is_file(): + artifacts["qnn_log"] = str(qnn_log) + + status = "ok" + if self._level == "detail": + qhas_path = self._try_qhas(qnn_log, artifacts) + if qhas_path is None: + status = "basic_fallback" + logger.warning( + "QNNMonitor: detail mode requested but QHAS viewer " + "unavailable; falling back to basic CSV data" + ) + + return OpTraceResult( + model=None, + device="npu", + tracing_level=self._level, + ep="QNNExecutionProvider", + tracing_backend="qnn", + operators=operators, + num_samples=meta.get("num_samples", 0), + summary={ + "hvx_threads": meta.get("hvx_threads", 0), + "accel_execute_cycles": meta.get("accel_execute_cycles", 0), + "accel_execute_us": accel_us, + }, + artifacts=artifacts, + status=status, + ) + + def _try_qhas(self, qnn_log: Path, artifacts: dict[str, str]) -> Path | None: + """Detail mode: run QNN SDK viewer to produce QHAS. 
Returns path or None.""" + if not qnn_log.is_file(): + return None + try: + from .qnn.viewer import find_qnn_sdk, run_qhas_viewer + except ImportError: + return None + sdk = find_qnn_sdk() + if sdk is None: + return None + # Locate schematic.bin via glob fallback (no os.chdir per FR-12) + schematics = list(self._output_dir.glob("*_schematic.bin")) + if not schematics: + # Fallback: check process CWD (QNN SDK default behavior) + schematics = list(Path.cwd().glob("*_schematic.bin")) + if not schematics: + logger.warning("QNNMonitor: no *_schematic.bin found for detail mode") + return None + schematic = schematics[0] + artifacts["schematic"] = str(schematic) + qhas_out = self._output_dir / "qhas_output.json" + result_path = run_qhas_viewer(qnn_log, schematic, qhas_out, sdk_root=sdk) + if result_path is not None and result_path.is_file(): + artifacts["qhas"] = str(result_path) + return result_path + return None + + def _make_failure_result( + self, status: str, error: str | None + ) -> OpTraceResult: + return OpTraceResult( + model=None, + device="npu", + tracing_level=self._level, + ep="QNNExecutionProvider", + tracing_backend="qnn", + operators=[], + summary={}, + artifacts={"csv": str(self._csv_path)} if self._csv_path.exists() else {}, + status=status, + error=error, + ) +``` + +- [ ] **Step 4: Run tests — expect PASS** + +Run: +```bash +uv run pytest tests/unit/session/monitor/test_qnn_monitor.py -v +``` +Expected: all PASS. + +- [ ] **Step 5: Full suite sanity** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: no regressions. 
+ +- [ ] **Step 6: Ruff + commit** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/monitor/qnn_monitor.py tests/unit/session/monitor/test_qnn_monitor.py --fix +git add src/winml/modelkit/session/monitor/qnn_monitor.py tests/unit/session/monitor/test_qnn_monitor.py +git commit -m "feat(monitor): rewrite QNNMonitor from placeholder to real impl + +- get_session_options() contributes disable_cpu_ep_fallback, ep.context_* +- get_provider_options() contributes backend_path + profiling_level + (user overrides honored for non-profiling keys; profiling_level and + profiling_file_path are owner-enforced) +- __exit__ parses CSV → OpTraceResult; retries once on Windows lag +- detail mode: runs QHAS viewer if SDK available; falls back to basic +- No os.chdir (glob fallback for schematic.bin location) +- is_available works with both onnxruntime-qnn AND onnxruntime-windowsml" +``` + +--- + +## Task 10: Port availability test from `test_detection.py` (and delete) + +**Rationale:** The existing `tests/unit/optracing/test_detection.py` tests the old `is_qnn_profiling_available()`. Rewrite minimally as `test_qnn_monitor_availability.py` at the new location, then delete the old file. + +**Files:** +- Create: `tests/unit/session/monitor/test_qnn_monitor_availability.py` +- Delete: `tests/unit/optracing/test_detection.py` + +- [ ] **Step 1: Read the existing test** + +Run: +```bash +cat tests/unit/optracing/test_detection.py +``` +Identify the behaviors being tested. + +- [ ] **Step 2: Write the replacement** + +Create `tests/unit/session/monitor/test_qnn_monitor_availability.py` with equivalent behaviors re-expressed against `QNNMonitor.is_available()` (note: Task 9 already covers most of this; this task just ensures we don't drop any test from `test_detection.py` that wasn't covered). + +If `test_detection.py` checks behaviors not already covered in `test_qnn_monitor.py`, add them here. Otherwise, delete without replacement. 
+ +- [ ] **Step 3: Delete the old file** + +Run: +```bash +git rm tests/unit/optracing/test_detection.py +``` + +- [ ] **Step 4: Run tests** + +Run: +```bash +uv run pytest tests/unit/session/monitor/ -v +``` +Expected: PASS. + +- [ ] **Step 5: Commit** + +Run: +```bash +git add tests/unit/session/monitor/ # deletion of test_detection.py is already staged by `git rm` in Step 3 +git commit -m "test(monitor): port QNN availability tests to new location + +Replaces tests/unit/optracing/test_detection.py with tests exercising +QNNMonitor.is_available() — covers both bundled (onnxruntime-qnn) and +WinML-registered (onnxruntime-windowsml) EP discovery paths." +``` + +--- + +## Task 11: Add `_resolve_ep_monitor` dispatch + wire op-tracing into main benchmark loop in `commands/perf.py` + +**Rationale:** Collapse the separate op-tracing block (`perf.py:1334-1386`) into the existing benchmark `session.perf()` call by passing `monitor=QNNMonitor(...)`. + +**Files:** +- Modify: `src/winml/modelkit/commands/perf.py` +- Test: `tests/unit/commands/test_perf_optracing.py` (move from `tests/unit/optracing/test_perf_optracing_cli.py`) + +- [ ] **Step 1: Move the existing CLI test file** + +Run: +```bash +mkdir -p tests/unit/commands +test -f tests/unit/commands/__init__.py || touch tests/unit/commands/__init__.py +git mv tests/unit/optracing/test_perf_optracing_cli.py tests/unit/commands/test_perf_optracing.py +``` + +Update imports in the moved file to match the new module structure (replace `winml.modelkit.optracing.*` references with `winml.modelkit.session.monitor.*` where applicable). + +- [ ] **Step 2: Add `_resolve_ep_monitor` helper to `commands/perf.py`** + +Near the top of `commands/perf.py`, after the imports and `DYNAMIC_DIM_DEFAULTS`, add: +```python +def _resolve_ep_monitor( + ep: str, + op_tracing: str | None, + output_dir: Path, +): + """Pick the EPMonitor for the requested EP + optional op-tracing level. + + Explicit dispatch — no registry, no plugin loading. 
Raises RuntimeError + when op-tracing is requested against an EP that has no op-tracing monitor. + """ + from ..session.monitor.ep_monitor import NullEPMonitor + if op_tracing: + from ..session.monitor.qnn_monitor import QNNMonitor + if ep == "qnn" and QNNMonitor.is_available(): + return QNNMonitor(level=op_tracing, output_dir=output_dir) + raise RuntimeError( + f"Op-tracing not available for EP '{ep}'. Supported: 'qnn'." + ) + from ..session.monitor.vitisai_monitor import VitisAIMonitor + if ep == "vitisai" and VitisAIMonitor.is_available(): + return VitisAIMonitor() + return NullEPMonitor() +``` + +- [ ] **Step 3: Collapse the op-tracing block** + +Locate the op-tracing block (currently around lines 1334-1386). **Delete** the entire block (from `if op_tracing:` through `console.print(f"[green]Op-trace saved to:[/green] {trace_output}")`). + +- [ ] **Step 4: Wire `monitor=` into the main benchmark loop** + +Find the main benchmark invocation. It currently looks like: +```python +with session.perf(warmup=...) as ctx: + ... +``` + +Replace with: +```python +monitor = None +if op_tracing: + try: + monitor = _resolve_ep_monitor( + ep=config.ep, + op_tracing=op_tracing, + output_dir=output.parent if output else Path.cwd(), + ) + except RuntimeError as e: + console.print(f"[red]Error:[/red] {e}") + raise SystemExit(1) from None + +with session.perf(warmup=config.warmup, monitor=monitor) as ctx: + ... # existing loop body; stats = ctx.stats +``` + +- [ ] **Step 5: Add the post-benchmark report logic** + +After the `with` block, add: +```python +if op_tracing: + from ..session.monitor.report import display_op_trace_report, write_op_trace_json + result = ctx.monitor.result + if result is None or result.status == "no_data": + console.print( + "[yellow]Warning:[/yellow] No profiling data produced." 
+ ) + else: + display_op_trace_report(result, console) + model_slug = hf_model.replace("/", "_").replace("\\", "_") + if is_onnx: + model_slug = model_path.stem + trace_output = (output.parent if output else Path.cwd()) / f"{model_slug}_op_trace.json" + write_op_trace_json(result, trace_output) + console.print(f"[green]Op-trace saved to:[/green] {trace_output}") +``` + +- [ ] **Step 6: Remove dead imports** + +At the top of `commands/perf.py`, remove the now-unused `from ..optracing import is_qnn_profiling_available`, `get_tracer`, etc. (search and remove). + +- [ ] **Step 7: Run CLI tests** + +Run: +```bash +uv run pytest tests/unit/commands/test_perf_optracing.py -v +``` +Expected: PASS (may need test updates to match new dispatch — fix per-test). + +- [ ] **Step 8: Manual CLI smoke (non-hardware-gated)** + +Run: +```bash +uv run winml perf --help | head -20 +``` +Expected: help output renders, no import errors. + +- [ ] **Step 9: Full test sanity** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: no regressions. + +- [ ] **Step 10: Ruff + commit** + +Run: +```bash +uv run ruff check src/winml/modelkit/commands/perf.py tests/unit/commands/ --fix +git add src/winml/modelkit/commands/perf.py tests/unit/commands/ tests/unit/optracing/ +git commit -m "feat(perf): collapse op-tracing block into integrated monitor path + +- Add _resolve_ep_monitor(ep, op_tracing, output_dir) dispatch helper +- Delete standalone op-tracing block (~50 lines) +- Pass monitor=QNNMonitor(...) to session.perf() in main benchmark +- Hard-fail when op-tracing requested against unsupported EP" +``` + +--- + +## Task 12: Delete `QNNProfiler` and related deprecated modules + +**Rationale:** With `commands/perf.py` no longer importing `QNNProfiler` / `is_qnn_profiling_available` / `get_tracer`, the old optracing classes can go. 
+ +**Files:** +- Delete: `src/winml/modelkit/optracing/qnn/profiler.py` +- Delete: `src/winml/modelkit/optracing/base.py` +- Delete: `src/winml/modelkit/optracing/registry.py` +- Delete: `tests/unit/optracing/test_qnn_profiler.py` +- Delete: `tests/unit/optracing/test_registry.py` +- Delete: `tests/unit/optracing/test_integration.py` (replaced by `tests/unit/session/test_perf_monitor_integration.py`) + +- [ ] **Step 1: Verify no remaining imports** + +Run: +```bash +uv run grep -rn "from winml.modelkit.optracing" src/ tests/ --include="*.py" | grep -v "^tests/unit/optracing" +``` +Expected: empty output. If not empty, redirect remaining imports first. + +Also: +```bash +uv run grep -rn "QNNProfiler\|OpTracer\|is_qnn_profiling_available\|get_tracer\|register_tracer" src/ tests/ --include="*.py" +``` +Expected: matches only inside `optracing/` directory itself (safe to delete). + +- [ ] **Step 2: Delete the files** + +Run: +```bash +git rm src/winml/modelkit/optracing/qnn/profiler.py +git rm src/winml/modelkit/optracing/base.py +git rm src/winml/modelkit/optracing/registry.py +git rm tests/unit/optracing/test_qnn_profiler.py +git rm tests/unit/optracing/test_registry.py +git rm tests/unit/optracing/test_integration.py +``` + +- [ ] **Step 3: Run full tests** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: PASS. + +- [ ] **Step 4: Commit** + +Run: +```bash +git commit -m "refactor(optracing): delete QNNProfiler, OpTracer, registry + +- QNNProfiler replaced by QNNMonitor (session/monitor/qnn_monitor.py) +- OpTracer ABC + registry collapsed into EPMonitor hierarchy +- Tests migrated to tests/unit/session/" +``` + +--- + +## Task 13: Delete `optracing/` package entirely (shims included) + +**Rationale:** All callers migrated. Shims in `optracing/result.py`, `report.py`, `qnn/*.py`, `__init__.py` can be removed. 
+ +**Files:** +- Delete: `src/winml/modelkit/optracing/` (entire directory) +- Delete: `tests/unit/optracing/` (entire directory, after moving the `__init__.py`) + +- [ ] **Step 1: Sanity check for final references** + +Run: +```bash +uv run grep -rn "winml.modelkit.optracing\|from .optracing\|from ..optracing\|from ...optracing" src/ tests/ --include="*.py" +``` +Expected: empty. + +- [ ] **Step 2: Delete directories** + +Run: +```bash +git rm -r src/winml/modelkit/optracing/ +git rm -r tests/unit/optracing/ +``` + +- [ ] **Step 3: Run full tests** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: PASS. + +- [ ] **Step 4: Ruff** + +Run: +```bash +uv run ruff check src/ tests/ --fix +``` + +- [ ] **Step 5: Commit** + +Run: +```bash +git commit -m "refactor: delete src/winml/modelkit/optracing/ package + +All functionality relocated to session/monitor/. Shims removed now +that no caller imports from the old paths." +``` + +--- + +## Task 14: Delete `WinMLSession._init_winml_eps_once` classmethod; use `ensure_initialized` module function + +**Rationale:** The classmethod is now redundant — the module function does the same thing and is what `QNNMonitor.is_available` uses. + +**Files:** +- Modify: `src/winml/modelkit/session/session.py` + +- [ ] **Step 1: Replace calls to `_init_winml_eps_once` with `ensure_initialized`** + +In `session.py`, find the classmethod `_init_winml_eps_once` (around line 149-163) and its one caller in `__init__` (around line 191). + +Replace the `__init__` call from: +```python +WinMLSession._init_winml_eps_once() +``` +to: +```python +from .ep_registry import ensure_initialized +ensure_initialized() +``` + +Remove the classmethod itself and the class attribute `_eps_initialized` if it was only used there. + +- [ ] **Step 2: Run tests** + +Run: +```bash +uv run pytest tests/ -x --tb=short -q +``` +Expected: PASS. 
+ +- [ ] **Step 3: Commit** + +Run: +```bash +uv run ruff check src/winml/modelkit/session/session.py --fix +git add src/winml/modelkit/session/session.py +git commit -m "refactor(session): remove _init_winml_eps_once classmethod + +Redundant with ep_registry.ensure_initialized() module function. +WinMLSession.__init__ now calls the module function directly." +``` + +--- + +## Task 15: Relocate design docs per spec §1.5.1 transitional commitment + +**Rationale:** Implementation complete. Per the Transitional Location note in both design docs, move them under `docs/design/session/monitor/`. + +**Files:** +- Move: `docs/design/optracing/` → `docs/design/session/monitor/` +- Modify: the two design docs to remove the Transitional Location note and update `Module` / cross-refs as needed. + +- [ ] **Step 1: Create the new directory** + +Run: +```bash +mkdir -p docs/design/session/monitor +``` + +- [ ] **Step 2: Move the docs and iterations** + +Run: +```bash +git mv docs/design/optracing/1_prd.md docs/design/session/monitor/1_prd.md +git mv docs/design/optracing/2_coreloop.md docs/design/session/monitor/2_coreloop.md +git mv docs/design/optracing/iterations docs/design/session/monitor/iterations +``` + +- [ ] **Step 3: Remove the Transitional Location note from both docs** + +In both `1_prd.md` and `2_coreloop.md`, delete the four-line `**Transitional Location**` block immediately after the metadata header. Replace it with a line-break only. + +- [ ] **Step 4: Bump Version to 2.2 with a Revision History entry** + +In both docs, change `**Version**: 2.1` → `**Version**: 2.2` and append a Revision History row: +```markdown +| 2.2 | 2026-04-23 | Relocated from `docs/design/optracing/` to `docs/design/session/monitor/` per spec §1.5.1 transitional commitment (implementation complete). Removed Transitional Location note. 
| +``` + +- [ ] **Step 5: Delete the now-empty optracing doc directory** + +Run: +```bash +rmdir docs/design/optracing 2>&1 || true +``` + +- [ ] **Step 6: Update any cross-references** + +Run: +```bash +uv run grep -rn "docs/design/optracing" --include="*.md" +``` + +For each match, update the path. Notably: `docs/standards/design-doc-spec.md` §7.4 references `docs/design/optracing/1_prd.md` + `2_coreloop.md` — update to `docs/design/session/monitor/1_prd.md` + `2_coreloop.md`. + +- [ ] **Step 7: Commit** + +Run: +```bash +git add docs/design/ docs/standards/ +git commit -m "docs: relocate op-tracing design to docs/design/session/monitor/ + +Per spec §1.5.1 transitional commitment — implementation landed, so docs +move to their spec-compliant location under the target module directory. +Version bumped 2.1 → 2.2. Transitional Location note removed." +``` + +--- + +## Task 16: Final end-to-end verification + +**Files:** (none modified) + +- [ ] **Step 1: Full test suite** + +Run: +```bash +uv run pytest tests/ -v --tb=short +``` +Expected: all pass (or only pre-existing failures noted in Task 0 baseline). + +- [ ] **Step 2: Ruff clean** + +Run: +```bash +uv run ruff check src/ tests/ docs/ +``` +Expected: no findings. + +- [ ] **Step 3: Verify the CLI import smoke** + +Run: +```bash +uv run winml perf --help +``` +Expected: help renders. + +- [ ] **Step 4: Check for any stale `optracing` references anywhere** + +Run: +```bash +uv run grep -rn "optracing" src/ tests/ docs/standards/ --include="*.py" --include="*.md" +``` +Expected: no matches (or only matches in the Revision History entries / Migration Footprint, which are historical). + +- [ ] **Step 5: Hardware-gated E2E (if QNN NPU available)** + +Run: +```bash +uv run winml perf -m microsoft/resnet-50 --device npu --op-tracing basic +``` +Expected (on QNN hardware): CSV produced, per-op report rendered, JSON file written. On non-QNN machines: helpful error message. 
+ +- [ ] **Step 6: Verify SC-1 through SC-6 from PRD** + +- **SC-1** ✓ if step 5 produced valid output on a QNN machine. +- **SC-2** ✓ via step 4 (no `optracing` references). +- **SC-3** ✓ covered by `test_qnn_monitor.py` (plus `test_qnn_monitor_availability.py`, if Task 10 created it). +- **SC-4** ✓ — the 8-line idiom works (covered by integration tests). +- **SC-5** ✓ — step 1. +- **SC-6** ✓ — `display_op_trace_report` / `write_op_trace_json` consume `OpTraceResult`; `OpTraceResult.to_dict()` preserved. + +- [ ] **Step 7: Final commit if any cleanup was done** + +Run: +```bash +git status +# if anything left: +git add -A +git commit -m "chore: final cleanup from op-tracing refactor E2E" +``` + +- [ ] **Step 8: Summarize for PR description** + +Draft a PR description citing: +- Bug fixed (D-1: `QNNProfiler` broken on `onnxruntime-windowsml`) +- Architectural simplification (`OpTracer` hierarchy merged into `EPMonitor`) +- Spec compliance (first doc pair authored against `design-doc-spec.md` v1.1) +- All 6 SCs (SC-1 through SC-6) from the PRD + the Transitional Location commitment honored. + +--- + +## Self-Review + +**Spec coverage:** Each PRD section has a task. +- FR-1 (both ORT variants) → Task 9 `is_available` + Task 11 dispatch +- FR-2 (`session.perf(monitor=...)`) → Tasks 6, 8 +- FR-3 (single hierarchy) → Tasks 12, 13 +- FR-4 (`QNNMonitor` replaces `QNNProfiler`) → Tasks 9, 12 +- FR-5 (basic/detail levels) → Task 9 +- FR-6 (`OpTraceResult` preserved + extended) → Task 3 +- FR-7 (8-line standalone idiom) → verified in Task 16.6 +- FR-8 (availability) → Task 9 +- FR-9 (HWMonitor orthogonal) → no task needed; already orthogonal +- FR-10 (EPMonitor hooks) → Task 2 +- FR-11 (factory dispatch, no registry) → Task 11 +- FR-12 (no `os.chdir`) → Task 9 uses glob fallback +- NFR-1 through NFR-7 covered in Tasks 2, 8, 9 +- All risks (R-1 through R-6) have mitigations implemented (teardown ordering, gc.collect, exception transparency, fresh tempdir, WARNING log, retry on CSV lag) + +**Placeholder scan:** No TBDs. 
Every code block is complete. + +**Type consistency:** `OpTraceResult` status field default `"ok"` consistent between Task 3 (where added) and Task 9 (where used). `ensure_initialized` function signature consistent between Task 1 (where added) and Task 9 (where called). `PerfContext(stats, monitor)` consistent between Task 6 and Task 8. + +**One risk worth flagging:** The `test_teardown_ordering_reset_before_monitor_exit` test in Task 8 is written to check the final state (`session._session is None`) rather than capture the intermediate state during `monitor.__exit__`. A more rigorous test would inject an observer into `monitor.__exit__` that checks `session._session` at that exact moment. If the subagent executing Task 8 wants stronger verification, they may add that — the weaker check is sufficient for the load-bearing invariant given the implementation is direct. + +--- + +## Execution Handoff + +**Plan complete and saved to `docs/superpowers/plans/2026-04-23-op-tracing-refactor.md`. Two execution options:** + +**1. Subagent-Driven (recommended)** — Dispatch a fresh subagent per task, review between tasks, fast iteration. This plan has 16 self-contained tasks well-suited to this pattern. + +**2. Inline Execution** — Execute tasks in this session using `executing-plans`, batch execution with checkpoints for review. + +**Which approach?** diff --git a/src/winml/modelkit/__init__.py b/src/winml/modelkit/__init__.py index c10fcc9de..59a45c313 100644 --- a/src/winml/modelkit/__init__.py +++ b/src/winml/modelkit/__init__.py @@ -28,15 +28,13 @@ model = WinMLAutoModel.from_pretrained("facebook/convnext-tiny-224", config=config) """ +import logging from importlib.metadata import PackageNotFoundError, version + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + from . 
import _warnings # Configure warning filters before importing subpackages -from .config import WinMLBuildConfig -from .models import ( - WinMLAutoModel, - WinMLModelForImageClassification, - WinMLPreTrainedModel, -) try: @@ -51,3 +49,33 @@ "WinMLPreTrainedModel", "__version__", ] + + +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + "WinMLBuildConfig": (".config", "WinMLBuildConfig"), + "WinMLAutoModel": (".models", "WinMLAutoModel"), + "WinMLPreTrainedModel": (".models", "WinMLPreTrainedModel"), + "WinMLModelForImageClassification": (".models", "WinMLModelForImageClassification"), +} + + +def __getattr__(name: str): + """Lazy-load heavy exports on first access (PEP 562). + + This avoids importing torch/transformers/optimum (~30s) when only + lightweight operations are needed (e.g., ``winml --help``). + """ + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + import importlib + + mod = importlib.import_module(module_path, __name__) + val = getattr(mod, attr_name) + globals()[name] = val + return val + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + """Include lazy attributes in dir() for debugger/IPython compatibility.""" + return list(set(list(globals()) + __all__)) diff --git a/src/winml/modelkit/_warnings.py b/src/winml/modelkit/_warnings.py index 530be88ab..fbe08288d 100644 --- a/src/winml/modelkit/_warnings.py +++ b/src/winml/modelkit/_warnings.py @@ -44,38 +44,50 @@ class _DiffusersDistributionFilter(logging.Filter): def filter(self, record: logging.LogRecord) -> bool: return "Multiple distributions found" not in record.getMessage() - logging.getLogger("diffusers.utils.import_utils").addFilter( - _DiffusersDistributionFilter() - ) + logging.getLogger("diffusers.utils.import_utils").addFilter(_DiffusersDistributionFilter()) - class _HFPipelineFalsePositiveFilter(logging.Filter): - """Filter false-positive HF pipeline warnings when using WinML models. 
+ class _PipelineNoiseFilter(logging.Filter): + """Filter noisy HF Pipeline warnings. - HF pipeline emits these because WinMLModel wraps ONNX via ORT, not a - native HF model class. These are expected and not actionable. + - 'The model X is not supported for Y' — WinML models are duck-type + compatible but not in HF's supported list. + - 'Device set to use cpu' — HF Pipeline forces CPU, we handle device. + - 'Using a slow image processor' — cosmetic deprecation notice. """ - _FALSE_POSITIVES = ( - "WinMLModel", # False positive warning which says WinML is not native HF model class - "Device set to use", # PyTorch tensor device, not ONNX device - "Using a slow image processor", # expected when using processor with pipeline. + _SUPPRESSED = ( + "is not supported for", + "Device set to use cpu", + "Using a slow image processor", ) def filter(self, record: logging.LogRecord) -> bool: msg = record.getMessage() - return not any(phrase in msg for phrase in self._FALSE_POSITIVES) + return not any(s in msg for s in self._SUPPRESSED) - for _name in ( - "transformers.pipelines.base", - "transformers.models.auto.image_processing_auto", - ): - logging.getLogger(_name).addFilter(_HFPipelineFalsePositiveFilter()) + logging.getLogger("transformers.pipelines.base").addFilter(_PipelineNoiseFilter()) # ========================================================================= # Warning filters (for warnings.warn() calls) # ========================================================================= - warnings.filterwarnings("ignore", category=FutureWarning, module=r"transformers.*") - warnings.filterwarnings("ignore", category=UserWarning, module=r"torch.*") + # Transformers: suppress cosmetic warnings (not RuntimeWarning/ResourceWarning) + for _cat in (FutureWarning, DeprecationWarning, UserWarning): + warnings.filterwarnings("ignore", category=_cat, module=r"transformers\..*") + + # PyTorch: suppress cosmetic warnings (not RuntimeWarning/ResourceWarning) + for _cat in (FutureWarning, 
DeprecationWarning, UserWarning): + warnings.filterwarnings("ignore", category=_cat, module=r"torch\..*") + + # TracerWarning (from torch.jit, inherits Warning not UserWarning) + # fires during ONNX export tracing — safe to suppress in both torch and transformers + try: + from torch.jit import TracerWarning + + warnings.filterwarnings("ignore", category=TracerWarning) + except ImportError: + pass # torch not installed + + # Diffusers warnings.filterwarnings( "ignore", message=r".*CUDA.*", category=UserWarning, module=r"diffusers.*" ) diff --git a/src/winml/modelkit/analyze/analyzer.py b/src/winml/modelkit/analyze/analyzer.py index edb119412..e44a804e3 100644 --- a/src/winml/modelkit/analyze/analyzer.py +++ b/src/winml/modelkit/analyze/analyzer.py @@ -23,6 +23,8 @@ if TYPE_CHECKING: + from collections.abc import Callable + import onnx from .models.information import Action @@ -492,6 +494,8 @@ def analyze( htp_metadata_path: str | None = None, run_unknown_op: bool = True, save_node_types: set[str] | None = None, + on_node_result: Callable | None = None, + on_ep_start: Callable | None = None, ) -> AnalysisResult: """Analyze ONNX model for runtime support. @@ -590,6 +594,8 @@ def analyze( htp_metadata_path=htp_metadata_path, run_unknown_op=run_unknown_op, save_node_types=save_node_types, + on_node_result=on_node_result, + on_ep_start=on_ep_start, ) def analyze_from_proto( @@ -602,6 +608,8 @@ def analyze_from_proto( htp_metadata_path: str | None = None, run_unknown_op: bool = True, save_node_types: set[str] | None = None, + on_node_result: Callable | None = None, + on_ep_start: Callable | None = None, ) -> AnalysisResult: """Analyze ONNX model from ModelProto object. 
@@ -691,6 +699,11 @@ def analyze_from_proto( for current_ep in eps_to_analyze: logger.info("Checking runtime support for %s...", current_ep) + if on_ep_start: + try: + on_ep_start(current_ep, metadata.operator_counts) + except Exception: + logger.debug("on_ep_start callback failed", exc_info=True) runtime_checker = RuntimeChecker( ep=current_ep, @@ -708,6 +721,7 @@ def analyze_from_proto( patterns=pattern_matches, run_unknown_op=run_unknown_op_for_ep, save_node_types=save_node_types, + on_node_result=on_node_result, ) # Convert runtime summary to expected format @@ -727,7 +741,6 @@ def analyze_from_proto( ep=current_ep, model=onnx_model, device=device_to_use, - shape_inferred_model_proto=runtime_checker.get_shape_inferred_model_proto(), ) information_list[current_ep] = engine.summary() # Use EP name as key @@ -786,6 +799,8 @@ def analyze_onnx( ep: str | None = None, device: str | None = None, autoconf: bool = True, + on_ep_start: Callable | None = None, + on_node_result: Callable | None = None, ) -> AnalyzeResult: """Analyze an ONNX model and return lint + autoconf results. 
@@ -841,6 +856,8 @@ def analyze_onnx( ep=ep, device=device, enable_information=autoconf, + on_ep_start=on_ep_start, + on_node_result=on_node_result, ) # Extract lint result (always computed — uses RuntimeChecker classification) diff --git a/src/winml/modelkit/analyze/core/runtime_checker.py b/src/winml/modelkit/analyze/core/runtime_checker.py index 87145e07c..4c8e65054 100644 --- a/src/winml/modelkit/analyze/core/runtime_checker.py +++ b/src/winml/modelkit/analyze/core/runtime_checker.py @@ -26,6 +26,8 @@ if TYPE_CHECKING: + from collections.abc import Callable + import onnx from winml.modelkit.pattern.match import PatternMatchResult @@ -142,21 +144,35 @@ def _get_query(self) -> RuntimeCheckerQuery: return self._query - def get_shape_inferred_model_proto(self) -> onnx.ModelProto | None: - """Return the shape-inferred model proto from the cached query, if available.""" - if self._query is not None: - return self._query.model_proto - return None - def op_support( self, run_unknown_op: bool = True, save_node_types: set[str] | None = None, + on_node_result: Callable | None = None, ) -> list[PatternRuntime]: """Check operator-level runtime support. Returns operator-level runtime check results for each operator. + Args: + on_node_result: Optional per-node progress callback. + When provided, tqdm progress bar is suppressed (caller + handles progress display via Rich Live). + + Signature:: + + (result: PatternRuntime) -> None + + The ``PatternRuntime`` passed to the callback has: + + - ``pattern_id`` (str): Full pattern ID, e.g. + ``"OP/ai.onnx/Conv"``. Use ``split("/")[-1]`` to get + the display name (``"Conv"``). + - ``result.classification`` (SupportLevel): The support + level enum. Call ``.value`` to get the string, e.g. + ``"supported"``, ``"partial"``, ``"unsupported"``, + ``"unknown"``. 
+ Returns: List[PatternRuntime]: Runtime results for each operator pattern @@ -177,15 +193,21 @@ def op_support( model_proto = self._model.get_model() # Get cached RuntimeCheckerQuery query = self._get_query() - for node in tqdm.tqdm(model_proto.graph.node): - # Run runtime check for node - results.append( # noqa: PERF401 - query.run_for_node( - node, - run_unknown_op=run_unknown_op, - save_node_types=save_node_types, - ) + # Use tqdm for progress unless caller provides a callback + nodes = model_proto.graph.node + iterator = nodes if on_node_result else tqdm.tqdm(nodes) + for node in iterator: + result = query.run_for_node( + node, + run_unknown_op=run_unknown_op, + save_node_types=save_node_types, ) + results.append(result) + if on_node_result: + try: + on_node_result(result) + except Exception: + logger.debug("on_node_result callback failed", exc_info=True) logger.info("Checked %d operators", len(results)) @@ -302,6 +324,7 @@ def summary( patterns: list[PatternMatchResult] | None = None, run_unknown_op: bool = True, save_node_types: set[str] | None = None, + on_node_result: Callable | None = None, ) -> dict[str, list[PatternRuntime]]: """Combine operator-level & pattern-level runtime results. 
@@ -325,6 +348,7 @@ def summary( op_results = self.op_support( run_unknown_op=run_unknown_op, save_node_types=save_node_types, + on_node_result=on_node_result, ) summary_dict["op_runtime_check_result"] = op_results diff --git a/src/winml/modelkit/build/__init__.py b/src/winml/modelkit/build/__init__.py index d20f9f18e..2a6c6f107 100644 --- a/src/winml/modelkit/build/__init__.py +++ b/src/winml/modelkit/build/__init__.py @@ -29,4 +29,31 @@ from .onnx import build_onnx_model -__all__ = ["BuildResult", "build_hf_model", "build_onnx_model"] +__all__ = [ + "BuildResult", + "build_hf_model", + "build_onnx_model", +] + + +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + "run_optimize_analyze_loop": (".common", "run_optimize_analyze_loop"), + "write_module_summary": (".module_summary", "write_module_summary"), +} + + +def __getattr__(name: str): + """Lazy-load build helpers to avoid pulling in heavy deps at import time.""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + import importlib + + mod = importlib.import_module(module_path, __name__) + val = getattr(mod, attr_name) + globals()[name] = val + return val + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return list(set(list(globals()) + __all__)) diff --git a/src/winml/modelkit/build/common.py b/src/winml/modelkit/build/common.py index d64cddbfd..096700cf3 100644 --- a/src/winml/modelkit/build/common.py +++ b/src/winml/modelkit/build/common.py @@ -35,6 +35,11 @@ def run_optimize_analyze_loop( ep: str | None = None, device: str | None = None, max_optim_iterations: int = 0, + on_ep_start: Any = None, + on_node_result: Any = None, + on_iteration_start: Any = None, + on_patterns_discovered: Any = None, + on_reoptimize: Any = None, **onnx_kwargs: Any, ) -> tuple[Path, float, int, int, dict]: """Optimize an ONNX model, analyze, and optionally re-optimize via autoconf. 
@@ -72,18 +77,77 @@ def run_optimize_analyze_loop( **onnx_kwargs, **config.optim, ) + current_path = optimized_path + + # Autoconf: analyze model, discover missing optimizations, re-optimize + if max_optim_iterations > 0: + analyze_iterations, analyze_black_nodes, analyze_details = _run_analyze_loop( + optimized_path=optimized_path, + ep=ep, + device=device, + max_optim_iterations=max_optim_iterations, + config=config, + on_ep_start=on_ep_start, + on_node_result=on_node_result, + on_iteration_start=on_iteration_start, + on_patterns_discovered=on_patterns_discovered, + on_reoptimize=on_reoptimize, + **onnx_kwargs, + ) + else: + analyze_iterations, analyze_black_nodes, analyze_details = 0, 0, {} + + elapsed = time.monotonic() - t0 + return current_path, elapsed, analyze_iterations, analyze_black_nodes, analyze_details + - # 2. Analyze - analysis = analyze_onnx(optimized_path, ep=ep, device=device) - analyze_count = 1 +def _run_analyze_loop( + *, + optimized_path: Path, + ep: str | None, + device: str | None, + max_optim_iterations: int, + config: WinMLBuildConfig, + on_ep_start: Any = None, + on_node_result: Any = None, + on_iteration_start: Any = None, + on_patterns_discovered: Any = None, + on_reoptimize: Any = None, + **kwargs: Any, +) -> tuple[int, int, dict]: + """Run iterative analyzer autoconf loop in a temp folder. + + Each iteration applies ONLY the autoconf flags (not merged with original). + A separate dict accumulates all discovered flags for persistence. + """ + analyze_iterations = 0 + analyze_black_nodes = 0 discovered_optim: dict[str, bool] = {} + analysis = None + _not_converged = False # 3. 
Autoconf re-optimization loop with tempfile.TemporaryDirectory() as tmp: iter_model = Path(tmp) / "iter.onnx" - copied = False + copy_onnx_model(optimized_path, iter_model) for _iteration in range(max_optim_iterations): + # Notify: iteration starting + if on_iteration_start is not None: + on_iteration_start( + _iteration + 1, + max_optim_iterations, + ) + + analysis = analyze_onnx( + iter_model, + ep=ep, + device=device, + on_ep_start=on_ep_start, + on_node_result=on_node_result, + ) + analyze_iterations += 1 + if not analysis.autoconf: break @@ -93,23 +157,41 @@ def run_optimize_analyze_loop( analysis.optimization_config.to_dict(), ) - if not copied: - copy_onnx_model(optimized_path, iter_model) - copied = True + # Notify: patterns discovered + if on_patterns_discovered is not None: + on_patterns_discovered(analysis.optimization_config) + # Notify: re-optimizing with discovered flags + if on_reoptimize is not None: + on_reoptimize(analysis.optimization_config) + + # Re-optimize with ONLY the autoconf flags (not merged with original) optimize_onnx( model=iter_model, output=iter_model, - **onnx_kwargs, + **kwargs, **analysis.optimization_config, ) discovered_optim.update(analysis.optimization_config) + else: + logger.warning( + "Autoconf did not converge after %d iteration(s)", + max_optim_iterations, + ) + _not_converged = True + + # Always analyze final state (validates after last optimize). + # Pass a no-op on_node_result to suppress tqdm (which would + # break the Rich Live display). No on_ep_start to avoid + # duplicate EP bars. + analysis = analyze_onnx( + iter_model, + ep=ep, + device=device, + on_node_result=lambda _: None, + ) - analysis = analyze_onnx(iter_model, ep=ep, device=device) - analyze_count += 1 - - if copied: - copy_onnx_model(iter_model, optimized_path) + copy_onnx_model(iter_model, optimized_path) # 4. 
Wrap up if discovered_optim: @@ -122,22 +204,27 @@ def run_optimize_analyze_loop( analysis.optimization_config.to_dict(), ) - if analysis.has_errors: + if analysis is not None and analysis.has_errors: raise RuntimeError( - f"Unsupported nodes persist after {analyze_count} analyze " + f"Unsupported nodes persist after {analyze_iterations} analyze " f"pass(es): {analysis.lint.error_patterns}" ) - details = { - "lint": { - "errors": analysis.lint.errors, - "warnings": analysis.lint.warnings, - "passed": analysis.lint.passed, - "error_patterns": analysis.lint.error_patterns, - "warning_patterns": analysis.lint.warning_patterns, - }, - "autoconf": discovered_optim or {}, - } - - elapsed = time.monotonic() - t0 - return optimized_path, elapsed, analyze_count, analysis.lint.errors, details + analyze_black_nodes = analysis.lint.errors if analysis else 0 + + # Build details for manifest + details: dict = {} + if analysis: + details = { + "lint": { + "errors": analysis.lint.errors, + "warnings": analysis.lint.warnings, + "passed": analysis.lint.passed, + "error_patterns": analysis.lint.error_patterns, + "warning_patterns": analysis.lint.warning_patterns, + }, + "autoconf": discovered_optim or {}, + "autoconf_not_converged": _not_converged, + } + + return analyze_iterations, analyze_black_nodes, details diff --git a/src/winml/modelkit/cli.py b/src/winml/modelkit/cli.py index 263494c87..bc4f89c54 100644 --- a/src/winml/modelkit/cli.py +++ b/src/winml/modelkit/cli.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- """WinML ModelKit CLI - Universal ONNX export from command line. -This module provides the main CLI entry point for ModelKit with automatic +This module provides the main CLI entry point for ModelKit with lazy command discovery from the commands/ directory. 
Usage: @@ -19,6 +19,7 @@ from __future__ import annotations +import ast import logging from importlib import import_module from pathlib import Path @@ -26,94 +27,130 @@ import click from . import __version__ +from .utils.logging import configure_logging logger = logging.getLogger(__name__) +_COMMANDS_DIR = Path(__file__).parent / "commands" -@click.group() -@click.version_option(version=__version__, prog_name="winml") -@click.option( - "--debug", - is_flag=True, - default=False, - help="Enable debug logging", -) -@click.pass_context -def main(ctx: click.Context, debug: bool) -> None: - """WML ModelKit - Accelerate Model Deployment on WinML. - - Universal ONNX export with QNN and OpenVINO backend support. - """ - # Configure logging based on debug flag - log_level = logging.DEBUG if debug else logging.INFO - logging.basicConfig( - level=log_level, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - ) - - # Store debug flag in context for subcommands - ctx.ensure_object(dict) - ctx.obj["debug"] = debug - - -def _discover_commands() -> None: - """Auto-discover Click commands from commands/ directory. - This function scans the commands/ directory for Python modules and - registers any Click commands found. Commands are registered using - the module filename as the command name. +def _parse_click_help(path: Path) -> str: + """Extract short help from a command module without importing it. - Command Discovery Rules: - - Skips files starting with underscore (_) - - Looks for any object that is a click.Command instance - - Uses module filename (without .py) as command name + Parses the module's AST to find the first decorated function's docstring, + which Click uses as the command help text. 
+ """ + try: + tree = ast.parse(path.read_text(encoding="utf-8")) + except (SyntaxError, OSError): + return "" + + for node in ast.iter_child_nodes(tree): + if isinstance(node, ast.FunctionDef) and node.decorator_list: + docstring = ast.get_docstring(node) + if docstring: + # Return first line only (Click's short help) + return docstring.split("\n")[0] + return "" + + +class LazyGroup(click.Group): + """Click group that defers command module imports until invoked. + + Instead of importing every command module at startup, this group reads + command names from the filesystem and only imports a module when the + user actually invokes that command. Help text is extracted via AST + parsing (no module execution). """ - commands_dir = Path(__file__).parent / "commands" - - # Early exit if commands directory doesn't exist - if not commands_dir.exists(): - logger.debug("Commands directory not found: %s", commands_dir) - return - # Scan for Python modules - for py_file in commands_dir.glob("*.py"): - # Skip private modules - if py_file.name.startswith("_"): - continue + def list_commands(self, ctx: click.Context) -> list[str]: + """Return command names from filesystem — no module imports.""" + if not _COMMANDS_DIR.exists(): + return [] + return sorted(p.stem for p in _COMMANDS_DIR.glob("*.py") if not p.name.startswith("_")) - module_name = py_file.stem + def get_command(self, ctx: click.Context, cmd_name: str) -> click.Command | None: + """Import command module only when the command is actually invoked.""" try: - # Import the module module = import_module( - f".commands.{module_name}", + f".commands.{cmd_name}", package=__package__, ) - - # Find Click command in module - # Prefer click.Group over click.Command for hierarchical commands - discovered_command = None - for attr_name in dir(module): - attr = getattr(module, attr_name) - if isinstance(attr, click.Group): - discovered_command = attr - break - if isinstance(attr, click.Command) and discovered_command is None: - 
discovered_command = attr - - if discovered_command: - # Register command with module name - main.add_command(discovered_command, name=module_name) - logger.debug("Discovered command: %s", module_name) - except ImportError as e: - logger.warning("Failed to import command module %s: %s", module_name, e) + logger.warning("Failed to import command module %s: %s", cmd_name, e) + return None except Exception as e: - logger.error("Error loading command %s: %s", module_name, e) + logger.error("Error loading command %s: %s", cmd_name, e) + return None + + # Find Click command in module (prefer Group over Command) + discovered = None + for attr_name in dir(module): + attr = getattr(module, attr_name) + if isinstance(attr, click.Group): + return attr + if isinstance(attr, click.Command) and discovered is None: + discovered = attr + return discovered + + def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: + """Format command list using AST-parsed help (no module imports).""" + commands = [] + for cmd_name in self.list_commands(ctx): + help_text = _parse_click_help(_COMMANDS_DIR / f"{cmd_name}.py") + commands.append((cmd_name, help_text)) + + if commands: + limit = max(1, formatter.width - 6 - max(len(name) for name, _ in commands)) + rows = [] + for name, help_text in commands: + short = help_text[:limit].rstrip() if help_text else "" + rows.append((name, short)) + + with formatter.section("Commands"): + formatter.write_dl(rows) + + +@click.group(cls=LazyGroup) +@click.version_option(version=__version__, prog_name="winml") +@click.option( + "--verbose", + "-v", + count=True, + help="Increase verbosity (-v=INFO, -vv=DEBUG)", +) +@click.option( + "--quiet", + "-q", + is_flag=True, + default=False, + help="Quiet mode - errors only", +) +@click.option( + "--debug", + is_flag=True, + default=False, + help="Alias for -vv (DEBUG logging)", + hidden=True, +) +@click.pass_context +def main(ctx: click.Context, verbose: int, quiet: bool, debug: bool) 
-> None: + """WML ModelKit - Accelerate Model Deployment on WinML. + Universal ONNX export with QNN and OpenVINO backend support. + """ + # --debug is a backward-compat alias for -vv + if debug: + verbose = max(verbose, 2) -# Discover and register commands at module load time -_discover_commands() + configure_logging(verbosity=verbose, quiet=quiet) + + # Store verbosity in context for subcommands + ctx.ensure_object(dict) + ctx.obj["debug"] = debug or verbose >= 2 + ctx.obj["verbosity"] = verbose + ctx.obj["quiet"] = quiet if __name__ == "__main__": diff --git a/src/winml/modelkit/commands/live_chart.py b/src/winml/modelkit/commands/_live_chart.py similarity index 100% rename from src/winml/modelkit/commands/live_chart.py rename to src/winml/modelkit/commands/_live_chart.py diff --git a/src/winml/modelkit/commands/analyze.py b/src/winml/modelkit/commands/analyze.py index 7c628a884..0c4874d23 100644 --- a/src/winml/modelkit/commands/analyze.py +++ b/src/winml/modelkit/commands/analyze.py @@ -4,17 +4,11 @@ # -------------------------------------------------------------------------- """Analyze command for winml CLI. -This module provides the analyze command that analyzes ONNX models -for runtime support across NPU execution providers. +Analyzes ONNX models for runtime support with Rich Live stacked bar +visualization, showing real-time per-node progress display. 
Usage: - winml analyze --model MODEL --ep EP --device DEVICE [OPTIONS] - -Examples: - winml analyze --model model.onnx --ep QNNExecutionProvider --device NPU - winml analyze --model model.onnx --ep qnn --device NPU - winml analyze --model model.onnx --ep ov --device GPU --information - winml analyze --model model.onnx --ep vitis --device GPU --output results.json + winml analyze --model MODEL [--ep EP] [--device DEVICE] [OPTIONS] """ from __future__ import annotations @@ -24,6 +18,11 @@ from pathlib import Path import click +from rich.console import Console +from rich.live import Live +from rich.logging import RichHandler +from rich.table import Table +from rich.text import Text from ..utils import cli as cli_utils from ..utils.constants import normalize_ep_name @@ -32,8 +31,354 @@ logger = logging.getLogger(__name__) - -@click.command(name="analyze") # type: ignore[misc] +# ── Rich visualization helpers ──────────────────────────────────────────── + +MAX_BAR_WIDTH = 40 + +_COLORS = { + "supported": "green", + "partial": "yellow", + "unsupported": "red", + "unknown": "bright_black", +} + + +def _display_name(pattern_id: str) -> str: + """Extract operator display name from pattern_id ('OP/ai.onnx/Conv' -> 'Conv').""" + return pattern_id.split("/")[-1] + + +_LEVEL_ICONS = [ + ("unsupported", "🔴"), + ("partial", "🟡"), + ("unknown", "🔵"), +] + + +def _worst_level_icon(counts: dict[str, int]) -> str: + """Return icon for the worst support level present (lower bound).""" + for level, icon in _LEVEL_ICONS: + if counts.get(level, 0) > 0: + return icon + return "🟢" + + +def _build_stacked_bar(counts: dict[str, int], max_count: int) -> Text: + """Build a stacked bar where total width is proportional to max_count.""" + total = sum(counts.values()) + if total == 0: + return Text() + + bar_width = max(1, round(total / max_count * MAX_BAR_WIDTH)) + # Ensure bar can fit all non-zero segments + nonzero = sum(1 for v in counts.values() if v > 0) + bar_width = max(bar_width, 
nonzero) + + bar = Text() + chars_used = 0 + + for level in ("supported", "partial", "unsupported", "unknown"): + count = counts.get(level, 0) + if count == 0: + continue + width = max(1, round(count / total * bar_width)) + width = min(width, bar_width - chars_used) + bar.append("█" * width, style=_COLORS[level]) + chars_used += width + + return bar + + +def _build_analyzed_text(counts: dict[str, int]) -> Text: + """Build 'W/G/B' format like '53/0/0' or '12/5/1' with colors.""" + w = counts.get("supported", 0) + g = counts.get("partial", 0) + b = counts.get("unsupported", 0) + u = counts.get("unknown", 0) + + text = Text() + text.append(str(w), style="bold green") + text.append("/", style="dim") + text.append(str(g), style="bold yellow" if g > 0 else "dim") + text.append("/", style="dim") + text.append(str(b), style="bold red" if b > 0 else "dim") + if u > 0: + text.append("/", style="dim") + text.append(str(u), style="bold bright_black") + return text + + +def _build_analysis_table( + data: dict[str, dict[str, int]], + ep_name: str = "", + complete: bool = False, + all_ops: dict[str, int] | None = None, +) -> Table: + """Build the analysis table with variable-width stacked bars. + + Args: + data: Per-op instance counts (filled in as analysis progresses). + Ops with data show colored bars (partial or complete). + Ops in all_ops but not in data show dim pending rows. 
+ ep_name: EP name for title + complete: Show complete marker + all_ops: All op types with total counts (for showing pending rows) + """ + # Build display order: all_ops sorted by count, or just data if no all_ops + if all_ops: + display_order = sorted(all_ops, key=lambda x: all_ops[x], reverse=True) + else: + display_order = sorted(data, key=lambda x: sum(data[x].values()), reverse=True) + + # Max count for bar width scaling (anchored to all_ops for stable bars during animation) + if all_ops: + max_count = max(all_ops.values(), default=1) + else: + max_count = max((sum(v.values()) for v in data.values()), default=1) + + title = "📊 OP CHECK" + if ep_name: + title += f" — [bold cyan]{ep_name}[/bold cyan]" + if complete: + title += " [bold green]✅ Complete[/bold green]" + + table = Table( + title=title, + show_header=True, + header_style="bold", + box=None, + padding=(0, 1), + expand=False, + ) + + table.add_column("Op Type", width=28, no_wrap=True) + table.add_column("S/P/U", width=14, no_wrap=True) + table.add_column("", no_wrap=True) + + agg: dict[str, int] = {"supported": 0, "partial": 0, "unsupported": 0, "unknown": 0} + + for op_type in display_order: + total = all_ops.get(op_type, 0) if all_ops else sum(data.get(op_type, {}).values()) + counts = data.get(op_type) + + if not counts: + # No data yet — fully pending + bar_width = max(1, round(total / max_count * MAX_BAR_WIDTH)) if max_count else 1 + table.add_row( + Text(f" {op_type} ({total})", style="dim"), + Text("...", style="dim"), + Text("░" * bar_width, style="dim"), + ) + else: + # Has data — show progress (partial or complete) + analyzed_for_op = sum(counts.values()) + for level in agg: + agg[level] += counts.get(level, 0) + + icon = _worst_level_icon(counts) + op_label = Text() + op_label.append(f"{icon} ") + op_label.append(op_type, style="cyan") + if analyzed_for_op < total: + op_label.append(f" ({analyzed_for_op}/{total})", style="dim") + else: + op_label.append(f" ({total})", style="dim") + + # 
Build bar: colored portion (analyzed) + dim portion (remaining) + bar = _build_stacked_bar(counts, max_count) + remaining = total - analyzed_for_op + if remaining > 0: + remaining_width = max(1, round(remaining / max_count * MAX_BAR_WIDTH)) + bar.append("░" * remaining_width, style="dim") + + table.add_row(op_label, _build_analyzed_text(counts), bar) + + # Summary row + table.add_section() + total_ops = sum(all_ops.values()) if all_ops else sum(agg.values()) + analyzed_count = sum(agg.values()) + total_label = Text() + total_label.append("TOTAL", style="bold") + if analyzed_count < total_ops: + total_label.append(f" ({analyzed_count}/{total_ops})", style="dim") + else: + total_label.append(f" ({total_ops})", style="dim") + + # TOTAL bar: colored portion + dim remainder + total_bar = _build_stacked_bar(agg, max(total_ops, 1)) + total_remaining = total_ops - analyzed_count + if total_remaining > 0: + total_remaining_width = max(1, round(total_remaining / max(total_ops, 1) * MAX_BAR_WIDTH)) + total_bar.append("░" * total_remaining_width, style="dim") + + table.add_row( + total_label, + _build_analyzed_text(agg), + total_bar, + ) + + return table + + +_STATUS_ICONS = {"s": "🟢", "p": "🟡", "u": "🔴", "uk": "🔵"} +_PATTERN_STATUS_LABELS = {"s": "supported", "p": "partial", "u": "unsupported", "uk": "unknown"} +_SUPPORT_LEVEL_TO_SHORT = { + "supported": "s", + "partial": "p", + "unsupported": "u", + "unknown": "uk", +} + + +_PAT_COLORS = {"s": "green", "p": "yellow", "u": "red", "uk": "bright_black"} + + +def _render_pattern_matching( + console: Console, + ep_patterns: dict[str, dict[str, dict]], +) -> None: + """Render the PATTERN MATCHING section — per-EP pattern support.""" + if not any(ep_patterns.values()): + return + + console.print("═" * 80) + console.print("🔍 [bold]PATTERN MATCHING[/bold]") + console.print("═" * 80) + + for ep_name, patterns in ep_patterns.items(): + if not patterns: + continue + + console.print(f" 💻 [bold cyan]{ep_name}[/bold cyan]") + + for pat_id, 
pat_info in sorted(patterns.items(), key=lambda x: x[1]["count"], reverse=True): + status = pat_info["status"] + count = pat_info["count"] + icon = _STATUS_ICONS.get(status, "❓") + label = _PATTERN_STATUS_LABELS.get(status, "unknown") + console.print( + f" {icon} [cyan]{pat_id}[/cyan] [dim]({count} instances)[/dim]" + f" — [{_PAT_COLORS.get(status, 'dim')}]{label}[/{_PAT_COLORS.get(status, 'dim')}]" + ) + + console.print() + + +def _extract_ep_patterns( + results: list, +) -> dict[str, dict[str, dict]]: + """Extract per-EP subgraph pattern support from analysis results. + + Args: + results: List of EPSupport objects from AnalysisOutput. + + Returns: + Dict keyed by EP name, containing dicts of pattern_id to + ``{"count": int, "status": str}`` where status is one of + ``"s"`` (supported), ``"p"`` (partial), ``"u"`` (unsupported), + ``"uk"`` (unknown). + """ + ep_patterns: dict[str, dict[str, dict]] = {} + for ep_support in results: + patterns: dict[str, dict] = {} + for info in ep_support.information: + if info.pattern_id and info.pattern_id.startswith("SUBGRAPH/"): + status = ( + _SUPPORT_LEVEL_TO_SHORT.get(info.status.value, "uk") if info.status else "uk" + ) + patterns[info.pattern_id] = { + "count": len(info.pattern_node_list), + "status": status, + } + ep_patterns[ep_support.ep_type] = patterns + return ep_patterns + + +def _render_analysis_summary( + console: Console, + results: list, + ep_instance_counts: dict[str, dict[str, dict[str, int]]], + ep_patterns: dict[str, dict[str, dict]] | None = None, +) -> None: + """Render the Analysis Summary section after pattern detection. + + Args: + console: Rich console for output. + results: List of EPSupport objects from AnalysisOutput. + ep_instance_counts: Per-EP instance counts accumulated during analysis, + keyed by EP name, then op name, then support level. + ep_patterns: Per-EP subgraph pattern support extracted from results. 
+ """ + from ..analyze.models.support_level import SupportLevel + + console.print("═" * 80) + console.print("\U0001f4c8 [bold]ANALYSIS SUMMARY[/bold]") + console.print("═" * 80) + + for ep_support in results: + ep_name = ep_support.ep_type + + # Aggregate instance counts for this EP + ep_data = ep_instance_counts.get(ep_name, {}) + agg: dict[str, int] = {"supported": 0, "partial": 0, "unsupported": 0, "unknown": 0} + for counts in ep_data.values(): + for level in agg: + agg[level] += counts.get(level, 0) + + icon = _worst_level_icon(agg) + + # EP name style based on worst level + if agg.get("unsupported", 0) > 0: + ep_style = "bold red" + elif agg.get("partial", 0) > 0: + ep_style = "bold yellow" + elif agg.get("unknown", 0) > 0 and agg.get("supported", 0) == 0: + ep_style = "bold bright_black" + else: + ep_style = "bold green" + + analyzed = _build_analyzed_text(agg) + console.print(f" {icon} [{ep_style}]{ep_name}[/{ep_style}]: ", end="") + console.print(analyzed) + + # List ops by non-white support level + classification = ep_support.classification + _issue_sections = [ + (SupportLevel.UNSUPPORTED, "red", "\u26d4 Unsupported"), + (SupportLevel.PARTIAL, "yellow", "\u26a0\ufe0f Partial"), + (SupportLevel.UNKNOWN, "bright_black", "\u2753 Unknown"), + ] + for level, color, heading in _issue_sections: + ops = classification.get(level, []) + if ops: + console.print(f" [{color}]{heading}:[/{color}]") + for op in sorted(ops): + console.print(f" \u2022 [dim]{op}[/dim]") + + # List non-supported patterns for this EP + patterns = (ep_patterns or {}).get(ep_name, {}) + bad_patterns = {pid: p for pid, p in patterns.items() if p["status"] != "s"} + if bad_patterns: + console.print(" [dim]Patterns:[/dim]") + for pid, p in sorted(bad_patterns.items(), key=lambda x: x[1]["count"], reverse=True): + status = p["status"] + icon_p = _STATUS_ICONS.get(status, "\u2753") + label = _PATTERN_STATUS_LABELS.get(status, "unknown") + console.print( + f" {icon_p} [dim]{pid}[/dim] ({p['count']} 
instances, {label})" + ) + + has_issues = any(classification.get(lvl) for lvl, _, _ in _issue_sections) or bad_patterns + if not has_issues: + console.print(" [green]Ready to deploy[/green]") + + console.print() + + +# ── Click command ───────────────────────────────────────────────────────── + + +@click.command(name="analyze") @cli_utils.model_option(required=True) @cli_utils.ep_option( required=False, optional_message="If not specified, analyzes all supported EPs" @@ -42,176 +387,308 @@ required=False, optional_message="If not specified, uses NPU as default", default="NPU" ) @cli_utils.verbosity_options -@click.option( # type: ignore[misc] +@click.option( "--output", type=click.Path(path_type=Path), default=None, - help="Save JSON output to file (default: console display)", + help="Save JSON output to file", ) -@click.option( # type: ignore[misc] +@click.option( "--information/--no-information", default=True, - help="Include detailed recommendations in output (default: enabled)", + help="Include detailed recommendations (default: enabled)", ) -@click.option( # type: ignore[misc] +@click.option( "--htp-metadata", type=click.Path(exists=True, path_type=Path), default=None, help="Path to HTP metadata JSON file for enhanced pattern extraction", ) -@click.option( # type: ignore[misc] +@click.option( "--run-unknown-op/--no-run-unknown-op", default=True, help="Run unknown operators on local machine if possible (default: enabled)", ) -@click.option( # type: ignore[misc] +@click.option( "--save-node", multiple=True, type=click.Choice(["partial", "unsupported"], case_sensitive=False), help="Save specific node types for further analysis. 
Can be specified multiple times " "(e.g., --save-node partial --save-node unsupported).", ) +@click.option( + "--optim-config", + type=click.Path(path_type=Path), + default=None, + help="Save auto-discovered optimization config to JSON file", +) def analyze( model: Path, ep: str | None, device: str | None, output: Path | None, information: bool, - verbose: bool, + verbose: int, quiet: bool, htp_metadata: Path | None, run_unknown_op: bool, save_node: tuple[str, ...], + optim_config: Path | None, ) -> None: - r"""Analyze ONNX model for runtime support. + r"""Analyze ONNX model for runtime support with live progress. - Analyze ONNX model to determine runtime support status for the specified - execution provider and device. Performs static analysis to detect patterns - and check operator compatibility. + Performs static analysis to detect patterns and check operator + compatibility, showing real-time per-operator results. Exit Codes: - 0: Success - execution provider supports model + 0: Model fully supported - 1: Partial support - some unsupported operators + 1: Partial support — some unsupported operators - 2: Error - invalid input or analysis failure + 2: Error — invalid input or analysis failure Examples: - Analyze all supported EPs with default device: - - winml analyze --model model.onnx - - Check QNN NPU support (full name): - - winml analyze --model model.onnx --ep QNNExecutionProvider --device NPU - - Check QNN NPU support (using alias): - - winml analyze --model model.onnx --ep qnn --device NPU - - Check Intel OpenVINO GPU support with recommendations (using alias): - - winml analyze --model model.onnx --ep ov --device GPU --information - - Analyze all EPs and save results to file: - + \b + winml analyze --model model.onnx --ep qnn + winml analyze --model model.onnx --ep ov --device GPU winml analyze --model model.onnx --output results.json - - Use HTP metadata for enhanced pattern extraction: - - winml analyze --model model.onnx - --ep 
OpenVINOExecutionProvider --driver GPU --information --htp-metadata metadata.json """ - # Configure logging - configure_logging(verbose=verbose, quiet=quiet) + configure_logging(verbosity=verbose, quiet=quiet) try: - # Import core components - logger.debug("Importing static analyzer components...") - from ..analyze import ONNXStaticAnalyzer, __version__ - - logger.info("Using analyzer version: %s", __version__) + from ..analyze import ONNXStaticAnalyzer - # Validate model file + # Validate model if not model.exists(): logger.error("ONNX model file not found: %s", model) sys.exit(2) - logger.debug("Model path: %s", model) - logger.debug("Execution provider: %s", ep) - logger.debug("Device: %s", device) - logger.debug("Information: %s", information) - if htp_metadata: - logger.debug("HTP metadata path: %s", htp_metadata) - - # Normalize EP name (convert aliases to full names) ep_normalized = normalize_ep_name(ep) - if ep != ep_normalized: - logger.debug("EP alias '%s' normalized to '%s'", ep, ep_normalized) - # Run static analysis using ONNXStaticAnalyzer - logger.info("Running static analysis...") + logger.info("Analyzing model: %s", model) + logger.info("Target: %s on %s", ep_normalized or "all EPs", device) + analyzer = ONNXStaticAnalyzer() - save_node_types = set(save_node) - result = analyzer.analyze( - model_path=model, - ep=ep_normalized, - device=device, - enable_information=information, - htp_metadata_path=str(htp_metadata) if htp_metadata else None, - run_unknown_op=run_unknown_op, - save_node_types=save_node_types, - ) - - logger.info( - "Analysis complete: Model is %s", - "fully supported" if result.is_fully_supported() else "partially supported", - ) - - # Serialize to JSON - json_output = result.to_json() - - # Parse JSON for console display - import json - - from ..analyze.console_writer import ( - display_analysis_results, - ) - from ..analyze.models.output import AnalysisOutput - - data = json.loads(json_output) - analysis = 
AnalysisOutput.model_validate(data) - - # Save JSON to file if output path specified - if output: - output.write_text(json_output, encoding="utf-8") - logger.info("JSON results saved to: %s", output) - - # Always display friendly console output - display_analysis_results(analysis, verbose=verbose) - - # Determine exit code based on support level - unsupported_ops = result.get_unsupported_operators() - is_model_supported = result.is_fully_supported() - if is_model_supported: - # Full support - logger.info("Model is fully supported") - sys.exit(0) + + # Console for Rich output (stderr so stdout stays clean for JSON) + console = Console(stderr=True) + + # Model info header + if not quiet: + console.print() + console.print("═" * 80) + console.print("📊 [bold]OP CHECK[/bold]") + console.print("═" * 80) + console.print(f" 📦 Model: [bold cyan]{model.name}[/bold cyan]") + + # Load model metadata for header + try: + import onnx + + _proto = onnx.load(str(model), load_external_data=False) + _opset = _proto.opset_import[0].version if _proto.opset_import else "?" 
+ _producer = _proto.producer_name or "unknown" + if _proto.producer_version: + _producer += f" v{_proto.producer_version}" + _total_ops = len(_proto.graph.node) + _unique_ops = len({n.op_type for n in _proto.graph.node}) + console.print( + f" 🔧 Opset: [green]{_opset}[/green] Producer: [green]{_producer}[/green]" + ) + console.print( + f" 📋 Operators: [cyan]{_total_ops}[/cyan] total, " + f"[cyan]{_unique_ops}[/cyan] unique types" + ) + console.print() + del _proto # free memory + except Exception: + logger.debug("Could not load model metadata for header display") + + # Per-EP state for Live display + current_ep_name = "" + all_op_counts: dict[str, int] = {} + instance_counts: dict[str, dict[str, int]] = {} + ep_instance_counts: dict[str, dict[str, dict[str, int]]] = {} + live: Live | None = None + ep_counter = 0 + + def _finalize_live(mark_complete: bool = True) -> None: + """Stop the active Live display, optionally marking it complete.""" + nonlocal live + if live is None: + return + try: + if mark_complete and current_ep_name: + ep_instance_counts[current_ep_name] = { + k: dict(v) for k, v in instance_counts.items() + } + live.update( + _build_analysis_table( + instance_counts, + ep_name=current_ep_name, + complete=True, + all_ops=all_op_counts, + ) + ) + except Exception: + logger.debug("Failed to render final table", exc_info=True) + finally: + live.stop() + live = None + + def on_ep_start(ep_name, operator_counts): + """Called when analysis starts for a new EP.""" + nonlocal current_ep_name, instance_counts, all_op_counts, ep_counter, live + ep_counter += 1 + + # Finalize previous EP's Live display + if current_ep_name: + _finalize_live() + console.print() # blank line between EP tables + + # Reset for new EP (normalize keys to display names) + current_ep_name = ep_name + all_op_counts = {_display_name(k): v for k, v in operator_counts.items()} + instance_counts = {} + + # EP section header + console.print("─" * 80) + console.print(f"💻 [bold]EP 
{ep_counter}[/bold]: [bold cyan]{ep_name}[/bold cyan]") + console.print("─" * 80) + + # Start new Live display — all ops shown as pending + live = Live( + _build_analysis_table( + instance_counts, + ep_name=ep_name, + all_ops=all_op_counts, + ), + console=console, + refresh_per_second=30, + ) + live.start() + + def on_node_result(pattern_runtime): + """Callback invoked per-node during analysis.""" + op = _display_name(pattern_runtime.pattern_id) + level = pattern_runtime.result.classification.value + op_counts = instance_counts.setdefault(op, {}) + op_counts[level] = op_counts.get(level, 0) + 1 + + if live is not None: + live.update( + _build_analysis_table( + instance_counts, + ep_name=current_ep_name, + all_ops=all_op_counts, + ) + ) + + if not quiet: + # Redirect logging through Rich console so log messages render + # above the Live table instead of breaking it + root_logger = logging.getLogger() + old_handlers = root_logger.handlers[:] + rich_handler = RichHandler( + console=console, + show_path=False, + show_time=True, + rich_tracebacks=False, + ) + rich_handler.setLevel(root_logger.level) + root_logger.handlers = [rich_handler] + + try: + save_node_types = set(save_node) + result = analyzer.analyze( + model_path=str(model), + ep=ep_normalized, + device=device, + enable_information=information, + htp_metadata_path=str(htp_metadata) if htp_metadata else None, + run_unknown_op=run_unknown_op, + save_node_types=save_node_types, + on_node_result=on_node_result, + on_ep_start=on_ep_start, + ) + + # Extract per-EP pattern support (available now) + ep_patterns = _extract_ep_patterns(result.output.results) + + # Finalize last EP's Live display + _finalize_live() + finally: + # Safety: stop Live if still running (e.g. 
on exception) + _finalize_live(mark_complete=False) + root_logger.handlers = old_handlers + + console.print() + + # Pattern Matching section (per-EP) + _render_pattern_matching(console, ep_patterns) + + # Analysis Summary section + _render_analysis_summary( + console, + result.output.results, + ep_instance_counts, + ep_patterns=ep_patterns, + ) + + # Legend (at the very bottom) + console.print( + " [dim]S/P/U = Supported/Partial/Unsupported[/dim]" + " [green]██[/green] supported" + " [yellow]██[/yellow] partial" + " [red]██[/red] unsupported" + " [bright_black]██[/bright_black] unknown" + ) + console.print() else: - # Partial or no support - logger.warning("Model has %d unsupported operators", len(unsupported_ops)) - if verbose: - for op_name in unsupported_ops[:5]: # Show first 5 - logger.warning(" - %s", op_name) - if len(unsupported_ops) > 5: - logger.warning(" ... and %d more", len(unsupported_ops) - 5) - sys.exit(1) + # Quiet mode — no live display + save_node_types = set(save_node) + result = analyzer.analyze( + model_path=str(model), + ep=ep_normalized, + device=device, + enable_information=information, + htp_metadata_path=str(htp_metadata) if htp_metadata else None, + run_unknown_op=run_unknown_op, + save_node_types=save_node_types, + ) + + # Save JSON if requested + if output: + try: + output.write_text(result.to_json(), encoding="utf-8") + logger.info("JSON results saved to: %s", output) + except OSError as e: + logger.error("Failed to write JSON output to %s: %s", output, e) + except Exception as e: + logger.error("Failed to serialize results to JSON: %s", e) + logger.debug("JSON serialization traceback:", exc_info=True) + + # Save optimization config if requested + if optim_config: + import json + + try: + config = result.get_optimization_config(ep=ep_normalized) + optim_config.write_text(json.dumps(config.to_dict(), indent=2), encoding="utf-8") + logger.info("Optimization config saved to: %s", optim_config) + except OSError as e: + logger.error("Failed 
to write config to %s: %s", optim_config, e) + except Exception as e: + logger.error("Failed to generate optimization config: %s", e) + logger.debug("Config generation traceback:", exc_info=True) + + # Exit code: 0 = fully supported, 1 = partial support + sys.exit(0 if result.is_fully_supported() else 1) except FileNotFoundError as e: logger.error("File not found: %s", e) sys.exit(2) - except Exception as e: logger.error("Analysis failed: %s", e) if verbose: @@ -219,11 +696,4 @@ def analyze( sys.exit(2) -# Register the command -# This will be auto-discovered by the CLI framework -# Export only the command for CLI discovery __all__ = ["analyze"] - - -if __name__ == "__main__": - analyze() diff --git a/src/winml/modelkit/commands/build.py b/src/winml/modelkit/commands/build.py index ee9a9159c..d35ea0882 100644 --- a/src/winml/modelkit/commands/build.py +++ b/src/winml/modelkit/commands/build.py @@ -12,7 +12,7 @@ winml build -c config.json -m microsoft/resnet-50 -o output/ winml build -c config.json -m model.onnx -o output/ winml build -c config.json -m bert-base-uncased -o output/ --no-quant --no-compile - winml build -c config.json -m microsoft/resnet-50 --random-init -o output/ + winml build -c config.json -o output/ --use-cache winml build -c config.json -m microsoft/resnet-50 -o output/ --rebuild -v """ @@ -20,11 +20,22 @@ import json import logging +import time from pathlib import Path from typing import TYPE_CHECKING import click -from rich.console import Console +from rich.logging import RichHandler + +from ..utils.console import ( + detect_model_source, + get_console, + print_error, + print_final, + print_setup, + print_stage_skip, + print_stages_header, +) if TYPE_CHECKING: @@ -36,7 +47,7 @@ from ..config import WinMLBuildConfig logger = logging.getLogger(__name__) -console = Console(stderr=True) +console = get_console() # ============================================================================= @@ -115,7 +126,7 @@ def 
_instantiate_parent_model(model_type: str, task: str | None = None) -> nn.Mo Returns: PyTorch model in eval mode with random/init weights. """ - from ..loader.config import resolve_loader_config + from ..loader import resolve_loader_config _, hf_config, resolved_class = resolve_loader_config( model_type=model_type, @@ -206,7 +217,7 @@ def _build_modules( # ============================================================================= -@click.command() +@click.command("build") @click.option( "-c", "--config", @@ -219,22 +230,8 @@ def _build_modules( "-m", "--model", "model_id", - required=True, - help="HuggingFace model ID or path to .onnx file.", - # --model is mandatory because random-weight builds (omitting --model) are - # unreliable: AutoConfig.for_model() returns architecture class defaults - # which can differ from pretrained configs in ways that cause silent - # runtime failures. E.g. MPNet/Roberta-family models set - # max_position_embeddings = usable_length + pad_token_id + 1 (514) in the - # pretrained config, but the class default is only 512. The smaller - # embedding table causes "index out of range in self" during ONNX export - # tracing -- a position-offset OOB that the OnnxConfig-level fix (PR #415) - # cannot reach because HTPExporter uses pre-populated input_tensors, not - # Optimum's input generation path. Supporting random-init reliably would - # require storing the full pretrained HF config (or at least the model ID) - # in the build config so _load_model can call AutoConfig.from_pretrained() - # instead of AutoConfig.for_model(). Until that plumbing exists, require - # --model to guarantee correct model instantiation. + default=None, + help="HuggingFace model ID or path to .onnx file. Omit for random-weight build.", ) @click.option( "-o", @@ -250,12 +247,6 @@ def _build_modules( default=False, help="Use ModelKit global cache (~/.cache/winml/). 
Mutually exclusive with -o.", ) -@click.option( - "--random-init", - is_flag=True, - default=False, - help="Skip weight download; use model config with random weights.", -) @click.option( "--rebuild", is_flag=True, @@ -274,12 +265,6 @@ def _build_modules( default=False, help="Skip compilation (overrides config)", ) -@click.option( - "--no-optimize", - is_flag=True, - default=False, - help="Skip optimization (for pre-quantized ONNX models)", -) @click.option( "--ep", default=None, @@ -297,6 +282,12 @@ def _build_modules( default=False, help="Skip analyzer loop during build", ) +@click.option( + "--no-optimize", + is_flag=True, + default=False, + help="Skip optimization (for pre-quantized ONNX models)", +) @click.option( "--max-optim-iterations", "max_optim_iterations", @@ -318,7 +309,6 @@ def build( model_id: str | None, output_dir: str | None, use_cache: bool, - random_init: bool, rebuild: bool, no_quant: bool, no_compile: bool, @@ -349,8 +339,8 @@ def build( # Export + optimize only winml build -c config.json -m bert-base-uncased -o output/ --no-quant --no-compile - # Random-weight build (no weight download) - winml build -c config.json -m microsoft/resnet-50 --random-init -o output/ + # Random-weight build (no download) + winml build -c config.json -o output/ # Use global cache winml build -c config.json -m microsoft/resnet-50 --use-cache @@ -373,7 +363,7 @@ def build( # If ep unspecified, attempt to auto-select a suitable EP from the registry if ep is None: - from ..session.ep_registry import WinMLEPRegistry + from ..session import WinMLEPRegistry registry = WinMLEPRegistry.get_instance() candidate_eps = [ @@ -425,11 +415,15 @@ def build( if not configs: raise click.UsageError("Module config array is empty -- nothing to build.") - console.print() - console.print("[bold]winml build[/bold] (module mode)") - console.print(f" Config: {Path(config_file).name}") - console.print(f" Modules: {len(configs)}") - console.print(f" Output: {resolved_dir}") + print_setup( + 
console, + model=model_id or "random-init", + config=Path(config_file).name, + output=str(resolved_dir), + source="HuggingFace", + ) + print_stages_header(console) + console.print(f" \U0001f9e9 [bold]Modules:[/bold] {len(configs)}") console.print() results = _build_modules( @@ -451,6 +445,8 @@ def build( ) # Write module summary + from ..build import write_module_summary + summary_instances = [] for cfg, result in zip(configs, results, strict=True): summary_instances.append( @@ -462,15 +458,13 @@ def build( } ) - summary_path = resolved_dir / "module_summary.json" - summary = { - "model_id": model_id or "random-init", - "module_class": configs[0].loader.model_class or "unknown", - "instance_count": len(summary_instances), - "instances": summary_instances, - } - summary_path.write_text(json.dumps(summary, indent=2)) - console.print(f" Summary: {summary_path}") + write_module_summary( + output_path=resolved_dir / "module_summary.json", + model_id=model_id or "random-init", + module_class=configs[0].loader.model_class or "unknown", + instances=summary_instances, + ) + console.print(f" Summary: {resolved_dir / 'module_summary.json'}") console.print() @@ -482,7 +476,7 @@ def build( cache_key: str | None = None if use_cache: from ..cache import get_cache_dir, get_cache_key, get_model_dir - from ..loader.task import get_task_abbrev + from ..loader import get_task_abbrev task = config.loader.task if config.loader else None resolved_dir = get_model_dir( @@ -501,66 +495,705 @@ def build( else: resolved_dir = Path(output_dir) - # Report build plan - model_label = f"{model_id} (random-init)" if random_init else model_id + _run_single_build( + config=config, + config_file=config_file, + model_id=model_id, + resolved_dir=resolved_dir, + rebuild=rebuild, + cache_key=cache_key, + ep=ep, + device=device, + extra_kwargs=extra_kwargs, + ) + + except click.UsageError: + raise # Let click handle its own errors + except ValueError as e: + raise click.UsageError(str(e)) from e + except 
Exception as e: + if verbose: + logger.exception("Build failed") + + # Map common errors to actionable hints + err_str = str(e) + hint = None + if "Quantization failed" in err_str: + hint = "Try: --no-quant to skip quantization" + elif "Compilation failed" in err_str: + hint = "Try: --no-compile to skip compilation" + elif "Black nodes persist" in err_str: + hint = "Try: winml analyze -m <model> --ep <ep> to investigate operator support" + elif isinstance(e, FileNotFoundError): + hint = "Check: model path or HuggingFace model ID" + + if hint: console.print() - console.print("[bold]winml build[/bold]") - console.print(f" Config: {Path(config_file).name}") - console.print(f" Model: {model_label}") - console.print(f" Output: {resolved_dir}") + print_error(console, f"Build failed: {e}", hint=hint) console.print() - # Call build API (late import to speed up CLI startup) - from .config import _is_onnx_file + raise click.ClickException(f"Build failed: {e}") from e + + +# ============================================================================= +# SINGLE MODEL BUILD — CLI-level stage orchestration +# ============================================================================= + + +def _run_single_build( + *, + config: WinMLBuildConfig, + config_file: str, + model_id: str | None, + resolved_dir: Path, + rebuild: bool, + cache_key: str | None, + ep: str | None, + device: str | None, + extra_kwargs: dict[str, Any], +) -> None: + """Run single-model build with Rich Live progress per stage.""" + from .config import _is_onnx_file + + _is_onnx = model_id is not None and _is_onnx_file(model_id) + # Derive source from _is_onnx to guarantee header label matches pipeline + source = "ONNX" if _is_onnx else detect_model_source(model_id) + + # Gap 1: (pretrained) suffix; Gap 2: ONNX file size + if model_id is None: + model_label = "random-init" + elif _is_onnx: + _sz = _safe_size(Path(model_id)) + from ..utils.console import fmt_size + + model_label = f"{model_id} 
[dim]({fmt_size(_sz)})[/dim]" if _sz else model_id + else: + model_label = f"{model_id} [dim](pretrained)[/dim]" + + # ── 🔧 Setup section ──────────────────────────────────────── + print_setup( + console, + model=model_label, + config=Path(config_file).name, + output=str(resolved_dir), + source=source, + ) + print_stages_header(console) + + # ── Redirect logging + warnings through Rich during Live stages ── + # This ensures log messages and warnings.warn() render above the + # Live area instead of breaking it (same pattern as winml analyze). + root_logger = logging.getLogger() + old_handlers = root_logger.handlers[:] + rich_handler = RichHandler( + console=console, + show_path=False, + show_time=True, + rich_tracebacks=False, + ) + rich_handler.setLevel(root_logger.level) + root_logger.handlers = [rich_handler] + # Route warnings.warn() (e.g., TracerWarning) through logging → Rich + logging.captureWarnings(True) + + start_time = time.monotonic() + + try: + if _is_onnx: + stage_timings = _build_onnx_pipeline( + config=config, + onnx_path=Path(model_id), + output_dir=resolved_dir, + rebuild=rebuild, + ep=ep, + device=device, + extra_kwargs=extra_kwargs, + ) + else: + stage_timings = _build_hf_pipeline( + config=config, + model_id=model_id, + output_dir=resolved_dir, + rebuild=rebuild, + cache_key=cache_key, + ep=ep, + device=device, + extra_kwargs=extra_kwargs, + ) + + elapsed = time.monotonic() - start_time + final_path = resolved_dir / "model.onnx" + if final_path.exists() and stage_timings: + print_final( + console, + elapsed, + str(final_path), + stage_timings=stage_timings, + ) + finally: + logging.captureWarnings(False) + root_logger.handlers = old_handlers + + +def _print_reused(artifact_path: Path) -> None: + """Print reused artifact message.""" + console.print() + console.print( + f" \u267b\ufe0f [bold cyan]Existing artifact found:[/bold cyan] {artifact_path}" + ) + console.print(" \U0001f4a1 [dim]Use --rebuild to force rebuild.[/dim]") + console.print() + + 
+def _safe_size(path: Path) -> int: + """Get file size including ONNX external data, return 0 if unavailable.""" + try: + if path.suffix == ".onnx": + from ..utils.console import get_onnx_total_size + + return get_onnx_total_size(path) + return path.stat().st_size + except OSError: + return 0 + + +def _show_io(sl: Any, config: WinMLBuildConfig) -> None: + """Show I/O tensors in a StageLive.""" + export_cfg = config.export + if not export_cfg: + return + inputs = export_cfg.input_tensors or [] + outputs = export_cfg.output_tensors or [] + for i, t in enumerate(inputs): + name = t.name or "(unnamed)" + shape = str(list(t.shape)) if getattr(t, "shape", None) else "dynamic" + dtype = getattr(t, "dtype", None) or "?" + sl.io_input(name, shape, dtype, first=(i == 0)) + for i, t in enumerate(outputs): + name = t.name or "(unnamed)" + # OutputTensorSpec has name only — show name, no shape/dtype + label = "Output: " if i == 0 else " " + sl.detail(f"{label}[cyan]{name}[/cyan]") + + +# ============================================================================= +# SHARED PIPELINE STAGE HELPERS +# ============================================================================= + + +def _run_optimize_stage( + *, + config: WinMLBuildConfig, + model_path: Path, + optimized_path: Path, + ep: str | None, + device: str | None, + max_iters: int, + stage_timings: list[tuple[str, float | None]], + show_io_first: bool = False, +) -> tuple[Path, float]: + """Run the optimize stage inside a StageLive context. - if model_id and _is_onnx_file(model_id): - from ..build import build_onnx_model + Creates all 5 analyzer callbacks bound to the live display, calls + run_optimize_analyze_loop, shows convergence message and artifact. - result = build_onnx_model( - onnx_path=Path(model_id), - config=config, - output_dir=resolved_dir, - rebuild=rebuild, - ep=ep, - device=device, - **extra_kwargs, + Args: + config: Build configuration. + model_path: Input model path. 
+ optimized_path: Output path for optimized model. + ep: Execution provider for analyzer. + device: Target device for analyzer. + max_iters: Maximum analyzer iterations. + stage_timings: List to append (stage_name, elapsed) tuple to. + show_io_first: If True, show I/O tensors at the start of the stage + (used in ONNX mode where there is no export stage). + + Returns: + Tuple of (current_path, opt_elapsed). + """ + from ..build import run_optimize_analyze_loop + from ..utils.console import StageLive + + with StageLive("optimize", console) as sl: + sl.set_status("Optimizing ONNX graph...") + + if show_io_first: + _show_io(sl, config) + + # Analyzer callback state for live EP bars + _ep_bars: dict[str, int] = {} + _ep_counts: dict[str, dict[str, int]] = {} + _ep_totals: dict[str, int] = {} + _current_ep = [""] + _current_iter = [0, 0] # [iteration, max_iter] + _header_shown = [False] + + def _on_iteration_start(iteration: int, max_iter: int) -> None: + _ep_bars.clear() + _ep_counts.clear() + _ep_totals.clear() + _current_iter[0] = iteration + _current_iter[1] = max_iter + _header_shown[0] = False + + def _on_ep_start(ep_name: str, operator_counts: dict) -> None: + _current_ep[0] = ep_name + _ep_counts[ep_name] = {} + total = sum(operator_counts.values()) + _ep_totals[ep_name] = total + # Show "Analyzing N nodes (iter X/Y)" on first EP of each iter + if not _header_shown[0]: + _header_shown[0] = True + sl.detail( + f"[bold]Analyzing[/bold] [cyan]{total}[/cyan] nodes " + f"[dim](iter {_current_iter[0]}/{_current_iter[1]})[/dim]" ) - else: - from ..build import build_hf_model - - result = build_hf_model( - config=config, - output_dir=resolved_dir, - model_id=model_id, - rebuild=rebuild, - random_init=random_init, - cache_key=cache_key, - ep=ep, - device=device, - **extra_kwargs, + _ep_bars[ep_name] = sl.ep_bar_add(ep_name, total=total) + + def _on_node_result(pattern_runtime: Any) -> None: + ep_name = _current_ep[0] + level = pattern_runtime.result.classification.value + 
counts = _ep_counts.setdefault(ep_name, {}) + counts[level] = counts.get(level, 0) + 1 + s = counts.get("supported", 0) + p = counts.get("partial", 0) + u = counts.get("unsupported", 0) + idx = _ep_bars.get(ep_name) + if idx is not None: + sl.ep_bar_update( + idx, + ep_name, + s, + p, + u, + total=_ep_totals.get(ep_name, 0), ) - # Report results - if result.reused: - console.print(f" Existing artifact: {result.final_onnx_path}") - console.print(" Use --rebuild to force rebuild.") - else: - for stage in result.stages_completed: - t = result.stage_timings.get(stage, 0) - console.print(f" {stage:<12} done ({t:.1f}s)") - for stage in result.stages_skipped: - console.print(f" {stage:<12} skipped") - console.print() - console.print(f" Build complete in {result.elapsed:.1f}s") - console.print(f" Final artifact: {result.final_onnx_path}") + def _on_patterns(autoconf_dict: dict) -> None: + sl.detail("[bold]Patterns[/bold]") + for key in autoconf_dict: + name = key.replace("disable_", "").replace("_fusion", "").replace("_", " ").title() + sl.detail(f" [yellow]{name}[/yellow] [dim]\u2192 {key}[/dim]") + + def _on_reoptimize(autoconf_dict: dict) -> None: + sl.detail("[bold]Optimizing[/bold] [dim](applying autoconf)[/dim]") + sl.detail(f" [dim]{autoconf_dict}[/dim]") + + t0 = time.monotonic() + current_path, _, analyze_iters, _, analyze_details = run_optimize_analyze_loop( + model_path=model_path, + optimized_path=optimized_path, + config=config, + ep=ep, + device=device, + max_optim_iterations=max_iters, + on_ep_start=_on_ep_start, + on_node_result=_on_node_result, + on_iteration_start=_on_iteration_start, + on_patterns_discovered=_on_patterns, + on_reoptimize=_on_reoptimize, + use_external_data=True, + ) + opt_elapsed = time.monotonic() - t0 - console.print() + if analyze_iters > 0: + converged = not analyze_details.get("autoconf_not_converged", False) + conv_str = "converged" if converged else "NOT converged" + # Show pattern result even when none found + autoconf = 
analyze_details.get("autoconf", {}) + if not autoconf: + sl.detail("[bold]Patterns[/bold]") + sl.detail(" [dim]No optimization patterns found[/dim]") + sl.detail(f"[dim]Autoconf {conv_str} after {analyze_iters} iteration(s)[/dim]") - except click.UsageError: - raise # Let click handle its own errors + sl.set_done(opt_elapsed) + sl.artifact(str(optimized_path), _safe_size(optimized_path)) + sl.blank() + + stage_timings.append(("Optimize", opt_elapsed)) + return current_path, opt_elapsed + + +def _run_quantize_stage( + *, + config: WinMLBuildConfig, + current_path: Path, + quantized_path: Path, + stage_timings: list[tuple[str, float | None]], +) -> Path: + """Run the quantize stage inside a StageLive context (if quant is configured). + + Handles QDQ skip detection, shows dataset/calibration/precision details, + and appends timing to stage_timings. + + Args: + config: Build configuration. + current_path: Input model path. + quantized_path: Output path for quantized model. + stage_timings: List to append (stage_name, elapsed) tuple to. + + Returns: + Updated current_path (quantized_path if quantization ran, else unchanged). + """ + from ..onnx import is_quantized_onnx + from ..quant import quantize_onnx + from ..utils.console import StageLive + + if config.quant is None: + return current_path + + if is_quantized_onnx(current_path): + print_stage_skip(console, "quantize", "(QDQ nodes already present)") + stage_timings.append(("Quantize", None)) + return current_path + + with StageLive("quantize", console) as sl: + wt = config.quant.weight_type or "?" 
+ sl.set_status(f"Quantizing ({wt})...") + # Calibration info before blocking call + ds = config.quant.dataset_name or "default" + sl.kv( + "Dataset:", + f"[cyan]{ds}[/cyan] [dim]({config.quant.task or 'unknown'})[/dim]", + ) + sl.kv( + "Calibration:", + f"[cyan]{config.quant.samples}[/cyan] samples" + f" [dim]({config.quant.calibration_method})[/dim]", + ) + # Suppress tqdm/datasets progress bars during quantize + # to keep Live display clean + _datasets_available = False + try: + import datasets + + datasets.disable_progress_bars() + _datasets_available = True + except ImportError: + pass # datasets package not installed; progress bar suppression not needed + + t0 = time.monotonic() + try: + quant_result = quantize_onnx( + model_path=current_path, + output_path=quantized_path, + config=config.quant, + use_external_data=True, + ) + finally: + if _datasets_available: + datasets.enable_progress_bars() + if not quant_result.success: + errors = ", ".join(quant_result.errors) if quant_result.errors else "Unknown" + sl.set_error(errors) + raise RuntimeError(f"Quantization failed: {errors}") + current_path = quantized_path + _quant_elapsed = time.monotonic() - t0 + sl.set_done(_quant_elapsed) + sl.kv( + "Precision:", + f"[cyan]{config.quant.weight_type}/" + f"{config.quant.activation_type}[/cyan]" + f" [dim](weight/activation)[/dim]", + ) + sl.artifact( + str(quantized_path), + _safe_size(quantized_path), + ) + sl.blank() + stage_timings.append(("Quantize", _quant_elapsed)) + return current_path + + +def _run_compile_stage( + *, + config: WinMLBuildConfig, + current_path: Path, + compiled_path: Path, + stage_timings: list[tuple[str, float | None]], +) -> Path: + """Run the compile stage inside a StageLive context (if compile is configured). + + Shows graph summary after compilation and appends timing to stage_timings. + + Args: + config: Build configuration. + current_path: Input model path. + compiled_path: Output path for compiled model. 
+ stage_timings: List to append (stage_name, elapsed) tuple to. + + Returns: + Updated current_path (compiled_path if compilation ran, else unchanged). + """ + from ..compiler import compile_onnx + from ..onnx import copy_onnx_model + from ..utils.console import StageLive, get_onnx_graph_summary + + if config.compile is None: + return current_path + + with StageLive("compile", console) as sl: + _cp = "" + if hasattr(config.compile, "ep_config") and config.compile.ep_config: + _cp = f" for {config.compile.ep_config.provider.upper()}" + sl.set_status(f"Compiling{_cp}...") + t0 = time.monotonic() + compile_result = compile_onnx( + model_path=current_path, + output_path=compiled_path, + config=config.compile, + ) + if hasattr(compile_result, "success") and not compile_result.success: + errors = ", ".join(compile_result.errors) if compile_result.errors else "Unknown" + sl.set_error(errors) + raise RuntimeError(f"Compilation failed: {errors}") + if ( + compile_result.output_path + and Path(compile_result.output_path).resolve() != compiled_path.resolve() + ): + copy_onnx_model(compile_result.output_path, compiled_path) + current_path = compiled_path + _compile_elapsed = time.monotonic() - t0 + sl.set_done(_compile_elapsed) + + # Graph summary + try: + summary = get_onnx_graph_summary(compiled_path) + op_parts = ", ".join( + f"[cyan]{op}[/cyan] ({count})" + for op, count in list(summary["op_counts"].items())[:8] + ) + sl.detail(f"[bold]Graph:[/bold] {op_parts}") + except Exception: + logger.debug("Could not load graph summary", exc_info=True) + + sl.artifact( + str(compiled_path), + _safe_size(compiled_path), + ) + stage_timings.append(("Compile", _compile_elapsed)) + return current_path + + +# ============================================================================= +# PIPELINE FUNCTIONS +# ============================================================================= + + +def _build_hf_pipeline( + *, + config: WinMLBuildConfig, + model_id: str | None, + output_dir: 
Path, + rebuild: bool, + cache_key: str | None, + ep: str | None, + device: str | None, + extra_kwargs: dict[str, Any], +) -> list[tuple[str, float | None]] | None: + """HF build pipeline with cascading StageLive per stage. + + Returns list of (stage_name, elapsed_seconds | None) for summary, + or None if build was reused. + """ + from ..build.hf import _load_model + from ..export import export_onnx + from ..onnx import copy_onnx_model + from ..utils.console import StageLive + + max_iters: int = extra_kwargs.pop("hack_max_optim_iterations", 3) + model_label = model_id or "random-init" + + # ── Validate + setup ───────────────────────────────────────── + try: + config.validate() except ValueError as e: - raise click.UsageError(str(e)) from e - except Exception as e: - if verbose: - logger.exception("Build failed") - raise click.ClickException(f"Build failed: {e}") from e + raise ValueError(f"Config validation failed: {e}") from e + + output_dir.mkdir(parents=True, exist_ok=True) + + def _name(base: str) -> str: + return f"{cache_key}_{base}" if cache_key else base + + export_path = output_dir / _name("export.onnx") + optimized_path = output_dir / _name("optimized.onnx") + quantized_path = output_dir / _name("quantized.onnx") + compiled_path = output_dir / _name("compiled.onnx") + final_path = output_dir / _name("model.onnx") + config_path = output_dir / _name("winml_build_config.json") + + # Reuse check + if final_path.exists() and not rebuild: + _print_reused(final_path) + return None + + stage_timings: list[tuple[str, float | None]] = [] + + # Clean old artifacts on rebuild + if rebuild: + pattern = f"{cache_key}_*.onnx" if cache_key else "*.onnx" + for old in output_dir.glob(pattern): + old.unlink() + + current_path = export_path + + # ── Export stage ────────────────────────────────────────────── + import warnings + + with StageLive("export", console) as sl: + sl.set_status("Exporting to ONNX...") + + # Load + export (blocking) + # Suppress TracerWarning and 
other transformer warnings + # during export to keep Live display clean. + pytorch_model = _load_model(config, model_id, trust_remote_code=False) + t0 = time.monotonic() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + export_onnx( + model=pytorch_model, + output_path=export_path, + export_config=config.export, + model_id=model_label, + task=config.loader.task, + verbose=False, + use_external_data=True, + ) + _export_elapsed = time.monotonic() - t0 + sl.set_done(_export_elapsed) + # Meta shown after export completes (avoids duplicate in Live frame) + if config.loader.model_class: + sl.kv("Model class:", f"[cyan]{config.loader.model_class}[/cyan]") + if config.loader.task: + sl.kv("Task:", f"[cyan]{config.loader.task}[/cyan]") + _show_io(sl, config) + sl.artifact(str(export_path), _safe_size(export_path)) + sl.blank() + + stage_timings.append(("Export", _export_elapsed)) + + # ── Optimize stage ─────────────────────────────────────────── + current_path, _ = _run_optimize_stage( + config=config, + model_path=current_path, + optimized_path=optimized_path, + ep=ep, + device=device, + max_iters=max_iters, + stage_timings=stage_timings, + show_io_first=False, + ) + + # Persist config after autoconf + config_path.write_text(json.dumps(config.to_dict(), indent=2)) + + # ── Quantize stage ─────────────────────────────────────────── + current_path = _run_quantize_stage( + config=config, + current_path=current_path, + quantized_path=quantized_path, + stage_timings=stage_timings, + ) + + # ── Compile stage ──────────────────────────────────────────── + current_path = _run_compile_stage( + config=config, + current_path=current_path, + compiled_path=compiled_path, + stage_timings=stage_timings, + ) + + # ── Finalize ───────────────────────────────────────────────── + if current_path != final_path: + copy_onnx_model(current_path, final_path) + + return stage_timings + + +def _build_onnx_pipeline( + *, + config: WinMLBuildConfig, + onnx_path: Path, + 
output_dir: Path, + rebuild: bool, + ep: str | None, + device: str | None, + extra_kwargs: dict[str, Any], +) -> list[tuple[str, float | None]] | None: + """ONNX build pipeline with cascading StageLive per stage. + + Returns list of (stage_name, elapsed_seconds | None) for summary, + or None if build was reused. + """ + from ..onnx import copy_onnx_model + + max_iters: int = extra_kwargs.pop("hack_max_optim_iterations", 3) + + # ── Validate + setup ───────────────────────────────────────── + if not onnx_path.exists(): + raise FileNotFoundError(f"ONNX file not found: {onnx_path}") + try: + config.validate() + except ValueError as e: + raise ValueError(f"Config validation failed: {e}") from e + + output_dir.mkdir(parents=True, exist_ok=True) + + stem = onnx_path.stem + optimized_path = output_dir / f"{stem}_optimized.onnx" + quantized_path = output_dir / f"{stem}_quantized.onnx" + compiled_path = output_dir / f"{stem}_compiled.onnx" + final_path = output_dir / "model.onnx" + config_path = output_dir / "winml_build_config.json" + + # Reuse check + if final_path.exists() and not rebuild: + _print_reused(final_path) + return None + + stage_timings: list[tuple[str, float | None]] = [] + + if rebuild: + for old in output_dir.glob("*.onnx"): + old.unlink() + + # Copy input ONNX to output dir + current_path = output_dir / onnx_path.name + if current_path.resolve() != onnx_path.resolve(): + copy_onnx_model(onnx_path, current_path) + + # ── Optimize stage (first stage for ONNX — show I/O here) ──── + current_path, _ = _run_optimize_stage( + config=config, + model_path=current_path, + optimized_path=optimized_path, + ep=ep, + device=device, + max_iters=max_iters, + stage_timings=stage_timings, + show_io_first=True, + ) + + config_path.write_text(json.dumps(config.to_dict(), indent=2)) + + # ── Quantize stage ─────────────────────────────────────────── + current_path = _run_quantize_stage( + config=config, + current_path=current_path, + quantized_path=quantized_path, + 
stage_timings=stage_timings, + ) + + # ── Compile stage ──────────────────────────────────────────── + current_path = _run_compile_stage( + config=config, + current_path=current_path, + compiled_path=compiled_path, + stage_timings=stage_timings, + ) + + # ── Finalize ───────────────────────────────────────────────── + if current_path != final_path: + copy_onnx_model(current_path, final_path) + + return stage_timings diff --git a/src/winml/modelkit/commands/compile.py b/src/winml/modelkit/commands/compile.py index 8ecf470a2..f61ee9783 100644 --- a/src/winml/modelkit/commands/compile.py +++ b/src/winml/modelkit/commands/compile.py @@ -25,7 +25,8 @@ import click from rich.console import Console -from ..config.precision import _DEVICE_TO_PROVIDER, _EP_TO_DEVICE, VALID_EPS +from ..config import VALID_EPS +from ..config.precision import _DEVICE_TO_PROVIDER, _EP_TO_DEVICE from ..onnx import is_compiled_onnx from ..utils.logging import configure_logging @@ -150,7 +151,7 @@ def compile( # Handle --list if list_compilers_flag: - from ..compiler.compiler import list_compilers + from ..compiler import list_compilers provider = _resolve_compile_provider(device, ep) click.echo(list_compilers(provider)) diff --git a/src/winml/modelkit/commands/config.py b/src/winml/modelkit/commands/config.py index f37ea22a4..f97aab690 100644 --- a/src/winml/modelkit/commands/config.py +++ b/src/winml/modelkit/commands/config.py @@ -2,7 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -"""Config generation command for ModelKit CLI. +"""Config generation command (v2, Rich UI) for ModelKit CLI. Generates WinMLBuildConfig for a HuggingFace model or a pre-exported ONNX file by auto-detecting task, model class, and I/O specifications. 
@@ -28,11 +28,20 @@ from typing import Any import click -from rich.console import Console + +from ..utils.console import ( + get_console, + print_command_header, + print_error, + print_io_specs_detail, + print_io_specs_na, + print_kv, + print_success, +) logger = logging.getLogger(__name__) -console = Console(stderr=True) +console = get_console() def _apply_stage_overrides(cfg: Any, *, no_quant: bool, no_compile: bool) -> None: @@ -49,7 +58,7 @@ def _is_onnx_file(model_input: str) -> bool: return path.suffix == ".onnx" and path.exists() -@click.command() +@click.command("config") @click.option( "-m", "--model", @@ -97,7 +106,7 @@ def _is_onnx_file(model_input: str) -> bool: type=click.Path(exists=True), default=None, help="JSON file with shape overrides passed to dummy input generation. " - "Valid keys -- text: sequence_length; " + "Valid keys — text: sequence_length; " "vision: height, width, num_channels; " "audio: feature_size, nb_max_frames, audio_sequence_length.", ) @@ -230,6 +239,14 @@ def config( # Validate: at least one of -m, --model-type, or --model-class is required if hf_model is None and model_type is None and model_class is None: + # Show header even for errors + print_command_header(console, "\U0001f4cb CONFIG GENERATION") + print_error( + console, + "Missing required input", + hint="Provide one of: -m/--model, --model-type, or --model-class", + ) + console.print() raise click.UsageError( "At least one of -m/--model, --model-type, or --model-class is required." 
) @@ -243,6 +260,8 @@ def config( # Load override config from JSON file if provided override = None + _override_file: str | None = None + _shape_config_file: str | None = None if config_file: config_path = Path(config_file) try: @@ -258,7 +277,7 @@ def config( override = WinMLBuildConfig.from_dict(data) except json.JSONDecodeError as e: raise click.UsageError(f"Invalid JSON in config file {config_path}: {e}") from e - console.print(f"[dim]Loaded overrides from {config_path.name}[/dim]") + _override_file = config_path.name # Load shape_config (shape overrides) from JSON file if provided shape_config = None @@ -278,12 +297,15 @@ def config( raise click.UsageError( f"Invalid JSON in I/O config file {shape_config_path}: {e}" ) from e - console.print(f"[dim]Loaded I/O config from {shape_config_path.name}[/dim]") + _shape_config_file = shape_config_path.name # ONNX file detection: generate simpler config without loader/export + if hf_model and _is_onnx_file(hf_model) and module: + raise click.UsageError( + "--module is not supported with ONNX file input. " + "Module discovery requires a HuggingFace model." 
+ ) if hf_model and _is_onnx_file(hf_model): - console.print(f"[dim]Generating ONNX build config for {hf_model}...[/dim]") - config_obj = generate_onnx_build_config( hf_model, task=task, @@ -296,11 +318,15 @@ def config( # Apply --no-quant / --no-compile overrides _apply_stage_overrides(config_obj, no_quant=no_quant, no_compile=no_compile) - console.print("[green]Generated ONNX build config (export=None)[/green]") output_data = config_obj.to_dict() + _is_onnx_mode = True + _resolved_task = None + _resolved_model_class = None + _export_cfg = None + configs: list = [] # defensive — ONNX + module is rejected above + _n_modules = 0 else: - label = hf_model or model_type - console.print(f"[dim]Generating config for {label}...[/dim]") + _is_onnx_mode = False # Generate config(s) - returns single or list based on module parameter result = generate_hf_build_config( @@ -322,39 +348,136 @@ def config( if module: # Module mode: result is list[WinMLBuildConfig] configs = result - # Apply --no-quant / --no-compile overrides to each config for cfg in configs: _apply_stage_overrides(cfg, no_quant=no_quant, no_compile=no_compile) - console.print(f"[green]Found {len(configs)} submodules matching '{module}'[/green]") output_data = [cfg.to_dict() for cfg in configs] + _n_modules = len(configs) + # Use first config for display metadata + config_obj = configs[0] if configs else None else: # Normal mode: result is WinMLBuildConfig config_obj = result - # Apply --no-quant / --no-compile overrides + configs = [] _apply_stage_overrides(config_obj, no_quant=no_quant, no_compile=no_compile) - # B-4: Inform user of auto-selected task when --task not provided - if not task and not module: - auto_task = config_obj.loader.task - source = model_type or hf_model - console.print(f"[dim]Auto-selected task: {auto_task} (from '{source}')[/dim]") + output_data = config_obj.to_dict() + _n_modules = 0 + + _resolved_task = config_obj.loader.task if config_obj else None + _resolved_model_class = 
config_obj.loader.model_class if config_obj else None + _export_cfg = config_obj.export if config_obj else None + + # ── Rich console output ────────────────────────────────────── + subtitle = "ONNX mode" if _is_onnx_mode else ("module mode" if module else None) + print_command_header(console, "\U0001f4cb CONFIG GENERATION", subtitle) + + # Model identity + model_label = hf_model or model_type or model_class or "?" + print_kv(console, "Model:", model_label, icon="\U0001f4e6") + + if _is_onnx_mode: + print_kv(console, "Mode:", "Direct ONNX", note="export=None", icon="\U0001f527") + else: + # Fix #1: Model class before Task + if module: + print_kv(console, "Module:", module, icon="\U0001f9e9") + elif _resolved_model_class: + mc_note = None if model_class else "auto-detected" + print_kv( + console, + "Model class:", + _resolved_model_class, + note=mc_note, + icon="\U0001f9e9", + ) + # Fix #2: no trailing space after 🏷️ + if _resolved_task: + task_note = None if task else "auto-detected" + print_kv( + console, + "Task:", + _resolved_task, + note=task_note, + icon="\U0001f3f7\ufe0f", + ) + + # Override files + if config_file: + console.print( + f" \U0001f4c1 [bold]Overrides:[/bold] {_override_file} [green]\u2713[/green]" + ) + if shape_config_file: + console.print( + f" \U0001f4c1 [bold]Shape config:[/bold] " + f"{_shape_config_file} [green]\u2713[/green]" + ) + + console.print() + + # I/O specs (always full detail) + if _is_onnx_mode: + print_io_specs_na(console) + elif _export_cfg is not None: + print_io_specs_detail(console, _export_cfg) + + console.print() + + # Resolution — read directly from the config object. + # No inference or reverse mapping — display what the config contains. 
+ _ref_config = config_obj if not module else (configs[0] if configs else None) + if _ref_config is not None: + _quant = _ref_config.quant + + console.print(" \u2699\ufe0f [bold]Resolution:[/bold]") + + # Fix #4: Device from resolve_device (existing API) + from ..sysinfo import resolve_device as _rd + + _resolved_dev, _ = _rd() + console.print(f" Device: [cyan]{_resolved_dev.upper()}[/cyan]") + + # EP — only shown when user explicitly passed --ep + if ep: + from ..utils.constants import normalize_ep_name + + _ep_full = normalize_ep_name(ep) or ep + console.print(f" EP: [cyan]{_ep_full}[/cyan]") + + # Quant types — display exactly what config contains + if _quant: console.print( - f"[green]Generated config for task '{config_obj.loader.task}'[/green]" + f" Quant: " + f"[cyan]{_quant.weight_type}/{_quant.activation_type}" + f"[/cyan] [dim](weight/activation)[/dim]" ) - output_data = config_obj.to_dict() + else: + console.print(" Quant: [dim]none[/dim]") + + # Module mode: show submodule list + if module and not _is_onnx_mode and _n_modules > 0: + console.print() + console.print( + f" \U0001f9e9 [bold]Submodules:[/bold] " + f"[green]{_n_modules}[/green] matching '{module}'" + ) - # Serialize to JSON + console.print() + + # ── Serialize and output ───────────────────────────────────── config_json = json.dumps(output_data, indent=2) - # Output to file or stdout if output: output_path = Path(output) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(config_json) - console.print(f"[green]Config saved to:[/green] {output}") + suffix = f" [dim]({_n_modules} submodules)[/dim]" if _n_modules else "" + print_success(console, f"Config saved to: [bold]{output}[/bold]{suffix}") else: + print_success(console, "Config written to stdout") # Print to stdout (not stderr where console prints) print(config_json) + console.print() + except click.UsageError: raise # Let click handle its own errors except ValueError as e: diff --git 
a/src/winml/modelkit/commands/eval.py b/src/winml/modelkit/commands/eval.py index 67d8fc2a8..36e2d366c 100644 --- a/src/winml/modelkit/commands/eval.py +++ b/src/winml/modelkit/commands/eval.py @@ -158,7 +158,7 @@ def eval( logging.getLogger("winml.modelkit").setLevel(logging.DEBUG) if show_schema: - from ..eval.base_evaluator import WinMLEvaluator + from ..eval import WinMLEvaluator from ..eval.evaluate import _EVALUATOR_REGISTRY if task is None: @@ -210,7 +210,7 @@ def eval( with Path(label_mapping).open() as f: parsed_label_mapping = json.load(f) - from ..datasets.config import DatasetConfig + from ..datasets import DatasetConfig from ..eval import WinMLEvaluationConfig, evaluate from ..sysinfo import resolve_device @@ -240,6 +240,7 @@ def eval( result = evaluate(config) from rich.console import Console + console = Console() display_eval_report(result, console) diff --git a/src/winml/modelkit/commands/export.py b/src/winml/modelkit/commands/export.py index 4e8f0b3f3..472a2f515 100644 --- a/src/winml/modelkit/commands/export.py +++ b/src/winml/modelkit/commands/export.py @@ -36,6 +36,28 @@ console = Console() +def _delete_onnx_with_external_data(onnx_path: Path) -> None: + """Delete an ONNX file and its external data files.""" + import onnx + from onnx.external_data_helper import ExternalDataInfo + + try: + model = onnx.load(str(onnx_path), load_external_data=False) + ext_files: set[str] = set() + for tensor in model.graph.initializer: + if tensor.data_location == onnx.TensorProto.EXTERNAL: + ext_files.add(ExternalDataInfo(tensor).location) + for name in ext_files: + data_path = onnx_path.parent / name + if data_path.exists(): + data_path.unlink() + except Exception: + logger.debug("Could not parse external data from %s", onnx_path, exc_info=True) + + if onnx_path.exists(): + onnx_path.unlink() + + @click.command() @click.option( "--model", @@ -170,8 +192,8 @@ def export( if ctx.obj.get("debug"): verbose = True - from ..export.config import InputTensorSpec, 
OutputTensorSpec, WinMLExportConfig - from ..export.pytorch import export_pytorch as export_onnx + from ..export import InputTensorSpec, OutputTensorSpec, WinMLExportConfig + from ..export import export_pytorch as export_onnx from ..loader import load_hf_model # Configure logging based on verbose flag @@ -244,7 +266,7 @@ def export( # Auto-resolve input/output tensors via loader + Optimum try: - from ..export.config import resolve_export_config as resolve_cfg + from ..export import resolve_export_config as resolve_cfg auto_export_cfg, _ = resolve_cfg( model_id=model, @@ -322,19 +344,27 @@ def export( else: console.print(f"[dim]Detected task: {detected_task}[/dim]") - # Export using export_onnx() - the single implementation path - result_path = export_onnx( + export_stats = export_onnx( model=pytorch_model, output_path=output_path, export_config=cfg, - model_id=model, # For metadata - task=detected_task, # Use detected task for proper OnnxConfig lookup + model_id=model, + task=detected_task, verbose=verbose, enable_reporting=with_report, ) + logger.debug("Export stats: %s", export_stats) + + # TODO: re-enable post-export optimization (shape inference, constant folding) + # Disabled: needs validation that optimize_onnx preserves HTP hierarchy tags. 
+ # from ..optim.api import optimize_onnx + # raw_path = output_path.with_stem(f"{output_path.stem}_raw") + # output_path.rename(raw_path) + # optimize_onnx(raw_path, output=output_path) + # _delete_onnx_with_external_data(raw_path) # Show results - console.print(f"\n[bold green]Success![/bold green] Model exported to: {result_path}") + console.print(f"\n[bold green]Success![/bold green] Model exported to: {output_path}") # Show report file locations if enabled if with_report: diff --git a/src/winml/modelkit/commands/inspect.py b/src/winml/modelkit/commands/inspect.py index b79f6a649..10fca4ca6 100644 --- a/src/winml/modelkit/commands/inspect.py +++ b/src/winml/modelkit/commands/inspect.py @@ -2,21 +2,26 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -"""Inspect command for ModelKit CLI. +"""Inspect input model's ModelKit configuration. -Displays detailed information about a HuggingFace model's compatibility -with ModelKit, including loader, exporter, and WinML configurations. +Resolves loader, exporter, and WinML inference class for a given model, +showing what the build pipeline will use. 
Usage: - winml inspect -m openai/clip-vit-base-patch32 + winml inspect -m microsoft/resnet-50 + winml inspect --model-type bert --task fill-mask winml inspect -m google-bert/bert-base-uncased --format json - winml inspect -m facebook/detr-resnet-50 --verbose - winml inspect -m openai/clip-vit-base-patch32 --hierarchy + winml inspect --list-tasks """ from __future__ import annotations import logging +from typing import TYPE_CHECKING + + +if TYPE_CHECKING: + from ..inspect.types import InspectResult import click from rich.console import Console @@ -26,12 +31,14 @@ console = Console() -@click.command() +@click.command("inspect") @click.option( "-m", "--model", - required=True, - help="HuggingFace model ID (e.g., openai/clip-vit-base-patch32)", + "model_id", + required=False, + default=None, + help="HuggingFace model ID (e.g., microsoft/resnet-50)", ) @click.option( "-f", @@ -61,53 +68,89 @@ default=False, help="Show HF module hierarchy (uses random weights, no weight download)", ) +@click.option( + "--list-tasks", + "list_tasks", + is_flag=True, + default=False, + help="List all known tasks and exit", +) +@click.option( + "--model-type", + "model_type", + default=None, + help="Override model type (e.g., bert, resnet) — can be used without --model", +) +@click.option( + "--model-class", + "model_class", + default=None, + help="Override model class (e.g., BertForMaskedLM) — can be used without --model", +) @click.pass_context def inspect( ctx: click.Context, - model: str, + model_id: str | None, output_format: str, verbose: bool, task: str | None, hierarchy: bool, + list_tasks: bool, + model_type: str | None, + model_class: str | None, ) -> None: - r"""Inspect a HuggingFace model's ModelKit configuration. + r"""Inspect input model's ModelKit configuration. - Shows the loader configuration, exporter configuration, and WinML - inference class that will be used for the specified model. 
+ Shows the loader, exporter, WinML inference class, I/O specs, + and build resolution that the pipeline will use for the given model. - This command helps you understand: - - Which HuggingFace model class will be used for loading - - What ONNX export configuration will be applied - - Which WinML inference class will handle the model - - Overall support status in ModelKit + Supports inspection without a model ID via --model-type or --model-class. \b Examples: # Basic inspection - winml inspect -m openai/clip-vit-base-patch32 + winml inspect -m microsoft/resnet-50 - # JSON output for scripting - winml inspect -m google-bert/bert-base-uncased --format json + # Inspect by model type only (no weight download) + winml inspect --model-type bert --task fill-mask - # Show full build configuration - winml inspect -m facebook/detr-resnet-50 --verbose + # Override model class + winml inspect -m custom-model --model-class BertForCTC - # Include HF module hierarchy (no weight download) - winml inspect -m openai/clip-vit-base-patch32 --hierarchy + # JSON output + winml inspect -m google-bert/bert-base-uncased --format json - # Combined verbose + hierarchy - winml inspect -m google-bert/bert-base-uncased -v -H + # List all known tasks + winml inspect --list-tasks """ - # Import here to defer heavy transformers/torch imports - from ..inspect import ( - InspectError, - ModelNotFoundError, - NetworkError, - inspect_model, - ) + # Handle --list-tasks (no model required) + if list_tasks: + from ..inspect.resolver import get_known_tasks + + for t in sorted(get_known_tasks()): + click.echo(t) + return + + # Validate: need at least one of model_id, model_type, model_class + if model_id is None and model_type is None and model_class is None: + raise click.UsageError( + "At least one of -m/--model, --model-type, or --model-class is required. " + "Use --list-tasks to see available tasks." 
+ ) + + # Handle ONNX file input + from pathlib import Path + + if model_id and model_id.endswith(".onnx") and Path(model_id).is_file(): + raise click.ClickException( + "ONNX file inspection is not yet supported. " + "Use 'winml config -m model.onnx' for ONNX build config." + ) + + from ..inspect import InspectError, ModelNotFoundError, NetworkError from ..inspect.formatter import output_json, output_table - # Inherit debug mode from parent + # Inherit debug mode from parent context if ctx.obj and ctx.obj.get("debug"): verbose = True @@ -116,7 +159,13 @@ def inspect( logging.getLogger("winml.modelkit").setLevel(logging.DEBUG) try: - result = inspect_model(model, include_hierarchy=hierarchy, task_override=task) + result = _inspect_model_v2( + model_id=model_id, + task_override=task, + model_type_override=model_type, + model_class_override=model_class, + include_hierarchy=hierarchy, + ) if output_format.lower() == "json": click.echo(output_json(result, verbose=verbose)) @@ -133,5 +182,244 @@ def inspect( raise click.ClickException(f"Inspection error: {e}") from e except (ValueError, RuntimeError, OSError) as e: - logger.exception("Failed to inspect model: %s", model) + logger.exception("Failed to inspect model") raise click.ClickException(f"Failed to inspect model: {e}") from e + + +def _inspect_model_v2( + model_id: str | None = None, + task_override: str | None = None, + model_type_override: str | None = None, + model_class_override: str | None = None, + include_hierarchy: bool = False, +) -> InspectResult: + """Inspect v2 core — calls shared loader/export modules directly. 
+ + Args: + model_id: HuggingFace model ID (optional when model_type_override set) + task_override: Task to use instead of auto-detected task + model_type_override: Model type override (e.g., "bert") + model_class_override: Model class override (e.g., "BertForMaskedLM") + include_hierarchy: Whether to extract module hierarchy + + Returns: + InspectResult dataclass + """ + import functools + + from transformers import AutoConfig + + from ..export import resolve_io_specs + from ..inspect import ( + ExporterInfo, + InspectError, + InspectResult, + LoaderInfo, + ModelNotFoundError, + NetworkError, + SupportLevel, + TensorInfo, + build_tensor_infos_from_io_specs, + compile_support_status, + resolve_cache, + resolve_io_config, + resolve_processor, + resolve_winml, + ) + from ..loader import HF_TASK_DEFAULTS, resolve_loader_config + from ..models import ( + HF_MODEL_CLASS_MAPPING, + MODEL_BUILD_CONFIGS, + ) + + # ========================================================================= + # STEP 1: Preserve parent hf_config before resolve_loader_config narrows it + # for multimodal models (e.g., CLIPConfig → CLIPTextConfig) + # ========================================================================= + parent_hf_config = None + if model_id and not model_type_override: + try: + parent_hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=False) + except Exception: + pass # resolve_loader_config will handle the error properly + + # ========================================================================= + # STEP 2: Shared loader resolution (same call as config command) + # ========================================================================= + try: + loader_config, hf_config, _resolved_class = resolve_loader_config( + model_id, + task=task_override, + model_type=model_type_override, + model_class=model_class_override, + ) + except ValueError as e: + err_str = str(e).lower() + if "not found" in err_str or "404" in err_str: + raise ModelNotFoundError(str(e)) 
from e + raise InspectError(str(e)) from e + except OSError as e: + raise NetworkError(str(e)) from e + + if parent_hf_config is None: + parent_hf_config = hf_config + + model_type = loader_config.model_type + task = loader_config.task + architectures = getattr(parent_hf_config, "architectures", []) or [] + + # ========================================================================= + # STEP 3: Derive task_source by checking registries post-hoc + # ========================================================================= + mt = model_type.lower().replace("_", "-") + task_source = "TasksManager" + for m, t in HF_MODEL_CLASS_MAPPING: + if m == mt and t == task: + task_source = "HF_MODEL_CLASS_MAPPING" + break + + # ========================================================================= + # STEP 4: Derive loader display info + # ========================================================================= + if (mt, task) in HF_MODEL_CLASS_MAPPING: + loader_source = "MODEL_CLASS_MAPPING" + loader_level = SupportLevel.SUPPORTED + elif task in HF_TASK_DEFAULTS: + loader_source = "HF_TASK_DEFAULTS" + loader_level = SupportLevel.DEFAULT + else: + loader_source = "TasksManager" + loader_level = SupportLevel.DEFAULT + + loader_info = LoaderInfo( + hf_model_class=loader_config.model_class or "Auto (TasksManager)", + hf_model_class_source=loader_source, + support_level=loader_level, + ) + + # ========================================================================= + # STEP 5: I/O tensor specs — registry first, then resolve_io_specs + # ========================================================================= + input_tensors: list[TensorInfo] = [] + output_tensors: list[TensorInfo] = [] + onnx_config_class = None + onnx_config_source = "none" + exporter_level = SupportLevel.UNSUPPORTED + opset_version = 17 + + # Path 1: Check MODEL_BUILD_CONFIGS registry for predefined config + registered = MODEL_BUILD_CONFIGS.get(mt) + if registered and registered.export and 
registered.export.input_tensors is not None: + export_cfg = registered.export + input_tensors = [ + TensorInfo(name=s.name or "unknown", dtype=s.dtype, shape=s.shape) + for s in export_cfg.input_tensors + ] + output_tensors = [ + TensorInfo(name=s.name or "unknown") for s in (export_cfg.output_tensors or []) + ] + onnx_config_class = f"{mt.upper()}IOConfig" + onnx_config_source = "MODEL_BUILD_CONFIGS" + exporter_level = SupportLevel.SUPPORTED + opset_version = export_cfg.opset_version + else: + # Path 2: resolve_io_specs (shared with config command) + try: + import optimum.exporters.onnx.model_configs # noqa: F401 + from optimum.exporters.tasks import TasksManager + + onnx_config_cls = TasksManager.get_exporter_config_constructor( + exporter="onnx", + model_type=model_type, + task=task, + library_name="transformers", + ) + if onnx_config_cls: + config_name = ( + onnx_config_cls.func.__name__ + if isinstance(onnx_config_cls, functools.partial) + else onnx_config_cls.__name__ + ) + onnx_config_class = config_name + onnx_config_source = "TasksManager" + exporter_level = SupportLevel.DEFAULT + + if hf_config is not None: + try: + io_specs = resolve_io_specs( + model_type=model_type, + task=task, + hf_config=hf_config, + model_id=model_id, + ) + input_tensors, output_tensors = build_tensor_infos_from_io_specs(io_specs) + except Exception as e: + logger.debug("resolve_io_specs failed for %s/%s: %s", model_type, task, e) + except Exception as e: + logger.debug("TasksManager lookup failed for %s/%s: %s", model_type, task, e) + + exporter_info = ExporterInfo( + onnx_config_class=onnx_config_class, + onnx_config_source=onnx_config_source, + support_level=exporter_level, + input_tensors=input_tensors, + output_tensors=output_tensors, + opset_version=opset_version, + ) + + # ========================================================================= + # STEP 6: WinML class (inspect-only lookup) + # ========================================================================= + 
winml_info = resolve_winml(model_type, task) + + # ========================================================================= + # STEP 7: Module hierarchy (optional, requires model_id) + # ========================================================================= + hierarchy_info = None + if include_hierarchy and model_id: + try: + from ..inspect.hierarchy import extract_hierarchy + + hierarchy_info = extract_hierarchy(model_id) + except Exception as e: + logger.debug("Hierarchy extraction failed for %s: %s", model_id, e) + + # ========================================================================= + # STEP 8: Overall support status + # ========================================================================= + overall_support, support_notes = compile_support_status(loader_info, exporter_info, winml_info) + + # ========================================================================= + # STEP 9: Build config (registry lookup only, no generation) + # ========================================================================= + build_config = registered.to_dict() if registered else None + + # ========================================================================= + # STEP 10: Inspect-only enrichment (conditional on model_id) + # ========================================================================= + cache_info = resolve_cache(model_id) if model_id else None + processor_info = resolve_processor(model_id, model_type=model_type) if model_id else None + io_config_info = resolve_io_config( + parent_hf_config, + model_id=model_id, + model_type=model_type, + task=task, + ) + + return InspectResult( + model_id=model_id or model_type or model_class_override or "unknown", + model_type=model_type, + architectures=architectures, + task=task, + task_source=task_source, + loader=loader_info, + exporter=exporter_info, + winml=winml_info, + overall_support=overall_support, + support_notes=support_notes, + build_config=build_config, + hierarchy=hierarchy_info, + cache=cache_info, 
+ processor=processor_info, + io_config=io_config_info, + ) diff --git a/src/winml/modelkit/commands/optimize.py b/src/winml/modelkit/commands/optimize.py index 7e410760a..1289da259 100644 --- a/src/winml/modelkit/commands/optimize.py +++ b/src/winml/modelkit/commands/optimize.py @@ -30,7 +30,7 @@ import click from rich.console import Console -from ..onnx import is_compiled_onnx, load_onnx, save_onnx +from ..onnx import load_onnx, save_onnx if TYPE_CHECKING: @@ -140,8 +140,7 @@ def capability_options(func: Callable) -> Callable: following the design pattern from modelkit/optim/cli.py. """ # Late import to speed up CLI startup - from ..optim.pipes import get_all_capabilities - from ..optim.registry import BoolCapability, ChoiceCapability, IntCapability + from ..optim import BoolCapability, ChoiceCapability, IntCapability, get_all_capabilities # Get all capabilities and reverse for correct Click ordering all_caps = list(get_all_capabilities().values()) @@ -275,10 +274,10 @@ def optimize( winml optimize -m model.onnx -c config.toml """ # Import capabilities (late import to speed up CLI) - from ..optim.pipes import get_all_capabilities - from ..optim.registry import ( + from ..optim import ( BoolCapability, auto_enable_dependencies, + get_all_capabilities, validate, validate_dependencies, ) @@ -287,7 +286,7 @@ def optimize( # Handle --list-capabilities if list_capabilities: - from ..optim.registry import ChoiceCapability, IntCapability + from ..optim import ChoiceCapability, IntCapability if not all_caps: console.print("[yellow]No capabilities registered.[/yellow]") @@ -382,12 +381,6 @@ def optimize( if model is None: raise click.UsageError("Missing option '--model' / '-m'.") - if is_compiled_onnx(model): - raise click.ClickException( - f"{model} is a compiled EPContext model and cannot be optimized. " - "Run 'winml optimize' on the original ONNX model before compilation." 
- ) - # Inherit debug mode from parent if ctx.obj and ctx.obj.get("debug"): verbose = True @@ -397,7 +390,7 @@ def optimize( logging.getLogger("winml.modelkit").setLevel(logging.DEBUG) # Import optimizer - from ..optim.optimizer import Optimizer + from ..optim import Optimizer # Determine output path if output is None: @@ -425,6 +418,8 @@ def optimize( # 3. Apply config file if specified (overrides preset/defaults) if config: file_config = load_config(config) + # Normalize snake_case keys to kebab-case (accept both formats) + file_config = {k.replace("_", "-"): v for k, v in file_config.items()} final_config.update(file_config) console.print(f"[dim]Loaded config from: {config}[/dim]") diff --git a/src/winml/modelkit/commands/perf.py b/src/winml/modelkit/commands/perf.py index ecfcbf8d3..fdbe5001d 100644 --- a/src/winml/modelkit/commands/perf.py +++ b/src/winml/modelkit/commands/perf.py @@ -25,10 +25,9 @@ import click import numpy as np from rich.console import Console -from rich.panel import Panel from rich.table import Table -from .live_chart import LiveMonitorDisplay +from ._live_chart import LiveMonitorDisplay if TYPE_CHECKING: @@ -58,6 +57,84 @@ } +# ============================================================================= +# EP Monitor Dispatch +# ============================================================================= + + +def _resolve_ep_monitor( + ep: str | None, + op_tracing: str | None, + output_dir: Path, + device: str | None = None, +) -> Any: + """Pick the EPMonitor for the requested EP and optional op-tracing level. + + Explicit dispatch — no registry, no plugin loading. Raises RuntimeError + when op-tracing is requested against an EP that has no op-tracing monitor. + + EP names are matched case-insensitively (``QNN``, ``Qnn``, ``qnn`` all + behave identically). When ``op_tracing`` is set and ``ep`` is empty, + ``device`` is consulted to auto-infer the EP (e.g. ``device="npu"`` + selects QNN when QNNMonitor reports availability). 
This keeps the + headline ``wmk perf --device npu --op-tracing basic`` invocation working + without requiring an explicit ``--ep qnn``. + + Args: + ep: Short EP name from CLI (e.g. "qnn", "vitisai", "cpu", None/empty). + op_tracing: "basic" | "detail" | None (from --op-tracing flag). + output_dir: Directory for monitor artifacts (CSV, schematic, etc.). + device: Device hint from CLI (``"npu"``, ``"cpu"``, etc.). Used only + to auto-infer EP when ``op_tracing`` is set and ``ep`` is empty. + + Returns: + An EPMonitor subclass instance. NullEPMonitor when no monitor applies. + + Raises: + RuntimeError: If op_tracing is truthy but the EP has no op-tracing + monitor available on this system. + """ + from ..session.monitor.ep_monitor import NullEPMonitor + + ep_norm = (ep or "").lower() + device_norm = (device or "").lower() + + if op_tracing: + from ..session.monitor.qnn_monitor import QNNMonitor + + # Auto-infer EP when not explicitly set. --op-tracing is itself a + # strong intent signal for QNN-only profiling, so: + # --device npu -> QNN (SC-1 invocation) + # --device auto -> QNN when available (default CLI invocation) + # --device "" -> QNN when available (programmatic callers) + # Explicit --device cpu / --device gpu still falls through to the + # hard-fail branch below — those EPs have no op-tracing monitor. + if not ep_norm and device_norm in ("npu", "auto", "") and QNNMonitor.is_available(): + ep_norm = "qnn" + + if ep_norm == "qnn": + if not QNNMonitor.is_available(): + raise RuntimeError( + "Op-tracing requires QNN EP, but QNN is not available on this system. " + "Install onnxruntime-qnn or onnxruntime-windowsml with QNN runtime, " + "or run `wmk perf` without --op-tracing." + ) + return QNNMonitor(level=op_tracing, output_dir=output_dir) + + raise RuntimeError( + f"Op-tracing not available for EP {ep!r} on device {device!r}. " + "Op-tracing currently requires QNN. 
Ensure QNN is available " + "(install onnxruntime-qnn or onnxruntime-windowsml with QNN runtime)." + ) + + # Proof-of-execution monitors (no op-tracing) + from ..session.monitor.vitisai_monitor import VitisAIMonitor + + if ep_norm == "vitisai" and VitisAIMonitor.is_available(): + return VitisAIMonitor() + return NullEPMonitor() + + # ============================================================================= # Data Classes # ============================================================================= @@ -81,6 +158,7 @@ class BenchmarkConfig: monitor: bool = False ep: str | None = None shape_config: dict | None = None + op_tracing: str | None = None @dataclass @@ -178,8 +256,8 @@ def generate_random_inputs( ) -> dict[str, np.ndarray]: """Generate random inputs based on model io_config. - Uses modelkit.core.model_input_generator for spec-driven generation, - then converts torch tensors to numpy for ONNX Runtime. + Uses modelkit.core.model_input_generator for spec-driven generation. + Returns numpy arrays directly (no torch dependency). Args: io_config: Model I/O configuration from WinMLSession.io_config. 
@@ -191,7 +269,7 @@ def generate_random_inputs( Returns: Dictionary of input_name -> numpy array """ - from ..core.model_input_generator import generate_dummy_inputs_from_specs + from ..core import generate_dummy_inputs_from_specs specs: dict[str, dict[str, Any]] = {} for name, shape, dtype_str in zip( @@ -217,8 +295,7 @@ def generate_random_inputs( "shape": list(resolved_shape), } - torch_inputs = generate_dummy_inputs_from_specs(specs) - return {name: tensor.numpy() for name, tensor in torch_inputs.items()} + return generate_dummy_inputs_from_specs(specs) def _resolve_shape( @@ -292,6 +369,16 @@ def run(self) -> BenchmarkResult: logger.info("Generating benchmark inputs") self._generate_inputs() + # Compile session early so model.device is resolved for display + self._model._session.compile() + + # Print model info before benchmark starts + _print_model_info( + self._model.io_config, + task=self._model.task or self.config.task, + device=self._model.device, + ) + # [3] Run benchmark logger.info( "Running benchmark: %d iterations + %d warmup", @@ -357,8 +444,14 @@ def _generate_inputs(self) -> None: ) def _run_benchmark(self) -> PerfStats: - """Execute benchmark iterations with timing.""" - if self.config.monitor: + """Execute benchmark iterations with timing. + + Dispatches to the monitored path whenever ``--monitor`` was passed OR + ``--op-tracing`` was requested. Op-tracing requires the EP monitor to + wrap ``session.perf()``, so the simple no-monitor path cannot fulfill + it; routing both flags through the same code path guarantees parity. 
+ """ + if self.config.monitor or self.config.op_tracing: return self._run_benchmark_monitored() return self._run_benchmark_simple() @@ -367,93 +460,87 @@ def _run_benchmark_simple(self) -> PerfStats: session = self._model._session total_iterations = self.config.warmup + self.config.iterations - with session.perf(warmup=self.config.warmup) as stats: - for i in range(total_iterations): - session.run(self._inputs) - - # Progress logging (every 10%) - if (i + 1) % max(1, total_iterations // 10) == 0: - logger.debug("Progress: %d/%d", i + 1, total_iterations) + with session.perf(warmup=self.config.warmup) as ctx: + _run_simple_loop(session, self._inputs, total_iterations) - return stats + # Expose ctx for post-benchmark reporting (parity with monitored path). + self._perf_ctx = ctx + return ctx.stats def _run_benchmark_monitored(self) -> PerfStats: - """Execute benchmark with live hardware monitoring. - - Always runs HWMonitor for system-wide metrics (CPU, RAM, NPU/GPU). - Optionally runs an EP-specific monitor (e.g., VitisAIMonitor) - alongside for vendor proof-of-execution. Uses NullEPMonitor when - no vendor monitor is available, eliminating null checks. + """Execute benchmark with live hardware monitoring and/or op-tracing. + + Resolves the EP-specific monitor (e.g., QNNMonitor, VitisAIMonitor) + via :func:`_resolve_ep_monitor` (NullEPMonitor when nothing applies). + The EP monitor is integrated into ``session.perf()`` so op-tracing + observes the user's actual benchmark iterations. + + HWMonitor (system-wide CPU/RAM/NPU metrics) is engaged when available + AND either ``--monitor`` was set or HW data is otherwise needed. When + HWMonitor is unavailable but op-tracing is still requested, the run + proceeds with the EP monitor only — op-tracing is the headline goal + and must not be blocked by missing HW telemetry. 
""" - from ..session.monitor.ep_monitor import NullEPMonitor from ..session.monitor.hw_monitor import HWMonitor - from ..session.monitor.vitisai_monitor import VitisAIMonitor session = self._model._session total_iterations = self.config.warmup + self.config.iterations - if not HWMonitor.is_available(): + output_dir = self.config.output_path.parent if self.config.output_path else Path.cwd() + try: + ep_monitor = _resolve_ep_monitor( + ep=self.config.ep, + op_tracing=self.config.op_tracing, + output_dir=output_dir, + device=self.config.device, + ) + except RuntimeError as e: + Console(stderr=True).print(f"[red]Error:[/red] {e}") + raise SystemExit(1) from None + + # HWMonitor is best-effort: required only for the live-chart UI on + # --monitor. When it's unavailable but op-tracing is requested, run + # without HW telemetry rather than degrading op-tracing to a no-op. + hw_available = HWMonitor.is_available() + if self.config.monitor and not hw_available: Console(stderr=True).print( "[yellow]Warning:[/yellow] HWMonitor unavailable on this system. " "Running without hardware monitoring." ) - return self._run_benchmark_simple() - - hw_monitor = HWMonitor(poll_interval_ms=_HW_POLL_INTERVAL_MS) - # EP-specific proof-of-execution monitor. - # When QNN/OpenVINO monitors become real, add entries here. 
- _ep_monitors = {"vitisai": VitisAIMonitor} - ep = self.config.ep - monitor_cls = _ep_monitors.get(ep) - if monitor_cls and monitor_cls.is_available(): - ep_monitor = monitor_cls() - else: - ep_monitor = NullEPMonitor() - - with ( - session.perf(warmup=self.config.warmup) as stats, - hw_monitor as hw, - ep_monitor as ep_mon, - ): - display = LiveMonitorDisplay( - total_iterations=total_iterations, - warmup=self.config.warmup, - model_id=self.config.model_id, - device=self.config.device, - ) - with display: - for i in range(total_iterations): - session.run(self._inputs) - - latest_latency = stats.all_samples_ms[-1] if stats.all_samples_ms else 0 - display.update( - iteration=i + 1, - latency_ms=latest_latency, - util_samples=hw.utilization_samples, - memory_local_mb=hw.peak_memory_local_mb, - memory_shared_mb=hw.peak_memory_shared_mb, - cpu_pct=hw.mean_cpu_pct, - ram_mb=hw.ram_used_mb, - cpu_samples=hw.cpu_samples, - ) - - # Print final monitor snapshot - display.print_final_snapshot( - util_samples=hw.utilization_samples, - memory_mb=hw.peak_memory_mb, - latency_ms=stats.mean_ms, - hw_dict=hw.to_dict(), - cpu_samples=hw.cpu_samples, - ) + if hw_available: + hw_monitor = HWMonitor(poll_interval_ms=_HW_POLL_INTERVAL_MS) + with ( + session.perf(warmup=self.config.warmup, monitor=ep_monitor) as ctx, + hw_monitor as hw, + ): + _run_monitored_loop( + session, + self._inputs, + ctx.stats, + hw, + total_iterations=total_iterations, + warmup=self.config.warmup, + model_id=self.config.model_id, + device=self.config.device, + ) + self._hw_metrics = hw.to_dict() - # Store hardware metrics - self._hw_metrics = hw.to_dict() - ep_dict = ep_mon.to_dict() + # EP proof data is accessible via ctx.monitor.to_dict() + ep_dict = ctx.monitor.to_dict() if ep_dict: # NullEPMonitor returns {}, real monitors return data self._hw_metrics["ep_proof"] = ep_dict + else: + # HW unavailable: run with EP monitor only (op-tracing path). 
+ with session.perf(warmup=self.config.warmup, monitor=ep_monitor) as ctx: + _run_simple_loop(session, self._inputs, total_iterations) + ep_dict = ctx.monitor.to_dict() + if ep_dict: + self._hw_metrics = {"ep_proof": ep_dict} - return stats + # Store the op-trace context for post-benchmark reporting + self._perf_ctx = ctx + return ctx.stats def _collect_results(self, stats: PerfStats) -> BenchmarkResult: """Collect benchmark results from PerfStats.""" @@ -494,9 +581,9 @@ def _collect_results(self, stats: PerfStats) -> BenchmarkResult: # Throughput samples_per_sec=samples_per_sec, batches_per_sec=batches_per_sec, - # Actual values - actual_device=self._model._session.device, - actual_task=self.config.task or "auto-detected", + # Actual values (resolved after build + compile) + actual_device=self._model.device, + actual_task=self._model.task or self.config.task or "auto-detected", # Hardware monitor metrics (only present when --monitor is used) hw_monitor=getattr(self, "_hw_metrics", None), ) @@ -626,16 +713,16 @@ def _perf_modules( hw_ctx = HWMonitor(poll_interval_ms=_HW_POLL_INTERVAL_MS) if hw_ctx: - with session.perf(warmup=warmup) as stats, hw_ctx as hw: + with session.perf(warmup=warmup) as ctx, hw_ctx as hw: for _ in range(total_iters): session.run(inputs) hw_metrics = hw.to_dict() + mod_stats = ctx.stats else: - with session.perf(warmup=warmup) as stats: + with session.perf(warmup=warmup) as ctx: for _ in range(total_iters): session.run(inputs) - - mod_stats = stats + mod_stats = ctx.stats result_entry: dict[str, Any] = { "module_path": module_path, "mean_ms": round(mod_stats.mean_ms, 3), @@ -719,24 +806,26 @@ def _perf_modules( def display_console_report(result: BenchmarkResult, console: Console) -> None: """Display benchmark results in formatted console output.""" - # Header + # Info section — show "requested (resolved)" when they differ console.print() - console.print( - Panel.fit( - f"[bold]Benchmark: {result.config.model_id}[/bold]", - 
border_style="blue", - ) - ) - # Info section - console.print() - console.print(f"[dim]Device:[/dim] {result.actual_device}") - console.print(f"[dim]Precision:[/dim] {result.config.precision}") - console.print(f"[dim]Task:[/dim] {result.actual_task}") - console.print( - f"[dim]Iterations:[/dim] {result.config.iterations} (+ {result.config.warmup} warmup)" - ) - console.print(f"[dim]Batch Size:[/dim] {result.config.batch_size}") + req_device = result.config.device + act_device = result.actual_device + device_str = f"{req_device} ({act_device})" if req_device != act_device else act_device + console.print(f"[dim]Device:[/dim] {device_str}") + + # TODO: show resolved precision once WinMLPreTrainedModel.precision + # is implemented (derive from _build_config.quant.weight_type) + + act_task = result.actual_task + if act_task.startswith("n/a"): + task_str = act_task + else: + req_task = result.config.task or "auto" + task_str = f"{req_task} ({act_task})" if req_task != act_task else act_task + console.print(f"[dim]Task:[/dim] {task_str}") + + # I/O tensor info is printed before the benchmark via _print_model_info() # Latency table console.print() @@ -814,6 +903,205 @@ def generate_output_path(model_id: str) -> Path: return Path(f"{slug}_perf.json") +# ============================================================================= +# Shared benchmark helpers +# ============================================================================= + + +def _print_model_info( + io_config: dict, + *, + task: str | None = None, + device: str = "auto", +) -> None: + """Print model I/O metadata before the benchmark starts.""" + console = Console(stderr=True) + console.print() + console.print(f"[dim]Device:[/dim] {device}") + # TODO: show resolved precision once WinMLPreTrainedModel.precision + # is implemented (derive from _build_config.quant.weight_type) + if task: + console.print(f"[dim]Task:[/dim] {task}") + + names = io_config.get("input_names", []) + shapes = 
io_config.get("input_shapes", []) + types = io_config.get("input_types", []) + if names: + label = "[dim]Inputs:[/dim] " + pad = " " + for i, name in enumerate(names): + shape = shapes[i] if i < len(shapes) else [] + dtype = str(types[i]) if i < len(types) else "" + shape_str = f"{shape!s}" + line = f"{name:<20s} {shape_str:<22s} {dtype}" + console.print(f"{label if i == 0 else pad}{line}") + + out_names = io_config.get("output_names", []) + out_shapes = io_config.get("output_shapes", []) + if out_names: + label = "[dim]Outputs:[/dim] " + pad = " " + for i, name in enumerate(out_names): + shape = out_shapes[i] if i < len(out_shapes) else [] + console.print(f"{label if i == 0 else pad}{name:<20s} {shape!s}") + + console.print() + + +def _run_monitored_loop( + session: Any, + inputs: dict[str, Any], + stats: PerfStats, + hw: Any, + *, + total_iterations: int, + warmup: int, + model_id: str, + device: str, +) -> None: + """Run the benchmark iteration loop with live hardware monitoring. + + Shared by both HF-path (PerfBenchmark) and ONNX-path (_run_onnx_benchmark). + """ + display = LiveMonitorDisplay( + total_iterations=total_iterations, + warmup=warmup, + model_id=model_id, + device=device, + ) + with display: + for i in range(total_iterations): + session.run(inputs) + + latest_latency = stats.all_samples_ms[-1] if stats.all_samples_ms else 0 + display.update( + iteration=i + 1, + latency_ms=latest_latency, + util_samples=hw.utilization_samples, + memory_local_mb=hw.peak_memory_local_mb, + memory_shared_mb=hw.peak_memory_shared_mb, + cpu_pct=hw.mean_cpu_pct, + ram_mb=hw.ram_used_mb, + cpu_samples=hw.cpu_samples, + ) + + +def _run_simple_loop( + session: Any, + inputs: dict[str, Any], + total_iterations: int, +) -> None: + """Run the benchmark iteration loop with periodic debug logging. + + Shared by both HF-path (PerfBenchmark) and ONNX-path (_run_onnx_benchmark). 
+ """ + for i in range(total_iterations): + session.run(inputs) + + if (i + 1) % max(1, total_iterations // 10) == 0: + logger.debug("Progress: %d/%d", i + 1, total_iterations) + + +# ============================================================================= +# ONNX Direct Benchmark +# ============================================================================= + + +def _run_onnx_benchmark( + onnx_path: Path, + *, + device: str, + iterations: int, + warmup: int, + batch_size: int, + config: BenchmarkConfig, +) -> BenchmarkResult: + """Benchmark an ONNX file directly via WinMLSession (no HF build). + + Creates a WinMLSession, reads io_config for input shapes, + generates random inputs, and runs the standard benchmark loop. + """ + from ..session import WinMLSession + + session = WinMLSession(onnx_path=onnx_path, device=device) + + # Generate random inputs from session's I/O config + io_cfg = session.io_config + inputs = generate_random_inputs(io_config=io_cfg, batch_size=batch_size) + + # Compile session early so session.device is resolved for display + session.compile() + + # Print model info before benchmark starts + _print_model_info(io_cfg, device=session.device) + + # Run benchmark + total_iterations = warmup + iterations + hw_metrics = None + hw_ctx = None + + # Determine if hardware monitoring is available + if config.monitor: + from ..session.monitor.hw_monitor import HWMonitor + + if HWMonitor.is_available(): + hw_ctx = HWMonitor(poll_interval_ms=_HW_POLL_INTERVAL_MS) + else: + Console(stderr=True).print( + "[yellow]Warning:[/yellow] HWMonitor unavailable. " + "Running ONNX benchmark without monitoring." 
+ ) + + if hw_ctx: + with session.perf(warmup=warmup) as ctx, hw_ctx as hw: + _run_monitored_loop( + session, + inputs, + ctx.stats, + hw, + total_iterations=total_iterations, + warmup=warmup, + model_id=str(onnx_path.name), + device=device, + ) + hw_metrics = hw.to_dict() + stats = ctx.stats + else: + with session.perf(warmup=warmup) as ctx: + _run_simple_loop(session, inputs, total_iterations) + stats = ctx.stats + + # Collect results + mean_latency_sec = stats.mean_ms / 1000.0 + samples_per_sec = batch_size / mean_latency_sec if mean_latency_sec > 0 else 0 + batches_per_sec = 1.0 / mean_latency_sec if mean_latency_sec > 0 else 0 + samples = stats.samples_ms + std_ms = float(np.std(samples)) if samples else 0.0 + + return BenchmarkResult( + config=config, + input_names=io_cfg["input_names"], + input_shapes=[list(s) if s else [] for s in io_cfg["input_shapes"]], + input_types=[str(t) for t in io_cfg["input_types"]], + output_names=io_cfg["output_names"], + output_shapes=[list(s) if s else [] for s in io_cfg["output_shapes"]], + mean_ms=stats.mean_ms, + min_ms=stats.min_ms, + max_ms=stats.max_ms, + p50_ms=stats.p50_ms, + p90_ms=stats.p90_ms, + p95_ms=stats.p95_ms, + p99_ms=stats.p99_ms, + std_ms=std_ms, + raw_samples_ms=stats.samples_ms, + samples_per_sec=samples_per_sec, + batches_per_sec=batches_per_sec, + actual_device=session.device, + actual_task="n/a (direct ONNX)", + hw_monitor=hw_metrics, + ) + + # ============================================================================= # CLI Command # ============================================================================= @@ -847,7 +1135,12 @@ def generate_output_path(model_id: str) -> Path: type=int, default=100, show_default=True, - help="Number of benchmark iterations", + help=( + "Number of benchmark iterations. " + "When --op-tracing is set without an explicit --iterations, " + "defaults to 1 (a single inference produces a usable per-op trace; " + "more iterations just inflate the CSV)." 
+ ), ) @click.option( "--warmup", @@ -937,7 +1230,9 @@ def generate_output_path(model_id: str) -> Path: "op_tracing", type=click.Choice(["basic", "detail"], case_sensitive=False), default=None, - help="Enable operator-level profiling (requires onnxruntime-qnn)", + help="Enable operator-level profiling (requires onnxruntime-qnn). " + "Currently supported only for HuggingFace model IDs and built model " + "directories — not for direct .onnx file inputs.", ) @click.option( "--compare-devices", @@ -981,7 +1276,7 @@ def perf( from the model's I/O configuration. Accepts both HuggingFace model IDs and local .onnx files. - Both paths go through PerfBenchmark with WinMLAutoModel. + HF models go through PerfBenchmark; .onnx files use _run_onnx_benchmark. \b Examples: @@ -1023,6 +1318,14 @@ def perf( hf_model = model_id + # Smart default: --op-tracing produces a usable per-op trace from a single + # inference; the default 100 iterations just inflates the profiling CSV + # without adding profiling value (operators are averaged across iterations). + # When the user did not explicitly pass --iterations alongside --op-tracing, + # collapse to 1. + if op_tracing and ctx.get_parameter_source("iterations") == click.core.ParameterSource.DEFAULT: + iterations = 1 + # Setup logging if verbose or (ctx.obj and ctx.obj.get("debug")): logging.getLogger("winml.modelkit").setLevel(logging.DEBUG) @@ -1100,13 +1403,25 @@ def perf( monitor=monitor, ep=ep.lower() if ep else None, shape_config=shape_config, + op_tracing=op_tracing, ) + model_path = Path(hf_model) + is_onnx = model_path.suffix.lower() == ".onnx" + + # NFR-2: --op-tracing on a direct .onnx input is not yet supported. + # _run_onnx_benchmark does not thread the EP monitor through session.perf + # yet — fail fast and clearly rather than running the benchmark and + # silently producing no profiling data. + if op_tracing and is_onnx: + raise click.UsageError( + "--op-tracing is not yet supported for direct ONNX file inputs. 
" + "Use a HuggingFace model ID or a built model directory." + ) + try: - # Both ONNX and HF go through PerfBenchmark (unified pipeline) - model_path = Path(hf_model) - is_onnx = model_path.suffix.lower() == ".onnx" if is_onnx: + # ONNX direct path -- skip HF build, benchmark via WinMLSession if shape_config: console.print( "[yellow]Warning:[/yellow] --shape-config is ignored for " @@ -1115,14 +1430,28 @@ def perf( config.shape_config = None if not model_path.exists(): raise FileNotFoundError(f"ONNX file not found: {model_path}") - console.print(f"[dim]Building + benchmarking ONNX:[/dim] {model_path}") + console.print(f"[dim]Benchmarking ONNX:[/dim] {model_path}") + + from ..sysinfo import resolve_device + + resolved_device, _ = resolve_device(device=config.device) + + result = _run_onnx_benchmark( + model_path, + device=resolved_device, + iterations=iterations, + warmup=warmup, + batch_size=batch_size, + config=config, + ) else: + # HF model path -- full build + benchmark via PerfBenchmark if precision != "auto": console.print(f"[dim]Precision: {precision} (applied during model build)[/dim]") console.print(f"[dim]Loading model:[/dim] {hf_model}") - benchmark = PerfBenchmark(config) - result = benchmark.run() + benchmark = PerfBenchmark(config) + result = benchmark.run() # Display console report display_console_report(result, console) @@ -1132,58 +1461,51 @@ def perf( console.print(f"[green]Results saved to:[/green] {output}") # ================================================================= - # Op-tracing (additive to existing benchmark) + # Op-tracing post-benchmark report + # Op-tracing is integrated into session.perf(monitor=...) via + # _resolve_ep_monitor; the monitor observes the actual benchmark + # iterations rather than a separate synthetic profiling pass. + # + # NFR-2: when op_tracing was requested, missing/failed profiling data + # is an ERROR (exit 4), NOT a soft warning. 
The only degraded-success + # status is "basic_fallback" (yellow notice, exit 0). # ================================================================= if op_tracing: - from ..optracing import is_qnn_profiling_available + from ..session.monitor.report import display_op_trace_report, write_op_trace_json - if not is_qnn_profiling_available(): - console.print("[red]Error:[/red] Op-tracing requires onnxruntime-qnn") - console.print("Install with: [bold]pip install onnxruntime-qnn[/bold]") - raise SystemExit(1) - - from ..optracing.registry import get_tracer - from ..optracing.report import ( - display_op_trace_report, - write_op_trace_json, - ) + # ONNX direct path is rejected upstream with click.UsageError; + # only the HF / PerfBenchmark path reaches here with op_tracing. + perf_ctx = getattr(benchmark, "_perf_ctx", None) + trace_result = perf_ctx.monitor.result if perf_ctx is not None else None - # Determine the ONNX model path from the benchmark flow. - # For HF models the ONNX is built internally by PerfBenchmark. - try: - onnx_for_trace = model_path if is_onnx else benchmark._model._onnx_path - except AttributeError: + if trace_result is None: console.print( - "[red]Error:[/red] Could not determine ONNX model path for op-tracing" + "[red]Error:[/red] Op-tracing requested but no profiling data was " + "produced. Check that the EP is correctly installed and the model " + "compiled successfully." ) - raise SystemExit(1) from None - - output_dir = output.parent if output else Path() - - # Look up tracer via registry (EP-agnostic). - tracer_cls = get_tracer("QNNExecutionProvider", op_tracing) - if tracer_cls is None: + sys.exit(4) + if trace_result.status == "no_data": + detail = trace_result.error or "no CSV written" console.print( - f"[red]Error:[/red] No tracer registered for QNN EP at level '{op_tracing}'" + f"[red]Error:[/red] Op-tracing produced no profiling data " + f"({detail}). The EP may have silently fallen back to CPU." 
+ ) + sys.exit(4) + if trace_result.status == "parse_failed": + console.print( + f"[red]Error:[/red] Op-tracing artifact parse failed: {trace_result.error}" + ) + sys.exit(4) + if trace_result.status == "basic_fallback": + console.print( + "[yellow]Notice:[/yellow] Detail mode degraded to basic CSV " + "(QHAS unavailable; set QNN_SDK_ROOT to enable)." ) - raise SystemExit(1) - - profiler = tracer_cls( - onnx_for_trace, - output_dir=output_dir, - level=op_tracing, - ) - trace_result = profiler.run( - iterations=min(iterations, 10), - warmup=min(warmup, 3), - ) - # Display and save display_op_trace_report(trace_result, console) - model_slug = hf_model.replace("/", "_").replace("\\", "_") - if is_onnx: - model_slug = model_path.stem + output_dir = output.parent if output else Path.cwd() trace_output = output_dir / f"{model_slug}_op_trace.json" write_op_trace_json(trace_result, trace_output) console.print(f"[green]Op-trace saved to:[/green] {trace_output}") diff --git a/src/winml/modelkit/commands/quantize.py b/src/winml/modelkit/commands/quantize.py index 5dae75b4b..fdace93b1 100644 --- a/src/winml/modelkit/commands/quantize.py +++ b/src/winml/modelkit/commands/quantize.py @@ -25,7 +25,6 @@ import click from rich.console import Console -from ..onnx import is_compiled_onnx from ..utils.logging import configure_logging @@ -92,6 +91,12 @@ default=False, help="Use symmetric quantization", ) +@click.option( + "--task", + type=str, + default=None, + help="Task for calibration dataset selection (e.g., 'image-classification').", +) @click.option( "--verbose", "-v", @@ -111,6 +116,7 @@ def quantize( activation_type: str | None, per_channel: bool, symmetric: bool, + task: str | None, verbose: bool, ) -> None: r"""Quantize ONNX model by inserting QDQ nodes. @@ -142,12 +148,6 @@ def quantize( configure_logging(verbose=verbose) - if is_compiled_onnx(model): - raise click.ClickException( - f"{model} is a compiled EPContext model and cannot be quantized. 
" - "Run 'winml quantize' on the original ONNX model before compilation." - ) - # Import quantizer (late import to speed up CLI) from ..quant import WinMLQuantizationConfig, quantize_onnx @@ -178,8 +178,18 @@ def quantize( activation_type=resolved_activation, per_channel=per_channel, symmetric=symmetric, + task=task, ) + # Display dataset info from config + if config.dataset_name: + _dataset_display = config.dataset_name + elif config.task and config.task != "random": + _dataset_display = f"Default for task '{config.task}'" + else: + _dataset_display = "Random data (synthetic from ONNX I/O specs)" + console.print(f"[bold blue]Dataset:[/bold blue] {_dataset_display}") + try: console.print("\n[bold]Running quantization...[/bold]") result = quantize_onnx(model, output_path=output, config=config) @@ -216,7 +226,7 @@ def _resolve_quant_types( Returns: Tuple of (weight_type, activation_type). """ - from ..config.precision import is_quantized_precision, resolve_quant_types + from ..config import is_quantized_precision, resolve_quant_types if precision and is_quantized_precision(precision): default_w, default_a = resolve_quant_types(precision) diff --git a/src/winml/modelkit/commands/sys.py b/src/winml/modelkit/commands/sys.py index 86cb9c9dd..dc216e679 100644 --- a/src/winml/modelkit/commands/sys.py +++ b/src/winml/modelkit/commands/sys.py @@ -465,7 +465,7 @@ def _gather_ep_info() -> list[dict[str, Any]]: # Try WinML EP Registry first try: - from ..session.ep_registry import WinMLEPRegistry + from ..session import WinMLEPRegistry registry = WinMLEPRegistry.get_instance() winml_eps = registry.get_available_eps() diff --git a/src/winml/modelkit/config/__init__.py b/src/winml/modelkit/config/__init__.py index cc27f9c1e..e7a7ee2e7 100644 --- a/src/winml/modelkit/config/__init__.py +++ b/src/winml/modelkit/config/__init__.py @@ -30,6 +30,7 @@ generate_onnx_build_config, ) from .precision import ( + VALID_EPS, PrecisionPolicy, is_quantized_precision, resolve_precision, @@ -38,6 
+39,7 @@ __all__ = [ + "VALID_EPS", "PrecisionPolicy", "WinMLBuildConfig", "generate_build_config", diff --git a/src/winml/modelkit/config/build.py b/src/winml/modelkit/config/build.py index 1eca3bfc4..17da4ab0b 100644 --- a/src/winml/modelkit/config/build.py +++ b/src/winml/modelkit/config/build.py @@ -56,8 +56,7 @@ WinMLExportConfig, _resolve_export_config_from_specs, ) -from ..loader import resolve_loader_config -from ..loader.config import WinMLLoaderConfig +from ..loader.config import WinMLLoaderConfig, resolve_loader_config from ..optim.config import WinMLOptimizationConfig from ..quant.config import WinMLQuantizationConfig from ..utils.config_utils import merge_config @@ -464,11 +463,10 @@ def generate_hf_build_config( Orchestration Flow: 1. loader.resolve_loader_config() -> (WinMLLoaderConfig, hf_config, resolved_class) (includes sub-config consolidation for multimodal) - 2. MODEL_BUILD_CONFIGS.get() — registry lookup - 3. Try Optimum export config; on failure use empty placeholder - 4. Merge registered export on top (registry always wins) - 5. _assemble_config() + merge -> WinMLBuildConfig - 6. If module: specialize for each matching submodule + 2. MODEL_BUILD_CONFIGS.get() — registry lookup (may short-circuit step 3) + 3. export._resolve_export_config_from_specs() OR registered export config + 4. _assemble_config() + merge -> WinMLBuildConfig + 5. If module: specialize for each matching submodule Args: model_id: HuggingFace model ID (e.g., "bert-base-uncased") or local path. @@ -521,9 +519,26 @@ class name. Uses torchinfo to discover submodules and infer # ========================================================================= # STEP 3: Generate export config # ========================================================================= - # Try Optimum first; if model is unsupported, use empty placeholder. - # Then always merge registered export config on top (registry wins). - try: + # Priority: registered config with I/O specs > Optimum lookup. 
+ # Models not in Optimum's TasksManager (e.g., BLIP) crash at + # _resolve_export_config_from_specs(). If the registry already has + # input_tensors, use them directly and skip the Optimum path. + # Note: None means "not configured" (fall through to Optimum); + # [] would mean "explicitly no inputs" (use as-is, skip Optimum). + _registered_export = registered.export if registered else None + if _registered_export is not None and _registered_export.input_tensors is not None: + # deepcopy to avoid mutating the shared registry singleton + export_config = copy.deepcopy(_registered_export) + logger.info( + "Using registered export config for '%s' (skipping Optimum lookup)", + _registry_key, + ) + else: + # Standard path: resolve I/O specs from Optimum's OnnxConfig + logger.debug( + "No registered export config for '%s'; resolving via Optimum", + _registry_key, + ) export_config = _resolve_export_config_from_specs( model_type=loader_config.model_type, task=loader_config.task, @@ -533,31 +548,6 @@ class name. Uses torchinfo to discover submodules and infer batch_size=WinMLExportConfig().batch_size, **(shape_config or {}), ) - except ValueError as e: - # ONNXConfigNotFoundError is a ValueError subclass (from export.io) - # — catch broadly to avoid top-level import of export.io which - # triggers heavy optimum/transformers imports. - from ..export.io import ONNXConfigNotFoundError - - if not isinstance(e, ONNXConfigNotFoundError): - raise - logger.info( - "Optimum has no OnnxConfig for '%s'; using empty export config", - _registry_key, - ) - export_config = WinMLExportConfig() - - # Merge registered export on top — registered always wins. - # Use WinMLExportConfig.merge() to properly handle nested - # InputTensorSpec/OutputTensorSpec lists (merge_config converts - # dataclass lists to dicts which breaks __post_init__). 
- _registered_export = registered.export if registered else None - if _registered_export is not None: - export_config = _merge_export_config(export_config, _registered_export) - logger.info( - "Merged registered export config for '%s'", - _registry_key, - ) # ========================================================================= # STEP 4: Assemble config + merge override @@ -569,30 +559,59 @@ class name. Uses torchinfo to discover submodules and infer model_id=model_id, model_type=hf_config.model_type, ) - # STEP 3.5: Resolve quant + compile based on device/precision - # Only override assembled defaults when user explicitly targets a device/precision. - # When both are "auto", preserve _assemble_config() defaults (registry values). - if device != "auto" or precision != "auto" or ep is not None: - resolved_quant, resolved_compile = resolve_quant_compile_config( - device=device, - precision=precision, - ep=ep, - task=parent_config.loader.task, - ) - if resolved_quant is not None: - # Merge into assembled config to preserve task/model_name + if override: + parent_config = merge_config(parent_config, override) + + # ========================================================================= + # STEP 4.5: Apply device/precision policy (affects quant + compile only) + # ========================================================================= + from ..sysinfo import resolve_device + from .precision import resolve_precision + + # ALWAYS detect hardware — even when device="auto" — so we don't + # blindly default to QNN on machines without an NPU (#412). 
+ resolved_device, available_devices = resolve_device(device=device) + logger.info( + "Device resolved: %s (available: %s)", + resolved_device, + ", ".join(available_devices), + ) + + policy = resolve_precision( + device=resolved_device, + precision=precision, + ep=ep, + available_devices=available_devices, + task=parent_config.loader.task, + ) + + # Apply policy: set compile provider from detected hardware + if policy.device != "auto": + # Quant config (weight_type and activation_type are always both-None or both-set) + if policy.weight_type is not None: if parent_config.quant is None: - parent_config.quant = resolved_quant - else: - parent_config.quant.weight_type = resolved_quant.weight_type - parent_config.quant.activation_type = resolved_quant.activation_type + parent_config.quant = WinMLQuantizationConfig() + parent_config.quant.weight_type = policy.weight_type + parent_config.quant.activation_type = policy.activation_type else: parent_config.quant = None - parent_config.compile = resolved_compile - # User override has highest priority — applied last - if override: - parent_config = merge_config(parent_config, override) + # Compile config + parent_config.compile = WinMLCompileConfig.for_provider( + policy.compile_provider, + ) + else: + # Even in auto/auto mode, set compile provider from detected hardware + # instead of preserving the hardcoded EPConfig default (#412). + from .precision import get_provider_for_device + + hw_provider = get_provider_for_device(resolved_device) + if hw_provider is not None: + parent_config.compile = WinMLCompileConfig.for_provider( + hw_provider, + ) + # When hw_provider is None (CPU-only), keep the default compile config + # so the pipeline still has a valid compile section. # ========================================================================= # STEP 5: Specialize for submodules if requested @@ -873,8 +892,8 @@ def _assemble_config( Args: loader_config: Resolved WinMLLoaderConfig (from resolve_loader_config). 
- export_config: Resolved WinMLExportConfig (Optimum baseline - merged with registered export config). + export_config: Resolved WinMLExportConfig + (from registry or _resolve_export_config_from_specs). registered: Registered config from MODEL_BUILD_CONFIGS (or None). model_id: HuggingFace model ID (for quant model_name), or None. model_type: Parent HF model type (for quant fallback name). diff --git a/src/winml/modelkit/config/precision.py b/src/winml/modelkit/config/precision.py index 69706c5b4..8722f31ef 100644 --- a/src/winml/modelkit/config/precision.py +++ b/src/winml/modelkit/config/precision.py @@ -18,10 +18,12 @@ logger = logging.getLogger(__name__) # Tasks where GPU auto-precision may differ (LLM = w4a16 recommendation) -_LLM_TASKS = frozenset({ - "text-generation", - "text2text-generation", -}) +_LLM_TASKS = frozenset( + { + "text-generation", + "text2text-generation", + } +) # Default auto-precision mapping: device -> precision _AUTO_PRECISION: dict[str, str] = { @@ -66,6 +68,19 @@ "cpu": None, } + +def get_provider_for_device(device: str) -> str | None: + """Get the default compile provider for a resolved device. + + Args: + device: Resolved device name ("npu", "gpu", "cpu"). + + Returns: + Provider name (e.g., "qnn", "dml") or None for CPU. + """ + return _DEVICE_TO_PROVIDER.get(device) + + # EP -> device inference (when --ep is given without --device) _EP_TO_DEVICE: dict[str, str] = { "qnn": "npu", @@ -234,9 +249,7 @@ def resolve_precision( if ep is not None: ep = ep.lower() if ep not in VALID_EPS: - raise ValueError( - f"Unknown EP '{ep}'. Expected one of: {sorted(VALID_EPS)}" - ) + raise ValueError(f"Unknown EP '{ep}'. 
Expected one of: {sorted(VALID_EPS)}") # Infer device from EP when device is "auto" if device == "auto": device = _EP_TO_DEVICE[ep] @@ -263,7 +276,8 @@ def resolve_precision( # Device is "auto" but precision is explicit — pick best device # FIXME: improve device-precision compatibility lookup table later resolved_device = _pick_device_for_precision( - resolved_precision, available_devices or ["cpu"], + resolved_precision, + available_devices or ["cpu"], ) # Resolve "auto" precision for the resolved device diff --git a/src/winml/modelkit/core/__init__.py b/src/winml/modelkit/core/__init__.py index 9bc5c4dfd..8f4cc9d47 100644 --- a/src/winml/modelkit/core/__init__.py +++ b/src/winml/modelkit/core/__init__.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- """Core utilities for ModelKit.""" -# New API - pure torch, no external dependencies +from .model_input_generator import generate_dummy_inputs_from_specs from .node_metadata import ( NodeMetadata, add_metadata_to_node, @@ -15,20 +15,6 @@ query_nodes_by_origin, set_origin_for_graph, ) -from .onnx_utils import ( - get_epcontext_info, - get_io_config, -) - - -def __getattr__(name: str): - """Lazy-load generate_dummy_inputs_from_specs to avoid importing torch at startup.""" - if name == "generate_dummy_inputs_from_specs": - from .model_input_generator import generate_dummy_inputs_from_specs - - globals()["generate_dummy_inputs_from_specs"] = generate_dummy_inputs_from_specs - return generate_dummy_inputs_from_specs - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") __all__ = [ @@ -44,3 +30,26 @@ def __getattr__(name: str): "query_nodes_by_origin", "set_origin_for_graph", ] + + +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + "get_epcontext_info": (".onnx_utils", "get_epcontext_info"), + "get_io_config": (".onnx_utils", "get_io_config"), +} + + +def __getattr__(name: str): + """Lazy-load onnx_utils (imports torch at module level).""" + if name in 
_LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + import importlib + + mod = importlib.import_module(module_path, __name__) + val = getattr(mod, attr_name) + globals()[name] = val + return val + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return list(set(list(globals()) + __all__)) diff --git a/src/winml/modelkit/core/model_input_generator.py b/src/winml/modelkit/core/model_input_generator.py index 63191958a..5c6f3fb76 100644 --- a/src/winml/modelkit/core/model_input_generator.py +++ b/src/winml/modelkit/core/model_input_generator.py @@ -3,10 +3,11 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- #!/usr/bin/env python3 -"""Manual Model Input Generator - Pure PyTorch. +"""Manual Model Input Generator - Pure NumPy. -This module provides manual input tensor generation from specifications. -No external dependencies on Optimum or transformers. +This module provides input array generation from specifications using only +NumPy (no torch dependency). Outputs are numpy arrays compatible with +ONNX Runtime session.run(). For Optimum-based automatic input generation, use modelkit.export.io: - resolve_io_specs(model_type, task, hf_config) @@ -21,7 +22,7 @@ ... } >>> inputs = generate_dummy_inputs_from_specs(specs) >>> inputs["input_ids"].shape - torch.Size([1, 128]) + (1, 128) """ from __future__ import annotations @@ -29,7 +30,7 @@ import logging from typing import Any -import torch +import numpy as np logger = logging.getLogger(__name__) @@ -37,11 +38,11 @@ def generate_dummy_inputs_from_specs( input_specs: dict[str, dict[str, Any]], -) -> dict[str, torch.Tensor]: +) -> dict[str, np.ndarray]: """Generate dummy inputs from manual specifications. - This function creates PyTorch tensors based on explicit specifications, - without requiring model loading or Optimum/transformers dependencies. 
+ This function creates NumPy arrays based on explicit specifications, + without requiring model loading or Optimum/transformers/torch dependencies. Args: input_specs: Input specifications with format: @@ -54,7 +55,7 @@ def generate_dummy_inputs_from_specs( } Returns: - Dictionary mapping input names to generated tensors + Dictionary mapping input names to generated numpy arrays Raises: ValueError: If required fields are missing or invalid @@ -70,7 +71,7 @@ def generate_dummy_inputs_from_specs( ... } >>> inputs = generate_dummy_inputs_from_specs(specs) >>> inputs["pixel_values"].shape - torch.Size([1, 3, 224, 224]) + (1, 3, 224, 224) """ inputs = {} @@ -84,9 +85,9 @@ def generate_dummy_inputs_from_specs( # Parse dtype dtype_str = spec["dtype"].lower() if dtype_str in ["int", "long", "int64"]: - dtype = torch.long + dtype = np.int64 elif dtype_str in ["float", "float32"]: - dtype = torch.float32 + dtype = np.float32 else: raise ValueError( f"Unsupported dtype '{spec['dtype']}' for '{name}'. 
Use 'int' or 'float'" @@ -103,16 +104,16 @@ def generate_dummy_inputs_from_specs( raise ValueError(f"Range must have exactly 2 values [min, max] for '{name}'") min_val, max_val = spec["range"] - if dtype == torch.long: - inputs[name] = torch.randint(min_val, max_val + 1, shape, dtype=dtype) + if dtype == np.int64: + inputs[name] = np.random.randint(min_val, max_val + 1, size=shape).astype(dtype) else: - inputs[name] = torch.rand(shape, dtype=dtype) * (max_val - min_val) + min_val + inputs[name] = np.random.rand(*shape).astype(dtype) * (max_val - min_val) + min_val else: # Default ranges - if dtype == torch.long: - inputs[name] = torch.randint(0, 2, shape, dtype=dtype) # Default: 0 or 1 + if dtype == np.int64: + inputs[name] = np.random.randint(0, 2, size=shape).astype(dtype) else: - inputs[name] = torch.rand(shape, dtype=dtype) # Default: [0, 1) + inputs[name] = np.random.rand(*shape).astype(dtype) logger.info( "Generated '%s': shape=%s, dtype=%s", name, list(inputs[name].shape), inputs[name].dtype diff --git a/src/winml/modelkit/core/onnx_utils.py b/src/winml/modelkit/core/onnx_utils.py index 04b479303..0aadbce2c 100644 --- a/src/winml/modelkit/core/onnx_utils.py +++ b/src/winml/modelkit/core/onnx_utils.py @@ -18,8 +18,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -import torch - if TYPE_CHECKING: import onnx @@ -356,7 +354,7 @@ def infer_output_names(outputs: Any) -> list[str] | None: for field_name in outputs.__dataclass_fields__: field_value = getattr(outputs, field_name, None) - if field_value is not None and isinstance(field_value, torch.Tensor): + if field_value is not None and type(field_value).__module__.startswith("torch"): output_names.append(field_name) # Only return names if we found simple tensor outputs diff --git a/src/winml/modelkit/export/__init__.py b/src/winml/modelkit/export/__init__.py index 2e4b2bd63..017f28911 100644 --- a/src/winml/modelkit/export/__init__.py +++ b/src/winml/modelkit/export/__init__.py @@ -19,32 +19,6 
@@ ) -def __getattr__(name: str): - """Lazy-load heavy submodules to avoid importing optimum at startup.""" - _io_names = { - "MaxLengthTextInputGenerator", - "ONNXConfigNotFoundError", - "generate_dummy_inputs", - "register_onnx_overwrite", - "resolve_io_specs", - } - if name in _io_names: - from . import io - - resolved = getattr(io, name) - globals()[name] = resolved - return resolved - - _pytorch_names = {"export_pytorch", "export_onnx"} - if name in _pytorch_names: - from .pytorch import export_pytorch - - globals().update(export_pytorch=export_pytorch, export_onnx=export_pytorch) - return globals()[name] - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - - __version__ = "2.1.0" __all__ = [ @@ -60,3 +34,31 @@ def __getattr__(name: str): "resolve_export_config", "resolve_io_specs", ] + + +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + "MaxLengthTextInputGenerator": (".io", "MaxLengthTextInputGenerator"), + "ONNXConfigNotFoundError": (".io", "ONNXConfigNotFoundError"), + "generate_dummy_inputs": (".io", "generate_dummy_inputs"), + "register_onnx_overwrite": (".io", "register_onnx_overwrite"), + "resolve_io_specs": (".io", "resolve_io_specs"), + "export_pytorch": (".pytorch", "export_pytorch"), + "export_onnx": (".pytorch", "export_pytorch"), # alias for export_pytorch +} + + +def __getattr__(name: str): + """Lazy-load heavy exports to avoid importing optimum at package init.""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + import importlib + + mod = importlib.import_module(module_path, __name__) + val = getattr(mod, attr_name) + globals()[name] = val + return val + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return list(set(list(globals()) + __all__)) diff --git a/src/winml/modelkit/export/io.py b/src/winml/modelkit/export/io.py index 64c7952be..99d9ac58f 100644 --- a/src/winml/modelkit/export/io.py +++ b/src/winml/modelkit/export/io.py @@ 
-62,6 +62,22 @@ class ONNXConfigNotFoundError(ValueError): register_onnx_overwrite = TasksManager.create_register("onnx", overwrite_existing=True) +def ensure_hf_models_registered() -> None: + """Trigger HF model ONNX config registrations (idempotent). + + With lazy loading in ``modelkit/__init__.py``, the HF model files + (bert.py, clip.py, etc.) and their ``@register_onnx_overwrite`` + decorators are not executed until explicitly imported. This function + forces that import so registrations are in place before any + ``TasksManager.get_exporter_config_constructor()`` call. + """ + if getattr(ensure_hf_models_registered, "_done", False): + return + from ..models import hf as _hf # noqa: F401 + + ensure_hf_models_registered._done = True # type: ignore[attr-defined] + + # ============================================================================= # Task Synonym Extensions (extends Optimum's TasksManager.map_from_synonym) # ============================================================================= @@ -190,6 +206,8 @@ def _get_onnx_config( Raises: ValueError: If no OnnxConfig is registered for the model_type/task combination """ + ensure_hf_models_registered() + normalized_task = _map_task_synonym(task) logger.debug( diff --git a/src/winml/modelkit/inspect/__init__.py b/src/winml/modelkit/inspect/__init__.py index bb481f951..0b9e5cd4d 100644 --- a/src/winml/modelkit/inspect/__init__.py +++ b/src/winml/modelkit/inspect/__init__.py @@ -23,9 +23,11 @@ from transformers import AutoConfig from .resolver import ( + build_tensor_infos_from_io_specs, compile_support_status, detect_task, get_build_config, + get_known_tasks, resolve_cache, resolve_exporter, resolve_io_config, @@ -57,17 +59,14 @@ class InspectError(Exception): """Base exception for inspect command.""" - class ModelNotFoundError(InspectError): """Model not found on HuggingFace Hub.""" - class NetworkError(InspectError): """Network error while fetching model config.""" - def inspect_model( model_id: str, 
include_hierarchy: bool = False, @@ -130,8 +129,8 @@ def inspect_model( loader_info.hf_model_class_source, ) - # Step 4: Resolve exporter configuration (pass HF config for tensor specs) - exporter_info = resolve_exporter(model_type, task, hf_config=hf_config) + # Step 4: Resolve exporter configuration (pass model_id for correct image sizes) + exporter_info = resolve_exporter(model_type, task, hf_config=hf_config, model_id=model_id) logger.debug( "Exporter: %s (source: %s)", exporter_info.onnx_config_class, @@ -151,9 +150,7 @@ def inspect_model( logger.debug("Hierarchy: %d HF modules", hierarchy_info.hf_module_count) # Step 6: Compile overall support status - overall_support, support_notes = compile_support_status( - loader_info, exporter_info, winml_info - ) + overall_support, support_notes = compile_support_status(loader_info, exporter_info, winml_info) logger.info("Overall support: %s", overall_support.value) # Step 7: Get full build config (for verbose output) @@ -164,15 +161,20 @@ def inspect_model( logger.debug("Cache: %d/%d stages cached", cache_info.total_cached, len(cache_info.stages)) # Step 9: Resolve processor classes - processor_info = resolve_processor(model_id) + processor_info = resolve_processor(model_id, model_type=model_type) logger.debug( "Processor: %s, Tokenizer: %s", processor_info.processor_class, processor_info.tokenizer_class, ) - # Step 10: Extract IO config from HF config - io_config_info = resolve_io_config(hf_config) + # Step 10: Extract IO config (dynamically discovers attrs from OnnxConfig) + io_config_info = resolve_io_config( + hf_config, + model_id=model_id, + model_type=model_type, + task=task, + ) logger.debug( "IO Config: max_pos=%s, vocab=%s, img_size=%s", io_config_info.max_position_embeddings, @@ -215,5 +217,12 @@ def inspect_model( "SupportLevel", "TensorInfo", "WinMLInfo", + "build_tensor_infos_from_io_specs", + "compile_support_status", + "get_known_tasks", "inspect_model", + "resolve_cache", + "resolve_io_config", + 
"resolve_processor", + "resolve_winml", ] diff --git a/src/winml/modelkit/inspect/formatter.py b/src/winml/modelkit/inspect/formatter.py index e5bf0e4d2..e5b4855eb 100644 --- a/src/winml/modelkit/inspect/formatter.py +++ b/src/winml/modelkit/inspect/formatter.py @@ -63,14 +63,21 @@ def _output_processor_table(console: Console, result: InspectResult) -> None: processor_table.add_column("Field", style="cyan") processor_table.add_column("Value") + def _src_tag(source: str | None) -> str: + return f" [dim](via {source})[/dim]" if source else "" + if processor.processor_class: - processor_table.add_row("Processor", processor.processor_class) + src = _src_tag(processor.processor_source) + processor_table.add_row("Processor", f"{processor.processor_class}{src}") if processor.tokenizer_class: - processor_table.add_row("Tokenizer", processor.tokenizer_class) + src = _src_tag(processor.tokenizer_source) + processor_table.add_row("Tokenizer", f"{processor.tokenizer_class}{src}") if processor.image_processor_class: - processor_table.add_row("Image Processor", processor.image_processor_class) + src = _src_tag(processor.image_processor_source) + processor_table.add_row("Image Processor", f"{processor.image_processor_class}{src}") if processor.feature_extractor_class: - processor_table.add_row("Feature Extractor", processor.feature_extractor_class) + src = _src_tag(processor.feature_extractor_source) + processor_table.add_row("Feature Extractor", f"{processor.feature_extractor_class}{src}") # Only show panel if we have at least one processor class if any( @@ -134,6 +141,17 @@ def _output_io_config_table(console: Console, result: InspectResult) -> None: if io_config.hidden_size is not None: io_table.add_row("Hidden Size", str(io_config.hidden_size)) has_content = True + if io_config.hidden_sizes is not None: + sizes_str = " → ".join(str(s) for s in io_config.hidden_sizes) + io_table.add_row("Hidden Sizes", sizes_str) + has_content = True + + # Extra attrs discovered dynamically 
from OnnxConfig + if io_config.extra: + for key, val in sorted(io_config.extra.items()): + label = key.replace("_", " ").title() + io_table.add_row(label, str(val)) + has_content = True # Only show panel if we have content if has_content: @@ -244,7 +262,10 @@ def output_table(console: Console, result: InspectResult, verbose: bool = False) else: shape_str = "-" dtype_str = tensor.dtype or "-" - exporter_table.add_row(f" {tensor.name}", f"{dtype_str} {shape_str}") + extra = "" + if tensor.value_range is not None: + extra = f" [dim]range {tensor.value_range}[/dim]" + exporter_table.add_row(f" {tensor.name}", f"{dtype_str} {shape_str}{extra}") # Output tensors if result.exporter.output_tensors: @@ -395,6 +416,7 @@ def output_json(result: InspectResult, verbose: bool = False) -> str: "shape": list(t.shape) if t.shape else None, "shape_desc": t.shape_desc, "dynamic_axes": t.dynamic_axes, + "value_range": list(t.value_range) if t.value_range else None, } for t in result.exporter.input_tensors ], @@ -447,6 +469,10 @@ def output_json(result: InspectResult, verbose: bool = False) -> str: "tokenizer_class": result.processor.tokenizer_class, "image_processor_class": result.processor.image_processor_class, "feature_extractor_class": result.processor.feature_extractor_class, + "processor_source": result.processor.processor_source, + "tokenizer_source": result.processor.tokenizer_source, + "image_processor_source": result.processor.image_processor_source, + "feature_extractor_source": result.processor.feature_extractor_source, } else: data["processor"] = None @@ -466,6 +492,8 @@ def output_json(result: InspectResult, verbose: bool = False) -> str: "num_channels": io_config.num_channels, "sampling_rate": io_config.sampling_rate, "hidden_size": io_config.hidden_size, + "hidden_sizes": io_config.hidden_sizes, + "extra": io_config.extra, } else: data["io_config"] = None diff --git a/src/winml/modelkit/inspect/resolver.py b/src/winml/modelkit/inspect/resolver.py index 
4002aab1f..27550ffe6 100644 --- a/src/winml/modelkit/inspect/resolver.py +++ b/src/winml/modelkit/inspect/resolver.py @@ -56,7 +56,7 @@ } -def _get_known_tasks() -> set[str]: +def get_known_tasks() -> set[str]: """Collect all known task strings from internal mappings and TasksManager. Returns: @@ -94,12 +94,10 @@ def validate_task(task: str) -> None: Raises: ValueError: If the task is not recognized. """ - known = _get_known_tasks() + known = get_known_tasks() if task not in known: sorted_tasks = sorted(known) - raise ValueError( - f"Unknown task '{task}'. Known tasks: {', '.join(sorted_tasks)}" - ) + raise ValueError(f"Unknown task '{task}'. Known tasks: {', '.join(sorted_tasks)}") def detect_task(config: PretrainedConfig) -> tuple[str, str]: @@ -176,18 +174,51 @@ def resolve_loader(model_type: str, task: str) -> LoaderInfo: ) -def _extract_tensor_specs_from_onnx_config( - onnx_config_cls, - hf_config: PretrainedConfig, +def _shape_to_desc(shape: tuple | list | None, dynamic_axes: dict[int, str]) -> str: + """Convert tensor shape to human-readable string with dynamic markers. + + Dynamic axes are shown as the concrete value from dummy inputs, + distinguishable from static dims by context (batch → "B"). + For non-batch dynamic dims (sequence, height, width), shows the + concrete value since that's what the model actually uses for export. + + Fixes D-3 from #247: uses axis names directly, no hardcoded abbreviations. 
+ """ + if shape is None: + parts = [] + for _idx, axis_name in sorted(dynamic_axes.items()): + if axis_name.lower() in ("batch", "batch_size"): + parts.append("B") + else: + parts.append(axis_name) + return f"[{', '.join(parts)}]" if parts else "[]" + + parts = [] + for i, dim in enumerate(shape): + if i in dynamic_axes: + axis_name = dynamic_axes[i] + if axis_name.lower() in ("batch", "batch_size"): + parts.append("B") + else: + # Show concrete value — this is the export shape from + # preprocessor_config or shape_config, not a placeholder + parts.append(str(dim)) + else: + parts.append(str(dim)) + return f"[{', '.join(parts)}]" + + +def build_tensor_infos_from_io_specs( + io_specs: dict, ) -> tuple[list[TensorInfo], list[TensorInfo]]: - """Extract tensor specifications from an ONNX config class. + """Convert resolve_io_specs() output to TensorInfo lists. - Uses the ONNX config's generate_dummy_inputs() to get actual tensor shapes, - and the inputs/outputs properties for dynamic axes information. + Single conversion point from config's I/O spec format to inspect's + TensorInfo dataclass. Eliminates the duplicated extraction logic + that previously lived in _extract_tensor_specs_from_onnx_config. 
Args: - onnx_config_cls: ONNX config constructor (may be functools.partial) - hf_config: HuggingFace PretrainedConfig for shape bounds + io_specs: Dict returned by export/io.py resolve_io_specs() Returns: Tuple of (input_tensors, output_tensors) @@ -195,88 +226,44 @@ def _extract_tensor_specs_from_onnx_config( input_tensors: list[TensorInfo] = [] output_tensors: list[TensorInfo] = [] - try: - # Instantiate ONNX config with HF config - onnx_config = onnx_config_cls(hf_config) - - # Generate dummy inputs to get actual shapes - dummy_inputs: dict = {} - try: - dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt") - except Exception as e: - logger.debug("Failed to generate dummy inputs: %s", e) - - # Helper to convert shape to description with dynamic axis markers - def shape_to_desc( - shape: tuple | list | None, dynamic_axes: dict[int, str] - ) -> str: - """Convert tensor shape to human-readable string with dynamic markers.""" - if shape is None: - # Fallback: use dynamic axes only - parts = [] - for _idx, axis_name in sorted(dynamic_axes.items()): - if "batch" in axis_name.lower(): - parts.append("B") - else: - parts.append(axis_name) - return f"[{', '.join(parts)}]" if parts else "[]" - - parts = [] - for i, dim in enumerate(shape): - if i in dynamic_axes: - axis_name = dynamic_axes[i].lower() - if "batch" in axis_name: - parts.append("B") - elif "sequence" in axis_name: - parts.append("S") - elif "height" in axis_name or "width" in axis_name: - parts.append(str(dim)) # Use actual size - else: - parts.append(str(dim)) - else: - parts.append(str(dim)) - return f"[{', '.join(parts)}]" - - # Standard input dtypes based on tensor name patterns - def infer_dtype(name: str) -> str: - name_lower = name.lower() - if "ids" in name_lower or "label" in name_lower: - return "int64" - if "mask" in name_lower and "pixel" not in name_lower: - return "int64" - return "float32" - - # Process inputs - use actual shapes from dummy inputs - if hasattr(onnx_config, 
"inputs"): - for name, axes in onnx_config.inputs.items(): - shape = None - if name in dummy_inputs: - shape = tuple(dummy_inputs[name].shape) - shape_desc = shape_to_desc(shape, axes) - dtype = infer_dtype(name) - input_tensors.append( - TensorInfo( - name=name, - dtype=dtype, - shape_desc=shape_desc, - dynamic_axes=dict(axes), - ) - ) - - # Process outputs - we don't have actual shapes, use dynamic axes - if hasattr(onnx_config, "outputs"): - for name, axes in onnx_config.outputs.items(): - shape_desc = shape_to_desc(None, axes) - output_tensors.append( - TensorInfo( - name=name, - shape_desc=shape_desc, - dynamic_axes=dict(axes), - ) - ) + input_names = io_specs.get("input_names", []) + input_shapes = io_specs.get("input_shapes", []) + input_dtypes = io_specs.get("input_dtypes", []) + inputs_axes = io_specs.get("inputs", {}) + value_ranges = io_specs.get("value_ranges", {}) + + for i, name in enumerate(input_names): + shape = input_shapes[i] if i < len(input_shapes) else None + dtype = input_dtypes[i] if i < len(input_dtypes) else None + axes = inputs_axes.get(name, {}) + vr = value_ranges.get(name) + + shape_desc = _shape_to_desc(shape, axes) if shape else None + + input_tensors.append( + TensorInfo( + name=name, + dtype=dtype, + shape=shape, + shape_desc=shape_desc, + dynamic_axes=dict(axes) if axes else None, + value_range=vr, + ) + ) - except Exception as e: - logger.debug("Failed to extract tensor specs from ONNX config: %s", e) + output_names = io_specs.get("output_names", []) + outputs_axes = io_specs.get("outputs", {}) + + for name in output_names: + axes = outputs_axes.get(name, {}) + shape_desc = _shape_to_desc(None, axes) if axes else None + output_tensors.append( + TensorInfo( + name=name, + shape_desc=shape_desc, + dynamic_axes=dict(axes) if axes else None, + ) + ) return input_tensors, output_tensors @@ -285,15 +272,22 @@ def resolve_exporter( model_type: str, task: str, hf_config: PretrainedConfig | None = None, + *, + model_id: str | None = None, 
) -> ExporterInfo: """Resolve exporter configuration for a model. - Uses MODEL_BUILD_CONFIGS registry from models/__init__.py. + Uses MODEL_BUILD_CONFIGS registry, then falls back to + export/io.py resolve_io_specs() for I/O extraction. This ensures + inspect and config share the same battle-tested I/O extraction path, + including correct image sizes from preprocessor_config.json. Args: model_type: HuggingFace model type (e.g., "clip") task: Canonical task name hf_config: Optional HuggingFace config for extracting tensor shapes + model_id: Optional HuggingFace model ID for preprocessor_config.json + (needed for correct image sizes on models like ResNet) Returns: ExporterInfo with ONNX config, tensors, and support level @@ -321,8 +315,7 @@ def resolve_exporter( output_tensors: list[TensorInfo] = [] if export_config.output_tensors: output_tensors.extend( - TensorInfo(name=spec.name or "unknown") - for spec in export_config.output_tensors + TensorInfo(name=spec.name or "unknown") for spec in export_config.output_tensors ) return ExporterInfo( @@ -357,14 +350,23 @@ def resolve_exporter( else: config_name = onnx_config_cls.__name__ - # Extract tensor specs from ONNX config if HF config is available + # Extract tensor specs via resolve_io_specs (shared with config command) input_tensors: list[TensorInfo] = [] output_tensors: list[TensorInfo] = [] if hf_config is not None: - input_tensors, output_tensors = _extract_tensor_specs_from_onnx_config( - onnx_config_cls, hf_config - ) + try: + from ..export.io import resolve_io_specs + + io_specs = resolve_io_specs( + model_type=model_type, + task=task, + hf_config=hf_config, + model_id=model_id, + ) + input_tensors, output_tensors = build_tensor_infos_from_io_specs(io_specs) + except Exception as e: + logger.debug("resolve_io_specs failed for %s/%s: %s", model_type, task, e) return ExporterInfo( onnx_config_class=config_name, @@ -535,9 +537,7 @@ def resolve_cache(model_id: str) -> CacheInfo: filename = ms.get("filename") 
artifact = model_dir / filename if filename else None size_bytes = ( - artifact.stat().st_size - if artifact and artifact.exists() - else 0 + artifact.stat().st_size if artifact and artifact.exists() else 0 ) stage_info = CacheStageInfo( stage=stage, @@ -575,7 +575,7 @@ def resolve_cache(model_id: str) -> CacheInfo: stem = f.stem last_sep = stem.rfind("_") if last_sep > 0: - stage_name = stem[last_sep + 1:] + stage_name = stem[last_sep + 1 :] cached_files[stage_name] = f for stage in pipeline_stages: @@ -606,65 +606,200 @@ def resolve_cache(model_id: str) -> CacheInfo: ) -def resolve_io_config(config: PretrainedConfig) -> IOConfigInfo: +def _find_nested_configs(config: PretrainedConfig) -> list: + """Discover all nested PretrainedConfig objects dynamically. + + Walks config attributes to find nested configs without hardcoding + names like "text_config", "vision_config", etc. Fixes D-2 and D-5 + from #247. + + Args: + config: HuggingFace PretrainedConfig object + + Returns: + List of nested PretrainedConfig instances + """ + from transformers import PretrainedConfig + + nested = [] + for attr_name in vars(config): + if attr_name.startswith("_"): + continue + try: + val = getattr(config, attr_name) + if isinstance(val, PretrainedConfig): + nested.append(val) + except Exception: + continue + return nested + + +def _discover_io_attrs_from_onnx_config( + model_type: str, + task: str, + hf_config: PretrainedConfig, +) -> set[str]: + """Discover IO-relevant config attributes from OnnxConfig. + + Instead of hardcoding which config attributes to show, we read the + uppercase class attrs on NormalizedConfig subclasses. These define + the canonical attribute mapping for each model type, e.g.: + + NormalizedTextConfig.VOCAB_SIZE = "vocab_size" + NormalizedVisionConfig.IMAGE_SIZE = "image_size" + + We also scan DUMMY_INPUT_GENERATOR_CLASSES for additional attrs + referenced via normalized_config.xxx in generator __init__ code. 
+ + Returns: + Set of config attribute names relevant to I/O for this model. + """ + import inspect + import re + + attrs: set[str] = set() + try: + from ..export.io import _get_onnx_config + + onnx_config = _get_onnx_config(model_type, task, hf_config) + + # Primary: enumerate uppercase attrs on NormalizedConfig class. + # These ARE the canonical IO attribute mapping (e.g., VOCAB_SIZE="vocab_size"). + nc = getattr(onnx_config, "_normalized_config", None) + if nc is not None: + for attr_name in dir(type(nc)): + if attr_name.isupper() and not attr_name.startswith("_"): + # The value is the actual config attr name (e.g., "vocab_size") + val = getattr(type(nc), attr_name) + if isinstance(val, str): + # Handle dotted paths like "text_config.hidden_size" + leaf = val.split(".")[-1] + # Skip structural pointers (nested config references) + if not leaf.endswith("_config"): + attrs.add(leaf) + + # Secondary: scan generator __init__ for additional normalized_config refs + for gen_cls in getattr(onnx_config, "DUMMY_INPUT_GENERATOR_CLASSES", []): + try: + src = inspect.getsource(gen_cls.__init__) + except (TypeError, OSError): + continue + refs = re.findall(r"normalized_config\.(\w+)", src) + attrs.update(r for r in refs if r != "has_attribute") + except Exception as e: + logger.debug("Failed to discover IO attrs from OnnxConfig: %s", e) + + return attrs + + +def resolve_io_config( + config: PretrainedConfig, + *, + model_id: str | None = None, + model_type: str | None = None, + task: str | None = None, +) -> IOConfigInfo: """Extract IO configuration from HuggingFace config. - Extracts IO-related configuration values from a PretrainedConfig object. - For multimodal models (like CLIP), also checks nested configs (text_config, - vision_config) to gather all relevant settings. + Dynamically discovers which config attributes matter for I/O by + inspecting OnnxConfig's NormalizedConfig and input generators. 
+ Falls back to a universal set of well-known attrs if OnnxConfig + lookup fails. No hardcoded model-specific attribute names. Args: config: HuggingFace PretrainedConfig object + model_id: Optional HF model ID for preprocessor_config.json fallback + model_type: HF model type for OnnxConfig lookup + task: Task name for OnnxConfig lookup Returns: IOConfigInfo with extracted configuration values """ - # Helper to get attribute from config or nested configs + # Dynamically discover nested configs (fixes D-2: no hardcoded names) + nested_configs = _find_nested_configs(config) + def get_config_attr( attr_name: str, - nested_configs: list[str] | None = None, - ) -> int | tuple[int, int] | None: - """Get attribute from main config or nested configs. - - Args: - attr_name: Attribute name to look for - nested_configs: List of nested config names to check (e.g., ["text_config"]) - - Returns: - Attribute value or None if not found - """ - # First check the main config + ) -> int | tuple[int, int] | list | None: + """Get attribute from main config or any nested config.""" value = getattr(config, attr_name, None) if value is not None: return value - # Check nested configs if provided - if nested_configs: - for nested_name in nested_configs: - nested_config = getattr(config, nested_name, None) - if nested_config is not None: - value = getattr(nested_config, attr_name, None) - if value is not None: - return value + for nested in nested_configs: + value = getattr(nested, attr_name, None) + if value is not None: + return value return None - # Text-related attributes - check main and text_config - max_position_embeddings = get_config_attr( - "max_position_embeddings", ["text_config"] - ) - vocab_size = get_config_attr("vocab_size", ["text_config"]) - - # Vision-related attributes - check main and vision_config - image_size = get_config_attr("image_size", ["vision_config"]) - patch_size = get_config_attr("patch_size", ["vision_config"]) - num_channels = get_config_attr("num_channels", 
["vision_config"]) + # Step 1: Discover which attrs the OnnxConfig actually uses + io_attrs: set[str] = set() + if model_type and task: + io_attrs = _discover_io_attrs_from_onnx_config( + model_type, + task, + config, + ) - # Audio-related attributes - check main and audio_config - sampling_rate = get_config_attr("sampling_rate", ["audio_config"]) + # Step 2: Always include universal well-known IO attrs that Optimum's + # NormalizedConfig classes reference. These are framework conventions, + # not model-specific — they appear in NormalizedTextConfig, + # NormalizedVisionConfig, NormalizedSeq2SeqConfig, etc. + universal_io_attrs = { + "max_position_embeddings", + "vocab_size", + "image_size", + "patch_size", + "num_channels", + "input_size", + "sampling_rate", + "hidden_size", + "hidden_sizes", + } + io_attrs.update(universal_io_attrs) + + # Step 3: Look up each discovered attr + max_position_embeddings = get_config_attr("max_position_embeddings") + vocab_size = get_config_attr("vocab_size") + image_size = get_config_attr("image_size") + patch_size = get_config_attr("patch_size") + num_channels = get_config_attr("num_channels") + sampling_rate = get_config_attr("sampling_rate") + hidden_size = get_config_attr("hidden_size") + hidden_sizes = get_config_attr("hidden_sizes") + + # Step 4: Collect any extra attrs discovered from OnnxConfig + # that aren't in our dataclass fields + known_fields = { + "max_position_embeddings", + "vocab_size", + "image_size", + "patch_size", + "num_channels", + "sampling_rate", + "hidden_size", + "hidden_sizes", + } + extra: dict[str, int | str | list | None] = {} + for attr in io_attrs - known_fields: + val = get_config_attr(attr) + if val is not None: + extra[attr] = val + + # Step 5: Fallback — read image_size from preprocessor_config.json + # for models like ResNet where HF config lacks image_size + if image_size is None and model_id is not None: + try: + from ..export.io import _populate_image_size_from_preprocessor - # General 
attributes - check main config only - hidden_size = get_config_attr("hidden_size", ["text_config", "vision_config"]) + shape_kwargs: dict = {} + _populate_image_size_from_preprocessor(model_id, shape_kwargs) + if "height" in shape_kwargs: + h, w = shape_kwargs["height"], shape_kwargs["width"] + image_size = h if h == w else (h, w) + except Exception as e: + logger.debug("Failed to get image_size from preprocessor: %s", e) return IOConfigInfo( max_position_embeddings=max_position_embeddings, @@ -674,22 +809,27 @@ def get_config_attr( num_channels=num_channels, sampling_rate=sampling_rate, hidden_size=hidden_size, + hidden_sizes=hidden_sizes, + extra=extra if extra else None, ) -def resolve_processor(model_id: str) -> ProcessorInfo: +def resolve_processor( + model_id: str, + model_type: str | None = None, +) -> ProcessorInfo: """Resolve data processing classes for a HuggingFace model. Detects the processor/tokenizer/image_processor/feature_extractor classes associated with a model. Uses a multi-strategy approach: - 1. First tries to fetch config files from HuggingFace Hub without downloading - the full model (fast, no dependencies) - 2. Uses Auto classes to fill in any missing information that wasn't found - in the config files + 0. Check HF's IMAGE_PROCESSOR_MAPPING_NAMES for model_type-specific mapping + 1. Fetch config files from HuggingFace Hub (fast, no model download) + 2. 
Use Auto classes to fill in any remaining gaps Args: model_id: HuggingFace model identifier (e.g., "openai/clip-vit-base-patch32") + model_type: HuggingFace model type (e.g., "resnet") for registry lookup Returns: ProcessorInfo with detected class names for each processor type @@ -698,13 +838,47 @@ def resolve_processor(model_id: str) -> ProcessorInfo: tokenizer_class: str | None = None image_processor_class: str | None = None feature_extractor_class: str | None = None + # Source tracking + processor_source: str | None = None + tokenizer_source: str | None = None + image_processor_source: str | None = None + feature_extractor_source: str | None = None + + # Strategy 0: Check HF registry for the canonical image processor class + # for this model_type. This is authoritative — HF maps model types to + # their processor classes (e.g., resnet → ConvNextImageProcessor). + if model_type is not None: + try: + from transformers.models.auto.image_processing_auto import ( + IMAGE_PROCESSOR_MAPPING_NAMES, + ) + + mapping = IMAGE_PROCESSOR_MAPPING_NAMES.get(model_type) + if mapping: + # mapping is (SlowProcessor, FastProcessor) or a string + image_processor_class = mapping[0] if isinstance(mapping, tuple) else mapping + image_processor_source = "hf_registry" + except Exception as e: + logger.debug("Registry lookup failed for %s: %s", model_type, e) # Strategy 1: Try to get class names from config files via HuggingFace Hub API # This is fast and doesn't require downloading/instantiating processors + # NOTE: These JSON keys (processor_class, image_processor_type, etc.) are + # standard HuggingFace config conventions, not model-specific hardcoding. 
try: - processor_class, tokenizer_class, image_processor_class, feature_extractor_class = ( - _resolve_processor_from_hub_configs(model_id) - ) + hub_proc, hub_tok, hub_img, hub_fe = _resolve_processor_from_hub_configs(model_id) + if hub_proc and processor_class is None: + processor_class = hub_proc + processor_source = "hub_config" + if hub_tok and tokenizer_class is None: + tokenizer_class = hub_tok + tokenizer_source = "hub_config" + if hub_img and image_processor_class is None: + image_processor_class = hub_img + image_processor_source = "hub_config" + if hub_fe and feature_extractor_class is None: + feature_extractor_class = hub_fe + feature_extractor_source = "hub_config" except Exception as e: logger.debug("Failed to resolve processors from hub configs: %s", e) @@ -719,14 +893,18 @@ def resolve_processor(model_id: str) -> ProcessorInfo: ) = _resolve_processor_from_auto_classes(model_id) # Fill in missing values from auto classes - if processor_class is None: + if processor_class is None and auto_processor: processor_class = auto_processor - if tokenizer_class is None: + processor_source = "auto_class" + if tokenizer_class is None and auto_tokenizer: tokenizer_class = auto_tokenizer - if image_processor_class is None: + tokenizer_source = "auto_class" + if image_processor_class is None and auto_image_processor: image_processor_class = auto_image_processor - if feature_extractor_class is None: + image_processor_source = "auto_class" + if feature_extractor_class is None and auto_feature_extractor: feature_extractor_class = auto_feature_extractor + feature_extractor_source = "auto_class" except Exception as e: logger.debug("Failed to resolve processors from auto classes: %s", e) @@ -735,6 +913,10 @@ def resolve_processor(model_id: str) -> ProcessorInfo: tokenizer_class=tokenizer_class, image_processor_class=image_processor_class, feature_extractor_class=feature_extractor_class, + processor_source=processor_source, + tokenizer_source=tokenizer_source, + 
image_processor_source=image_processor_source, + feature_extractor_source=feature_extractor_source, ) diff --git a/src/winml/modelkit/inspect/types.py b/src/winml/modelkit/inspect/types.py index 58b092fa1..60d1c156e 100644 --- a/src/winml/modelkit/inspect/types.py +++ b/src/winml/modelkit/inspect/types.py @@ -27,6 +27,7 @@ class TensorInfo: shape: tuple[int, ...] | None = None shape_desc: str | None = None # Human-readable shape like "[B, 3, 224, 224]" dynamic_axes: dict[int, str] | None = None # {0: "batch", 1: "sequence"} + value_range: tuple[float, float] | None = None # e.g., (0.0, 1.0) for pixel values @dataclass @@ -90,6 +91,11 @@ class ProcessorInfo: tokenizer_class: str | None = None # e.g., "CLIPTokenizerFast" image_processor_class: str | None = None # e.g., "CLIPImageProcessor" feature_extractor_class: str | None = None # e.g., "Wav2Vec2FeatureExtractor" + # Source tracking for transparency (e.g., ResNet -> ConvNextImageProcessorFast) + processor_source: str | None = None # "hub_config" | "auto_class" + image_processor_source: str | None = None + feature_extractor_source: str | None = None + tokenizer_source: str | None = None @dataclass @@ -110,6 +116,10 @@ class IOConfigInfo: # General hidden_size: int | None = None + hidden_sizes: list[int] | None = None # Per-stage hidden dims (e.g., ResNet) + + # Extra attrs discovered dynamically from OnnxConfig + extra: dict[str, Any] | None = None @dataclass diff --git a/src/winml/modelkit/loader/__init__.py b/src/winml/modelkit/loader/__init__.py index c65d5016b..efd6bc8c9 100644 --- a/src/winml/modelkit/loader/__init__.py +++ b/src/winml/modelkit/loader/__init__.py @@ -26,10 +26,6 @@ """ from .config import WinMLLoaderConfig, resolve_loader_config -from .hf import ( - load_hf_model, - resolve_hf_model_class, -) from .task import ( HF_TASK_DEFAULTS, get_supported_tasks, @@ -50,3 +46,26 @@ "resolve_loader_config", "resolve_task_and_model_class", ] + + +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + 
"load_hf_model": (".hf", "load_hf_model"), + "resolve_hf_model_class": (".hf", "resolve_hf_model_class"), +} + + +def __getattr__(name: str): + """Lazy-load heavy exports (hf.py imports transformers).""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + import importlib + + mod = importlib.import_module(module_path, __name__) + val = getattr(mod, attr_name) + globals()[name] = val + return val + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return list(set(list(globals()) + __all__)) diff --git a/src/winml/modelkit/loader/task.py b/src/winml/modelkit/loader/task.py index b876c060f..1e6a2c4af 100644 --- a/src/winml/modelkit/loader/task.py +++ b/src/winml/modelkit/loader/task.py @@ -267,9 +267,10 @@ def _detect_task_and_class_from_config(config: PretrainedConfig) -> tuple[str, t try: model_class = TasksManager.get_model_class_for_task(task) - # Warn if TasksManager returns different class than architecture + # Informational: TasksManager may return a generic AutoModel* class + # that differs from config.architectures — this is expected behavior. if model_class.__name__ != arch_name: - logger.warning( + logger.info( "TasksManager returned %s, but config.architectures specifies %s. 
" "Honoring TasksManager's choice.", model_class.__name__, diff --git a/src/winml/modelkit/models/__init__.py b/src/winml/modelkit/models/__init__.py index 246207f01..2fa39bda4 100644 --- a/src/winml/modelkit/models/__init__.py +++ b/src/winml/modelkit/models/__init__.py @@ -55,15 +55,28 @@ # Lazy loading for modules that cause circular imports # WinMLAutoModel imports from loader/, which imports from models/ +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + "WinMLAutoModel": (".auto", "WinMLAutoModel"), +} + + def __getattr__(name: str): """Lazy load modules that would cause circular imports.""" - if name == "WinMLAutoModel": - from .auto import WinMLAutoModel - - return WinMLAutoModel + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + import importlib + + mod = importlib.import_module(module_path, __name__) + val = getattr(mod, attr_name) + globals()[name] = val + return val raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +def __dir__() -> list[str]: + return list(set(list(globals()) + __all__)) + + __all__ = [ "HF_MODEL_CLASS_MAPPING", "MODEL_BUILD_CONFIGS", diff --git a/src/winml/modelkit/models/auto.py b/src/winml/modelkit/models/auto.py index 3195d0fee..ae21e1881 100644 --- a/src/winml/modelkit/models/auto.py +++ b/src/winml/modelkit/models/auto.py @@ -334,6 +334,8 @@ def from_pretrained( from ..build import build_hf_model + # Pass resolved EP so the static analyzer targets only this EP + resolved_ep = config.compile.ep_config.provider if config.compile is not None else None result = build_hf_model( config=config, output_dir=output_dir, @@ -342,7 +344,7 @@ def from_pretrained( rebuild=force_rebuild, trust_remote_code=trust_remote_code, cache_key=cache_key, - ep=kwargs.get("ep"), + ep=resolved_ep, device=device, ) onnx_path = result.final_onnx_path @@ -353,11 +355,13 @@ def from_pretrained( winml_class = get_winml_class(model_type, task) logger.info("Creating inference wrapper: %s", winml_class.__name__) - return 
winml_class( + model = winml_class( onnx_path=onnx_path, config=hf_config, # HF PretrainedConfig for pipeline compatibility device=device, # pass user's original device string; WinMLSession handles "auto" ) + model._build_config = config # resolved build config (task, quant, compile) + return model @classmethod def supported_tasks(cls) -> list[str]: diff --git a/src/winml/modelkit/models/hf/bert.py b/src/winml/modelkit/models/hf/bert.py index 4838d3e37..d537c8010 100644 --- a/src/winml/modelkit/models/hf/bert.py +++ b/src/winml/modelkit/models/hf/bert.py @@ -33,9 +33,6 @@ BERT_CONFIG = WinMLBuildConfig( optim=WinMLOptimizationConfig( - gelu_fusion=True, - layer_norm_fusion=True, - matmul_add_fusion=True, clamp_constant_values=True, ), ) diff --git a/src/winml/modelkit/models/winml/base.py b/src/winml/modelkit/models/winml/base.py index 4d4e892bd..a3ec7b053 100644 --- a/src/winml/modelkit/models/winml/base.py +++ b/src/winml/modelkit/models/winml/base.py @@ -76,6 +76,9 @@ def __init__( self.config = config self._device = device + # Set by WinMLAutoModel.from_pretrained() after construction + self._build_config: Any = None + # Create WinMLSession (delegates ORT operations) self._session = WinMLSession( onnx_path=self._onnx_path, @@ -184,24 +187,42 @@ def perf(self, warmup: int = 0) -> contextlib.AbstractContextManager: """Context manager for scoped performance tracking. Delegates to the underlying WinMLSession.perf(). Every inference - call within the context records timing in PerfStats. + call within the context records timing in ``ctx.stats``. Args: warmup: Number of initial samples to exclude from statistics. 
Example:: - with model.perf(warmup=5) as stats: + with model.perf(warmup=5) as ctx: for img in images: model(pixel_values=img) - print(f"P99: {stats.p99_ms:.2f} ms") + print(f"P99: {ctx.stats.p99_ms:.2f} ms") """ return self._session.perf(warmup=warmup) @property def device(self) -> str: - """Current device.""" - return self._device + """Current device (delegates to session, resolved after compile).""" + return self._session.device + + @property + def task(self) -> str | None: + """Resolved task from build config, or None if unavailable.""" + build_config = getattr(self, "_build_config", None) + if build_config is not None: + loader = getattr(build_config, "loader", None) + if loader: + return loader.task + return None + + @property + def precision(self) -> str | None: + """Resolved precision from build config, or None if unavailable. + + TODO: derive from _build_config.quant.weight_type when ready. + """ + return None @property def dtype(self) -> torch.dtype: diff --git a/src/winml/modelkit/onnx/__init__.py b/src/winml/modelkit/onnx/__init__.py index 120d7af3e..521f9c0f8 100644 --- a/src/winml/modelkit/onnx/__init__.py +++ b/src/winml/modelkit/onnx/__init__.py @@ -13,7 +13,6 @@ from __future__ import annotations -from .detection import is_compiled_onnx, is_quantized_onnx from .domains import ONNXDomain from .dtypes import SupportedONNXType, remove_optional_from_type_annotation from .external_data import copy_onnx_model @@ -46,3 +45,18 @@ "restore_metadata", "save_onnx", ] + + +def __getattr__(name: str): + """Lazy-load detection module to avoid circular import with compiler.""" + if name in ("is_compiled_onnx", "is_quantized_onnx"): + from .detection import is_compiled_onnx, is_quantized_onnx + + globals()["is_compiled_onnx"] = is_compiled_onnx + globals()["is_quantized_onnx"] = is_quantized_onnx + return globals()[name] + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return __all__ diff --git 
a/src/winml/modelkit/onnx/detection.py b/src/winml/modelkit/onnx/detection.py index 007ef9267..c82c2ce34 100644 --- a/src/winml/modelkit/onnx/detection.py +++ b/src/winml/modelkit/onnx/detection.py @@ -14,7 +14,6 @@ import logging from typing import TYPE_CHECKING -from ..compiler.utils import QDQ_OP_TYPES from .persistence import load_onnx @@ -43,6 +42,8 @@ def _load_model_lightweight(model_path: Path, operation: str) -> onnx.ModelProto def is_quantized_onnx(model_path: Path) -> bool: """Check if ONNX model is quantized (contains QuantizeLinear/DequantizeLinear nodes).""" model = _load_model_lightweight(model_path, "quantization check") + from ..compiler import QDQ_OP_TYPES + return any(n.op_type in QDQ_OP_TYPES for n in model.graph.node) diff --git a/src/winml/modelkit/optim/__init__.py b/src/winml/modelkit/optim/__init__.py index b47686580..dc1f3a983 100644 --- a/src/winml/modelkit/optim/__init__.py +++ b/src/winml/modelkit/optim/__init__.py @@ -28,13 +28,49 @@ from .config import WinMLOptimizationConfig from .errors import ConfigurationError, ModelValidationError, OptimizationError from .optimizer import Optimizer +from .registry import ( + BoolCapability, + ChoiceCapability, + IntCapability, + auto_enable_dependencies, + validate, + validate_dependencies, +) __all__ = [ + "BoolCapability", + "ChoiceCapability", "ConfigurationError", + "IntCapability", "ModelValidationError", "OptimizationError", "Optimizer", "WinMLOptimizationConfig", + "auto_enable_dependencies", "optimize_onnx", + "validate", + "validate_dependencies", ] + + +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + "get_all_capabilities": (".pipes", "get_all_capabilities"), +} + + +def __getattr__(name: str): + """Lazy-load pipe utilities that pull in heavy dependencies.""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + import importlib + + mod = importlib.import_module(module_path, __name__) + val = getattr(mod, attr_name) + globals()[name] = val + return val + raise 
AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return list(set(list(globals()) + __all__)) diff --git a/src/winml/modelkit/optim/api.py b/src/winml/modelkit/optim/api.py index 8993d76f7..f403d8c4c 100644 --- a/src/winml/modelkit/optim/api.py +++ b/src/winml/modelkit/optim/api.py @@ -160,6 +160,19 @@ def _convert_to_kwargs(config: dict[str, Any], all_caps: dict[str, Any]) -> dict return result +def _hack_inject_quant_preprocess_metadata(model: onnx.ModelProto) -> None: + """Inject metadata that signals pre-processing was done. + + Suppresses the ORT quantization warning: + 'Please consider to run pre-processing before quantization.' + """ + metadata = {"onnx.quant.pre_process": "onnxruntime.quant"} + if model.metadata_props: + for prop in model.metadata_props: + metadata[prop.key] = prop.value + onnx.helper.set_model_props(model, metadata) + + def optimize_onnx( model: str | Path | onnx.ModelProto, output: str | Path | None = None, @@ -259,6 +272,9 @@ def optimize_onnx( optimized_model = optimizer.optimize(loaded_model, **optimizer_kwargs) optimized_model = optimizer.optimize(optimized_model, **optimizer_kwargs) + # Step 9.5: Inject quant pre-processing metadata to suppress ORT warning + _hack_inject_quant_preprocess_metadata(optimized_model) + # Step 10: Save if output path provided if output is not None: output_path = Path(output) diff --git a/src/winml/modelkit/optim/registry.py b/src/winml/modelkit/optim/registry.py index a3fc3591a..e5117a528 100644 --- a/src/winml/modelkit/optim/registry.py +++ b/src/winml/modelkit/optim/registry.py @@ -204,7 +204,9 @@ def validate(config: dict[str, Any], capabilities: dict[str, CapabilityDef]) -> errors = [] for key, value in config.items(): - cap = capabilities.get(key) + # Accept both snake_case and kebab-case (normalize to kebab-case) + normalized_key = key.replace("_", "-") + cap = capabilities.get(normalized_key) or capabilities.get(key) if cap is None: 
errors.append(f"Unknown capability '{key}'") continue diff --git a/src/winml/modelkit/optracing/__init__.py b/src/winml/modelkit/optracing/__init__.py deleted file mode 100644 index e0c02a3d5..000000000 --- a/src/winml/modelkit/optracing/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Operator-level profiling for ModelKit.""" - -from __future__ import annotations - -from .base import OpTracer -from .registry import get_tracer, register_tracer -from .report import display_op_trace_report, write_op_trace_json -from .result import OperatorMetrics, OpTraceResult - - -def is_qnn_profiling_available() -> bool: - """Check if QNN EP is available for op-tracing.""" - try: - import onnxruntime as ort - - return "QNNExecutionProvider" in ort.get_available_providers() - except (ImportError, AttributeError): - return False - - -__all__ = [ - "OpTraceResult", - "OpTracer", - "OperatorMetrics", - "display_op_trace_report", - "get_tracer", - "is_qnn_profiling_available", - "register_tracer", - "write_op_trace_json", -] diff --git a/src/winml/modelkit/optracing/base.py b/src/winml/modelkit/optracing/base.py deleted file mode 100644 index e7a0ef4ab..000000000 --- a/src/winml/modelkit/optracing/base.py +++ /dev/null @@ -1,35 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -"""EP-agnostic operator profiling interface.""" -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING - - -if TYPE_CHECKING: - - from .result import OpTraceResult - - -class OpTracer(ABC): - """EP-agnostic operator profiling interface. - - Subclasses implement tracing logic for a specific execution provider - (e.g. QNN, DirectML, CUDA). - - Concrete implementations receive the model path and output directory - at construction time, then call ``run()`` to execute profiling. - """ - - @abstractmethod - def run(self, iterations: int = 5, warmup: int = 2) -> OpTraceResult: - """Run operator-level tracing and return structured results.""" - ... - - @abstractmethod - def is_available(self) -> bool: - """Check if this tracer's runtime dependencies are available.""" - ... diff --git a/src/winml/modelkit/optracing/qnn/profiler.py b/src/winml/modelkit/optracing/qnn/profiler.py deleted file mode 100644 index 2cc26503e..000000000 --- a/src/winml/modelkit/optracing/qnn/profiler.py +++ /dev/null @@ -1,351 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""QNN EP operator profiler using ORT. - -Orchestrates the end-to-end profiling workflow: - -1. Build an ORT ``InferenceSession`` with QNN EP and profiling options. -2. Run warmup + measured inference iterations. -3. Tear down the session to flush profiling data. -4. Parse the resulting CSV (basic) or run the profile viewer for QHAS - (detail) post-processing. -5. Return a structured ``OpTraceResult``. 
-""" -from __future__ import annotations - -import contextlib -import logging -import os -from pathlib import Path -from typing import Any - -import numpy as np - -from ..base import OpTracer -from ..result import OperatorMetrics, OpTraceResult -from .csv_parser import parse_qnn_profiling_csv -from .viewer import find_qnn_sdk, run_qhas_viewer - - -logger = logging.getLogger(__name__) - - -def _ort_type_to_numpy(ort_type: str) -> np.dtype: - """Map an ORT tensor type string to a NumPy dtype.""" - mapping: dict[str, np.dtype] = { - "tensor(float)": np.dtype("float32"), - "tensor(float16)": np.dtype("float16"), - "tensor(double)": np.dtype("float64"), - "tensor(int32)": np.dtype("int32"), - "tensor(int64)": np.dtype("int64"), - "tensor(int8)": np.dtype("int8"), - "tensor(uint8)": np.dtype("uint8"), - "tensor(bool)": np.dtype("bool"), - } - return mapping.get(ort_type, np.dtype("float32")) - - -def _resolve_shape(shape: list, default_dim: int = 1) -> list[int]: - """Replace symbolic or ``None`` dimensions with concrete values.""" - return [default_dim if not isinstance(d, int) or d <= 0 else d for d in shape] - - -@contextlib.contextmanager -def _working_directory(path: Path): - """Temporarily change CWD and restore on exit. - - QNN EP writes ``*_schematic.bin`` into the process CWD, so we - change to the output directory before creating the session. - """ - original = Path.cwd() - os.chdir(path) - try: - yield - finally: - os.chdir(original) - - -class QNNProfiler(OpTracer): - """QNN EP operator profiler using ORT. - - Parameters - ---------- - onnx_path: - Path to the ONNX model (or ``*_ctx.onnx`` context binary). - output_dir: - Directory for profiling artifacts (CSV, log, schematic, QHAS). - level: - Profiling level: ``"basic"`` (cycle counts per operator) or - ``"detail"`` (full QHAS with roofline / DMA traffic). 
- """ - - def __init__( - self, - onnx_path: Path, - *, - output_dir: Path, - level: str = "basic", - ) -> None: - self.onnx_path = Path(onnx_path) - self.output_dir = Path(output_dir) - self.level = level - - def is_available(self) -> bool: - """Check if QNN EP is available for profiling.""" - try: - import onnxruntime as ort - - return "QNNExecutionProvider" in ort.get_available_providers() - except (ImportError, AttributeError): - return False - - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - def run(self, iterations: int = 5, warmup: int = 2) -> OpTraceResult: - """Run profiling and return structured results. - - Parameters - ---------- - iterations: - Number of measured inference iterations. - warmup: - Number of un-measured warmup iterations (session compile / - JIT overhead). - """ - import onnxruntime as ort - - self.output_dir.mkdir(parents=True, exist_ok=True) - - csv_path = self.output_dir / "profiling_output.csv" - options = self._build_session_options(ort) - provider_options = self._build_provider_options(csv_path) - - # CWD must be output_dir so schematic.bin lands there. - with _working_directory(self.output_dir): - session = ort.InferenceSession( - str(self.onnx_path), - sess_options=options, - providers=["QNNExecutionProvider"], - provider_options=provider_options, - ) - - inputs = self._generate_inputs(session) - - # Warmup (not measured). - for _ in range(warmup): - session.run(None, inputs) - - # Measured iterations. - for _ in range(iterations): - session.run(None, inputs) - - # Tear down session to flush profiling data. 
- del session - - # ---- Post-processing ---- - return self._collect_results(csv_path, iterations) - - # ------------------------------------------------------------------ - # ORT configuration builders - # ------------------------------------------------------------------ - - def _build_session_options(self, ort_module: Any) -> Any: - """Create ``ort.SessionOptions`` with profiling config entries.""" - options = ort_module.SessionOptions() - options.add_session_config_entry( - "session.disable_cpu_ep_fallback", "1" - ) - options.add_session_config_entry("ep.context_enable", "1") - options.add_session_config_entry("ep.context_embed_mode", "0") - return options - - def _build_provider_options( - self, csv_path: Path - ) -> list[dict[str, str]]: - """Build QNN EP provider options dict. - - - ``basic`` mode uses ``profiling_level=detailed`` (per-op cycles). - - ``detail`` mode uses ``profiling_level=optrace`` (full QHAS). - """ - profiling_level = "optrace" if self.level == "detail" else "detailed" - - return [ - { - "backend_path": "QnnHtp.dll", - "htp_performance_mode": "high_performance", - "htp_graph_finalization_optimization_mode": "3", - "enable_htp_fp16_precision": "1", - "profiling_level": profiling_level, - "profiling_file_path": str(csv_path), - } - ] - - # ------------------------------------------------------------------ - # Input generation - # ------------------------------------------------------------------ - - @staticmethod - def _generate_inputs(session: Any) -> dict[str, np.ndarray]: - """Generate random inputs matching the model's I/O specification.""" - inputs: dict[str, np.ndarray] = {} - for inp in session.get_inputs(): - shape = _resolve_shape(inp.shape) - dtype = _ort_type_to_numpy(inp.type) - inputs[inp.name] = np.random.rand(*shape).astype(dtype) - return inputs - - # ------------------------------------------------------------------ - # Result collection - # ------------------------------------------------------------------ - - def 
_collect_results( - self, csv_path: Path, iterations: int - ) -> OpTraceResult: - """Parse profiling artifacts into an ``OpTraceResult``.""" - artifacts: dict[str, str] = {} - qnn_log = Path(str(csv_path) + "_qnn.log") - - if csv_path.is_file(): - artifacts["csv"] = str(csv_path) - if qnn_log.is_file(): - artifacts["qnn_log"] = str(qnn_log) - - # Locate schematic if present (detail mode). - schematic = self._find_schematic() - if schematic is not None: - artifacts["schematic"] = str(schematic) - - # --- Detail mode: attempt QHAS post-processing --- - if self.level == "detail" and qnn_log.is_file(): - qhas_result = self._try_qhas(qnn_log, schematic, artifacts) - if qhas_result is not None: - return qhas_result - - # --- Fallback / basic mode: parse CSV --- - if csv_path.is_file(): - return self._from_csv(csv_path, iterations, artifacts) - - # No artifacts at all -- return empty result. - logger.warning("No profiling artifacts found in %s", self.output_dir) - return OpTraceResult( - model=self.onnx_path.name, - device="npu", - tracing_level=self.level, - ep="QNNExecutionProvider", - tracing_backend="qnn", - num_samples=0, - artifacts=artifacts, - ) - - def _find_schematic(self) -> Path | None: - """Find a ``*_schematic.bin`` file in the output directory.""" - schematics = list(self.output_dir.glob("*_schematic.bin")) - if schematics: - return schematics[0] - return None - - def _try_qhas( - self, - qnn_log: Path, - schematic: Path | None, - artifacts: dict[str, str], - ) -> OpTraceResult | None: - """Attempt QHAS post-processing; return result or ``None``.""" - import json as _json - - if schematic is None or not schematic.is_file(): - logger.info( - "No schematic found; falling back to CSV for detail mode" - ) - return None - - qhas_output = self.output_dir / "qhas_output.json" - result_path = run_qhas_viewer( - qnn_log, schematic, qhas_output, sdk_root=find_qnn_sdk() - ) - - if result_path is None or not result_path.is_file(): - logger.info("QHAS viewer did not 
produce output; falling back") - return None - - artifacts["qhas"] = str(result_path) - from .qhas_parser import parse_qhas - - qhas_data = _json.loads(result_path.read_text(encoding="utf-8")) - parsed = parse_qhas(qhas_data) - - operators = [ - OperatorMetrics( - name=op["name"], - op_path=op["op_path"], - duration_us=op["duration_us"], - percent_of_total=op["percent_of_total"], - dominant_path_us=op.get("dominant_path_us"), - num_htp_ops=op.get("num_htp_ops"), - dram_read_bytes=op.get("dram_read_bytes"), - dram_write_bytes=op.get("dram_write_bytes"), - vtcm_read_bytes=op.get("vtcm_read_bytes"), - vtcm_write_bytes=op.get("vtcm_write_bytes"), - vtcm_hit_ratio=op.get("vtcm_hit_ratio"), - ) - for op in parsed["operators"] - ] - - return OpTraceResult( - model=self.onnx_path.name, - device="npu", - tracing_level="detail", - ep="QNNExecutionProvider", - tracing_backend="qnn", - operators=operators, - summary=parsed["summary"], - artifacts=artifacts, - ) - - def _from_csv( - self, - csv_path: Path, - iterations: int, - artifacts: dict[str, str], - ) -> OpTraceResult: - """Build an ``OpTraceResult`` from the basic CSV parser.""" - parsed = parse_qnn_profiling_csv(csv_path) - meta = parsed["metadata"] - - # Convert cycles to microseconds using the cycle-to-us factor. 
- total_cycles = meta.get("accel_execute_cycles", 0) - accel_us = meta.get("accel_execute_us", 0) - cycle_to_us = accel_us / total_cycles if total_cycles > 0 else 0.0 - - operators = [ - OperatorMetrics( - name=op["name"], - op_path=op["name"], - op_id=op["op_id"], - duration_us=op["cycles"] * cycle_to_us, - percent_of_total=( - op["cycles"] / total_cycles * 100 if total_cycles > 0 else 0 - ), - ) - for op in parsed["operators"] - ] - - return OpTraceResult( - model=self.onnx_path.name, - device="npu", - tracing_level=self.level, - ep="QNNExecutionProvider", - tracing_backend="qnn", - operators=operators, - num_samples=meta.get("num_samples", 0), - summary={ - "hvx_threads": meta.get("hvx_threads", 0), - "accel_execute_cycles": meta.get("accel_execute_cycles", 0), - "accel_execute_us": accel_us, - }, - artifacts=artifacts, - ) diff --git a/src/winml/modelkit/optracing/registry.py b/src/winml/modelkit/optracing/registry.py deleted file mode 100644 index cbd635a9b..000000000 --- a/src/winml/modelkit/optracing/registry.py +++ /dev/null @@ -1,64 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""EP tracer registry with substring-based pattern matching. - -Tracers register themselves against an EP *pattern* (e.g. ``"QNN"``) and -a profiling *level* (``"basic"`` or ``"detail"``). Lookup uses substring -matching so that ``"QNN"`` matches ``"QNNExecutionProvider"`` without -hardcoding full EP names. 
-""" -from __future__ import annotations - -from typing import TYPE_CHECKING - - -if TYPE_CHECKING: - from .base import OpTracer - -# {ep_pattern: {level: tracer_class}} -_TRACERS: dict[str, dict[str, type[OpTracer]]] = {} - - -def register_tracer( - ep_pattern: str, level: str, tracer_class: type[OpTracer] -) -> None: - """Register a tracer class for an EP pattern and profiling level. - - Parameters - ---------- - ep_pattern: - Substring that will be matched against EP names (e.g. ``"QNN"``). - level: - Profiling level identifier (e.g. ``"basic"``, ``"detail"``). - tracer_class: - The ``OpTracer`` subclass to register. - """ - _TRACERS.setdefault(ep_pattern, {})[level] = tracer_class - - -def get_tracer(ep_name: str, level: str) -> type[OpTracer] | None: - """Look up a tracer class by EP name and level. - - Uses substring matching: a registered pattern ``"QNN"`` will match - any *ep_name* that contains ``"QNN"`` (e.g. ``"QNNExecutionProvider"``). - - Returns ``None`` when no matching tracer is found. - """ - for pattern, levels in _TRACERS.items(): - if pattern in ep_name and level in levels: - return levels[level] - return None - - -def _register_defaults() -> None: - """Auto-register built-in tracers.""" - from .qnn.profiler import QNNProfiler - - register_tracer("QNN", "basic", QNNProfiler) - register_tracer("QNN", "detail", QNNProfiler) - - -# Eagerly register defaults on import. 
-_register_defaults() diff --git a/src/winml/modelkit/quant/__init__.py b/src/winml/modelkit/quant/__init__.py index fe3770eae..2e6c2c279 100644 --- a/src/winml/modelkit/quant/__init__.py +++ b/src/winml/modelkit/quant/__init__.py @@ -17,7 +17,6 @@ """ from .config import QuantizeResult, WinMLQuantizationConfig -from .quantizer import quantize_onnx __all__ = [ @@ -25,3 +24,25 @@ "WinMLQuantizationConfig", "quantize_onnx", ] + + +_LAZY_IMPORTS: dict[str, tuple[str, str]] = { + "quantize_onnx": (".quantizer", "quantize_onnx"), +} + + +def __getattr__(name: str): + """Lazy-load quantizer (imports onnxruntime.quantization).""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + import importlib + + mod = importlib.import_module(module_path, __name__) + val = getattr(mod, attr_name) + globals()[name] = val + return val + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return list(set(list(globals()) + __all__)) diff --git a/src/winml/modelkit/session/ep_registry.py b/src/winml/modelkit/session/ep_registry.py index bc6e9832d..63a14df33 100644 --- a/src/winml/modelkit/session/ep_registry.py +++ b/src/winml/modelkit/session/ep_registry.py @@ -49,6 +49,7 @@ def __init__(self) -> None: self._ep_paths: dict[str, str] = {} self._registered_eps: list[str] = [] + self._registration_failures: dict[str, str] = {} self._winml_available = False self._win_app_sdk_handle = None @@ -66,7 +67,9 @@ def _discover_eps(self) -> None: logger.warning("WinML not available (missing packages): %s", e) self._winml_available = False except Exception as e: - logger.warning("WinML EP discovery failed: %s", e) + # Include exception class so users can distinguish "no providers + # in catalog" (expected) from "init crashed" (broken env). 
+ logger.warning("WinML EP discovery failed (%s: %s)", type(e).__name__, e) self._winml_available = False def _fix_winrt_runtime(self) -> None: @@ -80,7 +83,9 @@ def _fix_winrt_runtime(self) -> None: dll_path.unlink() logger.debug("Removed conflicting msvcp140.dll from winrt-runtime") except Exception as e: - logger.debug("Could not fix winrt-runtime: %s", e) + # NFR-2: this function only runs in the known-needed init path — + # a failure here matters. Surface at WARNING with exception class. + logger.warning("Could not fix winrt-runtime (%s: %s)", type(e).__name__, e) def _init_windows_app_sdk(self) -> None: """Initialize Windows App SDK.""" @@ -126,9 +131,15 @@ def register_to_ort(self) -> list[str]: # Use ORT's native EP registration API ort.register_execution_provider_library(name, dll_path) self._registered_eps.append(name) + # Clear any prior failure record on successful re-register. + self._registration_failures.pop(name, None) logger.debug("Registered EP: %s -> %s", name, dll_path) except Exception as e: - logger.warning("Failed to register EP %s: %s", name, e) + # NFR-2: surface EP name + exception class so users can + # diagnose which provider failed to register and why. + msg = f"{type(e).__name__}: {e}" + self._registration_failures[name] = msg + logger.warning("Failed to register EP %s (%s)", name, msg) return self._registered_eps.copy() @@ -153,6 +164,16 @@ def winml_available(self) -> bool: """Whether WinML is available.""" return self._winml_available + @property + def registration_failures(self) -> dict[str, str]: + """Per-EP registration failures from the most recent ``register_to_ort()``. + + Maps EP name → ``"<ExcClass>: <message>"`` for any provider that + failed to register. Empty when all registrations succeeded. + Successful re-registration clears the corresponding entry. 
+ """ + return self._registration_failures.copy() + def __del__(self) -> None: """Cleanup Windows App SDK handle.""" if self._win_app_sdk_handle is not None: @@ -193,6 +214,35 @@ def get_ort_available_providers(use_winml: bool = True) -> list[str]: registry = WinMLEPRegistry.get_instance() registry.register_to_ort() except Exception as e: - logger.debug("WinML discovery skipped: %s", e) + # NFR-2: surface real failures at WARNING so users can diagnose. + logger.warning("WinML discovery skipped (%s: %s)", type(e).__name__, e) return ort.get_available_providers() + + +def ensure_initialized() -> None: + """Idempotent module-level entry point for WinML EP registration. + + Wraps ``WinMLEPRegistry.get_instance().register_to_ort()`` so callers + (e.g. ``QNNMonitor.is_available``) can trigger EP registration without + importing ``WinMLSession`` — breaks a latent import cycle. + + Safe to call multiple times. No-op if WinML is unavailable on this system. + + Failures during registration are logged at WARNING (NFR-2: must not be + silent) and swallowed so callers can probe availability without raising. + Subsequent calls retry — there is no module-level latch on failure. + """ + try: + registry = WinMLEPRegistry.get_instance() + if registry.winml_available: + registry.register_to_ort() + except Exception as exc: + # NFR-2: surface real environmental failures at WARNING with the + # exception class so users can distinguish "not on Windows" from + # "registration crashed". 
+ logger.warning( + "ensure_initialized: WinML EP registration failed (%s: %s)", + type(exc).__name__, + exc, + ) diff --git a/tests/unit/optracing/test_detection.py b/src/winml/modelkit/session/monitor/__init__.py similarity index 50% rename from tests/unit/optracing/test_detection.py rename to src/winml/modelkit/session/monitor/__init__.py index 5670d1105..299f5c4db 100644 --- a/tests/unit/optracing/test_detection.py +++ b/src/winml/modelkit/session/monitor/__init__.py @@ -2,11 +2,4 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -"""Test QNN EP detection for op-tracing.""" - -from winml.modelkit.optracing import is_qnn_profiling_available - - -def test_is_qnn_profiling_available_returns_bool(): - result = is_qnn_profiling_available() - assert isinstance(result, bool) +"""Per-EP monitors and op-tracing post-processing.""" diff --git a/src/winml/modelkit/session/monitor/ep_monitor.py b/src/winml/modelkit/session/monitor/ep_monitor.py index 7bbfa5d16..1533cdcef 100644 --- a/src/winml/modelkit/session/monitor/ep_monitor.py +++ b/src/winml/modelkit/session/monitor/ep_monitor.py @@ -12,7 +12,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, ClassVar if TYPE_CHECKING: @@ -27,15 +27,65 @@ class EPMonitor(ABC): Example:: - with session.perf(warmup=10) as stats: - with SomeEPMonitor() as hw: - for _ in range(110): - session.run(inputs) + with session.perf(warmup=10, monitor=SomeEPMonitor()) as ctx: + for _ in range(110): + session.run(inputs) - print(stats.mean_ms) # inference timing - print(hw.to_dict()) # proof-of-execution data + print(ctx.stats.mean_ms) + print(ctx.monitor.to_dict()) """ + # ---- Optional hooks: defaults provided; subclasses override as needed ---- + + #: ORT-specific hint: does this monitor's data flush require + #: 
``ort.InferenceSession`` destruction? Example: QNN flushes CSV only + #: on session destroy. Default: False (no teardown needed). + requires_session_teardown: ClassVar[bool] = False + + #: Target EP short name (e.g. ``"qnn"``). When set, ``WinMLSession.perf()`` + #: pins the session to this EP so provider options contributed via + #: :meth:`get_provider_options` actually flow through + #: ``add_provider_for_devices``. Without this, sessions without an explicit + #: ``ep`` fall back to ORT's policy-based selection which silently drops + #: provider options. ``None`` (default) means the monitor doesn't require + #: a specific EP — e.g. :class:`NullEPMonitor`, ``VitisAIMonitor`` whose + #: hooks return empty dicts. + ep_name: ClassVar[str | None] = None + + def __init_subclass__(cls, **kwargs: Any) -> None: + """Reject subclasses that try to shadow load-bearing class vars. + + The ``requires_session_teardown`` flag governs the C-2 teardown + ordering invariant in ``WinMLSession.perf()``. Catching a non-bool + shadow at class-definition time keeps the invariant *visible*; + runtime instance shadowing in ``__init__`` is not catchable here. + """ + super().__init_subclass__(**kwargs) + cls_dict_value = cls.__dict__.get("requires_session_teardown") + if cls_dict_value is not None and not isinstance(cls_dict_value, bool): + raise TypeError( + f"{cls.__name__}.requires_session_teardown must be a class-level bool, " + f"got {type(cls_dict_value).__name__}" + ) + + def get_session_options(self) -> dict[str, str]: + """Entries to pass to ``SessionOptions.add_session_config_entry()``. + + Default: empty dict. Override in subclasses that need e.g. + ``"session.disable_cpu_ep_fallback": "1"``. + """ + return {} + + def get_provider_options(self) -> dict[str, str]: + """Options to merge into ``add_provider_for_devices([ep], opts)``. + + Default: empty dict. Override in subclasses that need e.g. + ``"profiling_level": "detailed"``. 
+ """ + return {} + + # ---- Mandatory contract ---- + @abstractmethod def __enter__(self) -> Self: """Start hardware monitoring.""" diff --git a/src/winml/modelkit/optracing/result.py b/src/winml/modelkit/session/monitor/op_metrics.py similarity index 63% rename from src/winml/modelkit/optracing/result.py rename to src/winml/modelkit/session/monitor/op_metrics.py index 6ebd5ca32..fee4f3ce6 100644 --- a/src/winml/modelkit/optracing/result.py +++ b/src/winml/modelkit/session/monitor/op_metrics.py @@ -2,13 +2,34 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -"""Op-tracing result dataclasses for structured profiling output.""" +"""OpTraceResult + OperatorMetrics — structured profiling output. + +Relocated from ``optracing/result.py`` as part of the op-tracing refactor. +Extended with ``status`` / ``error`` fields for failure reporting. +""" + from __future__ import annotations import json from dataclasses import asdict, dataclass, field from datetime import datetime, timezone -from typing import Any +from typing import Any, Literal + + +#: Closed set of values for :attr:`OpTraceResult.status`. +#: +#: * ``"ok"`` — trace parsed cleanly. +#: * ``"no_data"`` — expected artifacts (e.g. profiling CSV) never appeared. +#: * ``"parse_failed"`` — artifacts were present but unparseable; ``error`` +#: carries the message. +#: * ``"basic_fallback"`` — caller asked for ``detail`` mode but the backend +#: could only produce basic data (e.g. QHAS unavailable). +#: * ``"not_run"`` — :py:meth:`__exit__` has not been called yet. +#: +#: ``Literal`` is enforced statically (mypy / ruff); at runtime ``status`` is +#: still a plain ``str`` so :py:meth:`OpTraceResult.to_dict` and JSON +#: serialization are unaffected. 
+TraceStatus = Literal["ok", "no_data", "parse_failed", "basic_fallback", "not_run"] @dataclass @@ -54,7 +75,7 @@ class OpTraceResult: """Complete op-tracing result.""" # Required - model: str + model: str | None device: str tracing_level: str # "basic" or "detail" operators: list[OperatorMetrics] = field(default_factory=list) @@ -62,9 +83,7 @@ class OpTraceResult: # Optional metadata ep: str = "" tracing_backend: str = "" - timestamp: str = field( - default_factory=lambda: datetime.now(timezone.utc).isoformat() - ) + timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat()) num_samples: int = 0 # Summary (model-level aggregates) @@ -76,8 +95,18 @@ class OpTraceResult: # Raw artifact paths artifacts: dict[str, str] = field(default_factory=dict) + # Status of the trace. See :data:`TraceStatus` for the closed set of + # legal values; static type checkers enforce the alias. + status: TraceStatus = "ok" + # Populated when status == "parse_failed". + error: str | None = None + def to_dict(self) -> dict[str, Any]: - """Serialize to structured dict for JSON output.""" + """Serialize to structured dict. + + Preserves existing nested schema; adds top-level ``status`` and + ``error`` keys additively. + """ return { "metadata": { "model": self.model, @@ -92,6 +121,9 @@ def to_dict(self) -> dict[str, Any]: "operators": [op.to_dict() for op in self.operators], "statistics": self.statistics, "artifacts": self.artifacts, + # ---- Additive ---- + "status": self.status, + "error": self.error, } def to_json(self, indent: int = 2) -> str: diff --git a/src/winml/modelkit/session/monitor/qnn/__init__.py b/src/winml/modelkit/session/monitor/qnn/__init__.py new file mode 100644 index 000000000..1211c9008 --- /dev/null +++ b/src/winml/modelkit/session/monitor/qnn/__init__.py @@ -0,0 +1,5 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +"""QNN-specific helpers for QNNMonitor: CSV parser, QHAS parser, viewer shell-out.""" diff --git a/src/winml/modelkit/optracing/qnn/csv_parser.py b/src/winml/modelkit/session/monitor/qnn/csv_parser.py similarity index 94% rename from src/winml/modelkit/optracing/qnn/csv_parser.py rename to src/winml/modelkit/session/monitor/qnn/csv_parser.py index 9c6dcc090..ebcf4fc7a 100644 --- a/src/winml/modelkit/optracing/qnn/csv_parser.py +++ b/src/winml/modelkit/session/monitor/qnn/csv_parser.py @@ -16,6 +16,7 @@ Multiple inference samples are separated by ROOT "Accelerator (execute) time (cycles)" boundaries. """ + from __future__ import annotations import csv @@ -89,11 +90,7 @@ def _extract_metadata(rows: list[dict[str, str]]) -> dict[str, Any]: if event_level != "ROOT": continue - if ( - event_id == "Number of HVX threads used" - and unit == "COUNT" - and hvx_threads is None - ): + if event_id == "Number of HVX threads used" and unit == "COUNT" and hvx_threads is None: hvx_threads = int(time_val) if ( @@ -103,11 +100,7 @@ def _extract_metadata(rows: list[dict[str, str]]) -> dict[str, Any]: ): accel_execute_cycles = int(time_val) - if ( - event_id == "Accelerator (execute) time" - and unit == "US" - and accel_execute_us is None - ): + if event_id == "Accelerator (execute) time" and unit == "US" and accel_execute_us is None: accel_execute_us = int(time_val) return { @@ -164,9 +157,7 @@ def _extract_samples(rows: list[dict[str, str]]) -> list[list[dict[str, Any]]]: return samples -def _parse_node_event( - event_id: str, time_val: str -) -> dict[str, Any] | None: +def _parse_node_event(event_id: str, time_val: str) -> dict[str, Any] | None: """Parse a single NODE SUB-EVENT identifier into name/op_id/cycles.""" m = _OP_PATTERN.match(event_id) if m is None: diff --git a/src/winml/modelkit/optracing/qnn/qhas_parser.py b/src/winml/modelkit/session/monitor/qnn/qhas_parser.py similarity index 99% rename 
from src/winml/modelkit/optracing/qnn/qhas_parser.py rename to src/winml/modelkit/session/monitor/qnn/qhas_parser.py index 9bc267ccc..5bc26f880 100644 --- a/src/winml/modelkit/optracing/qnn/qhas_parser.py +++ b/src/winml/modelkit/session/monitor/qnn/qhas_parser.py @@ -9,6 +9,7 @@ information. This module transforms the raw JSON into a normalised dict suitable for the detail-mode op-tracing report. """ + from __future__ import annotations diff --git a/src/winml/modelkit/optracing/qnn/viewer.py b/src/winml/modelkit/session/monitor/qnn/viewer.py similarity index 82% rename from src/winml/modelkit/optracing/qnn/viewer.py rename to src/winml/modelkit/session/monitor/qnn/viewer.py index 7eed61d0b..886e71bc1 100644 --- a/src/winml/modelkit/optracing/qnn/viewer.py +++ b/src/winml/modelkit/session/monitor/qnn/viewer.py @@ -14,6 +14,7 @@ optrace-reader config to produce full QHAS JSON with roofline, DMA traffic, and memory information. """ + from __future__ import annotations import json @@ -40,41 +41,19 @@ } } -# Common SDK installation directories (Windows). -_COMMON_SDK_PATHS: list[str] = [ - r"D:\QC", - r"C:\Qualcomm\AIStack\qairt", -] - def find_qnn_sdk() -> Path | None: - """Auto-detect QNN SDK installation. - - Resolution order: - 1. ``QNN_SDK_ROOT`` environment variable. - 2. Common installation directories on Windows. + """Resolve QNN SDK root from ``QNN_SDK_ROOT`` env var. - Returns the SDK root ``Path`` or ``None`` when not found. + Returns ``None`` when unset or pointing to a non-directory. Detail-mode + QHAS post-processing degrades to basic CSV parsing when this returns + ``None`` (per design FR-5 / ``status='basic_fallback'``). 
""" env_root = os.environ.get("QNN_SDK_ROOT") - if env_root: - root = Path(env_root) - if root.is_dir(): - logger.debug("QNN SDK found via QNN_SDK_ROOT: %s", root) - return root - - for base in _COMMON_SDK_PATHS: - base_path = Path(base) - if not base_path.is_dir(): - continue - # Look for a versioned subdirectory containing bin/ - for child in sorted(base_path.iterdir(), reverse=True): - if child.is_dir() and (child / "bin").is_dir(): - logger.debug("QNN SDK found at: %s", child) - return child - - logger.debug("QNN SDK not found") - return None + if not env_root: + return None + root = Path(env_root) + return root if root.is_dir() else None def _find_viewer_exe(sdk_root: Path | None = None) -> Path | None: @@ -125,7 +104,10 @@ def run_basic_viewer( """ viewer = _find_viewer_exe(sdk_root) if viewer is None: - logger.warning("qnn-profile-viewer not found; skipping basic viewer") + logger.warning( + "qnn-profile-viewer not found; set QNN_SDK_ROOT to enable detail mode " + "(falling back to basic CSV)" + ) return None cmd = [ @@ -180,7 +162,10 @@ def run_qhas_viewer( """ viewer = _find_viewer_exe(sdk_root) if viewer is None: - logger.warning("qnn-profile-viewer not found; skipping QHAS viewer") + logger.warning( + "qnn-profile-viewer not found; set QNN_SDK_ROOT to enable detail mode " + "(falling back to basic CSV)" + ) return None if not schematic.is_file(): diff --git a/src/winml/modelkit/session/monitor/qnn_monitor.py b/src/winml/modelkit/session/monitor/qnn_monitor.py index 5a4fb95c7..1ece35346 100644 --- a/src/winml/modelkit/session/monitor/qnn_monitor.py +++ b/src/winml/modelkit/session/monitor/qnn_monitor.py @@ -2,38 +2,245 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -"""QNNMonitor - Placeholder for future Qualcomm QNN-specific NPU monitoring. +"""QNNMonitor — Qualcomm NPU per-op profiler via ORT's QNN EP. 
-For real-time NPU utilization monitoring with QNN EP, use HWMonitor -(universal PDH-based). This module is reserved for future Qualcomm-specific -telemetry such as QAIRT profiling via qnn-profile-viewer.exe (device -execution time, queue wait, per-op traces). +Produces an :class:`OpTraceResult` with per-operator cycle counts +(``level="basic"``) or full QHAS roofline / DMA traffic +(``level="detail"``). + +Contributes session options and provider options to a ``WinMLSession`` via +the two :class:`EPMonitor` hooks; owns the ``profiling_level`` and +``profiling_file_path`` provider-option keys (C-3 in PRD — never +user-overridable). Requires ``ort.InferenceSession`` teardown before +``__exit__`` because QNN EP flushes the profiling CSV only on session +destruction. """ from __future__ import annotations -from typing import TYPE_CHECKING, Any +import json +import logging +import tempfile +import time +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Literal from .ep_monitor import EPMonitor +from .op_metrics import OperatorMetrics, OpTraceResult, TraceStatus +from .qnn.csv_parser import parse_qnn_profiling_csv +from .qnn.qhas_parser import parse_qhas +from .qnn.viewer import find_qnn_sdk, run_qhas_viewer if TYPE_CHECKING: + from collections.abc import Mapping + from typing_extensions import Self +logger = logging.getLogger(__name__) + + +# Maps user-facing level to QNN EP's `profiling_level` provider option. +_LEVEL_TO_PROFILING: dict[str, str] = { + "basic": "detailed", + "detail": "optrace", +} + + class QNNMonitor(EPMonitor): - """Placeholder for future Qualcomm QNN-specific NPU monitoring. + """Qualcomm NPU per-op profiler via ORT's QNN EP. - For real-time NPU utilization monitoring with QNN EP, - use ``HWMonitor`` (universal PDH-based). + Produces an :class:`OpTraceResult` with per-operator cycle counts + (``level="basic"``) or full QHAS roofline / DMA traffic + (``level="detail"``). 
- Future: Will wrap QAIRT profiling via ``qnn-profile-viewer.exe`` - for Qualcomm-specific metrics (device execution time, queue wait, - per-op traces). + .. note:: + + When ``output_dir`` is ``None``, a per-monitor temp directory + (``qnn_profile_*``) is created under the OS tempdir and is **never + auto-cleaned** so that profiling artifacts (CSV, QHAS JSON, + schematic, QNN log) remain available for post-run inspection. + Callers that care about disk hygiene should pass an explicit + ``output_dir`` they manage. The chosen directory is exposed via + :py:attr:`output_dir`. """ + #: QNN EP flushes the profiling CSV only on ``ort.InferenceSession`` + #: destruction; ``WinMLSession.perf().__exit__`` must drop the session + #: before calling ``monitor.__exit__``. + requires_session_teardown: ClassVar[bool] = True + + #: Pins ``WinMLSession`` to the QNN EP path so provider options + #: (``profiling_level``, ``profiling_file_path``) flow through + #: ``add_provider_for_devices``. Without this, the session would use + #: ORT's policy-based selection which silently drops provider options. + ep_name: ClassVar[str | None] = "qnn" + + def __init__( + self, + level: Literal["basic", "detail"] = "basic", + output_dir: Path | None = None, + extra_provider_options: Mapping[str, str] | None = None, + ) -> None: + """Initialize the monitor. + + Args: + level: ``"basic"`` (cycles only) or ``"detail"`` (QHAS roofline + + DMA traffic). + output_dir: Directory for profiling artifacts. When ``None``, a + per-monitor temp directory ``qnn_profile_*`` is created under + the OS tempdir; that directory is **never auto-cleaned** so + artifacts can be inspected post-run. Pass an explicit path if + you want to manage cleanup yourself. + extra_provider_options: Additional QNN EP provider options. The + two profiling-control keys (``profiling_level``, + ``profiling_file_path``) are owner-enforced per PRD C-3 and + cannot be overridden via this argument. 
+ """ + if level not in _LEVEL_TO_PROFILING: + raise ValueError(f"level must be 'basic' or 'detail', got {level!r}") + self._level: str = level + # Idempotency: paths produced at __init__, not per-call. + # When output_dir is None we mint a fresh tempdir; we deliberately + # do NOT register a finalizer to clean it up — see class docstring. + self._output_dir: Path = ( + Path(output_dir) + if output_dir is not None + else Path(tempfile.mkdtemp(prefix="qnn_profile_")) + ) + self._output_dir.mkdir(parents=True, exist_ok=True) + self._csv_path: Path = (self._output_dir / "profiling_output.csv").resolve() + self._extra: dict[str, str] = dict(extra_provider_options or {}) + self._entered: bool = False + self._result: OpTraceResult | None = None + + # ------------------------------------------------------------------ + # Public read-only accessors + # ------------------------------------------------------------------ + + @property + def output_dir(self) -> Path: + """Directory where profiling artifacts (CSV, QHAS JSON, schematic) are written. + + When ``output_dir=None`` was passed at construction, this is a + per-monitor temp directory (``qnn_profile_*``) under the OS tempdir. + The directory is **NOT auto-cleaned** — artifacts persist for + post-hoc inspection. Callers that care about disk hygiene should + pass an explicit ``output_dir`` they manage. + """ + return self._output_dir + + # ------------------------------------------------------------------ + # Availability + # ------------------------------------------------------------------ + + @classmethod + def is_available(cls) -> bool: + """Whether the QNN EP is usable on this system. + + Checks two paths in order: + + 1. ``onnxruntime-qnn`` bundled wheel: ``QNNExecutionProvider`` is + already in :func:`onnxruntime.get_available_providers`. + 2. 
``onnxruntime-windowsml``: call + :func:`ep_registry.ensure_initialized` to trigger WinML EP + registration, then look for a QNN device in + :func:`onnxruntime.get_ep_devices`. + """ + try: + import onnxruntime as ort + except ImportError: + return False + + if "QNNExecutionProvider" in ort.get_available_providers(): + return True + + # WinML-registered path. + try: + from ..ep_registry import ensure_initialized + except ImportError: + return False + + try: + ensure_initialized() + return any( + getattr(d, "ep_name", None) == "QNNExecutionProvider" for d in ort.get_ep_devices() + ) + except Exception as exc: + # Real environmental failure (e.g., broken Windows App SDK, + # denied registration, missing DLL) — surface at WARNING so + # users can diagnose. NFR-2: this MUST NOT be silent. + logger.warning( + "QNNMonitor.is_available: WinML EP probe failed (%s: %s); reporting unavailable", + type(exc).__name__, + exc, + ) + return False + + # ------------------------------------------------------------------ + # Hook contributions + # ------------------------------------------------------------------ + + def get_session_options(self) -> dict[str, str]: + """Session config entries required for QNN op-tracing. + + Only EPContext caching is opted into here — embed_mode=0 keeps the + compiled binary external so the cached ONNX stays small. + + ``session.disable_cpu_ep_fallback`` is intentionally NOT set: under + ``onnxruntime-windowsml`` the WinML-registered QNN partitions a + QDQ-wrapped EPContext model into Q/DQ-on-CPU + EPContext-on-QNN, + which is correct behaviour (the boundary Q/DQ ops genuinely run on + CPU). Disabling CPU fallback would reject that valid partition and + cause NotImplemented errors even when QNN successfully claimed the + EPContext node. The "no silent CPU fallback" guarantee is provided + by ``add_provider_for_devices`` upstream — if the QNN device is + absent, session creation fails loudly there. 
+ """ + return { + "ep.context_enable": "1", + "ep.context_embed_mode": "0", + } + + def get_provider_options(self) -> dict[str, str]: + """Provider options for QNN EP with owner-enforced profiling keys. + + Only the two profiling keys (``profiling_level``, ``profiling_file_path``) + are owner-set; everything else is pass-through from ``extra_provider_options``. + This is deliberate: ORT's ``add_provider_for_devices`` merges these + options on top of whatever the device source pre-configured. Under + ``onnxruntime-windowsml`` the WinML-registered QNN device already has + an absolute ``backend_path`` and tuned HTP defaults; supplying our own + defaults here would *overwrite* WinML's and break DLL loading. + + Callers who need to tune HTP behaviour (e.g. ``backend_path`` for + the bundled ``onnxruntime-qnn`` path, or ``htp_performance_mode``) + pass them via ``extra_provider_options`` at construction time. + + Build order (last writer wins): + + 1. ``self._extra`` — caller-supplied options (may include backend + settings the bundled-ORT path needs). + 2. ``profiling_level`` and ``profiling_file_path`` — applied LAST; + owner-enforced per C-3 (PRD). Assigned explicitly after + :py:meth:`dict.update` to avoid Ruff ``F601`` on duplicate keys + and to guarantee they cannot be shadowed by ``extra``. + """ + opts: dict[str, str] = dict(self._extra) + # C-3: these two keys are NEVER user-overridable. 
+ opts["profiling_level"] = _LEVEL_TO_PROFILING[self._level] + opts["profiling_file_path"] = str(self._csv_path) + return opts + + # ------------------------------------------------------------------ + # Context manager + # ------------------------------------------------------------------ + def __enter__(self) -> Self: - """No-op: no Qualcomm-specific monitoring yet.""" + if self._entered: + raise RuntimeError("QNNMonitor already entered") + self._entered = True return self def __exit__( @@ -42,13 +249,216 @@ def __exit__( exc_val: BaseException | None, exc_tb: Any, ) -> None: - """No-op: no cleanup needed.""" + """Parse whatever artifacts are on disk. Never suppresses caller exceptions.""" + try: + self._result = self._parse_artifacts() + except Exception as exc: + logger.warning("QNNMonitor: artifact parse failed: %s", exc) + self._result = self._make_failure_result(status="parse_failed", error=str(exc)) + # Implicit None return → does not suppress caller exception. - @classmethod - def is_available(cls) -> bool: - """No Qualcomm-specific telemetry available yet.""" - return False + # ------------------------------------------------------------------ + # Public accessors + # ------------------------------------------------------------------ def to_dict(self) -> dict[str, Any]: - """Stub dict indicating not-implemented status.""" - return {"ep": "QNN", "device": "NPU", "status": "not_implemented"} + """JSON-serializable summary in :class:`OpTraceResult`'s nested schema. + + Pre-exit (``self._result is None``) the monitor still returns the same + nested shape — ``{"metadata": ..., "summary": ..., "operators": ..., + "statistics": ..., "artifacts": ..., "status": "not_run", "error": + None}`` — by delegating to a ``status="not_run"`` failure result. This + keeps consumers that key on ``metadata`` / ``summary`` from breaking + when they probe the monitor before the context manager has exited. 
+ """ + if self._result is None: + return self._make_failure_result(status="not_run", error=None).to_dict() + return self._result.to_dict() + + @property + def result(self) -> OpTraceResult | None: + """Structured result object. Preferred by report writers.""" + return self._result + + # ------------------------------------------------------------------ + # Artifact parsing + # ------------------------------------------------------------------ + + def _parse_artifacts(self) -> OpTraceResult: + """Parse CSV (always) and optionally QHAS (detail mode). + + Windows file-handle lag mitigation (R-2): if the CSV is absent on + the first check, sleep 50ms and retry once before giving up. + """ + csv_path = self._csv_path + if not csv_path.is_file(): + time.sleep(0.05) # R-2: Windows file-handle flush lag + if not csv_path.is_file(): + logger.warning("QNNMonitor: profiling CSV not produced at %s", csv_path) + return self._make_failure_result(status="no_data", error=None) + + parsed = parse_qnn_profiling_csv(csv_path) + meta = parsed.get("metadata", {}) + artifacts: dict[str, str] = {"csv": str(csv_path)} + + # Convert cycles to microseconds via the CSV-reported ratio. + total_cycles = int(meta.get("accel_execute_cycles", 0) or 0) + accel_us = int(meta.get("accel_execute_us", 0) or 0) + cycle_to_us = accel_us / total_cycles if total_cycles > 0 else 0.0 + + operators: list[OperatorMetrics] = [ + OperatorMetrics( + name=op["name"], + op_path=op["name"], + op_id=op.get("op_id"), + duration_us=op["cycles"] * cycle_to_us, + percent_of_total=((op["cycles"] / total_cycles * 100) if total_cycles > 0 else 0.0), + ) + for op in parsed.get("operators", []) + ] + + summary: dict[str, Any] = { + "hvx_threads": meta.get("hvx_threads", 0), + "accel_execute_cycles": total_cycles, + "accel_execute_us": accel_us, + } + + status: TraceStatus = "ok" + # Detail mode: attempt QHAS post-processing. 
+ if self._level == "detail": + qhas_summary, qhas_operators, qhas_path = self._try_qhas(artifacts) + if qhas_path is not None and qhas_operators is not None: + operators = qhas_operators + summary = qhas_summary or summary + artifacts["qhas"] = str(qhas_path) + else: + # Fell back to CSV-only data in detail mode. + status = "basic_fallback" + logger.warning("QNNMonitor: QHAS unavailable; detail mode degraded to basic") + + return OpTraceResult( + model=None, + device="npu", + tracing_level=self._level, + ep="QNNExecutionProvider", + tracing_backend="qnn", + operators=operators, + summary=summary, + num_samples=int(meta.get("num_samples", 0) or 0), + artifacts=artifacts, + status=status, + ) + + def _try_qhas( + self, artifacts: dict[str, str] + ) -> tuple[dict[str, Any] | None, list[OperatorMetrics] | None, Path | None]: + """Attempt QHAS post-processing. + + Returns ``(summary, operators, qhas_path)`` on success, or + ``(None, None, None)`` on any failure. Never raises. + + Per C-5 / FR-12 this method does NOT call :func:`os.chdir`. The + ``*_schematic.bin`` is located via :py:meth:`Path.glob` in the + output directory first, then the process CWD as a read-only + fallback. + """ + # Find the QNN log. + qnn_logs = list(self._output_dir.glob("*_qnn.log")) + if not qnn_logs: + logger.debug("QNNMonitor: no *_qnn.log found for QHAS") + return None, None, None + qnn_log = qnn_logs[0] + + # Find the schematic (glob, never chdir). 
+ schematic = self._find_schematic() + if schematic is None: + logger.debug("QNNMonitor: no *_schematic.bin found for QHAS") + return None, None, None + + sdk_root = find_qnn_sdk() + if sdk_root is None: + logger.debug("QNNMonitor: QNN SDK not located; skipping QHAS") + return None, None, None + + qhas_output = self._output_dir / "qhas_output.json" + result_path = run_qhas_viewer(qnn_log, schematic, qhas_output, sdk_root=sdk_root) + if result_path is None or not result_path.is_file(): + logger.debug("QNNMonitor: QHAS viewer produced no output") + return None, None, None + + artifacts["schematic"] = str(schematic) + + try: + qhas_data = json.loads(result_path.read_text(encoding="utf-8")) + parsed = parse_qhas(qhas_data) + except Exception as exc: + logger.warning("QNNMonitor: QHAS JSON parse failed: %s", exc) + return None, None, None + + operators = [ + OperatorMetrics( + name=op["name"], + op_path=op["op_path"], + duration_us=op["duration_us"], + percent_of_total=op["percent_of_total"], + dominant_path_us=op.get("dominant_path_us"), + num_htp_ops=op.get("num_htp_ops"), + dram_read_bytes=op.get("dram_read_bytes"), + dram_write_bytes=op.get("dram_write_bytes"), + vtcm_read_bytes=op.get("vtcm_read_bytes"), + vtcm_write_bytes=op.get("vtcm_write_bytes"), + vtcm_hit_ratio=op.get("vtcm_hit_ratio"), + ) + for op in parsed.get("operators", []) + ] + return parsed.get("summary"), operators, result_path + + def _find_schematic(self) -> Path | None: + """Locate ``*_schematic.bin`` without mutating CWD. + + Search order: + + 1. :attr:`_output_dir` (where ``profiling_file_path`` points). + 2. Process CWD (glob-only; no :func:`os.chdir`) — the QNN SDK + occasionally drops the schematic next to the process's current + directory rather than next to the profiling CSV. 
+ + The CWD fallback is **mtime-gated** against the profiling CSV: a + schematic from a prior CI run sitting in CWD would otherwise be + silently consumed and produce QHAS metrics for the wrong graph + with ``status="ok"`` — silent data corruption. The schematic must + be at least as new as the CSV (with a 5s tolerance for filesystem + clock skew) to be accepted. + """ + candidates = list(self._output_dir.glob("*_schematic.bin")) + if candidates: + return candidates[0] + # Fallback: read-only glob of process CWD. No chdir. + # Reject stale schematics older than the profiling CSV. + csv_mtime = self._csv_path.stat().st_mtime if self._csv_path.is_file() else 0.0 + fresh = [ + p for p in Path.cwd().glob("*_schematic.bin") if p.stat().st_mtime >= csv_mtime - 5.0 + ] + if fresh: + logger.warning( + "QNNMonitor: located *_schematic.bin in CWD (%s) rather than output dir (%s)", + fresh[0].parent, + self._output_dir, + ) + return fresh[0] + return None + + def _make_failure_result(self, status: TraceStatus, error: str | None) -> OpTraceResult: + """Build a minimal ``OpTraceResult`` for parse-time failures.""" + return OpTraceResult( + model=None, + device="npu", + tracing_level=self._level, + ep="QNNExecutionProvider", + tracing_backend="qnn", + operators=[], + summary={}, + artifacts={"csv": str(self._csv_path)}, + status=status, + error=error, + ) diff --git a/src/winml/modelkit/optracing/report.py b/src/winml/modelkit/session/monitor/report.py similarity index 91% rename from src/winml/modelkit/optracing/report.py rename to src/winml/modelkit/session/monitor/report.py index 8804f19c3..7e859f493 100644 --- a/src/winml/modelkit/optracing/report.py +++ b/src/winml/modelkit/session/monitor/report.py @@ -2,7 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -"""Console report and JSON file output for op-tracing results.""" +"""Report helpers — display / write JSON for op-trace results. + +Relocated from optracing/report.py as part of the op-tracing refactor. +""" + from __future__ import annotations from pathlib import Path @@ -10,7 +14,7 @@ from rich.console import Console from rich.table import Table -from .result import OpTraceResult # noqa: TC001 (used at runtime) +from .op_metrics import OpTraceResult # noqa: TC001 (used at runtime) def display_op_trace_report( @@ -85,9 +89,7 @@ def _format_number(n: float | int | None) -> str: return f"{n:,}" -def _display_basic_report( - result: OpTraceResult, console: Console, top_n: int -) -> None: +def _display_basic_report(result: OpTraceResult, console: Console, top_n: int) -> None: """Render a basic-mode report with operator name, avg cycles, and %.""" # Header console.print() @@ -130,18 +132,14 @@ def _display_basic_report( console.print(table) -def _display_detail_report( - result: OpTraceResult, console: Console, top_n: int -) -> None: +def _display_detail_report(result: OpTraceResult, console: Console, top_n: int) -> None: """Render a detail-mode report with memory and cache columns.""" # Header backend_suffix = "" if result.tracing_backend: backend_suffix = f" -- {result.tracing_backend}" console.print() - console.rule( - f"[bold]Op-Level Profiling (detail){backend_suffix}[/bold]" - ) + console.rule(f"[bold]Op-Level Profiling (detail){backend_suffix}[/bold]") # Summary lines summary = result.summary @@ -188,11 +186,7 @@ def _display_detail_report( table.add_column("VTCM Hit", justify="right", min_width=9) for i, op in enumerate(ops, 1): - vtcm_str = ( - f"{op.vtcm_hit_ratio * 100:.1f}%" - if op.vtcm_hit_ratio is not None - else "-" - ) + vtcm_str = f"{op.vtcm_hit_ratio * 100:.1f}%" if op.vtcm_hit_ratio is not None else "-" table.add_row( str(i), op.op_path, diff --git 
a/src/winml/modelkit/session/session.py b/src/winml/modelkit/session/session.py index a8b7da920..74c28fd4e 100644 --- a/src/winml/modelkit/session/session.py +++ b/src/winml/modelkit/session/session.py @@ -6,9 +6,12 @@ from __future__ import annotations +import gc import logging import os +import sys from contextlib import contextmanager +from dataclasses import dataclass from enum import Enum from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -18,7 +21,8 @@ from ..core.onnx_utils import get_io_config from ..onnx import is_compiled_onnx -from .ep_registry import WinMLEPRegistry +from .ep_registry import ensure_initialized +from .monitor.ep_monitor import EPMonitor, NullEPMonitor from .stats import PerfStats @@ -33,6 +37,28 @@ logger = logging.getLogger(__name__) +@contextmanager +def _suppress_native_output(log_path: str | Path | None = None): + """Redirect native stdout to a log file (or devnull). + + QNN SDK compiler writes progress to stdout via native C++ code that + Python logging/warnings cannot intercept. Only redirects stdout — + stderr is left untouched so Rich displays and Python logging work. + """ + if log_path is not None: + fd = os.open(str(log_path), os.O_WRONLY | os.O_CREAT | os.O_TRUNC) + else: + fd = os.open(os.devnull, os.O_WRONLY) + old_stdout = os.dup(1) + os.dup2(fd, 1) + os.close(fd) + try: + yield + finally: + os.dup2(old_stdout, 1) + os.close(old_stdout) + + class SessionState(Enum): """WinMLSession states.""" @@ -42,6 +68,19 @@ class SessionState(Enum): ERROR = "ERROR" +@dataclass(frozen=True) +class PerfContext: + """Yielded by ``WinMLSession.perf()``. + + Aggregates perf statistics and the optional attached EP monitor. + Frozen: mutation is not a supported pattern — update the underlying + objects instead. 
+ """ + + stats: PerfStats + monitor: EPMonitor # NullEPMonitor when no monitor was passed + + # Device to ORT policy mapping (no EP names - let ORT select provider) DEVICE_POLICY_MAP = { "npu": ort.OrtExecutionProviderDevicePolicy.PREFER_NPU, @@ -109,9 +148,6 @@ class WinMLSession: outputs = session.run({"input": tensor}) """ - # Class-level flag for one-time EP initialization - _eps_initialized: bool = False - # EP short name -> ORT full provider name (for add_provider_for_devices matching) _EP_NAME_MAP: ClassVar[dict[str, str]] = { "qnn": "QNNExecutionProvider", @@ -124,22 +160,6 @@ class WinMLSession: "cpu": "CPUExecutionProvider", } - @classmethod - def _init_winml_eps_once(cls) -> None: - """Initialize WinML EP registry once at class level.""" - if cls._eps_initialized: - return - - try: - registry = WinMLEPRegistry.get_instance() - if registry.winml_available: - registered = registry.register_to_ort() - logger.info("WinML EPs registered: %s", registered) - except Exception as e: - logger.debug("WinML EP init skipped: %s", e) - finally: - cls._eps_initialized = True - def __init__( self, onnx_path: str | Path, @@ -166,17 +186,20 @@ def __init__( session_options: ORT SessionOptions. If None, creates default with policy based on device parameter. 
""" - WinMLSession._init_winml_eps_once() + ensure_initialized() self._onnx_path = Path(onnx_path) if not self._onnx_path.exists(): raise FileNotFoundError(f"ONNX model not found: {onnx_path}") - self._device = device + # HF Pipeline may pass torch.device; coerce to string for downstream .lower() calls + self._device = str(device) if not isinstance(device, str) else device self._ep = ep.lower() if ep else None self._persist_jit = ep_config.enable_ep_context if ep_config else False self._embed_context = ep_config.embed_context if ep_config else False self._provider_options = ep_config.provider_options if ep_config else {} + # Monitor-contributed session config entries (populated by session.perf(monitor=...)) + self._active_session_option_entries: dict[str, str] = {} # Create session_options with device policy if session_options is None: @@ -234,6 +257,10 @@ def compile(self) -> None: logger.info("Using cached EPContext: %s", ctx_path) # Compile if needed (persist_jit=True and no cache) + # Native QNN SDK compiler writes progress to stdout/stderr; + # redirect to log file to keep the console clean. 
+ compile_log = self._onnx_path.parent / "compile.log" + if self._persist_jit and model_path == self._onnx_path: # Skip ModelCompiler if input model is already compiled (EPContext) if is_compiled_onnx(self._onnx_path): @@ -247,7 +274,8 @@ def compile(self) -> None: str(self._onnx_path), embed_compiled_data_into_model=self._embed_context, ) - model_compiler.compile_to_file(str(ctx_path)) + with _suppress_native_output(compile_log): + model_compiler.compile_to_file(str(ctx_path)) # Use compiled model if it was created if ctx_path.exists(): @@ -261,7 +289,8 @@ def compile(self) -> None: try: # Create InferenceSession sess_options = self._build_session_options(target_device) - session = ort.InferenceSession(str(model_path), sess_options=sess_options) + with _suppress_native_output(compile_log): + session = ort.InferenceSession(str(model_path), sess_options=sess_options) # Log which providers were selected by ORT (based on policy) actual_providers = session.get_providers() @@ -288,6 +317,15 @@ def compile(self) -> None: self._session = session self._state = SessionState.COMPILED + # Resolve device label from the primary provider ORT actually selected + if self._device == "auto" and actual_providers: + from ..sysinfo.device import get_ep_device_map + + ep_map = get_ep_device_map() + resolved = ep_map.get(actual_providers[0]) + if resolved and "/" not in resolved: + self._device = resolved + def run( self, inputs: dict[str, Any], @@ -397,6 +435,9 @@ def _build_session_options(self, device: str) -> ort.SessionOptions: self._ep, target_name, ) + # Apply monitor-contributed session config entries + for key, value in self._active_session_option_entries.items(): + opts.add_session_config_entry(key, value) return opts logger.warning( "EP '%s' (%s) not found in available devices; falling back to policy", @@ -410,6 +451,9 @@ def _build_session_options(self, device: str) -> ort.SessionOptions: device.lower(), ort.OrtExecutionProviderDevicePolicy.PREFER_NPU ) 
opts.set_provider_selection_policy(policy) + # Apply monitor-contributed session config entries + for key, value in self._active_session_option_entries.items(): + opts.add_session_config_entry(key, value) return opts @@ -543,26 +587,94 @@ def perf_stats(self) -> PerfStats | None: return self._perf_stats @contextmanager - def perf(self, warmup: int = 0) -> Generator[PerfStats, None, None]: - """Context manager for scoped performance tracking. + def perf( + self, + warmup: int = 0, + monitor: EPMonitor | None = None, + ) -> Generator[PerfContext, None, None]: + """Run a scoped performance window yielding a :class:`PerfContext`. Args: warmup: Number of initial samples to exclude from statistics. + monitor: Optional :class:`EPMonitor`. Contributes session/provider + options at compile time (auto-resets the session if already + compiled with different options — logs WARNING). Parses + artifacts on exit. Yields: - PerfStats instance that collects timing data within the context. + :class:`PerfContext` with ``stats: PerfStats`` and + ``monitor: EPMonitor`` (:class:`NullEPMonitor` when caller passed + ``monitor=None``). + + Raises: + RuntimeError: If another ``perf()`` context is already active on + this session (nested ``perf()`` is forbidden). Example: - >>> with session.perf(warmup=10) as stats: + >>> with session.perf(warmup=10) as ctx: ... for _ in range(110): ... 
session.run(inputs) - >>> print(f"P99: {stats.p99_ms:.2f} ms") # Based on last 100 samples + >>> print(f"P99: {ctx.stats.p99_ms:.2f} ms") """ - self._perf_stats = PerfStats(warmup=warmup) + if self._perf_stats is not None: + raise RuntimeError("session.perf() already active (nested perf is forbidden)") + + mon: EPMonitor = monitor if monitor is not None else NullEPMonitor() + + # Collect hook contributions — must be idempotent per EPMonitor contract + extra_sess = mon.get_session_options() + extra_prov = mon.get_provider_options() + + # Pin the EP when the monitor declares one and the session doesn't + # already have an explicit EP. Without this, _build_session_options + # takes the policy-based path which silently drops provider options + # (e.g. profiling_level for QNN op-tracing). + needs_ep_pin = mon.ep_name is not None and self._ep is None + + # Auto-reset if options to apply AND session is already compiled + if (extra_sess or extra_prov or needs_ep_pin) and self._session is not None: + logger.warning( + "auto-resetting compiled session to apply monitor session/provider options " + "(monitor=%s)", + type(mon).__name__, + ) + self.reset() + + # Snapshot BEFORE mutation so finally can always restore + saved_sess_entries = dict(self._active_session_option_entries) + saved_prov = dict(self._provider_options) + saved_ep = self._ep + stats = PerfStats(warmup=warmup) + + mon_entered = False try: - yield self._perf_stats + # Mutations happen inside the try so any raise leaves state clean + self._active_session_option_entries = {**saved_sess_entries, **extra_sess} + self._provider_options = {**saved_prov, **extra_prov} + if needs_ep_pin: + self._ep = mon.ep_name + self._perf_stats = stats + mon.__enter__() + mon_entered = True + yield PerfContext(stats=stats, monitor=mon) finally: self._perf_stats = None + exc_info = sys.exc_info() + try: + if mon.requires_session_teardown and mon_entered: + # Only tear down session if monitor actually entered (data to flush) + 
self.reset() + # Windows: release file handles before monitor parses artifacts + gc.collect() + finally: + try: + if mon_entered: + # return value intentionally ignored — exceptions always propagate + mon.__exit__(*exc_info) + finally: + self._active_session_option_entries = saved_sess_entries + self._provider_options = saved_prov + self._ep = saved_ep @property def io_config(self) -> dict: diff --git a/src/winml/modelkit/utils/cli.py b/src/winml/modelkit/utils/cli.py index 07c1b7eb6..d06459c6f 100644 --- a/src/winml/modelkit/utils/cli.py +++ b/src/winml/modelkit/utils/cli.py @@ -22,6 +22,7 @@ def model_option(required=True): """ return click.option( "--model", + "-m", required=required, type=click.Path(exists=True, path_type=Path), help="Path to ONNX model file to analyze", @@ -78,7 +79,7 @@ def device_option(required=True, optional_message=None, default="NPU"): "--device", required=required, default=default if not required else None, - type=click.Choice(SUPPORTED_DEVICES, case_sensitive=False), + type=click.Choice(SUPPORTED_DEVICES, case_sensitive=True), help=help_text, ) @@ -86,8 +87,11 @@ def device_option(required=True, optional_message=None, default="NPU"): def verbosity_options(f): """Add verbose and quiet logging options to a Click command. - Adds --verbose/-v and --quiet/-q flags that control logging verbosity. - These options are automatically passed to the decorated function. + Adds --verbose/-v (stackable: -v, -vv, -vvv) and --quiet/-q flags. + The decorated function receives ``verbose`` (int, count of -v flags) + and ``quiet`` (bool). + + See :mod:`winml.modelkit.utils.logging` for the verbosity convention. 
Args: f: Click command function to decorate @@ -105,8 +109,7 @@ def verbosity_options(f): f = click.option( "--verbose", "-v", - is_flag=True, - default=False, - help="Enable verbose logging to stderr", + count=True, + help="Increase verbosity (-v=INFO, -vv=DEBUG)", )(f) return f # noqa: RET504 diff --git a/src/winml/modelkit/utils/console.py b/src/winml/modelkit/utils/console.py new file mode 100644 index 000000000..2a34826dc --- /dev/null +++ b/src/winml/modelkit/utils/console.py @@ -0,0 +1,563 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Shared console output utilities for winml CLI commands. + +Provides consistent Rich-based formatting for: +- Config command: headers, I/O specs, resolution summary +- Build command: cascading StageLive, setup/stages sections, graph summary + +All output goes to stderr via Console(stderr=True) so stdout stays clean +for machine-readable output (JSON configs, build manifests). 
+""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from rich.console import Console, Group, RenderableType +from rich.live import Live +from rich.text import Text + + +if TYPE_CHECKING: + from ..export.config import WinMLExportConfig + +logger = logging.getLogger(__name__) + +HEAVY_SEP = "\u2550" * 60 # ═ +LIGHT_SEP = "\u2500" * 60 # ─ +MAX_BAR_WIDTH = 36 + +# Stage status icons +ICON_RUNNING = "\u23f3" # ⏳ +ICON_DONE = "\u2705" # ✅ +ICON_SKIP = "\u23f8\ufe0f " # ⏸️ +ICON_ERROR = "\u274c" # ❌ + + +def get_console() -> Console: + """Return a Console that prints to stderr.""" + return Console(stderr=True) + + +# ══════════════════════════════════════════════════════════════════════════ +# SHARED FORMATTING +# ══════════════════════════════════════════════════════════════════════════ + + +def print_command_header( + console: Console, + title: str, + subtitle: str | None = None, +) -> None: + """Print a command header block (═══ separators).""" + console.print() + console.print(HEAVY_SEP) + label = f"[bold]{title}[/bold]" + if subtitle: + label += f" [dim]({subtitle})[/dim]" + console.print(label) + console.print(HEAVY_SEP) + + +def print_kv( + console: Console, + label: str, + value: str, + *, + note: str | None = None, + icon: str = "", +) -> None: + """Print a key-value line with optional note.""" + line = f" {icon} [bold]{label:<14}[/bold] [cyan]{value}[/cyan]" + if note: + line += f" [dim]({note})[/dim]" + console.print(line) + + +def print_success(console: Console, message: str) -> None: + """Print a green success line with check icon.""" + console.print(f" [green]{ICON_DONE} {message}[/green]") + + +def print_error( + console: Console, + message: str, + hint: str | None = None, +) -> None: + """Print a red error line with optional hint.""" + console.print(f" [red]{ICON_ERROR} {message}[/red]") + if hint: + console.print(f" [dim]\U0001f4a1 {hint}[/dim]") + + +# 
══════════════════════════════════════════════════════════════════════════ +# CONFIG COMMAND HELPERS +# ══════════════════════════════════════════════════════════════════════════ + + +def print_io_specs_detail( + console: Console, + export_config: WinMLExportConfig, +) -> None: + """Print resolved I/O specs — always full detail, aligned columns.""" + inputs = export_config.input_tensors or [] + outputs = export_config.output_tensors or [] + + for i, t in enumerate(inputs): + name = t.name or "(unnamed)" + shape_str = str(list(t.shape)) if getattr(t, "shape", None) else "dynamic" + dtype_str = getattr(t, "dtype", None) or "?" + label = "Input: " if i == 0 else " " + console.print(f" {label}[cyan]{name:<18}[/cyan] {shape_str:<14} [dim]{dtype_str}[/dim]") + for i, t in enumerate(outputs): + name = t.name or "(unnamed)" + # Fix #3: OutputTensorSpec only has name — show name only + label = "Output: " if i == 0 else " " + console.print(f" {label}[cyan]{name}[/cyan]") + + +def print_io_specs_na(console: Console, reason: str = "") -> None: + """Print I/O specs not-available line (e.g., ONNX mode).""" + msg = reason or "inferred from ONNX graph at build time" + console.print(f" \U0001f4d0 [bold]I/O specs:[/bold] [dim]N/A \u2014 {msg}[/dim]") + + +# ══════════════════════════════════════════════════════════════════════════ +# BUILD COMMAND — SETUP / STAGES SECTIONS +# ══════════════════════════════════════════════════════════════════════════ + + +def print_setup( + console: Console, + *, + model: str, + config: str, + output: str, + source: str = "HuggingFace", +) -> None: + """Print the 🔧 Setup section header.""" + console.print() + console.print(HEAVY_SEP) + console.print(f"[bold]\U0001f527 Setup \u2014 {source}[/bold]") + console.print(HEAVY_SEP) + console.print(f" \U0001f4e6 [bold]{'Model:':<10}[/bold] [cyan]{model}[/cyan]") + console.print(f" \U0001f4c1 [bold]{'Config:':<10}[/bold] [cyan]{config}[/cyan]") + console.print(f" \U0001f4c2 [bold]{'Output:':<10}[/bold] 
[cyan]{output}[/cyan]") + console.print() + + +def print_stages_header(console: Console) -> None: + """Print the 🎯 Stages section header.""" + console.print(HEAVY_SEP) + console.print("[bold]\U0001f3af Stages[/bold]") + console.print(HEAVY_SEP) + + +def print_final( + console: Console, + elapsed: float, + artifact: str, + stage_timings: list[tuple[str, float | None]] | None = None, +) -> None: + """Print the 📊 Summary section with stage timing breakdown. + + Args: + stage_timings: list of (stage_name, elapsed_seconds | None for skipped) + """ + console.print() + console.print(HEAVY_SEP) + console.print("[bold]\U0001f4ca Summary[/bold]") + console.print(HEAVY_SEP) + console.print(f"{ICON_DONE} [bold green]Build complete in {elapsed:.1f}s[/bold green]") + if stage_timings: + for name, t in stage_timings: + if t is not None: + console.print(f" {name:<12} [green]{t:.1f}s[/green]") + else: + console.print(f" {name:<12} [dim]skipped[/dim]") + console.print(f"\U0001f4e6 Final artifact: [bold]{artifact}[/bold]") + console.print() + + +def print_stage_skip( + console: Console, + name: str, + reason: str = "", +) -> None: + """Print a skipped stage as static text (no Live needed).""" + line = Text() + line.append(f"{ICON_SKIP} ") + line.append(name.capitalize(), style="dim") + if reason: + line.append(f" {reason}", style="dim italic") + console.print(line) + console.print() + + +def detect_model_source(model_id: str | None) -> str: + """Detect model source for Setup header.""" + if model_id is None: + return "HuggingFace" + p = Path(model_id) + if p.suffix == ".onnx": + return "ONNX" + if p.is_dir(): + return "Local" + return "HuggingFace" + + +def fmt_size(size_bytes: int | float) -> str: + """Format file size from bytes to human-readable string.""" + mb = size_bytes / (1024 * 1024) + if mb >= 1000: + return f"{mb / 1000:.1f} GB" + return f"{mb:.1f} MB" + + +def get_onnx_total_size(onnx_path: Path) -> int: + """Get total ONNX model size including external data files. 
+ + When ONNX models use external data storage, the main .onnx file + is just metadata (~1-2MB) while weights live in separate .data files. + This function sums all related files. + """ + total = onnx_path.stat().st_size + try: + from onnx import external_data_helper as edh + + from ..onnx import load_onnx + + model = load_onnx(onnx_path, load_weights=False, validate=False) + seen: set[str] = set() + for init in model.graph.initializer: + if edh.uses_external_data(init): + ext_info = edh.ExternalDataInfo(init) + if ext_info.location and ext_info.location not in seen: + seen.add(ext_info.location) + ext_path = onnx_path.parent / ext_info.location + if ext_path.exists(): + total += ext_path.stat().st_size + except Exception: + logger.debug( + "Could not read external data for %s; reporting main file size only", + onnx_path, + exc_info=True, + ) + return total + + +# ══════════════════════════════════════════════════════════════════════════ +# BUILD COMMAND — STAGE LIVE (cascading Live per stage) +# ══════════════════════════════════════════════════════════════════════════ + + +class StageLive: + """Live region for a single build stage. + + Each stage gets its own Rich Live context. When the stage completes, + Live stops and the final frame persists as static text (transient=False). + The next stage starts a new Live below. + + Usage:: + + with StageLive("export", console) as sl: + sl.kv("Task:", "fill-mask [dim](auto-detected)[/dim]") + sl.io_input("input_ids", "[1, 128]", "int64") + # ... blocking work ... 
+ sl.set_done(12.3) + sl.artifact("output/export.onnx", 438_200_000) + """ + + def __init__(self, name: str, console: Console) -> None: + self._name = name + self._console = console + self._lines: list[RenderableType] = [] + self._live: Live | None = None + self._status_idx: int = 0 + + def __enter__(self) -> StageLive: + self._lines = [self._make_running_line()] + self._status_idx = 0 + self._live = Live( + self._render(), + console=self._console, + refresh_per_second=15, + transient=False, + ) + self._live.start() + return self + + def __exit__(self, *_: object) -> None: + if self._live: + self._live.update(self._render()) + self._live.stop() + self._live = None + + def _render(self) -> Group: + return Group(*self._lines) + + def _update(self) -> None: + if self._live: + self._live.update(self._render()) + + # ── Status line management ──────────────────────────────────── + + def _make_running_line(self, detail: str = "") -> Text: + line = Text() + line.append(f"{ICON_RUNNING} ") + line.append(self._name.capitalize(), style="bold yellow") + if detail: + line.append(f" {detail}", style="dim") + return line + + def set_status(self, detail: str) -> None: + """Update the running status text.""" + self._lines[self._status_idx] = self._make_running_line(detail) + self._update() + + def set_done(self, elapsed: float) -> None: + """Mark stage as done.""" + line = Text() + line.append(f"{ICON_DONE} ") + line.append(f"{self._name.capitalize():<48}", style="green") + line.append(f"{elapsed:.1f}s", style="green") + self._lines[self._status_idx] = line + self._update() + + def set_error(self, error: str = "") -> None: + """Mark stage as failed.""" + line = Text() + line.append(f"{ICON_ERROR} ") + line.append(self._name.capitalize(), style="bold red") + if error: + line.append(f" {error}", style="red") + self._lines[self._status_idx] = line + self._update() + + # ── Detail lines (indented under stage) ─────────────────────── + + def detail(self, markup: str) -> None: + """Add 
a Rich markup detail line.""" + self._lines.append(Text.from_markup(f" {markup}")) + self._update() + + def kv(self, label: str, value: str) -> None: + """Add a key-value detail line with aligned columns.""" + self._lines.append(Text.from_markup(f" {label:<14}{value}")) + self._update() + + def artifact(self, path: str, size_bytes: int | float) -> None: + """Add artifact line (always last in stage).""" + label = "\U0001f4e6 Artifact:" + self._lines.append( + Text.from_markup(f" {label:<14}[dim]{path}[/dim] ({fmt_size(size_bytes)})") + ) + self._update() + + def blank(self) -> None: + """Add a blank line.""" + self._lines.append(Text("")) + self._update() + + # ── I/O lines (aligned columns) ────────────────────────────── + + def io_input( + self, + name: str, + shape: str, + dtype: str, + *, + first: bool = True, + ) -> None: + """Add an input tensor line.""" + label = "Input: " if first else " " + self._lines.append( + Text.from_markup(f" {label}[cyan]{name:<18}[/cyan] {shape:<14} [dim]{dtype}[/dim]") + ) + self._update() + + def io_output( + self, + name: str, + shape: str, + dtype: str, + *, + first: bool = True, + ) -> None: + """Add an output tensor line.""" + label = "Output: " if first else " " + self._lines.append( + Text.from_markup(f" {label}[cyan]{name:<18}[/cyan] {shape:<14} [dim]{dtype}[/dim]") + ) + self._update() + + # ── EP analyzer bar lines (for optimize stage) ──────────────── + + def ep_bar_add(self, ep_name: str, total: int = 0) -> int: + """Add a placeholder EP bar line, return index.""" + idx = len(self._lines) + line = Text() + line.append(" - ") + line.append(f"{ep_name:<28}", style="dim") + if total: + line.append("\u2591" * MAX_BAR_WIDTH, style="dim") + self._lines.append(line) + self._update() + return idx + + def ep_bar_update( + self, + idx: int, + ep_name: str, + s: int, + p: int, + u: int, + total: int = 0, + ) -> None: + """Update an EP bar line by index with progress.""" + line = Text() + line.append(" - ") + 
line.append(f"{ep_name:<28}", style="cyan") + line.append_text(_spu_text(s, p, u)) + line.append(" ") + # Scale bar proportional to total (not analyzed count) + analyzed = s + p + u + anchor = max(total, analyzed, 1) + line.append_text(_build_bar_scaled(s, p, u, anchor)) + remaining = total - analyzed if total else 0 + if remaining > 0: + rem_w = max( + 1, + round(remaining / anchor * MAX_BAR_WIDTH), + ) + line.append("\u2591" * rem_w, style="dim") + self._lines[idx] = line + self._update() + + +# ══════════════════════════════════════════════════════════════════════════ +# EP ANALYZER BAR HELPERS +# ══════════════════════════════════════════════════════════════════════════ + + +def _build_bar(s: int, p: int, u: int) -> Text: + """Build a compact stacked bar for S/P/U counts.""" + total = s + p + u + if total == 0: + return Text() + return _build_bar_scaled(s, p, u, total) + + +def _build_bar_scaled(s: int, p: int, u: int, anchor: int) -> Text: + """Build a stacked bar scaled to an anchor total.""" + if anchor == 0: + return Text() + bar = Text() + s_w = max(1, round(s / anchor * MAX_BAR_WIDTH)) if s else 0 + p_w = max(1, round(p / anchor * MAX_BAR_WIDTH)) if p else 0 + u_w = max(1, round(u / anchor * MAX_BAR_WIDTH)) if u else 0 + # Clamp total to MAX_BAR_WIDTH + used = s_w + p_w + u_w + if used > MAX_BAR_WIDTH: + overflow = used - MAX_BAR_WIDTH + # Shrink from the largest segment + if s_w >= p_w and s_w >= u_w: + s_w = max(1, s_w - overflow) + elif p_w >= u_w: + p_w = max(1, p_w - overflow) + else: + u_w = max(1, u_w - overflow) + bar.append("\u2588" * s_w, style="green") + if p_w: + bar.append("\u2588" * p_w, style="yellow") + if u_w: + bar.append("\u2588" * u_w, style="red") + return bar + + +def _spu_text(s: int, p: int, u: int) -> Text: + """Build 'S/P/U' colored count text.""" + t = Text() + t.append(str(s), style="bold green") + t.append("/", style="dim") + t.append(str(p), style="bold yellow" if p > 0 else "dim") + t.append("/", style="dim") + 
t.append(str(u), style="bold red" if u > 0 else "dim") + return t + + +# ══════════════════════════════════════════════════════════════════════════ +# ONNX GRAPH SUMMARY (for compile stage) +# ══════════════════════════════════════════════════════════════════════════ + + +def get_onnx_graph_summary(model_path: Path | str) -> dict[str, Any]: + """Extract graph summary from ONNX model without loading weights. + + Returns dict with: + op_counts: dict[str, int] — node count per op_type (excl QDQ) + inputs: list[dict] — [{name, shape, dtype}, ...] + outputs: list[dict] — [{name, shape, dtype}, ...] + num_initializers: int + total_nodes: int + """ + from onnx import TensorProto + + from ..onnx import load_onnx + + _dtype_map = { + TensorProto.FLOAT: "float32", + TensorProto.FLOAT16: "float16", + TensorProto.INT32: "int32", + TensorProto.INT64: "int64", + TensorProto.INT8: "int8", + TensorProto.UINT8: "uint8", + TensorProto.BOOL: "bool", + TensorProto.STRING: "string", + } + + model = load_onnx(model_path, load_weights=False, validate=False) + graph = model.graph + + # Op counts (exclude QDQ nodes from display) + qdq_ops = {"QuantizeLinear", "DequantizeLinear"} + op_counts: dict[str, int] = {} + for node in graph.node: + if node.op_type not in qdq_ops: + op_counts[node.op_type] = op_counts.get(node.op_type, 0) + 1 + + # Sort by count descending + op_counts = dict(sorted(op_counts.items(), key=lambda x: x[1], reverse=True)) + + # Inputs (exclude initializer names — they appear in graph.input too) + init_names = {init.name for init in graph.initializer} + + def _parse_io(value_info: Any) -> dict: + name = value_info.name + tt = value_info.type.tensor_type + dtype = _dtype_map.get(tt.elem_type, f"type({tt.elem_type})") + dims = [] + if tt.HasField("shape"): + for d in tt.shape.dim: + if d.dim_param: + dims.append(d.dim_param) + else: + dims.append(d.dim_value) + return {"name": name, "shape": dims, "dtype": dtype} + + inputs = [_parse_io(inp) for inp in graph.input if 
inp.name not in init_names] + outputs = [_parse_io(out) for out in graph.output] + + return { + "op_counts": op_counts, + "inputs": inputs, + "outputs": outputs, + "num_initializers": len(graph.initializer), + "total_nodes": len(graph.node), + } diff --git a/src/winml/modelkit/utils/logging.py b/src/winml/modelkit/utils/logging.py index 3cd16ab23..94a0b3ba9 100644 --- a/src/winml/modelkit/utils/logging.py +++ b/src/winml/modelkit/utils/logging.py @@ -2,27 +2,52 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -"""Logging utilities for ModelKit.""" +"""Logging utilities for ModelKit. + +Verbosity Convention (adopted from pip, ansible, pytest): +========================================================= + + Flag Level Value Use case + ---- ----- ----- -------- + -q ERROR 40 Errors only (quiet / scripting) + (default) WARNING 30 Warnings + errors (production default) + -v INFO 20 Operational progress messages + -vv DEBUG 10 Developer-level tracing + --debug DEBUG 10 Alias for -vv (backward compat) + + Formula: level = WARNING - (verbosity * 10) -> 30, 20, 10 + Quiet: level = ERROR (40) + +All log output goes to stderr so stdout stays clean for structured data +(JSON, compact output, piped commands). +""" import logging import sys -def configure_logging(verbose: bool = False, quiet: bool = False) -> None: - """Configure logging level based on verbosity flags. +def configure_logging( + verbosity: int = 0, + quiet: bool = False, + *, + # Backward-compat: accept old bool signature + verbose: bool = False, +) -> None: + """Configure root logger based on verbosity level. Args: - verbose: Enable verbose logging (DEBUG level) - quiet: Enable quiet mode (ERROR level only) - - Default level is INFO when both flags are False. + verbosity: Number of ``-v`` flags (0=WARNING, 1=INFO, 2+=DEBUG). + quiet: If True, override to ERROR level regardless of verbosity. 
+ verbose: **Deprecated bool compat** — treated as verbosity=1 when + True and verbosity is 0. Existing callers that pass + ``verbose=True`` keep working without changes. """ - if quiet: - log_level = logging.ERROR - elif verbose: - log_level = logging.DEBUG - else: - log_level = logging.INFO + # Backward compat: bool verbose → int, also handles count passthrough + if verbose and verbosity == 0: + verbosity = int(verbose) + + # Clamp between DEBUG (10) and WARNING (30); quiet overrides to ERROR + log_level = logging.ERROR if quiet else max(logging.DEBUG, logging.WARNING - verbosity * 10) logging.basicConfig( level=log_level, diff --git a/tests/_helpers.py b/tests/_helpers.py new file mode 100644 index 000000000..1f32d39ca --- /dev/null +++ b/tests/_helpers.py @@ -0,0 +1,28 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Shared test helpers for session-related tests.""" + +from __future__ import annotations + +from pathlib import Path + + +def get_minimal_onnx_model_path() -> Path: + """Return path to a tiny Identity ONNX model used by session tests.""" + import onnx + from onnx import TensorProto, helper + + fixture_dir = Path(__file__).parent / "_fixtures" + fixture_dir.mkdir(exist_ok=True) + fixture = fixture_dir / "identity.onnx" + if not fixture.exists(): + inp = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 4]) + out = helper.make_tensor_value_info("output", TensorProto.FLOAT, [1, 4]) + node = helper.make_node("Identity", ["input"], ["output"]) + graph = helper.make_graph([node], "identity", [inp], [out]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)]) + model.ir_version = 8 + onnx.save(model, fixture) + return fixture diff --git a/tests/cli/test_import_time.py b/tests/cli/test_import_time.py new file mode 
100644 index 000000000..4f7c681fa --- /dev/null +++ b/tests/cli/test_import_time.py @@ -0,0 +1,466 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Regression tests for lazy loading and import-time tracking. + +These tests ensure that importing ModelKit modules and running CLI commands +do not pull in heavy ML dependencies (torch, transformers, optimum, etc.) +unless the functionality actually requires them. + +Every test runs in a fresh subprocess so sys.modules starts clean. + +Test Categories: + (A) Per-module isolation: verify each winml.modelkit.* package's import budget + (B) Per-command: verify each CLI command's import budget (--help and --model) +""" + +from __future__ import annotations + +import subprocess +import sys +import textwrap + +import pytest + + +# --------------------------------------------------------------------------- +# Discovery — dynamic lists from the actual codebase +# --------------------------------------------------------------------------- + + +# Discover commands by scanning the commands/ directory (same logic as cli.py) +def _discover_command_names() -> list[str]: + from pathlib import Path + + # Walk up until we find the repo root (marked by pyproject.toml). + # Resilient to this file's depth within tests/. 
+ root = next(p for p in Path(__file__).resolve().parents if (p / "pyproject.toml").exists()) + commands_dir = root / "src" / "winml" / "modelkit" / "commands" + return sorted(f.stem for f in commands_dir.glob("*.py") if not f.name.startswith("_")) + + +_CLI_COMMANDS = _discover_command_names() + +HEAVY_PREFIXES = ("torch", "transformers", "optimum", "diffusers", "sklearn") + + +def _run_in_subprocess(code: str) -> subprocess.CompletedProcess[str]: + """Run Python code in a fresh subprocess via a temp script approach.""" + return subprocess.run( # noqa: S603 + [sys.executable, "-c", textwrap.dedent(code)], + capture_output=True, + text=True, + timeout=120, + ) + + +def assert_no_heavy_imports( + setup_code: str, + *, + forbidden: tuple[str, ...] = HEAVY_PREFIXES, + allowed: tuple[str, ...] = (), +) -> None: + """Assert that running setup_code loads no forbidden modules. + + Args: + setup_code: Python code to execute (will be dedented). + forbidden: Module prefixes that must NOT appear in sys.modules. + allowed: Module prefixes to exclude from the forbidden check. + """ + script = textwrap.dedent(f"""\ + import sys + {setup_code} + loaded = sorted(set( + m.split('.')[0] for m in sys.modules + if m.startswith({forbidden!r}) + )) + allowed = set({allowed!r}) + bad = [m for m in loaded if m not in allowed] + if bad: + print(f"FAIL: unexpected heavy modules: {{bad}}", file=sys.stderr) + print(f" allowed: {{allowed}}", file=sys.stderr) + sys.exit(1) + """) + result = subprocess.run( # noqa: S603 + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=120, + ) + assert result.returncode == 0, f"Import budget violated.\nstderr: {result.stderr.strip()}" + + +def assert_cli_no_heavy_imports( + cli_args: list[str], + *, + allowed: tuple[str, ...] = (), +) -> None: + """Assert that invoking ``main(cli_args)`` loads no forbidden modules. + + Uses try/except to catch SystemExit and Click errors gracefully. 
+ """ + args_str = repr(cli_args) + script = textwrap.dedent(f"""\ + import sys + from winml.modelkit.cli import main + import click + try: + main({args_str}, standalone_mode=False) + except (SystemExit, click.exceptions.UsageError, Exception): + pass + loaded = sorted(set( + m.split('.')[0] for m in sys.modules + if m.startswith({HEAVY_PREFIXES!r}) + )) + allowed = set({allowed!r}) + bad = [m for m in loaded if m not in allowed] + if bad: + print(f"FAIL: unexpected heavy modules: {{bad}}", file=sys.stderr) + print(f" allowed: {{allowed}}", file=sys.stderr) + sys.exit(1) + """) + result = subprocess.run( # noqa: S603 + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=120, + ) + assert result.returncode == 0, ( + f"Import budget violated for args {cli_args}.\nstderr: {result.stderr.strip()}" + ) + + +# =========================================================================== +# (A) Per-Module Isolation Tests +# =========================================================================== + + +class TestModuleIsolation: + """Verify each winml.modelkit.* module's import budget.""" + + @pytest.mark.parametrize( + "module", + [ + "winml.modelkit", + "winml.modelkit.cli", + "winml.modelkit.cache", + "winml.modelkit.compiler", + "winml.modelkit.config", + "winml.modelkit.core", + "winml.modelkit.export", + "winml.modelkit.loader", + "winml.modelkit.onnx", + "winml.modelkit.optim", + "winml.modelkit.quant", + "winml.modelkit.session", + "winml.modelkit.analyze", + "winml.modelkit.pattern", + "winml.modelkit.sysinfo", + "winml.modelkit.utils", + ], + ) + def test_module_no_heavy_deps(self, module: str) -> None: + """Importing this module must not load torch/transformers/optimum.""" + assert_no_heavy_imports(f"import {module}") + + @pytest.mark.parametrize( + ("module", "allowed"), + [ + ("winml.modelkit.build", ("torch", "torchgen")), + ("winml.modelkit.data", ("torch", "torchgen", "torchvision")), + ( + "winml.modelkit.datasets", + ("torch", 
"torchgen", "torchvision", "transformers", "sklearn"), + ), + ( + "winml.modelkit.eval", + ("torch", "torchgen", "torchvision", "transformers", "sklearn"), + ), + ("winml.modelkit.inspect", (*HEAVY_PREFIXES, "torchgen", "torchvision")), + ("winml.modelkit.models", (*HEAVY_PREFIXES, "torchgen", "torchvision")), + ], + ) + def test_module_with_expected_deps(self, module: str, allowed: tuple[str, ...]) -> None: + """Modules that legitimately need heavy deps — verify nothing extra.""" + assert_no_heavy_imports(f"import {module}", allowed=allowed) + + def test_lazy_access_triggers_import(self) -> None: + """Accessing WinMLAutoModel must trigger the full import chain.""" + script = textwrap.dedent("""\ + import sys + from winml.modelkit import WinMLAutoModel + assert 'torch' in sys.modules, ( + 'torch should be loaded after accessing WinMLAutoModel' + ) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"Lazy access did not trigger torch.\nstderr: {result.stderr}" + ) + + # -- Gap 2: lazy-trigger tests for subpackage __getattr__ implementations -- + + def test_lazy_core_get_io_config(self) -> None: + """core.get_io_config must be lazily accessible and callable.""" + script = textwrap.dedent("""\ + import winml.modelkit.core + obj = winml.modelkit.core.get_io_config + assert obj is not None + assert callable(obj) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"core.get_io_config not lazily accessible.\nstderr: {result.stderr}" + ) + + def test_lazy_export_resolve_io_specs(self) -> None: + """export.resolve_io_specs must be lazily accessible and callable.""" + script = textwrap.dedent("""\ + import winml.modelkit.export + obj = winml.modelkit.export.resolve_io_specs + assert obj is not None + assert callable(obj) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"export.resolve_io_specs not lazily accessible.\nstderr: {result.stderr}" + ) + + def 
test_lazy_loader_load_hf_model(self) -> None: + """loader.load_hf_model must be lazily accessible and callable.""" + script = textwrap.dedent("""\ + import winml.modelkit.loader + obj = winml.modelkit.loader.load_hf_model + assert obj is not None + assert callable(obj) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"loader.load_hf_model not lazily accessible.\nstderr: {result.stderr}" + ) + + def test_lazy_quant_quantize_onnx(self) -> None: + """quant.quantize_onnx must be lazily accessible and callable.""" + script = textwrap.dedent("""\ + import winml.modelkit.quant + obj = winml.modelkit.quant.quantize_onnx + assert obj is not None + assert callable(obj) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"quant.quantize_onnx not lazily accessible.\nstderr: {result.stderr}" + ) + + # -- Gap 3: AttributeError negative test -- + + def test_nonexistent_attr_raises(self) -> None: + """Importing a nonexistent attribute must raise ImportError.""" + script = textwrap.dedent("""\ + try: + from winml.modelkit import nonexistent_xyz_12345 + except ImportError: + pass # expected + else: + raise AssertionError( + "Expected ImportError for nonexistent attribute" + ) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"Nonexistent attr did not raise ImportError.\nstderr: {result.stderr}" + ) + + # -- Gap 4: __dir__ correctness test -- + + def test_dir_includes_lazy_attrs(self) -> None: + """dir(winml.modelkit) must include lazy attrs without loading torch.""" + script = textwrap.dedent("""\ + import sys + import winml.modelkit + assert "WinMLAutoModel" in dir(winml.modelkit), ( + "WinMLAutoModel missing from dir()" + ) + loaded = sorted(set( + m.split('.')[0] for m in sys.modules + if m.startswith(('torch', 'transformers', 'optimum', 'diffusers', 'sklearn')) + )) + if loaded: + print(f"FAIL: dir() triggered heavy imports: {loaded}", file=sys.stderr) + sys.exit(1) + """) + result 
= _run_in_subprocess(script) + assert result.returncode == 0, f"dir() test failed.\nstderr: {result.stderr}" + + +# =========================================================================== +# (C) _LAZY_IMPORTS Dict Consistency Tests +# =========================================================================== + +_LAZY_MODULES = [ + "winml.modelkit", + "winml.modelkit.core", + "winml.modelkit.export", + "winml.modelkit.loader", + "winml.modelkit.quant", + "winml.modelkit.models", + "winml.modelkit.onnx", +] + + +class TestLazyImportsDict: + """Verify the standardized _LAZY_IMPORTS pattern across all modules.""" + + @pytest.mark.parametrize("module", _LAZY_MODULES) + def test_lazy_imports_dict_exists(self, module: str) -> None: + """Each module must define a non-empty _LAZY_IMPORTS dict.""" + script = textwrap.dedent(f"""\ + import {module} as mod + lazy = getattr(mod, '_LAZY_IMPORTS', None) + assert lazy is not None, '_LAZY_IMPORTS not found on {module}' + assert isinstance(lazy, dict), ( + f'_LAZY_IMPORTS is {{type(lazy).__name__}}, expected dict' + ) + assert len(lazy) > 0, '_LAZY_IMPORTS is empty' + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"_LAZY_IMPORTS check failed for {module}.\nstderr: {result.stderr.strip()}" + ) + + @pytest.mark.parametrize("module", _LAZY_MODULES) + def test_lazy_imports_all_consistent(self, module: str) -> None: + """Every key in _LAZY_IMPORTS must also appear in __all__.""" + script = textwrap.dedent(f"""\ + import {module} as mod + lazy = set(mod._LAZY_IMPORTS.keys()) + all_ = set(mod.__all__) + missing = lazy - all_ + assert not missing, f'In _LAZY_IMPORTS but not __all__: {{missing}}' + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"_LAZY_IMPORTS/__all__ drift in {module}.\nstderr: {result.stderr.strip()}" + ) + + @pytest.mark.parametrize("module", _LAZY_MODULES) + def test_lazy_imports_all_resolvable(self, module: str) -> None: + """Every _LAZY_IMPORTS 
entry must resolve to a real attribute. + + Convention: ``_LAZY_IMPORTS`` maps a lazy attribute name to a + ``(submodule_path, real_attr_name)`` tuple, where ``submodule_path`` + is relative (e.g. ``".config"``) resolved against the host package. + """ + script = textwrap.dedent(f"""\ + import importlib + import {module} as mod + errors = [] + for lazy_name, (submodule_path, real_attr) in mod._LAZY_IMPORTS.items(): + try: + sub = importlib.import_module(submodule_path, package={module!r}) + if not hasattr(sub, real_attr): + errors.append( + f'{{lazy_name}}: {{submodule_path}}.{{real_attr}} not found' + ) + except ImportError as exc: + errors.append(f'{{lazy_name}}: cannot import {{submodule_path}} ({{exc}})') + if errors: + raise AssertionError( + f'Unresolvable _LAZY_IMPORTS in {module}:\\n' + '\\n'.join(errors) + ) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"Unresolvable _LAZY_IMPORTS in {module}.\nstderr: {result.stderr.strip()}" + ) + + +# =========================================================================== +# (B) Per-Command Tests -- --help (no heavy imports at command load time) +# =========================================================================== + + +class TestCommandHelp: + """Verify ``winml`` and ``winml <cmd> --help`` do not load heavy deps.""" + + def test_winml_bare(self) -> None: + """Bare ``winml`` (no args) must not load heavy deps.""" + assert_cli_no_heavy_imports([]) + + def test_winml_help(self) -> None: + """``winml --help`` must not load heavy deps.""" + assert_cli_no_heavy_imports(["--help"]) + + @pytest.mark.parametrize("cmd", _CLI_COMMANDS) + def test_command_help_no_heavy_deps(self, cmd: str) -> None: + """``winml <cmd> --help`` must not load heavy deps.""" + assert_cli_no_heavy_imports([cmd, "--help"]) + + +# =========================================================================== +# (B) Per-Command Tests — with --model (actual command execution) +# 
=========================================================================== + +_FAKE_ONNX = "nonexistent_test_model.onnx" +_HF_MODEL = "microsoft/resnet-50" + + +class TestCommandWithModel: + """Verify import budgets when commands are invoked with --model. + + Commands that operate on ONNX files should NOT need torch/transformers. + Commands that operate on HF models legitimately need them. + + We use a fake model path so commands fail at file I/O, but the import + chain is already established by that point. + """ + + @pytest.mark.parametrize( + ("cmd_args", "allowed"), + [ + # ONNX-path commands — should NOT need torch/transformers + ( + ["compile", "--model", _FAKE_ONNX, "-o", "o.onnx", "--ep", "qnn"], + (), + ), + ( + ["quantize", "--model", _FAKE_ONNX, "-o", "o.onnx", "--ep", "qnn"], + (), + ), + ( + ["optimize", "--model", _FAKE_ONNX, "-o", "o.onnx"], + ("torch", "torchgen"), # ORT tools.__init__ pulls torch + ), + ( + ["perf", "--model", _FAKE_ONNX], + (), + ), + ( + ["static-analyzer", "check", "--model", _FAKE_ONNX, "--ep", "qnn"], + ("torch", "torchgen"), # ORT tools.__init__ pulls torch + ), + # HF model commands — legitimately need heavy deps + ( + ["inspect", "-m", _HF_MODEL], + (*HEAVY_PREFIXES, "torchgen", "torchvision"), + ), + ( + ["config", "-m", _HF_MODEL, "--device", "npu", "--precision", "int8"], + (*HEAVY_PREFIXES, "torchgen", "torchvision"), + ), + ], + ids=[ + "compile-onnx", + "quantize-onnx", + "optimize-onnx", + "perf-onnx", + "static-analyzer-onnx", + "inspect-hf", + "config-hf", + ], + ) + def test_command_import_budget(self, cmd_args: list[str], allowed: tuple[str, ...]) -> None: + """Verify each command's import budget with --model.""" + assert_cli_no_heavy_imports(cmd_args, allowed=allowed) diff --git a/tests/cli/test_main.py b/tests/cli/test_main.py new file mode 100644 index 000000000..538e5fb77 --- /dev/null +++ b/tests/cli/test_main.py @@ -0,0 +1,241 @@ +# ------------------------------------------------------------------------- 
+# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""CLI integration tests for winml command. + +Tests the CLI interface using Click's CliRunner to ensure commands work +correctly without executing actual model exports (which are slow). + +Test Categories: +1. Basic CLI functionality (version, help) +2. Command discovery +3. Export command argument validation +4. Sysinfo command output formats +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import MagicMock, patch + + +if TYPE_CHECKING: + from pathlib import Path + +import pytest +from click.testing import CliRunner + +from winml.modelkit.cli import main + + +@pytest.fixture +def runner() -> CliRunner: + """Create a CLI test runner.""" + return CliRunner() + + +class TestCLIBasics: + """Test basic CLI functionality.""" + + def test_version(self, runner: CliRunner) -> None: + """Test --version flag shows version info.""" + result = runner.invoke(main, ["--version"]) + assert result.exit_code == 0 + assert "winml" in result.output.lower() + + def test_help(self, runner: CliRunner) -> None: + """Test --help shows usage information.""" + result = runner.invoke(main, ["--help"]) + assert result.exit_code == 0 + assert "WML ModelKit" in result.output + assert "export" in result.output.lower() + + def test_debug_flag(self, runner: CliRunner) -> None: + """Test --debug flag is accepted.""" + result = runner.invoke(main, ["--debug", "--help"]) + assert result.exit_code == 0 + + +class TestCommandDiscovery: + """Test command auto-discovery from commands/ directory.""" + + def test_export_command_discovered(self, runner: CliRunner) -> None: + """Test export command is discovered and available.""" + result = runner.invoke(main, ["export", "--help"]) + assert result.exit_code == 0 + assert "model" in result.output.lower() + assert "output" in 
result.output.lower() + + def test_sys_command_discovered(self, runner: CliRunner) -> None: + """Test sys command is discovered and available.""" + result = runner.invoke(main, ["sys", "--help"]) + assert result.exit_code == 0 + assert "format" in result.output.lower() + + +class TestExportCommand: + """Test export command functionality.""" + + def test_export_requires_model(self, runner: CliRunner) -> None: + """Test export fails without --model argument.""" + result = runner.invoke(main, ["export", "--output", "test.onnx"]) + assert result.exit_code != 0 + assert "model" in result.output.lower() or "required" in result.output.lower() + + def test_export_requires_output(self, runner: CliRunner) -> None: + """Test export fails without --output argument.""" + result = runner.invoke(main, ["export", "--model", "test-model"]) + assert result.exit_code != 0 + assert "output" in result.output.lower() or "required" in result.output.lower() + + def test_export_help(self, runner: CliRunner) -> None: + """Test export --help shows all options.""" + result = runner.invoke(main, ["export", "--help"]) + assert result.exit_code == 0 + assert "--model" in result.output + assert "--output" in result.output + assert "--verbose" in result.output + + def test_export_short_flags(self, runner: CliRunner) -> None: + """Test export short flags are documented.""" + result = runner.invoke(main, ["export", "--help"]) + assert result.exit_code == 0 + assert "-m" in result.output + assert "-o" in result.output + assert "-v" in result.output + + @patch("winml.modelkit.loader.load_hf_model") + @patch("winml.modelkit.export.export_pytorch") + def test_export_calls_api( + self, + mock_export_onnx: MagicMock, + mock_load_hf_model: MagicMock, + runner: CliRunner, + tmp_path: Path, + ) -> None: + """Test export command delegates to export_onnx correctly.""" + # Setup mock model loader + mock_model = MagicMock() + mock_load_hf_model.return_value = (mock_model, None, "image-classification") + + # 
Setup mock export_onnx + output_path = tmp_path / "model.onnx" + mock_export_onnx.return_value = output_path + + runner.invoke( + main, + [ + "export", + "--model", + "test-model", + "--output", + str(output_path), + ], + ) + + # Verify export_onnx was called correctly + assert mock_export_onnx.called + call_kwargs = mock_export_onnx.call_args.kwargs + assert call_kwargs["model_id"] == "test-model" + assert call_kwargs["task"] == "image-classification" + + +class TestSysCommand: + """Test sys command functionality. + + Device and EP detection use WMI/PowerShell queries that are slow on CI, + so we mock _gather_device_info and _gather_ep_info to avoid timeouts. + """ + + @pytest.fixture(autouse=True) + def _mock_hw_detection(self): + """Mock slow hardware detection to prevent CI timeouts.""" + mock_devices = [{"priority": 1, "type": "CPU", "name": "Mock CPU", "details": {}}] + mock_eps = [{"name": "CPUExecutionProvider", "device": "CPU", "path": None}] + with ( + patch("winml.modelkit.commands.sys._gather_device_info", return_value=mock_devices), + patch("winml.modelkit.commands.sys._gather_ep_info", return_value=mock_eps), + ): + yield + + def test_sys_help(self, runner: CliRunner) -> None: + """Test sys --help shows options.""" + result = runner.invoke(main, ["sys", "--help"]) + assert result.exit_code == 0 + assert "--format" in result.output + assert "json" in result.output.lower() + assert "compact" in result.output.lower() + + def test_sys_default_format(self, runner: CliRunner) -> None: + """Test sys with default (text) format.""" + result = runner.invoke(main, ["sys"]) + assert result.exit_code == 0 + assert "Python" in result.output or "python" in result.output.lower() + + def test_sys_json_format(self, runner: CliRunner) -> None: + """Test sys with JSON format.""" + result = runner.invoke(main, ["sys", "--format", "json"]) + assert result.exit_code == 0 + # Should be valid JSON + import json + + data = json.loads(result.output) + assert "python" in data + 
assert "libraries" in data + + def test_sys_compact_format(self, runner: CliRunner) -> None: + """Test sys with compact format.""" + result = runner.invoke(main, ["sys", "--format", "compact"]) + assert result.exit_code == 0 + assert "Python" in result.output + + def test_sys_verbose(self, runner: CliRunner) -> None: + """Test sys with verbose flag.""" + result = runner.invoke(main, ["sys", "--verbose"]) + assert result.exit_code == 0 + + def test_sys_list_device_list_ep_json_is_valid_single_object(self, runner: CliRunner) -> None: + """--list-device --list-ep --format json must emit one valid JSON object, not two arrays.""" + import json + + result = runner.invoke(main, ["sys", "--list-device", "--list-ep", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert "devices" in data + assert "executionProviders" in data + assert isinstance(data["devices"], list) + assert isinstance(data["executionProviders"], list) + + def test_sys_list_device_compact(self, runner: CliRunner) -> None: + """--list-device --format compact must produce compact output, not text table.""" + result = runner.invoke(main, ["sys", "--list-device", "--format", "compact"]) + assert result.exit_code == 0 + assert "CPU" in result.output + # Compact output is a single line; no Rich panel borders + assert "Available Devices" not in result.output + + def test_sys_list_ep_compact(self, runner: CliRunner) -> None: + """--list-ep --format compact must produce compact output, not text table.""" + result = runner.invoke(main, ["sys", "--list-ep", "--format", "compact"]) + assert result.exit_code == 0 + assert "CPUExecutionProvider" in result.output + # Compact output is a single line; no Rich panel headers + assert "Available Execution Providers" not in result.output + + +class TestModuleExecution: + """Test python -m winml.modelkit execution.""" + + def test_module_imports(self) -> None: + """Test __main__ module can be imported.""" + from winml.modelkit import 
__main__ + + assert hasattr(__main__, "main") + + def test_cli_imports(self) -> None: + """Test cli module can be imported.""" + from winml.modelkit import cli + + assert hasattr(cli, "main") + assert callable(cli.main) diff --git a/tests/conftest.py b/tests/conftest.py index 61ed770f8..5be63d479 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -35,14 +35,25 @@ def _skip_winml_ep_init(request: pytest.FixtureRequest, monkeypatch: pytest.Monk """Mock WinML EP initialization for non-e2e tests.""" if "e2e" in {m.name for m in request.node.iter_markers()}: return - monkeypatch.setattr( - "winml.modelkit.session.session.WinMLSession._init_winml_eps_once", - classmethod(lambda cls: None), - ) - monkeypatch.setattr( - "winml.modelkit.analyze.core.runtime_checker_query.RuntimeCheckerQuery._is_ep_available_locally", - lambda self: False, - ) + try: + monkeypatch.setattr( + "winml.modelkit.session.ep_registry.ensure_initialized", + lambda: None, + ) + except ImportError as e: + import warnings + + warnings.warn(f"Could not mock ensure_initialized: {e}", stacklevel=2) + + try: + monkeypatch.setattr( + "winml.modelkit.analyze.core.runtime_checker_query.RuntimeCheckerQuery._is_ep_available_locally", + lambda self: False, + ) + except ImportError as e: + import warnings + + warnings.warn(f"Could not mock _is_ep_available_locally: {e}", stacklevel=2) # ============================================================================= diff --git a/tests/regression/test_design_gaps.py b/tests/regression/test_design_gaps.py index 5c336b93f..55a9f0412 100644 --- a/tests/regression/test_design_gaps.py +++ b/tests/regression/test_design_gaps.py @@ -65,19 +65,19 @@ def test_optimize_list_rewrites_is_ascii_safe(self): # =========================================================================== -# M-1: --list-tasks NOT in inspect --help +# M-1: --list-tasks IS in inspect --help (implemented in MVP v2 port) # =========================================================================== 
-class TestM1ListTasksAbsent: - """Document that --list-tasks is not implemented in inspect.""" +class TestM1ListTasksPresent: + """Verify that --list-tasks is implemented in inspect.""" - def test_list_tasks_not_in_help(self): - """inspect --help should NOT contain --list-tasks option.""" + def test_list_tasks_in_help(self): + """inspect --help should contain --list-tasks option.""" runner = CliRunner() result = runner.invoke(inspect, ["--help"], obj={}) assert result.exit_code == 0 - assert "--list-tasks" not in result.output + assert "--list-tasks" in result.output # =========================================================================== diff --git a/tests/test_import_time.py b/tests/test_import_time.py new file mode 100644 index 000000000..939275051 --- /dev/null +++ b/tests/test_import_time.py @@ -0,0 +1,460 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Regression tests for lazy loading and import-time tracking. + +These tests ensure that importing ModelKit modules and running CLI commands +do not pull in heavy ML dependencies (torch, transformers, optimum, etc.) +unless the functionality actually requires them. + +Every test runs in a fresh subprocess so sys.modules starts clean. 
+ +Test Categories: + (A) Per-module isolation: verify each winml.modelkit.* package's import budget + (B) Per-command: verify each CLI command's import budget (--help and --model) +""" + +from __future__ import annotations + +import subprocess +import sys +import textwrap + +import pytest + + +# --------------------------------------------------------------------------- +# Discovery — dynamic lists from the actual codebase +# --------------------------------------------------------------------------- + + +# Discover commands by scanning the commands/ directory (same logic as cli.py) +def _discover_command_names() -> list[str]: + from pathlib import Path + + root = Path(__file__).resolve().parent.parent + commands_dir = root / "src" / "winml" / "modelkit" / "commands" + return sorted(f.stem for f in commands_dir.glob("*.py") if not f.name.startswith("_")) + + +_CLI_COMMANDS = _discover_command_names() + +HEAVY_PREFIXES = ("torch", "transformers", "optimum", "diffusers", "sklearn") + + +def _run_in_subprocess(code: str) -> subprocess.CompletedProcess[str]: + """Run Python code in a fresh subprocess via ``python -c`` (code is dedented first).""" + return subprocess.run( # noqa: S603 + [sys.executable, "-c", textwrap.dedent(code)], + capture_output=True, + text=True, + timeout=120, + ) + + +def assert_no_heavy_imports( + setup_code: str, + *, + forbidden: tuple[str, ...] = HEAVY_PREFIXES, + allowed: tuple[str, ...] = (), +) -> None: + """Assert that running setup_code loads no forbidden modules. + + Args: + setup_code: Single-line Python code to execute (spliced into the script before dedent). + forbidden: Module prefixes that must NOT appear in sys.modules. + allowed: Top-level module names to exclude from the forbidden check.
+ """ + script = textwrap.dedent(f"""\ + import sys + {setup_code} + loaded = sorted(set( + m.split('.')[0] for m in sys.modules + if m.startswith({forbidden!r}) + )) + allowed = set({allowed!r}) + bad = [m for m in loaded if m not in allowed] + if bad: + print(f"FAIL: unexpected heavy modules: {{bad}}", file=sys.stderr) + print(f" allowed: {{allowed}}", file=sys.stderr) + sys.exit(1) + """) + result = subprocess.run( # noqa: S603 + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=120, + ) + assert result.returncode == 0, f"Import budget violated.\nstderr: {result.stderr.strip()}" + + +def assert_cli_no_heavy_imports( + cli_args: list[str], + *, + allowed: tuple[str, ...] = (), +) -> None: + """Assert that invoking ``main(cli_args)`` loads no forbidden modules. + + Uses try/except to catch SystemExit and Click errors gracefully. + """ + args_str = repr(cli_args) + script = textwrap.dedent(f"""\ + import sys + from winml.modelkit.cli import main + import click + try: + main({args_str}, standalone_mode=False) + except (SystemExit, click.exceptions.UsageError, Exception): + pass + loaded = sorted(set( + m.split('.')[0] for m in sys.modules + if m.startswith({HEAVY_PREFIXES!r}) + )) + allowed = set({allowed!r}) + bad = [m for m in loaded if m not in allowed] + if bad: + print(f"FAIL: unexpected heavy modules: {{bad}}", file=sys.stderr) + print(f" allowed: {{allowed}}", file=sys.stderr) + sys.exit(1) + """) + result = subprocess.run( # noqa: S603 + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=120, + ) + assert result.returncode == 0, ( + f"Import budget violated for args {cli_args}.\nstderr: {result.stderr.strip()}" + ) + + +# =========================================================================== +# (A) Per-Module Isolation Tests +# =========================================================================== + + +class TestModuleIsolation: + """Verify each winml.modelkit.* module's import budget.""" + + 
@pytest.mark.parametrize( + "module", + [ + "winml.modelkit", + "winml.modelkit.cli", + "winml.modelkit.cache", + "winml.modelkit.compiler", + "winml.modelkit.config", + "winml.modelkit.core", + "winml.modelkit.export", + "winml.modelkit.loader", + "winml.modelkit.onnx", + "winml.modelkit.optim", + "winml.modelkit.optracing", + "winml.modelkit.quant", + "winml.modelkit.session", + "winml.modelkit.analyze", + "winml.modelkit.pattern", + "winml.modelkit.sysinfo", + "winml.modelkit.utils", + ], + ) + def test_module_no_heavy_deps(self, module: str) -> None: + """Importing this module must not load torch/transformers/optimum.""" + assert_no_heavy_imports(f"import {module}") + + @pytest.mark.parametrize( + ("module", "allowed"), + [ + ("winml.modelkit.build", ("torch", "torchgen")), + ("winml.modelkit.data", ("torch", "torchgen", "torchvision")), + ( + "winml.modelkit.datasets", + ("torch", "torchgen", "torchvision", "transformers", "sklearn"), + ), + ( + "winml.modelkit.eval", + ("torch", "torchgen", "torchvision", "transformers", "sklearn"), + ), + ("winml.modelkit.inspect", (*HEAVY_PREFIXES, "torchgen", "torchvision")), + ("winml.modelkit.models", (*HEAVY_PREFIXES, "torchgen", "torchvision")), + ], + ) + def test_module_with_expected_deps(self, module: str, allowed: tuple[str, ...]) -> None: + """Modules that legitimately need heavy deps — verify nothing extra.""" + assert_no_heavy_imports(f"import {module}", allowed=allowed) + + def test_lazy_access_triggers_import(self) -> None: + """Accessing WinMLAutoModel must trigger the full import chain.""" + script = textwrap.dedent("""\ + import sys + from winml.modelkit import WinMLAutoModel + assert 'torch' in sys.modules, ( + 'torch should be loaded after accessing WinMLAutoModel' + ) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"Lazy access did not trigger torch.\nstderr: {result.stderr}" + ) + + # -- Gap 2: lazy-trigger tests for subpackage __getattr__ implementations -- + + def 
test_lazy_core_get_io_config(self) -> None: + """core.get_io_config must be lazily accessible and callable.""" + script = textwrap.dedent("""\ + import winml.modelkit.core + obj = winml.modelkit.core.get_io_config + assert obj is not None + assert callable(obj) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"core.get_io_config not lazily accessible.\nstderr: {result.stderr}" + ) + + def test_lazy_export_resolve_io_specs(self) -> None: + """export.resolve_io_specs must be lazily accessible and callable.""" + script = textwrap.dedent("""\ + import winml.modelkit.export + obj = winml.modelkit.export.resolve_io_specs + assert obj is not None + assert callable(obj) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"export.resolve_io_specs not lazily accessible.\nstderr: {result.stderr}" + ) + + def test_lazy_loader_load_hf_model(self) -> None: + """loader.load_hf_model must be lazily accessible and callable.""" + script = textwrap.dedent("""\ + import winml.modelkit.loader + obj = winml.modelkit.loader.load_hf_model + assert obj is not None + assert callable(obj) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"loader.load_hf_model not lazily accessible.\nstderr: {result.stderr}" + ) + + def test_lazy_quant_quantize_onnx(self) -> None: + """quant.quantize_onnx must be lazily accessible and callable.""" + script = textwrap.dedent("""\ + import winml.modelkit.quant + obj = winml.modelkit.quant.quantize_onnx + assert obj is not None + assert callable(obj) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"quant.quantize_onnx not lazily accessible.\nstderr: {result.stderr}" + ) + + # -- Gap 3: missing-attribute negative test (from-import raises ImportError) -- + + def test_nonexistent_attr_raises(self) -> None: + """Importing a nonexistent attribute must raise ImportError.""" + script = textwrap.dedent("""\ + try: + from winml.modelkit import nonexistent_xyz_12345 + except
ImportError: + pass # expected + else: + raise AssertionError( + "Expected ImportError for nonexistent attribute" + ) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"Nonexistent attr did not raise ImportError.\nstderr: {result.stderr}" + ) + + # -- Gap 4: __dir__ correctness test -- + + def test_dir_includes_lazy_attrs(self) -> None: + """dir(winml.modelkit) must include lazy attrs without loading torch.""" + script = textwrap.dedent("""\ + import sys + import winml.modelkit + assert "WinMLAutoModel" in dir(winml.modelkit), ( + "WinMLAutoModel missing from dir()" + ) + loaded = sorted(set( + m.split('.')[0] for m in sys.modules + if m.startswith(('torch', 'transformers', 'optimum', 'diffusers', 'sklearn')) + )) + if loaded: + print(f"FAIL: dir() triggered heavy imports: {loaded}", file=sys.stderr) + sys.exit(1) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, f"dir() test failed.\nstderr: {result.stderr}" + + +# =========================================================================== +# (C) _LAZY_IMPORTS Dict Consistency Tests +# =========================================================================== + +_LAZY_MODULES = [ + "winml.modelkit", + "winml.modelkit.core", + "winml.modelkit.export", + "winml.modelkit.loader", + "winml.modelkit.quant", + "winml.modelkit.models", + "winml.modelkit.onnx", +] + + +class TestLazyImportsDict: + """Verify the standardized _LAZY_IMPORTS pattern across all modules.""" + + @pytest.mark.parametrize("module", _LAZY_MODULES) + def test_lazy_imports_dict_exists(self, module: str) -> None: + """Each module must define a non-empty _LAZY_IMPORTS dict.""" + script = textwrap.dedent(f"""\ + import {module} as mod + lazy = getattr(mod, '_LAZY_IMPORTS', None) + assert lazy is not None, '_LAZY_IMPORTS not found on {module}' + assert isinstance(lazy, dict), ( + f'_LAZY_IMPORTS is {{type(lazy).__name__}}, expected dict' + ) + assert len(lazy) > 0, '_LAZY_IMPORTS is empty' + 
""") + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"_LAZY_IMPORTS check failed for {module}.\nstderr: {result.stderr.strip()}" + ) + + @pytest.mark.parametrize("module", _LAZY_MODULES) + def test_lazy_imports_all_consistent(self, module: str) -> None: + """Every key in _LAZY_IMPORTS must also appear in __all__.""" + script = textwrap.dedent(f"""\ + import {module} as mod + lazy = set(mod._LAZY_IMPORTS.keys()) + all_ = set(mod.__all__) + missing = lazy - all_ + assert not missing, f'In _LAZY_IMPORTS but not __all__: {{missing}}' + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"_LAZY_IMPORTS/__all__ drift in {module}.\nstderr: {result.stderr.strip()}" + ) + + @pytest.mark.parametrize("module", _LAZY_MODULES) + def test_lazy_imports_all_resolvable(self, module: str) -> None: + """Every _LAZY_IMPORTS entry must resolve to a real attribute.""" + script = textwrap.dedent(f"""\ + import importlib + import {module} as mod + errors = [] + for attr_name, submodule_path in mod._LAZY_IMPORTS.items(): + try: + sub = importlib.import_module(submodule_path) + if not hasattr(sub, attr_name): + errors.append( + f'{{attr_name}}: {{submodule_path}} has no attribute {{attr_name}}' + ) + except ImportError as exc: + errors.append(f'{{attr_name}}: cannot import {{submodule_path}} ({{exc}})') + if errors: + raise AssertionError( + f'Unresolvable _LAZY_IMPORTS in {module}:\\n' + '\\n'.join(errors) + ) + """) + result = _run_in_subprocess(script) + assert result.returncode == 0, ( + f"Unresolvable _LAZY_IMPORTS in {module}.\nstderr: {result.stderr.strip()}" + ) + + +# =========================================================================== +# (B) Per-Command Tests -- --help (no heavy imports at command load time) +# =========================================================================== + + +class TestCommandHelp: + """Verify ``winml`` and ``winml <cmd> --help`` do not load heavy deps.""" + + def test_winml_bare(self) 
-> None: + """Bare ``winml`` (no args) must not load heavy deps.""" + assert_cli_no_heavy_imports([]) + + def test_winml_help(self) -> None: + """``winml --help`` must not load heavy deps.""" + assert_cli_no_heavy_imports(["--help"]) + + @pytest.mark.parametrize("cmd", _CLI_COMMANDS) + def test_command_help_no_heavy_deps(self, cmd: str) -> None: + """``winml <cmd> --help`` must not load heavy deps.""" + assert_cli_no_heavy_imports([cmd, "--help"]) + + +# =========================================================================== +# (B) Per-Command Tests — with --model (actual command execution) +# =========================================================================== + +_FAKE_ONNX = "nonexistent_test_model.onnx" +_HF_MODEL = "microsoft/resnet-50" + + +class TestCommandWithModel: + """Verify import budgets when commands are invoked with --model. + + Commands that operate on ONNX files should NOT need torch/transformers. + Commands that operate on HF models legitimately need them. + + We use a fake model path so commands fail at file I/O, but the import + chain is already established by that point. 
+ """ + + @pytest.mark.parametrize( + ("cmd_args", "allowed"), + [ + # ONNX-path commands — should NOT need torch/transformers + ( + ["compile", "--model", _FAKE_ONNX, "-o", "o.onnx", "--ep", "qnn"], + (), + ), + ( + ["quantize", "--model", _FAKE_ONNX, "-o", "o.onnx", "--ep", "qnn"], + (), + ), + ( + ["optimize", "--model", _FAKE_ONNX, "-o", "o.onnx"], + ("torch", "torchgen"), # ORT tools.__init__ pulls torch + ), + ( + ["perf", "--model", _FAKE_ONNX], + (), + ), + ( + ["static-analyzer", "check", "--model", _FAKE_ONNX, "--ep", "qnn"], + ("torch", "torchgen"), # ORT tools.__init__ pulls torch + ), + # HF model commands — legitimately need heavy deps + ( + ["inspect", "-m", _HF_MODEL], + (*HEAVY_PREFIXES, "torchgen", "torchvision"), + ), + ( + ["config", "-m", _HF_MODEL, "--device", "npu", "--precision", "int8"], + (*HEAVY_PREFIXES, "torchgen", "torchvision"), + ), + ], + ids=[ + "compile-onnx", + "quantize-onnx", + "optimize-onnx", + "perf-onnx", + "static-analyzer-onnx", + "inspect-hf", + "config-hf", + ], + ) + def test_command_import_budget(self, cmd_args: list[str], allowed: tuple[str, ...]) -> None: + """Verify each command's import budget with --model.""" + assert_cli_no_heavy_imports(cmd_args, allowed=allowed) diff --git a/tests/unit/analyze/test_static_analyzer_cli.py b/tests/unit/analyze/test_static_analyzer_cli.py index 12e6941cf..7752cef0b 100644 --- a/tests/unit/analyze/test_static_analyzer_cli.py +++ b/tests/unit/analyze/test_static_analyzer_cli.py @@ -37,10 +37,16 @@ def runner() -> CliRunner: @pytest.fixture def mock_analyzer_result() -> Mock: - """Create a mock AnalysisOutput result.""" + """Create a mock AnalysisResult (returned by ONNXStaticAnalyzer.analyze). + + The command accesses ``result.output.results`` (list of EPSupport) for + Rich live display, ``result.is_fully_supported()`` for exit code, and + ``result.to_json()`` for JSON output. 
+ """ mock_result = Mock() mock_result.is_fully_supported.return_value = True mock_result.get_unsupported_operators.return_value = [] + mock_result.output.results = [] # empty EP results list (iterable) mock_result.to_json.return_value = json.dumps( { "analyzer_version": "0.1.0", @@ -64,6 +70,7 @@ def mock_analyzer_partial_support() -> Mock: mock_result = Mock() mock_result.is_fully_supported.return_value = False mock_result.get_unsupported_operators.return_value = ["Conv", "Gemm", "Add"] + mock_result.output.results = [] # empty EP results list (iterable) mock_result.to_json.return_value = json.dumps( { "analyzer_version": "0.1.0", @@ -609,7 +616,7 @@ def test_analyze_called_with_correct_parameters( # Verify analyze was called with correct parameters mock_instance.analyze.assert_called_once() call_kwargs = mock_instance.analyze.call_args[1] - assert call_kwargs["model_path"] == model_file + assert call_kwargs["model_path"] == str(model_file) assert call_kwargs["ep"] == "OpenVINOExecutionProvider" assert call_kwargs["device"] == "GPU" assert call_kwargs["enable_information"] is True diff --git a/tests/unit/build/test_hf.py b/tests/unit/build/test_hf.py index e3ab1e56a..38ce5ec0b 100644 --- a/tests/unit/build/test_hf.py +++ b/tests/unit/build/test_hf.py @@ -635,8 +635,8 @@ def test_autoconf_converges_in_one_iteration( # Autoconf is part of optimize, not a separate stage assert "optimize" in result.stages_completed - # Single analyze call (no autoconf suggestions, no loop) - assert m.call_count == 1 + # Two analyze calls: one in loop (no autoconf), one final validation + assert m.call_count == 2 def test_autoconf_discovers_and_reoptimizes( self, tmp_path: Path, sample_config_no_quant_compile, mock_pipeline @@ -650,7 +650,7 @@ def test_autoconf_discovers_and_reoptimizes( with patch( "winml.modelkit.build.common.analyze_onnx", - side_effect=[result_with_gelu, result_converged], + side_effect=[result_with_gelu, result_converged, result_converged], ) as m_analyze: result 
= build_hf_model( config=sample_config_no_quant_compile, @@ -661,8 +661,8 @@ def test_autoconf_discovers_and_reoptimizes( ) assert "optimize" in result.stages_completed - # 2 analyze calls: initial (found gelu) + after re-optimize (converged) - assert m_analyze.call_count == 2 + # 3 analyze calls: initial (found gelu) + after re-optimize (converged) + final validation + assert m_analyze.call_count == 3 # optimize_onnx called: once initial + once re-optimize in autoconf assert mock_pipeline["optimize"].call_count == 2 @@ -739,7 +739,7 @@ def test_manifest_records_analyze_details( with patch( "winml.modelkit.build.common.analyze_onnx", - side_effect=[result_with_gelu, result_converged], + side_effect=[result_with_gelu, result_converged, result_converged], ): result = build_hf_model( config=sample_config_no_quant_compile, @@ -774,7 +774,7 @@ def test_autoconf_merges_config_for_downstream( with patch( "winml.modelkit.build.common.analyze_onnx", - side_effect=[result_with_flags, result_converged], + side_effect=[result_with_flags, result_converged, result_converged], ): build_hf_model( config=sample_config_no_quant_compile, @@ -847,7 +847,7 @@ def test_post_export_qdq_still_compiles( def test_post_export_qdq_runs_analyze_only( self, tmp_path: Path, sample_config, mock_pipeline ) -> None: - """Analyze is called (via run_analyze_only) but optimize is not.""" + """Pre-quantized path runs optimize but skips autoconf (no analyze).""" mock_pipeline["is_quantized_onnx"].return_value = True output_dir = tmp_path / "output" @@ -856,7 +856,8 @@ def test_post_export_qdq_runs_analyze_only( output_dir=output_dir, pytorch_model=mock_pipeline["model"], ) - mock_pipeline["analyze"].assert_called() + # max_optim_iterations=0 means no analyze loop runs + mock_pipeline["analyze"].assert_not_called() mock_pipeline["optimize"].assert_called_once() def test_skip_optimize_kwarg(self, tmp_path: Path, sample_config, mock_pipeline) -> None: diff --git a/tests/unit/build/test_onnx.py 
b/tests/unit/build/test_onnx.py index 5d14f69be..a4e6b41d8 100644 --- a/tests/unit/build/test_onnx.py +++ b/tests/unit/build/test_onnx.py @@ -400,7 +400,7 @@ def test_pre_quantized_still_compiles( def test_pre_quantized_runs_analyze_only( self, tmp_path: Path, fake_onnx: Path, sample_onnx_config, mock_onnx_pipeline ) -> None: - """QDQ model runs analyze (via run_analyze_only) but not optimize.""" + """Pre-quantized path runs optimize but skips autoconf (no analyze).""" mock_onnx_pipeline["is_quantized_onnx"].return_value = True output_dir = tmp_path / "output" @@ -409,8 +409,8 @@ def test_pre_quantized_runs_analyze_only( config=sample_onnx_config, output_dir=output_dir, ) - # analyze_onnx should be called (via run_analyze_only) - mock_onnx_pipeline["analyze"].assert_called() + # max_optim_iterations=0 means no analyze loop runs + mock_onnx_pipeline["analyze"].assert_not_called() mock_onnx_pipeline["optimize"].assert_called_once() def test_skip_optimize_kwarg( diff --git a/tests/unit/commands/test_build.py b/tests/unit/commands/test_build.py index d88dec71a..12fdcaf77 100644 --- a/tests/unit/commands/test_build.py +++ b/tests/unit/commands/test_build.py @@ -5,20 +5,22 @@ """Tests for build CLI command — mock-based, no network, no actual builds. -Tests the CLI wrapper around build_hf_model() API. +Tests the CLI wrapper around _run_single_build() internal pipeline. NO WinMLAutoModel involvement. 
""" from __future__ import annotations import json -from pathlib import Path +from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch import pytest from click.testing import CliRunner -from winml.modelkit.build.hf import BuildResult + +if TYPE_CHECKING: + from pathlib import Path @pytest.fixture(autouse=True) @@ -74,31 +76,15 @@ def sample_config_file(tmp_path: Path) -> Path: @pytest.fixture def mock_build_api(): - """Mock build_hf_model to avoid actual pipeline execution.""" - result = BuildResult( - output_dir=Path("/fake/output"), - final_onnx_path=Path("/fake/output/model.onnx"), - config_path=Path("/fake/output/winml_build_config.json"), - stages_completed=["export", "optimize", "quantize", "compile"], - stages_skipped=[], - stage_timings={"export": 1.0, "optimize": 0.5, "quantize": 2.0, "compile": 0.3}, - elapsed=3.8, - ) - with patch("winml.modelkit.build.build_hf_model", return_value=result) as mock: + """Mock _run_single_build to avoid actual pipeline execution.""" + with patch("winml.modelkit.commands.build._run_single_build", return_value=None) as mock: yield mock @pytest.fixture def mock_build_reused(): - """Mock build_hf_model returning a reused result.""" - result = BuildResult( - output_dir=Path("/fake/output"), - final_onnx_path=Path("/fake/output/model.onnx"), - config_path=Path("/fake/output/winml_build_config.json"), - reused=True, - elapsed=0.01, - ) - with patch("winml.modelkit.build.build_hf_model", return_value=result) as mock: + """Mock _run_single_build returning None (reuse is handled internally).""" + with patch("winml.modelkit.commands.build._run_single_build", return_value=None) as mock: yield mock @@ -197,7 +183,6 @@ def test_basic_build( ) assert result.exit_code == 0, f"Build failed: {result.output}" assert mock_build_api.called - assert "Build complete" in result.output def test_model_id_passed( self, @@ -214,16 +199,16 @@ def test_model_id_passed( obj={"debug": False}, ) call_kwargs = 
mock_build_api.call_args.kwargs - assert call_kwargs.get("model_id") == "microsoft/resnet-50" + assert call_kwargs["model_id"] == "microsoft/resnet-50" - def test_model_required( + def test_model_optional_for_random_weight( self, runner: CliRunner, sample_config_file: Path, mock_build_api: MagicMock, tmp_path: Path, ) -> None: - """Omitting -m/--model is rejected because it is now required.""" + """Omitting -m/--model is valid — triggers random-weight build.""" from winml.modelkit.commands.build import build result = runner.invoke( @@ -231,8 +216,9 @@ def test_model_required( ["-c", str(sample_config_file), "-o", str(tmp_path)], obj={"debug": False}, ) - assert result.exit_code != 0 - assert "model" in result.output.lower() + assert result.exit_code == 0 + call_kwargs = mock_build_api.call_args.kwargs + assert call_kwargs["model_id"] is None def test_rebuild_passed( self, @@ -249,7 +235,7 @@ def test_rebuild_passed( obj={"debug": False}, ) call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("rebuild") is True + assert call_kwargs["rebuild"] is True def test_default_rebuild_false( self, @@ -266,7 +252,7 @@ def test_default_rebuild_false( obj={"debug": False}, ) call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("rebuild") is False + assert call_kwargs["rebuild"] is False # ============================================================================= @@ -291,7 +277,7 @@ def test_no_quant_sets_none( ["-c", str(sample_config_file), "-m", "test", "-o", str(tmp_path), "--no-quant"], obj={"debug": False}, ) - config = mock_build_api.call_args.kwargs.get("config") + config = mock_build_api.call_args.kwargs["config"] assert config.quant is None def test_no_compile_sets_none( @@ -308,7 +294,7 @@ def test_no_compile_sets_none( ["-c", str(sample_config_file), "-m", "test", "-o", str(tmp_path), "--no-compile"], obj={"debug": False}, ) - config = mock_build_api.call_args.kwargs.get("config") + config = 
mock_build_api.call_args.kwargs["config"] assert config.compile is None def test_no_quant_no_compile_together( @@ -334,7 +320,7 @@ def test_no_quant_no_compile_together( ], obj={"debug": False}, ) - config = mock_build_api.call_args.kwargs.get("config") + config = mock_build_api.call_args.kwargs["config"] assert config.quant is None assert config.compile is None @@ -362,8 +348,8 @@ def test_reuse_message( obj={"debug": False}, ) assert result.exit_code == 0 - assert "Existing artifact" in result.output - assert "--rebuild" in result.output + # Reuse detection is handled inside _run_single_build; verify it was called + assert mock_build_reused.called # ============================================================================= @@ -407,7 +393,7 @@ def test_build_failure_reported( ) -> None: from winml.modelkit.commands.build import build - with patch("winml.modelkit.build.build_hf_model") as mock: + with patch("winml.modelkit.commands.build._run_single_build") as mock: mock.side_effect = RuntimeError("ONNX export failed") result = runner.invoke( @@ -423,7 +409,7 @@ def test_value_error_becomes_usage_error( ) -> None: from winml.modelkit.commands.build import build - with patch("winml.modelkit.build.build_hf_model") as mock: + with patch("winml.modelkit.commands.build._run_single_build") as mock: mock.side_effect = ValueError("Invalid config") result = runner.invoke( @@ -458,7 +444,7 @@ def test_ep_flag_passed( obj={"debug": False}, ) call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("ep") == "qnn" + assert call_kwargs["ep"] == "qnn" def test_device_flag_passed( self, @@ -475,7 +461,7 @@ def test_device_flag_passed( obj={"debug": False}, ) call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("device") == "NPU" + assert call_kwargs["device"] == "NPU" # ============================================================================= @@ -533,7 +519,7 @@ def test_build_auto_detect_onnx_file( sample_config_file: Path, tmp_path: Path, ) 
-> None: - """When -m points to an existing .onnx file, dispatches to build_onnx_model.""" + """When -m points to an existing .onnx file, dispatches to _build_onnx_pipeline.""" from winml.modelkit.commands.build import build # Create a fake .onnx file on disk @@ -542,16 +528,9 @@ def test_build_auto_detect_onnx_file( output_dir = tmp_path / "out" - onnx_result = BuildResult( - output_dir=output_dir, - final_onnx_path=output_dir / "model.onnx", - config_path=output_dir / "winml_build_config.json", - stages_completed=["optimize"], - stages_skipped=["quantize", "compile"], - stage_timings={"optimize": 0.5}, - elapsed=0.5, - ) - with patch("winml.modelkit.build.build_onnx_model", return_value=onnx_result) as mock_onnx: + with patch( + "winml.modelkit.commands.build._build_onnx_pipeline", return_value=[] + ) as mock_onnx: result = runner.invoke( build, ["-c", str(sample_config_file), "-m", str(onnx_file), "-o", str(output_dir)], @@ -569,7 +548,7 @@ def test_build_auto_detect_hf_model( mock_build_api: MagicMock, tmp_path: Path, ) -> None: - """When -m is a HF model ID (not .onnx), dispatches to build_hf_model.""" + """When -m is a HF model ID (not .onnx), dispatches to _run_single_build.""" from winml.modelkit.commands.build import build output_dir = tmp_path / "out" @@ -581,7 +560,7 @@ def test_build_auto_detect_hf_model( assert result.exit_code == 0, f"Build failed: {result.output}" assert mock_build_api.called call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("model_id") == "microsoft/resnet-50" + assert call_kwargs["model_id"] == "microsoft/resnet-50" def test_build_onnx_suffix_but_not_exists_uses_hf( self, @@ -604,7 +583,7 @@ def test_build_onnx_suffix_but_not_exists_uses_hf( assert result.exit_code == 0, f"Build failed: {result.output}" assert mock_build_api.called call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("model_id") == "nonexistent.onnx" + assert call_kwargs["model_id"] == "nonexistent.onnx" # 
============================================================================= @@ -641,8 +620,8 @@ def test_no_analyze_sets_zero_iterations( ["-c", str(sample_config_file), "-m", "test", "-o", str(tmp_path), "--no-analyze"], obj={"debug": False}, ) - call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("hack_max_optim_iterations") == 0 + extra = mock_build_api.call_args.kwargs["extra_kwargs"] + assert extra.get("hack_max_optim_iterations") == 0 def test_max_optim_iterations_passed( self, @@ -667,8 +646,8 @@ def test_max_optim_iterations_passed( ], obj={"debug": False}, ) - call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("hack_max_optim_iterations") == 5 + extra = mock_build_api.call_args.kwargs["extra_kwargs"] + assert extra.get("hack_max_optim_iterations") == 5 def test_no_analyze_takes_precedence_over_max_iterations( self, @@ -695,8 +674,8 @@ def test_no_analyze_takes_precedence_over_max_iterations( ], obj={"debug": False}, ) - call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("hack_max_optim_iterations") == 0 + extra = mock_build_api.call_args.kwargs["extra_kwargs"] + assert extra.get("hack_max_optim_iterations") == 0 def test_default_no_analyzer_kwargs( self, @@ -712,8 +691,8 @@ def test_default_no_analyzer_kwargs( ["-c", str(sample_config_file), "-m", "test", "-o", str(tmp_path)], obj={"debug": False}, ) - call_kwargs = mock_build_api.call_args.kwargs - assert "hack_max_optim_iterations" not in call_kwargs + extra = mock_build_api.call_args.kwargs["extra_kwargs"] + assert "hack_max_optim_iterations" not in extra # ============================================================================= @@ -736,24 +715,16 @@ def test_no_optimize_passed_to_onnx_build( sample_config_file: Path, tmp_path: Path, ) -> None: - """--no-optimize passes skip_optimize=True to build_onnx_model.""" + """--no-optimize passes skip_optimize=True via extra_kwargs.""" from winml.modelkit.commands.build import build # Create a 
fake .onnx file for ONNX path detection onnx_file = tmp_path / "model.onnx" onnx_file.write_text("fake") - result_obj = BuildResult( - output_dir=tmp_path / "out", - final_onnx_path=tmp_path / "out" / "model.onnx", - config_path=tmp_path / "out" / "config.json", - stages_completed=["compile"], - stages_skipped=["optimize", "quantize"], - stage_timings={"compile": 0.3}, - elapsed=1.0, - ) - - with patch("winml.modelkit.build.build_onnx_model", return_value=result_obj) as mock_build: + with patch( + "winml.modelkit.commands.build._run_single_build", return_value=None + ) as mock_build: result = runner.invoke( build, [ @@ -769,8 +740,8 @@ def test_no_optimize_passed_to_onnx_build( ) assert result.exit_code == 0, f"Failed: {result.output}" - call_kwargs = mock_build.call_args.kwargs - assert call_kwargs.get("skip_optimize") is True + extra = mock_build.call_args.kwargs["extra_kwargs"] + assert extra.get("skip_optimize") is True def test_no_optimize_passed_to_hf_build( self, @@ -779,7 +750,7 @@ def test_no_optimize_passed_to_hf_build( tmp_path: Path, mock_build_api: MagicMock, ) -> None: - """--no-optimize passes skip_optimize=True to build_hf_model.""" + """--no-optimize passes skip_optimize=True via extra_kwargs.""" from winml.modelkit.commands.build import build result = runner.invoke( @@ -797,8 +768,8 @@ def test_no_optimize_passed_to_hf_build( ) assert result.exit_code == 0, f"Failed: {result.output}" - call_kwargs = mock_build_api.call_args.kwargs - assert call_kwargs.get("skip_optimize") is True + extra = mock_build_api.call_args.kwargs["extra_kwargs"] + assert extra.get("skip_optimize") is True def test_no_optimize_default_not_present( self, @@ -807,7 +778,7 @@ def test_no_optimize_default_not_present( tmp_path: Path, mock_build_api: MagicMock, ) -> None: - """Without --no-optimize, skip_optimize is not in kwargs.""" + """Without --no-optimize, skip_optimize is not in extra_kwargs.""" from winml.modelkit.commands.build import build runner.invoke( @@ -816,5 
+787,5 @@ def test_no_optimize_default_not_present( obj={"debug": False}, ) - call_kwargs = mock_build_api.call_args.kwargs - assert "skip_optimize" not in call_kwargs + extra = mock_build_api.call_args.kwargs["extra_kwargs"] + assert "skip_optimize" not in extra diff --git a/tests/unit/commands/test_cli.py b/tests/unit/commands/test_cli.py index 9a7028808..538e5fb77 100644 --- a/tests/unit/commands/test_cli.py +++ b/tests/unit/commands/test_cli.py @@ -106,7 +106,7 @@ def test_export_short_flags(self, runner: CliRunner) -> None: assert "-v" in result.output @patch("winml.modelkit.loader.load_hf_model") - @patch("winml.modelkit.export.pytorch.export_pytorch") + @patch("winml.modelkit.export.export_pytorch") def test_export_calls_api( self, mock_export_onnx: MagicMock, diff --git a/tests/unit/commands/test_export.py b/tests/unit/commands/test_export.py index 54a5b8439..1b94d19ad 100644 --- a/tests/unit/commands/test_export.py +++ b/tests/unit/commands/test_export.py @@ -29,7 +29,7 @@ def runner() -> CliRunner: @pytest.fixture def mock_export_onnx(): """Mock export_onnx function to avoid actual model loading.""" - with patch("winml.modelkit.export.pytorch.export_pytorch") as mock: + with patch("winml.modelkit.export.export_pytorch") as mock: mock.return_value = Path("test_output.onnx") yield mock @@ -448,7 +448,7 @@ def test_export_handles_export_onnx_error( """Test export handles export_onnx() errors gracefully.""" from winml.modelkit.commands.export import export - with patch("winml.modelkit.export.pytorch.export_pytorch") as mock: + with patch("winml.modelkit.export.export_pytorch") as mock: mock.side_effect = RuntimeError("ONNX export failed") output_path = tmp_path / "model.onnx" @@ -490,7 +490,7 @@ def test_export_uses_resolve_export_config( with ( patch("winml.modelkit.loader.load_hf_model") as mock_load, patch( - "winml.modelkit.export.config.resolve_export_config", + "winml.modelkit.export.resolve_export_config", return_value=(mock_export_cfg, 
mock_loader_cfg), ), ): diff --git a/tests/unit/commands/test_inspect_cli.py b/tests/unit/commands/test_inspect_cli.py index a5dca0f24..c8089026e 100644 --- a/tests/unit/commands/test_inspect_cli.py +++ b/tests/unit/commands/test_inspect_cli.py @@ -59,13 +59,10 @@ def mock_inspect_result() -> MagicMock: return result -# The inspect command uses deferred imports inside the function body: -# from ..inspect import inspect_model, InspectError, ... -# from ..inspect.formatter import output_json, output_table -# -# Since `from X import Y` resolves Y from X at import time, we must -# patch at the SOURCE modules so the deferred import picks up the mock. -_INSPECT_MODEL = "winml.modelkit.inspect.inspect_model" +# The inspect command calls _inspect_model_v2 (a module-level function in +# commands/inspect.py) then dispatches to output_json / output_table from +# the formatter module. We patch at their actual locations. +_INSPECT_MODEL = "winml.modelkit.commands.inspect._inspect_model_v2" _OUTPUT_JSON = "winml.modelkit.inspect.formatter.output_json" _OUTPUT_TABLE = "winml.modelkit.inspect.formatter.output_table" diff --git a/tests/unit/commands/test_perf_cli.py b/tests/unit/commands/test_perf_cli.py index 7cc2d4335..cdd62b9c8 100644 --- a/tests/unit/commands/test_perf_cli.py +++ b/tests/unit/commands/test_perf_cli.py @@ -270,15 +270,14 @@ def test_no_quantize_false_passes_no_override(self) -> None: override = mock_from_pretrained.call_args.kwargs["config"] assert override is None - def test_cli_onnx_goes_through_perfbenchmark(self, runner: CliRunner, tmp_path: Path) -> None: - """CLI with .onnx file should route through PerfBenchmark, not _run_onnx_benchmark.""" + def test_cli_onnx_goes_through_onnx_benchmark(self, runner: CliRunner, tmp_path: Path) -> None: + """CLI with .onnx file should route through _run_onnx_benchmark.""" onnx_file = tmp_path / "model.onnx" onnx_file.write_bytes(b"fake onnx") with ( - patch.object( - PerfBenchmark, - "run", + patch( + 
"winml.modelkit.commands.perf._run_onnx_benchmark", return_value=MagicMock(), ) as mock_run, patch( diff --git a/tests/unit/commands/test_perf_optracing.py b/tests/unit/commands/test_perf_optracing.py new file mode 100644 index 000000000..f26f11258 --- /dev/null +++ b/tests/unit/commands/test_perf_optracing.py @@ -0,0 +1,580 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for the --op-tracing CLI option on winml perf and _resolve_ep_monitor.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING +from unittest.mock import patch + +import pytest +from click.testing import CliRunner + + +if TYPE_CHECKING: + from pathlib import Path + +from winml.modelkit.commands.perf import _resolve_ep_monitor, perf + + +def _invoke_perf(args: list[str]): + """Invoke perf CLI with PerfBenchmark.run mocked to prevent model loading.""" + runner = CliRunner() + with patch( + "winml.modelkit.commands.perf.PerfBenchmark.run", + side_effect=RuntimeError("mocked — not running benchmark"), + ): + return runner.invoke(perf, args, obj={}) + + +class TestOpTracingOptionParsing: + """Verify --op-tracing is recognized and validates choices.""" + + def test_option_is_recognized(self): + """--op-tracing is accepted as a valid CLI option.""" + result = _invoke_perf(["--op-tracing", "basic", "-m", "nonexistent"]) + assert "no such option" not in (result.output or "").lower() + + def test_basic_choice_accepted(self): + """--op-tracing basic is a valid choice.""" + result = _invoke_perf(["--op-tracing", "basic", "-m", "nonexistent"]) + assert "no such option" not in (result.output or "").lower() + assert "invalid choice" not in (result.output or "").lower() + + def test_detail_choice_accepted(self): + """--op-tracing detail is a valid choice.""" + result = 
_invoke_perf(["--op-tracing", "detail", "-m", "nonexistent"]) + assert "no such option" not in (result.output or "").lower() + assert "invalid choice" not in (result.output or "").lower() + + def test_invalid_choice_rejected(self): + """--op-tracing with an invalid value is rejected by Click.""" + runner = CliRunner() + result = runner.invoke(perf, ["--op-tracing", "invalid", "-m", "test"]) + assert result.exit_code != 0 + output_lower = (result.output or "").lower() + assert "invalid" in output_lower or "choice" in output_lower + + def test_case_insensitive(self): + """--op-tracing accepts mixed-case values (e.g. Basic, DETAIL).""" + result = _invoke_perf(["--op-tracing", "BASIC", "-m", "nonexistent"]) + assert "invalid choice" not in (result.output or "").lower() + + def test_without_op_tracing_flag(self): + """Command works without --op-tracing (default is None).""" + result = _invoke_perf(["-m", "nonexistent"]) + assert "no such option" not in (result.output or "").lower() + + def test_model_required_with_op_tracing(self): + """--op-tracing alone without -m still requires a model.""" + runner = CliRunner() + result = runner.invoke(perf, ["--op-tracing", "basic"]) + assert result.exit_code != 0 + + +class TestResolveEpMonitor: + """Unit tests for the _resolve_ep_monitor dispatch helper.""" + + def test_no_op_tracing_no_ep_returns_null(self, tmp_path: Path): + """With no op_tracing and no matching EP, returns NullEPMonitor.""" + from winml.modelkit.session.monitor.ep_monitor import NullEPMonitor + + monitor = _resolve_ep_monitor(ep=None, op_tracing=None, output_dir=tmp_path) + assert isinstance(monitor, NullEPMonitor) + + def test_no_op_tracing_cpu_ep_returns_null(self, tmp_path: Path): + """CPU EP with no op_tracing yields NullEPMonitor.""" + from winml.modelkit.session.monitor.ep_monitor import NullEPMonitor + + monitor = _resolve_ep_monitor(ep="cpu", op_tracing=None, output_dir=tmp_path) + assert isinstance(monitor, NullEPMonitor) + + def 
test_vitisai_ep_no_op_tracing_returns_vitisai_when_available(self, tmp_path: Path): + """vitisai EP with no op_tracing returns VitisAIMonitor when available.""" + from winml.modelkit.session.monitor.vitisai_monitor import VitisAIMonitor + + with patch.object(VitisAIMonitor, "is_available", return_value=True): + monitor = _resolve_ep_monitor(ep="vitisai", op_tracing=None, output_dir=tmp_path) + assert isinstance(monitor, VitisAIMonitor) + + def test_vitisai_ep_unavailable_returns_null(self, tmp_path: Path): + """vitisai EP with no op_tracing returns NullEPMonitor when VitisAI is unavailable.""" + from winml.modelkit.session.monitor.ep_monitor import NullEPMonitor + from winml.modelkit.session.monitor.vitisai_monitor import VitisAIMonitor + + with patch.object(VitisAIMonitor, "is_available", return_value=False): + monitor = _resolve_ep_monitor(ep="vitisai", op_tracing=None, output_dir=tmp_path) + assert isinstance(monitor, NullEPMonitor) + + def test_op_tracing_qnn_available_returns_qnn_monitor(self, tmp_path: Path): + """qnn EP with op_tracing returns QNNMonitor when QNN is available.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with patch.object(QNNMonitor, "is_available", return_value=True): + monitor = _resolve_ep_monitor(ep="qnn", op_tracing="basic", output_dir=tmp_path) + assert isinstance(monitor, QNNMonitor) + + def test_op_tracing_qnn_unavailable_raises(self, tmp_path: Path): + """qnn EP with op_tracing raises RuntimeError when QNN is not available.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with ( + patch.object(QNNMonitor, "is_available", return_value=False), + pytest.raises(RuntimeError, match="QNN is not available"), + ): + _resolve_ep_monitor(ep="qnn", op_tracing="basic", output_dir=tmp_path) + + def test_op_tracing_unsupported_ep_raises(self, tmp_path: Path): + """Unsupported EP with op_tracing raises RuntimeError (NFR-2 hard-fail).""" + with pytest.raises(RuntimeError, match="Op-tracing not 
available for EP 'dml'"): + _resolve_ep_monitor(ep="dml", op_tracing="basic", output_dir=tmp_path) + + def test_op_tracing_passes_level_to_qnn_monitor(self, tmp_path: Path): + """QNNMonitor receives the correct level from _resolve_ep_monitor.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with patch.object(QNNMonitor, "is_available", return_value=True): + monitor = _resolve_ep_monitor(ep="qnn", op_tracing="detail", output_dir=tmp_path) + assert isinstance(monitor, QNNMonitor) + assert monitor._level == "detail" + + def test_auto_infers_qnn_from_npu_device(self, tmp_path: Path): + """--device npu --op-tracing basic must engage QNNMonitor without --ep qnn (SC-1).""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with patch.object(QNNMonitor, "is_available", return_value=True): + monitor = _resolve_ep_monitor( + ep=None, + op_tracing="basic", + output_dir=tmp_path, + device="npu", + ) + assert isinstance(monitor, QNNMonitor) + + def test_auto_infers_qnn_from_npu_device_case_insensitive(self, tmp_path: Path): + """--device NPU (uppercase) also auto-infers QNN.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with patch.object(QNNMonitor, "is_available", return_value=True): + monitor = _resolve_ep_monitor( + ep=None, + op_tracing="basic", + output_dir=tmp_path, + device="NPU", + ) + assert isinstance(monitor, QNNMonitor) + + @pytest.mark.parametrize("device_input", ["auto", "AUTO", "", None]) + def test_auto_infers_qnn_from_default_device_when_op_tracing( + self, tmp_path: Path, device_input + ): + """--device auto (default) and empty/None must also auto-infer QNN. + + --op-tracing is itself a strong intent signal; users invoking the + common pattern ``wmk perf -m <model> --op-tracing basic`` should not + need to also pass --device npu. 
+ """ + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with patch.object(QNNMonitor, "is_available", return_value=True): + monitor = _resolve_ep_monitor( + ep=None, + op_tracing="basic", + output_dir=tmp_path, + device=device_input, + ) + assert isinstance(monitor, QNNMonitor) + + @pytest.mark.parametrize("device_input", ["cpu", "gpu"]) + def test_explicit_non_npu_device_still_hard_fails(self, tmp_path: Path, device_input): + """--device cpu/gpu --op-tracing basic must still hard-fail. + + Auto-infer only fires when device is unset (auto/empty) or npu; + explicit user choice of cpu/gpu must be honored as "no, I do not + want NPU" and produce a clear error rather than silently switching. + """ + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with ( + patch.object(QNNMonitor, "is_available", return_value=True), + pytest.raises(RuntimeError, match="Op-tracing not available"), + ): + _resolve_ep_monitor( + ep=None, + op_tracing="basic", + output_dir=tmp_path, + device=device_input, + ) + + @pytest.mark.parametrize("ep_input", ["qnn", "QNN", "Qnn", "qNN"]) + def test_ep_matching_case_insensitive(self, tmp_path: Path, ep_input: str): + """--ep QNN, --ep Qnn, --ep qnn all behave identically.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with patch.object(QNNMonitor, "is_available", return_value=True): + monitor = _resolve_ep_monitor( + ep=ep_input, + op_tracing="basic", + output_dir=tmp_path, + device="npu", + ) + assert isinstance(monitor, QNNMonitor) + + def test_npu_device_qnn_unavailable_raises_descriptive(self, tmp_path: Path): + """--device npu --op-tracing when QNN unavailable raises with diagnostic message.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with ( + patch.object(QNNMonitor, "is_available", return_value=False), + pytest.raises(RuntimeError, match="not available for EP"), + ): + _resolve_ep_monitor( + ep=None, + op_tracing="basic", + 
output_dir=tmp_path, + device="npu", + ) + + def test_explicit_qnn_ep_unavailable_message_mentions_install(self, tmp_path: Path): + """When --ep qnn is explicit and unavailable, message hints at install paths.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with ( + patch.object(QNNMonitor, "is_available", return_value=False), + pytest.raises(RuntimeError) as excinfo, + ): + _resolve_ep_monitor( + ep="qnn", + op_tracing="basic", + output_dir=tmp_path, + device="npu", + ) + msg = str(excinfo.value) + assert "QNN is not available" in msg + assert "onnxruntime" in msg + + +class TestOpTracingIterationsSmartDefault: + """--op-tracing collapses default iterations to 1 unless user overrides.""" + + @staticmethod + def _capture_config(args: list[str]) -> dict: + """Run perf CLI with BenchmarkConfig captured before benchmark runs.""" + captured: dict = {} + runner = CliRunner() + + with ( + patch( + "winml.modelkit.commands.perf.BenchmarkConfig", + side_effect=lambda **kw: (captured.update(kw), _ConfigStub(**kw))[1], + ), + patch("winml.modelkit.commands.perf.PerfBenchmark") as mock_bench, + ): + # Fail fast in benchmark to avoid model loading + mock_bench.return_value.run.side_effect = RuntimeError("stop") + runner.invoke(perf, args, obj={}) + return captured + + def test_op_tracing_without_iterations_collapses_to_1(self): + """--op-tracing basic without --iterations -> iterations=1.""" + captured = self._capture_config(["--op-tracing", "basic", "-m", "fake/model"]) + assert captured.get("iterations") == 1 + + def test_op_tracing_with_explicit_iterations_honored(self): + """--op-tracing basic --iterations 50 -> iterations=50 (user override wins).""" + captured = self._capture_config( + ["--op-tracing", "basic", "--iterations", "50", "-m", "fake/model"] + ) + assert captured.get("iterations") == 50 + + def test_op_tracing_with_explicit_default_value_honored(self): + """--op-tracing basic --iterations 100 -> iterations=100. 
+ + Even when the user explicitly passes the value that matches the + normal default, the smart override does not fire — the parameter + source is COMMANDLINE, not DEFAULT. + """ + captured = self._capture_config( + ["--op-tracing", "basic", "--iterations", "100", "-m", "fake/model"] + ) + assert captured.get("iterations") == 100 + + def test_no_op_tracing_uses_default_100(self): + """Without --op-tracing the normal default of 100 stands.""" + captured = self._capture_config(["-m", "fake/model"]) + assert captured.get("iterations") == 100 + + +class _ConfigStub: + """Lightweight stand-in for BenchmarkConfig used by capture tests.""" + + def __init__(self, **kw): + for k, v in kw.items(): + setattr(self, k, v) + + +class TestCliOpTracingDispatch: + """CLI-level integration tests for --op-tracing dispatch (mocked benchmark).""" + + def test_onnx_input_with_op_tracing_fails_at_parse_time(self, tmp_path: Path): + """--op-tracing on a .onnx input must fail BEFORE running the benchmark.""" + runner = CliRunner() + onnx_file = tmp_path / "fake.onnx" + onnx_file.write_bytes(b"") + + # Patch _run_onnx_benchmark to detect if it was called (it must NOT be). + with patch( + "winml.modelkit.commands.perf._run_onnx_benchmark", + ) as mock_run: + result = runner.invoke( + perf, + ["-m", str(onnx_file), "--op-tracing", "basic"], + obj={}, + ) + + assert result.exit_code != 0 + assert "not yet supported for direct ONNX" in result.output + mock_run.assert_not_called() + + def test_no_data_status_exits_4(self, tmp_path: Path): + """When op-tracing returns status='no_data', CLI exits 4 — not exit 0 with warning.""" + from unittest.mock import MagicMock + + from winml.modelkit.commands.perf import BenchmarkResult + from winml.modelkit.session.monitor.op_metrics import OpTraceResult + + # Fabricate a BenchmarkResult and a no_data OpTraceResult. 
+ config = MagicMock() + config.model_id = "fake/model" + config.task = None + config.device = "npu" + config.precision = "auto" + config.iterations = 1 + config.warmup = 0 + config.batch_size = 1 + bench_result = BenchmarkResult(config=config) + + trace = OpTraceResult( + model="fake/model", + device="npu", + tracing_level="basic", + status="no_data", + error="profiler CSV missing", + ) + + # Mock benchmark to return the fabricated result and expose _perf_ctx. + mock_ctx = MagicMock() + mock_ctx.monitor.result = trace + mock_benchmark = MagicMock() + mock_benchmark.run.return_value = bench_result + mock_benchmark._perf_ctx = mock_ctx + + runner = CliRunner() + with ( + patch( + "winml.modelkit.commands.perf.PerfBenchmark", + return_value=mock_benchmark, + ), + patch("winml.modelkit.commands.perf.display_console_report"), + patch("winml.modelkit.commands.perf.write_json_report"), + ): + result = runner.invoke( + perf, + ["-m", "fake/model", "--device", "npu", "--op-tracing", "basic"], + obj={}, + ) + + assert result.exit_code == 4, f"Expected exit 4, got {result.exit_code}: {result.output}" + assert "no profiling data" in result.output.lower() + + def test_parse_failed_status_exits_4(self, tmp_path: Path): + """parse_failed status exits 4 with the parser error message.""" + from unittest.mock import MagicMock + + from winml.modelkit.commands.perf import BenchmarkResult + from winml.modelkit.session.monitor.op_metrics import OpTraceResult + + config = MagicMock() + config.model_id = "fake/model" + config.device = "npu" + config.precision = "auto" + config.iterations = 1 + config.warmup = 0 + config.batch_size = 1 + config.task = None + bench_result = BenchmarkResult(config=config) + + trace = OpTraceResult( + model="fake/model", + device="npu", + tracing_level="detail", + status="parse_failed", + error="invalid CSV header", + ) + mock_ctx = MagicMock() + mock_ctx.monitor.result = trace + mock_benchmark = MagicMock() + mock_benchmark.run.return_value = bench_result + 
mock_benchmark._perf_ctx = mock_ctx + + runner = CliRunner() + with ( + patch( + "winml.modelkit.commands.perf.PerfBenchmark", + return_value=mock_benchmark, + ), + patch("winml.modelkit.commands.perf.display_console_report"), + patch("winml.modelkit.commands.perf.write_json_report"), + ): + result = runner.invoke( + perf, + ["-m", "fake/model", "--device", "npu", "--op-tracing", "detail"], + obj={}, + ) + + assert result.exit_code == 4 + assert "parse failed" in result.output.lower() + assert "invalid CSV header" in result.output + + def test_basic_fallback_status_exits_0_with_notice(self, tmp_path: Path): + """basic_fallback status is degraded-success (exit 0 with yellow notice).""" + from unittest.mock import MagicMock + + from winml.modelkit.commands.perf import BenchmarkResult + from winml.modelkit.session.monitor.op_metrics import OpTraceResult + + config = MagicMock() + config.model_id = "fake/model" + config.device = "npu" + config.precision = "auto" + config.iterations = 1 + config.warmup = 0 + config.batch_size = 1 + config.task = None + bench_result = BenchmarkResult(config=config) + + trace = OpTraceResult( + model="fake/model", + device="npu", + tracing_level="detail", + status="basic_fallback", + ) + mock_ctx = MagicMock() + mock_ctx.monitor.result = trace + mock_benchmark = MagicMock() + mock_benchmark.run.return_value = bench_result + mock_benchmark._perf_ctx = mock_ctx + + runner = CliRunner() + with ( + patch( + "winml.modelkit.commands.perf.PerfBenchmark", + return_value=mock_benchmark, + ), + patch("winml.modelkit.commands.perf.display_console_report"), + patch("winml.modelkit.commands.perf.write_json_report"), + patch("winml.modelkit.session.monitor.report.display_op_trace_report"), + patch("winml.modelkit.session.monitor.report.write_op_trace_json"), + ): + result = runner.invoke( + perf, + ["-m", "fake/model", "--device", "npu", "--op-tracing", "detail"], + obj={}, + ) + + assert result.exit_code == 0, f"Expected exit 0, got 
{result.exit_code}: {result.output}" + assert "degraded" in result.output.lower() or "notice" in result.output.lower() + + +# =========================================================================== +# Hardware-gated CLI E2E (SC-1) +# +# PRD §10.5 / coreloop §8.4 mandate this test: +# "test_cli_op_tracing_basic_on_qnn (skip if no QNN NPU): runs +# wmk perf -m resnet50 --device npu --op-tracing basic, asserts CSV +# produced, *_op_trace.json written, at least one operator entry." +# +# This is the only end-to-end proof that SC-1 holds: the headline +# invocation produces real per-operator trace data on a QNN NPU. +# The test is doubly-gated: +# * QNNMonitor.is_available() — actual hardware/runtime probe. +# * WINML_TEST_NPU=1 env var — explicit opt-in (matches existing +# project pattern for NPU-bound tests). +# If either gate is unmet, the test skips cleanly (Cardinal Rule 3 allows +# hardware-gated skipif). +# =========================================================================== + + +@pytest.mark.skipif( + __import__("os").environ.get("WINML_TEST_NPU", "0") != "1", + reason="Hardware-gated SC-1 test requires WINML_TEST_NPU=1 + QNN NPU", +) +def test_cli_op_tracing_basic_on_qnn(tmp_path): + """SC-1 end-to-end: ``wmk perf --device npu --op-tracing basic`` on QNN. + + Hardware-gated. Must produce: + * a profiling CSV under the monitor's output directory, + * a ``*_op_trace.json`` next to the perf JSON output, + * at least one operator entry, with ``status == "ok"``. + + A regression that silently falls back to CPU (the bug SC-1 explicitly + targets — see PRD §3) would emit ``status == "no_data"`` here and + ``test_no_data_status_exits_4`` would catch it logically. This test + proves the happy path on real hardware. 
+ """ + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + if not QNNMonitor.is_available(): + pytest.skip("QNN EP not available on this system") + + runner = CliRunner() + output_path = tmp_path / "perf_result.json" + result = runner.invoke( + perf, + [ + "-m", + "microsoft/resnet-50", + "--device", + "npu", + "--op-tracing", + "basic", + "--iterations", + "10", + "--warmup", + "2", + "-o", + str(output_path), + ], + obj={}, + catch_exceptions=False, + ) + + assert result.exit_code == 0, ( + f"perf --op-tracing basic failed (exit {result.exit_code}):\n{result.output}" + ) + + # Per-op trace JSON written next to the perf output. + trace_files = list(tmp_path.glob("*_op_trace.json")) + assert trace_files, ( + f"Expected *_op_trace.json next to {output_path}; got: {list(tmp_path.iterdir())}" + ) + + import json + + trace_data = json.loads(trace_files[0].read_text(encoding="utf-8")) + assert trace_data["status"] == "ok", ( + f"Expected status='ok' on real hardware, got {trace_data['status']!r} " + f"with error={trace_data.get('error')!r}" + ) + assert trace_data["operators"], ( + "Expected at least one operator entry; got 0. " + "This is the canonical SC-1 silent-CPU-fallback signature." + ) + # CSV path recorded in artifacts and present on disk. 
+ csv_path_str = trace_data["artifacts"].get("csv") + assert csv_path_str, "Expected 'csv' key in artifacts" + from pathlib import Path as _Path + + assert _Path(csv_path_str).is_file(), f"Expected profiling CSV at {csv_path_str}" diff --git a/tests/unit/config/test_build.py b/tests/unit/config/test_build.py index 4e03eae7c..9849dd710 100644 --- a/tests/unit/config/test_build.py +++ b/tests/unit/config/test_build.py @@ -35,7 +35,6 @@ ) from winml.modelkit.export import ( InputTensorSpec, - ONNXConfigNotFoundError, OutputTensorSpec, WinMLExportConfig, resolve_io_specs, @@ -339,18 +338,18 @@ def test_merge_config_called_with_override( class TestRegistryShortCircuit: - """Tests for the registry export config merge in generate_build_config. + """Tests for the registry short-circuit path in generate_build_config. - Optimum is always tried first (may fail for unsupported models). - Registered export config is always merged on top (registry wins). + When MODEL_BUILD_CONFIGS has a registered config with input_tensors, + the Optimum _resolve_export_config_from_specs() call is skipped. 
""" - def test_optimum_fails_registry_fills_in( + def test_registry_with_input_tensors_skips_optimum( self, mock_hf_config: MagicMock, mock_model_class: MagicMock, ) -> None: - """When Optimum fails, registered export config provides I/O specs.""" + """Registry config with input_tensors skips Optimum lookup.""" blip_like_export = WinMLExportConfig( input_tensors=[ InputTensorSpec(name="pixel_values", dtype="float32", shape=(1, 3, 384, 384)), @@ -373,38 +372,30 @@ def test_optimum_fails_registry_fills_in( ), patch( "winml.modelkit.config.build._resolve_export_config_from_specs", - side_effect=ONNXConfigNotFoundError("blip not supported"), - ), + ) as mock_optimum, patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {"blip": blip_like_config}), ): result = generate_build_config("Salesforce/blip-image-captioning-base") - # Registry fills in the I/O specs after Optimum failure + # Optimum should NOT have been called + mock_optimum.assert_not_called() + # Result should have the registered input_tensors assert result.export.input_tensors is not None assert len(result.export.input_tensors) == 2 assert result.export.input_tensors[0].name == "pixel_values" - def test_optimum_succeeds_registry_overrides( + def test_registry_without_export_falls_through_to_optimum( self, mock_hf_config: MagicMock, mock_model_class: MagicMock, mock_loader_config: WinMLLoaderConfig, + mock_export_config: WinMLExportConfig, ) -> None: - """When Optimum succeeds, registered export config overrides on top.""" - optimum_export = WinMLExportConfig( - input_tensors=[ - InputTensorSpec(name="input_ids", dtype="int64", shape=(1, 16)), - ], - output_tensors=[OutputTensorSpec(name="logits")], - ) - # Registry overrides with a different shape - registry_export = WinMLExportConfig( - input_tensors=[ - InputTensorSpec(name="input_ids", dtype="int64", shape=(1, 512)), - ], - output_tensors=[OutputTensorSpec(name="logits")], + """Registry config without export falls through to Optimum.""" + # BERT_CONFIG 
has optim only, no export + bert_like_config = WinMLBuildConfig( + optim=WinMLOptimizationConfig(gelu_fusion=True), ) - registry_config = WinMLBuildConfig(export=registry_export) with ( patch( @@ -413,26 +404,25 @@ def test_optimum_succeeds_registry_overrides( ), patch( "winml.modelkit.config.build._resolve_export_config_from_specs", - return_value=optimum_export, - ), - patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {"bert": registry_config}), + return_value=mock_export_config, + ) as mock_optimum, + patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {"bert": bert_like_config}), ): - result = generate_build_config("bert-base-uncased") + generate_build_config("bert-base-uncased") - # Registry wins — shape is 512, not Optimum's 16 - assert result.export.input_tensors[0].shape == (1, 512) + # Optimum SHOULD have been called + mock_optimum.assert_called_once() - def test_registry_without_export_uses_optimum( + def test_registry_with_none_input_tensors_falls_through( self, mock_hf_config: MagicMock, mock_model_class: MagicMock, mock_loader_config: WinMLLoaderConfig, mock_export_config: WinMLExportConfig, ) -> None: - """Registry config without export uses Optimum result unchanged.""" - # BERT_CONFIG has optim only, no export - bert_like_config = WinMLBuildConfig( - optim=WinMLOptimizationConfig(gelu_fusion=True), + """Registry config with export but input_tensors=None falls through.""" + config_with_empty_export = WinMLBuildConfig( + export=WinMLExportConfig(), # input_tensors defaults to None ) with ( @@ -443,20 +433,23 @@ def test_registry_without_export_uses_optimum( patch( "winml.modelkit.config.build._resolve_export_config_from_specs", return_value=mock_export_config, + ) as mock_optimum, + patch( + "winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", + {"bert": config_with_empty_export}, ), - patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {"bert": bert_like_config}), ): - result = generate_build_config("bert-base-uncased") + 
generate_build_config("bert-base-uncased") - # No registered export → Optimum result used as-is - assert result.export.input_tensors == mock_export_config.input_tensors + # Optimum SHOULD have been called (input_tensors is None) + mock_optimum.assert_called_once() - def test_registry_merge_does_not_mutate_singleton( + def test_registry_deepcopy_prevents_mutation( self, mock_hf_config: MagicMock, mock_model_class: MagicMock, ) -> None: - """merge_config produces a new object, not mutating the registry.""" + """Registry export config is deepcopied, preventing singleton mutation.""" original_export = WinMLExportConfig( input_tensors=[ InputTensorSpec(name="pixel_values", dtype="float32", shape=(1, 3, 224, 224)), @@ -478,19 +471,21 @@ def test_registry_merge_does_not_mutate_singleton( ), patch( "winml.modelkit.config.build._resolve_export_config_from_specs", - side_effect=ONNXConfigNotFoundError("unsupported"), ), patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {"some-vision": registry_config}), ): result = generate_build_config("some/vision-model") - # Result should NOT be the same object as registry export + # Result export should NOT be the same object as registry export assert result.export is not original_export - # Content should be preserved + assert result.export.input_tensors is not original_export.input_tensors + # Content should be preserved (deepcopy correctness) + assert len(result.export.input_tensors) == 1 assert result.export.input_tensors[0].name == "pixel_values" assert result.export.input_tensors[0].shape == (1, 3, 224, 224) + assert result.export.input_tensors[0].dtype == "float32" - def test_underscore_normalization( + def test_registry_underscore_normalization( self, mock_hf_config: MagicMock, mock_model_class: MagicMock, @@ -517,22 +512,27 @@ def test_underscore_normalization( ), patch( "winml.modelkit.config.build._resolve_export_config_from_specs", - side_effect=ONNXConfigNotFoundError("unsupported"), - ), + ) as mock_optimum, # Registry 
uses hyphens patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {"clip-text-model": clip_config}), ): result = generate_build_config("openai/clip-vit-base-patch32") + # Underscore model_type should match hyphenated registry key + mock_optimum.assert_not_called() assert result.export.input_tensors[0].name == "input_ids" - def test_no_registry_no_optimum_returns_empty( + def test_registry_empty_list_input_tensors_skips_optimum( self, mock_hf_config: MagicMock, mock_model_class: MagicMock, mock_loader_config: WinMLLoaderConfig, ) -> None: - """No registry + Optimum fails → empty export config (no crash).""" + """Registry config with input_tensors=[] skips Optimum (is not None).""" + config_with_empty_list = WinMLBuildConfig( + export=WinMLExportConfig(input_tensors=[]), + ) + with ( patch( "winml.modelkit.config.build.resolve_loader_config", @@ -540,14 +540,38 @@ def test_no_registry_no_optimum_returns_empty( ), patch( "winml.modelkit.config.build._resolve_export_config_from_specs", - side_effect=ONNXConfigNotFoundError("unsupported"), + ) as mock_optimum, + patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {"bert": config_with_empty_list}), + ): + result = generate_build_config("bert-base-uncased") + + # [] is not None, so short-circuit fires + mock_optimum.assert_not_called() + assert result.export.input_tensors == [] + + def test_registry_miss_falls_through_to_optimum( + self, + mock_hf_config: MagicMock, + mock_model_class: MagicMock, + mock_loader_config: WinMLLoaderConfig, + mock_export_config: WinMLExportConfig, + ) -> None: + """Model not in registry at all falls through to Optimum.""" + with ( + patch( + "winml.modelkit.config.build.resolve_loader_config", + return_value=(mock_loader_config, mock_hf_config, mock_model_class), ), - patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {}), + patch( + "winml.modelkit.config.build._resolve_export_config_from_specs", + return_value=mock_export_config, + ) as mock_optimum, + 
patch("winml.modelkit.models.hf.MODEL_BUILD_CONFIGS", {}), # empty registry ): result = generate_build_config("some/unknown-model") - # Empty export config — no crash, downstream will handle - assert result.export.input_tensors is None + mock_optimum.assert_called_once() + assert result.export is mock_export_config # ============================================================================= @@ -1867,8 +1891,12 @@ def test_auto_auto_is_noop(self) -> None: # Default compile provider is "qnn" (from WinMLCompileConfig -> EPConfig) assert result.compile.ep_config.provider == "qnn" - def test_auto_auto_skips_resolve_device(self) -> None: - """device='auto' + precision='auto' does NOT call resolve_device.""" + def test_auto_auto_still_calls_resolve_device(self) -> None: + """device='auto' + precision='auto' DOES call resolve_device (#412). + + Previously this was skipped, causing EPConfig to default to 'qnn' + on machines without an NPU. Now we always detect hardware. + """ with ( patch( "winml.modelkit.config.build.resolve_loader_config", @@ -1894,7 +1922,7 @@ def test_auto_auto_skips_resolve_device(self) -> None: precision="auto", ) - mock_rd.assert_not_called() + mock_rd.assert_called_once_with(device="auto") def test_explicit_precision_triggers_resolve_device(self) -> None: """device='auto' + precision='int8' DOES call resolve_device.""" diff --git a/tests/unit/models/auto/test_automodel.py b/tests/unit/models/auto/test_automodel.py index 8ed62c863..52580db77 100644 --- a/tests/unit/models/auto/test_automodel.py +++ b/tests/unit/models/auto/test_automodel.py @@ -38,6 +38,7 @@ def _make_mock_model(num_labels: int = 1000): "output_names": ["logits"], } mock_session.is_compiled = True + mock_session.device = "cpu" model._session = mock_session model.config = MagicMock() diff --git a/tests/unit/models/auto/test_feature_extraction.py b/tests/unit/models/auto/test_feature_extraction.py index 95c42dce5..294d0cb15 100644 --- 
a/tests/unit/models/auto/test_feature_extraction.py +++ b/tests/unit/models/auto/test_feature_extraction.py @@ -31,6 +31,7 @@ def create_mock_model(): mock_session.run.return_value = { "last_hidden_state": np.random.randn(1, 8, 384).astype(np.float32), } + mock_session.device = "cpu" model._session = mock_session model.config = MagicMock() model._onnx_path = "mock.onnx" @@ -41,14 +42,17 @@ def create_mock_model(): class TestWinMLModelForFeatureExtractionBasic: def test_class_importable(self): from winml.modelkit.models.winml import WinMLModelForFeatureExtraction + assert WinMLModelForFeatureExtraction is not None def test_inherits_from_base(self): from winml.modelkit.models.winml import WinMLModelForFeatureExtraction, WinMLPreTrainedModel + assert issubclass(WinMLModelForFeatureExtraction, WinMLPreTrainedModel) def test_exported_from_winml_package(self): from winml.modelkit.models.winml import WinMLModelForFeatureExtraction + assert WinMLModelForFeatureExtraction is not None @@ -107,6 +111,7 @@ def test_sentence_embedding_unsqueezed(self): mock_session.run.return_value = { "sentence_embedding": np.zeros((1, 384), dtype=np.float32), } + mock_session.device = "cpu" model._session = mock_session model.config = MagicMock() model._onnx_path = "mock.onnx" @@ -130,6 +135,7 @@ def test_generic_2d_output_unsqueezed(self): mock_session.run.return_value = { "pooler_output": np.zeros((1, 768), dtype=np.float32), } + mock_session.device = "cpu" model._session = mock_session model.config = MagicMock() model._onnx_path = "mock.onnx" diff --git a/tests/unit/models/auto/test_image_classification.py b/tests/unit/models/auto/test_image_classification.py index 7dce67ad0..5cb20360f 100644 --- a/tests/unit/models/auto/test_image_classification.py +++ b/tests/unit/models/auto/test_image_classification.py @@ -41,6 +41,7 @@ def create_mock_model(num_labels: int = 1000): "input_names": ["pixel_values"], "output_names": ["logits"], } + mock_session.device = "cpu" model._session = 
mock_session model.config = MagicMock() model.config.num_labels = num_labels diff --git a/tests/unit/models/auto/test_image_segmentation.py b/tests/unit/models/auto/test_image_segmentation.py index f90784c12..f981dc38e 100644 --- a/tests/unit/models/auto/test_image_segmentation.py +++ b/tests/unit/models/auto/test_image_segmentation.py @@ -54,6 +54,7 @@ def create_mock_model( "input_names": ["pixel_values"], "output_names": ["logits", "pred_boxes", "pred_masks"], } + mock_session.device = "cpu" model._session = mock_session model.config = MagicMock() model.config.num_labels = num_classes @@ -185,6 +186,7 @@ def test_forward_missing_outputs_are_none(self): "input_names": ["pixel_values"], "output_names": ["logits"], } + mock_session.device = "cpu" model._session = mock_session model.config = MagicMock() model._onnx_path = "mock.onnx" @@ -292,6 +294,7 @@ def create_mock_semantic_model(num_labels: int = 150, output_h: int = 128, outpu "input_names": ["pixel_values"], "output_names": ["logits"], } + mock_session.device = "cpu" model._session = mock_session model.config = MagicMock() model.config.num_labels = num_labels diff --git a/tests/unit/models/auto/test_sequence_classification.py b/tests/unit/models/auto/test_sequence_classification.py index d7734876a..3133117db 100644 --- a/tests/unit/models/auto/test_sequence_classification.py +++ b/tests/unit/models/auto/test_sequence_classification.py @@ -39,6 +39,7 @@ def create_mock_model(num_labels: int = 2): "input_names": ["input_ids", "attention_mask", "token_type_ids"], "output_names": ["logits"], } + mock_session.device = "cpu" model._session = mock_session model.config = MagicMock() model.config.num_labels = num_labels diff --git a/tests/unit/optracing/test_integration.py b/tests/unit/optracing/test_integration.py deleted file mode 100644 index 896ffaa26..000000000 --- a/tests/unit/optracing/test_integration.py +++ /dev/null @@ -1,231 +0,0 @@ -# 
------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Integration tests using real QNN profiling data.""" - -import json -from pathlib import Path - -from winml.modelkit.optracing import OperatorMetrics, OpTraceResult, write_op_trace_json -from winml.modelkit.optracing.qnn.csv_parser import ( - parse_qnn_profiling_csv, # Testing internal implementation -) -from winml.modelkit.optracing.qnn.qhas_parser import parse_qhas # Testing internal implementation - - -FIXTURE_DIR = Path(__file__).parent / "fixtures" - - -def test_basic_pipeline_csv_to_json(tmp_path): - """Full basic mode: CSV -> OpTraceResult -> JSON file.""" - csv_data = parse_qnn_profiling_csv(FIXTURE_DIR / "optrace_resnet50.csv") - - total_cycles = sum(op["cycles"] for op in csv_data["operators"]) - - operators = [ - OperatorMetrics( - name=op["name"], - op_path=op["name"], # CSV doesn't distinguish type vs path - op_id=op["op_id"], - duration_us=op["cycles"], # keep raw cycles as duration placeholder - percent_of_total=((op["cycles"] / total_cycles * 100) if total_cycles else 0), - ) - for op in csv_data["operators"] - ] - - result = OpTraceResult( - model="resnet-50", - device="npu", - tracing_level="basic", - operators=operators, - num_samples=csv_data["metadata"]["num_samples"], - summary=csv_data["metadata"], - ) - - out = tmp_path / "basic_op_trace.json" - write_op_trace_json(result, out) - - assert out.exists() - data = json.loads(out.read_text()) - assert data["metadata"]["tracing_level"] == "basic" - assert len(data["operators"]) > 0 - assert data["operators"][0]["duration_us"] > 0 - - -def test_detail_pipeline_qhas_to_json(tmp_path): - """Full detail mode: QHAS -> OpTraceResult -> JSON file.""" - qhas_raw = json.loads((FIXTURE_DIR / "qhas_resnet50.json").read_text()) - - parsed = parse_qhas(qhas_raw) - 
- operators = [ - OperatorMetrics( - name=op["name"], - op_path=op["op_path"], - duration_us=op["duration_us"], - percent_of_total=op["percent_of_total"], - dominant_path_us=op.get("dominant_path_us"), - dram_read_bytes=op.get("dram_read_bytes"), - dram_write_bytes=op.get("dram_write_bytes"), - vtcm_read_bytes=op.get("vtcm_read_bytes"), - vtcm_write_bytes=op.get("vtcm_write_bytes"), - vtcm_hit_ratio=op.get("vtcm_hit_ratio"), - num_htp_ops=op.get("num_htp_ops"), - ) - for op in parsed["operators"] - ] - - result = OpTraceResult( - model="resnet-50", - device="npu", - tracing_level="detail", - ep="QNNExecutionProvider", - operators=operators, - summary=parsed["summary"], - ) - - out = tmp_path / "detail_op_trace.json" - write_op_trace_json(result, out) - - data = json.loads(out.read_text()) - assert data["metadata"]["tracing_level"] == "detail" - assert data["summary"]["time_us"] > 0 - # At least one operator should have DRAM read data populated - assert any(op["dram_read_bytes"] is not None for op in data["operators"]) - - -def test_json_schema_basic(): - """Verify basic mode JSON has required keys.""" - result = OpTraceResult( - model="test", - device="npu", - tracing_level="basic", - operators=[OperatorMetrics(name="Conv", op_path="/conv", duration_us=10.0)], - ) - data = result.to_dict() - - assert "metadata" in data - assert "operators" in data - assert "summary" in data - assert "statistics" in data - assert "artifacts" in data - - meta = data["metadata"] - for key in ("model", "device", "tracing_level", "timestamp"): - assert key in meta - - -def test_json_schema_detail(): - """Verify detail mode JSON has P0-P3 fields.""" - result = OpTraceResult( - model="test", - device="npu", - tracing_level="detail", - operators=[ - OperatorMetrics( - name="Conv2d", - op_path="/conv", - duration_us=100.0, - dram_read_bytes=1024, - vtcm_read_bytes=4096, - vtcm_hit_ratio=0.8, - dominant_path_us=50.0, - ) - ], - ) - data = result.to_dict() - op = data["operators"][0] - - # 
P0: Temporal Localization - assert "duration_us" in op - # P1: Roofline Analysis - assert "dominant_path_us" in op - # P2: DMA Traffic - assert "dram_read_bytes" in op - assert "vtcm_read_bytes" in op - # P3: Cache Efficiency - assert "vtcm_hit_ratio" in op - - -def test_round_trip_json(): - """OpTraceResult -> JSON -> parse back -> verify fields match.""" - original = OpTraceResult( - model="resnet-50", - device="npu", - tracing_level="detail", - ep="QNNExecutionProvider", - operators=[ - OperatorMetrics( - name="Conv2d", - op_path="/layer1/conv", - duration_us=123.4, - percent_of_total=45.6, - dram_read_bytes=2048, - vtcm_hit_ratio=0.95, - ), - OperatorMetrics( - name="ReLU", - op_path="/layer1/relu", - duration_us=10.0, - percent_of_total=3.7, - ), - ], - summary={"time_us": 270.5, "utilization_pct": 83.5}, - ) - - json_str = original.to_json() - parsed = json.loads(json_str) - - # Metadata round-trip - assert parsed["metadata"]["model"] == "resnet-50" - assert parsed["metadata"]["tracing_level"] == "detail" - assert parsed["metadata"]["ep"] == "QNNExecutionProvider" - - # Operators round-trip - assert len(parsed["operators"]) == 2 - op0 = parsed["operators"][0] - assert op0["name"] == "Conv2d" - assert op0["op_path"] == "/layer1/conv" - assert op0["duration_us"] == 123.4 - assert op0["percent_of_total"] == 45.6 - assert op0["dram_read_bytes"] == 2048 - assert op0["vtcm_hit_ratio"] == 0.95 - - op1 = parsed["operators"][1] - assert op1["name"] == "ReLU" - assert op1["dram_read_bytes"] is None # not set => None preserved - - # Summary round-trip - assert parsed["summary"]["time_us"] == 270.5 - - -def test_csv_parser_operator_count(): - """CSV parser finds the expected number of operators.""" - data = parse_qnn_profiling_csv(FIXTURE_DIR / "optrace_resnet50.csv") - # ResNet-50 produces ~79 aggregated QNN ops from the fixture - assert len(data["operators"]) > 50 - - -def test_qhas_parser_operator_count(): - """QHAS parser finds operators in fixture.""" - qhas = 
json.loads((FIXTURE_DIR / "qhas_resnet50.json").read_text()) - parsed = parse_qhas(qhas) - assert len(parsed["operators"]) > 0 - - -def test_cross_parser_top_operator_is_conv(): - """Both parsers should show Conv as the top operator for ResNet.""" - # CSV: operators are sorted by cycles descending - csv_data = parse_qnn_profiling_csv(FIXTURE_DIR / "optrace_resnet50.csv") - top_csv = csv_data["operators"][0]["name"].lower() - - # QHAS: operators are not pre-sorted; find the one with max duration - qhas_raw = json.loads((FIXTURE_DIR / "qhas_resnet50.json").read_text()) - parsed = parse_qhas(qhas_raw) - top_qhas = max(parsed["operators"], key=lambda op: op["duration_us"]) - top_qhas_name = top_qhas["name"].lower() - - # The top op for ResNet should contain "conv" (the large 7x7 convolution) - assert "conv" in top_csv, f"Expected 'conv' in top CSV op: {top_csv}" - assert "conv" in top_qhas_name, f"Expected 'conv' in top QHAS op: {top_qhas_name}" diff --git a/tests/unit/optracing/test_perf_optracing_cli.py b/tests/unit/optracing/test_perf_optracing_cli.py deleted file mode 100644 index 3abbd0288..000000000 --- a/tests/unit/optracing/test_perf_optracing_cli.py +++ /dev/null @@ -1,68 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -"""Tests for the --op-tracing CLI option on winml perf.""" - -from __future__ import annotations - -from unittest.mock import patch - -from click.testing import CliRunner - -from winml.modelkit.commands.perf import perf - - -def _invoke_perf(args: list[str]): - """Invoke perf CLI with PerfBenchmark.run mocked to prevent model loading.""" - runner = CliRunner() - with patch( - "winml.modelkit.commands.perf.PerfBenchmark.run", - side_effect=RuntimeError("mocked — not running benchmark"), - ): - return runner.invoke(perf, args, obj={}) - - -class TestOpTracingOptionParsing: - """Verify --op-tracing is recognized and validates choices.""" - - def test_option_is_recognized(self): - """--op-tracing is accepted as a valid CLI option.""" - result = _invoke_perf(["--op-tracing", "basic", "-m", "nonexistent"]) - assert "no such option" not in (result.output or "").lower() - - def test_basic_choice_accepted(self): - """--op-tracing basic is a valid choice.""" - result = _invoke_perf(["--op-tracing", "basic", "-m", "nonexistent"]) - assert "no such option" not in (result.output or "").lower() - assert "invalid choice" not in (result.output or "").lower() - - def test_detail_choice_accepted(self): - """--op-tracing detail is a valid choice.""" - result = _invoke_perf(["--op-tracing", "detail", "-m", "nonexistent"]) - assert "no such option" not in (result.output or "").lower() - assert "invalid choice" not in (result.output or "").lower() - - def test_invalid_choice_rejected(self): - """--op-tracing with an invalid value is rejected by Click.""" - runner = CliRunner() - result = runner.invoke(perf, ["--op-tracing", "invalid", "-m", "test"]) - assert result.exit_code != 0 - output_lower = (result.output or "").lower() - assert "invalid" in output_lower or "choice" in output_lower - - def test_case_insensitive(self): - """--op-tracing accepts mixed-case values (e.g. 
Basic, DETAIL).""" - result = _invoke_perf(["--op-tracing", "BASIC", "-m", "nonexistent"]) - assert "invalid choice" not in (result.output or "").lower() - - def test_without_op_tracing_flag(self): - """Command works without --op-tracing (default is None).""" - result = _invoke_perf(["-m", "nonexistent"]) - assert "no such option" not in (result.output or "").lower() - - def test_model_required_with_op_tracing(self): - """--op-tracing alone without -m still requires a model.""" - runner = CliRunner() - result = runner.invoke(perf, ["--op-tracing", "basic"]) - assert result.exit_code != 0 diff --git a/tests/unit/optracing/test_qnn_profiler.py b/tests/unit/optracing/test_qnn_profiler.py deleted file mode 100644 index 5cbfb3e70..000000000 --- a/tests/unit/optracing/test_qnn_profiler.py +++ /dev/null @@ -1,352 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Test QNN profiler and viewer with mocked ORT (no QNN hardware needed).""" - -from __future__ import annotations - -from pathlib import Path -from unittest.mock import MagicMock, patch - -import numpy as np - -from winml.modelkit.optracing.qnn.profiler import ( - QNNProfiler, - _ort_type_to_numpy, - _resolve_shape, -) -from winml.modelkit.optracing.qnn.viewer import ( - _DEFAULT_CONFIG, - find_qnn_sdk, - run_basic_viewer, - run_qhas_viewer, -) - - -# ===================================================================== -# Profiler: session options -# ===================================================================== - - -def test_qnn_profiler_creates_session_options(): - """Verify session options are configured correctly.""" - profiler = QNNProfiler(Path("model.onnx"), output_dir=Path("out"), level="basic") - mock_ort = MagicMock() - mock_options = MagicMock() - mock_ort.SessionOptions.return_value = 
mock_options - - options = profiler._build_session_options(mock_ort) - - assert options is mock_options - calls = mock_options.add_session_config_entry.call_args_list - entries = {c.args[0]: c.args[1] for c in calls} - assert entries["session.disable_cpu_ep_fallback"] == "1" - assert entries["ep.context_enable"] == "1" - assert entries["ep.context_embed_mode"] == "0" - - -# ===================================================================== -# Profiler: provider options -# ===================================================================== - - -def test_qnn_profiler_provider_options_basic(): - """Verify provider options for basic mode (profiling_level=detailed).""" - profiler = QNNProfiler(Path("model.onnx"), output_dir=Path("out"), level="basic") - opts = profiler._build_provider_options(Path("out/profiling.csv")) - - assert len(opts) == 1 - po = opts[0] - assert po["backend_path"] == "QnnHtp.dll" - assert po["htp_performance_mode"] == "high_performance" - assert po["htp_graph_finalization_optimization_mode"] == "3" - assert po["enable_htp_fp16_precision"] == "1" - assert po["profiling_level"] == "detailed" - assert po["profiling_file_path"] == str(Path("out/profiling.csv")) - - -def test_qnn_profiler_provider_options_detail(): - """Verify provider options for detail mode (profiling_level=optrace).""" - profiler = QNNProfiler(Path("model.onnx"), output_dir=Path("out"), level="detail") - opts = profiler._build_provider_options(Path("out/profiling.csv")) - - po = opts[0] - assert po["profiling_level"] == "optrace" - assert po["backend_path"] == "QnnHtp.dll" - - -# ===================================================================== -# Profiler: input generation helpers -# ===================================================================== - - -def test_ort_type_to_numpy_known_types(): - """Verify ORT type string mapping to NumPy dtypes.""" - assert _ort_type_to_numpy("tensor(float)") == np.dtype("float32") - assert _ort_type_to_numpy("tensor(float16)") == 
np.dtype("float16") - assert _ort_type_to_numpy("tensor(int64)") == np.dtype("int64") - assert _ort_type_to_numpy("tensor(bool)") == np.dtype("bool") - - -def test_ort_type_to_numpy_unknown_fallback(): - """Unknown ORT types fall back to float32.""" - assert _ort_type_to_numpy("tensor(bfloat16)") == np.dtype("float32") - - -def test_resolve_shape_concrete(): - """Concrete shapes pass through unchanged.""" - assert _resolve_shape([1, 3, 224, 224]) == [1, 3, 224, 224] - - -def test_resolve_shape_symbolic(): - """Symbolic (string / None / <=0) dimensions become default_dim.""" - assert _resolve_shape(["batch", 3, None, -1], default_dim=1) == [ - 1, - 3, - 1, - 1, - ] - - -def test_generate_inputs(): - """Verify random input generation from mock session.""" - mock_session = MagicMock() - mock_input = MagicMock() - mock_input.name = "input_ids" - mock_input.shape = [1, 128] - mock_input.type = "tensor(int64)" - mock_session.get_inputs.return_value = [mock_input] - - inputs = QNNProfiler._generate_inputs(mock_session) - - assert "input_ids" in inputs - assert inputs["input_ids"].shape == (1, 128) - assert inputs["input_ids"].dtype == np.int64 - - -# ===================================================================== -# Profiler: full run with mocked ORT -# ===================================================================== - - -def test_qnn_profiler_run_basic(tmp_path): - """End-to-end basic run with mocked ORT session.""" - model_path = tmp_path / "model.onnx" - model_path.write_bytes(b"fake") - output_dir = tmp_path / "output" - - # Create a minimal CSV so the CSV parser can parse it. 
- csv_content = ( - "Msg Timestamp,Message,Time,Unit of Measurement," - "Timing Source,Event Level,Event Identifier\n" - '0,ROOT,4,COUNT,HW,ROOT,"Number of HVX threads used"\n' - '1,ROOT,100000,CYCLES,HW,ROOT,"Accelerator (execute) time (cycles)"\n' - '2,NODE,500,CYCLES,HW,SUB-EVENT,"Conv2d:OpId_1 (cycles)"\n' - '3,NODE,300,CYCLES,HW,SUB-EVENT,"Add:OpId_2 (cycles)"\n' - ) - - # Mock ORT so no real QNN EP is needed. - mock_ort = MagicMock() - mock_session = MagicMock() - mock_ort.SessionOptions.return_value = MagicMock() - mock_ort.InferenceSession.return_value = mock_session - - mock_input = MagicMock() - mock_input.name = "input" - mock_input.shape = [1, 3] - mock_input.type = "tensor(float)" - mock_session.get_inputs.return_value = [mock_input] - mock_session.run.return_value = [np.array([1.0])] - - def write_csv_on_del(): - output_dir.mkdir(parents=True, exist_ok=True) - csv_path = output_dir / "profiling_output.csv" - csv_path.write_text(csv_content, encoding="utf-8") - - # Simulate CSV being flushed when session is deleted. - mock_ort.InferenceSession.return_value = mock_session - - with ( - patch.dict("sys.modules", {"onnxruntime": mock_ort}), - patch("winml.modelkit.optracing.qnn.profiler.QNNProfiler._collect_results") as mock_collect, - ): - # Write the CSV before _collect_results is called. - write_csv_on_del() - - profiler = QNNProfiler(model_path, output_dir=output_dir, level="basic") - - # Instead of running the full flow (which needs real ORT import), - # test the collect_results path directly. - mock_collect.return_value = MagicMock() - # Verify session creation was called correctly via builder methods. - profiler._build_session_options(mock_ort) - po = profiler._build_provider_options(output_dir / "profiling_output.csv") - assert po[0]["profiling_level"] == "detailed" - - # Now test the CSV parsing path directly. 
- result = profiler._from_csv( - output_dir / "profiling_output.csv", - iterations=5, - artifacts={"csv": str(output_dir / "profiling_output.csv")}, - ) - assert result.model == "model.onnx" - assert result.tracing_level == "basic" - assert result.ep == "QNNExecutionProvider" - assert len(result.operators) == 2 - assert result.operators[0].name == "Conv2d" - assert result.summary["hvx_threads"] == 4 - - -def test_qnn_profiler_empty_artifacts(tmp_path): - """Profiler returns empty result when no artifacts exist.""" - profiler = QNNProfiler(Path("model.onnx"), output_dir=tmp_path, level="basic") - result = profiler._collect_results(tmp_path / "nonexistent.csv", iterations=5) - assert result.model == "model.onnx" - assert len(result.operators) == 0 - assert result.num_samples == 0 - - -# ===================================================================== -# Viewer: SDK detection -# ===================================================================== - - -def test_find_qnn_sdk_from_env(monkeypatch, tmp_path): - """Test SDK detection from QNN_SDK_ROOT env var.""" - sdk_dir = tmp_path / "qnn_sdk" - sdk_dir.mkdir() - monkeypatch.setenv("QNN_SDK_ROOT", str(sdk_dir)) - - result = find_qnn_sdk() - assert result == sdk_dir - - -def test_find_qnn_sdk_not_found(monkeypatch): - """Test graceful None when SDK not found.""" - monkeypatch.delenv("QNN_SDK_ROOT", raising=False) - # Patch common paths to nonexistent directories. - with patch( - "winml.modelkit.optracing.qnn.viewer._COMMON_SDK_PATHS", - ["/nonexistent/path1", "/nonexistent/path2"], - ): - result = find_qnn_sdk() - assert result is None - - -def test_find_qnn_sdk_from_common_path(monkeypatch, tmp_path): - """Test SDK detection from common installation paths.""" - monkeypatch.delenv("QNN_SDK_ROOT", raising=False) - - # Create a fake SDK directory with bin/ subdirectory. 
- sdk_version_dir = tmp_path / "2.28.0" - (sdk_version_dir / "bin").mkdir(parents=True) - - with patch( - "winml.modelkit.optracing.qnn.viewer._COMMON_SDK_PATHS", - [str(tmp_path)], - ): - result = find_qnn_sdk() - assert result == sdk_version_dir - - -# ===================================================================== -# Viewer: basic viewer -# ===================================================================== - - -def test_run_basic_viewer_no_sdk(tmp_path): - """Basic viewer returns None when SDK is not found.""" - with patch("winml.modelkit.optracing.qnn.viewer._find_viewer_exe", return_value=None): - result = run_basic_viewer(tmp_path / "log.qnn", tmp_path / "output.csv") - assert result is None - - -def test_run_basic_viewer_success(tmp_path): - """Basic viewer returns path on success.""" - output_csv = tmp_path / "output.csv" - - def fake_run(cmd, **kwargs): - output_csv.write_text("header\ndata", encoding="utf-8") - - with ( - patch( - "winml.modelkit.optracing.qnn.viewer._find_viewer_exe", - return_value=Path("/fake/viewer.exe"), - ), - patch( - "winml.modelkit.optracing.qnn.viewer.subprocess.run", - side_effect=fake_run, - ), - ): - result = run_basic_viewer(tmp_path / "log.qnn", output_csv) - assert result == output_csv - - -# ===================================================================== -# Viewer: QHAS viewer -# ===================================================================== - - -def test_run_qhas_viewer_no_schematic(tmp_path): - """QHAS viewer returns None when schematic file does not exist.""" - with patch( - "winml.modelkit.optracing.qnn.viewer._find_viewer_exe", - return_value=Path("/fake/viewer.exe"), - ): - result = run_qhas_viewer( - tmp_path / "log.qnn", - tmp_path / "nonexistent_schematic.bin", - tmp_path / "output.json", - ) - assert result is None - - -def test_run_qhas_viewer_writes_config(tmp_path): - """QHAS viewer writes the optrace config JSON.""" - schematic = tmp_path / "model_schematic.bin" - 
schematic.write_bytes(b"fake") - output = tmp_path / "output.json" - - def fake_run(cmd, **kwargs): - output.write_text("{}", encoding="utf-8") - - with ( - patch( - "winml.modelkit.optracing.qnn.viewer._find_viewer_exe", - return_value=Path("/fake/viewer.exe"), - ), - patch( - "winml.modelkit.optracing.qnn.viewer.subprocess.run", - side_effect=fake_run, - ), - ): - run_qhas_viewer( - tmp_path / "log.qnn", - schematic, - output, - ) - config_path = tmp_path / "optrace_config.json" - assert config_path.is_file() - import json - - config = json.loads(config_path.read_text(encoding="utf-8")) - assert config["features"]["qhas_json"] is True - - -# ===================================================================== -# Viewer: default config -# ===================================================================== - - -def test_default_config_has_expected_features(): - """Verify the default QHAS config contains expected feature flags.""" - features = _DEFAULT_CONFIG["features"] - assert features["qhas_json"] is True - assert features["qhas_schema"] is True - assert features["htp_json"] is True - assert features["runtrace"] is True - assert features["memory_info"] is True - assert features["traceback"] is True - assert features["enable_input_output_flow_events"] is True - assert features["enable_sequencer_flow_events"] is True diff --git a/tests/unit/optracing/test_registry.py b/tests/unit/optracing/test_registry.py deleted file mode 100644 index 48058c1e3..000000000 --- a/tests/unit/optracing/test_registry.py +++ /dev/null @@ -1,132 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------------------- -"""Test OpTracer registry: registration, lookup, and EP pattern matching.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -from winml.modelkit.optracing import OpTracer, OpTraceResult, get_tracer, register_tracer -from winml.modelkit.optracing.registry import _TRACERS # Testing internal implementation - - -if TYPE_CHECKING: - from pathlib import Path - - -# --------------------------------------------------------------------------- -# Test helpers -# --------------------------------------------------------------------------- - - -class _MockTracer(OpTracer): - """Concrete OpTracer for testing.""" - - def run( - self, - onnx_path: Path, - *, - iterations: int = 5, - warmup: int = 2, - output_dir: Path | None = None, - ) -> OpTraceResult: - return OpTraceResult( - model=onnx_path.name, - device="mock", - tracing_level="basic", - ) - - def is_available(self) -> bool: - return True - - -class _AnotherMockTracer(OpTracer): - """A second mock tracer for multi-level tests.""" - - def run( - self, - onnx_path: Path, - *, - iterations: int = 5, - warmup: int = 2, - output_dir: Path | None = None, - ) -> OpTraceResult: - return OpTraceResult( - model=onnx_path.name, - device="mock2", - tracing_level="detail", - ) - - def is_available(self) -> bool: - return False - - -@pytest.fixture(autouse=True) -def _clean_registry(): - """Snapshot and restore the registry around each test.""" - snapshot = {k: dict(v) for k, v in _TRACERS.items()} - yield - _TRACERS.clear() - _TRACERS.update(snapshot) - - -# --------------------------------------------------------------------------- -# Tests -# --------------------------------------------------------------------------- - - -def test_register_and_get_tracer(): - """Register a mock tracer and retrieve it.""" - register_tracer("MOCK", "basic", _MockTracer) - cls = get_tracer("MOCK", "basic") - assert cls is 
_MockTracer - - -def test_get_tracer_not_found(): - """Return None for unregistered EP/level.""" - assert get_tracer("NonExistent", "basic") is None - assert get_tracer("MOCK", "unknown_level") is None - - -def test_ep_pattern_matching(): - """'QNN' pattern matches 'QNNExecutionProvider'.""" - register_tracer("QNN", "basic", _MockTracer) - cls = get_tracer("QNNExecutionProvider", "basic") - assert cls is _MockTracer - - -def test_register_multiple_levels(): - """Same EP can have different tracers for basic/detail.""" - register_tracer("MOCK", "basic", _MockTracer) - register_tracer("MOCK", "detail", _AnotherMockTracer) - - assert get_tracer("MOCK", "basic") is _MockTracer - assert get_tracer("MOCK", "detail") is _AnotherMockTracer - - -def test_default_qnn_tracers_registered(): - """The auto-registered QNN tracers should be present.""" - from winml.modelkit.optracing.qnn.profiler import QNNProfiler - - basic_cls = get_tracer("QNN", "basic") - detail_cls = get_tracer("QNN", "detail") - - assert basic_cls is QNNProfiler - assert detail_cls is QNNProfiler - - -def test_pattern_substring_not_exact(): - """Pattern matching uses substring, not exact match.""" - register_tracer("Custom", "basic", _MockTracer) - - # "Custom" is a substring of "CustomExecutionProvider" - assert get_tracer("CustomExecutionProvider", "basic") is _MockTracer - # But "CustomOther" should NOT match "Custom" if "Custom" is in "CustomOther" - # Actually substring: "Custom" IS in "CustomOther", so it should match. - assert get_tracer("CustomOther", "basic") is _MockTracer - # "Cust" should NOT match pattern "Custom" - assert get_tracer("Cust", "basic") is None diff --git a/tests/unit/optracing/test_result.py b/tests/unit/optracing/test_result.py deleted file mode 100644 index 15e4cf0b6..000000000 --- a/tests/unit/optracing/test_result.py +++ /dev/null @@ -1,76 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. 
All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------------------- -"""Test OpTraceResult dataclass and serialization.""" - -import json - -from winml.modelkit.optracing import OperatorMetrics, OpTraceResult - - -def test_operator_metrics_to_dict(): - op = OperatorMetrics(name="Conv2d", op_path="/layer1/conv/Conv", duration_us=45.2) - d = op.to_dict() - assert d["name"] == "Conv2d" - assert d["op_path"] == "/layer1/conv/Conv" - assert d["duration_us"] == 45.2 - assert d["dram_read_bytes"] is None - - -def test_operator_metrics_with_detail_fields(): - op = OperatorMetrics( - name="Conv2d", - op_path="/conv", - duration_us=100.0, - dram_read_bytes=1024, - vtcm_read_bytes=4096, - vtcm_hit_ratio=0.8, - dominant_path_us=50.0, - ) - d = op.to_dict() - assert d["dram_read_bytes"] == 1024 - assert d["vtcm_hit_ratio"] == 0.8 - assert d["dominant_path_us"] == 50.0 - - -def test_op_trace_result_to_dict(): - result = OpTraceResult( - model="resnet-50", - device="npu", - tracing_level="basic", - operators=[OperatorMetrics(name="Conv2d", op_path="/conv", duration_us=10.0)], - ) - d = result.to_dict() - assert d["metadata"]["model"] == "resnet-50" - assert d["metadata"]["device"] == "npu" - assert d["metadata"]["tracing_level"] == "basic" - assert len(d["operators"]) == 1 - assert d["operators"][0]["name"] == "Conv2d" - - -def test_op_trace_result_to_json(): - result = OpTraceResult( - model="resnet-50", - device="npu", - tracing_level="detail", - ep="QNNExecutionProvider", - operators=[ - OperatorMetrics(name="Conv2d", op_path="/conv", duration_us=10.0), - OperatorMetrics(name="Add", op_path="/add", duration_us=5.0), - ], - summary={"time_us": 1343, "utilization_pct": 99.59}, - ) - j = result.to_json() - parsed = json.loads(j) - assert parsed["metadata"]["model"] == "resnet-50" - assert parsed["metadata"]["ep"] == "QNNExecutionProvider" - assert len(parsed["operators"]) == 2 - assert parsed["summary"]["time_us"] 
== 1343 - - -def test_op_trace_result_empty(): - result = OpTraceResult(model="test", device="cpu", tracing_level="basic") - d = result.to_dict() - assert d["operators"] == [] - assert d["summary"] == {} diff --git a/tests/unit/optracing/__init__.py b/tests/unit/session/__init__.py similarity index 100% rename from tests/unit/optracing/__init__.py rename to tests/unit/session/__init__.py diff --git a/src/winml/modelkit/optracing/qnn/__init__.py b/tests/unit/session/monitor/__init__.py similarity index 85% rename from src/winml/modelkit/optracing/qnn/__init__.py rename to tests/unit/session/monitor/__init__.py index 0c6712fb0..862c45ce3 100644 --- a/src/winml/modelkit/optracing/qnn/__init__.py +++ b/tests/unit/session/monitor/__init__.py @@ -2,4 +2,3 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -"""QNN EP operator profiling via ORT.""" diff --git a/tests/unit/session/monitor/qnn/__init__.py b/tests/unit/session/monitor/qnn/__init__.py new file mode 100644 index 000000000..862c45ce3 --- /dev/null +++ b/tests/unit/session/monitor/qnn/__init__.py @@ -0,0 +1,4 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- diff --git a/tests/unit/optracing/fixtures/optrace_resnet50.csv b/tests/unit/session/monitor/qnn/fixtures/optrace_resnet50.csv similarity index 100% rename from tests/unit/optracing/fixtures/optrace_resnet50.csv rename to tests/unit/session/monitor/qnn/fixtures/optrace_resnet50.csv diff --git a/tests/unit/optracing/fixtures/qhas_resnet50.json b/tests/unit/session/monitor/qnn/fixtures/qhas_resnet50.json similarity index 100% rename from tests/unit/optracing/fixtures/qhas_resnet50.json rename to tests/unit/session/monitor/qnn/fixtures/qhas_resnet50.json diff --git a/tests/unit/optracing/test_csv_parser.py b/tests/unit/session/monitor/qnn/test_csv_parser.py similarity index 95% rename from tests/unit/optracing/test_csv_parser.py rename to tests/unit/session/monitor/qnn/test_csv_parser.py index 14f6aa7cc..66c826e30 100644 --- a/tests/unit/optracing/test_csv_parser.py +++ b/tests/unit/session/monitor/qnn/test_csv_parser.py @@ -6,7 +6,7 @@ from pathlib import Path -from winml.modelkit.optracing.qnn.csv_parser import parse_qnn_profiling_csv +from winml.modelkit.session.monitor.qnn.csv_parser import parse_qnn_profiling_csv FIXTURE_DIR = Path(__file__).parent / "fixtures" diff --git a/tests/unit/optracing/test_qhas_parser.py b/tests/unit/session/monitor/qnn/test_qhas_parser.py similarity index 96% rename from tests/unit/optracing/test_qhas_parser.py rename to tests/unit/session/monitor/qnn/test_qhas_parser.py index 9a767991a..cc4fabc6a 100644 --- a/tests/unit/optracing/test_qhas_parser.py +++ b/tests/unit/session/monitor/qnn/test_qhas_parser.py @@ -7,7 +7,7 @@ import json from pathlib import Path -from winml.modelkit.optracing.qnn.qhas_parser import parse_qhas +from winml.modelkit.session.monitor.qnn.qhas_parser import parse_qhas FIXTURE_DIR = Path(__file__).parent / "fixtures" diff --git a/tests/unit/session/monitor/qnn/test_viewer.py b/tests/unit/session/monitor/qnn/test_viewer.py new file mode 
100644 index 000000000..4195c9724 --- /dev/null +++ b/tests/unit/session/monitor/qnn/test_viewer.py @@ -0,0 +1,31 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for qnn.viewer SDK-root resolution. + +These tests cover the env-only resolution contract for +``find_qnn_sdk`` (no hardcoded developer-machine fallback paths). +""" + +from __future__ import annotations + +from winml.modelkit.session.monitor.qnn.viewer import find_qnn_sdk + + +def test_find_qnn_sdk_returns_none_when_env_unset(monkeypatch, tmp_path): + """No env var set -> None (no fallback to hardcoded paths).""" + monkeypatch.delenv("QNN_SDK_ROOT", raising=False) + assert find_qnn_sdk() is None + + +def test_find_qnn_sdk_returns_path_when_env_points_to_dir(monkeypatch, tmp_path): + """Env var pointing to an existing directory -> that Path is returned.""" + monkeypatch.setenv("QNN_SDK_ROOT", str(tmp_path)) + assert find_qnn_sdk() == tmp_path + + +def test_find_qnn_sdk_returns_none_when_env_points_to_nonexistent(monkeypatch, tmp_path): + """Env var pointing to a non-existent path -> None.""" + monkeypatch.setenv("QNN_SDK_ROOT", str(tmp_path / "does-not-exist")) + assert find_qnn_sdk() is None diff --git a/tests/unit/session/monitor/test_ep_monitor_base.py b/tests/unit/session/monitor/test_ep_monitor_base.py new file mode 100644 index 000000000..7dbe6034a --- /dev/null +++ b/tests/unit/session/monitor/test_ep_monitor_base.py @@ -0,0 +1,62 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +"""Tests for EPMonitor ABC default hook behavior.""" + +from __future__ import annotations + +import pytest + +from winml.modelkit.session.monitor.ep_monitor import EPMonitor, NullEPMonitor + + +def test_null_monitor_default_get_session_options(): + """NullEPMonitor inherits empty session-options default.""" + assert NullEPMonitor().get_session_options() == {} + + +def test_null_monitor_default_get_provider_options(): + """NullEPMonitor inherits empty provider-options default.""" + assert NullEPMonitor().get_provider_options() == {} + + +def test_null_monitor_default_requires_teardown(): + """NullEPMonitor.requires_session_teardown is False by default.""" + assert NullEPMonitor.requires_session_teardown is False + + +def test_ep_monitor_is_abstract(): + """EPMonitor cannot be instantiated directly (still abstract).""" + with pytest.raises(TypeError): + EPMonitor() # type: ignore[abstract] + + +def test_hooks_return_fresh_dicts(): + """get_*_options returns a fresh dict each call (not a shared mutable).""" + m = NullEPMonitor() + d1 = m.get_session_options() + d1["injected"] = "1" + d2 = m.get_session_options() + assert "injected" not in d2 + + +def test_requires_session_teardown_must_be_bool() -> None: + """Shadowing requires_session_teardown with a non-bool fails at class-def time.""" + with pytest.raises(TypeError, match="requires_session_teardown must be a class-level bool"): + + class _BadMonitor(EPMonitor): + requires_session_teardown = "yes" # wrong type + + def __enter__(self): + return self + + def __exit__(self, *a): + return None + + def to_dict(self): + return {} + + @classmethod + def is_available(cls): + return True diff --git a/tests/unit/session/monitor/test_op_metrics.py b/tests/unit/session/monitor/test_op_metrics.py new file mode 100644 index 000000000..235670270 --- /dev/null +++ b/tests/unit/session/monitor/test_op_metrics.py @@ -0,0 +1,121 @@ +# 
------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for the relocated OpTraceResult + new status/error fields.""" + +from __future__ import annotations + +import json + +from winml.modelkit.session.monitor.op_metrics import ( + OperatorMetrics, + OpTraceResult, +) + + +def test_model_field_accepts_none(): + """model: str | None — passing None must not raise.""" + r = OpTraceResult(model=None, device="npu", tracing_level="basic") + assert r.model is None + + +def test_status_default_is_ok(): + """New status field defaults to 'ok' for backward compat with existing construction.""" + r = OpTraceResult(model="x", device="npu", tracing_level="basic") + assert r.status == "ok" + assert r.error is None + + +def test_status_can_be_set(): + r = OpTraceResult( + model="x", + device="npu", + tracing_level="basic", + status="parse_failed", + error="corrupt CSV", + ) + assert r.status == "parse_failed" + assert r.error == "corrupt CSV" + + +def test_to_dict_preserves_nested_schema(): + """Existing nested schema must be preserved. + + The ``metadata`` block must include ``num_samples`` and ``timestamp`` — + A2-I1 in PR review: a regression that drops either field would silently + pass an "only check the easy keys" assertion. + """ + r = OpTraceResult( + model="m.onnx", + device="npu", + tracing_level="basic", + ep="QNN", + num_samples=42, + ) + d = r.to_dict() + assert "metadata" in d + assert d["metadata"]["model"] == "m.onnx" + assert d["metadata"]["device"] == "npu" + assert d["metadata"]["tracing_level"] == "basic" + assert d["metadata"]["ep"] == "QNN" + assert "summary" in d + assert "operators" in d + assert "statistics" in d + assert "artifacts" in d + # A2-I1: num_samples + timestamp must be in nested metadata. 
+ assert "num_samples" in d["metadata"] + assert "timestamp" in d["metadata"] + assert d["metadata"]["num_samples"] == r.num_samples == 42 + # The timestamp default is an ISO-8601 string from datetime.isoformat(). + assert isinstance(d["metadata"]["timestamp"], str) + assert d["metadata"]["timestamp"] == r.timestamp + # Sanity-check it parses as ISO-8601 (drops the 'Z'/offset gracefully). + from datetime import datetime + + datetime.fromisoformat(d["metadata"]["timestamp"]) + + +def test_to_dict_adds_status_and_error_at_top_level(): + """New fields are additive top-level keys.""" + r = OpTraceResult( + model="x", + device="npu", + tracing_level="basic", + status="no_data", + error=None, + ) + d = r.to_dict() + assert d["status"] == "no_data" + assert d["error"] is None + + +def test_to_json_round_trip(): + r = OpTraceResult(model="x", device="npu", tracing_level="basic", status="ok") + parsed = json.loads(r.to_json()) + assert parsed["metadata"]["model"] == "x" + assert parsed["status"] == "ok" + + +def test_operator_metrics_to_dict_preserved(): + op = OperatorMetrics(name="Conv", op_path="/conv_1", duration_us=12.5, percent_of_total=5.0) + d = op.to_dict() + assert d["name"] == "Conv" + assert d["duration_us"] == 12.5 + + +def test_to_dict_status_only_accepts_known_values_per_typing() -> None: + """status is a Literal — assert each declared value round-trips through to_dict. + + Python does not enforce ``Literal`` at runtime, so this test verifies *the + declared values are accepted and serialize correctly*. Static enforcement + is delegated to mypy / ruff. 
+ """ + for status in ("ok", "no_data", "parse_failed", "basic_fallback", "not_run"): + r = OpTraceResult(model=None, device="npu", tracing_level="basic", status=status) + assert r.to_dict()["status"] == status + + +def test_trace_status_alias_importable() -> None: + """``TraceStatus`` must be importable as a public symbol from op_metrics.""" + from winml.modelkit.session.monitor.op_metrics import TraceStatus # noqa: F401 diff --git a/tests/unit/session/monitor/test_qnn_monitor.py b/tests/unit/session/monitor/test_qnn_monitor.py new file mode 100644 index 000000000..a4359a975 --- /dev/null +++ b/tests/unit/session/monitor/test_qnn_monitor.py @@ -0,0 +1,505 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for QNNMonitor — the QNN EP op-tracing monitor.""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + + +def test_ctor_defaults(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor() + assert m._level == "basic" + assert m._output_dir.exists() + assert m._csv_path.is_absolute() + + +def test_ctor_accepts_custom_output_dir(tmp_path): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor(output_dir=tmp_path) + assert m._output_dir == tmp_path + assert str(m._csv_path).startswith(str(tmp_path)) + + +def test_ctor_rejects_invalid_level(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with pytest.raises(ValueError, match="level"): + QNNMonitor(level="bogus") # type: ignore[arg-type] + + +def test_get_session_options_enables_epcontext_caching(): + """get_session_options enables EPContext caching only. 
+ + `session.disable_cpu_ep_fallback` is intentionally NOT set: under + onnxruntime-windowsml the WinML-registered QNN partitions a QDQ-wrapped + EPContext model into Q/DQ-on-CPU + EPContext-on-QNN, which is correct. + The "no silent CPU fallback" guarantee is provided upstream by + add_provider_for_devices, not here. + """ + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + opts = QNNMonitor().get_session_options() + assert opts == { + "ep.context_enable": "1", + "ep.context_embed_mode": "0", + } + assert "session.disable_cpu_ep_fallback" not in opts + + +def test_get_provider_options_owner_keys_only(): + """get_provider_options sets ONLY the two profiling keys + user extras. + + backend_path / htp_* are NOT defaulted: they would overwrite WinML's + registered absolute backend_path and break DLL loading. Callers who + need them pass via extra_provider_options. + """ + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + opts = QNNMonitor(level="basic").get_provider_options() + assert opts == { + "profiling_level": "detailed", + "profiling_file_path": opts["profiling_file_path"], + } + # Verify no defaults that would conflict with WinML registration + assert "backend_path" not in opts + assert "htp_performance_mode" not in opts + + +def test_get_provider_options_detail(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + assert QNNMonitor(level="detail").get_provider_options()["profiling_level"] == "optrace" + + +def test_extra_provider_options_pass_through(): + """User-supplied extras are honored (e.g. 
backend_path for bundled ORT QNN).""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor( + level="basic", + extra_provider_options={ + "backend_path": r"C:\path\to\QnnHtp.dll", + "htp_performance_mode": "balanced", + }, + ) + opts = m.get_provider_options() + assert opts["backend_path"] == r"C:\path\to\QnnHtp.dll" + assert opts["htp_performance_mode"] == "balanced" + + +def test_profiling_keys_not_user_overridable(): + """C-3: user extras cannot override profiling_level or profiling_file_path.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor( + level="basic", + extra_provider_options={ + "profiling_level": "off", + "profiling_file_path": "/attacker/path", + "htp_performance_mode": "balanced", + }, + ) + opts = m.get_provider_options() + assert opts["profiling_level"] == "detailed" + assert opts["profiling_file_path"] != "/attacker/path" + assert opts["htp_performance_mode"] == "balanced" # non-owned extra honored + + +def test_get_provider_options_idempotent(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor(level="basic") + assert m.get_provider_options() == m.get_provider_options() + + +def test_get_session_options_idempotent(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor(level="basic") + assert m.get_session_options() == m.get_session_options() + + +def test_requires_session_teardown_true(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + assert QNNMonitor.requires_session_teardown is True + + +def test_double_enter_raises(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor() + m.__enter__() + with pytest.raises(RuntimeError, match="already entered"): + m.__enter__() + + +def test_exit_with_no_csv_reports_no_data(tmp_path): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor(output_dir=tmp_path) + 
m.__enter__() + m.__exit__(None, None, None) + d = m.to_dict() + assert d["status"] == "no_data" + + +def test_exit_parse_failure_caught(tmp_path): + """If CSV exists but is corrupt, status is 'parse_failed' and error is populated.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + csv = tmp_path / "profiling_output.csv" + csv.write_text("this is not a valid qnn csv") + m = QNNMonitor(output_dir=tmp_path) + m.__enter__() + m.__exit__(None, None, None) + d = m.to_dict() + # Either 'parse_failed' (if parser raises) or 'ok'/'no_data' (if parser + # gracefully returns empty). We accept any of those but must NOT raise. + assert d["status"] in ("parse_failed", "no_data", "ok") + + +def test_exit_does_not_suppress_caller_exception(tmp_path): + """EPMonitor.__exit__ returning None (not True) → exception propagates.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor(output_dir=tmp_path) + m.__enter__() + result = m.__exit__(RuntimeError, RuntimeError("test"), None) + assert result is None or result is False + + +def test_to_dict_before_enter(): + """Calling to_dict() before enter/exit returns 'not_run' status in nested schema.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor() + d = m.to_dict() + assert d["status"] == "not_run" + # Schema must match the post-exit OpTraceResult.to_dict() shape. 
+ assert d["metadata"]["ep"] == "QNNExecutionProvider" + + +def test_to_dict_pre_exit_returns_nested_schema(tmp_path): + """Pre-exit to_dict() emits the same nested keys as a fully-populated result.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + monitor = QNNMonitor(level="basic", output_dir=tmp_path) + out = monitor.to_dict() + assert "metadata" in out + assert "summary" in out + assert "operators" in out + assert "artifacts" in out + assert out["status"] == "not_run" + assert out["metadata"]["tracing_level"] == "basic" + assert out["metadata"]["device"] == "npu" + + +def test_is_available_via_bundled(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with patch( + "onnxruntime.get_available_providers", + return_value=["QNNExecutionProvider", "CPUExecutionProvider"], + ): + assert QNNMonitor.is_available() is True + + +def test_is_available_via_winml(): + """When QNN EP is registered via WinML, is_available() returns True.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + fake_ep = MagicMock() + fake_ep.ep_name = "QNNExecutionProvider" + with ( + patch("onnxruntime.get_available_providers", return_value=["CPUExecutionProvider"]), + patch("onnxruntime.get_ep_devices", return_value=[fake_ep]), + patch("winml.modelkit.session.ep_registry.ensure_initialized"), + ): + assert QNNMonitor.is_available() is True + + +def test_is_available_neither(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + with ( + patch("onnxruntime.get_available_providers", return_value=["CPUExecutionProvider"]), + patch("onnxruntime.get_ep_devices", return_value=[]), + patch("winml.modelkit.session.ep_registry.ensure_initialized"), + ): + assert QNNMonitor.is_available() is False + + +def test_is_available_winml_path_failure_logs_warning(caplog, monkeypatch): + """NFR-2: real environmental failure on the WinML path must log at WARNING, not DEBUG. 
+ + The bare-Exception swallow downgraded broken Windows App SDK / denied + registry access to "feature unavailable" silently. Any non-ImportError + in ``ensure_initialized()`` MUST surface at WARNING with the exception + class, so users can diagnose the underlying environment problem. + """ + import logging + + import onnxruntime as ort + + from winml.modelkit.session import ep_registry + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + # Force the QNN-bundled path to miss + monkeypatch.setattr(ort, "get_available_providers", lambda: ["CPUExecutionProvider"]) + monkeypatch.setattr(ort, "get_ep_devices", list) + + # Make ensure_initialized raise a non-ImportError exception + def _raises() -> None: + raise RuntimeError("simulated WinML init failure") + + monkeypatch.setattr(ep_registry, "ensure_initialized", _raises) + + with caplog.at_level(logging.WARNING): + assert QNNMonitor.is_available() is False + + # Assert the log carries enough info to diagnose + warnings = [r for r in caplog.records if r.levelname == "WARNING"] + matched = any( + "WinML EP probe failed" in r.message and "RuntimeError" in r.message for r in warnings + ) + assert matched, ( + f"expected WARNING with 'WinML EP probe failed' + 'RuntimeError', " + f"got: {[r.message for r in warnings]}" + ) + + +def test_result_property_none_before_exit(): + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + m = QNNMonitor() + assert m.result is None + + +def test_no_os_chdir(): + """QNNMonitor MUST NOT mutate CWD per FR-12 / C-5.""" + from pathlib import Path + + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + cwd_before = Path.cwd() + m = QNNMonitor() + m.__enter__() + m.__exit__(None, None, None) + assert Path.cwd() == cwd_before + + +def test_find_schematic_rejects_stale_cwd_candidate(tmp_path, monkeypatch): + """A *_schematic.bin in CWD older than the profiling CSV must NOT be returned. 
+ + Setup: + - output_dir = tmp_path/out (no schematic in it → exercise CWD fallback) + - cwd = tmp_path/cwd (contains a STALE schematic) + - csv = tmp_path/out/profiling_output.csv (FRESH, written 'now') + Expected: the stale CWD schematic is older than the CSV by >5s, so the + mtime gate rejects it and _find_schematic() returns None. + """ + import os + import time + + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + out_dir = tmp_path / "out" + cwd_dir = tmp_path / "cwd" + out_dir.mkdir() + cwd_dir.mkdir() + + monitor = QNNMonitor(level="detail", output_dir=out_dir) + # Fresh CSV (now) + monitor._csv_path.write_text("dummy") + # Stale schematic in CWD (1 hour old) + stale = cwd_dir / "stale_schematic.bin" + stale.write_bytes(b"") + old = time.time() - 3600 + os.utime(stale, (old, old)) + + monkeypatch.chdir(cwd_dir) + # CWD glob would surface 'stale', but mtime guard rejects. + assert monitor._find_schematic() is None + + +def test_find_schematic_accepts_fresh_cwd_candidate(tmp_path, monkeypatch): + """A *_schematic.bin in CWD newer than the profiling CSV is accepted (mtime gate).""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + out_dir = tmp_path / "out" + cwd_dir = tmp_path / "cwd" + out_dir.mkdir() + cwd_dir.mkdir() + + monitor = QNNMonitor(level="detail", output_dir=out_dir) + # CSV first, then a fresh schematic — the schematic mtime >= CSV mtime. 
+ monitor._csv_path.write_text("dummy") + fresh = cwd_dir / "fresh_schematic.bin" + fresh.write_bytes(b"") + + monkeypatch.chdir(cwd_dir) + assert monitor._find_schematic() == fresh + + +def test_find_schematic_prefers_output_dir_over_cwd(tmp_path, monkeypatch): + """When output_dir contains a schematic, CWD is never consulted.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + out_dir = tmp_path / "out" + cwd_dir = tmp_path / "cwd" + out_dir.mkdir() + cwd_dir.mkdir() + + monitor = QNNMonitor(level="detail", output_dir=out_dir) + in_out = out_dir / "graph_schematic.bin" + in_out.write_bytes(b"") + in_cwd = cwd_dir / "graph_schematic.bin" + in_cwd.write_bytes(b"") + + monkeypatch.chdir(cwd_dir) + assert monitor._find_schematic() == in_out + + +def test_output_dir_property_exposes_path(tmp_path): + """The output_dir property returns the directory used for artifacts.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + monitor = QNNMonitor(level="basic", output_dir=tmp_path) + assert monitor.output_dir == tmp_path + assert monitor.output_dir.is_dir() + + +def test_output_dir_property_for_default_tempdir(): + """When output_dir=None, the property exposes the auto-minted tempdir.""" + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + monitor = QNNMonitor(level="basic") + assert monitor.output_dir.is_dir() + assert monitor.output_dir.name.startswith("qnn_profile_") + + +def test_output_dir_property_is_read_only(tmp_path): + """output_dir is exposed as a property; rebinding must raise AttributeError.""" + import pytest as _pytest + + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + monitor = QNNMonitor(level="basic", output_dir=tmp_path) + with _pytest.raises(AttributeError): + monitor.output_dir = tmp_path / "other" # type: ignore[misc] + + +# --------------------------------------------------------------------------- +# Detail-mode fallback (FR-5 / PR review A2-I7) +# 
--------------------------------------------------------------------------- + + +def test_detail_mode_falls_back_to_basic_when_qhas_unavailable(tmp_path): + """A detail-level monitor with a valid CSV but no QHAS path produces status='basic_fallback'. + + PRD FR-5: when the user requests ``level="detail"`` but post-processing + artifacts (``*_qnn.log`` / ``*_schematic.bin`` / SDK) are unavailable, + the monitor MUST surface a populated CSV-only result with + ``status="basic_fallback"`` rather than raising or producing + ``status="ok"`` (which would silently pretend QHAS data was present). + """ + from pathlib import Path + + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + monitor = QNNMonitor(level="detail", output_dir=tmp_path) + # Drop the real CSV fixture into the spot the monitor expects so the + # CSV parse path succeeds. The QHAS branch will fail naturally because + # no *_qnn.log is present in the output directory — this is the + # cleanest hit on the basic_fallback codepath in _try_qhas. + fixture = Path(__file__).parent / "qnn" / "fixtures" / "optrace_resnet50.csv" + monitor._csv_path.write_text(fixture.read_text(encoding="utf-8"), encoding="utf-8") + + monitor.__enter__() + monitor.__exit__(None, None, None) + + assert monitor.result is not None + assert monitor.result.status == "basic_fallback" + # CSV-only data must still be populated — basic_fallback is degraded + # *success*, not failure: operators and summary are non-empty. + assert monitor.result.operators, "expected CSV-derived operators in basic_fallback result" + assert monitor.result.summary, "expected CSV-derived summary in basic_fallback result" + # No QHAS artifact recorded; CSV artifact recorded. 
+ assert "qhas" not in monitor.result.artifacts + assert "csv" in monitor.result.artifacts + + +# --------------------------------------------------------------------------- +# Windows file-handle retry (R-2 / PR review A2-I8) +# --------------------------------------------------------------------------- + + +def test_parse_artifacts_retries_when_csv_absent(tmp_path, monkeypatch): + """R-2 mitigation: a 50ms ``time.sleep`` retry fires when the CSV is + absent on the first ``is_file()`` check. + + QNN EP flushes the profiling CSV on session destruction, but on Windows + file-handle close can lag the actual unlink/rename behind the calling + thread. The monitor's ``_parse_artifacts`` does one 50ms retry before + declaring ``no_data``. Without this retry, slow filesystems would + silently produce ``status="no_data"`` for runs that did finish flushing. + """ + from winml.modelkit.session.monitor import qnn_monitor as qnn_monitor_mod + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + monitor = QNNMonitor(level="basic", output_dir=tmp_path) + + sleep_calls: list[float] = [] + + def _track_sleep(seconds: float) -> None: + sleep_calls.append(seconds) + + monkeypatch.setattr(qnn_monitor_mod.time, "sleep", _track_sleep) + + # CSV never appears, so the retry will not save the result, but the + # critical assertion is that the 50ms retry DID fire. + monitor.__enter__() + monitor.__exit__(None, None, None) + + assert any(abs(s - 0.05) < 1e-9 for s in sleep_calls), ( + f"expected exactly one 0.05s retry sleep, got {sleep_calls!r}" + ) + # And status confirms the post-retry path: CSV still missing → no_data. + assert monitor.result is not None + assert monitor.result.status == "no_data" + + +def test_parse_artifacts_no_retry_when_csv_present_on_first_check(tmp_path, monkeypatch): + """If the CSV is on disk on the FIRST ``is_file()`` check, the 50ms + retry sleep MUST NOT fire. Verifies the retry is gated, not unconditional. 
+ """ + from pathlib import Path + + from winml.modelkit.session.monitor import qnn_monitor as qnn_monitor_mod + from winml.modelkit.session.monitor.qnn_monitor import QNNMonitor + + monitor = QNNMonitor(level="basic", output_dir=tmp_path) + # Pre-populate the CSV with valid content. + fixture = Path(__file__).parent / "qnn" / "fixtures" / "optrace_resnet50.csv" + monitor._csv_path.write_text(fixture.read_text(encoding="utf-8"), encoding="utf-8") + + sleep_calls: list[float] = [] + monkeypatch.setattr(qnn_monitor_mod.time, "sleep", lambda s: sleep_calls.append(s)) + + monitor.__enter__() + monitor.__exit__(None, None, None) + + assert sleep_calls == [], ( + f"expected no retry sleep when CSV is present on first check, got {sleep_calls!r}" + ) + assert monitor.result is not None + assert monitor.result.status == "ok" diff --git a/tests/unit/optracing/test_report.py b/tests/unit/session/monitor/test_report.py similarity index 98% rename from tests/unit/optracing/test_report.py rename to tests/unit/session/monitor/test_report.py index e97eb09a4..442b5c9ab 100644 --- a/tests/unit/optracing/test_report.py +++ b/tests/unit/session/monitor/test_report.py @@ -10,13 +10,15 @@ import pytest from rich.console import Console -from winml.modelkit.optracing import ( +from winml.modelkit.session.monitor.op_metrics import ( OperatorMetrics, OpTraceResult, +) +from winml.modelkit.session.monitor.report import ( + _format_bytes, # Testing internal implementation display_op_trace_report, write_op_trace_json, ) -from winml.modelkit.optracing.report import _format_bytes # Testing internal implementation # --------------------------------------------------------------------------- diff --git a/tests/unit/session/test_ep_monitor.py b/tests/unit/session/test_ep_monitor.py index c5af9c74e..40d52de9e 100644 --- a/tests/unit/session/test_ep_monitor.py +++ b/tests/unit/session/test_ep_monitor.py @@ -558,38 +558,11 @@ def test_cpu_samples_accessible(self): # 
============================================================================ -# QNNMonitor tests (placeholder) +# QNNMonitor tests — moved to tests/unit/session/monitor/test_qnn_monitor.py +# (QNNMonitor is no longer a placeholder; it is a full implementation). # ============================================================================ -class TestQNNMonitor: - """Test QNNMonitor placeholder.""" - - def test_is_available_returns_false(self): - from winml.modelkit.session import QNNMonitor - - assert QNNMonitor.is_available() is False - - def test_context_manager_noop(self): - from winml.modelkit.session import QNNMonitor - - with QNNMonitor() as hw: - pass - - assert hw.to_dict()["ep"] == "QNN" - - def test_to_dict_returns_stub(self): - from winml.modelkit.session import QNNMonitor - - with QNNMonitor() as hw: - pass - - d = hw.to_dict() - assert d["ep"] == "QNN" - assert d["device"] == "NPU" - assert d["status"] == "not_implemented" - - # ============================================================================ # OpenVinoMonitor tests (placeholder) # ============================================================================ @@ -797,7 +770,7 @@ class TestLiveMonitorDisplay: """Test LiveMonitorDisplay logic (non-visual).""" def test_render_status_warmup_phase(self): - from winml.modelkit.commands.live_chart import LiveMonitorDisplay + from winml.modelkit.commands._live_chart import LiveMonitorDisplay display = LiveMonitorDisplay(total_iterations=110, warmup=10, model_id="test", device="npu") status = display._render_status( @@ -813,7 +786,7 @@ def test_render_status_warmup_phase(self): assert "npu" in status.lower() or "Device" in status def test_render_status_benchmark_phase(self): - from winml.modelkit.commands.live_chart import LiveMonitorDisplay + from winml.modelkit.commands._live_chart import LiveMonitorDisplay display = LiveMonitorDisplay(total_iterations=110, warmup=10, model_id="test", device="npu") status = display._render_status( @@ -830,7 +803,7 
@@ def test_render_status_benchmark_phase(self): assert "Latency" in status def test_render_status_zero_latency_no_crash(self): - from winml.modelkit.commands.live_chart import LiveMonitorDisplay + from winml.modelkit.commands._live_chart import LiveMonitorDisplay display = LiveMonitorDisplay(total_iterations=10, warmup=0, model_id="test", device="cpu") # latency_ms=0 should not cause division by zero @@ -842,7 +815,7 @@ def test_render_status_zero_latency_no_crash(self): assert "Throughput" in status def test_render_status_empty_samples(self): - from winml.modelkit.commands.live_chart import LiveMonitorDisplay + from winml.modelkit.commands._live_chart import LiveMonitorDisplay display = LiveMonitorDisplay(total_iterations=10, warmup=0, model_id="test", device="cpu") status = display._render_status( @@ -853,7 +826,7 @@ def test_render_status_empty_samples(self): assert "0.0%" in status # NPU should show 0.0% def test_update_noop_when_live_is_none(self): - from winml.modelkit.commands.live_chart import LiveMonitorDisplay + from winml.modelkit.commands._live_chart import LiveMonitorDisplay display = LiveMonitorDisplay(total_iterations=10, warmup=0, model_id="test", device="cpu") # _live is None (not entered context) — should not crash @@ -864,7 +837,7 @@ def test_update_noop_when_live_is_none(self): ) def test_print_final_snapshot_is_noop(self): - from winml.modelkit.commands.live_chart import LiveMonitorDisplay + from winml.modelkit.commands._live_chart import LiveMonitorDisplay display = LiveMonitorDisplay(total_iterations=10, warmup=0, model_id="test", device="cpu") # Should not crash or print anything diff --git a/tests/unit/session/test_ep_registry.py b/tests/unit/session/test_ep_registry.py new file mode 100644 index 000000000..4d5c95a69 --- /dev/null +++ b/tests/unit/session/test_ep_registry.py @@ -0,0 +1,106 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for ep_registry module-level helpers.""" + +from __future__ import annotations + +import logging +from unittest.mock import patch + +from winml.modelkit.session.ep_registry import ensure_initialized + + +def test_ensure_initialized_calls_registry_once(): + """ensure_initialized() calls register_to_ort() via singleton; idempotent across calls. + + A2-I3 (PR review): the previous loose ``call_count >= 1`` assertion would + pass if the wrapper accidentally amplified calls (e.g., re-instantiating + the registry on every entry). Pin the contract: + + * ``WinMLEPRegistry.get_instance()`` is hit exactly once per + ``ensure_initialized()`` call (no extra allocations). + * ``register_to_ort()`` is invoked once per call — the singleton's + internal ``_registered_eps`` skip-list provides actual no-op + idempotency, NOT the wrapper. + * No exception is raised for any number of calls. + """ + with patch("winml.modelkit.session.ep_registry.WinMLEPRegistry") as mock_registry_cls: + instance = mock_registry_cls.get_instance.return_value + instance.winml_available = True + + ensure_initialized() + ensure_initialized() + ensure_initialized() + + # Wrapper makes exactly one get_instance + one register_to_ort per call. + assert mock_registry_cls.get_instance.call_count == 3 + assert instance.register_to_ort.call_count == 3 + + +def test_ensure_initialized_failure_logs_warning(caplog): + """NFR-2: registration failure must log at WARNING (not DEBUG) with exception class. + + The previous DEBUG-level swallow downgraded real environmental failures + (broken Windows App SDK, etc.) to invisible "feature unavailable". 
+ """ + with patch("winml.modelkit.session.ep_registry.WinMLEPRegistry") as mock_registry_cls: + instance = mock_registry_cls.get_instance.return_value + instance.winml_available = True + instance.register_to_ort.side_effect = RuntimeError("boom") + + with caplog.at_level(logging.WARNING): + ensure_initialized() # must NOT raise + + warnings = [r for r in caplog.records if r.levelname == "WARNING"] + assert any( + "WinML EP registration failed" in r.message and "RuntimeError" in r.message + for r in warnings + ), f"expected WARNING surfacing RuntimeError, got: {[r.message for r in warnings]}" + + +def test_ensure_initialized_allows_retry_after_failure(caplog): + """A first-call failure does not latch — the next call retries registration.""" + with patch("winml.modelkit.session.ep_registry.WinMLEPRegistry") as mock_registry_cls: + instance = mock_registry_cls.get_instance.return_value + instance.winml_available = True + instance.register_to_ort.side_effect = [RuntimeError("transient"), None] + + with caplog.at_level(logging.WARNING): + ensure_initialized() # fails + ensure_initialized() # should retry + + # register_to_ort should have been called both times. + assert instance.register_to_ort.call_count == 2 + + +def test_register_to_ort_failure_records_per_ep_state(): + """NFR-2: per-EP registration failures must be tracked in registration_failures.""" + from winml.modelkit.session.ep_registry import WinMLEPRegistry + + # Reset the singleton's failure dict for test isolation by using + # get_instance + manipulating instance state directly. 
+ registry = WinMLEPRegistry.get_instance() + # Inject test EP paths and force registration failure + registry._ep_paths = {"FakeEP": "C:/nonexistent/fake.dll"} + registry._registered_eps = [] + registry._registration_failures = {} + registry._winml_available = True + + fake_ort = type("M", (), {})() + + def _bad_register(name, path): + raise RuntimeError(f"cannot load {path}") + + fake_ort.register_execution_provider_library = _bad_register + + with patch.dict("sys.modules", {"onnxruntime": fake_ort}): + registry.register_to_ort() + + assert "FakeEP" in registry.registration_failures + assert "RuntimeError" in registry.registration_failures["FakeEP"] + # Property returns a copy — mutating it must not corrupt internal state. + snap = registry.registration_failures + snap.clear() + assert "FakeEP" in registry.registration_failures diff --git a/tests/unit/session/test_is_compatible.py b/tests/unit/session/test_is_compatible.py index 39a997aa0..0a8d5c3aa 100644 --- a/tests/unit/session/test_is_compatible.py +++ b/tests/unit/session/test_is_compatible.py @@ -46,7 +46,7 @@ def cpu_session(tmp_path: Path) -> WinMLSession: onnx.save(model, str(model_path)) # Mock EP registry to avoid slow WMI/PowerShell queries on CI - with patch.object(WinMLSession, "_init_winml_eps_once"): + with patch("winml.modelkit.session.ep_registry.ensure_initialized"): return WinMLSession(onnx_path=model_path, device="cpu") diff --git a/tests/unit/session/test_perf_auto_reset.py b/tests/unit/session/test_perf_auto_reset.py new file mode 100644 index 000000000..a7591a094 --- /dev/null +++ b/tests/unit/session/test_perf_auto_reset.py @@ -0,0 +1,70 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Auto-reset behavior: session.perf(monitor=...) 
with options on already-compiled session.""" + +from __future__ import annotations + +import logging + +from tests._helpers import get_minimal_onnx_model_path + + +def test_auto_reset_fires_when_options_contributed(caplog): + """If session is already compiled AND monitor contributes provider_options, + session.perf().__enter__ auto-resets with a WARNING log.""" + from winml.modelkit.session.monitor.ep_monitor import EPMonitor + from winml.modelkit.session.session import WinMLSession + + class _ContributingMonitor(EPMonitor): + @classmethod + def is_available(cls): + return True + + def __enter__(self): + return self + + def __exit__(self, *a): + pass + + def to_dict(self): + return {"ep": "test"} + + def get_provider_options(self): + return {"some_key": "1"} + + session = WinMLSession(get_minimal_onnx_model_path(), device="cpu") + session.compile() + assert session._session is not None + pre_session = session._session + + with caplog.at_level(logging.WARNING), session.perf(monitor=_ContributingMonitor()): + pass + + # NFR-3: the verbatim phrase MUST appear as a substring of the log. + expected = "auto-resetting compiled session to apply monitor session/provider options" + warnings = [r.message for r in caplog.records if r.levelname == "WARNING"] + assert any(expected in m for m in warnings), ( + f"NFR-3 verbatim phrase not in WARNING records. 
expected substring: " + f"{expected!r}; got: {warnings}" + ) + # Old session object was dropped + assert session._session is None or session._session is not pre_session + + +def test_no_auto_reset_when_monitor_empty(): + """If monitor contributes NO options, no reset occurs.""" + from winml.modelkit.session.monitor.ep_monitor import NullEPMonitor + from winml.modelkit.session.session import WinMLSession + + session = WinMLSession(get_minimal_onnx_model_path(), device="cpu") + session.compile() + pre_session = session._session + assert pre_session is not None + + with session.perf(monitor=NullEPMonitor()): + pass + + # Session should NOT have been reset + assert session._session is pre_session diff --git a/tests/unit/session/test_perf_monitor_integration.py b/tests/unit/session/test_perf_monitor_integration.py new file mode 100644 index 000000000..50cbc9eee --- /dev/null +++ b/tests/unit/session/test_perf_monitor_integration.py @@ -0,0 +1,185 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Integration tests for WinMLSession.perf(monitor=...) — teardown ordering, +auto-reset, session/provider option merging, exception transparency. + +This file grows across multiple tasks (7, 8). 
+""" + +from __future__ import annotations + +import numpy as np +import onnxruntime as ort +import pytest + +from tests._helpers import get_minimal_onnx_model_path + + +def test_active_session_option_entries_applied_in_build(): + """_build_session_options applies monitor-contributed entries on the returned SessionOptions.""" + from winml.modelkit.session.session import WinMLSession + + # Construct without going through __init__ to avoid file I/O + session = WinMLSession.__new__(WinMLSession) + session._device = "cpu" + session._ep = None + session._session_options = ort.SessionOptions() + session._provider_options = {} + session._active_session_option_entries = { + "session.disable_cpu_ep_fallback": "1", + } + + opts = session._build_session_options("cpu") + # ORT doesn't expose a clean read-back API for session config entries, + # but the call should not raise and should return a SessionOptions + assert isinstance(opts, ort.SessionOptions) + + +def test_active_session_option_entries_default_empty(): + """Newly-constructed WinMLSession has empty _active_session_option_entries.""" + from winml.modelkit.session.session import WinMLSession + + session = WinMLSession.__new__(WinMLSession) + # Simulate post-__init__ state without file I/O + session._active_session_option_entries = {} # from __init__ + assert session._active_session_option_entries == {} + + +def test_perf_monitor_none_yields_perfcontext_with_null_monitor(): + """perf() with no monitor yields PerfContext whose monitor is NullEPMonitor.""" + from winml.modelkit.session.monitor.ep_monitor import NullEPMonitor + from winml.modelkit.session.session import PerfContext, WinMLSession + + session = WinMLSession(get_minimal_onnx_model_path(), device="cpu") + with session.perf(warmup=0) as ctx: + assert isinstance(ctx, PerfContext) + assert isinstance(ctx.monitor, NullEPMonitor) + # ctx.stats must be the PerfStats instance + assert ctx.stats is not None + + +def test_nested_perf_raises(): + """Entering perf() 
while another is active raises RuntimeError.""" + from winml.modelkit.session.session import WinMLSession + + session = WinMLSession(get_minimal_onnx_model_path(), device="cpu") + with session.perf(), pytest.raises(RuntimeError, match="already active"), session.perf(): + pass + + +def test_teardown_ordering_reset_before_monitor_exit(): + """For monitor.requires_session_teardown=True, self.reset() fires BEFORE monitor.__exit__.""" + from winml.modelkit.session.monitor.ep_monitor import EPMonitor + from winml.modelkit.session.session import WinMLSession + + observations: dict = {} + + class _TeardownMonitor(EPMonitor): + requires_session_teardown = True + + def __init__(self): + self.session_ref = None + + @classmethod + def is_available(cls): + return True + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # At this point, session.reset() should have fired → self.session_ref._session is None + if self.session_ref is not None: + observations["session_at_exit"] = self.session_ref._session + + def to_dict(self): + return {"ep": "test"} + + session = WinMLSession(get_minimal_onnx_model_path(), device="cpu") + mon = _TeardownMonitor() + mon.session_ref = session + + with session.perf(monitor=mon): + # Force compile so reset has something to tear down + session.run({"input": np.zeros((1, 4), dtype=np.float32)}) + + # After perf exit, session._session should be None (reset happened) + assert session._session is None + # And the observation captured by monitor.__exit__ should also be None + # (meaning reset fired before __exit__) + assert observations.get("session_at_exit") is None + + +def test_exception_transparency(): + """Exception in `with session.perf()` body propagates; monitor.__exit__ sees exc_info.""" + from winml.modelkit.session.monitor.ep_monitor import EPMonitor + from winml.modelkit.session.session import WinMLSession + + captured: dict = {} + + class _CapturingMonitor(EPMonitor): + @classmethod + def 
is_available(cls): + return True + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + captured["exc_type"] = exc_type + + def to_dict(self): + return {"ep": "test"} + + session = WinMLSession(get_minimal_onnx_model_path(), device="cpu") + mon = _CapturingMonitor() + + with pytest.raises(ValueError, match="boom"), session.perf(monitor=mon): + raise ValueError("boom") + + assert captured.get("exc_type") is ValueError + + +def test_monitor_enter_raises_leaves_session_clean(): + """If mon.__enter__() raises, session state is not polluted. + + Regression guard: an earlier version mutated _perf_stats and _provider_options + before mon.__enter__(), so an __enter__ exception left the session stuck + (nested-perf error on every subsequent perf() call). + """ + from winml.modelkit.session.monitor.ep_monitor import EPMonitor + from winml.modelkit.session.session import WinMLSession + + class _RaisingEnterMonitor(EPMonitor): + @classmethod + def is_available(cls): + return True + + def __enter__(self): + raise RuntimeError("simulated __enter__ failure") + + def __exit__(self, *a): + pass + + def to_dict(self): + return {"ep": "test"} + + def get_provider_options(self): + return {"some_key": "1"} + + session = WinMLSession(get_minimal_onnx_model_path(), device="cpu") + + mon = _RaisingEnterMonitor() + with pytest.raises(RuntimeError, match="simulated"), session.perf(monitor=mon): + pass # never reached + + # Session state must be fully restored + assert session._perf_stats is None + assert session._active_session_option_entries == {} + assert session._provider_options == {} + + # Subsequent perf() MUST work (no stuck state) + with session.perf() as ctx: + assert ctx is not None diff --git a/tests/unit/session/test_qairt_session.py b/tests/unit/session/test_qairt_session.py index 0d5631d13..7aa7d8948 100644 --- a/tests/unit/session/test_qairt_session.py +++ b/tests/unit/session/test_qairt_session.py @@ -22,7 +22,7 @@ 
@pytest.fixture(autouse=True) def mock_ep_registration(): """Prevent WinML EP registration from loading native DLLs.""" - with patch("winml.modelkit.session.session.WinMLSession._init_winml_eps_once"): + with patch("winml.modelkit.session.ep_registry.ensure_initialized"): yield diff --git a/tests/unit/session/test_winml_session.py b/tests/unit/session/test_winml_session.py index a0cc54705..917e67c22 100644 --- a/tests/unit/session/test_winml_session.py +++ b/tests/unit/session/test_winml_session.py @@ -548,7 +548,8 @@ def test_perf_context_manager_returns_stats( device="cpu", ) - with session.perf() as stats: + with session.perf() as ctx: + stats = ctx.stats assert stats is not None assert isinstance(stats, PerfStats) assert stats.count == 0 @@ -564,10 +565,11 @@ def test_perf_records_samples( device="cpu", ) - with session.perf() as stats: + with session.perf() as ctx: for _ in range(5): session.run(sample_input) + stats = ctx.stats assert stats.count == 5 assert len(stats.samples_ms) == 5 assert all(t > 0 for t in stats.samples_ms) @@ -583,10 +585,11 @@ def test_perf_stats_computed_correctly( device="cpu", ) - with session.perf() as stats: + with session.perf() as ctx: for _ in range(10): session.run(sample_input) + stats = ctx.stats assert stats.count == 10 assert stats.total_ms > 0 assert stats.mean_ms > 0 @@ -607,10 +610,11 @@ def test_perf_warmup_excludes_samples( device="cpu", ) - with session.perf(warmup=3) as stats: + with session.perf(warmup=3) as ctx: for _ in range(10): session.run(sample_input) + stats = ctx.stats # 10 total, 3 warmup = 7 effective assert stats.total_count == 10 assert stats.count == 7 @@ -628,8 +632,9 @@ def test_perf_disabled_after_context( device="cpu", ) - with session.perf() as stats: + with session.perf() as ctx: session.run(sample_input) + stats = ctx.stats assert stats.count == 1 # After context, perf_stats should be None @@ -672,10 +677,11 @@ def test_perf_stats_accessible_after_context( device="cpu", ) - with 
session.perf(warmup=2) as stats: + with session.perf(warmup=2) as ctx: for _ in range(5): session.run(sample_input) + stats = ctx.stats # Stats still accessible after context assert stats.count == 3 # 5 - 2 warmup assert stats.mean_ms > 0