From d8622c53e887d4bb10dd0c6b42804f36a905f4a0 Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Tue, 2 Jun 2026 16:25:05 -0700
Subject: [PATCH 01/11] Align generate-model-package CLI with onnxruntime-genai
 v4 schema

Update metadata.json to inline EP info (single EP per variant) with
schema_version and component_name; rename compatibility list to a single
compatibility_string passthrough. Emit genai_config_overlay.json carrying
per-variant session_options/provider_options as an RFC 7386 merge patch keyed
by the genai role resolved from the base genai_config. Add package_name,
package_version and configs_dir to manifest.json.

The v4 format removes variant.json and has no cross-variant weight-sharing
mechanism, so drop variant.json emission and shared_weights deduplication:
each variant directory now keeps its ONNX file and external-data blobs inline
so stock ORT can load it directly.
---
 olive/cli/model_package.py     | 292 ++++++++++++++++++++-------------
 test/cli/test_model_package.py | 148 ++++++++++-------
 2 files changed, 267 insertions(+), 173 deletions(-)
diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index 5513c2d71..a638f0dd0 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -19,23 +19,25 @@
     │   └── <consumer-shared assets>           # tokenizer, genai_config, ...
     └── <component>/
         ├── metadata.json
-        ├── shared_weights/
-        │   └── <sha256>/<blob>                # opt-in cross-variant dedup
         └── <variant>/
-            ├── variant.json
+            ├── genai_config_overlay.json      # optional: per-variant runtime fields
             ├── model.onnx
-            └── ...
+            └── ...                            # external-data blobs (inline)
 
 Notes:
-- ``shared_weights`` is opt-in per blob. A blob whose SHA-256 appears in only
-  one variant stays inline next to its ONNX file in the variant directory,
-  keeping the single-variant case loadable by stock ORT.
-- Cross-variant dedup moves a duplicated blob to
-  ``<component>/shared_weights/<sha256>/<basename>`` and records the mapping
-  in the per-file ``shared_files`` map of the variant's ``variant.json``.
-  Loading such a variant requires a model-package-aware consumer.
-- ``genai_config.json`` is copied verbatim into ``<output>/configs/``;
-  per-variant overlays are ORT-GenAI's responsibility, not Olive's.
+- ``metadata.json`` is selection-only. Each variant declares a single
+  execution provider inline (``ep``) plus optional ``device`` and opaque
+  ``compatibility_string`` (ORT v4 schema).
+- Each variant directory is self-contained: the ONNX file and any external-data
+  blobs it references are copied inline so stock ORT can load it directly. The
+  ORT v4 package format has no cross-variant weight-sharing mechanism
+  (``variant.json`` was removed), so blobs are never deduplicated across
+  variants.
+- ``genai_config.json`` is copied verbatim into ``<output>/configs/``. Under
+  the v4 schema per-variant runtime fields (``session_options``,
+  ``provider_options``) are expressed as a per-variant
+  ``genai_config_overlay.json`` (an RFC 7386 JSON Merge Patch applied on top of
+  ``configs/genai_config.json``).
 
 """
 
@@ -62,8 +64,30 @@
 # rather than under <package>/configs/.
 _MODEL_SUFFIXES = {".onnx", ".bin", ".data", ".xml"}
 
-# Schema version emitted in manifest.json. Keep in sync with the proposal.
+# Schema versions emitted in the package JSON files. Keep in sync with the
+# ORT v4 model-package schema.
 _MANIFEST_SCHEMA_VERSION = 1
+_METADATA_SCHEMA_VERSION = 1
+
+# Directory under the package root that holds consumer-shared config assets
+# (genai_config base, tokenizer, processor configs, chat templates).
+_CONFIGS_DIR = "configs"
+
+# Map canonical ONNX Runtime EP names to the short provider aliases used inside
+# genai_config.json's ``session_options.provider_options`` list. Mirrors the
+# aliases ORT-GenAI accepts (see ORT-GenAI src/config.cpp provider dispatch).
+_EP_TO_GENAI: dict[str, str] = {
+    "CPUExecutionProvider": "cpu",
+    "CUDAExecutionProvider": "cuda",
+    "DmlExecutionProvider": "dml",
+    "WebGpuExecutionProvider": "webgpu",
+    "JsExecutionProvider": "web",
+    "QNNExecutionProvider": "qnn",
+    "OpenVINOExecutionProvider": "openvino",
+    "ROCMExecutionProvider": "rocm",
+    "TensorrtExecutionProvider": "tensorrt",
+    "NvTensorRTRTXExecutionProvider": "NvTensorRtRtx",
+}
 
 # Hash chunk size for SHA-256 over external-data blobs.
 _HASH_CHUNK = 1024 * 1024
@@ -175,6 +199,8 @@ def run(self):
             variants=variants,
             config_files=config_files,
             producer_info=producer_info,
+            package_name=self.args.model_name or output_dir.name,
+            package_version=self.args.model_version,
         )
 
         logger.info("Model package generated at %s", output_dir)
@@ -191,7 +217,7 @@ def _build_single_variants(self, targets: list[tuple[str, Path, dict]]) -> list[
         for target_name, _src, model_config in targets:
             attrs = _get_model_attributes(model_config)
             onnx_path = _resolve_onnx_path(model_config)
-            ep, device, compatibility = _ep_device_compatibility(attrs, onnx_path)
+            ep, device, compatibility_string = _ep_device_compatibility(attrs, onnx_path)
             variants.append(
                 VariantSpec(
                     component_name=component_name,
@@ -199,7 +225,7 @@ def _build_single_variants(self, targets: list[tuple[str, Path, dict]]) -> list[
                     onnx_files=[onnx_path],
                     ep=ep,
                     device=device,
-                    compatibility=compatibility,
+                    compatibility_string=compatibility_string,
                     inference_settings=model_config.get("config", {}).get("inference_settings") or {},
                 )
             )
@@ -228,7 +254,7 @@ def _build_composite_variants(self, targets: list[tuple[str, Path, dict]]) -> li
                 comp_attrs.update(_get_model_attributes(comp_config))
 
                 onnx_path = _resolve_onnx_path(comp_config)
-                ep, device, compatibility = _ep_device_compatibility(comp_attrs, onnx_path)
+                ep, device, compatibility_string = _ep_device_compatibility(comp_attrs, onnx_path)
 
                 spec = VariantSpec(
                     component_name=comp_name,
@@ -236,7 +262,7 @@ def _build_composite_variants(self, targets: list[tuple[str, Path, dict]]) -> li
                     onnx_files=[onnx_path],
                     ep=ep,
                     device=device,
-                    compatibility=compatibility,
+                    compatibility_string=compatibility_string,
                     inference_settings=comp_inference,
                 )
                 component_variants.setdefault(comp_name, []).append(spec)
@@ -355,7 +381,7 @@ class VariantSpec:
     onnx_files: list[Path]
     ep: str
     device: Optional[str] = None
-    compatibility: list[str] = field(default_factory=list)
+    compatibility_string: Optional[str] = None
     inference_settings: dict[str, Any] = field(default_factory=dict)
     consumer_metadata: Optional[dict[str, Any]] = None
 
@@ -365,6 +391,8 @@ def write_model_package(
     variants: list[VariantSpec],
     config_files: Optional[dict[str, Path]] = None,
     producer_info: Optional[dict[str, Any]] = None,
+    package_name: Optional[str] = None,
+    package_version: str = "1.0",
 ) -> None:
     """Materialize a model package on disk.
 
@@ -378,9 +406,11 @@ def write_model_package(
         different sources should be byte-identical; the first wins on
         conflict and a warning is logged.
     :param producer_info: Olive-specific provenance recorded under
-        ``manifest.producer``. Schema-tolerated extra field (the proposal
-        defines only ``schema_version``, ``components``, and
-        ``merge_provenance``; producers may add namespaced extras).
+        ``manifest.producer``. Schema-tolerated extra field; producers may add
+        namespaced extras.
+    :param package_name: Name recorded under ``manifest.package_name``.
+        Defaults to the output directory name.
+    :param package_version: Version recorded under ``manifest.package_version``.
     """
     if not variants:
         raise ValueError("write_model_package requires at least one variant.")
@@ -389,6 +419,12 @@ def write_model_package(
     _ensure_empty_output_dir(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    # Map each package component to the genai_config role that references it, so
+    # per-variant overlays patch the right ``model.<role>`` block. Roles can be
+    # named differently from components, so we resolve via the base config's
+    # ``model.<role>.component`` pointers and fall back to the component name.
+    component_to_role = _resolve_component_roles(config_files)
+
     # Group by component while preserving insertion order.
     components: dict[str, list[VariantSpec]] = {}
     for v in variants:
@@ -409,31 +445,36 @@ def write_model_package(
             seen.add(v.variant_name)
 
     for comp_name, comp_variants in components.items():
-        _write_component(output_dir, comp_name, comp_variants)
+        _write_component(output_dir, comp_name, comp_variants, component_to_role.get(comp_name, comp_name))
 
     if config_files:
         _copy_config_files(output_dir, config_files)
 
-    _write_manifest(output_dir, list(components.keys()), producer_info)
+    _write_manifest(
+        output_dir, list(components.keys()), producer_info, package_name or output_dir.name, package_version
+    )
 
 
-def _write_component(output_dir: Path, component_name: str, comp_variants: list[VariantSpec]) -> None:
+def _write_component(
+    output_dir: Path,
+    component_name: str,
+    comp_variants: list[VariantSpec],
+    component_role: str,
+) -> None:
     component_dir = output_dir / component_name
     component_dir.mkdir(parents=True, exist_ok=True)
 
-    # First pass: copy each variant's ONNX file(s) and discover external-data
-    # references. We hash blobs as we copy so multi-variant packages don't
-    # re-read the data later.
-    blob_index: dict[str, dict[str, Any]] = {}
-    variant_files: dict[str, list[tuple[str, list[tuple[str, str]]]]] = {}
-
+    # Copy each variant's ONNX file(s) along with any external-data blobs they
+    # reference, keeping everything inline in the variant directory so each
+    # variant is self-contained and loadable by stock ORT. The ORT v4 package
+    # format has no cross-variant weight-sharing mechanism (variant.json was
+    # removed), so we never deduplicate blobs across variants.
     for v in comp_variants:
         if not v.onnx_files:
             raise ValueError(f"Variant '{v.variant_name}' under component '{component_name}' has no ONNX files.")
 
         variant_dir = component_dir / v.variant_name
         variant_dir.mkdir(parents=True, exist_ok=True)
-        files_for_variant: list[tuple[str, list[tuple[str, str]]]] = []
 
         for onnx_src in v.onnx_files:
             onnx_src_path = Path(onnx_src)
@@ -445,7 +486,6 @@ def _write_component(output_dir: Path, component_name: str, comp_variants: list[
 
             ext_refs = _discover_external_data(onnx_src_path)
             external_root = onnx_src_path.parent.resolve()
-            blob_records: list[tuple[str, str]] = []
             for graph_location in ext_refs:
                 blob_src = (onnx_src_path.parent / graph_location).resolve()
                 if not blob_src.is_relative_to(external_root):
@@ -469,94 +509,115 @@ def _write_component(output_dir: Path, component_name: str, comp_variants: list[
                 if not blob_dst.exists():
                     shutil.copy2(str(blob_src), str(blob_dst))
 
-                sha = _sha256_file(blob_dst)
-                blob_records.append((graph_location, sha))
-
-                entry = blob_index.setdefault(
-                    sha, {"first_path": blob_dst, "occurrences": 0, "basename": Path(graph_location).name}
-                )
-                entry["occurrences"] += 1
+        # Per-variant runtime fields flow through genai_config_overlay.json.
+        _write_genai_config_overlay(variant_dir, component_role, v)
 
-            files_for_variant.append((onnx_dst.name, blob_records))
+    _write_metadata(component_dir, component_name, comp_variants)
 
-        variant_files[v.variant_name] = files_for_variant
 
-    # Second pass: dedup any blob that appears in 2+ variants of this
-    # component into <component>/shared_weights/<sha>/<basename>. Single-
-    # occurrence blobs stay inline so single-variant packages remain
-    # loadable without the package API.
-    shared_weights_dir = component_dir / "shared_weights"
-    shared_blob_paths: dict[str, Path] = {}
-    for sha, entry in blob_index.items():
-        if entry["occurrences"] < 2:
-            continue
-        sha_dir = shared_weights_dir / sha
-        sha_dir.mkdir(parents=True, exist_ok=True)
-        target = sha_dir / entry["basename"]
-        if not target.exists():
-            shutil.copy2(str(entry["first_path"]), str(target))
-        shared_blob_paths[sha] = target
-
-    # Third pass: for each variant, remove deduped blobs from the variant
-    # directory and emit variant.json with the right shared_files map per
-    # files[i]. Then emit metadata.json for the component.
-    for v in comp_variants:
-        variant_dir = component_dir / v.variant_name
-        files_payload: list[dict[str, Any]] = []
-        for onnx_filename, blob_records in variant_files[v.variant_name]:
-            shared_files: dict[str, str] = {}
-            for graph_location, sha in blob_records:
-                if sha in shared_blob_paths:
-                    inline = variant_dir / graph_location
-                    if inline.exists():
-                        inline.unlink()
-                        # Clean up any now-empty parent directories created for
-                        # nested graph_location paths, but stop at variant_dir.
-                        parent = inline.parent
-                        while parent != variant_dir and parent.is_dir() and not any(parent.iterdir()):
-                            parent.rmdir()
-                            parent = parent.parent
-                    shared_files[graph_location] = sha
-
-            file_entry: dict[str, Any] = {"filename": onnx_filename}
-            so = (v.inference_settings or {}).get("session_options") or {}
-            po = _provider_options_for_ep(v.inference_settings or {}, v.ep)
-            if so:
-                file_entry["session_options"] = so
-            if po:
-                file_entry["provider_options"] = po
-            if shared_files:
-                file_entry["shared_files"] = shared_files
-            files_payload.append(file_entry)
-
-        variant_payload: dict[str, Any] = {"files": files_payload}
-        if v.consumer_metadata is not None:
-            variant_payload["consumer_metadata"] = v.consumer_metadata
-        _write_json(variant_dir / "variant.json", variant_payload)
-
-    _write_metadata(component_dir, comp_variants)
-
-
-def _write_metadata(component_dir: Path, comp_variants: list[VariantSpec]) -> None:
+def _write_metadata(component_dir: Path, component_name: str, comp_variants: list[VariantSpec]) -> None:
     variants_payload: dict[str, Any] = {}
     for v in comp_variants:
-        ep_entry: dict[str, Any] = {"ep": v.ep}
+        # ORT v4: EP fields are inline on the variant object; a variant targets
+        # a single execution provider.
+        variant_obj: dict[str, Any] = {"ep": v.ep}
         if v.device:
-            ep_entry["device"] = v.device
-        if v.compatibility:
-            ep_entry["compatibility"] = list(v.compatibility)
-        variants_payload[v.variant_name] = {"ep_compatibility": [ep_entry]}
-    _write_json(component_dir / "metadata.json", {"variants": variants_payload})
+            variant_obj["device"] = v.device
+        if v.compatibility_string:
+            variant_obj["compatibility_string"] = v.compatibility_string
+        variants_payload[v.variant_name] = variant_obj
+    _write_json(
+        component_dir / "metadata.json",
+        {
+            "schema_version": _METADATA_SCHEMA_VERSION,
+            "component_name": component_name,
+            "variants": variants_payload,
+        },
+    )
+
+
+def _genai_provider_name(ep: str) -> str:
+    """Return the genai_config provider alias for a canonical ORT EP name."""
+    suffix = "ExecutionProvider"
+    # Unmapped EPs: best-effort alias is the name minus the suffix, lowercased.
+    fallback = ep[: -len(suffix)].lower() if ep.endswith(suffix) else ep
+    return _EP_TO_GENAI.get(ep, fallback)
+
+
+def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: VariantSpec) -> None:
+    """Emit a per-variant ``genai_config_overlay.json`` (RFC 7386 merge patch).
+
+    Under the ORT v4 schema per-variant runtime fields are no longer read from
+    variant.json; they flow through a JSON Merge Patch applied on top of the
+    package's base ``configs/genai_config.json``. We express the variant's
+    ``session_options`` and EP-scoped ``provider_options`` under the role that
+    references this component (``model.<role>.session_options``).
+    """
+    inference = v.inference_settings or {}
+    session_options: dict[str, Any] = dict(inference.get("session_options") or {})
+    # Normalise an empty/missing result to {} so the list never serialises a null.
+    provider_options = _provider_options_for_ep(inference, v.ep) or {}
+
+    # Non-CPU variants always declare their provider: the base config copied into
+    # configs/ comes from a single source and may target a different EP. CPU is
+    # ORT-GenAI's default, so it is declared only when it carries real options.
+    if v.ep != "CPUExecutionProvider" or provider_options:
+        session_options["provider_options"] = [{_genai_provider_name(v.ep): provider_options}]
+
+    if not session_options:
+        return
+
+    overlay = {"model": {component_role: {"session_options": session_options}}}
+    _write_json(variant_dir / "genai_config_overlay.json", overlay)
+
+
+def _resolve_component_roles(config_files: Optional[dict[str, Path]]) -> dict[str, str]:
+    """Map each package component to the genai_config role that references it.
+
+    The base ``genai_config.json`` declares roles under ``model.<role>`` and
+    each role names the package component it loads via a ``component`` field.
+    Per-variant overlays must patch ``model.<role>``, which can differ from the
+    component name, so we invert that mapping here. Returns an empty map when no
+    usable base config is available (callers fall back to the component name).
+    """
+    src = (config_files or {}).get("genai_config.json")
+    if src is None:
+        return {}
+    try:
+        with Path(src).open(encoding="utf-8") as fh:
+            config = json.load(fh)
+    except Exception:
+        # Best-effort: an unreadable or malformed base config only disables role mapping.
+        logger.debug("Could not read genai_config.json from %s for role mapping.", src, exc_info=True)
+        return {}
+
+    # Valid JSON need not be an object; treat any other top-level type as "no roles".
+    model_block = config.get("model") if isinstance(config, dict) else None
+    if not isinstance(model_block, dict):
+        return {}
+
+    component_to_role: dict[str, str] = {}
+    for role, role_block in model_block.items():
+        if isinstance(role_block, dict):
+            component = role_block.get("component")
+            if isinstance(component, str) and component and component not in component_to_role:
+                component_to_role[component] = role
+    return component_to_role
 
 
 def _write_manifest(
     output_dir: Path,
     components: list[str],
     producer_info: Optional[dict[str, Any]],
+    package_name: str,
+    package_version: str,
 ) -> None:
     manifest: dict[str, Any] = {
         "schema_version": _MANIFEST_SCHEMA_VERSION,
+        "package_name": package_name,
+        "package_version": package_version,
         "components": components,
+        "configs_dir": _CONFIGS_DIR,
     }
     if producer_info:
         # Olive-specific provenance under a namespaced key so future schema
@@ -571,7 +632,7 @@ def _write_manifest(
 
 
 def _copy_config_files(output_dir: Path, config_files: dict[str, Path]) -> None:
-    configs_dir = output_dir / "configs"
+    configs_dir = output_dir / _CONFIGS_DIR
     configs_dir.mkdir(parents=True, exist_ok=True)
     configs_root = configs_dir.resolve()
     for name, src in config_files.items():
@@ -589,8 +650,8 @@ def _copy_config_files(output_dir: Path, config_files: dict[str, Path]) -> None:
             if not _paths_equal(src_path, dest):
                 logger.warning(
                     "configs/%s already present and differs from %s; keeping the existing copy. "
-                    "Per-variant config differences belong in variant.json's consumer_metadata, "
-                    "which is consumer-defined and out of Olive's scope.",
+                    "Per-variant config differences belong in genai_config_overlay.json, "
+                    "not in the shared configs/ directory.",
                     name,
                     src_path,
                 )
@@ -813,12 +874,19 @@ def _resolve_onnx_path(model_config: dict) -> Path:
     raise FileNotFoundError(f"model_path does not exist: {p}")
 
 
-def _ep_device_compatibility(attrs: dict, onnx_path: Path) -> tuple[str, Optional[str], list[str]]:
-    """Extract (ep, device, compatibility[]) for one variant from Olive metadata."""
+def _ep_device_compatibility(attrs: dict, onnx_path: Path) -> tuple[str, Optional[str], Optional[str]]:
+    """Extract (ep, device, compatibility_string) for one variant from Olive metadata.
+
+    Under the ORT v4 schema each variant declares a single opaque
+    ``compatibility_string``. Olive stores the EP-side preference as a
+    comma-delimited string in the ONNX metadata prop ``ep_compatibility_info.<EP>``;
+    it is passed through with only surrounding whitespace trimmed (ORT does not interpret the encoding).
+    """
     ep = attrs.get("ep") or "CPUExecutionProvider"
     device = attrs.get("device") or None
-    compatibility = parse_compatibility_strings(_extract_ep_compatibility_from_onnx(onnx_path, ep))
-    return ep, device, compatibility
+    raw = _extract_ep_compatibility_from_onnx(onnx_path, ep)
+    compatibility_string = raw.strip() if raw and raw.strip() else None
+    return ep, device, compatibility_string
 
 
 def _extract_ep_compatibility_from_onnx(model_path: Path, ep: str = "") -> Optional[str]:
diff --git a/test/cli/test_model_package.py b/test/cli/test_model_package.py
index 3f2549634..a893216b8 100644
--- a/test/cli/test_model_package.py
+++ b/test/cli/test_model_package.py
@@ -208,17 +208,18 @@ def test_writes_proposal_layout(self, tmp_path):
         assert manifest["producer"]["model_name"] == "test_model"
         assert manifest["producer"]["model_version"] == "2.0"
 
-        # metadata uses ep_compatibility[]
+        # metadata uses inline EP (v4 schema)
         metadata = json.loads((out / "model" / "metadata.json").read_text())
+        assert metadata["schema_version"] == 1
+        assert metadata["component_name"] == "model"
         assert set(metadata["variants"]) == {"soc_60", "soc_73"}
         for variant_payload in metadata["variants"].values():
-            ep_compat = variant_payload["ep_compatibility"]
-            assert ep_compat == [{"ep": "QNNExecutionProvider", "device": "NPU"}]
+            assert variant_payload == {"ep": "QNNExecutionProvider", "device": "NPU"}
 
-        # variant.json contains files[] with filename
+        # variant.json is never emitted under v4; the ONNX file still lands in
+        # the variant directory.
         for v in ("soc_60", "soc_73"):
-            variant_json = json.loads((out / "model" / v / "variant.json").read_text())
-            assert variant_json["files"][0]["filename"] == "model.onnx"
+            assert not (out / "model" / v / "variant.json").exists()
             assert (out / "model" / v / "model.onnx").is_file()
 
 
@@ -234,13 +235,13 @@ def test_single_source_is_valid_package(self, tmp_path):
         assert manifest["components"] == ["model"]
         metadata = json.loads((out / "model" / "metadata.json").read_text())
         assert "cpu_x64" in metadata["variants"]
-        assert metadata["variants"]["cpu_x64"]["ep_compatibility"] == [{"ep": "CPUExecutionProvider"}]
+        assert metadata["variants"]["cpu_x64"] == {"ep": "CPUExecutionProvider"}
         # No shared_weights because nothing to dedup.
         assert not (out / "model" / "shared_weights").exists()
 
 
 # ---------------------------------------------------------------------------
-# Writer: layout + manifest + metadata + variant.json
+# Writer: layout + manifest + metadata
 # ---------------------------------------------------------------------------
 
 
@@ -265,7 +266,8 @@ def test_writes_proposal_shape_for_single_variant(self, tmp_path):
 
         assert (out / "manifest.json").is_file()
         assert (out / "decoder" / "metadata.json").is_file()
-        assert (out / "decoder" / "cpu" / "variant.json").is_file()
+        # variant.json is never emitted under v4.
+        assert not (out / "decoder" / "cpu" / "variant.json").exists()
         assert (out / "decoder" / "cpu" / "model.onnx").is_file()
         assert not (out / "models").exists()
 
@@ -289,6 +291,9 @@ def test_manifest_uses_proposal_schema(self, tmp_path):
         manifest = json.loads((out / "manifest.json").read_text())
         assert manifest["schema_version"] == 1
         assert manifest["components"] == ["decoder"]
+        assert manifest["package_name"] == "package"
+        assert manifest["package_version"] == "1.0"
+        assert manifest["configs_dir"] == "configs"
         assert manifest["producer"] == {
             "tool": "olive-ai",
             "tool_version": "1.2.3",
@@ -299,7 +304,7 @@ def test_manifest_uses_proposal_schema(self, tmp_path):
         assert "component_models" not in manifest
         assert "model_version" not in manifest
 
-    def test_metadata_uses_ep_compatibility_array(self, tmp_path):
+    def test_metadata_uses_inline_ep(self, tmp_path):
         onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
         out = tmp_path / "package"
 
@@ -312,14 +317,19 @@ def test_metadata_uses_ep_compatibility_array(self, tmp_path):
                     onnx_files=[onnx_path],
                     ep="QNNExecutionProvider",
                     device="NPU",
-                    compatibility=["soc_60", "soc_69"],
+                    compatibility_string="soc_60,soc_69",
                 )
             ],
         )
 
         metadata = json.loads((out / "decoder" / "metadata.json").read_text())
-        ep_compat = metadata["variants"]["qnn-npu"]["ep_compatibility"]
-        assert ep_compat == [{"ep": "QNNExecutionProvider", "device": "NPU", "compatibility": ["soc_60", "soc_69"]}]
+        assert metadata["schema_version"] == 1
+        assert metadata["component_name"] == "decoder"
+        assert metadata["variants"]["qnn-npu"] == {
+            "ep": "QNNExecutionProvider",
+            "device": "NPU",
+            "compatibility_string": "soc_60,soc_69",
+        }
         assert "model_variants" not in metadata
 
     def test_metadata_omits_optional_fields_when_unset(self, tmp_path):
@@ -339,17 +349,16 @@ def test_metadata_omits_optional_fields_when_unset(self, tmp_path):
         )
 
         metadata = json.loads((out / "decoder" / "metadata.json").read_text())
-        ep_compat = metadata["variants"]["cpu"]["ep_compatibility"][0]
-        assert ep_compat == {"ep": "CPUExecutionProvider"}
+        assert metadata["variants"]["cpu"] == {"ep": "CPUExecutionProvider"}
 
-    def test_variant_json_carries_session_and_provider_options(self, tmp_path):
+    def test_overlay_carries_session_and_provider_options(self, tmp_path):
         onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
         out = tmp_path / "package"
 
         inference = {
             "session_options": {"graph_optimization_level": 3},
-            "execution_provider": ["CPUExecutionProvider"],
-            "provider_options": [{"intra_op_num_threads": 4}],
+            "execution_provider": ["CUDAExecutionProvider"],
+            "provider_options": [{"device_id": "0"}],
         }
 
         write_model_package(
@@ -357,24 +366,29 @@ def test_variant_json_carries_session_and_provider_options(self, tmp_path):
             variants=[
                 VariantSpec(
                     component_name="decoder",
-                    variant_name="cpu",
+                    variant_name="cuda",
                     onnx_files=[onnx_path],
-                    ep="CPUExecutionProvider",
+                    ep="CUDAExecutionProvider",
                     inference_settings=inference,
                 )
             ],
         )
 
-        variant = json.loads((out / "decoder" / "cpu" / "variant.json").read_text())
-        assert variant["files"] == [
-            {
-                "filename": "model.onnx",
-                "session_options": {"graph_optimization_level": 3},
-                "provider_options": {"intra_op_num_threads": 4},
+        # Runtime fields go to genai_config_overlay.json, not variant.json.
+        assert not (out / "decoder" / "cuda" / "variant.json").exists()
+        overlay = json.loads((out / "decoder" / "cuda" / "genai_config_overlay.json").read_text())
+        assert overlay == {
+            "model": {
+                "decoder": {
+                    "session_options": {
+                        "graph_optimization_level": 3,
+                        "provider_options": [{"cuda": {"device_id": "0"}}],
+                    }
+                }
             }
-        ]
+        }
 
-    def test_provider_options_match_ep_by_name(self, tmp_path):
+    def test_overlay_provider_options_match_ep_by_name(self, tmp_path):
         """When inference_settings has multiple EPs, pick the one whose name matches VariantSpec.ep."""
         onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
         out = tmp_path / "package"
@@ -398,18 +412,37 @@ def test_provider_options_match_ep_by_name(self, tmp_path):
             ],
         )
 
-        variant = json.loads((out / "decoder" / "qnn" / "variant.json").read_text())
-        assert variant["files"][0].get("provider_options") == {"backend_path": "QnnHtp.so"}
-        assert "session_options" not in variant["files"][0]
+        overlay = json.loads((out / "decoder" / "qnn" / "genai_config_overlay.json").read_text())
+        assert overlay["model"]["decoder"]["session_options"]["provider_options"] == [
+            {"qnn": {"backend_path": "QnnHtp.so"}}
+        ]
+
+    def test_overlay_omitted_for_cpu_variant_without_options(self, tmp_path):
+        onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
+        out = tmp_path / "package"
+
+        write_model_package(
+            output_dir=out,
+            variants=[
+                VariantSpec(
+                    component_name="decoder",
+                    variant_name="cpu",
+                    onnx_files=[onnx_path],
+                    ep="CPUExecutionProvider",
+                )
+            ],
+        )
+        # CPU variant without session/provider options: no overlay file is expected.
+        assert not (out / "decoder" / "cpu" / "genai_config_overlay.json").exists()
 
 
 # ---------------------------------------------------------------------------
-# Writer: shared_weights / external-data dedup
+# Writer: external-data blobs are always kept inline per variant (no dedup)
 # ---------------------------------------------------------------------------
 
 
-class TestSharedWeightsDedup:
-    def test_dedups_identical_external_data_across_variants(self, tmp_path):
+class TestExternalDataInline:
+    def test_keeps_identical_external_data_inline_in_each_variant(self, tmp_path):
         blob = b"\x00\x01\x02\x03" * 64
         a = _make_onnx_with_external(tmp_path / "a" / "model.onnx", "model.onnx.data", blob)
         b = _make_onnx_with_external(tmp_path / "b" / "model.onnx", "model.onnx.data", blob)
@@ -433,22 +466,15 @@ def test_dedups_identical_external_data_across_variants(self, tmp_path):
             ],
         )
 
-        shared_root = out / "decoder" / "shared_weights"
-        assert shared_root.is_dir()
-        sha_dirs = list(shared_root.iterdir())
-        assert len(sha_dirs) == 1
-        sha = sha_dirs[0].name
-        assert (shared_root / sha / "model.onnx.data").is_file()
-        assert not (out / "decoder" / "v1" / "model.onnx.data").exists()
-        assert not (out / "decoder" / "v2" / "model.onnx.data").exists()
-
+        # The ORT v4 package format has no cross-variant weight sharing: even
+        # identical blobs stay inline in each variant directory, and neither
+        # shared_weights/ nor variant.json is emitted.
+        assert not (out / "decoder" / "shared_weights").exists()
         for v in ("v1", "v2"):
-            variant = json.loads((out / "decoder" / v / "variant.json").read_text())
-            entry = variant["files"][0]
-            assert entry["filename"] == "model.onnx"
-            assert entry["shared_files"] == {"model.onnx.data": sha}
+            assert (out / "decoder" / v / "model.onnx.data").is_file()
+            assert not (out / "decoder" / v / "variant.json").exists()
 
-    def test_keeps_external_data_inline_when_unique(self, tmp_path):
+    def test_keeps_distinct_external_data_inline_per_variant(self, tmp_path):
         a = _make_onnx_with_external(tmp_path / "a" / "model.onnx", "model.onnx.data", b"a-bytes" * 32)
         b = _make_onnx_with_external(tmp_path / "b" / "model.onnx", "model.onnx.data", b"b-bytes" * 32)
         out = tmp_path / "package"
@@ -475,9 +501,9 @@ def test_keeps_external_data_inline_when_unique(self, tmp_path):
         assert (out / "decoder" / "v1" / "model.onnx.data").is_file()
         assert (out / "decoder" / "v2" / "model.onnx.data").is_file()
 
+        # variant.json is never emitted under the v4 schema.
         for v in ("v1", "v2"):
-            variant = json.loads((out / "decoder" / v / "variant.json").read_text())
-            assert "shared_files" not in variant["files"][0]
+            assert not (out / "decoder" / v / "variant.json").exists()
 
     def test_single_variant_keeps_blob_inline(self, tmp_path):
         onnx_path = _make_onnx_with_external(tmp_path / "src" / "model.onnx", "model.onnx.data", b"x" * 128)
@@ -497,8 +523,8 @@ def test_single_variant_keeps_blob_inline(self, tmp_path):
 
         assert (out / "decoder" / "cpu" / "model.onnx.data").is_file()
         assert not (out / "decoder" / "shared_weights").exists()
-        variant = json.loads((out / "decoder" / "cpu" / "variant.json").read_text())
-        assert "shared_files" not in variant["files"][0]
+        # variant.json is never emitted under the v4 schema.
+        assert not (out / "decoder" / "cpu" / "variant.json").exists()
 
 
 # ---------------------------------------------------------------------------
@@ -711,7 +737,7 @@ def test_isolates_collisions_per_component(self):
 
 
 class TestCompatibilityFromOnnxMetadata:
-    def test_splits_comma_delimited_metadata(self, tmp_path):
+    def test_passes_through_comma_delimited_metadata(self, tmp_path):
         # setup: source with QNNExecutionProvider compat info in ONNX metadata_props
         src = _create_source_dir(
             tmp_path,
@@ -725,11 +751,11 @@ def test_splits_comma_delimited_metadata(self, tmp_path):
         # execute
         cmd.run()
 
-        # assert: compatibility array reflects the comma-split list
+        # assert: compatibility_string passes the raw opaque string through verbatim
         metadata = json.loads((out / "model" / "metadata.json").read_text())
-        ep_compat = metadata["variants"]["soc_60"]["ep_compatibility"][0]
-        assert ep_compat["ep"] == "QNNExecutionProvider"
-        assert ep_compat["compatibility"] == ["soc_60", "soc_69", "soc_73"]
+        variant = metadata["variants"]["soc_60"]
+        assert variant["ep"] == "QNNExecutionProvider"
+        assert variant["compatibility_string"] == "soc_60,soc_69,soc_73"
 
 
 # ---------------------------------------------------------------------------
@@ -796,11 +822,11 @@ def test_per_component_inference_settings_wins(self, tmp_path):
         cmd.run()
 
         # assert: encoder uses target-level, decoder uses component-level
-        encoder_v = json.loads((out / "encoder" / "soc_60" / "variant.json").read_text())
-        assert encoder_v["files"][0]["session_options"] == {"graph_optimization_level": 1}
+        encoder_overlay = json.loads((out / "encoder" / "soc_60" / "genai_config_overlay.json").read_text())
+        assert encoder_overlay["model"]["encoder"]["session_options"]["graph_optimization_level"] == 1
 
-        decoder_v = json.loads((out / "decoder" / "soc_60" / "variant.json").read_text())
-        assert decoder_v["files"][0]["session_options"] == {"graph_optimization_level": 99}
+        decoder_overlay = json.loads((out / "decoder" / "soc_60" / "genai_config_overlay.json").read_text())
+        assert decoder_overlay["model"]["decoder"]["session_options"]["graph_optimization_level"] == 99
 
 
 # ---------------------------------------------------------------------------

From d8c09e4b26f5880b55b8616f75bcfcfad3a879c9 Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Tue, 2 Jun 2026 18:55:36 -0700
Subject: [PATCH 02/11] Drop versioned/migration wording from model-package CLI

Describe the model-package writer as a single current behavior: remove
references to a specific ORT schema version (v4) and to fields or files
that were removed/changed elsewhere (e.g. variant.json), so the docstrings,
comments, and test comments read as one self-contained feature.
---
 olive/cli/model_package.py     | 39 ++++++++++++++--------------------
 test/cli/test_model_package.py | 17 +++++++--------
 2 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index a638f0dd0..a82b81942 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -27,17 +27,13 @@
 Notes:
 - ``metadata.json`` is selection-only. Each variant declares a single
   execution provider inline (``ep``) plus optional ``device`` and opaque
-  ``compatibility_string`` (ORT v4 schema).
+  ``compatibility_string``.
 - Each variant directory is self-contained: the ONNX file and any external-data
-  blobs it references are copied inline so stock ORT can load it directly. The
-  ORT v4 package format has no cross-variant weight-sharing mechanism
-  (``variant.json`` was removed), so blobs are never deduplicated across
-  variants.
-- ``genai_config.json`` is copied verbatim into ``<output>/configs/``. Under
-  the v4 schema per-variant runtime fields (``session_options``,
-  ``provider_options``) are expressed as a per-variant
-  ``genai_config_overlay.json`` (an RFC 7386 JSON Merge Patch applied on top of
-  ``configs/genai_config.json``).
+  blobs it references are copied inline so stock ORT can load it directly.
+- ``genai_config.json`` is copied verbatim into ``<output>/configs/``.
+  Per-variant runtime fields (``session_options``, ``provider_options``) are
+  expressed as a per-variant ``genai_config_overlay.json`` (an RFC 7386 JSON
+  Merge Patch applied on top of ``configs/genai_config.json``).
 
 """
 
@@ -65,7 +61,7 @@
 _MODEL_SUFFIXES = {".onnx", ".bin", ".data", ".xml"}
 
 # Schema versions emitted in the package JSON files. Keep in sync with the
-# ORT v4 model-package schema.
+# ORT model-package schema.
 _MANIFEST_SCHEMA_VERSION = 1
 _METADATA_SCHEMA_VERSION = 1
 
@@ -466,9 +462,7 @@ def _write_component(
 
     # Copy each variant's ONNX file(s) along with any external-data blobs they
     # reference, keeping everything inline in the variant directory so each
-    # variant is self-contained and loadable by stock ORT. The ORT v4 package
-    # format has no cross-variant weight-sharing mechanism (variant.json was
-    # removed), so we never deduplicate blobs across variants.
+    # variant is self-contained and loadable by stock ORT.
     for v in comp_variants:
         if not v.onnx_files:
             raise ValueError(f"Variant '{v.variant_name}' under component '{component_name}' has no ONNX files.")
@@ -518,8 +512,8 @@ def _write_component(
 def _write_metadata(component_dir: Path, component_name: str, comp_variants: list[VariantSpec]) -> None:
     variants_payload: dict[str, Any] = {}
     for v in comp_variants:
-        # ORT v4: EP fields are inline on the variant object; a variant targets
-        # a single execution provider.
+        # EP fields are inline on the variant object; a variant targets a
+        # single execution provider.
         variant_obj: dict[str, Any] = {"ep": v.ep}
         if v.device:
             variant_obj["device"] = v.device
@@ -547,9 +541,8 @@ def _genai_provider_name(ep: str) -> str:
 def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: VariantSpec) -> None:
     """Emit a per-variant ``genai_config_overlay.json`` (RFC 7386 merge patch).
 
-    Under the ORT v4 schema per-variant runtime fields are no longer read from
-    variant.json; they flow through a JSON Merge Patch applied on top of the
-    package's base ``configs/genai_config.json``. We express the variant's
+    Per-variant runtime fields flow through a JSON Merge Patch applied on top of
+    the package's base ``configs/genai_config.json``. We express the variant's
     ``session_options`` and EP-scoped ``provider_options`` under the role that
     references this component (``model.<role>.session_options``).
     """
@@ -877,10 +870,10 @@ def _resolve_onnx_path(model_config: dict) -> Path:
 def _ep_device_compatibility(attrs: dict, onnx_path: Path) -> tuple[str, Optional[str], Optional[str]]:
     """Extract (ep, device, compatibility_string) for one variant from Olive metadata.
 
-    Under the ORT v4 schema each variant declares a single opaque
-    ``compatibility_string``. Olive stores the EP-side preference as a
-    comma-delimited string in the ONNX metadata prop ``ep_compatibility_info.<EP>``;
-    it is passed through verbatim (ORT does not interpret the encoding).
+    Each variant declares a single opaque ``compatibility_string``. Olive stores
+    the EP-side preference as a comma-delimited string in the ONNX metadata prop
+    ``ep_compatibility_info.<EP>``; it is passed through verbatim (ORT does not
+    interpret the encoding).
     """
     ep = attrs.get("ep") or "CPUExecutionProvider"
     device = attrs.get("device") or None
diff --git a/test/cli/test_model_package.py b/test/cli/test_model_package.py
index a893216b8..84391554a 100644
--- a/test/cli/test_model_package.py
+++ b/test/cli/test_model_package.py
@@ -208,7 +208,7 @@ def test_writes_proposal_layout(self, tmp_path):
         assert manifest["producer"]["model_name"] == "test_model"
         assert manifest["producer"]["model_version"] == "2.0"
 
-        # metadata uses inline EP (v4 schema)
+        # metadata uses inline EP
         metadata = json.loads((out / "model" / "metadata.json").read_text())
         assert metadata["schema_version"] == 1
         assert metadata["component_name"] == "model"
@@ -216,8 +216,8 @@ def test_writes_proposal_layout(self, tmp_path):
         for variant_payload in metadata["variants"].values():
             assert variant_payload == {"ep": "QNNExecutionProvider", "device": "NPU"}
 
-        # variant.json is never emitted under v4; the ONNX file still lands in
-        # the variant directory.
+        # No variant.json is emitted; the ONNX file lands in the variant
+        # directory.
         for v in ("soc_60", "soc_73"):
             assert not (out / "model" / v / "variant.json").exists()
             assert (out / "model" / v / "model.onnx").is_file()
@@ -266,7 +266,7 @@ def test_writes_proposal_shape_for_single_variant(self, tmp_path):
 
         assert (out / "manifest.json").is_file()
         assert (out / "decoder" / "metadata.json").is_file()
-        # variant.json is never emitted under v4.
+        # No variant.json is emitted.
         assert not (out / "decoder" / "cpu" / "variant.json").exists()
         assert (out / "decoder" / "cpu" / "model.onnx").is_file()
         assert not (out / "models").exists()
@@ -466,9 +466,8 @@ def test_keeps_identical_external_data_inline_in_each_variant(self, tmp_path):
             ],
         )
 
-        # The ORT v4 package format has no cross-variant weight sharing: even
-        # identical blobs stay inline in each variant directory, and neither
-        # shared_weights/ nor variant.json is emitted.
+        # Each variant keeps its own external-data blob inline; no shared_weights
+        # directory or variant.json is emitted.
         assert not (out / "decoder" / "shared_weights").exists()
         for v in ("v1", "v2"):
             assert (out / "decoder" / v / "model.onnx.data").is_file()
@@ -501,7 +500,7 @@ def test_keeps_distinct_external_data_inline_per_variant(self, tmp_path):
         assert (out / "decoder" / "v1" / "model.onnx.data").is_file()
         assert (out / "decoder" / "v2" / "model.onnx.data").is_file()
 
-        # variant.json is never emitted under the v4 schema.
+        # No variant.json is emitted.
         for v in ("v1", "v2"):
             assert not (out / "decoder" / v / "variant.json").exists()
 
@@ -523,7 +522,7 @@ def test_single_variant_keeps_blob_inline(self, tmp_path):
 
         assert (out / "decoder" / "cpu" / "model.onnx.data").is_file()
         assert not (out / "decoder" / "shared_weights").exists()
-        # variant.json is never emitted under the v4 schema.
+        # No variant.json is emitted.
         assert not (out / "decoder" / "cpu" / "variant.json").exists()
 
 

From d511c1a73edbc53c6ff6b7a75b3f90a0fc299db1 Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Wed, 3 Jun 2026 23:04:32 +0000
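Subject: [PATCH 03/11] Fix model-package layout and genai_config bugs

- Write components under ``models/<component>/`` so they sit where the
  ORT model-package loader looks for ``metadata.json``.
- Use the provider aliases ORT-GenAI accepts (``CPU``, ``DML``, ``QNN``,
  ...) in ``_EP_TO_GENAI`` and add mappings for more EPs.
- Compare source model types case-insensitively.
- Strip ``filename``/``session_options`` from the base genai_config and
  inject ``model.<role>.component`` markers; every variant overlay now
  always restores ``filename`` and declares its EP under
  ``session_options.provider_options``.
- When ``model_attributes.ep`` is absent, guess the EP from the variant
  name before falling back to CPU.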
---
 olive/cli/model_package.py | 196 ++++++++++++++++++++++++++++++-------
 1 file changed, 160 insertions(+), 36 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index a82b81942..804e82ecb 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -17,12 +17,13 @@
     ├── manifest.json
     ├── configs/
     │   └── <consumer-shared assets>           # tokenizer, genai_config, ...
-    └── <component>/
-        ├── metadata.json
-        └── <variant>/
-            ├── genai_config_overlay.json      # optional: per-variant runtime fields
-            ├── model.onnx
-            └── ...                            # external-data blobs (inline)
+    └── models/
+        └── <component>/
+            ├── metadata.json
+            └── <variant>/
+                ├── genai_config_overlay.json      # optional: per-variant runtime fields
+                ├── model.onnx
+                └── ...                            # external-data blobs (inline)
 
 Notes:
 - ``metadata.json`` is selection-only. Each variant declares a single
@@ -69,20 +70,33 @@
 # (genai_config base, tokenizer, processor configs, chat templates).
 _CONFIGS_DIR = "configs"
 
+# Directory under the package root that holds per-component subdirectories.
+# Required by the ORT model-package schema; ORT's model-package loader
+# discovers components via ``<package>/models/<component>/metadata.json``.
+_MODELS_DIR = "models"
+
 # Map canonical ONNX Runtime EP names to the short provider aliases used inside
-# genai_config.json's ``session_options.provider_options`` list. Mirrors the
-# aliases ORT-GenAI accepts (see ORT-GenAI src/config.cpp provider dispatch).
+# genai_config.json's ``session_options.provider_options`` list. Matches the
+# accepted aliases reported by ORT-GenAI when it parses an unknown provider name
+# ("Currently supported values are 'DML'/'DmlExecutionProvider', ...").
 _EP_TO_GENAI: dict[str, str] = {
-    "CPUExecutionProvider": "cpu",
+    "CPUExecutionProvider": "CPU",
     "CUDAExecutionProvider": "cuda",
-    "DmlExecutionProvider": "dml",
-    "WebGpuExecutionProvider": "webgpu",
-    "JsExecutionProvider": "web",
-    "QNNExecutionProvider": "qnn",
-    "OpenVINOExecutionProvider": "openvino",
+    "DmlExecutionProvider": "DML",
+    "WebGpuExecutionProvider": "WebGPU",
+    "JsExecutionProvider": "JS",
+    "QNNExecutionProvider": "QNN",
+    "OpenVINOExecutionProvider": "OpenVINO",
     "ROCMExecutionProvider": "rocm",
     "TensorrtExecutionProvider": "tensorrt",
     "NvTensorRTRTXExecutionProvider": "NvTensorRtRtx",
+    "XnnpackExecutionProvider": "XNNPACK",
+    "WebNNExecutionProvider": "WEBNN",
+    "AzureExecutionProvider": "AZURE",
+    "VitisAIExecutionProvider": "VitisAI",
+    "CoreMLExecutionProvider": "CoreML",
+    "MIGraphXExecutionProvider": "MIGraphX",
+    "SNPEExecutionProvider": "SNPE",
 }
 
 # Hash chunk size for SHA-256 over external-data blobs.
@@ -156,9 +170,10 @@ def run(self):
             model_config = self._read_model_config(source_path)
             targets.append((target_name, source_path, model_config))
 
-        types = {targets[i][2].get("type") for i in range(len(targets))}
-        if types - {"ONNXModel", "CompositeModel"}:
-            unsupported = sorted(types - {"ONNXModel", "CompositeModel"})
+        types = {(targets[i][2].get("type") or "").lower() for i in range(len(targets))}
+        supported = {"onnxmodel", "compositemodel"}
+        if types - supported:
+            unsupported = sorted(types - supported)
             raise ValueError(
                 f"Unsupported source model type(s) {unsupported!r}. "
                 "generate-model-package supports ONNXModel and CompositeModel only."
@@ -168,7 +183,7 @@ def run(self):
                 f"Sources mix model types {sorted(types)!r}. All sources must share the same type "
                 "(all ONNXModel or all CompositeModel)."
             )
-        is_composite = next(iter(types)) == "CompositeModel"
+        is_composite = next(iter(types)) == "compositemodel"
 
         if is_composite:
             variants = self._build_composite_variants(targets)
@@ -213,7 +228,7 @@ def _build_single_variants(self, targets: list[tuple[str, Path, dict]]) -> list[
         for target_name, _src, model_config in targets:
             attrs = _get_model_attributes(model_config)
             onnx_path = _resolve_onnx_path(model_config)
-            ep, device, compatibility_string = _ep_device_compatibility(attrs, onnx_path)
+            ep, device, compatibility_string = _ep_device_compatibility(attrs, onnx_path, target_name)
             variants.append(
                 VariantSpec(
                     component_name=component_name,
@@ -250,7 +265,7 @@ def _build_composite_variants(self, targets: list[tuple[str, Path, dict]]) -> li
                 comp_attrs.update(_get_model_attributes(comp_config))
 
                 onnx_path = _resolve_onnx_path(comp_config)
-                ep, device, compatibility_string = _ep_device_compatibility(comp_attrs, onnx_path)
+                ep, device, compatibility_string = _ep_device_compatibility(comp_attrs, onnx_path, target_name)
 
                 spec = VariantSpec(
                     component_name=comp_name,
@@ -443,8 +458,18 @@ def write_model_package(
     for comp_name, comp_variants in components.items():
         _write_component(output_dir, comp_name, comp_variants, component_to_role.get(comp_name, comp_name))
 
+    # Build the role -> component map needed by _copy_config_files so it can
+    # inject ``model.<role>.component`` markers into the base genai_config. ORT
+    # requires every role-block to declare which package component it loads;
+    # without those markers ORT-GenAI's variant auto-selection fails with
+    # "the genai config does not reference any package components".
+    role_to_component: dict[str, str] = {}
+    for comp_name in components:
+        role = component_to_role.get(comp_name, comp_name)
+        role_to_component.setdefault(role, comp_name)
+
     if config_files:
-        _copy_config_files(output_dir, config_files)
+        _copy_config_files(output_dir, config_files, role_to_component)
 
     _write_manifest(
         output_dir, list(components.keys()), producer_info, package_name or output_dir.name, package_version
@@ -457,7 +482,7 @@ def _write_component(
     comp_variants: list[VariantSpec],
     component_role: str,
 ) -> None:
-    component_dir = output_dir / component_name
+    component_dir = output_dir / _MODELS_DIR / component_name
     component_dir.mkdir(parents=True, exist_ok=True)
 
     # Copy each variant's ONNX file(s) along with any external-data blobs they
@@ -543,27 +568,46 @@ def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: Varia
 
     Per-variant runtime fields flow through a JSON Merge Patch applied on top of
     the package's base ``configs/genai_config.json``. We express the variant's
-    ``session_options`` and EP-scoped ``provider_options`` under the role that
-    references this component (``model.<role>.session_options``).
+    ``filename`` (the variant-local ONNX file basename), ``session_options`` and
+    EP-scoped ``provider_options`` under the role that references this component
+    (``model.<role>``). The base config has those keys stripped (see
+    ``_strip_variant_specific``); each variant overlay puts them back so ORT
+    resolves files inside the chosen variant directory.
     """
     inference = v.inference_settings or {}
     session_options: dict[str, Any] = dict(inference.get("session_options") or {})
     provider_options = _provider_options_for_ep(inference, v.ep)
 
-    # For non-CPU variants always declare the provider so the merged config
-    # appends the right EP at session-construction time (the base config copied
-    # into configs/ is from a single source and may target a different EP).
-    # CPU is ORT-GenAI's default, so only declare it when there are real options.
-    if v.ep != "CPUExecutionProvider" or provider_options:
-        session_options["provider_options"] = [{_genai_provider_name(v.ep): provider_options}]
+    # Always declare the variant's EP under session_options.provider_options so
+    # the merged genai_config tells ORT-GenAI which EP to register for this
+    # variant. Without an explicit entry ORT-GenAI fails session construction
+    # with "No execution providers were provided or selected" even when the
+    # variant's metadata.json ep matches the user's request.
+    session_options["provider_options"] = [{_genai_provider_name(v.ep): provider_options}]
 
-    if not session_options:
-        return
+    role_patch: dict[str, Any] = {}
+    if v.onnx_files:
+        role_patch["filename"] = Path(v.onnx_files[0]).name
+    role_patch["session_options"] = session_options
 
-    overlay = {"model": {component_role: {"session_options": session_options}}}
+    overlay = {"model": {component_role: role_patch}}
     _write_json(variant_dir / "genai_config_overlay.json", overlay)
 
 
+def _strip_variant_specific(node: Any, keys: tuple[str, ...] = ("filename", "session_options")) -> Any:
+    """Recursively drop variant-specific keys from a genai_config-shaped dict.
+
+    ``filename`` and ``session_options`` are intrinsically variant-specific and
+    must not live in the package's base ``configs/genai_config.json``; per-variant
+    ``genai_config_overlay.json`` files patch them back in. Returns a deep copy.
+    """
+    if isinstance(node, dict):
+        return {k: _strip_variant_specific(v, keys) for k, v in node.items() if k not in keys}
+    if isinstance(node, list):
+        return [_strip_variant_specific(v, keys) for v in node]
+    return node
+
+
 def _resolve_component_roles(config_files: Optional[dict[str, Path]]) -> dict[str, str]:
     """Map each package component to the genai_config role that references it.
 
@@ -624,7 +668,11 @@ def _write_manifest(
 # ---------------------------------------------------------------------------
 
 
-def _copy_config_files(output_dir: Path, config_files: dict[str, Path]) -> None:
+def _copy_config_files(
+    output_dir: Path,
+    config_files: dict[str, Path],
+    role_to_component: Optional[dict[str, str]] = None,
+) -> None:
     configs_dir = output_dir / _CONFIGS_DIR
     configs_dir.mkdir(parents=True, exist_ok=True)
     configs_root = configs_dir.resolve()
@@ -649,6 +697,26 @@ def _copy_config_files(output_dir: Path, config_files: dict[str, Path]) -> None:
                     src_path,
                 )
             continue
+        if name == "genai_config.json" and src_path.is_file():
+            # Strip variant-specific keys from the base genai_config and inject
+            # ``model.<role>.component`` markers so ORT-GenAI can resolve each
+            # role to a package component (and apply the right per-variant
+            # overlay). Each variant's genai_config_overlay.json patches the
+            # stripped keys back in.
+            try:
+                with src_path.open(encoding="utf-8") as fh:
+                    base_genai = json.load(fh)
+                stripped = _strip_variant_specific(base_genai)
+                if role_to_component:
+                    _inject_role_components(stripped, role_to_component)
+                _write_json(dest, stripped)
+                continue
+            except Exception:
+                logger.debug(
+                    "Failed to strip variant-specific keys from %s; falling back to verbatim copy.",
+                    src_path,
+                    exc_info=True,
+                )
         if src_path.is_dir():
             shutil.copytree(str(src_path), str(dest))
         elif src_path.is_file():
@@ -657,6 +725,23 @@ def _copy_config_files(output_dir: Path, config_files: dict[str, Path]) -> None:
             logger.warning("Config source %s does not exist; skipping.", src_path)
 
 
+def _inject_role_components(genai: dict, role_to_component: dict[str, str]) -> None:
+    """Inject ``model.<role>.component = <component>`` markers in-place.
+
+    ORT-GenAI's model-package variant selection requires every role block in
+    the base ``configs/genai_config.json`` to declare which package component
+    serves it. Olive-generated source ``genai_config.json`` typically lacks
+    these markers because the source is a flat-directory build, not a package.
+    """
+    model_block = genai.get("model")
+    if not isinstance(model_block, dict):
+        return
+    for role, component in role_to_component.items():
+        role_block = model_block.get(role)
+        if isinstance(role_block, dict):
+            role_block["component"] = component
+
+
 def _paths_equal(a: Path, b: Path) -> bool:
     """Return True if a and b have identical content (file or directory)."""
     if a.is_file() and b.is_file():
@@ -867,21 +952,60 @@ def _resolve_onnx_path(model_config: dict) -> Path:
     raise FileNotFoundError(f"model_path does not exist: {p}")
 
 
-def _ep_device_compatibility(attrs: dict, onnx_path: Path) -> tuple[str, Optional[str], Optional[str]]:
+def _ep_device_compatibility(
+    attrs: dict, onnx_path: Path, variant_name: Optional[str] = None
+) -> tuple[str, Optional[str], Optional[str]]:
     """Extract (ep, device, compatibility_string) for one variant from Olive metadata.
 
     Each variant declares a single opaque ``compatibility_string``. Olive stores
     the EP-side preference as a comma-delimited string in the ONNX metadata prop
     ``ep_compatibility_info.<EP>``; it is passed through verbatim (ORT does not
     interpret the encoding).
+
+    When ``model_attributes.ep`` is absent, fall back to a common-variant-name
+    heuristic (``gpu``/``cuda`` → CUDA, ``qnn`` → QNN, etc.) so users who don't
+    manually annotate their Olive outputs still get distinct EP entries in each
+    component's metadata.json. Final fallback is CPU.
     """
-    ep = attrs.get("ep") or "CPUExecutionProvider"
+    ep = attrs.get("ep") or _guess_ep_from_variant_name(variant_name) or "CPUExecutionProvider"
     device = attrs.get("device") or None
     raw = _extract_ep_compatibility_from_onnx(onnx_path, ep)
     compatibility_string = raw.strip() if raw and raw.strip() else None
     return ep, device, compatibility_string
 
 
+# Best-effort mapping from common Olive output / EP-build directory names to
+# canonical ORT EP strings. Used only as a fallback when model_attributes.ep is
+# not set. Keep substrings short and lowercased; matched via ``in``.
+_VARIANT_NAME_EP_HINTS: tuple[tuple[str, str], ...] = (
+    ("cuda", "CUDAExecutionProvider"),
+    ("gpu", "CUDAExecutionProvider"),
+    ("trt", "TensorrtExecutionProvider"),
+    ("tensorrt", "TensorrtExecutionProvider"),
+    ("rocm", "ROCMExecutionProvider"),
+    ("dml", "DmlExecutionProvider"),
+    ("directml", "DmlExecutionProvider"),
+    ("qnn", "QNNExecutionProvider"),
+    ("npu", "QNNExecutionProvider"),
+    ("openvino", "OpenVINOExecutionProvider"),
+    ("ovep", "OpenVINOExecutionProvider"),
+    ("webgpu", "WebGpuExecutionProvider"),
+    ("xnnpack", "XnnpackExecutionProvider"),
+    ("coreml", "CoreMLExecutionProvider"),
+    ("cpu", "CPUExecutionProvider"),
+)
+
+
+def _guess_ep_from_variant_name(variant_name: Optional[str]) -> Optional[str]:
+    if not variant_name:
+        return None
+    name = variant_name.lower()
+    for hint, ep in _VARIANT_NAME_EP_HINTS:
+        if hint in name:
+            return ep
+    return None
+
+
 def _extract_ep_compatibility_from_onnx(model_path: Path, ep: str = "") -> Optional[str]:
     """Read ``ep_compatibility_info.<EP>`` from the ONNX model's metadata_props."""
     if not model_path.is_file():

From 4318b8aa8c42c1d2dc91616a84c677ad68842c80 Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Wed, 3 Jun 2026 23:11:33 +0000
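Subject: [PATCH 04/11] Append .ortpackage suffix to model-package output dir

Append the conventional ``.ortpackage`` suffix to ``--output_path`` when
it is missing, and derive the default model/package name from the
directory name without that suffix.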
---
 olive/cli/model_package.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index 804e82ecb..6118d2c30 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -75,6 +75,12 @@
 # discovers components via ``<package>/models/<component>/metadata.json``.
 _MODELS_DIR = "models"
 
+# Conventional directory suffix for an ORT model package. Not enforced by
+# ORT/ORT-GenAI loaders (they probe structure, not filenames), but matches
+# the canonical naming used in ORT's model-package documentation and the
+# reference ``build_packages.py`` examples.
+_PACKAGE_SUFFIX = ".ortpackage"
+
 # Map canonical ONNX Runtime EP names to the short provider aliases used inside
 # genai_config.json's ``session_options.provider_options`` list. Matches the
 # accepted aliases reported by ORT-GenAI when it parses an unknown provider name
@@ -139,7 +145,11 @@ def register_subcommand(parser: ArgumentParser):
             "--output_path",
             type=str,
             required=True,
-            help="Output directory for the model package. Must be empty or non-existent.",
+            help=(
+                "Output directory for the model package. The ``.ortpackage`` "
+                "suffix is appended automatically if missing. Must be empty "
+                "or non-existent."
+            ),
         )
 
         sub_parser.add_argument(
@@ -164,6 +174,9 @@ def register_subcommand(parser: ArgumentParser):
     def run(self):
         sources = self._parse_sources()
         output_dir = Path(self.args.output_path)
+        if output_dir.suffix != _PACKAGE_SUFFIX:
+            output_dir = output_dir.with_name(output_dir.name + _PACKAGE_SUFFIX)
+        package_default_name = output_dir.stem
 
         targets = []
         for target_name, source_path in sources:
@@ -200,7 +213,7 @@ def run(self):
             producer_info["tool_version"] = _olive_version
         except Exception:
             logger.debug("Could not read olive.__version__", exc_info=True)
-        producer_info["model_name"] = self.args.model_name or output_dir.name
+        producer_info["model_name"] = self.args.model_name or package_default_name
         producer_info["model_version"] = self.args.model_version
         if task:
             producer_info["task"] = task
@@ -210,7 +223,7 @@ def run(self):
             variants=variants,
             config_files=config_files,
             producer_info=producer_info,
-            package_name=self.args.model_name or output_dir.name,
+            package_name=self.args.model_name or package_default_name,
             package_version=self.args.model_version,
         )
 

From 5c2bd169a88955ab3e730774ae39bbb7937ccb8f Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Thu, 4 Jun 2026 00:25:05 +0000
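Subject: [PATCH 05/11] Fix composite component-name key and role lookup

Read composite component names from ``model_component_names`` (was
``component_names``) and raise ValueError when the count differs from
``model_components``. Let _resolve_component_roles fall back to the first
path segment of ``model.<role>.filename`` when a role has no ``component``
pointer. Update the module docstring to describe how genai_config.json is
canonicalized.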

---
 olive/cli/model_package.py | 47 +++++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index 6118d2c30..fb1090971 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -31,9 +31,11 @@
   ``compatibility_string``.
 - Each variant directory is self-contained: the ONNX file and any external-data
   blobs it references are copied inline so stock ORT can load it directly.
-- ``genai_config.json`` is copied verbatim into ``<output>/configs/``.
-  Per-variant runtime fields (``session_options``, ``provider_options``) are
-  expressed as a per-variant ``genai_config_overlay.json`` (an RFC 7386 JSON
+- ``genai_config.json`` is canonicalized into ``<output>/configs/``: variant-
+  specific runtime fields (``filename``, ``session_options``) are stripped from
+  the base and each role gets a ``component`` pointer so ORT GenAI can map
+  roles to ``models/<component>/`` at load time. The stripped fields are
+  re-injected per variant as a ``genai_config_overlay.json`` (an RFC 7386 JSON
   Merge Patch applied on top of ``configs/genai_config.json``).
 
 """
@@ -265,10 +267,15 @@ def _build_composite_variants(self, targets: list[tuple[str, Path, dict]]) -> li
             target_attrs = _get_model_attributes(model_config)
             target_inference = model_config.get("config", {}).get("inference_settings") or {}
             components = model_config["config"].get("model_components", [])
-            component_names = model_config["config"].get("component_names", [])
+            component_names = model_config["config"].get("model_component_names", [])
 
             if not components:
                 raise ValueError(f"Composite source {target_name!r} declares no model_components.")
+            if len(components) != len(component_names):
+                raise ValueError(
+                    f"Composite source {target_name!r} has {len(components)} model_components but "
+                    f"{len(component_names)} model_component_names; counts must match."
+                )
 
             for comp_config, comp_name in zip(components, component_names):
                 # Component-level inference_settings overrides target-level if present.
@@ -624,11 +631,18 @@ def _strip_variant_specific(node: Any, keys: tuple[str, ...] = ("filename", "ses
 def _resolve_component_roles(config_files: Optional[dict[str, Path]]) -> dict[str, str]:
     """Map each package component to the genai_config role that references it.
 
-    The base ``genai_config.json`` declares roles under ``model.<role>`` and
-    each role names the package component it loads via a ``component`` field.
-    Per-variant overlays must patch ``model.<role>``, which can differ from the
-    component name, so we invert that mapping here. Returns an empty map when no
-    base config is available (callers fall back to the component name).
+    The base ``genai_config.json`` declares roles under ``model.<role>``. The
+    role name and component directory name are not always the same (e.g.
+    Mobius emits role ``vision`` for a component dir named ``vision_encoder``),
+    so per-variant overlays need a role lookup. We try two signals in order:
+
+    1. ``model.<role>.component`` (explicit pointer, ORT spec).
+    2. The first path segment of ``model.<role>.filename`` — Mobius and other
+       flat-dir producers write paths like ``vision_encoder/model.onnx`` so the
+       directory naturally names the component.
+
+    Returns an empty map when no base config is available; callers fall back
+    to the component name as the role.
     """
     if not config_files:
         return {}
@@ -648,10 +662,17 @@ def _resolve_component_roles(config_files: Optional[dict[str, Path]]) -> dict[st
 
     component_to_role: dict[str, str] = {}
     for role, role_block in model_block.items():
-        if isinstance(role_block, dict):
-            component = role_block.get("component")
-            if isinstance(component, str) and component and component not in component_to_role:
-                component_to_role[component] = role
+        if not isinstance(role_block, dict):
+            continue
+        component = role_block.get("component")
+        if not (isinstance(component, str) and component):
+            filename = role_block.get("filename")
+            if isinstance(filename, str) and filename:
+                parts = Path(filename).parts
+                if len(parts) >= 2:
+                    component = parts[0]
+        if isinstance(component, str) and component and component not in component_to_role:
+            component_to_role[component] = role
     return component_to_role
 
 

From a000797cad64df24e3bcff2c99e48e4597e27e7c Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Thu, 4 Jun 2026 00:44:08 +0000
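Subject: [PATCH 06/11] Lowercase QNN alias; update tests for new layout

Map QNNExecutionProvider to the ``qnn`` alias and document why the overlay
emits the ONNX basename as ``filename``. Update tests to expect components
under ``models/``, the ``.ortpackage`` output suffix, the
``model_component_names`` key, and an overlay for plain CPU variants.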

---
 olive/cli/model_package.py     |  8 +--
 test/cli/test_model_package.py | 92 ++++++++++++++++++++--------------
 2 files changed, 60 insertions(+), 40 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index fb1090971..9baa76fed 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -93,7 +93,7 @@
     "DmlExecutionProvider": "DML",
     "WebGpuExecutionProvider": "WebGPU",
     "JsExecutionProvider": "JS",
-    "QNNExecutionProvider": "QNN",
+    "QNNExecutionProvider": "qnn",
     "OpenVINOExecutionProvider": "OpenVINO",
     "ROCMExecutionProvider": "rocm",
     "TensorrtExecutionProvider": "tensorrt",
@@ -605,10 +605,12 @@ def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: Varia
     # variant's metadata.json ep matches the user's request.
     session_options["provider_options"] = [{_genai_provider_name(v.ep): provider_options}]
 
-    role_patch: dict[str, Any] = {}
+    role_patch: dict[str, Any] = {"session_options": session_options}
     if v.onnx_files:
+        # The base config strips ``filename`` (it was a variant-specific path
+        # like ``decoder/model.onnx``); the loader resolves the variant ONNX as
+        # ``<variant_dir>/<filename>``, so emit the basename here.
         role_patch["filename"] = Path(v.onnx_files[0]).name
-    role_patch["session_options"] = session_options
 
     overlay = {"model": {component_role: role_patch}}
     _write_json(variant_dir / "genai_config_overlay.json", overlay)
diff --git a/test/cli/test_model_package.py b/test/cli/test_model_package.py
index 84391554a..571a88a53 100644
--- a/test/cli/test_model_package.py
+++ b/test/cli/test_model_package.py
@@ -178,7 +178,7 @@ def test_writes_proposal_layout(self, tmp_path):
         # setup
         src1 = _create_source_dir(tmp_path, "soc_60", {"ep": "QNNExecutionProvider", "device": "NPU"})
         src2 = _create_source_dir(tmp_path, "soc_73", {"ep": "QNNExecutionProvider", "device": "NPU"})
-        out = tmp_path / "out"
+        out = tmp_path / "out.ortpackage"
         cmd = _make_command(
             [
                 "generate-model-package",
@@ -198,9 +198,9 @@ def test_writes_proposal_layout(self, tmp_path):
         # execute
         cmd.run()
 
-        # assert: top-level layout (no models/ wrapper)
+        # assert: top-level manifest + components under models/
         assert (out / "manifest.json").is_file()
-        assert not (out / "models").exists()
+        assert (out / "models").is_dir()
 
         manifest = json.loads((out / "manifest.json").read_text())
         assert manifest["schema_version"] == 1
@@ -209,7 +209,7 @@ def test_writes_proposal_layout(self, tmp_path):
         assert manifest["producer"]["model_version"] == "2.0"
 
         # metadata uses inline EP
-        metadata = json.loads((out / "model" / "metadata.json").read_text())
+        metadata = json.loads((out / "models" / "model" / "metadata.json").read_text())
         assert metadata["schema_version"] == 1
         assert metadata["component_name"] == "model"
         assert set(metadata["variants"]) == {"soc_60", "soc_73"}
@@ -219,25 +219,25 @@ def test_writes_proposal_layout(self, tmp_path):
         # No variant.json is emitted; the ONNX file lands in the variant
         # directory.
         for v in ("soc_60", "soc_73"):
-            assert not (out / "model" / v / "variant.json").exists()
-            assert (out / "model" / v / "model.onnx").is_file()
+            assert not (out / "models" / "model" / v / "variant.json").exists()
+            assert (out / "models" / "model" / v / "model.onnx").is_file()
 
 
 class TestGeneratePackageSingleSource:
     def test_single_source_is_valid_package(self, tmp_path):
         src = _create_source_dir(tmp_path, "cpu_x64", {"ep": "CPUExecutionProvider"})
-        out = tmp_path / "out"
+        out = tmp_path / "out.ortpackage"
         cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
 
         cmd.run()
 
         manifest = json.loads((out / "manifest.json").read_text())
         assert manifest["components"] == ["model"]
-        metadata = json.loads((out / "model" / "metadata.json").read_text())
+        metadata = json.loads((out / "models" / "model" / "metadata.json").read_text())
         assert "cpu_x64" in metadata["variants"]
         assert metadata["variants"]["cpu_x64"] == {"ep": "CPUExecutionProvider"}
         # No shared_weights because nothing to dedup.
-        assert not (out / "model" / "shared_weights").exists()
+        assert not (out / "models" / "model" / "shared_weights").exists()
 
 
 # ---------------------------------------------------------------------------
@@ -265,11 +265,10 @@ def test_writes_proposal_shape_for_single_variant(self, tmp_path):
         )
 
         assert (out / "manifest.json").is_file()
-        assert (out / "decoder" / "metadata.json").is_file()
+        assert (out / "models" / "decoder" / "metadata.json").is_file()
         # No variant.json is emitted.
-        assert not (out / "decoder" / "cpu" / "variant.json").exists()
-        assert (out / "decoder" / "cpu" / "model.onnx").is_file()
-        assert not (out / "models").exists()
+        assert not (out / "models" / "decoder" / "cpu" / "variant.json").exists()
+        assert (out / "models" / "decoder" / "cpu" / "model.onnx").is_file()
 
     def test_manifest_uses_proposal_schema(self, tmp_path):
         onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
@@ -322,7 +321,7 @@ def test_metadata_uses_inline_ep(self, tmp_path):
             ],
         )
 
-        metadata = json.loads((out / "decoder" / "metadata.json").read_text())
+        metadata = json.loads((out / "models" / "decoder" / "metadata.json").read_text())
         assert metadata["schema_version"] == 1
         assert metadata["component_name"] == "decoder"
         assert metadata["variants"]["qnn-npu"] == {
@@ -348,7 +347,7 @@ def test_metadata_omits_optional_fields_when_unset(self, tmp_path):
             ],
         )
 
-        metadata = json.loads((out / "decoder" / "metadata.json").read_text())
+        metadata = json.loads((out / "models" / "decoder" / "metadata.json").read_text())
         assert metadata["variants"]["cpu"] == {"ep": "CPUExecutionProvider"}
 
     def test_overlay_carries_session_and_provider_options(self, tmp_path):
@@ -375,15 +374,16 @@ def test_overlay_carries_session_and_provider_options(self, tmp_path):
         )
 
         # Runtime fields go to genai_config_overlay.json, not variant.json.
-        assert not (out / "decoder" / "cuda" / "variant.json").exists()
-        overlay = json.loads((out / "decoder" / "cuda" / "genai_config_overlay.json").read_text())
+        assert not (out / "models" / "decoder" / "cuda" / "variant.json").exists()
+        overlay = json.loads((out / "models" / "decoder" / "cuda" / "genai_config_overlay.json").read_text())
         assert overlay == {
             "model": {
                 "decoder": {
+                    "filename": "model.onnx",
                     "session_options": {
                         "graph_optimization_level": 3,
                         "provider_options": [{"cuda": {"device_id": "0"}}],
-                    }
+                    },
                 }
             }
         }
@@ -412,12 +412,18 @@ def test_overlay_provider_options_match_ep_by_name(self, tmp_path):
             ],
         )
 
-        overlay = json.loads((out / "decoder" / "qnn" / "genai_config_overlay.json").read_text())
+        overlay = json.loads((out / "models" / "decoder" / "qnn" / "genai_config_overlay.json").read_text())
         assert overlay["model"]["decoder"]["session_options"]["provider_options"] == [
             {"qnn": {"backend_path": "QnnHtp.so"}}
         ]
 
-    def test_overlay_omitted_for_cpu_variant_without_options(self, tmp_path):
+    def test_overlay_always_emits_provider_options_for_cpu(self, tmp_path):
+        """Plain CPU variants still get an overlay so ORT-GenAI sees an explicit
+        provider entry. The base config strips ``session_options`` (variant-
+        specific), so without this overlay session construction would fail with
+        "No execution providers were provided or selected" even though the
+        variant's metadata.json names CPUExecutionProvider.
+        """
         onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
         out = tmp_path / "package"
 
@@ -433,7 +439,15 @@ def test_overlay_omitted_for_cpu_variant_without_options(self, tmp_path):
             ],
         )
 
-        assert not (out / "decoder" / "cpu" / "genai_config_overlay.json").exists()
+        overlay = json.loads((out / "models" / "decoder" / "cpu" / "genai_config_overlay.json").read_text())
+        assert overlay == {
+            "model": {
+                "decoder": {
+                    "filename": "model.onnx",
+                    "session_options": {"provider_options": [{"CPU": {}}]},
+                }
+            }
+        }
 
 
 # ---------------------------------------------------------------------------
@@ -468,10 +482,10 @@ def test_keeps_identical_external_data_inline_in_each_variant(self, tmp_path):
 
         # Each variant keeps its own external-data blob inline; no shared_weights
         # directory or variant.json is emitted.
-        assert not (out / "decoder" / "shared_weights").exists()
+        assert not (out / "models" / "decoder" / "shared_weights").exists()
         for v in ("v1", "v2"):
-            assert (out / "decoder" / v / "model.onnx.data").is_file()
-            assert not (out / "decoder" / v / "variant.json").exists()
+            assert (out / "models" / "decoder" / v / "model.onnx.data").is_file()
+            assert not (out / "models" / "decoder" / v / "variant.json").exists()
 
     def test_keeps_distinct_external_data_inline_per_variant(self, tmp_path):
         a = _make_onnx_with_external(tmp_path / "a" / "model.onnx", "model.onnx.data", b"a-bytes" * 32)
@@ -496,13 +510,13 @@ def test_keeps_distinct_external_data_inline_per_variant(self, tmp_path):
             ],
         )
 
-        assert not (out / "decoder" / "shared_weights").exists()
-        assert (out / "decoder" / "v1" / "model.onnx.data").is_file()
-        assert (out / "decoder" / "v2" / "model.onnx.data").is_file()
+        assert not (out / "models" / "decoder" / "shared_weights").exists()
+        assert (out / "models" / "decoder" / "v1" / "model.onnx.data").is_file()
+        assert (out / "models" / "decoder" / "v2" / "model.onnx.data").is_file()
 
         # No variant.json is emitted.
         for v in ("v1", "v2"):
-            assert not (out / "decoder" / v / "variant.json").exists()
+            assert not (out / "models" / "decoder" / v / "variant.json").exists()
 
     def test_single_variant_keeps_blob_inline(self, tmp_path):
         onnx_path = _make_onnx_with_external(tmp_path / "src" / "model.onnx", "model.onnx.data", b"x" * 128)
@@ -520,10 +534,10 @@ def test_single_variant_keeps_blob_inline(self, tmp_path):
             ],
         )
 
-        assert (out / "decoder" / "cpu" / "model.onnx.data").is_file()
-        assert not (out / "decoder" / "shared_weights").exists()
+        assert (out / "models" / "decoder" / "cpu" / "model.onnx.data").is_file()
+        assert not (out / "models" / "decoder" / "shared_weights").exists()
         # No variant.json is emitted.
-        assert not (out / "decoder" / "cpu" / "variant.json").exists()
+        assert not (out / "models" / "decoder" / "cpu" / "variant.json").exists()
 
 
 # ---------------------------------------------------------------------------
@@ -744,14 +758,14 @@ def test_passes_through_comma_delimited_metadata(self, tmp_path):
             {"ep": "QNNExecutionProvider", "device": "NPU"},
             onnx_metadata={"ep_compatibility_info.QNNExecutionProvider": "soc_60,soc_69,soc_73"},
         )
-        out = tmp_path / "out"
+        out = tmp_path / "out.ortpackage"
         cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
 
         # execute
         cmd.run()
 
         # assert: compatibility_string passes the raw opaque string through verbatim
-        metadata = json.loads((out / "model" / "metadata.json").read_text())
+        metadata = json.loads((out / "models" / "model" / "metadata.json").read_text())
         variant = metadata["variants"]["soc_60"]
         assert variant["ep"] == "QNNExecutionProvider"
         assert variant["compatibility_string"] == "soc_60,soc_69,soc_73"
@@ -774,7 +788,7 @@ def _create_composite_source(
     """Create an Olive-style composite source dir."""
     source_dir = tmp_path / name
     source_dir.mkdir(parents=True)
-    cfg = {"model_components": components, "component_names": component_names}
+    cfg = {"model_components": components, "model_component_names": component_names}
     if target_inference is not None:
         cfg["inference_settings"] = target_inference
     if target_attrs is not None:
@@ -814,17 +828,21 @@ def test_per_component_inference_settings_wins(self, tmp_path):
             target_inference=target_inference,
             target_attrs={"ep": "CPUExecutionProvider"},
         )
-        out = tmp_path / "out"
+        out = tmp_path / "out.ortpackage"
         cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
 
         # execute
         cmd.run()
 
         # assert: encoder uses target-level, decoder uses component-level
-        encoder_overlay = json.loads((out / "encoder" / "soc_60" / "genai_config_overlay.json").read_text())
+        encoder_overlay = json.loads(
+            (out / "models" / "encoder" / "soc_60" / "genai_config_overlay.json").read_text()
+        )
         assert encoder_overlay["model"]["encoder"]["session_options"]["graph_optimization_level"] == 1
 
-        decoder_overlay = json.loads((out / "decoder" / "soc_60" / "genai_config_overlay.json").read_text())
+        decoder_overlay = json.loads(
+            (out / "models" / "decoder" / "soc_60" / "genai_config_overlay.json").read_text()
+        )
         assert decoder_overlay["model"]["decoder"]["session_options"]["graph_optimization_level"] == 99
 
 

From f7b73e350f4009afe06619e8034cedfb519cd7bf Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Thu, 4 Jun 2026 20:05:41 +0000
Subject: [PATCH 07/11] Emit empty provider_options for CPU variant overlays

ORT-GenAI's SetProviderSessionOptions dispatch table has no CPU handler
(src/models/session_options.cpp:150-159); the prior sentinel entry
[{"CPU": {}}] only triggered a V1 no-op registration. ORT
InferenceSession implicitly registers the CPU EP when no other provider
is selected (onnxruntime/core/session/inference_session.cc), so emitting
an empty provider_options list for CPU is sufficient and matches the
convention used by reference ORT model packages.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 olive/cli/model_package.py     | 25 ++++++++++++++++++-------
 test/cli/test_model_package.py | 25 ++++++++++++-------------
 2 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index 9baa76fed..e0d7fba84 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -597,13 +597,24 @@ def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: Varia
     inference = v.inference_settings or {}
     session_options: dict[str, Any] = dict(inference.get("session_options") or {})
     provider_options = _provider_options_for_ep(inference, v.ep)
-
-    # Always declare the variant's EP under session_options.provider_options so
-    # the merged genai_config tells ORT-GenAI which EP to register for this
-    # variant. Without an explicit entry ORT-GenAI fails session construction
-    # with "No execution providers were provided or selected" even when the
-    # variant's metadata.json ep matches the user's request.
-    session_options["provider_options"] = [{_genai_provider_name(v.ep): provider_options}]
+    genai_ep = _genai_provider_name(v.ep)
+
+    # ORT-GenAI's FinalizeConfig builds session_options.providers from
+    # provider_options[*].name (src/config.cpp:1643-1645), and
+    # SetProviderSessionOptions then registers each named provider. CPU is not
+    # in the dispatch table (src/models/session_options.cpp:150-159); it has no
+    # configurable options, and ORT InferenceSession adds it implicitly when no
+    # other EP is registered (onnxruntime/core/session/inference_session.cc:
+    # SetCpuProviderWasImplicitlyAdded). For CPU variants we therefore emit an
+    # empty list rather than a sentinel ``[{"CPU": {}}]`` entry. For every
+    # other EP we name it explicitly (NormalizeProviderName canonicalises the
+    # case for QNN/DML/OpenVINO/etc., and "cuda" is already lowercase in the
+    # dispatch table). This matches the convention used by reference ORT model
+    # packages and avoids registering CPU through the V1 no-op path.
+    if genai_ep == "CPU":
+        session_options["provider_options"] = []
+    else:
+        session_options["provider_options"] = [{genai_ep: provider_options}]
 
     role_patch: dict[str, Any] = {"session_options": session_options}
     if v.onnx_files:
diff --git a/test/cli/test_model_package.py b/test/cli/test_model_package.py
index 571a88a53..9e426a4e6 100644
--- a/test/cli/test_model_package.py
+++ b/test/cli/test_model_package.py
@@ -417,12 +417,15 @@ def test_overlay_provider_options_match_ep_by_name(self, tmp_path):
             {"qnn": {"backend_path": "QnnHtp.so"}}
         ]
 
-    def test_overlay_always_emits_provider_options_for_cpu(self, tmp_path):
-        """Plain CPU variants still get an overlay so ORT-GenAI sees an explicit
-        provider entry. The base config strips ``session_options`` (variant-
-        specific), so without this overlay session construction would fail with
-        "No execution providers were provided or selected" even though the
-        variant's metadata.json names CPUExecutionProvider.
+    def test_overlay_emits_empty_provider_options_for_cpu(self, tmp_path):
+        """CPU variants emit ``provider_options: []`` rather than a sentinel entry.
+
+        ``[{"CPU": {}}]`` is not needed: ORT-GenAI's dispatch table has no CPU
+        handler (src/models/session_options.cpp), and ORT InferenceSession
+        implicitly registers the CPU EP when no other provider is selected
+        (onnxruntime/core/session/inference_session.cc), so the explicit entry
+        would only trigger a V1 no-op registration. An empty list matches the
+        convention used by reference ORT model packages.
         """
         onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
         out = tmp_path / "package"
@@ -444,7 +447,7 @@ def test_overlay_always_emits_provider_options_for_cpu(self, tmp_path):
             "model": {
                 "decoder": {
                     "filename": "model.onnx",
-                    "session_options": {"provider_options": [{"CPU": {}}]},
+                    "session_options": {"provider_options": []},
                 }
             }
         }
@@ -835,14 +838,10 @@ def test_per_component_inference_settings_wins(self, tmp_path):
         cmd.run()
 
         # assert: encoder uses target-level, decoder uses component-level
-        encoder_overlay = json.loads(
-            (out / "models" / "encoder" / "soc_60" / "genai_config_overlay.json").read_text()
-        )
+        encoder_overlay = json.loads((out / "models" / "encoder" / "soc_60" / "genai_config_overlay.json").read_text())
         assert encoder_overlay["model"]["encoder"]["session_options"]["graph_optimization_level"] == 1
 
-        decoder_overlay = json.loads(
-            (out / "models" / "decoder" / "soc_60" / "genai_config_overlay.json").read_text()
-        )
+        decoder_overlay = json.loads((out / "models" / "decoder" / "soc_60" / "genai_config_overlay.json").read_text())
         assert decoder_overlay["model"]["decoder"]["session_options"]["graph_optimization_level"] == 99
 
 

From 78cb6e40d06734a6234811e14a06fcf279f3d5a1 Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Thu, 4 Jun 2026 21:46:21 +0000
Subject: [PATCH 08/11] Sweep model-suffix sidecar files into each variant
 directory

OpenVINO/QNN variants ship a tiny EPContext stub .onnx plus same-stem .xml/.bin
sidecars that the loader expects to find next to it. These sidecars are not
referenced through ONNX initializer external_data, so the previous copy path
missed them and the produced variants were unloadable.

After copying each .onnx and its external-data blobs, walk the source
directory once more and copy any remaining files whose suffix is one of the
known model suffixes (.onnx/.bin/.xml/.data). Each Olive source directory
holds the artifacts for a single variant, so any model-suffix file there
belongs next to the ONNX. Skips duplicates already copied via external_data.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 olive/cli/model_package.py     | 20 +++++++++++
 test/cli/test_model_package.py | 66 ++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index e0d7fba84..9a0000763 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -515,6 +515,7 @@ def _write_component(
         variant_dir = component_dir / v.variant_name
         variant_dir.mkdir(parents=True, exist_ok=True)
 
+        source_dirs: set[Path] = set()
         for onnx_src in v.onnx_files:
             onnx_src_path = Path(onnx_src)
             if not onnx_src_path.is_file():
@@ -522,6 +523,7 @@ def _write_component(
 
             onnx_dst = variant_dir / onnx_src_path.name
             shutil.copy2(str(onnx_src_path), str(onnx_dst))
+            source_dirs.add(onnx_src_path.parent.resolve())
 
             ext_refs = _discover_external_data(onnx_src_path)
             external_root = onnx_src_path.parent.resolve()
@@ -548,6 +550,24 @@ def _write_component(
                 if not blob_dst.exists():
                     shutil.copy2(str(blob_src), str(blob_dst))
 
+        # Sweep each source directory for remaining model-suffix sidecar files
+        # (e.g. an EPContext stub ``.onnx`` typically points at a same-stem
+        # ``.xml``/``.bin`` pair for OpenVINO or a ``.bin`` context blob for
+        # QNN; these sidecars don't appear in the ONNX initializer
+        # ``external_data`` table so the standard external-data copy above
+        # misses them). Each Olive source directory holds the artifacts for a
+        # single variant, so any file with a model suffix is part of this
+        # variant and belongs next to the ONNX. Duplicates already copied as
+        # external-data are skipped.
+        for src_dir in sorted(source_dirs):
+            for entry in sorted(src_dir.iterdir()):
+                if not entry.is_file() or entry.suffix not in _MODEL_SUFFIXES:
+                    continue
+                dst = variant_dir / entry.name
+                if dst.exists():
+                    continue
+                shutil.copy2(str(entry), str(dst))
+
         # Per-variant runtime fields flow through genai_config_overlay.json.
         _write_genai_config_overlay(variant_dir, component_role, v)
 
diff --git a/test/cli/test_model_package.py b/test/cli/test_model_package.py
index 9e426a4e6..27543e772 100644
--- a/test/cli/test_model_package.py
+++ b/test/cli/test_model_package.py
@@ -542,6 +542,72 @@ def test_single_variant_keeps_blob_inline(self, tmp_path):
         # No variant.json is emitted.
         assert not (out / "models" / "decoder" / "cpu" / "variant.json").exists()
 
+    def test_copies_model_suffix_sidecars_into_variant_dir(self, tmp_path):
+        """Sidecars next to an EPContext stub get copied into the variant dir.
+
+        OpenVINO/QNN-style sidecars (e.g. ``.xml``/``.bin`` next to an EPContext stub
+        ``.onnx``) aren't referenced through ONNX initializer external_data, so the
+        writer sweeps the source directory and copies every model-suffix file next to
+        the variant ONNX. Non-model files like ``.bak`` and ``.json`` are left alone.
+        """
+        src_dir = tmp_path / "src"
+        onnx_path = _make_onnx_inline(src_dir / "openvino_model_dy.onnx")
+        (src_dir / "openvino_model_dy.xml").write_bytes(b"<openvino-ir/>")
+        (src_dir / "openvino_model_dy.bin").write_bytes(b"\x01\x02\x03\x04" * 64)
+        # Files that must NOT be picked up by the sidecar sweep:
+        (src_dir / "openvino_model_dy.onnx.bak").write_bytes(b"stale")
+        (src_dir / "tokenizer.json").write_text("{}")
+
+        out = tmp_path / "package"
+        write_model_package(
+            output_dir=out,
+            variants=[
+                VariantSpec(
+                    component_name="decoder",
+                    variant_name="openvino_gpu",
+                    onnx_files=[onnx_path],
+                    ep="OpenVINOExecutionProvider",
+                )
+            ],
+        )
+
+        variant_dir = out / "models" / "decoder" / "openvino_gpu"
+        assert (variant_dir / "openvino_model_dy.onnx").is_file()
+        assert (variant_dir / "openvino_model_dy.xml").is_file()
+        assert (variant_dir / "openvino_model_dy.bin").is_file()
+        assert (variant_dir / "openvino_model_dy.bin").read_bytes() == b"\x01\x02\x03\x04" * 64
+        # .bak and .json must stay out of the variant dir; .bak has the wrong suffix
+        # and .json belongs under configs/, not next to the ONNX.
+        assert not (variant_dir / "openvino_model_dy.onnx.bak").exists()
+        assert not (variant_dir / "tokenizer.json").exists()
+
+    def test_sidecar_sweep_does_not_overwrite_external_data(self, tmp_path):
+        """External-data blobs are not overwritten by the sidecar sweep.
+
+        Blobs already copied through the ONNX initializer path must not be overwritten
+        by the broader source-directory sweep — the existing copy is authoritative
+        (it came from the ONNX it belongs to).
+        """
+        blob = b"\xaa" * 256
+        onnx_path = _make_onnx_with_external(tmp_path / "src" / "model.onnx", "model.onnx.data", blob)
+        out = tmp_path / "package"
+
+        write_model_package(
+            output_dir=out,
+            variants=[
+                VariantSpec(
+                    component_name="decoder",
+                    variant_name="cpu",
+                    onnx_files=[onnx_path],
+                    ep="CPUExecutionProvider",
+                )
+            ],
+        )
+
+        copied = out / "models" / "decoder" / "cpu" / "model.onnx.data"
+        assert copied.is_file()
+        assert copied.read_bytes() == blob
+
 
 # ---------------------------------------------------------------------------
 # Writer: configs/ + safety

From 071bee2999f16a07ac53706325b2fdb8bd4590bb Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Thu, 4 Jun 2026 21:53:33 +0000
Subject: [PATCH 09/11] Lift per-variant model-level fields into each variant
 overlay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Variants of the same model can legitimately differ on a small set of
model-level fields — most importantly `context_length`, which on
OpenVINO NPU is capped at the prompt+response budget (e.g. 4224) but
on GPU/CPU runs at the full pretrained limit (e.g. 131072). The same
applies to `pad_token_id`, `bos_token_id`, `eos_token_id`, and
`type`. The base genai_config can only hold one value for each of
these, so without a per-variant overlay the merged config would
silently use whichever source happened to be chosen as the base.

This change lifts those fields verbatim from each variant's source
`genai_config.json` into its overlay, and strips them from the
base. The strip is required for the array field `eos_token_id`:
GenAI's overlay merge appends arrays rather than replacing them, so a
base entry would duplicate (not override) the variant entry.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 olive/cli/model_package.py     |  79 ++++++++++++++++++++++--
 test/cli/test_model_package.py | 108 +++++++++++++++++++++++++++++++++
 2 files changed, 182 insertions(+), 5 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index 9a0000763..7e6245cca 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -240,7 +240,7 @@ def _build_single_variants(self, targets: list[tuple[str, Path, dict]]) -> list[
         task = self._extract_task(targets)
         component_name = _task_to_component_name(task)
         variants: list[VariantSpec] = []
-        for target_name, _src, model_config in targets:
+        for target_name, source_path, model_config in targets:
             attrs = _get_model_attributes(model_config)
             onnx_path = _resolve_onnx_path(model_config)
             ep, device, compatibility_string = _ep_device_compatibility(attrs, onnx_path, target_name)
@@ -253,6 +253,7 @@ def _build_single_variants(self, targets: list[tuple[str, Path, dict]]) -> list[
                     device=device,
                     compatibility_string=compatibility_string,
                     inference_settings=model_config.get("config", {}).get("inference_settings") or {},
+                    source_genai=_load_source_genai(source_path),
                 )
             )
         return variants
@@ -263,9 +264,10 @@ def _build_composite_variants(self, targets: list[tuple[str, Path, dict]]) -> li
         # Track per-component variants in source insertion order.
         component_variants: dict[str, list[VariantSpec]] = OrderedDict()
 
-        for target_name, _src, model_config in targets:
+        for target_name, source_path, model_config in targets:
             target_attrs = _get_model_attributes(model_config)
             target_inference = model_config.get("config", {}).get("inference_settings") or {}
+            target_genai = _load_source_genai(source_path)
             components = model_config["config"].get("model_components", [])
             component_names = model_config["config"].get("model_component_names", [])
 
@@ -295,6 +297,7 @@ def _build_composite_variants(self, targets: list[tuple[str, Path, dict]]) -> li
                     device=device,
                     compatibility_string=compatibility_string,
                     inference_settings=comp_inference,
+                    source_genai=target_genai,
                 )
                 component_variants.setdefault(comp_name, []).append(spec)
 
@@ -415,6 +418,11 @@ class VariantSpec:
     compatibility_string: Optional[str] = None
     inference_settings: dict[str, Any] = field(default_factory=dict)
     consumer_metadata: Optional[dict[str, Any]] = None
+    # The variant's source ``genai_config.json`` (parsed). Used to lift
+    # per-variant fields (context_length, pad_token_id, decoder.inputs, ...)
+    # into the variant overlay. Kept as a deep object rather than a path so
+    # callers can synthesize it without touching disk.
+    source_genai: Optional[dict[str, Any]] = None
 
 
 def write_model_package(
@@ -643,16 +651,57 @@ def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: Varia
         # ``<variant_dir>/<filename>``, so emit the basename here.
         role_patch["filename"] = Path(v.onnx_files[0]).name
 
-    overlay = {"model": {component_role: role_patch}}
+    model_patch: dict[str, Any] = {component_role: role_patch}
+    # Lift per-variant model-level scalars from the variant's own
+    # genai_config.json. The base config strips these (see
+    # ``_strip_variant_specific``) because they legitimately differ across
+    # variants (e.g. NPU runtime caps ``context_length`` at 4224 while CPU/CUDA
+    # use the full 131072; pad_token_id can differ when one exporter uses the
+    # EOS as PAD and another uses the sentinel). Without this lift the merged
+    # config would silently use whichever variant happened to win the base
+    # selection.
+    src_genai = v.source_genai or {}
+    src_model = src_genai.get("model") if isinstance(src_genai, dict) else None
+    if isinstance(src_model, dict):
+        for k in _VARIANT_LEVEL_MODEL_KEYS:
+            if k in src_model:
+                # Deep-copy via JSON round-trip so we never share refs with the
+                # caller's dict; arrays in particular must be independent
+                # because GenAI's overlay parser treats arrays as append-merge.
+                model_patch[k] = json.loads(json.dumps(src_model[k]))
+
+    overlay = {"model": model_patch}
     _write_json(variant_dir / "genai_config_overlay.json", overlay)
 
 
-def _strip_variant_specific(node: Any, keys: tuple[str, ...] = ("filename", "session_options")) -> Any:
+# Per-variant model-level keys that we strip from the package's base
+# genai_config.json and re-supply from each variant's source. These appear
+# directly under ``model`` (not nested under ``model.<role>``) and we have
+# observed them to legitimately diverge across variants of the same model
+# (NPU context truncation, exporter-specific pad token encoding, etc.). Kept
+# minimal: only add a key here when we have evidence it varies AND its base
+# value would be wrong for some variant.
+_VARIANT_LEVEL_MODEL_KEYS: tuple[str, ...] = (
+    "context_length",
+    "pad_token_id",
+    "eos_token_id",
+    "bos_token_id",
+    "type",
+)
+
+
+def _strip_variant_specific(
+    node: Any,
+    keys: tuple[str, ...] = ("filename", "session_options", *_VARIANT_LEVEL_MODEL_KEYS),
+) -> Any:
     """Recursively drop variant-specific keys from a genai_config-shaped dict.
 
     ``filename`` and ``session_options`` are intrinsically variant-specific and
     must not live in the package's base ``configs/genai_config.json``; per-variant
-    ``genai_config_overlay.json`` files patch them back in. Returns a deep copy.
+    ``genai_config_overlay.json`` files patch them back in. The same logic
+    applies to per-variant model-level scalars listed in
+    ``_VARIANT_LEVEL_MODEL_KEYS`` (e.g. ``context_length`` differs between NPU
+    and GPU variants of the same model). Returns a deep copy.
     """
     if isinstance(node, dict):
         return {k: _strip_variant_specific(v, keys) for k, v in node.items() if k not in keys}
@@ -987,6 +1036,26 @@ def _get_model_attributes(model_config: dict) -> dict:
     return model_config.get("config", {}).get("model_attributes") or {}
 
 
+def _load_source_genai(source_path: Path) -> Optional[dict]:
+    """Return the parsed ``<source>/genai_config.json`` if present.
+
+    Each variant's source directory carries its own genai_config; the writer
+    lifts per-variant model-level fields from it into the variant overlay.
+    Missing or unparseable files yield ``None`` rather than failing so a
+    source without genai_config (e.g. a pure-ONNX export not destined for
+    GenAI) can still be packaged.
+    """
+    path = Path(source_path) / "genai_config.json"
+    if not path.is_file():
+        return None
+    try:
+        with path.open(encoding="utf-8") as fh:
+            return json.load(fh)
+    except Exception:
+        logger.debug("Could not parse %s; skipping per-variant model-field lift.", path, exc_info=True)
+        return None
+
+
 def _resolve_onnx_path(model_config: dict) -> Path:
     """Resolve the ONNX file path from an Olive model config.
 
diff --git a/test/cli/test_model_package.py b/test/cli/test_model_package.py
index 27543e772..cad786e09 100644
--- a/test/cli/test_model_package.py
+++ b/test/cli/test_model_package.py
@@ -452,6 +452,114 @@ def test_overlay_emits_empty_provider_options_for_cpu(self, tmp_path):
             }
         }
 
+    def test_overlay_lifts_per_variant_model_level_fields(self, tmp_path):
+        """Per-variant ``context_length`` (and similar) flows from source to overlay.
+
+        Each variant's source ``genai_config.json`` is the source of truth for
+        model-level scalars that legitimately vary across variants of the same
+        model (e.g. an NPU build caps ``context_length`` while the GPU build
+        does not). The writer strips these from the base config and re-supplies
+        them per variant; without this lift the merged config would silently
+        use whichever variant's base happened to win.
+        """
+        onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
+        out = tmp_path / "package"
+
+        npu_source_genai = {
+            "model": {
+                "type": "phi3",
+                "context_length": 4224,
+                "pad_token_id": 200020,
+                "eos_token_id": [200020, 199999],
+                "bos_token_id": 199999,
+                "vocab_size": 200064,
+                "decoder": {"head_size": 128, "filename": "model.onnx", "session_options": {}},
+            }
+        }
+
+        write_model_package(
+            output_dir=out,
+            variants=[
+                VariantSpec(
+                    component_name="decoder",
+                    variant_name="npu",
+                    onnx_files=[onnx_path],
+                    ep="OpenVINOExecutionProvider",
+                    source_genai=npu_source_genai,
+                )
+            ],
+        )
+
+        overlay = json.loads((out / "models" / "decoder" / "npu" / "genai_config_overlay.json").read_text())
+        model_patch = overlay["model"]
+        assert model_patch["context_length"] == 4224
+        assert model_patch["pad_token_id"] == 200020
+        assert model_patch["eos_token_id"] == [200020, 199999]
+        assert model_patch["bos_token_id"] == 199999
+        assert model_patch["type"] == "phi3"
+        # ``vocab_size`` is structural (shared across all variants of a model)
+        # and is not in the per-variant lift list, so it must NOT appear in
+        # the overlay — otherwise it would duplicate the base copy.
+        assert "vocab_size" not in model_patch
+
+    def test_base_genai_strips_per_variant_model_fields(self, tmp_path):
+        """The base ``configs/genai_config.json`` must not carry per-variant fields.
+
+        If ``context_length`` (or similar) lived in the base, GenAI's overlay
+        merge would still honour the per-variant value (overlay scalar wins),
+        but ``_VARIANT_LEVEL_MODEL_KEYS`` includes arrays (``eos_token_id``)
+        whose presence in the base would trigger GenAI's array-append merge
+        semantics — the merged result would duplicate the array. So the base
+        must be free of every variant-level model key.
+        """
+        onnx_path = _make_onnx_inline(tmp_path / "src" / "model.onnx")
+        out = tmp_path / "package"
+        cfg = tmp_path / "configs_src" / "genai_config.json"
+        cfg.parent.mkdir(parents=True)
+        cfg.write_text(
+            json.dumps(
+                {
+                    "model": {
+                        "type": "phi3",
+                        "context_length": 131072,
+                        "pad_token_id": 199999,
+                        "eos_token_id": [200020, 199999],
+                        "bos_token_id": 199999,
+                        "vocab_size": 200064,
+                        "decoder": {
+                            "head_size": 128,
+                            "filename": "model.onnx",
+                            "session_options": {"log_id": "x"},
+                        },
+                    }
+                }
+            )
+        )
+
+        write_model_package(
+            output_dir=out,
+            variants=[
+                VariantSpec(
+                    component_name="decoder",
+                    variant_name="cpu",
+                    onnx_files=[onnx_path],
+                    ep="CPUExecutionProvider",
+                )
+            ],
+            config_files={"genai_config.json": cfg},
+        )
+
+        base = json.loads((out / "configs" / "genai_config.json").read_text())
+        model = base["model"]
+        for stripped in ("context_length", "pad_token_id", "eos_token_id", "bos_token_id", "type"):
+            assert stripped not in model, f"base genai_config must not contain {stripped!r}"
+        # Variant-specific decoder fields also stripped.
+        assert "filename" not in model["decoder"]
+        assert "session_options" not in model["decoder"]
+        # Structural shared fields remain.
+        assert model["vocab_size"] == 200064
+        assert model["decoder"]["head_size"] == 128
+
 
 # ---------------------------------------------------------------------------
 # Writer: external-data blobs are always kept inline per variant (no dedup)

From befee955e1d7dc03bdb0112618450da19f4a7387 Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Thu, 4 Jun 2026 22:23:23 +0000
Subject: [PATCH 10/11] Support pipeline sources and genai_config-only sources
 in generate-model-package
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds support for two source shapes that previously couldn't be packaged:

1. **Pipeline (multi-stage) sources** — e.g. QNN exports that ship a
   single source dir containing 4 ONNX stages (embedding,
   prompt-processor, token-generator, transformer-head) plus QNN context
   binaries. The pipeline structure lives in the source's
   `genai_config.json` at `model.<role>.pipeline`. The packager:
   - lifts every stage's ONNX file from the source into the variant dir
     (the existing sidecar sweep already takes care of the QNN .bin
     context binaries that sit next to the ONNX files);
   - writes the pipeline array verbatim into the variant overlay so
     each stage keeps its own filename and EP-specific
     `session_options.provider_options` (htp_performance_mode,
     soc_model, etc.);
   - strips `pipeline` from the base genai_config because GenAI's
     overlay parser appends arrays rather than replacing them — a
     pipeline in both base and overlay would double every stage.

2. **GenAI-shaped sources without `model_config.json`** — sources
   downloaded directly from a model hub instead of produced by an Olive
   workflow. `_read_model_config` now synthesises a minimal config
   from `genai_config.json` + a directory scan so the rest of the
   packager stays single-codepath. As a bonus, an existing
   `model_config.json` whose `model_path` is unreachable (a common
   state when artifacts are copied between hosts) is repaired in-memory
   by repointing it at the source directory.

Supporting changes:
- `_extract_task` now honours `model_attributes.task` and falls
  back to inspecting the source genai_config for a `decoder` role, so
  the component directory ends up as `models/decoder/...` (not the
  generic `models/model/...`).
- EP derivation prefers the source genai's
  `session_options.provider_options` alias over the variant-name
  heuristic — e.g. a directory named `vitia_npu` correctly resolves
  to VitisAI rather than QNN (the `npu` substring would otherwise win
  by accident).
- `_VARIANT_NAME_EP_HINTS` gains `vitisai`/`vitia` entries ahead
  of `npu` so the heuristic itself is also unambiguous.

Validated end-to-end on Phi-4-mini-reasoning by packaging:
- qnn_npu  → 2.8 GB pipeline package (4 ONNX + 4 .bin + per-stage
  overlay options preserved)
- vitia_npu → 3.2 GB flat package (single ONNX + VitisAI overlay)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 olive/cli/model_package.py     | 416 +++++++++++++++++++++++++++++----
 test/cli/test_model_package.py | 260 +++++++++++++++++++++
 2 files changed, 626 insertions(+), 50 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index 7e6245cca..3c111551a 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -107,6 +107,12 @@
     "SNPEExecutionProvider": "SNPE",
 }
 
+# Reverse lookup: ``genai_config`` provider alias (case-insensitive) → canonical
+# ORT EP name. Used when reading a source's ``genai_config.json`` to derive the
+# EP from a ``provider_options`` entry without requiring the source to also
+# carry an Olive ``model_config.json``.
+_GENAI_TO_EP: dict[str, str] = {alias.lower(): ep for ep, alias in _EP_TO_GENAI.items()}
+
 # Hash chunk size for SHA-256 over external-data blobs.
 _HASH_CHUNK = 1024 * 1024
 
@@ -242,18 +248,53 @@ def _build_single_variants(self, targets: list[tuple[str, Path, dict]]) -> list[
         variants: list[VariantSpec] = []
         for target_name, source_path, model_config in targets:
             attrs = _get_model_attributes(model_config)
-            onnx_path = _resolve_onnx_path(model_config)
-            ep, device, compatibility_string = _ep_device_compatibility(attrs, onnx_path, target_name)
+            source_genai = _load_source_genai(source_path)
+            # Pipeline sources (e.g. QNN multi-stage exports) declare each
+            # stage's ONNX inside ``genai_config.model.<role>.pipeline``;
+            # there can be multiple ``.onnx`` files in the source directory
+            # so the single-ONNX resolver would fail. We list them all here
+            # and let the writer's copy loop fan them out into the variant
+            # directory; the overlay writer lifts the pipeline structure
+            # verbatim from the source genai_config.
+            pipeline_files = _resolve_pipeline_onnx_files(source_path, source_genai)
+            if pipeline_files:
+                onnx_files = pipeline_files
+                inference = model_config.get("config", {}).get("inference_settings") or {}
+                ep = (
+                    attrs.get("ep")
+                    or _derive_ep_from_genai(source_genai, _pick_primary_role(source_genai))
+                    # ``or [None]`` keeps an empty/null EP list from raising on ``[0]``.
+                    or (inference.get("execution_provider") or [None])[0]
+                    or _guess_ep_from_variant_name(target_name)
+                    or "CPUExecutionProvider"
+                )
+                device = attrs.get("device") or None
+                compatibility_string = None
+            else:
+                onnx_files = [_resolve_onnx_path(model_config)]
+                # When the source has its own genai_config and ``attrs.ep``
+                # is absent, the genai's ``provider_options`` are a stronger
+                # signal than the variant-directory-name heuristic — e.g. a
+                # directory named ``vitia_npu`` should be VitisAI rather
+                # than QNN (``npu`` hint wins by accident).
+                attrs_for_compat = dict(attrs)
+                if not attrs_for_compat.get("ep"):
+                    derived = _derive_ep_from_genai(source_genai, _pick_primary_role(source_genai))
+                    if derived:
+                        attrs_for_compat["ep"] = derived
+                ep, device, compatibility_string = _ep_device_compatibility(
+                    attrs_for_compat, onnx_files[0], target_name
+                )
             variants.append(
                 VariantSpec(
                     component_name=component_name,
                     variant_name=target_name,
-                    onnx_files=[onnx_path],
+                    onnx_files=onnx_files,
                     ep=ep,
                     device=device,
                     compatibility_string=compatibility_string,
                     inference_settings=model_config.get("config", {}).get("inference_settings") or {},
-                    source_genai=_load_source_genai(source_path),
+                    source_genai=source_genai,
                 )
             )
         return variants
@@ -352,10 +393,14 @@ def _parse_sources(self) -> list[tuple[str, Path]]:
             path = Path(source)
             if not path.is_dir():
                 raise ValueError(f"Source path does not exist or is not a directory: {path}")
-            if not (path / "model_config.json").exists():
+            # Either an Olive-emitted ``model_config.json`` or a stand-alone
+            # GenAI export (which carries its own ``genai_config.json``) is
+            # acceptable; the latter is synthesized into a model_config below.
+            if not (path / "model_config.json").is_file() and not (path / "genai_config.json").is_file():
                 raise ValueError(
-                    f"No model_config.json found in {path}. "
-                    "Source must be an Olive output directory with model_config.json."
+                    f"Source {path} has neither model_config.json nor genai_config.json. "
+                    "Provide an Olive output directory or a GenAI-shaped source "
+                    "containing genai_config.json plus its ONNX files."
                 )
             name = path.name
             if name in seen_names:
@@ -371,8 +416,37 @@ def _parse_sources(self) -> list[tuple[str, Path]]:
 
     @staticmethod
     def _read_model_config(source_path: Path) -> dict:
-        with (source_path / "model_config.json").open() as f:
-            return json.load(f)
+        """Return the source's ``model_config.json``, synthesizing one if absent.
+
+        Olive-emitted directories ship a ``model_config.json``; GenAI-shaped
+        directories (e.g. ones downloaded straight from a model hub) only
+        ship ``genai_config.json``. For the latter we derive enough of an
+        Olive-shaped config from the genai file and a directory scan so the
+        rest of the packager can stay single-codepath.
+
+        An existing ``model_config.json`` whose ``model_path`` no longer
+        resolves on this machine (a common state when artifacts are copied
+        between hosts) is repaired in-memory: ``model_path`` is rewritten to
+        point at ``source_path`` so the local ONNX is used. The original
+        ``model_attributes`` (vocab_size, num_hidden_layers, ...) are kept
+        because they remain valid descriptors of the model itself.
+        """
+        config_path = source_path / "model_config.json"
+        if config_path.is_file():
+            with config_path.open() as f:
+                model_config = json.load(f)
+            cfg = model_config.get("config") or {}
+            raw_path = cfg.get("model_path")
+            if raw_path and not Path(raw_path).exists():
+                logger.info(
+                    "model_config.json in %s references unreachable model_path %r; repointing to the source directory.",
+                    source_path,
+                    raw_path,
+                )
+                cfg["model_path"] = str(source_path)
+                model_config["config"] = cfg
+            return model_config
+        return _synthesize_model_config_from_source(source_path, _load_source_genai(source_path))
 
     # ------------------------------------------------------------------
     # Task extraction
@@ -380,6 +454,19 @@ def _read_model_config(source_path: Path) -> dict:
 
     @staticmethod
     def _extract_task(targets: list[tuple[str, Path, dict]]) -> str:
+        # An explicit ``model_attributes.task`` takes precedence: an Olive
+        # workflow records it from its own config, and a synthesized
+        # model_config (for sources lacking model_config.json) carries it
+        # directly. Falling through to the HuggingFace Hub lookup is only
+        # needed for older workflows that recorded ``_name_or_path`` but no
+        # task. Returns the underscore-normalised form (``text_generation``)
+        # so ``_task_to_component_name`` can resolve a component name.
+        for _target_name, _source_path, model_config in targets:
+            attrs = _get_model_attributes(model_config)
+            task = attrs.get("task")
+            if isinstance(task, str) and task:
+                return task.replace("-", "_")
+
         model_name_or_path = ""
         for _target_name, _source_path, model_config in targets:
             attrs = _get_model_attributes(model_config)
@@ -387,18 +474,36 @@ def _extract_task(targets: list[tuple[str, Path, dict]]) -> str:
             if model_name_or_path:
                 break
 
-        if not model_name_or_path:
-            return ""
+        if model_name_or_path:
+            try:
+                from huggingface_hub import model_info
 
-        try:
-            from huggingface_hub import model_info
+                info = model_info(model_name_or_path)
+                tag = info.pipeline_tag or ""
+                return tag.replace("-", "_")
+            except Exception:
+                logger.debug("Could not fetch task from HuggingFace Hub for %s", model_name_or_path, exc_info=True)
+
+        # Last-ditch: peek at each source's genai_config.json. GenAI roles
+        # map cleanly to tasks (``decoder`` → text generation, etc.), and a
+        # source without an Olive model_config typically still ships this
+        # file. Keeps the component directory name consistent
+        # (``models/decoder/...``) instead of falling through to the
+        # generic ``models/model/...``.
+        for _target_name, source_path, _model_config in targets:
+            source_genai = _load_source_genai(source_path)
+            model_block = source_genai.get("model") if isinstance(source_genai, dict) else None
+            if not isinstance(model_block, dict):
+                continue
+            roles = {k for k, v in model_block.items() if isinstance(v, dict)}
+            # Test the encoder+decoder pair first: a bare ``decoder`` check
+            # also matches encoder-decoder models and would shadow this branch.
+            if "encoder" in roles and "decoder" in roles:
+                return "text2text_generation"
+            if "decoder" in roles:
+                return "text_generation"
 
-            info = model_info(model_name_or_path)
-            tag = info.pipeline_tag or ""
-            return tag.replace("-", "_")
-        except Exception:
-            logger.debug("Could not fetch task from HuggingFace Hub for %s", model_name_or_path, exc_info=True)
-            return ""
+        return ""
 
 
 # ---------------------------------------------------------------------------
@@ -621,35 +726,47 @@ def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: Varia
     (``model.<role>``). The base config has those keys stripped (see
     ``_strip_variant_specific``); each variant overlay puts them back so ORT
     resolves files inside the chosen variant directory.
+
+    Pipeline variants (multi-stage exports, e.g. QNN) are handled by lifting
+    the source genai_config's ``model.<role>`` block verbatim — pipeline
+    stage filenames are inside the array entries, and per-stage
+    ``session_options.provider_options`` already carry the right EP-scoped
+    options. The base has ``pipeline`` stripped because GenAI's overlay
+    parser would otherwise append the variant's array onto the base's,
+    producing duplicated stages.
     """
-    inference = v.inference_settings or {}
-    session_options: dict[str, Any] = dict(inference.get("session_options") or {})
-    provider_options = _provider_options_for_ep(inference, v.ep)
-    genai_ep = _genai_provider_name(v.ep)
-
-    # ORT-GenAI's FinalizeConfig builds session_options.providers from
-    # provider_options[*].name (src/config.cpp:1643-1645), and
-    # SetProviderSessionOptions then registers each named provider. CPU is not
-    # in the dispatch table (src/models/session_options.cpp:150-159); it has no
-    # configurable options, and ORT InferenceSession adds it implicitly when no
-    # other EP is registered (onnxruntime/core/session/inference_session.cc:
-    # SetCpuProviderWasImplicitlyAdded). For CPU variants we therefore emit an
-    # empty list rather than a sentinel ``[{"CPU": {}}]`` entry. For every
-    # other EP we name it explicitly (NormalizeProviderName canonicalises the
-    # case for QNN/DML/OpenVINO/etc., and "cuda" is already lowercase in the
-    # dispatch table). This matches the convention used by reference ORT model
-    # packages and avoids registering CPU through the V1 no-op path.
-    if genai_ep == "CPU":
-        session_options["provider_options"] = []
+    pipeline_role_body = _pipeline_role_body_for_overlay(v.source_genai, component_role)
+    if pipeline_role_body is not None:
+        role_patch: dict[str, Any] = pipeline_role_body
     else:
-        session_options["provider_options"] = [{genai_ep: provider_options}]
+        inference = v.inference_settings or {}
+        session_options: dict[str, Any] = dict(inference.get("session_options") or {})
+        provider_options = _provider_options_for_ep(inference, v.ep)
+        genai_ep = _genai_provider_name(v.ep)
+
+        # ORT-GenAI's FinalizeConfig builds session_options.providers from
+        # provider_options[*].name (src/config.cpp:1643-1645), and
+        # SetProviderSessionOptions then registers each named provider. CPU is not
+        # in the dispatch table (src/models/session_options.cpp:150-159); it has no
+        # configurable options, and ORT InferenceSession adds it implicitly when no
+        # other EP is registered (onnxruntime/core/session/inference_session.cc:
+        # SetCpuProviderWasImplicitlyAdded). For CPU variants we therefore emit an
+        # empty list rather than a sentinel ``[{"CPU": {}}]`` entry. For every
+        # other EP we name it explicitly (NormalizeProviderName canonicalises the
+        # case for QNN/DML/OpenVINO/etc., and "cuda" is already lowercase in the
+        # dispatch table). This matches the convention used by reference ORT model
+        # packages and avoids registering CPU through the V1 no-op path.
+        if genai_ep == "CPU":
+            session_options["provider_options"] = []
+        else:
+            session_options["provider_options"] = [{genai_ep: provider_options}]
 
-    role_patch: dict[str, Any] = {"session_options": session_options}
-    if v.onnx_files:
-        # The base config strips ``filename`` (it was a variant-specific path
-        # like ``decoder/model.onnx``); the loader resolves the variant ONNX as
-        # ``<variant_dir>/<filename>``, so emit the basename here.
-        role_patch["filename"] = Path(v.onnx_files[0]).name
+        role_patch = {"session_options": session_options}
+        if v.onnx_files:
+            # The base config strips ``filename`` (it was a variant-specific path
+            # like ``decoder/model.onnx``); the loader resolves the variant ONNX as
+            # ``<variant_dir>/<filename>``, so emit the basename here.
+            role_patch["filename"] = Path(v.onnx_files[0]).name
 
     model_patch: dict[str, Any] = {component_role: role_patch}
     # Lift per-variant model-level scalars from the variant's own
@@ -674,6 +791,41 @@ def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: Varia
     _write_json(variant_dir / "genai_config_overlay.json", overlay)
 
 
+def _pipeline_role_body_for_overlay(source_genai: Optional[dict], component_role: str) -> Optional[dict]:
+    """If ``source_genai`` declares a pipeline for ``component_role``, return its overlay body.
+
+    Returns a deep-copied ``{"pipeline": [...], "session_options": {...}}``
+    dict ready to drop under ``model.<role>`` in the overlay. The source's
+    per-stage ``filename`` + ``session_options.provider_options`` are
+    preserved verbatim (each stage's EP options were validated by the
+    producing toolchain and copying them as-is avoids the EP-canonicalisation
+    bookkeeping the flat-variant path otherwise has to do). Returns
+    ``None`` when no pipeline applies — caller falls back to the flat
+    overlay shape.
+    """
+    if not isinstance(source_genai, dict):
+        return None
+    model_block = source_genai.get("model")
+    if not isinstance(model_block, dict):
+        return None
+    role_body = model_block.get(component_role)
+    if not isinstance(role_body, dict):
+        return None
+    pipeline = role_body.get("pipeline")
+    if not isinstance(pipeline, list) or not pipeline:
+        return None
+
+    patch: dict[str, Any] = {"pipeline": json.loads(json.dumps(pipeline))}
+    # The role-level ``session_options`` carries the decoder's
+    # ``intra_op_num_threads`` / ``log_id`` / etc.; per-stage session_options
+    # nested inside ``pipeline`` are independent and already covered by the
+    # deep copy above.
+    so = role_body.get("session_options")
+    if isinstance(so, dict):
+        patch["session_options"] = json.loads(json.dumps(so))
+    return patch
+
+
 # Per-variant model-level keys that we strip from the package's base
 # genai_config.json and re-supply from each variant's source. These appear
 # directly under ``model`` (not nested under ``model.<role>``) and we have
@@ -692,16 +844,19 @@ def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: Varia
 
 def _strip_variant_specific(
     node: Any,
-    keys: tuple[str, ...] = ("filename", "session_options", *_VARIANT_LEVEL_MODEL_KEYS),
+    keys: tuple[str, ...] = ("filename", "session_options", "pipeline", *_VARIANT_LEVEL_MODEL_KEYS),
 ) -> Any:
     """Recursively drop variant-specific keys from a genai_config-shaped dict.
 
     ``filename`` and ``session_options`` are intrinsically variant-specific and
     must not live in the package's base ``configs/genai_config.json``; per-variant
-    ``genai_config_overlay.json`` files patch them back in. The same logic
-    applies to per-variant model-level scalars listed in
-    ``_VARIANT_LEVEL_MODEL_KEYS`` (e.g. ``context_length`` differs between NPU
-    and GPU variants of the same model). Returns a deep copy.
+    ``genai_config_overlay.json`` files patch them back in. ``pipeline`` is
+    also stripped because GenAI's overlay parser appends arrays rather than
+    replacing them — a pipeline present in both base and overlay would
+    duplicate every stage on merge. The same logic applies to per-variant
+    model-level scalars listed in ``_VARIANT_LEVEL_MODEL_KEYS`` (e.g.
+    ``context_length`` differs between NPU and GPU variants of the same
+    model). Returns a deep copy.
     """
     if isinstance(node, dict):
         return {k: _strip_variant_specific(v, keys) for k, v in node.items() if k not in keys}
@@ -1056,6 +1211,162 @@ def _load_source_genai(source_path: Path) -> Optional[dict]:
         return None
 
 
+def _pick_primary_role(source_genai: Optional[dict]) -> Optional[str]:
+    """Pick the genai_config role that names the model's primary component.
+
+    A genai_config's ``model`` block keys mix per-role objects (``decoder``,
+    ``embedding``, ...) with model-level scalars (``vocab_size``,
+    ``context_length``, ...). The primary role is the first key whose value
+    is an object carrying either a ``filename`` (flat variant) or a
+    ``pipeline`` (multi-stage variant). Returns ``None`` when no such role
+    is found (e.g. genai_config missing or malformed).
+    """
+    if not isinstance(source_genai, dict):
+        return None
+    model_block = source_genai.get("model")
+    if not isinstance(model_block, dict):
+        return None
+    for role, body in model_block.items():
+        if not isinstance(body, dict):
+            continue
+        if "pipeline" in body or "filename" in body:
+            return role
+    return None
+
+
+def _resolve_pipeline_onnx_files(source_path: Path, source_genai: Optional[dict]) -> Optional[list[Path]]:
+    """Return the ordered list of stage ONNX paths if the source is a pipeline.
+
+    A genai pipeline is encoded as ``model.<role>.pipeline`` — a list whose
+    elements are single-key dicts mapping stage name to a body with a
+    ``filename``. Each stage's ONNX file lives directly under ``source_path``
+    (the standard layout for QNN-style multi-stage exports). Returns
+    ``None`` when the source has no pipeline (caller falls back to the
+    single-ONNX flow).
+    """
+    role = _pick_primary_role(source_genai)
+    if role is None:
+        return None
+    role_body = source_genai["model"][role]
+    pipeline = role_body.get("pipeline")
+    if not isinstance(pipeline, list) or not pipeline:
+        return None
+    files: list[Path] = []
+    for stage in pipeline:
+        if not isinstance(stage, dict):
+            continue
+        for stage_body in stage.values():
+            if not isinstance(stage_body, dict):
+                continue
+            filename = stage_body.get("filename")
+            if not isinstance(filename, str) or not filename:
+                continue
+            files.append(source_path / filename)
+    return files or None
+
+
+def _derive_ep_from_genai(source_genai: Optional[dict], role: Optional[str]) -> Optional[str]:
+    """Derive a canonical ORT EP name from a source's ``genai_config.json``.
+
+    Walks the role's ``session_options.provider_options`` (flat variant) and
+    every stage's ``session_options.provider_options`` (pipeline variant),
+    picking the first non-CPU alias and mapping it back through
+    ``_GENAI_TO_EP``. Returns ``None`` when nothing usable is found (caller
+    falls back to a variant-name heuristic).
+    """
+    if not isinstance(source_genai, dict) or role is None:
+        return None
+    model_block = source_genai.get("model")
+    if not isinstance(model_block, dict):
+        return None
+    role_body = model_block.get(role)
+    if not isinstance(role_body, dict):
+        return None
+
+    candidates: list[dict] = []
+    so = role_body.get("session_options")
+    if isinstance(so, dict):
+        candidates.append(so)
+    pipeline = role_body.get("pipeline")
+    if isinstance(pipeline, list):
+        for stage in pipeline:
+            if not isinstance(stage, dict):
+                continue
+            for stage_body in stage.values():
+                if isinstance(stage_body, dict):
+                    inner_so = stage_body.get("session_options")
+                    if isinstance(inner_so, dict):
+                        candidates.append(inner_so)
+
+    for so_block in candidates:
+        po = so_block.get("provider_options")
+        if not isinstance(po, list):
+            continue
+        for entry in po:
+            if not isinstance(entry, dict):
+                continue
+            for alias in entry:
+                ep = _GENAI_TO_EP.get(alias.lower())
+                if ep and ep != "CPUExecutionProvider":
+                    return ep
+    return None
+
+
+def _synthesize_model_config_from_source(source_path: Path, source_genai: Optional[dict]) -> dict:
+    """Build a minimal Olive-shaped ``model_config`` dict from a source dir.
+
+    Triggered when a source has no ``model_config.json`` (e.g. assets
+    downloaded directly from a model hub rather than emitted by an Olive
+    workflow). The synthesized config carries just enough for the rest of
+    the packager: a ``model_path`` (the source directory) and an
+    ``inference_settings`` derived from ``genai_config.json`` so the
+    per-variant overlay writer ends up with the right EP.
+
+    Pipeline-shaped sources (multiple ONNX stages declared under
+    ``model.<role>.pipeline``) are detected here too: when present we copy
+    the pipeline stages into ``model_attributes.onnx_files`` so the
+    builder can pass all stages downstream without re-reading the
+    genai_config.
+    """
+    role = _pick_primary_role(source_genai)
+    ep = (
+        _derive_ep_from_genai(source_genai, role)
+        or _guess_ep_from_variant_name(source_path.name)
+        or "CPUExecutionProvider"
+    )
+
+    inference_settings: dict[str, Any] = {"execution_provider": [ep], "provider_options": [{}]}
+    if isinstance(source_genai, dict) and role:
+        role_body = (source_genai.get("model") or {}).get(role)
+        if isinstance(role_body, dict):
+            so = role_body.get("session_options")
+            if isinstance(so, dict):
+                # Lift the role-level provider_options when present; for
+                # pipeline sources this is usually empty (per-stage options
+                # live inside ``pipeline[i].<stage>.session_options`` and the
+                # overlay writer lifts them verbatim), but for flat sources
+                # this is the one place that carries the EP-specific knobs.
+                po = so.get("provider_options")
+                if isinstance(po, list) and po:
+                    for entry in po:
+                        if isinstance(entry, dict):
+                            for alias, opts in entry.items():
+                                if _GENAI_TO_EP.get(alias.lower()) == ep and isinstance(opts, dict):
+                                    inference_settings["provider_options"] = [opts]
+                                    break
+
+    model_attributes: dict[str, Any] = {"task": "text-generation", "ep": ep}
+
+    return {
+        "type": "ONNXModel",
+        "config": {
+            "model_path": str(source_path),
+            "model_attributes": model_attributes,
+            "inference_settings": inference_settings,
+        },
+    }
+
+
 def _resolve_onnx_path(model_config: dict) -> Path:
     """Resolve the ONNX file path from an Olive model config.
 
@@ -1121,6 +1432,11 @@ def _ep_device_compatibility(
     ("rocm", "ROCMExecutionProvider"),
     ("dml", "DmlExecutionProvider"),
     ("directml", "DmlExecutionProvider"),
+    # Vendor-specific NPU hints come before the generic ``npu`` so a
+    # variant directory named e.g. ``vitia_npu`` resolves to VitisAI rather
+    # than the QNN fallback.
+    ("vitisai", "VitisAIExecutionProvider"),
+    ("vitia", "VitisAIExecutionProvider"),
     ("qnn", "QNNExecutionProvider"),
     ("npu", "QNNExecutionProvider"),
     ("openvino", "OpenVINOExecutionProvider"),
diff --git a/test/cli/test_model_package.py b/test/cli/test_model_package.py
index cad786e09..7c62d40f2 100644
--- a/test/cli/test_model_package.py
+++ b/test/cli/test_model_package.py
@@ -128,6 +128,12 @@ def test_accepts_single_source(self, tmp_path):
         assert sources == [("soc_60", src)]
 
     def test_rejects_missing_model_config(self, tmp_path):
+        """A source with NEITHER model_config.json nor genai_config.json is rejected.
+
+        A genai_config-only source is now accepted (covered separately by
+        ``TestPipelineAndSynthesis``); only the truly empty / non-source
+        directory should fail.
+        """
         no_config = tmp_path / "no_config"
         no_config.mkdir()
         valid = _create_source_dir(tmp_path, "valid", {"ep": "QNNExecutionProvider"})
@@ -1038,3 +1044,257 @@ def test_rejects_pytorch_model(self, tmp_path):
         # execute + assert
         with pytest.raises(ValueError, match="Unsupported source model type"):
             cmd.run()
+
+
+# ---------------------------------------------------------------------------
+# Pipeline sources (multi-stage exports, e.g. QNN) and model_config synthesis
+# ---------------------------------------------------------------------------
+
+
+def _create_pipeline_source(
+    tmp_path: Path,
+    name: str,
+    *,
+    stage_filenames: list[str],
+    stage_with_options: str,
+    provider_alias: str,
+    provider_options: dict,
+    extra_files: dict[str, str] | None = None,
+) -> Path:
+    """Build a fake GenAI-shaped multi-stage source dir (e.g. QNN pipeline).
+
+    The source has ONE genai_config.json + N real ONNX stage files and NO
+    model_config.json — exercising the synthesis path. ``stage_with_options``
+    is the only stage carrying provider_options (per QNN convention where
+    embedding / transformer-head run on CPU and only the prompt / iter
+    stages carry the HTP options).
+    """
+    source_dir = tmp_path / name
+    source_dir.mkdir(parents=True)
+    for fname in stage_filenames:
+        _make_onnx_inline(source_dir / fname)
+
+    pipeline_stages = []
+    stage_names = ["embedding", "prompt-processor", "token-generator", "transformer-head"][: len(stage_filenames)]
+    for stage_name, fname in zip(stage_names, stage_filenames):
+        body: dict = {"filename": fname, "inputs": [], "outputs": []}
+        if stage_name == stage_with_options:
+            body["session_options"] = {
+                "provider_options": [{provider_alias: provider_options}],
+            }
+        pipeline_stages.append({stage_name: body})
+
+    genai = {
+        "model": {
+            "type": "phi3-pipeline",
+            "context_length": 4096,
+            "pad_token_id": 199999,
+            "eos_token_id": [200020, 199999],
+            "bos_token_id": 199999,
+            "vocab_size": 200064,
+            "decoder": {
+                "head_size": 128,
+                "session_options": {"log_id": "onnxruntime-genai"},
+                "pipeline": pipeline_stages,
+            },
+        }
+    }
+    (source_dir / "genai_config.json").write_text(json.dumps(genai))
+
+    if extra_files:
+        for fname, content in extra_files.items():
+            (source_dir / fname).write_text(content)
+    return source_dir
+
+
+class TestPipelineAndSynthesis:
+    """Pipeline multi-stage sources + ``model_config.json`` synthesis."""
+
+    def test_accepts_source_without_model_config_when_genai_config_present(self, tmp_path):
+        """A source carrying only ``genai_config.json`` + ONNX files is accepted.
+
+        Useful for packaging GenAI-shaped exports downloaded from a hub: no
+        Olive workflow was used so no ``model_config.json`` exists, but
+        ``genai_config.json`` is enough for the packager to derive the EP,
+        component name, and per-variant overlay structure.
+        """
+        src = _create_pipeline_source(
+            tmp_path,
+            "qnn_npu",
+            stage_filenames=["embed.onnx", "ctx.onnx", "iter.onnx", "head.onnx"],
+            stage_with_options="prompt-processor",
+            provider_alias="qnn",
+            provider_options={"soc_model": "60"},
+        )
+        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(tmp_path / "out")])
+
+        sources = cmd._parse_sources()
+        assert sources == [("qnn_npu", src)]
+
+    def test_rejects_source_without_model_config_or_genai_config(self, tmp_path):
+        """A source with neither config file is rejected with a clear error."""
+        empty = tmp_path / "empty"
+        empty.mkdir()
+        _make_onnx_inline(empty / "model.onnx")
+        cmd = _make_command(["generate-model-package", "-s", str(empty), "-o", str(tmp_path / "out")])
+
+        with pytest.raises(ValueError, match=r"neither model_config\.json nor genai_config\.json"):
+            cmd._parse_sources()
+
+    def test_packs_pipeline_with_all_stage_onnx_files(self, tmp_path):
+        """All pipeline-stage ONNX files land in the variant directory.
+
+        The single-ONNX resolver would fail because the source has >1 ONNX;
+        the pipeline resolver enumerates stage filenames from the source
+        genai_config so every stage is copied next to the variant's
+        overlay.
+        """
+        stage_files = ["phi_embed.onnx", "phi_ctx.onnx", "phi_iter.onnx", "phi_head.onnx"]
+        src = _create_pipeline_source(
+            tmp_path,
+            "qnn_npu",
+            stage_filenames=stage_files,
+            stage_with_options="prompt-processor",
+            provider_alias="qnn",
+            provider_options={"soc_model": "60"},
+        )
+        out = tmp_path / "out"
+        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out), "--model_name", "phi-pipe"])
+
+        cmd.run()
+
+        variant_dir = out.with_suffix(".ortpackage") / "models" / "decoder" / "qnn_npu"
+        assert variant_dir.is_dir()
+        for fname in stage_files:
+            assert (variant_dir / fname).is_file(), f"missing stage file {fname}"
+
+    def test_pipeline_overlay_lifts_full_stage_structure_from_source(self, tmp_path):
+        """The variant overlay carries the pipeline list with per-stage options.
+
+        The producing toolchain decided per-stage EP knobs (soc_model,
+        htp_performance_mode, etc.); copying them verbatim avoids the
+        overlay writer having to re-derive each one and guarantees the
+        loader sees the exact same configuration the source intended.
+        """
+        src = _create_pipeline_source(
+            tmp_path,
+            "qnn_npu",
+            stage_filenames=["e.onnx", "c.onnx", "i.onnx", "h.onnx"],
+            stage_with_options="prompt-processor",
+            provider_alias="qnn",
+            provider_options={"htp_performance_mode": "burst", "soc_model": "60"},
+        )
+        out = tmp_path / "out"
+        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
+
+        cmd.run()
+
+        overlay_path = out.with_suffix(".ortpackage") / "models" / "decoder" / "qnn_npu" / "genai_config_overlay.json"
+        overlay = json.loads(overlay_path.read_text())
+        decoder = overlay["model"]["decoder"]
+        assert "pipeline" in decoder
+        stage_names = [next(iter(stage)) for stage in decoder["pipeline"]]
+        assert stage_names == ["embedding", "prompt-processor", "token-generator", "transformer-head"]
+        prompt_stage = decoder["pipeline"][1]["prompt-processor"]
+        assert prompt_stage["filename"] == "c.onnx"
+        assert prompt_stage["session_options"]["provider_options"] == [
+            {"qnn": {"htp_performance_mode": "burst", "soc_model": "60"}}
+        ]
+        # decoder-level session_options also lifted from source so log_id etc. survive.
+        assert decoder["session_options"]["log_id"] == "onnxruntime-genai"
+
+    def test_base_genai_strips_pipeline_field(self, tmp_path):
+        """``pipeline`` lives only in the overlay; base must not duplicate it.
+
+        GenAI's overlay parser appends arrays rather than replacing them
+        (``src/config.cpp:PipelineModelObject_Element``), so a ``pipeline``
+        in both base and overlay would double every stage. The strip is the
+        guard.
+        """
+        src = _create_pipeline_source(
+            tmp_path,
+            "qnn_npu",
+            stage_filenames=["e.onnx", "c.onnx", "i.onnx", "h.onnx"],
+            stage_with_options="prompt-processor",
+            provider_alias="qnn",
+            provider_options={"soc_model": "60"},
+        )
+        out = tmp_path / "out"
+        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
+
+        cmd.run()
+
+        base = json.loads((out.with_suffix(".ortpackage") / "configs" / "genai_config.json").read_text())
+        decoder = base["model"]["decoder"]
+        assert "pipeline" not in decoder, "base genai_config must not retain the pipeline array"
+
+    def test_flat_source_ep_derived_from_source_genai_when_attrs_missing(self, tmp_path):
+        """For flat sources, source genai's ``provider_options`` overrules name guess.
+
+        The source genai_config saying ``provider_options: [{"VitisAI": {}}]``
+        is the authoritative EP signal. The directory name (``vitia_npu``) is
+        only consulted by the name heuristic as a fallback, when the source
+        genai_config names no non-CPU provider.
+        """
+        source_dir = tmp_path / "vitia_npu"
+        source_dir.mkdir()
+        _make_onnx_inline(source_dir / "model.onnx")
+        (source_dir / "genai_config.json").write_text(
+            json.dumps(
+                {
+                    "model": {
+                        "type": "phi3",
+                        "vocab_size": 200064,
+                        "decoder": {
+                            "head_size": 128,
+                            "filename": "model.onnx",
+                            "session_options": {"provider_options": [{"VitisAI": {}}]},
+                        },
+                    }
+                }
+            )
+        )
+        out = tmp_path / "out"
+        cmd = _make_command(["generate-model-package", "-s", str(source_dir), "-o", str(out)])
+
+        cmd.run()
+
+        metadata = json.loads((out.with_suffix(".ortpackage") / "models" / "decoder" / "metadata.json").read_text())
+        assert metadata["variants"]["vitia_npu"]["ep"] == "VitisAIExecutionProvider"
+        overlay = json.loads(
+            (
+                out.with_suffix(".ortpackage") / "models" / "decoder" / "vitia_npu" / "genai_config_overlay.json"
+            ).read_text()
+        )
+        assert overlay["model"]["decoder"]["session_options"]["provider_options"] == [{"VitisAI": {}}]
+
+    def test_unreachable_model_path_is_repointed_to_source_dir(self, tmp_path):
+        """A stale ``model_path`` (e.g. copied from another machine) is repaired.
+
+        The original ``model_attributes`` are preserved (they remain valid
+        descriptors of the model itself); only the on-disk path is patched
+        so the local ONNX file is the one actually packaged.
+        """
+        source_dir = tmp_path / "stale"
+        source_dir.mkdir()
+        _make_onnx_inline(source_dir / "model.onnx")
+        (source_dir / "genai_config.json").write_text(
+            json.dumps({"model": {"vocab_size": 100, "decoder": {"filename": "model.onnx"}}})
+        )
+        (source_dir / "model_config.json").write_text(
+            json.dumps(
+                {
+                    "type": "ONNXModel",
+                    "config": {
+                        "model_path": "/nonexistent/elsewhere/model.onnx",
+                        "model_attributes": {"task": "text-generation", "vocab_size": 100},
+                    },
+                }
+            )
+        )
+        out = tmp_path / "out"
+        cmd = _make_command(["generate-model-package", "-s", str(source_dir), "-o", str(out)])
+
+        cmd.run()
+
+        assert (out.with_suffix(".ortpackage") / "models" / "decoder" / "stale" / "model.onnx").is_file()

From 100ffec9e28dc9d7a57f77a1444726d95a65a4d6 Mon Sep 17 00:00:00 2001
From: Xiaoyu <xiaoyuzhang@microsoft.com>
Date: Thu, 4 Jun 2026 22:54:27 +0000
Subject: [PATCH 11/11] Drop model_config.json and composite support from
 generate-model-package

Make `olive generate-model-package` purely genai_config-driven. The
source's `genai_config.json` is now the only declarative input the
packager reads: it names the role(s), their ONNX filename(s) (flat) or
pipeline stages (multi-stage), provider_options (and thus the ORT EP),
and the model-level scalars (context_length etc.) that legitimately
diverge per variant.

Behavior changes:

- `--source` now requires `genai_config.json`. `model_config.json` and
  the older ONNXModel/CompositeModel synthesis paths are removed; HF
  Hub task lookup is dropped too.
- Multi-role flat sources (e.g. VLMs with vision + embedding + decoder
  ONNXs in one dir) now have EVERY role's `filename` and
  `session_options` lifted into the per-variant
  `genai_config_overlay.json`, not just the primary role. Previously
  the loader would lose vision/embedding filenames at load time.
- Base `configs/genai_config.json` injects `component=<comp>` markers
  for every role found in any variant's source genai_config.
- `device` is no longer emitted in variant metadata (it was sourced
  from `model_attributes` which is no longer read).

Test changes:

- `_create_source_dir` now writes a minimal genai_config.json (no
  model_config.json) parameterised by EP.
- Removed TestMixedSourceTypes / TestCompositeBuild /
  TestUnsupportedModelType (composite + model_config-only behaviors).
- Added TestVLMMultiRoleOverlay covering the multi-role overlay
  restoration + per-role component-marker injection.
- TestPipelineSources tightened: error message updated, the stale
  model_config-repointing test dropped.

Validated end-to-end by repacking 7 Phi-4-mini-reasoning variants (CPU,
CUDA, OpenVINO GPU/NPU, QNN NPU, VitisAI NPU, WebGPU) and 2
qwen3-vl-2b-instruct variants (CPU, CUDA) from genai_config-only
sources. QNN pipeline retains all 4 stages and .bin context binaries;
each EP overlay carries its correct provider_options; qwen3-vl overlays
carry vision + embedding + decoder filenames.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 olive/cli/model_package.py     | 569 ++++++++++-----------------------
 test/cli/test_model_package.py | 390 ++++++++++------------
 2 files changed, 348 insertions(+), 611 deletions(-)

diff --git a/olive/cli/model_package.py b/olive/cli/model_package.py
index 3c111551a..cb71b62ec 100644
--- a/olive/cli/model_package.py
+++ b/olive/cli/model_package.py
@@ -186,30 +186,18 @@ def run(self):
             output_dir = output_dir.with_name(output_dir.name + _PACKAGE_SUFFIX)
         package_default_name = output_dir.stem
 
-        targets = []
+        targets: list[tuple[str, Path, dict]] = []
         for target_name, source_path in sources:
-            model_config = self._read_model_config(source_path)
-            targets.append((target_name, source_path, model_config))
-
-        types = {(targets[i][2].get("type") or "").lower() for i in range(len(targets))}
-        supported = {"onnxmodel", "compositemodel"}
-        if types - supported:
-            unsupported = sorted(types - supported)
-            raise ValueError(
-                f"Unsupported source model type(s) {unsupported!r}. "
-                "generate-model-package supports ONNXModel and CompositeModel only."
-            )
-        if len(types) > 1:
-            raise ValueError(
-                f"Sources mix model types {sorted(types)!r}. All sources must share the same type "
-                "(all ONNXModel or all CompositeModel)."
-            )
-        is_composite = next(iter(types)) == "compositemodel"
+            source_genai = _load_source_genai(source_path)
+            if not isinstance(source_genai, dict):
+                raise ValueError(
+                    f"Source {source_path} has an unreadable genai_config.json. "
+                    "The packager is genai_config-driven; the file must be valid JSON describing "
+                    "the model layout (role filenames, session_options, etc.)."
+                )
+            targets.append((target_name, source_path, source_genai))
 
-        if is_composite:
-            variants = self._build_composite_variants(targets)
-        else:
-            variants = self._build_single_variants(targets)
+        variants = self._build_variants(targets)
 
         config_files = self._collect_config_files(targets)
 
@@ -242,111 +230,59 @@ def run(self):
     # VariantSpec construction
     # ------------------------------------------------------------------
 
-    def _build_single_variants(self, targets: list[tuple[str, Path, dict]]) -> list["VariantSpec"]:
+    def _build_variants(self, targets: list[tuple[str, Path, dict]]) -> list["VariantSpec"]:
         task = self._extract_task(targets)
         component_name = _task_to_component_name(task)
         variants: list[VariantSpec] = []
-        for target_name, source_path, model_config in targets:
-            attrs = _get_model_attributes(model_config)
-            source_genai = _load_source_genai(source_path)
+        for target_name, source_path, source_genai in targets:
             # Pipeline sources (e.g. QNN multi-stage exports) declare each
             # stage's ONNX inside ``genai_config.model.<role>.pipeline``;
             # there can be multiple ``.onnx`` files in the source directory
-            # so the single-ONNX resolver would fail. We list them all here
-            # and let the writer's copy loop fan them out into the variant
-            # directory; the overlay writer lifts the pipeline structure
-            # verbatim from the source genai_config.
+            # so we list them all here and let the writer's copy loop fan
+            # them out into the variant directory. The overlay writer lifts
+            # the pipeline structure verbatim from the source genai_config.
+            primary_role = _pick_primary_role(source_genai)
             pipeline_files = _resolve_pipeline_onnx_files(source_path, source_genai)
             if pipeline_files:
                 onnx_files = pipeline_files
-                ep = (
-                    attrs.get("ep")
-                    or _derive_ep_from_genai(source_genai, _pick_primary_role(source_genai))
-                    or (model_config.get("config", {}).get("inference_settings") or {}).get(
-                        "execution_provider", [None]
-                    )[0]
-                    or _guess_ep_from_variant_name(target_name)
-                    or "CPUExecutionProvider"
-                )
-                device = attrs.get("device") or None
-                compatibility_string = None
             else:
-                onnx_files = [_resolve_onnx_path(model_config)]
-                # When the source has its own genai_config and ``attrs.ep``
-                # is absent, the genai's ``provider_options`` are a stronger
-                # signal than the variant-directory-name heuristic — e.g. a
-                # directory named ``vitia_npu`` should be VitisAI rather
-                # than QNN (``npu`` hint wins by accident).
-                attrs_for_compat = dict(attrs)
-                if not attrs_for_compat.get("ep"):
-                    derived = _derive_ep_from_genai(source_genai, _pick_primary_role(source_genai))
-                    if derived:
-                        attrs_for_compat["ep"] = derived
-                ep, device, compatibility_string = _ep_device_compatibility(
-                    attrs_for_compat, onnx_files[0], target_name
-                )
+                # Flat single-ONNX source: take the primary role's filename
+                # from genai_config (the same field the GenAI loader uses
+                # at runtime). Normalise to basename — some Olive exports
+                # write paths like ``decoder/model.onnx`` but every variant
+                # directory in the package is flat.
+                if primary_role is None:
+                    raise ValueError(
+                        f"Source {source_path} has no role in genai_config.json with a "
+                        "``filename`` or ``pipeline``; cannot determine which ONNX file(s) "
+                        "to package."
+                    )
+                role_body = source_genai["model"][primary_role]
+                filename = role_body.get("filename")
+                if not isinstance(filename, str) or not filename:
+                    raise ValueError(f"Source {source_path} role {primary_role!r} has no ``filename``.")
+                onnx_files = [source_path / Path(filename).name]
+
+            ep = (
+                _derive_ep_from_genai(source_genai, primary_role)
+                or _guess_ep_from_variant_name(target_name)
+                or "CPUExecutionProvider"
+            )
+            raw_compat = _extract_ep_compatibility_from_onnx(onnx_files[0], ep) if onnx_files else None
+            compatibility_string = raw_compat.strip() if raw_compat and raw_compat.strip() else None
+
             variants.append(
                 VariantSpec(
                     component_name=component_name,
                     variant_name=target_name,
                     onnx_files=onnx_files,
                     ep=ep,
-                    device=device,
                     compatibility_string=compatibility_string,
-                    inference_settings=model_config.get("config", {}).get("inference_settings") or {},
                     source_genai=source_genai,
                 )
             )
         return variants
 
-    def _build_composite_variants(self, targets: list[tuple[str, Path, dict]]) -> list["VariantSpec"]:
-        from collections import OrderedDict
-
-        # Track per-component variants in source insertion order.
-        component_variants: dict[str, list[VariantSpec]] = OrderedDict()
-
-        for target_name, source_path, model_config in targets:
-            target_attrs = _get_model_attributes(model_config)
-            target_inference = model_config.get("config", {}).get("inference_settings") or {}
-            target_genai = _load_source_genai(source_path)
-            components = model_config["config"].get("model_components", [])
-            component_names = model_config["config"].get("model_component_names", [])
-
-            if not components:
-                raise ValueError(f"Composite source {target_name!r} declares no model_components.")
-            if len(components) != len(component_names):
-                raise ValueError(
-                    f"Composite source {target_name!r} has {len(components)} model_components but "
-                    f"{len(component_names)} model_component_names; counts must match."
-                )
-
-            for comp_config, comp_name in zip(components, component_names):
-                # Component-level inference_settings overrides target-level if present.
-                comp_inference = comp_config.get("config", {}).get("inference_settings") or target_inference
-                # Component-level model_attributes overlay target-level.
-                comp_attrs = dict(target_attrs)
-                comp_attrs.update(_get_model_attributes(comp_config))
-
-                onnx_path = _resolve_onnx_path(comp_config)
-                ep, device, compatibility_string = _ep_device_compatibility(comp_attrs, onnx_path, target_name)
-
-                spec = VariantSpec(
-                    component_name=comp_name,
-                    variant_name=target_name,
-                    onnx_files=[onnx_path],
-                    ep=ep,
-                    device=device,
-                    compatibility_string=compatibility_string,
-                    inference_settings=comp_inference,
-                    source_genai=target_genai,
-                )
-                component_variants.setdefault(comp_name, []).append(spec)
-
-        flat: list[VariantSpec] = []
-        for comp_specs in component_variants.values():
-            flat.extend(comp_specs)
-        return flat
-
     # ------------------------------------------------------------------
     # Config file handling
     # ------------------------------------------------------------------
@@ -355,31 +291,18 @@ def _build_composite_variants(self, targets: list[tuple[str, Path, dict]]) -> li
     def _collect_config_files(targets: list[tuple[str, Path, dict]]) -> dict[str, Path]:
         """Pick consumer-shared config files (genai_config, tokenizer, ...).
 
-        Source-of-truth order:
-        1. ``model_attributes.additional_files`` of any source that has it.
-        2. Otherwise, the first source's non-model files.
+        Sweeps the first source's directory for any non-ONNX/binary files
+        (tokenizer assets, genai_config.json, chat_template, processor_config,
+        etc.). Subsequent sources don't add files — the package emits one
+        shared base config set.
         """
         config_entries: dict[str, Path] = {}
-
-        for _target_name, _source_path, model_config in targets:
-            attrs = _get_model_attributes(model_config)
-            for fp in attrs.get("additional_files", []):
-                p = Path(fp)
-                if (p.is_file() or p.is_dir()) and p.name not in config_entries:
-                    config_entries[p.name] = p
+        for _target_name, source_path, _source_genai in targets:
+            for f in sorted(source_path.iterdir()):
+                if (f.is_file() and f.suffix not in _MODEL_SUFFIXES) or f.is_dir():
+                    config_entries[f.name] = f
             if config_entries:
                 break
-
-        if not config_entries:
-            for _target_name, source_path, _model_config in targets:
-                for f in sorted(source_path.iterdir()):
-                    if f.name == "model_config.json":
-                        continue
-                    if (f.is_file() and f.suffix not in _MODEL_SUFFIXES) or f.is_dir():
-                        config_entries[f.name] = f
-                if config_entries:
-                    break
-
         return config_entries
 
     # ------------------------------------------------------------------
@@ -393,14 +316,15 @@ def _parse_sources(self) -> list[tuple[str, Path]]:
             path = Path(source)
             if not path.is_dir():
                 raise ValueError(f"Source path does not exist or is not a directory: {path}")
-            # Either an Olive-emitted ``model_config.json`` or a stand-alone
-            # GenAI export (which carries its own ``genai_config.json``) is
-            # acceptable; the latter is synthesized into a model_config below.
-            if not (path / "model_config.json").is_file() and not (path / "genai_config.json").is_file():
+            # ``genai_config.json`` is the single source of truth: it tells
+            # us which ONNX files are roles vs pipeline stages, the EP for
+            # each role (via session_options.provider_options), and the
+            # variant-specific model scalars to lift into the overlay.
+            if not (path / "genai_config.json").is_file():
                 raise ValueError(
-                    f"Source {path} has neither model_config.json nor genai_config.json. "
-                    "Provide an Olive output directory or a GenAI-shaped source "
-                    "containing genai_config.json plus its ONNX files."
+                    f"Source {path} has no genai_config.json. Each source must be a "
+                    "GenAI-shaped directory containing genai_config.json plus the ONNX "
+                    "file(s) it references."
                 )
             name = path.name
             if name in seen_names:
@@ -414,95 +338,29 @@ def _parse_sources(self) -> list[tuple[str, Path]]:
             raise ValueError("At least one --source directory is required.")
         return sources
 
-    @staticmethod
-    def _read_model_config(source_path: Path) -> dict:
-        """Return the source's ``model_config.json``, synthesizing one if absent.
-
-        Olive-emitted directories ship a ``model_config.json``; GenAI-shaped
-        directories (e.g. ones downloaded straight from a model hub) only
-        ship ``genai_config.json``. For the latter we derive enough of an
-        Olive-shaped config from the genai file and a directory scan so the
-        rest of the packager can stay single-codepath.
-
-        An existing ``model_config.json`` whose ``model_path`` no longer
-        resolves on this machine (a common state when artifacts are copied
-        between hosts) is repaired in-memory: ``model_path`` is rewritten to
-        point at ``source_path`` so the local ONNX is used. The original
-        ``model_attributes`` (vocab_size, num_hidden_layers, ...) are kept
-        because they remain valid descriptors of the model itself.
-        """
-        config_path = source_path / "model_config.json"
-        if config_path.is_file():
-            with config_path.open() as f:
-                model_config = json.load(f)
-            cfg = model_config.get("config") or {}
-            raw_path = cfg.get("model_path")
-            if raw_path and not Path(raw_path).exists():
-                logger.info(
-                    "model_config.json in %s references unreachable model_path %r; repointing to the source directory.",
-                    source_path,
-                    raw_path,
-                )
-                cfg["model_path"] = str(source_path)
-                model_config["config"] = cfg
-            return model_config
-        return _synthesize_model_config_from_source(source_path, _load_source_genai(source_path))
-
     # ------------------------------------------------------------------
     # Task extraction
     # ------------------------------------------------------------------
 
     @staticmethod
     def _extract_task(targets: list[tuple[str, Path, dict]]) -> str:
-        # An explicit ``model_attributes.task`` takes precedence: an Olive
-        # workflow records it from its own config, and a synthesized
-        # model_config (for sources lacking model_config.json) carries it
-        # directly. Falling through to the HuggingFace Hub lookup is only
-        # needed for older workflows that recorded ``_name_or_path`` but no
-        # task. Returns the underscore-normalised form (``text_generation``)
-        # so ``_task_to_component_name`` can resolve a component name.
-        for _target_name, _source_path, model_config in targets:
-            attrs = _get_model_attributes(model_config)
-            task = attrs.get("task")
-            if isinstance(task, str) and task:
-                return task.replace("-", "_")
-
-        model_name_or_path = ""
-        for _target_name, _source_path, model_config in targets:
-            attrs = _get_model_attributes(model_config)
-            model_name_or_path = attrs.get("_name_or_path", "")
-            if model_name_or_path:
-                break
-
-        if model_name_or_path:
-            try:
-                from huggingface_hub import model_info
-
-                info = model_info(model_name_or_path)
-                tag = info.pipeline_tag or ""
-                return tag.replace("-", "_")
-            except Exception:
-                logger.debug("Could not fetch task from HuggingFace Hub for %s", model_name_or_path, exc_info=True)
-
-        # Last-ditch: peek at each source's genai_config.json. GenAI roles
-        # map cleanly to tasks (``decoder`` → text generation, etc.), and a
-        # source without an Olive model_config typically still ships this
-        # file. Keeps the component directory name consistent
-        # (``models/decoder/...``) instead of falling through to the
-        # generic ``models/model/...``.
-        for _target_name, source_path, _model_config in targets:
-            source_genai = _load_source_genai(source_path)
+        # Inspect each source's genai_config.json roles to infer the task.
+        # GenAI roles map cleanly to tasks (``decoder`` → text generation;
+        # both ``encoder`` and ``decoder`` → text2text generation), and the
+        # task in turn names the component directory under ``models/``.
+        # Returns the underscore-normalised form (``text_generation``) so
+        # ``_task_to_component_name`` can resolve a component name.
+        for _target_name, _source_path, source_genai in targets:
             if not isinstance(source_genai, dict):
                 continue
             model_block = source_genai.get("model")
             if not isinstance(model_block, dict):
                 continue
             roles = {k for k, v in model_block.items() if isinstance(v, dict)}
-            if "decoder" in roles:
-                return "text_generation"
             if "encoder" in roles and "decoder" in roles:
                 return "text2text_generation"
-
+            if "decoder" in roles:
+                return "text_generation"
         return ""
 
 
@@ -597,9 +455,25 @@ def write_model_package(
     # without those markers ORT-GenAI's variant auto-selection fails with
     # "the genai config does not reference any package components".
     role_to_component: dict[str, str] = {}
+    # Seed from each variant's source genai_config: every role that appears
+    # under ``model.<role>`` (vision, embedding, decoder, ...) gets mapped
+    # to that variant's component_name. Multi-role sources (e.g. VLMs)
+    # share a single component dir, so all of their roles end up pointing
+    # at the same component — which is what the loader needs to find each
+    # role's ONNX file inside the package.
+    for v in variants:
+        src_genai = getattr(v, "source_genai", None) or {}
+        model_block = src_genai.get("model") if isinstance(src_genai, dict) else None
+        if isinstance(model_block, dict):
+            for role_name, role_body in model_block.items():
+                if isinstance(role_body, dict):
+                    role_to_component.setdefault(role_name, v.component_name)
+    # Fallback for components whose variants carried no usable source_genai:
+    # map the component name to itself as the role, matching the legacy
+    # writer behaviour for direct ``write_model_package`` callers.
     for comp_name in components:
-        role = component_to_role.get(comp_name, comp_name)
-        role_to_component.setdefault(role, comp_name)
+        explicit_role = component_to_role.get(comp_name, comp_name)
+        role_to_component.setdefault(explicit_role, comp_name)
 
     if config_files:
         _copy_config_files(output_dir, config_files, role_to_component)
@@ -719,26 +593,50 @@ def _genai_provider_name(ep: str) -> str:
 def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: VariantSpec) -> None:
     """Emit a per-variant ``genai_config_overlay.json`` (RFC 7386 merge patch).
 
-    Per-variant runtime fields flow through a JSON Merge Patch applied on top of
-    the package's base ``configs/genai_config.json``. We express the variant's
-    ``filename`` (the variant-local ONNX file basename), ``session_options`` and
-    EP-scoped ``provider_options`` under the role that references this component
-    (``model.<role>``). The base config has those keys stripped (see
-    ``_strip_variant_specific``); each variant overlay puts them back so ORT
-    resolves files inside the chosen variant directory.
-
-    Pipeline variants (multi-stage exports, e.g. QNN) are handled by lifting
-    the source genai_config's ``model.<role>`` block verbatim — pipeline
-    stage filenames are inside the array entries, and per-stage
-    ``session_options.provider_options`` already carry the right EP-scoped
-    options. The base has ``pipeline`` stripped because GenAI's overlay
-    parser would otherwise append the variant's array onto the base's,
-    producing duplicated stages.
+    Per-variant runtime fields flow through a JSON Merge Patch applied on top
+    of the package's base ``configs/genai_config.json``. The base has every
+    role's ``filename`` / ``session_options`` / ``pipeline`` stripped (see
+    ``_strip_variant_specific``); this overlay restores them.
+
+    When the variant carries its source ``genai_config.json`` (the default
+    CLI path) we lift every role's per-variant body verbatim — including
+    non-primary roles (e.g. a VLM source has ``vision``/``embedding``/
+    ``decoder`` roles, each with its own filename and session_options).
+    Without this multi-role lift the loader would lose all but the primary
+    role's filename and the package wouldn't load.
+
+    Pipeline-shaped roles (multi-stage exports, e.g. QNN) are covered by the
+    same lift: ``pipeline`` is in the strip set so the base loses it, the
+    overlay restores it (with per-stage ``filename`` + per-stage
+    ``session_options.provider_options`` preserved as-is). The strip is
+    required because GenAI's overlay parser appends arrays rather than
+    replacing them — a pipeline in both base and overlay would duplicate
+    every stage.
+
+    Direct ``write_model_package`` callers that don't pass ``source_genai``
+    fall back to the legacy ``inference_settings``-driven shape so existing
+    programmatic tests keep working.
     """
-    pipeline_role_body = _pipeline_role_body_for_overlay(v.source_genai, component_role)
-    if pipeline_role_body is not None:
-        role_patch: dict[str, Any] = pipeline_role_body
+    src_genai = v.source_genai or {}
+    src_model = src_genai.get("model") if isinstance(src_genai, dict) else None
+
+    model_patch: dict[str, Any] = {}
+
+    if isinstance(src_model, dict):
+        # New path: lift every role's per-variant fields. Each role in the
+        # source genai_config (vision, embedding, decoder, ...) gets its
+        # filename + session_options + pipeline copied into the overlay
+        # under the same role name.
+        for role_name, role_body in src_model.items():
+            if not isinstance(role_body, dict):
+                continue
+            role_patch = _lift_role_overlay_body(role_body)
+            if role_patch:
+                model_patch[role_name] = role_patch
     else:
+        # Legacy path: callers that don't pass source_genai (writer-only
+        # tests) construct a single-role overlay from VariantSpec's
+        # inference_settings.
         inference = v.inference_settings or {}
         session_options: dict[str, Any] = dict(inference.get("session_options") or {})
         provider_options = _provider_options_for_ep(inference, v.ep)
@@ -746,83 +644,70 @@ def _write_genai_config_overlay(variant_dir: Path, component_role: str, v: Varia
 
         # ORT-GenAI's FinalizeConfig builds session_options.providers from
         # provider_options[*].name (src/config.cpp:1643-1645), and
-        # SetProviderSessionOptions then registers each named provider. CPU is not
-        # in the dispatch table (src/models/session_options.cpp:150-159); it has no
-        # configurable options, and ORT InferenceSession adds it implicitly when no
-        # other EP is registered (onnxruntime/core/session/inference_session.cc:
-        # SetCpuProviderWasImplicitlyAdded). For CPU variants we therefore emit an
-        # empty list rather than a sentinel ``[{"CPU": {}}]`` entry. For every
-        # other EP we name it explicitly (NormalizeProviderName canonicalises the
-        # case for QNN/DML/OpenVINO/etc., and "cuda" is already lowercase in the
-        # dispatch table). This matches the convention used by reference ORT model
-        # packages and avoids registering CPU through the V1 no-op path.
+        # SetProviderSessionOptions then registers each named provider. CPU
+        # is not in the dispatch table (src/models/session_options.cpp:
+        # 150-159); it has no configurable options, and ORT InferenceSession
+        # adds it implicitly when no other EP is registered. We therefore
+        # emit ``provider_options: []`` for CPU variants and an explicit
+        # named entry for every other EP.
         if genai_ep == "CPU":
             session_options["provider_options"] = []
         else:
             session_options["provider_options"] = [{genai_ep: provider_options}]
 
-        role_patch = {"session_options": session_options}
+        legacy_patch: dict[str, Any] = {"session_options": session_options}
         if v.onnx_files:
-            # The base config strips ``filename`` (it was a variant-specific path
-            # like ``decoder/model.onnx``); the loader resolves the variant ONNX as
-            # ``<variant_dir>/<filename>``, so emit the basename here.
-            role_patch["filename"] = Path(v.onnx_files[0]).name
+            # The base strips ``filename``; the loader resolves the variant
+            # ONNX as ``<variant_dir>/<filename>``, so emit the basename.
+            legacy_patch["filename"] = Path(v.onnx_files[0]).name
+        model_patch[component_role] = legacy_patch
 
-    model_patch: dict[str, Any] = {component_role: role_patch}
     # Lift per-variant model-level scalars from the variant's own
-    # genai_config.json. The base config strips these (see
-    # ``_strip_variant_specific``) because they legitimately differ across
-    # variants (e.g. NPU runtime caps ``context_length`` at 4224 while CPU/CUDA
-    # use the full 131072; pad_token_id can differ when one exporter uses the
-    # EOS as PAD and another uses the sentinel). Without this lift the merged
-    # config would silently use whichever variant happened to win the base
+    # genai_config.json. The base config strips these because they
+    # legitimately differ across variants (e.g. NPU runtime caps
+    # ``context_length`` at 4224 while CPU/CUDA use the full 131072;
+    # pad_token_id can differ when one exporter uses the EOS as PAD and
+    # another uses the sentinel). Without this lift the merged config
+    # would silently use whichever variant happened to win the base
     # selection.
-    src_genai = v.source_genai or {}
-    src_model = src_genai.get("model") if isinstance(src_genai, dict) else None
     if isinstance(src_model, dict):
         for k in _VARIANT_LEVEL_MODEL_KEYS:
             if k in src_model:
-                # Deep-copy via JSON round-trip so we never share refs with the
-                # caller's dict; arrays in particular must be independent
-                # because GenAI's overlay parser treats arrays as append-merge.
+                # Deep-copy via JSON round-trip so we never share refs with
+                # the caller's dict; arrays in particular must be
+                # independent because GenAI's overlay parser treats arrays
+                # as append-merge.
                 model_patch[k] = json.loads(json.dumps(src_model[k]))
 
     overlay = {"model": model_patch}
     _write_json(variant_dir / "genai_config_overlay.json", overlay)
 
 
-def _pipeline_role_body_for_overlay(source_genai: Optional[dict], component_role: str) -> Optional[dict]:
-    """If ``source_genai`` declares a pipeline for ``component_role``, return its overlay body.
+def _lift_role_overlay_body(role_body: dict) -> dict:
+    """Lift the per-variant fields from a single source genai_config role body.
 
-    Returns a deep-copied ``{"pipeline": [...], "session_options": {...}}``
-    dict ready to drop under ``model.<role>`` in the overlay. The source's
-    per-stage ``filename`` + ``session_options.provider_options`` are
-    preserved verbatim (each stage's EP options were validated by the
-    producing toolchain and copying them as-is avoids the EP-canonicalisation
-    bookkeeping the flat-variant path otherwise has to do). Returns
-    ``None`` when no pipeline applies — caller falls back to the flat
-    overlay shape.
-    """
-    if not isinstance(source_genai, dict):
-        return None
-    model_block = source_genai.get("model")
-    if not isinstance(model_block, dict):
-        return None
-    role_body = model_block.get(component_role)
-    if not isinstance(role_body, dict):
-        return None
-    pipeline = role_body.get("pipeline")
-    if not isinstance(pipeline, list) or not pipeline:
-        return None
+    Each role body may carry ``filename`` (flat-variant primary file),
+    ``pipeline`` (multi-stage), and ``session_options`` (provider_options +
+    EP knobs). All three are stripped from the base genai_config; this
+    helper recovers them as the role's overlay patch.
 
-    patch: dict[str, Any] = {"pipeline": json.loads(json.dumps(pipeline))}
-    # The role-level ``session_options`` carries the decoder's
-    # ``intra_op_num_threads`` / ``log_id`` / etc.; per-stage session_options
-    # nested inside ``pipeline`` are independent and already covered by the
-    # deep copy above.
+    Filename values are normalised to basename — some Olive exporters write
+    paths like ``decoder/model.onnx`` but every variant directory in the
+    package is flat, so any path prefix the source carried would mis-route
+    the loader. Pipeline and session_options are deep-copied verbatim to
+    preserve the producing toolchain's per-stage EP knobs and avoid
+    aliasing with the caller's dict.
+    """
+    patch: dict[str, Any] = {}
+    filename = role_body.get("filename")
+    if isinstance(filename, str) and filename:
+        patch["filename"] = Path(filename).name
     so = role_body.get("session_options")
     if isinstance(so, dict):
         patch["session_options"] = json.loads(json.dumps(so))
+    pipeline = role_body.get("pipeline")
+    if isinstance(pipeline, list) and pipeline:
+        patch["pipeline"] = json.loads(json.dumps(pipeline))
     return patch
 
 
@@ -1183,14 +1068,10 @@ def disambiguate_variant_names(candidates: list[tuple[str, str]]) -> list[str]:
 
 
 # ---------------------------------------------------------------------------
-# Olive model-config helpers
+# genai_config helpers
 # ---------------------------------------------------------------------------
 
 
-def _get_model_attributes(model_config: dict) -> dict:
-    return model_config.get("config", {}).get("model_attributes") or {}
-
-
 def _load_source_genai(source_path: Path) -> Optional[dict]:
     """Return the parsed ``<source>/genai_config.json`` if present.
 
@@ -1312,118 +1193,10 @@ def _derive_ep_from_genai(source_genai: Optional[dict], role: Optional[str]) ->
     return None
 
 
-def _synthesize_model_config_from_source(source_path: Path, source_genai: Optional[dict]) -> dict:
-    """Build a minimal Olive-shaped ``model_config`` dict from a source dir.
-
-    Triggered when a source has no ``model_config.json`` (e.g. assets
-    downloaded directly from a model hub rather than emitted by an Olive
-    workflow). The synthesized config carries just enough for the rest of
-    the packager: a ``model_path`` (the source directory) and an
-    ``inference_settings`` derived from ``genai_config.json`` so the
-    per-variant overlay writer ends up with the right EP.
-
-    Pipeline-shaped sources (multiple ONNX stages declared under
-    ``model.<role>.pipeline``) are detected here too: when present we copy
-    the pipeline stages into ``model_attributes.onnx_files`` so the
-    builder can pass all stages downstream without re-reading the
-    genai_config.
-    """
-    role = _pick_primary_role(source_genai)
-    ep = (
-        _derive_ep_from_genai(source_genai, role)
-        or _guess_ep_from_variant_name(source_path.name)
-        or "CPUExecutionProvider"
-    )
-
-    inference_settings: dict[str, Any] = {"execution_provider": [ep], "provider_options": [{}]}
-    if isinstance(source_genai, dict) and role:
-        role_body = (source_genai.get("model") or {}).get(role)
-        if isinstance(role_body, dict):
-            so = role_body.get("session_options")
-            if isinstance(so, dict):
-                # Lift the role-level provider_options when present; for
-                # pipeline sources this is usually empty (per-stage options
-                # live inside ``pipeline[i].<stage>.session_options`` and the
-                # overlay writer lifts them verbatim), but for flat sources
-                # this is the one place that carries the EP-specific knobs.
-                po = so.get("provider_options")
-                if isinstance(po, list) and po:
-                    for entry in po:
-                        if isinstance(entry, dict):
-                            for alias, opts in entry.items():
-                                if _GENAI_TO_EP.get(alias.lower()) == ep and isinstance(opts, dict):
-                                    inference_settings["provider_options"] = [opts]
-                                    break
-
-    model_attributes: dict[str, Any] = {"task": "text-generation", "ep": ep}
-
-    return {
-        "type": "ONNXModel",
-        "config": {
-            "model_path": str(source_path),
-            "model_attributes": model_attributes,
-            "inference_settings": inference_settings,
-        },
-    }
-
-
-def _resolve_onnx_path(model_config: dict) -> Path:
-    """Resolve the ONNX file path from an Olive model config.
-
-    The config's ``model_path`` may be either:
-    - the ONNX file itself (a ``LocalFile`` resource),
-    - a directory containing the ONNX file (a ``LocalFolder`` resource),
-      in which case ``onnx_file_name`` (or a single ``.onnx`` in the dir)
-      identifies the actual file.
-    """
-    cfg = model_config.get("config", {}) or {}
-    raw = cfg.get("model_path")
-    if not raw:
-        raise ValueError("Model config has no model_path.")
-    p = Path(raw)
-    if p.is_file():
-        return p
-    if p.is_dir():
-        onnx_name = cfg.get("onnx_file_name")
-        if onnx_name:
-            candidate = p / onnx_name
-            if candidate.is_file():
-                return candidate
-        onnx_files = list(p.glob("*.onnx"))
-        if len(onnx_files) == 1:
-            return onnx_files[0]
-        raise ValueError(
-            f"Cannot resolve a unique ONNX file under {p}; "
-            "set onnx_file_name in the model config or pass the file path directly."
-        )
-    raise FileNotFoundError(f"model_path does not exist: {p}")
-
-
-def _ep_device_compatibility(
-    attrs: dict, onnx_path: Path, variant_name: Optional[str] = None
-) -> tuple[str, Optional[str], Optional[str]]:
-    """Extract (ep, device, compatibility_string) for one variant from Olive metadata.
-
-    Each variant declares a single opaque ``compatibility_string``. Olive stores
-    the EP-side preference as a comma-delimited string in the ONNX metadata prop
-    ``ep_compatibility_info.<EP>``; it is passed through verbatim (ORT does not
-    interpret the encoding).
-
-    When ``model_attributes.ep`` is absent, fall back to a common-variant-name
-    heuristic (``gpu``/``cuda`` → CUDA, ``qnn`` → QNN, etc.) so users who don't
-    manually annotate their Olive outputs still get distinct EP entries in each
-    component's metadata.json. Final fallback is CPU.
-    """
-    ep = attrs.get("ep") or _guess_ep_from_variant_name(variant_name) or "CPUExecutionProvider"
-    device = attrs.get("device") or None
-    raw = _extract_ep_compatibility_from_onnx(onnx_path, ep)
-    compatibility_string = raw.strip() if raw and raw.strip() else None
-    return ep, device, compatibility_string
-
-
 # Best-effort mapping from common Olive output / EP-build directory names to
-# canonical ORT EP strings. Used only as a fallback when model_attributes.ep is
-# not set. Keep substrings short and lowercased; matched via ``in``.
+# canonical ORT EP strings. Used only as a fallback when neither
+# ``genai_config.json``'s ``provider_options`` nor any explicit metadata
+# names the EP. Keep substrings short and lowercased; matched via ``in``.
 _VARIANT_NAME_EP_HINTS: tuple[tuple[str, str], ...] = (
     ("cuda", "CUDAExecutionProvider"),
     ("gpu", "CUDAExecutionProvider"),
diff --git a/test/cli/test_model_package.py b/test/cli/test_model_package.py
index 7c62d40f2..be568bbba 100644
--- a/test/cli/test_model_package.py
+++ b/test/cli/test_model_package.py
@@ -86,21 +86,54 @@ def _make_onnx_with_external(
 def _create_source_dir(
     tmp_path: Path,
     name: str,
-    model_attributes: dict,
     *,
+    ep: str = "CPUExecutionProvider",
     onnx_metadata: dict[str, str] | None = None,
-    inference_settings: dict | None = None,
+    filename: str = "model.onnx",
+    provider_options: dict | None = None,
+    session_options_extras: dict | None = None,
+    role: str = "decoder",
 ) -> Path:
-    """Create a fake Olive output directory with model_config.json and a real ONNX file."""
+    """Create a fake GenAI-shaped source directory.
+
+    Writes a minimal ``genai_config.json`` describing one role (default
+    ``decoder``) with ``filename``, plus a real ONNX file at the role's
+    filename. Unless ``session_options_extras`` already supplies one, seeds the role's
+    ``session_options.provider_options`` with the alias for ``ep`` (``[]`` for CPU or an
+    unmapped EP) so the packager's EP-derivation logic can resolve the variant's EP. No
+    ``model_config.json`` is written — the packager is genai_config-driven.
+    """
     source_dir = tmp_path / name
     source_dir.mkdir(parents=True)
-    onnx_path = source_dir / "model.onnx"
+    onnx_path = source_dir / filename
     _make_onnx_inline(onnx_path, metadata_props=onnx_metadata)
-    cfg: dict = {"model_path": str(onnx_path), "model_attributes": model_attributes}
-    if inference_settings is not None:
-        cfg["inference_settings"] = inference_settings
-    model_config = {"type": "ONNXModel", "config": cfg}
-    (source_dir / "model_config.json").write_text(json.dumps(model_config))
+
+    ep_to_alias = {
+        "CPUExecutionProvider": "CPU",
+        "CUDAExecutionProvider": "cuda",
+        "QNNExecutionProvider": "qnn",
+        "OpenVINOExecutionProvider": "OpenVINO",
+        "VitisAIExecutionProvider": "VitisAI",
+        "WebGpuExecutionProvider": "WebGPU",
+        "DmlExecutionProvider": "DML",
+        "TensorrtExecutionProvider": "tensorrt",
+        "ROCMExecutionProvider": "rocm",
+        "CoreMLExecutionProvider": "CoreML",
+        "XnnpackExecutionProvider": "XNNPACK",
+    }
+    alias = ep_to_alias.get(ep, "CPU")
+    session_options: dict = dict(session_options_extras or {})
+    if alias == "CPU":
+        session_options.setdefault("provider_options", [])
+    else:
+        session_options.setdefault("provider_options", [{alias: provider_options or {}}])
+
+    genai = {
+        "model": {
+            role: {"filename": filename, "session_options": session_options},
+        }
+    }
+    (source_dir / "genai_config.json").write_text(json.dumps(genai))
     return source_dir
 
 
@@ -120,32 +153,33 @@ def _make_command(args_list):
 
 class TestSourceValidation:
     def test_accepts_single_source(self, tmp_path):
-        src = _create_source_dir(tmp_path, "soc_60", {"ep": "QNNExecutionProvider"})
+        src = _create_source_dir(tmp_path, "soc_60", ep="QNNExecutionProvider")
         cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(tmp_path / "out")])
 
         sources = cmd._parse_sources()
 
         assert sources == [("soc_60", src)]
 
-    def test_rejects_missing_model_config(self, tmp_path):
-        """A source with NEITHER model_config.json nor genai_config.json is rejected.
+    def test_rejects_missing_genai_config(self, tmp_path):
+        """A source without ``genai_config.json`` is rejected.
 
-        A genai_config-only source is now accepted (covered separately by
-        ``TestPipelineAndSynthesis``); only the truly empty / non-source
-        directory should fail.
+        The packager is genai_config-driven: it lifts the model layout
+        (role filenames, session_options, pipeline) directly from the
+        source's genai_config. A directory lacking that file has no way to
+        describe its contents to the packager.
         """
         no_config = tmp_path / "no_config"
         no_config.mkdir()
-        valid = _create_source_dir(tmp_path, "valid", {"ep": "QNNExecutionProvider"})
+        valid = _create_source_dir(tmp_path, "valid", ep="QNNExecutionProvider")
         cmd = _make_command(
             ["generate-model-package", "-s", str(no_config), "-s", str(valid), "-o", str(tmp_path / "out")]
         )
 
-        with pytest.raises(ValueError, match=r"model_config\.json"):
+        with pytest.raises(ValueError, match=r"genai_config\.json"):
             cmd._parse_sources()
 
     def test_rejects_nonexistent_path(self, tmp_path):
-        valid = _create_source_dir(tmp_path, "valid", {"ep": "QNNExecutionProvider"})
+        valid = _create_source_dir(tmp_path, "valid", ep="QNNExecutionProvider")
         cmd = _make_command(
             ["generate-model-package", "-s", "/nonexistent/path", "-s", str(valid), "-o", str(tmp_path / "out")]
         )
@@ -155,16 +189,16 @@ def test_rejects_nonexistent_path(self, tmp_path):
 
     def test_rejects_duplicate_source_basenames(self, tmp_path):
         # Two source dirs share basename "soc_60" — variant names would collide.
-        src_a = _create_source_dir(tmp_path / "a", "soc_60", {"ep": "QNNExecutionProvider"})
-        src_b = _create_source_dir(tmp_path / "b", "soc_60", {"ep": "QNNExecutionProvider"})
+        src_a = _create_source_dir(tmp_path / "a", "soc_60", ep="QNNExecutionProvider")
+        src_b = _create_source_dir(tmp_path / "b", "soc_60", ep="QNNExecutionProvider")
         cmd = _make_command(["generate-model-package", "-s", str(src_a), "-s", str(src_b), "-o", str(tmp_path / "out")])
 
         with pytest.raises(ValueError, match="share the directory name"):
             cmd._parse_sources()
 
     def test_parses_two_valid_sources(self, tmp_path):
-        src1 = _create_source_dir(tmp_path, "soc_60", {"ep": "QNNExecutionProvider"})
-        src2 = _create_source_dir(tmp_path, "soc_73", {"ep": "QNNExecutionProvider"})
+        src1 = _create_source_dir(tmp_path, "soc_60", ep="QNNExecutionProvider")
+        src2 = _create_source_dir(tmp_path, "soc_73", ep="QNNExecutionProvider")
         cmd = _make_command(["generate-model-package", "-s", str(src1), "-s", str(src2), "-o", str(tmp_path / "out")])
 
         sources = cmd._parse_sources()
@@ -182,8 +216,8 @@ def test_parses_two_valid_sources(self, tmp_path):
 class TestGeneratePackageMultiVariant:
     def test_writes_proposal_layout(self, tmp_path):
         # setup
-        src1 = _create_source_dir(tmp_path, "soc_60", {"ep": "QNNExecutionProvider", "device": "NPU"})
-        src2 = _create_source_dir(tmp_path, "soc_73", {"ep": "QNNExecutionProvider", "device": "NPU"})
+        src1 = _create_source_dir(tmp_path, "soc_60", ep="QNNExecutionProvider")
+        src2 = _create_source_dir(tmp_path, "soc_73", ep="QNNExecutionProvider")
         out = tmp_path / "out.ortpackage"
         cmd = _make_command(
             [
@@ -210,40 +244,42 @@ def test_writes_proposal_layout(self, tmp_path):
 
         manifest = json.loads((out / "manifest.json").read_text())
         assert manifest["schema_version"] == 1
-        assert manifest["components"] == ["model"]
+        # ``decoder`` (not ``model``) — the genai_config role is ``decoder``,
+        # so _extract_task -> ``text_generation`` -> component dir ``decoder``.
+        assert manifest["components"] == ["decoder"]
         assert manifest["producer"]["model_name"] == "test_model"
         assert manifest["producer"]["model_version"] == "2.0"
 
         # metadata uses inline EP
-        metadata = json.loads((out / "models" / "model" / "metadata.json").read_text())
+        metadata = json.loads((out / "models" / "decoder" / "metadata.json").read_text())
         assert metadata["schema_version"] == 1
-        assert metadata["component_name"] == "model"
+        assert metadata["component_name"] == "decoder"
         assert set(metadata["variants"]) == {"soc_60", "soc_73"}
         for variant_payload in metadata["variants"].values():
-            assert variant_payload == {"ep": "QNNExecutionProvider", "device": "NPU"}
+            assert variant_payload == {"ep": "QNNExecutionProvider"}
 
         # No variant.json is emitted; the ONNX file lands in the variant
         # directory.
         for v in ("soc_60", "soc_73"):
-            assert not (out / "models" / "model" / v / "variant.json").exists()
-            assert (out / "models" / "model" / v / "model.onnx").is_file()
+            assert not (out / "models" / "decoder" / v / "variant.json").exists()
+            assert (out / "models" / "decoder" / v / "model.onnx").is_file()
 
 
 class TestGeneratePackageSingleSource:
     def test_single_source_is_valid_package(self, tmp_path):
-        src = _create_source_dir(tmp_path, "cpu_x64", {"ep": "CPUExecutionProvider"})
+        src = _create_source_dir(tmp_path, "cpu_x64", ep="CPUExecutionProvider")
         out = tmp_path / "out.ortpackage"
         cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
 
         cmd.run()
 
         manifest = json.loads((out / "manifest.json").read_text())
-        assert manifest["components"] == ["model"]
-        metadata = json.loads((out / "models" / "model" / "metadata.json").read_text())
+        assert manifest["components"] == ["decoder"]
+        metadata = json.loads((out / "models" / "decoder" / "metadata.json").read_text())
         assert "cpu_x64" in metadata["variants"]
         assert metadata["variants"]["cpu_x64"] == {"ep": "CPUExecutionProvider"}
         # No shared_weights because nothing to dedup.
-        assert not (out / "models" / "model" / "shared_weights").exists()
+        assert not (out / "models" / "decoder" / "shared_weights").exists()
 
 
 # ---------------------------------------------------------------------------
@@ -865,38 +901,6 @@ def test_skips_config_file_with_unsafe_key(self, tmp_path):
         assert sorted(p.name for p in (out / "configs").iterdir()) == ["ok.txt"]
 
 
-# ---------------------------------------------------------------------------
-# CLI: mixed source types
-# ---------------------------------------------------------------------------
-
-
-class TestMixedSourceTypes:
-    def test_rejects_mixed_onnx_and_composite(self, tmp_path):
-        # setup: one ONNXModel source, one CompositeModel source
-        onnx_src = _create_source_dir(tmp_path, "onnx_src", {"ep": "CPUExecutionProvider"})
-        comp_src = tmp_path / "comp_src"
-        comp_src.mkdir()
-        comp_onnx = _make_onnx_inline(comp_src / "comp.onnx")
-        (comp_src / "model_config.json").write_text(
-            json.dumps(
-                {
-                    "type": "CompositeModel",
-                    "config": {
-                        "model_components": [{"type": "ONNXModel", "config": {"model_path": str(comp_onnx)}}],
-                        "component_names": ["decoder"],
-                    },
-                }
-            )
-        )
-        cmd = _make_command(
-            ["generate-model-package", "-s", str(onnx_src), "-s", str(comp_src), "-o", str(tmp_path / "out")]
-        )
-
-        # execute + assert
-        with pytest.raises(ValueError, match="mix model types"):
-            cmd.run()
-
-
 # ---------------------------------------------------------------------------
 # Helper functions
 # ---------------------------------------------------------------------------
@@ -938,7 +942,7 @@ def test_passes_through_comma_delimited_metadata(self, tmp_path):
         src = _create_source_dir(
             tmp_path,
             "soc_60",
-            {"ep": "QNNExecutionProvider", "device": "NPU"},
+            ep="QNNExecutionProvider",
             onnx_metadata={"ep_compatibility_info.QNNExecutionProvider": "soc_60,soc_69,soc_73"},
         )
         out = tmp_path / "out.ortpackage"
@@ -948,106 +952,14 @@ def test_passes_through_comma_delimited_metadata(self, tmp_path):
         cmd.run()
 
         # assert: compatibility_string passes the raw opaque string through verbatim
-        metadata = json.loads((out / "models" / "model" / "metadata.json").read_text())
+        metadata = json.loads((out / "models" / "decoder" / "metadata.json").read_text())
         variant = metadata["variants"]["soc_60"]
         assert variant["ep"] == "QNNExecutionProvider"
         assert variant["compatibility_string"] == "soc_60,soc_69,soc_73"
 
 
 # ---------------------------------------------------------------------------
-# CLI: composite (per-component inference_settings precedence)
-# ---------------------------------------------------------------------------
-
-
-def _create_composite_source(
-    tmp_path: Path,
-    name: str,
-    components: list[dict],
-    component_names: list[str],
-    *,
-    target_inference: dict | None = None,
-    target_attrs: dict | None = None,
-) -> Path:
-    """Create an Olive-style composite source dir."""
-    source_dir = tmp_path / name
-    source_dir.mkdir(parents=True)
-    cfg = {"model_components": components, "model_component_names": component_names}
-    if target_inference is not None:
-        cfg["inference_settings"] = target_inference
-    if target_attrs is not None:
-        cfg["model_attributes"] = target_attrs
-    (source_dir / "model_config.json").write_text(json.dumps({"type": "CompositeModel", "config": cfg}))
-    return source_dir
-
-
-class TestCompositeBuild:
-    def test_per_component_inference_settings_wins(self, tmp_path):
-        # setup: component-level inference_settings should override target-level
-        comp_a_onnx = _make_onnx_inline(tmp_path / "comp_a" / "model.onnx")
-        comp_b_onnx = _make_onnx_inline(tmp_path / "comp_b" / "model.onnx")
-
-        target_inference = {
-            "session_options": {"graph_optimization_level": 1},
-            "execution_provider": ["CPUExecutionProvider"],
-            "provider_options": [{}],
-        }
-        comp_b_inference = {
-            "session_options": {"graph_optimization_level": 99},
-            "execution_provider": ["CPUExecutionProvider"],
-            "provider_options": [{}],
-        }
-        components = [
-            {"type": "ONNXModel", "config": {"model_path": str(comp_a_onnx)}},
-            {
-                "type": "ONNXModel",
-                "config": {"model_path": str(comp_b_onnx), "inference_settings": comp_b_inference},
-            },
-        ]
-        src = _create_composite_source(
-            tmp_path,
-            "soc_60",
-            components,
-            ["encoder", "decoder"],
-            target_inference=target_inference,
-            target_attrs={"ep": "CPUExecutionProvider"},
-        )
-        out = tmp_path / "out.ortpackage"
-        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
-
-        # execute
-        cmd.run()
-
-        # assert: encoder uses target-level, decoder uses component-level
-        encoder_overlay = json.loads((out / "models" / "encoder" / "soc_60" / "genai_config_overlay.json").read_text())
-        assert encoder_overlay["model"]["encoder"]["session_options"]["graph_optimization_level"] == 1
-
-        decoder_overlay = json.loads((out / "models" / "decoder" / "soc_60" / "genai_config_overlay.json").read_text())
-        assert decoder_overlay["model"]["decoder"]["session_options"]["graph_optimization_level"] == 99
-
-
-# ---------------------------------------------------------------------------
-# CLI: unsupported model type
-# ---------------------------------------------------------------------------
-
-
-class TestUnsupportedModelType:
-    def test_rejects_pytorch_model(self, tmp_path):
-        # setup: a source whose model_config declares an unsupported type
-        source_dir = tmp_path / "pytorch_src"
-        source_dir.mkdir()
-        (source_dir / "model_config.json").write_text(
-            json.dumps({"type": "PyTorchModel", "config": {"model_path": "pt"}})
-        )
-        out = tmp_path / "out"
-        cmd = _make_command(["generate-model-package", "-s", str(source_dir), "-o", str(out)])
-
-        # execute + assert
-        with pytest.raises(ValueError, match="Unsupported source model type"):
-            cmd.run()
-
-
-# ---------------------------------------------------------------------------
-# Pipeline sources (multi-stage exports, e.g. QNN) and model_config synthesis
+# Pipeline sources (multi-stage exports, e.g. QNN) and VLM multi-role overlay
 # ---------------------------------------------------------------------------
 
 
@@ -1063,11 +975,10 @@ def _create_pipeline_source(
 ) -> Path:
     """Build a fake GenAI-shaped multi-stage source dir (e.g. QNN pipeline).
 
-    The source has ONE genai_config.json + N real ONNX stage files and NO
-    model_config.json — exercising the synthesis path. ``stage_with_options``
-    is the only stage carrying provider_options (per QNN convention where
-    embedding / transformer-head run on CPU and only the prompt / iter
-    stages carry the HTP options).
+    The source has ONE genai_config.json + N real ONNX stage files (no
+    ``model_config.json``). ``stage_with_options`` is the only stage carrying
+    provider_options (per QNN convention where embedding / transformer-head
+    run on CPU and only the prompt / iter stages carry the HTP options).
     """
     source_dir = tmp_path / name
     source_dir.mkdir(parents=True)
@@ -1107,38 +1018,54 @@ def _create_pipeline_source(
     return source_dir
 
 
-class TestPipelineAndSynthesis:
-    """Pipeline multi-stage sources + ``model_config.json`` synthesis."""
+def _create_vlm_source(tmp_path: Path, name: str) -> Path:
+    """Build a fake flat VLM source (vision + embedding + decoder ONNXs in one dir).
 
-    def test_accepts_source_without_model_config_when_genai_config_present(self, tmp_path):
-        """A source carrying only ``genai_config.json`` + ONNX files is accepted.
+    Mirrors the shape of real-world VLM packages where a single source dir
+    holds multiple roles' ONNX files alongside one ``genai_config.json`` that
+    references each role's ``filename``. The packager must restore EVERY
+    role's filename in the per-variant overlay — not just the primary one —
+    or the GenAI loader cannot locate the vision/embedding ONNXs at load
+    time.
+    """
+    source_dir = tmp_path / name
+    source_dir.mkdir(parents=True)
+    for fname in ("vision.onnx", "embedding.onnx", "text.onnx"):
+        _make_onnx_inline(source_dir / fname)
+    genai = {
+        "model": {
+            "type": "qwen3vl",
+            "vocab_size": 151936,
+            "vision": {
+                "filename": "vision.onnx",
+                "session_options": {"provider_options": []},
+            },
+            "embedding": {
+                "filename": "embedding.onnx",
+                "session_options": {"provider_options": []},
+            },
+            "decoder": {
+                "head_size": 128,
+                "filename": "text.onnx",
+                "session_options": {"provider_options": []},
+            },
+        }
+    }
+    (source_dir / "genai_config.json").write_text(json.dumps(genai))
+    return source_dir
 
-        Useful for packaging GenAI-shaped exports downloaded from a hub: no
-        Olive workflow was used so no ``model_config.json`` exists, but
-        ``genai_config.json`` is enough for the packager to derive the EP,
-        component name, and per-variant overlay structure.
-        """
-        src = _create_pipeline_source(
-            tmp_path,
-            "qnn_npu",
-            stage_filenames=["embed.onnx", "ctx.onnx", "iter.onnx", "head.onnx"],
-            stage_with_options="prompt-processor",
-            provider_alias="qnn",
-            provider_options={"soc_model": "60"},
-        )
-        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(tmp_path / "out")])
 
-        sources = cmd._parse_sources()
-        assert sources == [("qnn_npu", src)]
+class TestPipelineSources:
+    """Pipeline multi-stage sources (e.g. QNN)."""
 
-    def test_rejects_source_without_model_config_or_genai_config(self, tmp_path):
-        """A source with neither config file is rejected with a clear error."""
+    def test_rejects_source_without_genai_config(self, tmp_path):
+        """A source without ``genai_config.json`` is rejected with a clear error."""
         empty = tmp_path / "empty"
         empty.mkdir()
         _make_onnx_inline(empty / "model.onnx")
         cmd = _make_command(["generate-model-package", "-s", str(empty), "-o", str(tmp_path / "out")])
 
-        with pytest.raises(ValueError, match=r"neither model_config\.json nor genai_config\.json"):
+        with pytest.raises(ValueError, match=r"no genai_config\.json"):
             cmd._parse_sources()
 
     def test_packs_pipeline_with_all_stage_onnx_files(self, tmp_path):
@@ -1268,33 +1195,70 @@ def test_flat_source_ep_derived_from_source_genai_when_attrs_missing(self, tmp_p
         )
         assert overlay["model"]["decoder"]["session_options"]["provider_options"] == [{"VitisAI": {}}]
 
-    def test_unreachable_model_path_is_repointed_to_source_dir(self, tmp_path):
-        """A stale ``model_path`` (e.g. copied from another machine) is repaired.
 
-        The original ``model_attributes`` are preserved (they remain valid
-        descriptors of the model itself); only the on-disk path is patched
-        so the local ONNX file is the one actually packaged.
-        """
-        source_dir = tmp_path / "stale"
-        source_dir.mkdir()
-        _make_onnx_inline(source_dir / "model.onnx")
-        (source_dir / "genai_config.json").write_text(
-            json.dumps({"model": {"vocab_size": 100, "decoder": {"filename": "model.onnx"}}})
-        )
-        (source_dir / "model_config.json").write_text(
-            json.dumps(
-                {
-                    "type": "ONNXModel",
-                    "config": {
-                        "model_path": "/nonexistent/elsewhere/model.onnx",
-                        "model_attributes": {"task": "text-generation", "vocab_size": 100},
-                    },
-                }
-            )
+class TestVLMMultiRoleOverlay:
+    """Multi-role (vision + embedding + decoder) overlay restoration for VLM sources.
+
+    A flat VLM source dir packs >1 ONNX file referenced by >1 role in the
+    same ``genai_config.json``. The sidecar sweep copies every ONNX into
+    the variant directory, but the loader still needs the overlay to
+    declare ``filename`` for each role — the base genai_config strips
+    every role's filename and session_options. Without the multi-role
+    overlay lift the package would only restore the primary role
+    (``decoder``) and the loader would fail when it looks for the vision
+    or embedding ONNX.
+    """
+
+    def test_overlay_restores_filename_for_every_role(self, tmp_path):
+        src = _create_vlm_source(tmp_path, "cpu_and_mobile")
+        out = tmp_path / "out"
+        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
+
+        cmd.run()
+
+        overlay_path = (
+            out.with_suffix(".ortpackage") / "models" / "decoder" / "cpu_and_mobile" / "genai_config_overlay.json"
         )
+        overlay = json.loads(overlay_path.read_text())
+        model = overlay["model"]
+        # The VLM fix: every role with a filename in the source must appear
+        # in the overlay with that filename restored.
+        assert model["vision"]["filename"] == "vision.onnx"
+        assert model["embedding"]["filename"] == "embedding.onnx"
+        assert model["decoder"]["filename"] == "text.onnx"
+
+    def test_variant_dir_contains_all_role_onnxs(self, tmp_path):
+        """Sidecar sweep copies every ONNX next to the primary in the variant dir.
+
+        The single component directory holds the vision, embedding, and
+        decoder ONNXs side-by-side; without this the overlay's filename
+        restoration would resolve to missing files.
+        """
+        src = _create_vlm_source(tmp_path, "cpu_and_mobile")
         out = tmp_path / "out"
-        cmd = _make_command(["generate-model-package", "-s", str(source_dir), "-o", str(out)])
+        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
+
+        cmd.run()
+
+        variant_dir = out.with_suffix(".ortpackage") / "models" / "decoder" / "cpu_and_mobile"
+        for fname in ("vision.onnx", "embedding.onnx", "text.onnx"):
+            assert (variant_dir / fname).is_file(), f"missing {fname} in variant dir"
+
+    def test_base_genai_injects_component_marker_for_every_role(self, tmp_path):
+        """Every role of a multi-role source gets a ``component=<comp>`` marker in base.
+
+        The merged config the loader sees must know which component
+        directory each role lives in. The base genai_config injects a
+        ``component`` field for every role so per-variant lookups resolve
+        to the correct on-disk variant directory.
+        """
+        src = _create_vlm_source(tmp_path, "cpu_and_mobile")
+        out = tmp_path / "out"
+        cmd = _make_command(["generate-model-package", "-s", str(src), "-o", str(out)])
 
         cmd.run()
 
-        assert (out.with_suffix(".ortpackage") / "models" / "decoder" / "stale" / "model.onnx").is_file()
+        base = json.loads((out.with_suffix(".ortpackage") / "configs" / "genai_config.json").read_text())
+        model = base["model"]
+        for role in ("vision", "embedding", "decoder"):
+            assert model[role]["component"] == "decoder", f"role {role} missing component marker"