paiml · noahgift · May 19, 2026 · May 19, 2026
diff --git a/contracts/qwen3-moe-serve-dispatch-v1.yaml b/contracts/qwen3-moe-serve-dispatch-v1.yaml
@@ -1,6 +1,7 @@
 metadata:
-  version: "1.0.0"
+  version: "1.1.0"
   created: "2026-05-19"
+  updated: "2026-05-19"
   author: PAIML Engineering
   registry: true
   references:
@@ -11,7 +12,7 @@ metadata:
 
 kind: KernelContract
 name: qwen3-moe-serve-dispatch
-version: "1.0.0"
+version: "1.1.0"
 scope: "crates/aprender-serve/src/api/cuda_chat_backend.rs + crates/aprender-serve/src/infer/inference_result.rs"
 
 description: |
@@ -128,6 +129,7 @@ implementation_phases:
       gives operators a clear signal of what's missing. Discharges
       FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002.
     estimated_effort: "1-2 hours"
+    status: "SHIPPED 2026-05-19 via paiml/aprender#1806"
 
   phase_2:
     name: "Wire run_inference into chat handler for GGUF paths"
@@ -139,6 +141,7 @@ implementation_phases:
       alongside the Model abstraction. Discharges
       FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_001 + 003.
     estimated_effort: "1-2 days"
+    status: "SHIPPING 2026-05-19 via paiml/aprender#1807 — AppState gains mapped_gguf_model field; try_qwen3_moe_backend replaces guard with real run_qwen3_moe_generate dispatch"
 
   phase_3:
     name: "End-to-end validation via CCPA Phase 6 bench"
@@ -147,3 +150,14 @@ implementation_phases:
       with Qwen3-Coder-30B-MoE GGUF + assert student_pass_rate > 0.
       Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_004.
     estimated_effort: "~7 hours wall (operator-coordinated; no engineering)"
+    status: "PENDING — awaits Phase 2 merge + operator-coordinated bench dispatch"
+
+status_history:
+  - version: "1.0.0"
+    date: "2026-05-19"
+    pr: "paiml/aprender#1806"
+    summary: "Initial contract registered. Phase 1 (Option A) shipped: arch-detection guard + 5 unit tests discharging V1_002."
+  - version: "1.1.0"
+    date: "2026-05-19"
+    pr: "paiml/aprender#1807 (this PR)"
+    summary: "Phase 2 (Option B) ships: AppState gains mapped_gguf_model field; CLI server-command load path retains MappedGGUFModel in Arc; try_qwen3_moe_backend replaces guard with real run_qwen3_moe_generate dispatch. V1_001 + V1_003 are discharged provisionally, pending integration-test fixture availability."
diff --git a/crates/aprender-serve/src/api/cuda_chat_backend.rs b/crates/aprender-serve/src/api/cuda_chat_backend.rs
@@ -594,7 +594,7 @@ pub async fn openai_chat_completions_handler(
             .as_millis()
     );
 
-    if let Some(r) = guard_qwen3_moe_dispatch(&state) {
+    if let Some(r) = try_qwen3_moe_backend(&state, &request, &request_id, start) {
         return r;
     }
 
@@ -637,40 +637,137 @@ pub async fn openai_chat_completions_handler(
     registry_fallback(&state, &request, &request_id, start)
 }
 
-/// aprender#1789: qwen3_moe dispatch guard for /v1/chat/completions.
+/// aprender#1789 Option B: qwen3_moe MoE-aware dispatch for /v1/chat/completions.
 ///
-/// The HTTP chat handler dispatches inference through `Arc<Model>::generate()`,
-/// which calls the dense FFN matmul path. For qwen3_moe GGUFs that path fails
-/// (per-expert tensors stored under `ffn_*_exps.weight`; the dense
-/// `ffn_up.weight` references zero-byte data — see aprender#1790's defensive
-/// guard). The MoE-aware path exists at `infer::run_inference` but is only
-/// wired into the `apr run` CLI today.
+/// Detects qwen3_moe architecture + dispatches inference through
+/// `run_qwen3_moe_generate` (the same path used by the `apr run` CLI),
+/// which correctly indexes per-expert FFN tensors from the mmap.
 ///
-/// Until Option B in `docs/specifications/qwen3-moe-serve-dispatch-fix.md`
-/// lands, surface a structured `NOT_IMPLEMENTED` error so callers see a clean
-/// classification instead of a cryptic matmul shape error. Discharges
-/// FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002 in
+/// For non-qwen3_moe archs returns `None` — handler falls through to the
+/// dense backend chain (CUDA / cached / quantized / registry-fallback).
+///
+/// For qwen3_moe archs where AppState was constructed WITHOUT
+/// `with_mapped_gguf_model` (no retained mmap), returns NOT_IMPLEMENTED
+/// with the same actionable error class Option A surfaced. The
+/// defensive guard from aprender#1790's `validate_matmul_weight_shape`
+/// will NOT fire because we never reach the dense FFN matmul.
+///
+/// Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_001 + V1_003 in
 /// `contracts/qwen3-moe-serve-dispatch-v1.yaml`.
-fn guard_qwen3_moe_dispatch(state: &AppState) -> Option<Response> {
+///
+/// Once the arch matches, every path returns `Some`: a missing
+/// `OwnedQuantizedModel` also yields NOT_IMPLEMENTED, tokenizer and prompt
+/// tokenization errors are returned as-is, and generation/decode failures
+/// yield INTERNAL_SERVER_ERROR after `state.metrics.record_failure()`.
+fn try_qwen3_moe_backend(
+    state: &AppState,
+    request: &ChatCompletionRequest,
+    request_id: &str,
+    start: Instant,
+) -> Option<Response> {
+    use crate::gguf::QuantizedGenerateConfig;
+
     let raw_arch = state.model_architecture()?;
     if !is_qwen3_moe_arch(&raw_arch) {
         return None;
     }
-    eprintln!(
-        "[WARN] aprender#1789: qwen3_moe arch detected at \
-         /v1/chat/completions (raw_arch={raw_arch}, canonical=qwen3_moe). \
-         MoE dispatch via HTTP not yet wired (only `apr run` CLI routes \
-         through the MoE path). Returning NOT_IMPLEMENTED. \
-         See contracts/qwen3-moe-serve-dispatch-v1.yaml + \
-         https://github.com/paiml/aprender/issues/1789"
-    );
-    Some(fail_response(
+
+    let mapped = match state.mapped_gguf_model() {
+        Some(m) => m,
+        None => {
+            eprintln!(
+                "[WARN] aprender#1789: qwen3_moe arch detected at \
+                 /v1/chat/completions (raw_arch={raw_arch}, canonical=qwen3_moe) \
+                 but AppState has no retained MappedGGUFModel. This means the \
+                 CLI server-command load path didn't call \
+                 .with_mapped_gguf_model(). Returning NOT_IMPLEMENTED. \
+                 See contracts/qwen3-moe-serve-dispatch-v1.yaml + \
+                 https://github.com/paiml/aprender/issues/1789"
+            );
+            return Some(fail_response(
+                state,
+                StatusCode::NOT_IMPLEMENTED,
+                "qwen3_moe arch detected but mapped GGUF not retained in AppState. \
+                 See aprender#1789 + contracts/qwen3-moe-serve-dispatch-v1.yaml.",
+            ));
+        }
+    };
+    let quantized = match state.quantized_model() {
+        Some(q) => q.clone(),
+        None => {
+            return Some(fail_response(
+                state,
+                StatusCode::NOT_IMPLEMENTED,
+                "qwen3_moe arch detected but no OwnedQuantizedModel in AppState. \
+                 See aprender#1789.",
+            ));
+        }
+    };
+    let tokenizer = match require_tokenizer(state) {
+        Ok(t) => t,
+        Err(r) => return Some(r),
+    };
+
+    let input_ids = match tokenize_chat_prompt(
+        &tokenizer,
+        &request.messages,
+        Some(&request.model),
         state,
-        StatusCode::NOT_IMPLEMENTED,
-        "qwen3_moe-arch GGUFs are not yet supported via /v1/chat/completions. \
-         Use `apr run` CLI for MoE inference. See aprender#1789 + \
-         contracts/qwen3-moe-serve-dispatch-v1.yaml. \
-         (Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002.)",
+    ) {
+        Ok(ids) => ids,
+        Err(r) => return Some(r),
+    };
+    let prompt_token_count = input_ids.len();
+
+    let max_tokens = request.max_tokens.unwrap_or(256).min(4096) as usize;
+    let gen_config = QuantizedGenerateConfig {
+        max_tokens,
+        temperature: request.temperature.unwrap_or(0.0),
+        ..QuantizedGenerateConfig::default()
+    };
+
+    let tokens = match crate::infer::qwen3_moe_generate::run_qwen3_moe_generate(
+        &mapped,
+        &quantized,
+        &input_ids,
+        &gen_config,
+    ) {
+        Ok(t) => t,
+        Err(e) => {
+            state.metrics.record_failure();
+            return Some(fail_response(
+                state,
+                StatusCode::INTERNAL_SERVER_ERROR,
+                format!("qwen3_moe generation failed: {e}"),
+            ));
+        }
+    };
+
+    // Checked slice: if generation ever returns fewer tokens than the prompt,
+    // report an empty completion instead of panicking inside the handler.
+    let generated_ids: Vec<u32> = tokens.get(input_ids.len()..).unwrap_or_default().to_vec();
+    let completion_tokens = generated_ids.len();
+
+    let response_text = match tokenizer.decode(&generated_ids) {
+        Ok(t) => t,
+        Err(e) => {
+            state.metrics.record_failure();
+            return Some(fail_response(state, StatusCode::INTERNAL_SERVER_ERROR, e));
+        }
+    };
+
+    let duration = start.elapsed();
+    state.metrics.record_success(completion_tokens, duration);
+
+    Some(build_chat_response(
+        request_id.to_string(),
+        request.model.clone(),
+        response_text,
+        prompt_token_count,
+        completion_tokens,
+        max_tokens,
+        None,
+        duration,
     ))
 }
 

diff --git a/crates/aprender-serve/src/api/mod.rs b/crates/aprender-serve/src/api/mod.rs
@@ -166,6 +166,13 @@ pub struct AppState {
         Option<Arc<std::sync::Mutex<crate::safetensors_cuda::SafeTensorsCudaModel>>>,
     /// GH-319: Cached model architecture string (avoids RwLock in hot path)
     cached_architecture: Option<String>,
+    /// aprender#1789 Option B: retained MappedGGUFModel for MoE-aware HTTP
+    /// dispatch. `run_qwen3_moe_generate` borrows per-expert tensors
+    /// directly from the mmap, so the mapped model must outlive any
+    /// inference call. Held in an `Arc` to share between the chat handler
+    /// + any future streaming/batch backends. See
+    /// `contracts/qwen3-moe-serve-dispatch-v1.yaml` (V1_001 + V1_003).
+    mapped_gguf_model: Option<Arc<crate::gguf::MappedGGUFModel>>,
     /// GH-330: Cached EOS token ID (avoids RwLock in hot path)
     cached_eos_token_id: Option<u32>,
     /// GH-152: Enable verbose request/response logging

diff --git a/crates/aprender-serve/src/api/mod_app_state_gpu.rs b/crates/aprender-serve/src/api/mod_app_state_gpu.rs
@@ -59,6 +59,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -112,6 +113,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -173,6 +175,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: arch,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -237,6 +240,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: arch,
+            mapped_gguf_model: None,
             cached_eos_token_id: eos,
             verbose: false,
             trace: false,
@@ -290,6 +294,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: arch,
+            mapped_gguf_model: None,
             cached_eos_token_id: eos,
             verbose: false,
             trace: false,
@@ -348,6 +353,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: Some(Arc::new(transformer)),
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -568,6 +574,7 @@ impl AppState {
             apr_q4k_tx: Some(q4k_tx),
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: eos_id,
             verbose: false,
             trace: false,
@@ -613,6 +620,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -639,6 +647,30 @@ impl AppState {
         self
     }
 
+    /// aprender#1789 Option B: builder step that stores a retained
+    /// `MappedGGUFModel` on this state for MoE-aware HTTP dispatch.
+    ///
+    /// Takes `self` by value, sets the `mapped_gguf_model` field to
+    /// `Some(mapped)` (overwriting any earlier value) and hands the state
+    /// back. The CLI server-command load path is expected to keep its
+    /// `MappedGGUFModel` in an `Arc` and pass it in here.
+    #[must_use]
+    pub fn with_mapped_gguf_model(
+        mut self,
+        mapped: std::sync::Arc<crate::gguf::MappedGGUFModel>,
+    ) -> Self {
+        self.mapped_gguf_model = Option::from(mapped);
+        self
+    }
+
+    /// aprender#1789 Option B: returns a shared handle to the retained
+    /// `MappedGGUFModel`, or `None` when none was attached. Cloning the
+    /// `Arc` only bumps its reference count; the model is not copied.
+    #[must_use]
+    pub fn mapped_gguf_model(&self) -> Option<std::sync::Arc<crate::gguf::MappedGGUFModel>> {
+        self.mapped_gguf_model.as_ref().map(std::sync::Arc::clone)
+    }
+
     /// GH-319: Get model architecture from whichever backend is loaded.
     ///
     /// Used for chat template auto-detection instead of hardcoding "qwen".

diff --git a/crates/aprender-serve/src/api/mod_app_state_new.rs b/crates/aprender-serve/src/api/mod_app_state_new.rs
@@ -40,6 +40,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -98,6 +99,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -197,6 +199,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -268,6 +271,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -321,6 +325,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -379,6 +384,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -432,6 +438,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,
@@ -498,6 +505,7 @@ impl AppState {
             apr_q4k_tx: None,
             apr_transformer: None,
             cached_architecture: None,
+            mapped_gguf_model: None,
             cached_eos_token_id: None,
             verbose: false,
             trace: false,