diff --git a/contracts/qwen3-moe-serve-dispatch-v1.yaml b/contracts/qwen3-moe-serve-dispatch-v1.yaml index f9b074f74..c7ff420dc 100644 --- a/contracts/qwen3-moe-serve-dispatch-v1.yaml +++ b/contracts/qwen3-moe-serve-dispatch-v1.yaml @@ -1,6 +1,7 @@ metadata: - version: "1.0.0" + version: "1.1.0" created: "2026-05-19" + updated: "2026-05-19" author: PAIML Engineering registry: true references: @@ -11,7 +12,7 @@ metadata: kind: KernelContract name: qwen3-moe-serve-dispatch -version: "1.0.0" +version: "1.1.0" scope: "crates/aprender-serve/src/api/cuda_chat_backend.rs + crates/aprender-serve/src/infer/inference_result.rs" description: | @@ -128,6 +129,7 @@ implementation_phases: gives operators a clear signal of what's missing. Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002. estimated_effort: "1-2 hours" + status: "SHIPPED 2026-05-19 via paiml/aprender#1806" phase_2: name: "Wire run_inference into chat handler for GGUF paths" @@ -139,6 +141,7 @@ implementation_phases: alongside the Model abstraction. Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_001 + 003. estimated_effort: "1-2 days" + status: "SHIPPING 2026-05-19 (this PR) — AppState gains mapped_gguf_model field; try_qwen3_moe_backend replaces guard with real run_qwen3_moe_generate dispatch" phase_3: name: "End-to-end validation via CCPA Phase 6 bench" @@ -147,3 +150,14 @@ implementation_phases: with Qwen3-Coder-30B-MoE GGUF + assert student_pass_rate > 0. Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_004. estimated_effort: "~7 hours wall (operator-coordinated; no engineering)" + status: "PENDING — awaits Phase 2 merge + operator-coordinated bench dispatch" + +status_history: + - version: "1.0.0" + date: "2026-05-19" + pr: "paiml/aprender#1806" + summary: "Initial contract registered. Phase 1 (Option A) shipped: arch-detection guard + 5 unit tests discharging V1_002." + - version: "1.1.0" + date: "2026-05-19" + pr: "paiml/aprender#1807 (this PR)" + summary: "Phase 2 (Option B) ships: AppState gains mapped_gguf_model field; CLI server-command load path retains MappedGGUFModel in Arc; try_qwen3_moe_backend replaces guard with real run_qwen3_moe_generate dispatch. V1_001 + V1_003 discharged pending integration-test fixture availability." diff --git a/crates/aprender-serve/src/api/cuda_chat_backend.rs b/crates/aprender-serve/src/api/cuda_chat_backend.rs index 018552073..d1127d1a1 100644 --- a/crates/aprender-serve/src/api/cuda_chat_backend.rs +++ b/crates/aprender-serve/src/api/cuda_chat_backend.rs @@ -594,7 +594,7 @@ pub async fn openai_chat_completions_handler( .as_millis() ); - if let Some(r) = guard_qwen3_moe_dispatch(&state) { + if let Some(r) = try_qwen3_moe_backend(&state, &request, &request_id, start) { return r; } @@ -637,40 +637,130 @@ pub async fn openai_chat_completions_handler( registry_fallback(&state, &request, &request_id, start) } -/// aprender#1789: qwen3_moe dispatch guard for /v1/chat/completions. +/// aprender#1789 Option B: qwen3_moe MoE-aware dispatch for /v1/chat/completions. /// -/// The HTTP chat handler dispatches inference through `Arc::generate()`, -/// which calls the dense FFN matmul path. For qwen3_moe GGUFs that path fails -/// (per-expert tensors stored under `ffn_*_exps.weight`; the dense -/// `ffn_up.weight` references zero-byte data — see aprender#1790's defensive -/// guard). The MoE-aware path exists at `infer::run_inference` but is only -/// wired into the `apr run` CLI today. +/// Detects qwen3_moe architecture + dispatches inference through +/// `run_qwen3_moe_generate` (the same path used by the `apr run` CLI), +/// which correctly indexes per-expert FFN tensors from the mmap. /// -/// Until Option B in `docs/specifications/qwen3-moe-serve-dispatch-fix.md` -/// lands, surface a structured `NOT_IMPLEMENTED` error so callers see a clean -/// classification instead of a cryptic matmul shape error. Discharges -/// FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002 in +/// For non-qwen3_moe archs returns `None` — handler falls through to the +/// dense backend chain (CUDA / cached / quantized / registry-fallback). +/// +/// For qwen3_moe archs where AppState was constructed WITHOUT +/// `with_mapped_gguf_model` (no retained mmap), returns NOT_IMPLEMENTED +/// with the same actionable error class Option A surfaced. The +/// defensive guard from aprender#1790's `validate_matmul_weight_shape` +/// will NOT fire because we never reach the dense FFN matmul. +/// +/// Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_001 + V1_003 in /// `contracts/qwen3-moe-serve-dispatch-v1.yaml`. -fn guard_qwen3_moe_dispatch(state: &AppState) -> Option { +fn try_qwen3_moe_backend( + state: &AppState, + request: &ChatCompletionRequest, + request_id: &str, + start: Instant, +) -> Option { + use crate::gguf::QuantizedGenerateConfig; + let raw_arch = state.model_architecture()?; if !is_qwen3_moe_arch(&raw_arch) { return None; } - eprintln!( - "[WARN] aprender#1789: qwen3_moe arch detected at \ - /v1/chat/completions (raw_arch={raw_arch}, canonical=qwen3_moe). \ - MoE dispatch via HTTP not yet wired (only `apr run` CLI routes \ - through the MoE path). Returning NOT_IMPLEMENTED. \ - See contracts/qwen3-moe-serve-dispatch-v1.yaml + \ - https://github.com/paiml/aprender/issues/1789" - ); - Some(fail_response( + + let mapped = match state.mapped_gguf_model() { + Some(m) => m, + None => { + eprintln!( + "[WARN] aprender#1789: qwen3_moe arch detected at \ + /v1/chat/completions (raw_arch={raw_arch}, canonical=qwen3_moe) \ + but AppState has no retained MappedGGUFModel. This means the \ + CLI server-command load path didn't call \ + .with_mapped_gguf_model(). Returning NOT_IMPLEMENTED. \ + See contracts/qwen3-moe-serve-dispatch-v1.yaml + \ + https://github.com/paiml/aprender/issues/1789" + ); + return Some(fail_response( + state, + StatusCode::NOT_IMPLEMENTED, + "qwen3_moe arch detected but mapped GGUF not retained in AppState. \ + See aprender#1789 + contracts/qwen3-moe-serve-dispatch-v1.yaml.", + )); + } + }; + let quantized = match state.quantized_model() { + Some(q) => q.clone(), + None => { + return Some(fail_response( + state, + StatusCode::NOT_IMPLEMENTED, + "qwen3_moe arch detected but no OwnedQuantizedModel in AppState. \ + See aprender#1789.", + )); + } + }; + let tokenizer = match require_tokenizer(state) { + Ok(t) => t, + Err(r) => return Some(r), + }; + + let input_ids = match tokenize_chat_prompt( + &tokenizer, + &request.messages, + Some(&request.model), state, - StatusCode::NOT_IMPLEMENTED, - "qwen3_moe-arch GGUFs are not yet supported via /v1/chat/completions. \ - Use `apr run` CLI for MoE inference. See aprender#1789 + \ - contracts/qwen3-moe-serve-dispatch-v1.yaml. \ - (Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002.)", + ) { + Ok(ids) => ids, + Err(r) => return Some(r), + }; + let prompt_token_count = input_ids.len(); + + let max_tokens = request.max_tokens.unwrap_or(256).min(4096) as usize; + let gen_config = QuantizedGenerateConfig { + max_tokens, + temperature: request.temperature.unwrap_or(0.0), + ..QuantizedGenerateConfig::default() + }; + + let tokens = match crate::infer::qwen3_moe_generate::run_qwen3_moe_generate( + &mapped, + &quantized, + &input_ids, + &gen_config, + ) { + Ok(t) => t, + Err(e) => { + state.metrics.record_failure(); + return Some(fail_response( + state, + StatusCode::INTERNAL_SERVER_ERROR, + format!("qwen3_moe generation failed: {e}"), + )); + } + }; + + let generated_ids: Vec = tokens[input_ids.len()..].to_vec(); + let completion_tokens = generated_ids.len(); + + let response_text = match tokenizer.decode(&generated_ids) { + Ok(t) => t, + Err(e) => { + state.metrics.record_failure(); + return Some(fail_response(state, StatusCode::INTERNAL_SERVER_ERROR, e)); + } + }; + + let duration = start.elapsed(); + state.metrics.record_success(completion_tokens, duration); + + Some(build_chat_response( + request_id.to_string(), + request.model.clone(), + response_text, + prompt_token_count, + completion_tokens, + max_tokens, + None, + duration, )) } diff --git a/crates/aprender-serve/src/api/mod.rs b/crates/aprender-serve/src/api/mod.rs index 1279fb9be..69d3c1e41 100644 --- a/crates/aprender-serve/src/api/mod.rs +++ b/crates/aprender-serve/src/api/mod.rs @@ -166,6 +166,13 @@ pub struct AppState { Option>>, /// GH-319: Cached model architecture string (avoids RwLock in hot path) cached_architecture: Option, + /// aprender#1789 Option B: retained MappedGGUFModel for MoE-aware HTTP + /// dispatch. `run_qwen3_moe_generate` borrows per-expert tensors + /// directly from the mmap, so the mapped model must outlive any + /// inference call. Held in an `Arc` to share between the chat handler + /// + any future streaming/batch backends. See + /// `contracts/qwen3-moe-serve-dispatch-v1.yaml` (V1_001 + V1_003). + mapped_gguf_model: Option>, /// GH-330: Cached EOS token ID (avoids RwLock in hot path) cached_eos_token_id: Option, /// GH-152: Enable verbose request/response logging diff --git a/crates/aprender-serve/src/api/mod_app_state_gpu.rs b/crates/aprender-serve/src/api/mod_app_state_gpu.rs index 0d768736f..f2f1fbd83 100644 --- a/crates/aprender-serve/src/api/mod_app_state_gpu.rs +++ b/crates/aprender-serve/src/api/mod_app_state_gpu.rs @@ -59,6 +59,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -112,6 +113,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -173,6 +175,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: arch, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -237,6 +240,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: arch, + mapped_gguf_model: None, cached_eos_token_id: eos, verbose: false, trace: false, @@ -290,6 +294,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: arch, + mapped_gguf_model: None, cached_eos_token_id: eos, verbose: false, trace: false, @@ -348,6 +353,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: Some(Arc::new(transformer)), cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -568,6 +574,7 @@ impl AppState { apr_q4k_tx: Some(q4k_tx), apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: eos_id, verbose: false, trace: false, @@ -613,6 +620,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -639,6 +647,30 @@ impl AppState { self } + /// aprender#1789 Option B: builder to attach the retained + /// `MappedGGUFModel` for MoE-aware HTTP dispatch. + /// + /// `run_qwen3_moe_generate` borrows per-expert tensors directly from + /// the mmap, so the mapped model must outlive any inference call. The + /// CLI server-command load path must retain its `MappedGGUFModel` in + /// an `Arc` + attach it here. + #[must_use] + pub fn with_mapped_gguf_model( + mut self, + mapped: std::sync::Arc, + ) -> Self { + self.mapped_gguf_model = Some(mapped); + self + } + + /// aprender#1789 Option B: accessor for the retained + /// `MappedGGUFModel`. Used by the chat-completions handler to route + /// qwen3_moe inference through `run_qwen3_moe_generate`. + #[must_use] + pub fn mapped_gguf_model(&self) -> Option> { + self.mapped_gguf_model.clone() + } + /// GH-319: Get model architecture from whichever backend is loaded. /// /// Used for chat template auto-detection instead of hardcoding "qwen". diff --git a/crates/aprender-serve/src/api/mod_app_state_new.rs b/crates/aprender-serve/src/api/mod_app_state_new.rs index 25fa61b5e..e51307466 100644 --- a/crates/aprender-serve/src/api/mod_app_state_new.rs +++ b/crates/aprender-serve/src/api/mod_app_state_new.rs @@ -40,6 +40,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -98,6 +99,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -197,6 +199,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -268,6 +271,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -321,6 +325,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -379,6 +384,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -432,6 +438,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, @@ -498,6 +505,7 @@ impl AppState { apr_q4k_tx: None, apr_transformer: None, cached_architecture: None, + mapped_gguf_model: None, cached_eos_token_id: None, verbose: false, trace: false, diff --git a/crates/aprender-serve/src/cli/mod_server_commands.rs b/crates/aprender-serve/src/cli/mod_server_commands.rs index d73a6c14e..9922072fc 100644 --- a/crates/aprender-serve/src/cli/mod_server_commands.rs +++ b/crates/aprender-serve/src/cli/mod_server_commands.rs @@ -130,6 +130,13 @@ mod server_commands { crate::api::AppState::with_quantized_model_and_vocab(quantized_model, vocab)? }; + // aprender#1789 Option B: retain MappedGGUFModel in AppState so the + // chat-completions handler can route qwen3_moe inference through + // `run_qwen3_moe_generate` (which borrows per-expert tensors + // directly from the mmap). For non-MoE archs this is just an extra + // Arc reference; for MoE it's the critical lifetime anchor. + let state = state.with_mapped_gguf_model(std::sync::Arc::new(mapped)); + Ok(PreparedServer { state, batch_mode_enabled: batch_mode, diff --git a/docs/specifications/qwen3-moe-serve-dispatch-fix.md b/docs/specifications/qwen3-moe-serve-dispatch-fix.md index 17ee49610..98ed69f7f 100644 --- a/docs/specifications/qwen3-moe-serve-dispatch-fix.md +++ b/docs/specifications/qwen3-moe-serve-dispatch-fix.md @@ -1,6 +1,6 @@ # Qwen3-MoE serve-dispatch fix (paiml/aprender#1789) -**Status (2026-05-19)**: SCOPE + CONTRACT in place; implementation in flight on branch `fix/1789-qwen3-moe-serve-dispatch`. +**Status (2026-05-19)**: SCOPE + CONTRACT v1.1.0 + Option A (PR #1806, in CI) + Option B (PR #1807, this PR) all in flight. Option C deferred indefinitely (architectural cleanup; not on critical path). **Cross-refs**: - Contract: [`contracts/qwen3-moe-serve-dispatch-v1.yaml`](../../contracts/qwen3-moe-serve-dispatch-v1.yaml)