Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions contracts/qwen3-moe-serve-dispatch-v1.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
metadata:
version: "1.0.0"
version: "1.1.0"
created: "2026-05-19"
updated: "2026-05-19"
author: PAIML Engineering
registry: true
references:
Expand All @@ -11,7 +12,7 @@ metadata:

kind: KernelContract
name: qwen3-moe-serve-dispatch
version: "1.0.0"
version: "1.1.0"
scope: "crates/aprender-serve/src/api/cuda_chat_backend.rs + crates/aprender-serve/src/infer/inference_result.rs"

description: |
Expand Down Expand Up @@ -128,6 +129,7 @@ implementation_phases:
gives operators a clear signal of what's missing. Discharges
FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002.
estimated_effort: "1-2 hours"
status: "SHIPPED 2026-05-19 via paiml/aprender#1806"

phase_2:
name: "Wire run_inference into chat handler for GGUF paths"
Expand All @@ -139,6 +141,7 @@ implementation_phases:
alongside the Model abstraction. Discharges
FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_001 + 003.
estimated_effort: "1-2 days"
status: "SHIPPING 2026-05-19 via paiml/aprender#1807 — AppState gains mapped_gguf_model field; try_qwen3_moe_backend replaces guard with real run_qwen3_moe_generate dispatch"

phase_3:
name: "End-to-end validation via CCPA Phase 6 bench"
Expand All @@ -147,3 +150,14 @@ implementation_phases:
with Qwen3-Coder-30B-MoE GGUF + assert student_pass_rate > 0.
Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_004.
estimated_effort: "~7 hours wall (operator-coordinated; no engineering)"
status: "PENDING — awaits Phase 2 merge + operator-coordinated bench dispatch"

status_history:
- version: "1.0.0"
date: "2026-05-19"
pr: "paiml/aprender#1806"
summary: "Initial contract registered. Phase 1 (Option A) shipped: arch-detection guard + 5 unit tests discharging V1_002."
- version: "1.1.0"
date: "2026-05-19"
pr: "paiml/aprender#1807"
summary: "Phase 2 (Option B) ships: AppState gains mapped_gguf_model field; CLI server-command load path retains MappedGGUFModel in Arc; try_qwen3_moe_backend replaces guard with real run_qwen3_moe_generate dispatch. V1_001 + V1_003 discharged pending integration-test fixture availability."
144 changes: 117 additions & 27 deletions crates/aprender-serve/src/api/cuda_chat_backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ pub async fn openai_chat_completions_handler(
.as_millis()
);

if let Some(r) = guard_qwen3_moe_dispatch(&state) {
if let Some(r) = try_qwen3_moe_backend(&state, &request, &request_id, start) {
return r;
}

Expand Down Expand Up @@ -637,40 +637,130 @@ pub async fn openai_chat_completions_handler(
registry_fallback(&state, &request, &request_id, start)
}

/// aprender#1789: qwen3_moe dispatch guard for /v1/chat/completions.
/// aprender#1789 Option B: qwen3_moe MoE-aware dispatch for /v1/chat/completions.
///
/// The HTTP chat handler dispatches inference through `Arc<Model>::generate()`,
/// which calls the dense FFN matmul path. For qwen3_moe GGUFs that path fails
/// (per-expert tensors stored under `ffn_*_exps.weight`; the dense
/// `ffn_up.weight` references zero-byte data — see aprender#1790's defensive
/// guard). The MoE-aware path exists at `infer::run_inference` but is only
/// wired into the `apr run` CLI today.
/// Detects qwen3_moe architecture + dispatches inference through
/// `run_qwen3_moe_generate` (the same path used by the `apr run` CLI),
/// which correctly indexes per-expert FFN tensors from the mmap.
///
/// Until Option B in `docs/specifications/qwen3-moe-serve-dispatch-fix.md`
/// lands, surface a structured `NOT_IMPLEMENTED` error so callers see a clean
/// classification instead of a cryptic matmul shape error. Discharges
/// FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002 in
/// For non-qwen3_moe archs returns `None` — handler falls through to the
/// dense backend chain (CUDA / cached / quantized / registry-fallback).
///
/// For qwen3_moe archs where AppState was constructed WITHOUT
/// `with_mapped_gguf_model` (no retained mmap), returns NOT_IMPLEMENTED
/// with the same actionable error class Option A surfaced. The
/// defensive guard from aprender#1790's `validate_matmul_weight_shape`
/// will NOT fire because we never reach the dense FFN matmul.
///
/// Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_001 + V1_003 in
/// `contracts/qwen3-moe-serve-dispatch-v1.yaml`.
fn guard_qwen3_moe_dispatch(state: &AppState) -> Option<Response> {
fn try_qwen3_moe_backend(
state: &AppState,
request: &ChatCompletionRequest,
request_id: &str,
start: Instant,
) -> Option<Response> {
use crate::gguf::QuantizedGenerateConfig;

let raw_arch = state.model_architecture()?;
if !is_qwen3_moe_arch(&raw_arch) {
return None;
}
eprintln!(
"[WARN] aprender#1789: qwen3_moe arch detected at \
/v1/chat/completions (raw_arch={raw_arch}, canonical=qwen3_moe). \
MoE dispatch via HTTP not yet wired (only `apr run` CLI routes \
through the MoE path). Returning NOT_IMPLEMENTED. \
See contracts/qwen3-moe-serve-dispatch-v1.yaml + \
https://github.com/paiml/aprender/issues/1789"
);
Some(fail_response(

let mapped = match state.mapped_gguf_model() {
Some(m) => m,
None => {
eprintln!(
"[WARN] aprender#1789: qwen3_moe arch detected at \
/v1/chat/completions (raw_arch={raw_arch}, canonical=qwen3_moe) \
but AppState has no retained MappedGGUFModel. This means the \
CLI server-command load path didn't call \
.with_mapped_gguf_model(). Returning NOT_IMPLEMENTED. \
See contracts/qwen3-moe-serve-dispatch-v1.yaml + \
https://github.com/paiml/aprender/issues/1789"
);
return Some(fail_response(
state,
StatusCode::NOT_IMPLEMENTED,
"qwen3_moe arch detected but mapped GGUF not retained in AppState. \
See aprender#1789 + contracts/qwen3-moe-serve-dispatch-v1.yaml.",
));
}
};
let quantized = match state.quantized_model() {
Some(q) => q.clone(),
None => {
return Some(fail_response(
state,
StatusCode::NOT_IMPLEMENTED,
"qwen3_moe arch detected but no OwnedQuantizedModel in AppState. \
See aprender#1789.",
));
}
};
let tokenizer = match require_tokenizer(state) {
Ok(t) => t,
Err(r) => return Some(r),
};

let input_ids = match tokenize_chat_prompt(
&tokenizer,
&request.messages,
Some(&request.model),
state,
StatusCode::NOT_IMPLEMENTED,
"qwen3_moe-arch GGUFs are not yet supported via /v1/chat/completions. \
Use `apr run` CLI for MoE inference. See aprender#1789 + \
contracts/qwen3-moe-serve-dispatch-v1.yaml. \
(Discharges FALSIFY-QWEN3_MOE_SERVE_DISPATCH_V1_002.)",
) {
Ok(ids) => ids,
Err(r) => return Some(r),
};
let prompt_token_count = input_ids.len();

let max_tokens = request.max_tokens.unwrap_or(256).min(4096) as usize;
let gen_config = QuantizedGenerateConfig {
max_tokens,
temperature: request.temperature.unwrap_or(0.0),
..QuantizedGenerateConfig::default()
};

let tokens = match crate::infer::qwen3_moe_generate::run_qwen3_moe_generate(
&mapped,
&quantized,
&input_ids,
&gen_config,
) {
Ok(t) => t,
Err(e) => {
state.metrics.record_failure();
return Some(fail_response(
state,
StatusCode::INTERNAL_SERVER_ERROR,
format!("qwen3_moe generation failed: {e}"),
));
}
};

let generated_ids: Vec<u32> = tokens[input_ids.len()..].to_vec();
let completion_tokens = generated_ids.len();

let response_text = match tokenizer.decode(&generated_ids) {
Ok(t) => t,
Err(e) => {
state.metrics.record_failure();
return Some(fail_response(state, StatusCode::INTERNAL_SERVER_ERROR, e));
}
};

let duration = start.elapsed();
state.metrics.record_success(completion_tokens, duration);

Some(build_chat_response(
request_id.to_string(),
request.model.clone(),
response_text,
prompt_token_count,
completion_tokens,
max_tokens,
None,
duration,
))
}

Expand Down
7 changes: 7 additions & 0 deletions crates/aprender-serve/src/api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,13 @@ pub struct AppState {
Option<Arc<std::sync::Mutex<crate::safetensors_cuda::SafeTensorsCudaModel>>>,
/// GH-319: Cached model architecture string (avoids RwLock in hot path)
cached_architecture: Option<String>,
/// aprender#1789 Option B: retained MappedGGUFModel for MoE-aware HTTP
/// dispatch. `run_qwen3_moe_generate` borrows per-expert tensors
/// directly from the mmap, so the mapped model must outlive any
/// inference call. Held in an `Arc` to share between the chat handler
/// + any future streaming/batch backends. See
/// `contracts/qwen3-moe-serve-dispatch-v1.yaml` (V1_001 + V1_003).
mapped_gguf_model: Option<Arc<crate::gguf::MappedGGUFModel>>,
/// GH-330: Cached EOS token ID (avoids RwLock in hot path)
cached_eos_token_id: Option<u32>,
/// GH-152: Enable verbose request/response logging
Expand Down
32 changes: 32 additions & 0 deletions crates/aprender-serve/src/api/mod_app_state_gpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -112,6 +113,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -173,6 +175,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: arch,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -237,6 +240,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: arch,
mapped_gguf_model: None,
cached_eos_token_id: eos,
verbose: false,
trace: false,
Expand Down Expand Up @@ -290,6 +294,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: arch,
mapped_gguf_model: None,
cached_eos_token_id: eos,
verbose: false,
trace: false,
Expand Down Expand Up @@ -348,6 +353,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: Some(Arc::new(transformer)),
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -568,6 +574,7 @@ impl AppState {
apr_q4k_tx: Some(q4k_tx),
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: eos_id,
verbose: false,
trace: false,
Expand Down Expand Up @@ -613,6 +620,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand All @@ -639,6 +647,30 @@ impl AppState {
self
}

/// aprender#1789 Option B: builder to attach the retained
/// `MappedGGUFModel` for MoE-aware HTTP dispatch.
///
/// `run_qwen3_moe_generate` borrows per-expert tensors directly from
/// the mmap, so the mapped model must outlive any inference call. The
/// CLI server-command load path must retain its `MappedGGUFModel` in
/// an `Arc` + attach it here.
///
/// Consumes `self` and returns it with the `mapped_gguf_model` field set to
/// `Some(mapped)`; any mapping attached by an earlier call is replaced. The
/// attached value can be read back via `mapped_gguf_model()`.
#[must_use]
pub fn with_mapped_gguf_model(
mut self,
mapped: std::sync::Arc<crate::gguf::MappedGGUFModel>,
) -> Self {
// Plain overwrite: the previously stored `Arc` handle (if any) is dropped here.
self.mapped_gguf_model = Some(mapped);
self
}

/// aprender#1789 Option B: shared handle to the retained `MappedGGUFModel`.
///
/// Returns `None` when no mapping has been attached (see
/// `with_mapped_gguf_model`). Otherwise returns a new `Arc` handle to the same
/// mapped model; only the reference count is bumped, the model itself is not
/// copied. Used by the chat-completions handler to route qwen3_moe inference
/// through `run_qwen3_moe_generate`.
#[must_use]
pub fn mapped_gguf_model(&self) -> Option<std::sync::Arc<crate::gguf::MappedGGUFModel>> {
    self.mapped_gguf_model
        .as_ref()
        .map(std::sync::Arc::clone)
}

/// GH-319: Get model architecture from whichever backend is loaded.
///
/// Used for chat template auto-detection instead of hardcoding "qwen".
Expand Down
8 changes: 8 additions & 0 deletions crates/aprender-serve/src/api/mod_app_state_new.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -98,6 +99,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -197,6 +199,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -268,6 +271,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -321,6 +325,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -379,6 +384,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -432,6 +438,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down Expand Up @@ -498,6 +505,7 @@ impl AppState {
apr_q4k_tx: None,
apr_transformer: None,
cached_architecture: None,
mapped_gguf_model: None,
cached_eos_token_id: None,
verbose: false,
trace: false,
Expand Down
Loading
Loading