mudler · mudler · Apr 13, 2026 · Apr 12, 2026 · Apr 12, 2026 · Apr 12, 2026
diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
@@ -53,6 +53,19 @@ jobs:
             dockerfile: "./backend/Dockerfile.python"
             context: "./"
             ubuntu-version: '2204'
+          # CPU-only vllm backend image (tag suffix '-cpu-vllm'); no CUDA version is set.
+          - build-type: ''
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-cpu-vllm'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:24.04"
+            skip-drivers: 'true'
+            backend: "vllm"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./"
+            ubuntu-version: '2404'
           - build-type: ''
             cuda-major-version: ""
             cuda-minor-version: ""

diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
@@ -31,6 +31,7 @@ jobs:
       llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
       llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
       ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
+      vllm: ${{ steps.detect.outputs.vllm }}
       acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
       qwen3-tts-cpp: ${{ steps.detect.outputs.qwen3-tts-cpp }}
       voxtral: ${{ steps.detect.outputs.voxtral }}
@@ -501,6 +502,52 @@ jobs:
       - name: Build ik-llama-cpp backend image and run gRPC e2e tests
         run: |
           make test-extra-backend-ik-llama-cpp
+  # tests-vllm-grpc is currently disabled in CI.
+  #
+  # The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16
+  # instructions, and neither ubuntu-latest nor the bigger-runner pool
+  # offers a stable CPU baseline that supports them — runners come
+  # back with different hardware between runs and SIGILL on import of
+  # vllm.model_executor.models.registry. Compiling vllm from source
+  # via FROM_SOURCE=true works on any CPU but takes 30-50 minutes per
+  # run, which is too slow for a smoke test.
+  #
+  # The test itself (tests/e2e-backends + make test-extra-backend-vllm)
+  # is fully working and validated locally on a host with the right
+  # SIMD baseline. Run it manually with:
+  #
+  #   make test-extra-backend-vllm
+  #
+  # Re-enable this job once we have a self-hosted runner label with
+  # guaranteed AVX-512 VNNI/BF16 support, or once the vllm project
+  # publishes a CPU wheel with a wider baseline.
+  #
+  # tests-vllm-grpc:
+  #   needs: detect-changes
+  #   if: needs.detect-changes.outputs.vllm == 'true' || needs.detect-changes.outputs.run-all == 'true'
+  #   runs-on: bigger-runner
+  #   timeout-minutes: 90
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v6
+  #       with:
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install -y --no-install-recommends \
+  #             make build-essential curl unzip ca-certificates git tar
+  #     - name: Setup Go
+  #       uses: actions/setup-go@v5
+  #       with:
+  #         go-version: '1.25.4'
+  #     - name: Free disk space
+  #       run: |
+  #         sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /opt/hostedtoolcache/CodeQL || true
+  #         df -h
+  #     - name: Build vllm (cpu) backend image and run gRPC e2e tests
+  #       run: |
+  #         make test-extra-backend-vllm
   tests-acestep-cpp:
     needs: detect-changes
     if: needs.detect-changes.outputs.acestep-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'

diff --git a/Makefile b/Makefile
@@ -466,8 +466,14 @@ test-extra: prepare-test-extra
 ##   BACKEND_IMAGE            Required. Docker image to test, e.g. local-ai-backend:llama-cpp.
 ##   BACKEND_TEST_MODEL_URL   URL of a model file to download and load.
 ##   BACKEND_TEST_MODEL_FILE  Path to an already-downloaded model (skips download).
+##   BACKEND_TEST_MODEL_NAME  HuggingFace repo id (e.g. Qwen/Qwen2.5-0.5B-Instruct).
+##                            Use this instead of BACKEND_TEST_MODEL_URL for backends
+##                            that resolve HF model ids natively (vllm, vllm-omni).
 ##   BACKEND_TEST_CAPS        Comma-separated capabilities, default "health,load,predict,stream".
+##                            Add "tools" to exercise ChatDelta tool-call extraction.
 ##   BACKEND_TEST_PROMPT      Override the prompt used in predict/stream specs.
+##   BACKEND_TEST_OPTIONS     Comma-separated Options[] entries forwarded to LoadModel,
+##                            e.g. "tool_parser:hermes,reasoning_parser:qwen3".
 ##
 ## Direct usage (image already built, no docker-build-* dependency):
 ##
@@ -486,9 +492,13 @@ test-extra-backend: protogen-go
 	BACKEND_IMAGE="$$BACKEND_IMAGE" \
 	BACKEND_TEST_MODEL_URL="$${BACKEND_TEST_MODEL_URL:-$(BACKEND_TEST_MODEL_URL)}" \
 	BACKEND_TEST_MODEL_FILE="$$BACKEND_TEST_MODEL_FILE" \
+	BACKEND_TEST_MODEL_NAME="$$BACKEND_TEST_MODEL_NAME" \
 	BACKEND_TEST_CAPS="$$BACKEND_TEST_CAPS" \
 	BACKEND_TEST_PROMPT="$$BACKEND_TEST_PROMPT" \
-	go test -v -timeout 15m ./tests/e2e-backends/...
+	BACKEND_TEST_OPTIONS="$$BACKEND_TEST_OPTIONS" \
+	BACKEND_TEST_TOOL_PROMPT="$$BACKEND_TEST_TOOL_PROMPT" \
+	BACKEND_TEST_TOOL_NAME="$$BACKEND_TEST_TOOL_NAME" \
+	go test -v -timeout 30m ./tests/e2e-backends/...
 
 ## Convenience wrappers: build the image, then exercise it.
 test-extra-backend-llama-cpp: docker-build-llama-cpp
@@ -497,6 +507,18 @@ test-extra-backend-llama-cpp: docker-build-llama-cpp
 test-extra-backend-ik-llama-cpp: docker-build-ik-llama-cpp
 	BACKEND_IMAGE=local-ai-backend:ik-llama-cpp $(MAKE) test-extra-backend
 
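+## vllm is resolved from a HuggingFace model id (no file download) and
+## exercises Predict + streaming + tool-call extraction via the hermes parser.
+## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
+## wheel was compiled against (AVX-512 VNNI/BF16); older CPUs will SIGILL
+## on import. Neither ubuntu-latest nor the bigger-runner pool guarantees
+## that baseline, which is why the tests-vllm-grpc job in
+## .github/workflows/test-extra.yml is disabled. Building with
+## FROM_SOURCE=true compiles vllm from source and works on any CPU, at the
+## cost of a much longer build.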
+test-extra-backend-vllm: docker-build-vllm
+	BACKEND_IMAGE=local-ai-backend:vllm \
+	BACKEND_TEST_MODEL_NAME=Qwen/Qwen2.5-0.5B-Instruct \
+	BACKEND_TEST_CAPS=health,load,predict,stream,tools \
+	BACKEND_TEST_OPTIONS=tool_parser:hermes \
+	$(MAKE) test-extra-backend
+
 DOCKER_IMAGE?=local-ai
 IMAGE_TYPE?=core
 BASE_IMAGE?=ubuntu:24.04
@@ -650,6 +672,7 @@ define docker-build-backend
 		--build-arg CUDA_MINOR_VERSION=$(CUDA_MINOR_VERSION) \
 		--build-arg UBUNTU_VERSION=$(UBUNTU_VERSION) \
 		--build-arg UBUNTU_CODENAME=$(UBUNTU_CODENAME) \
+		$(if $(FROM_SOURCE),--build-arg FROM_SOURCE=$(FROM_SOURCE)) \
 		$(if $(filter true,$(5)),--build-arg BACKEND=$(1)) \
 		-t local-ai-backend:$(1) -f backend/Dockerfile.$(2) $(3)
 endef

diff --git a/backend/Dockerfile.python b/backend/Dockerfile.python
@@ -29,6 +29,7 @@ RUN apt-get update && \
         curl python3-pip \
         python-is-python3 \
         python3-dev llvm \
+        libnuma1 libgomp1 \
         python3-venv make cmake && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -195,6 +196,12 @@ COPY backend/backend.proto /${BACKEND}/backend.proto
 COPY backend/python/common/ /${BACKEND}/common
 COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh
 
+# Optional per-backend source build toggle (e.g. vllm on CPU can set
+# FROM_SOURCE=true to compile against the build host SIMD instead of
+# pulling a prebuilt wheel). Default empty — most backends ignore it.
+ARG FROM_SOURCE=""
+ENV FROM_SOURCE=${FROM_SOURCE}
+
 RUN cd /${BACKEND} && PORTABLE_PYTHON=true make
 
 # Package GPU libraries into the backend's lib directory

diff --git a/backend/index.yaml b/backend/index.yaml
@@ -132,7 +132,7 @@
  capabilities:
    nvidia: "cuda12-rfdetr"
    intel: "intel-rfdetr"
    #amd: "rocm-rfdetr"
    nvidia-l4t: "nvidia-l4t-arm64-rfdetr"
    metal: "metal-rfdetr"
    default: "cpu-rfdetr"
@@ -197,6 +197,7 @@
     amd: "rocm-vllm"
     intel: "intel-vllm"
     nvidia-cuda-12: "cuda12-vllm"
+    cpu: "cpu-vllm"
 - &vllm-omni
   name: "vllm-omni"
   license: apache-2.0
@@ -1563,6 +1564,7 @@
     nvidia: "cuda12-vllm-development"
     amd: "rocm-vllm-development"
     intel: "intel-vllm-development"
+    cpu: "cpu-vllm-development"
 - !!merge <<: *vllm
   name: "cuda12-vllm"
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
@@ -1578,6 +1580,11 @@
   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-vllm"
   mirrors:
     - localai/localai-backends:latest-gpu-intel-vllm
+- !!merge <<: *vllm
+  name: "cpu-vllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-vllm"
+  mirrors:
+    - localai/localai-backends:latest-cpu-vllm
 - !!merge <<: *vllm
   name: "cuda12-vllm-development"
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm"
@@ -1593,6 +1600,11 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-vllm"
   mirrors:
     - localai/localai-backends:master-gpu-intel-vllm
+- !!merge <<: *vllm
+  name: "cpu-vllm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-vllm"
+  mirrors:
+    - localai/localai-backends:master-cpu-vllm
 # vllm-omni
 - !!merge <<: *vllm-omni
   name: "vllm-omni-development"
@@ -1626,7 +1638,7 @@
  capabilities:
    nvidia: "cuda12-rfdetr-development"
    intel: "intel-rfdetr-development"
    #amd: "rocm-rfdetr-development"
    nvidia-l4t: "nvidia-l4t-arm64-rfdetr-development"
    metal: "metal-rfdetr-development"
    default: "cpu-rfdetr-development"

diff --git a/backend/python/common/vllm_utils.py b/backend/python/common/vllm_utils.py
@@ -0,0 +1,84 @@
+"""Shared utilities for vLLM-based backends."""
+import json
+import sys
+
+
+def parse_options(options_list):
+    """Turn a list of 'key:value' option strings into a dict.
+
+    Each entry is split on its first ':'; key and value are stripped of
+    surrounding whitespace. Entries without a ':' are skipped. When a key
+    appears more than once, the last entry wins.
+
+    Values are coerced in this order: 'true'/'false' (any case) -> bool,
+    then int, then float; anything else is kept as the stripped string.
+    """
+    def _coerce(text):
+        # Booleans are checked first so "true"/"false" never reach int()/float().
+        lowered = text.lower()
+        if lowered in ("true", "false"):
+            return lowered == "true"
+        # int before float so "12" stays an integer instead of becoming 12.0.
+        for cast in (int, float):
+            try:
+                return cast(text)
+            except ValueError:
+                continue
+        return text
+
+    parsed = {}
+    for entry in options_list:
+        name, sep, raw = entry.partition(":")
+        if not sep:
+            # No ':' separator at all: not a key/value option, ignore it.
+            continue
+        parsed[name.strip()] = _coerce(raw.strip())
+    return parsed
+
+
+def messages_to_dicts(proto_messages):
+    """Convert proto Message objects to a list of dicts for apply_chat_template().
+
+    Every dict carries ``role`` and ``content`` (an empty string when the
+    message has no content). ``name``, ``tool_call_id`` and
+    ``reasoning_content`` are copied only when they are set on the message.
+    ``tool_calls`` arrives as a JSON string and is decoded before being
+    stored; a payload that fails to decode is left out of the dict and
+    reported on stderr, so a malformed message never aborts the conversion.
+    """
+    result = []
+    for msg in proto_messages:
+        d = {"role": msg.role, "content": msg.content or ""}
+        # Optional scalar fields: copy only the ones that are non-empty.
+        for field in ("name", "tool_call_id", "reasoning_content"):
+            value = getattr(msg, field)
+            if value:
+                d[field] = value
+        if msg.tool_calls:
+            try:
+                d["tool_calls"] = json.loads(msg.tool_calls)
+            except json.JSONDecodeError as e:
+                # Best effort: keep the message, drop the unparseable payload,
+                # but leave a trace instead of discarding it silently.
+                print(f"[vllm_utils] Ignoring malformed tool_calls JSON on {msg.role} message: {e}", file=sys.stderr)
+        result.append(d)
+    return result
+
+
+def setup_parsers(opts):
+    """Resolve the tool-call and reasoning parser classes named in ``opts``.
+
+    Reads the ``tool_parser`` and ``reasoning_parser`` keys and looks each
+    name up through vLLM's ToolParserManager / ReasoningParserManager.
+    vLLM is imported lazily, and only for a parser that was requested.
+
+    Returns a ``(tool_parser_cls, reasoning_parser_cls)`` tuple. An entry is
+    None when its option is missing or empty, or when the import or lookup
+    raised; such failures are reported on stderr rather than propagated.
+    """
+    wanted_tool = opts.get("tool_parser")
+    wanted_reasoning = opts.get("reasoning_parser")
+
+    tool_cls = None
+    if wanted_tool:
+        try:
+            from vllm.tool_parsers import ToolParserManager
+            tool_cls = ToolParserManager.get_tool_parser(wanted_tool)
+            print(f"[vllm_utils] Loaded tool_parser: {wanted_tool}", file=sys.stderr)
+        except Exception as err:
+            # Covers both a missing vllm install and an unknown parser name.
+            print(f"[vllm_utils] Failed to load tool_parser {wanted_tool}: {err}", file=sys.stderr)
+
+    reasoning_cls = None
+    if wanted_reasoning:
+        try:
+            from vllm.reasoning import ReasoningParserManager
+            reasoning_cls = ReasoningParserManager.get_reasoning_parser(wanted_reasoning)
+            print(f"[vllm_utils] Loaded reasoning_parser: {wanted_reasoning}", file=sys.stderr)
+        except Exception as err:
+            print(f"[vllm_utils] Failed to load reasoning_parser {wanted_reasoning}: {err}", file=sys.stderr)
+
+    return tool_cls, reasoning_cls