From 01cbd76c299aa31cf62844f7114ee3978f2ab009 Mon Sep 17 00:00:00 2001 From: mudler <2420543+mudler@users.noreply.github.com> Date: Mon, 25 May 2026 20:36:08 +0000 Subject: [PATCH 1/2] :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- backend/cpp/llama-cpp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index 98b934af132d..f2b199ba9d0f 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -1,5 +1,5 @@ -LLAMA_VERSION?=549b9d84330c327e6791fa812a7d60c0cf63572e +LLAMA_VERSION?=35c9b1f39ebe5a7bb83986d64415a079218be78d LLAMA_REPO?=https://github.com/ggerganov/llama.cpp CMAKE_ARGS?= From 9be4a679d09be6302d43f1ae0c2dc11543725151 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 25 May 2026 21:44:01 +0000 Subject: [PATCH 2/2] fix(llama-cpp): track upstream rename checkpoint_every_nt -> checkpoint_min_step Upstream llama.cpp renamed common_params::checkpoint_every_nt to checkpoint_min_step and changed its default from 8192 to 256. The semantics also shifted: it used to enforce a fixed checkpoint cadence during prefill; it now sets a minimum spacing between context checkpoints. Track the new field name in grpc-server.cpp and accept the old option names as backward-compatible aliases for users with existing configs.
Signed-off-by: Ettore Di Giacinto Assisted-by: claude-code:claude-opus-4-7 --- backend/cpp/llama-cpp/grpc-server.cpp | 22 ++++++++++++++-------- docs/content/features/text-generation.md | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 84c584f134a8..3681a21dc827 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -570,9 +570,11 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // kv_unified=false or cache_ram_mib=0, so flipping kv_unified above is // what actually unlocks it. params.cache_idle_slots = true; - // checkpoint_every_nt: create a context checkpoint every N tokens during - // prefill (-1 disables). Match upstream's default (8192). - params.checkpoint_every_nt = 8192; + // checkpoint_min_step: minimum spacing between context checkpoints in + // tokens (0 disables the minimum). Match upstream's default (256). This + // field was renamed from `checkpoint_every_nt` in llama.cpp; the semantics + // also shifted from a fixed cadence to a minimum spacing. + params.checkpoint_min_step = 256; // decode options. Options are in form optname:optvale, or if booleans only optname. for (int i = 0; i < request->options_size(); i++) { @@ -746,14 +748,18 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.cache_idle_slots = false; } - // --- prefill checkpoint cadence (upstream -cpent / --checkpoint-every-n-tokens) --- - // -1 disables checkpointing during prefill. - } else if (!strcmp(optname, "checkpoint_every_nt") || !strcmp(optname, "checkpoint_every_n_tokens")) { + // --- minimum context-checkpoint spacing (upstream -cms / --checkpoint-min-step) --- + // 0 disables the minimum-spacing gate. 
Old option names (`checkpoint_every_nt`, + // `checkpoint_every_n_tokens`) are kept as aliases for backward compatibility + // with existing user configs: upstream renamed the field and shifted its + // semantics from a fixed cadence to a minimum spacing. + } else if (!strcmp(optname, "checkpoint_min_step") || !strcmp(optname, "checkpoint_min_spacing") || + !strcmp(optname, "checkpoint_every_nt") || !strcmp(optname, "checkpoint_every_n_tokens")) { if (optval != NULL) { try { - params.checkpoint_every_nt = std::stoi(optval_str); + params.checkpoint_min_step = std::stoi(optval_str); } catch (const std::exception& e) { - // If conversion fails, keep default value (8192) + // If conversion fails, keep default value (256) } } diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md index ae2646e58ed7..b39377e73f4d 100644 --- a/docs/content/features/text-generation.md +++ b/docs/content/features/text-generation.md @@ -515,7 +515,7 @@ The `llama.cpp` backend supports additional configuration options that can be sp | `kv_unified` or `unified_kv` | boolean | Use a single unified KV buffer shared across all sequences. Default: `true` (LocalAI override; upstream defaults to `false` but auto-enables it when slot count is auto). **Required for `cache_idle_slots` to work**: without it the server force-disables idle-slot saving at init, and the prompt cache is never written across requests. | `kv_unified:false` | | `cache_idle_slots` or `idle_slots_cache` | boolean | On a new task, save the previous slot's KV state into the prompt cache (and clear the slot) so a later request with the same prefix can warm-load it. Default: `true`. Auto-disabled by the server if `kv_unified=false` or `cache_ram=0`. | `cache_idle_slots:false` | | `n_ctx_checkpoints` or `ctx_checkpoints` | integer | Maximum number of context checkpoints per slot (used for partial-prefix recovery, e.g. SWA). Default: `32`. 
| `ctx_checkpoints:16` | -| `checkpoint_every_nt` or `checkpoint_every_n_tokens` | integer | Create a context checkpoint every N tokens during prefill. `-1` disables checkpointing. Default: `8192`. | `checkpoint_every_nt:4096` | +| `checkpoint_min_step` or `checkpoint_min_spacing` (aliases: `checkpoint_every_nt`, `checkpoint_every_n_tokens`) | integer | Minimum spacing in tokens between context checkpoints. `0` disables the minimum-spacing gate. Default: `256`. (Renamed upstream from `checkpoint_every_nt`; semantics shifted from a fixed cadence to a minimum spacing, so values set through the legacy aliases — including the old `-1` disable sentinel — are now read as a minimum spacing and should be reviewed.) | `checkpoint_min_step:1024` | | `split_mode` or `sm` | string | How to split the model across multiple GPUs: `none` (single GPU only), `layer` (default — split layers and KV across GPUs), `row` (split rows across GPUs), `tensor` (experimental tensor parallelism — requires `flash_attention: true`, no KV-cache quantization, manually set `context_size`, and a llama.cpp build that includes [#19378](https://github.com/ggml-org/llama.cpp/pull/19378)). | `split_mode:tensor` | **Example configuration with options:**