From 6ce55a9d716f6029b87763a6ab26dee2ad105dc3 Mon Sep 17 00:00:00 2001
From: JonGames <18472148+jongames@users.noreply.github.com>
Date: Thu, 25 Sep 2025 13:35:16 -0400
Subject: [PATCH] Fix reranking models being limited to 512 tokens input in llama.cpp backend

Signed-off-by: JonGames <18472148+jongames@users.noreply.github.com>
---
 backend/cpp/llama-cpp/grpc-server.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index d0de499461da..93bb07e2a1ea 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -231,6 +231,7 @@ static void params_parse(const backend::ModelOptions* request,
     params.cpuparams.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
+    params.n_ubatch = request->nbatch(); // fixes reranking models being limited to 512 tokens (the default n_ubatch size); lets the configured batch size set the maximum input token count, avoiding the error "input is too large to process. increase the physical batch size"
    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
    //params.n_parallel = 1;
    const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
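
For context, n_batch is llama.cpp's logical batch size while n_ubatch is the physical (micro) batch actually processed per compute step; embedding and reranking inputs must fit into a single physical batch, which is why the 512-token default caps rerank input. Below is a minimal sketch of the same idea against llama.cpp's public C API (llama_context_params / llama_context_default_params); the model-loading step is omitted and the values are illustrative, not taken from this patch:

    // Keep the physical batch (n_ubatch) at least as large as the logical
    // batch (n_batch) so long embedding/rerank inputs are not rejected with
    // "input is too large to process. increase the physical batch size".
    #include "llama.h"

    int main() {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_batch  = 8192; // logical batch: max tokens submitted per llama_decode call
        cparams.n_ubatch = 8192; // physical batch: defaults to 512, which limited rerank input
        // ... load a model and create the context with cparams ...
        return 0;
    }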