From 7b963364449debbf4968e7be3253949c15ddfc09 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 5 Nov 2025 18:52:18 +0100 Subject: [PATCH 01/14] feat(llama.cpp): expose env vars as options for consistency This allows everything to be configured in the model's YAML file rather than through global configuration Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 59 ++++++++++++++++++++------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index a6c610106d4c..0fae12b2fe48 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -254,26 +254,15 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions params.n_gpu_layers = request->ngpulayers(); params.n_batch = request->nbatch(); params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. increase the physical batch size" - // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1 - //params.n_parallel = 1; - const char *env_parallel = std::getenv("LLAMACPP_PARALLEL"); - if (env_parallel != NULL) { - params.n_parallel = std::stoi(env_parallel); - params.cont_batching = true; - } else { - params.n_parallel = 1; - } - - - const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS"); - if (llama_grpc_servers != NULL) { - add_rpc_devices(std::string(llama_grpc_servers)); - } // Initialize ctx_shift to false by default (can be overridden by options) params.ctx_shift = false; // Initialize cache_ram_mib to -1 by default (no limit, can be overridden by options) params.cache_ram_mib = -1; + // Initialize n_parallel to 1 by default (can be overridden by options) + params.n_parallel = 1; + // Initialize grpc_servers to empty (can be overridden by options) + std::string grpc_servers_option = ""; // decode options. Options are in form optname:optvale, or if booleans only optname. 
for (int i = 0; i < request->options_size(); i++) { @@ -298,6 +287,46 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions // If conversion fails, keep default value (-1) } } + } else if (!strcmp(optname, "parallel") || !strcmp(optname, "n_parallel")) { + if (optval != NULL) { + try { + params.n_parallel = std::stoi(optval); + if (params.n_parallel > 1) { + params.cont_batching = true; + } + } catch (const std::exception& e) { + // If conversion fails, keep default value (1) + } + } + } else if (!strcmp(optname, "grpc_servers") || !strcmp(optname, "rpc_servers")) { + if (optval != NULL) { + grpc_servers_option = std::string(optval); + } + } + } + + // Set params.n_parallel from environment variable if not set via options (fallback) + if (params.n_parallel == 1) { + const char *env_parallel = std::getenv("LLAMACPP_PARALLEL"); + if (env_parallel != NULL) { + try { + params.n_parallel = std::stoi(env_parallel); + if (params.n_parallel > 1) { + params.cont_batching = true; + } + } catch (const std::exception& e) { + // If conversion fails, keep default value (1) + } + } + } + + // Add RPC devices from option or environment variable (fallback) + if (!grpc_servers_option.empty()) { + add_rpc_devices(grpc_servers_option); + } else { + const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS"); + if (llama_grpc_servers != NULL) { + add_rpc_devices(std::string(llama_grpc_servers)); } } From 657676e4e0c16e9a71e6baf1e59700f01258af28 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 5 Nov 2025 18:53:43 +0100 Subject: [PATCH 02/14] feat(llama.cpp): respect usetokenizertemplate and use llama.cpp templating system to process messages Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 104 +++++++++++++++++++++----- 1 file changed, 85 insertions(+), 19 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 0fae12b2fe48..b62117773d3e 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -57,9 +57,8 @@ static void start_llama_server(server_context& ctx_server) { // common_chat_templates_source(ctx_server.chat_templates.get()), // common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str(), ctx_server.params_base.default_template_kwargs); - // Reset the chat templates - // TODO: We should make this configurable by respecting the option that is already present in LocalAI for vLLM - ctx_server.chat_templates.reset(); + // Keep the chat templates initialized in load_model() so they can be used when UseTokenizerTemplate is enabled + // Templates will only be used conditionally in Predict/PredictStream when UseTokenizerTemplate is true and Messages are provided ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) { ctx_server.process_single_task(std::move(task)); @@ -115,7 +114,11 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const data["n_keep"] = predict->nkeep(); data["seed"] = predict->seed(); data["grammar"] = predict->grammar(); - data["prompt"] = predict->prompt(); + // Only set prompt if UseTokenizerTemplate is false or if no Messages are provided + // When UseTokenizerTemplate is true and Messages are provided, prompt will be set via chat templates in Predict/PredictStream + if (!predict->usetokenizertemplate() || predict->messages_size() == 0) { + data["prompt"] = predict->prompt(); + } data["ignore_eos"] = predict->ignoreeos(); 
data["embeddings"] = predict->embeddings(); // TODO: add back json_schema and let this be controlled by the user @@ -524,7 +527,43 @@ class BackendServiceImpl final : public backend::Backend::Service { try { std::vector tasks; - const auto & prompt = data.at("prompt"); + std::string prompt_str; + // Handle chat templates when UseTokenizerTemplate is enabled and Messages are provided + if (request->usetokenizertemplate() && request->messages_size() > 0 && ctx_server.chat_templates != nullptr) { + // Convert proto Messages to JSON format + json messages_json = json::array(); + for (int i = 0; i < request->messages_size(); i++) { + const auto& msg = request->messages(i); + json msg_json; + msg_json["role"] = msg.role(); + msg_json["content"] = msg.content(); + messages_json.push_back(msg_json); + } + + // Parse messages using llama.cpp's chat message parser + auto chat_messages = common_chat_msgs_parse_oaicompat(messages_json); + + // Prepare chat template inputs + common_chat_templates_inputs inputs; + inputs.messages = chat_messages; + inputs.grammar = data.value("grammar", ""); + inputs.use_jinja = ctx_server.params_base.use_jinja; + inputs.add_generation_prompt = true; + inputs.chat_template_kwargs = ctx_server.params_base.default_template_kwargs; + + // Apply chat template + auto chat_params = common_chat_templates_apply(ctx_server.chat_templates.get(), inputs); + prompt_str = chat_params.prompt; + } else { + // Use prompt directly from data + if (data.contains("prompt") && data["prompt"].is_string()) { + prompt_str = data["prompt"].get(); + } else { + prompt_str = request->prompt(); + } + } + + const auto & prompt = prompt_str; const auto type = SERVER_TASK_TYPE_COMPLETION; // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); @@ -554,16 +593,12 @@ class BackendServiceImpl final : public backend::Backend::Service { // process prompt std::vector inputs; - if (!prompt.is_string()) { - throw std::runtime_error("prompt must be a string"); - } - if (has_mtmd) { // multimodal - inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get(), files)); + inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt_str, files)); } else { // Everything else, including multimodal completions. 
- inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); + inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt_str, true, true); } tasks.reserve(inputs.size()); @@ -673,7 +708,43 @@ class BackendServiceImpl final : public backend::Backend::Service { try { std::vector tasks; - const auto & prompt = data.at("prompt"); + std::string prompt_str; + // Handle chat templates when UseTokenizerTemplate is enabled and Messages are provided + if (request->usetokenizertemplate() && request->messages_size() > 0 && ctx_server.chat_templates != nullptr) { + // Convert proto Messages to JSON format + json messages_json = json::array(); + for (int i = 0; i < request->messages_size(); i++) { + const auto& msg = request->messages(i); + json msg_json; + msg_json["role"] = msg.role(); + msg_json["content"] = msg.content(); + messages_json.push_back(msg_json); + } + + // Parse messages using llama.cpp's chat message parser + auto chat_messages = common_chat_msgs_parse_oaicompat(messages_json); + + // Prepare chat template inputs + common_chat_templates_inputs inputs; + inputs.messages = chat_messages; + inputs.grammar = data.value("grammar", ""); + inputs.use_jinja = ctx_server.params_base.use_jinja; + inputs.add_generation_prompt = true; + inputs.chat_template_kwargs = ctx_server.params_base.default_template_kwargs; + + // Apply chat template + auto chat_params = common_chat_templates_apply(ctx_server.chat_templates.get(), inputs); + prompt_str = chat_params.prompt; + } else { + // Use prompt directly from data + if (data.contains("prompt") && data["prompt"].is_string()) { + prompt_str = data["prompt"].get(); + } else { + prompt_str = request->prompt(); + } + } + + const auto & prompt = prompt_str; const auto type = SERVER_TASK_TYPE_COMPLETION; // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); @@ -708,17 +779,12 @@ class BackendServiceImpl final : public backend::Backend::Service { // process prompt std::vector inputs; - if (!prompt.is_string()) { - std::cout << "[PREDICT] Prompt must be a string" << std::endl; - throw std::runtime_error("prompt must be a string"); - } - if (has_mtmd) { // multimodal - inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get(), files)); + inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt_str, files)); } else { // Everything else, including multimodal completions. 
- inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); + inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt_str, true, true); } tasks.reserve(inputs.size()); From 0ff7c03a9901426e771dc4c2fc952daaa6e5dd5a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 5 Nov 2025 19:24:58 +0100 Subject: [PATCH 03/14] WIP Signed-off-by: Ettore Di Giacinto --- backend/backend.proto | 1 + backend/cpp/llama-cpp/grpc-server.cpp | 106 +++++++++++++++++++++++--- core/config/model_config.go | 19 +++++ core/http/endpoints/openai/chat.go | 90 ++++++++++++++++++---- 4 files changed, 188 insertions(+), 28 deletions(-) diff --git a/backend/backend.proto b/backend/backend.proto index f7bcf79726ff..a1788f46d80e 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -154,6 +154,7 @@ message PredictOptions { repeated string Videos = 45; repeated string Audios = 46; string CorrelationId = 47; + string JsonSchema = 48; // JSON schema for grammar generation (when use_llama_grammar is enabled) } // The response message containing the result diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index b62117773d3e..c08925a44e49 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -91,7 +91,7 @@ static void start_llama_server(server_context& ctx_server) { ctx_server.queue_tasks.start_loop(); } -json parse_options(bool streaming, const backend::PredictOptions* predict, const server_context& ctx_server) +json parse_options(bool streaming, const backend::PredictOptions* predict, const server_context& ctx_server, bool use_llama_grammar = false) { // Create now a json data from the prediction options instead @@ -113,7 +113,43 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const data["mirostat_eta"] = predict->mirostateta(); data["n_keep"] = predict->nkeep(); data["seed"] = predict->seed(); - data["grammar"] = predict->grammar(); + + // Handle grammar/json_schema based on use_llama_grammar flag + // Priority: JsonSchema field > grammar field (when use_llama_grammar is enabled) + std::string json_schema_str = predict->jsonschema(); + std::string grammar_str = predict->grammar(); + + if (!json_schema_str.empty()) { + // JsonSchema field is set - use it directly (highest priority) + try { + json json_schema_obj = json::parse(json_schema_str); + data["json_schema"] = json_schema_obj; + // Don't set grammar when json_schema is provided (llama.cpp requirement) + } catch (const json::parse_error& e) { + // If json_schema is invalid JSON, fall back to grammar + if (!grammar_str.empty()) { + data["grammar"] = grammar_str; + } + } + } else if (use_llama_grammar && !grammar_str.empty()) { + // use_llama_grammar is enabled and no JsonSchema field - try to parse grammar as JSON + // This is a fallback for backward compatibility + try { + json test_json = json::parse(grammar_str); + // If parsing succeeds, it's JSON - pass as json_schema + data["json_schema"] = test_json; + // Don't set grammar when json_schema is provided (llama.cpp requirement) + } catch (const json::parse_error&) { + // Not valid JSON, use as regular grammar + data["grammar"] = grammar_str; + } + } else { + // Normal behavior: use grammar as-is + if (!grammar_str.empty()) { + data["grammar"] = grammar_str; + } + } + // Only set prompt if UseTokenizerTemplate is false or if no Messages are provided // When UseTokenizerTemplate is true and Messages are provided, prompt will be set via chat 
templates in Predict/PredictStream if (!predict->usetokenizertemplate() || predict->messages_size() == 0) { @@ -121,8 +157,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const } data["ignore_eos"] = predict->ignoreeos(); data["embeddings"] = predict->embeddings(); - // TODO: add back json_schema and let this be controlled by the user - // data["json_schema"] = predict->jsonschema(); // Add the correlationid to json data data["correlation_id"] = predict->correlationid(); @@ -233,7 +267,7 @@ static void add_rpc_devices(std::string servers) { } static void params_parse(server_context& ctx_server, const backend::ModelOptions* request, - common_params & params) { + common_params & params, bool& use_llama_grammar_out) { // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809 @@ -307,6 +341,25 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions } } } + + // Parse use_llama_grammar option separately since we need to store it in BackendServiceImpl + // We'll set it in LoadModel after parsing options + bool use_llama_grammar_option = false; + for (int i = 0; i < request->options_size(); i++) { + std::string opt = request->options(i); + char *optname = strtok(&opt[0], ":"); + char *optval = strtok(NULL, ":"); + if (optval == NULL) { + optval = "true"; + } + + if (!strcmp(optname, "use_llama_grammar") || !strcmp(optname, "llama_grammar")) { + if (!strcmp(optval, "true") || !strcmp(optval, "1") || !strcmp(optval, "yes") || !strcmp(optval, "on") || !strcmp(optval, "enabled")) { + use_llama_grammar_option = true; + } + } + } + use_llama_grammar_out = use_llama_grammar_option; // Set params.n_parallel from environment variable if not set via options (fallback) if (params.n_parallel == 1) { @@ -438,6 +491,7 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions class BackendServiceImpl final : public backend::Backend::Service { private: server_context& ctx_server; + bool use_llama_grammar = false; // Flag to enable llama.cpp grammar generation from json_schema public: BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {} @@ -451,7 +505,9 @@ class BackendServiceImpl final : public backend::Backend::Service { grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) { // Implement LoadModel RPC common_params params; - params_parse(ctx_server, request, params); + bool use_llama_grammar_flag = false; + params_parse(ctx_server, request, params, use_llama_grammar_flag); + use_llama_grammar = use_llama_grammar_flag; common_init(); @@ -513,7 +569,7 @@ class BackendServiceImpl final : public backend::Backend::Service { } grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter* writer) override { - json data = parse_options(true, request, ctx_server); + json data = parse_options(true, request, ctx_server, use_llama_grammar); //Raise error if embeddings is set to true @@ -541,15 +597,28 @@ class BackendServiceImpl final : public backend::Backend::Service { } // Parse messages using llama.cpp's chat message parser + // This will automatically extract tool_calls from messages if they are embedded in the JSON + // (tool_calls are typically embedded in assistant messages in OpenAI format) auto chat_messages = common_chat_msgs_parse_oaicompat(messages_json); // Prepare chat template inputs common_chat_templates_inputs inputs; 
inputs.messages = chat_messages; - inputs.grammar = data.value("grammar", ""); + // Grammars are fully supported - passed from request->grammar() or request->jsonschema() + // When json_schema is provided, it takes precedence over grammar + if (data.contains("json_schema")) { + // json_schema is already a JSON object in data, convert to string for inputs + inputs.json_schema = data["json_schema"].dump(); + inputs.grammar = ""; // Don't set grammar when json_schema is provided (llama.cpp requirement) + } else { + inputs.grammar = data.value("grammar", ""); + inputs.json_schema = ""; // Not provided, use empty string + } inputs.use_jinja = ctx_server.params_base.use_jinja; inputs.add_generation_prompt = true; inputs.chat_template_kwargs = ctx_server.params_base.default_template_kwargs; + // Tool calls are embedded in messages and will be parsed by common_chat_msgs_parse_oaicompat + // tools and tool_choice use defaults (empty tools vector, COMMON_CHAT_TOOL_CHOICE_AUTO) // Apply chat template auto chat_params = common_chat_templates_apply(ctx_server.chat_templates.get(), inputs); @@ -695,7 +764,7 @@ class BackendServiceImpl final : public backend::Backend::Service { } grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) { - json data = parse_options(true, request, ctx_server); + json data = parse_options(true, request, ctx_server, use_llama_grammar); data["stream"] = false; //Raise error if embeddings is set to true @@ -722,15 +791,28 @@ class BackendServiceImpl final : public backend::Backend::Service { } // Parse messages using llama.cpp's chat message parser + // This will automatically extract tool_calls from messages if they are embedded in the JSON + // (tool_calls are typically embedded in assistant messages in OpenAI format) auto chat_messages = common_chat_msgs_parse_oaicompat(messages_json); // Prepare chat template inputs common_chat_templates_inputs inputs; inputs.messages = chat_messages; - inputs.grammar = data.value("grammar", ""); + // Grammars are fully supported - passed from request->grammar() or request->jsonschema() + // When json_schema is provided, it takes precedence over grammar + if (data.contains("json_schema")) { + // json_schema is already a JSON object in data, convert to string for inputs + inputs.json_schema = data["json_schema"].dump(); + inputs.grammar = ""; // Don't set grammar when json_schema is provided (llama.cpp requirement) + } else { + inputs.grammar = data.value("grammar", ""); + inputs.json_schema = ""; // Not provided, use empty string + } inputs.use_jinja = ctx_server.params_base.use_jinja; inputs.add_generation_prompt = true; inputs.chat_template_kwargs = ctx_server.params_base.default_template_kwargs; + // Tool calls are embedded in messages and will be parsed by common_chat_msgs_parse_oaicompat + // tools and tool_choice use defaults (empty tools vector, COMMON_CHAT_TOOL_CHOICE_AUTO) // Apply chat template auto chat_params = common_chat_templates_apply(ctx_server.chat_templates.get(), inputs); @@ -861,7 +943,7 @@ class BackendServiceImpl final : public backend::Backend::Service { grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) { - json body = parse_options(false, request, ctx_server); + json body = parse_options(false, request, ctx_server, use_llama_grammar); body["stream"] = false; @@ -1042,7 +1124,7 @@ class BackendServiceImpl final : public backend::Backend::Service { } grpc::Status 
TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) { - json body = parse_options(false, request, ctx_server); + json body = parse_options(false, request, ctx_server, use_llama_grammar); body["stream"] = false; json tokens_response = json::array(); diff --git a/core/config/model_config.go b/core/config/model_config.go index a5bd65cdcd69..470646a182eb 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -46,6 +46,7 @@ type ModelConfig struct { functionCallString, functionCallNameString string `yaml:"-" json:"-"` ResponseFormat string `yaml:"-" json:"-"` ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"` + JsonSchema string `yaml:"-" json:"-"` // JSON schema string (when use_llama_grammar is enabled) FunctionsConfig functions.FunctionsConfig `yaml:"function" json:"function"` @@ -499,6 +500,24 @@ func (c *ModelConfig) GetModelConfigFile() string { return c.modelConfigFile } +// HasOption checks if a specific option is enabled in the Options slice +// Option format can be "option_name" or "option_name:value" +func (c *ModelConfig) HasOption(optionName string) bool { + for _, opt := range c.Options { + // Split by colon to get option name and value + parts := strings.SplitN(opt, ":", 2) + if len(parts) > 0 && parts[0] == optionName { + // Check if value is truthy (if provided) + if len(parts) == 1 { + return true // No value means enabled + } + value := strings.ToLower(strings.TrimSpace(parts[1])) + return value == "true" || value == "1" || value == "yes" || value == "on" || value == "enabled" + } + } + return false +} + type ModelConfigUsecases int const ( diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 4e91a532983c..1b57346ce691 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -241,14 +241,36 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator if err != nil { return err } - fs := &functions.JSONFunctionStructure{ - AnyOf: []functions.Item{d.JsonSchema.Schema}, - } - g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - input.Grammar = g + // Check if use_llama_grammar option is enabled + if config.HasOption("use_llama_grammar") || config.HasOption("llama_grammar") { + // Pass json_schema directly to llama.cpp for grammar generation + schemaBytes, err := json.Marshal(d.JsonSchema.Schema) + if err == nil { + config.JsonSchema = string(schemaBytes) + } else { + log.Error().Err(err).Msg("Failed marshaling json_schema") + // Fallback to generating grammar + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{d.JsonSchema.Schema}, + } + g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + input.Grammar = g + } else { + log.Error().Err(err).Msg("Failed generating grammar") + } + } } else { - log.Error().Err(err).Msg("Failed generating grammar") + // Generate grammar using LocalAI's implementation + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{d.JsonSchema.Schema}, + } + g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) 
+ if err == nil { + input.Grammar = g + } else { + log.Error().Err(err).Msg("Failed generating grammar") + } } } } @@ -283,20 +305,56 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator funcs = funcs.Select(config.FunctionToCall()) } - // Update input grammar + // Update input grammar or json_schema based on use_llama_grammar option jsStruct := funcs.ToJSONStructure(config.FunctionsConfig.FunctionNameKey, config.FunctionsConfig.FunctionNameKey) - g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - config.Grammar = g + if config.HasOption("use_llama_grammar") || config.HasOption("llama_grammar") { + // Pass json_schema directly to llama.cpp for grammar generation + schemaBytes, err := json.Marshal(jsStruct) + if err == nil { + config.JsonSchema = string(schemaBytes) + } else { + log.Error().Err(err).Msg("Failed marshaling json_schema for functions") + // Fallback to generating grammar + g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + config.Grammar = g + } else { + log.Error().Err(err).Msg("Failed generating grammar") + } + } } else { - log.Error().Err(err).Msg("Failed generating grammar") + // Generate grammar using LocalAI's implementation + g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + config.Grammar = g + } else { + log.Error().Err(err).Msg("Failed generating grammar") + } } case input.JSONFunctionGrammarObject != nil: - g, err := input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - config.Grammar = g + if config.HasOption("use_llama_grammar") || config.HasOption("llama_grammar") { + // Pass json_schema directly to llama.cpp for grammar generation + schemaBytes, err := json.Marshal(input.JSONFunctionGrammarObject) + if err == nil { + config.JsonSchema = string(schemaBytes) + } else { + log.Error().Err(err).Msg("Failed marshaling json_schema from JSONFunctionGrammarObject") + // Fallback to generating grammar + g, err := input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + config.Grammar = g + } else { + log.Error().Err(err).Msg("Failed generating grammar") + } + } } else { - log.Error().Err(err).Msg("Failed generating grammar") + // Generate grammar using LocalAI's implementation + g, err := input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarOptions()...) 
+ if err == nil { + config.Grammar = g + } else { + log.Error().Err(err).Msg("Failed generating grammar") + } } default: // Force picking one of the functions by the request From 1d8a9eec3c140e212f20d7f51e08c18feecc0ed8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 5 Nov 2025 21:49:13 +0100 Subject: [PATCH 04/14] Detect that a template exists if use tokenizer template is enabled Signed-off-by: Ettore Di Giacinto --- core/backend/llm.go | 1 + core/config/model_config.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/core/backend/llm.go b/core/backend/llm.go index ffc71497522f..92250f509063 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -96,6 +96,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im opts.Prompt = s opts.Messages = protoMessages opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate + opts.JsonSchema = c.JsonSchema opts.Images = images opts.Videos = videos opts.Audios = audios diff --git a/core/config/model_config.go b/core/config/model_config.go index 470646a182eb..f967f5b38430 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -493,7 +493,7 @@ func (c *ModelConfig) Validate() bool { } func (c *ModelConfig) HasTemplate() bool { - return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != "" + return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != "" || c.TemplateConfig.UseTokenizerTemplate } func (c *ModelConfig) GetModelConfigFile() string { return c.modelConfigFile } From 49679194e93b079a21812187be869761adedee2d Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 5 Nov 2025 22:03:36 +0100 Subject: [PATCH 05/14] Better recognition of chat Signed-off-by: Ettore Di Giacinto --- core/backend/llm.go | 4 ++-- core/config/model_config.go | 3 +-- core/http/endpoints/openai/chat.go | 8 ++++---- core/http/endpoints/openai/inference.go | 3 ++- core/schema/openai.go | 2 ++ 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/core/backend/llm.go b/core/backend/llm.go index 92250f509063..3364c70f0e82 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -35,7 +35,7 @@ type TokenUsage struct { TimingTokenGeneration float64 } -func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) { +func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, jsonSchema string) (func() (LLMResponse, error), error) { modelFile := c.Model // Check if the modelFile exists, if it doesn't try to load it from the gallery @@ -96,7 +96,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im opts.Prompt = s opts.Messages = protoMessages opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate - opts.JsonSchema = c.JsonSchema + opts.JsonSchema = jsonSchema opts.Images = images opts.Videos = videos opts.Audios = audios diff --git a/core/config/model_config.go b/core/config/model_config.go index f967f5b38430..3c75a9148497 100644 --- a/core/config/model_config.go
+++ b/core/config/model_config.go @@ -46,7 +46,6 @@ type ModelConfig struct { functionCallString, functionCallNameString string `yaml:"-" json:"-"` ResponseFormat string `yaml:"-" json:"-"` ResponseFormatMap map[string]interface{} `yaml:"-" json:"-"` - JsonSchema string `yaml:"-" json:"-"` // JSON schema string (when use_llama_grammar is enabled) FunctionsConfig functions.FunctionsConfig `yaml:"function" json:"function"` @@ -592,7 +591,7 @@ func (c *ModelConfig) HasUsecases(u ModelConfigUsecases) bool { // This avoids the maintenance burden of updating this list for each new backend - but unfortunately, that's the best option for some services currently. func (c *ModelConfig) GuessUsecases(u ModelConfigUsecases) bool { if (u & FLAG_CHAT) == FLAG_CHAT { - if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" { + if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" && !c.TemplateConfig.UseTokenizerTemplate { return false } } diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 1b57346ce691..d7cd6d83554b 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -246,7 +246,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator // Pass json_schema directly to llama.cpp for grammar generation schemaBytes, err := json.Marshal(d.JsonSchema.Schema) if err == nil { - config.JsonSchema = string(schemaBytes) + input.JSONSchema = string(schemaBytes) } else { log.Error().Err(err).Msg("Failed marshaling json_schema") // Fallback to generating grammar @@ -311,7 +311,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator // Pass json_schema directly to llama.cpp for grammar generation schemaBytes, err := json.Marshal(jsStruct) if err == nil { - config.JsonSchema = string(schemaBytes) + input.JSONSchema = string(schemaBytes) } else { log.Error().Err(err).Msg("Failed marshaling json_schema for functions") // Fallback to generating grammar @@ -336,7 +336,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator // Pass json_schema directly to llama.cpp for grammar generation schemaBytes, err := json.Marshal(input.JSONFunctionGrammarObject) if err == nil { - config.JsonSchema = string(schemaBytes) + input.JSONSchema = string(schemaBytes) } else { log.Error().Err(err).Msg("Failed marshaling json_schema from JSONFunctionGrammarObject") // Fallback to generating grammar @@ -655,7 +655,7 @@ func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, in audios = append(audios, m.StringAudios...) 
} - predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil) + predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, input.JSONSchema) if err != nil { log.Error().Err(err).Msg("model inference failed") return "", err diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go index b7b256bad0c4..1a168c5acb7a 100644 --- a/core/http/endpoints/openai/inference.go +++ b/core/http/endpoints/openai/inference.go @@ -38,7 +38,8 @@ func ComputeChoices( } // get the model function to call for the result - predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback) + predFunc, err := backend.ModelInference( + req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, req.JSONSchema) if err != nil { return result, backend.TokenUsage{}, err } diff --git a/core/schema/openai.go b/core/schema/openai.go index 5506231e560b..8c1441d343c2 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -178,6 +178,8 @@ type OpenAIRequest struct { // A grammar to constrain the LLM output Grammar string `json:"grammar" yaml:"grammar"` + JSONSchema string `json:"json_schema" yaml:"json_schema"` + JSONFunctionGrammarObject *functions.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"` Backend string `json:"backend" yaml:"backend"` From 6efc93eaddb9a910e0ab4f7fb96c85cfe0a77be3 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 6 Nov 2025 11:04:19 +0100 Subject: [PATCH 06/14] Fixes to support tool calls while using templates from tokenizer Signed-off-by: Ettore Di Giacinto --- core/backend/llm.go | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/core/backend/llm.go b/core/backend/llm.go index 3364c70f0e82..f743190a648f 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -3,7 +3,6 @@ package backend import ( "context" "encoding/json" - "fmt" "regexp" "slices" "strings" @@ -66,26 +65,39 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im // if we are using the tokenizer template, we need to convert the messages to proto messages // unless the prompt has already been tokenized (non-chat endpoints + functions) if c.TemplateConfig.UseTokenizerTemplate && s == "" { - protoMessages = make([]*proto.Message, len(messages), len(messages)) + protoMessages = make([]*proto.Message, len(messages)) for i, message := range messages { protoMessages[i] = &proto.Message{ Role: message.Role, } - switch ct := message.Content.(type) { - case string: - protoMessages[i].Content = ct - case []interface{}: - // If using the tokenizer template, in case of multimodal we want to keep the multimodal content as and return only strings here - data, _ := json.Marshal(ct) - resultData := []struct { - Text string `json:"text"` - }{} - json.Unmarshal(data, &resultData) - for _, r := range resultData { - protoMessages[i].Content += r.Text + // Handle message content - can be nil for tool call messages + if message.Content == nil { + // Tool call messages might have nil content, use empty string + protoMessages[i].Content = "" + } else { + switch ct := message.Content.(type) { + case string: + protoMessages[i].Content = ct + case []interface{}: + // If using the tokenizer template, in case of multimodal we want 
to keep the multimodal content as and return only strings here + data, _ := json.Marshal(ct) + resultData := []struct { + Text string `json:"text"` + }{} + json.Unmarshal(data, &resultData) + for _, r := range resultData { + protoMessages[i].Content += r.Text + } + default: + // For other types, try to convert to string or use empty string + if str, ok := ct.(string); ok { + protoMessages[i].Content = str + } else { + // Log warning but don't fail - use empty string for unsupported types + log.Warn().Msgf("unsupported type for schema.Message.Content for inference: %T, using empty string", ct) + protoMessages[i].Content = "" + } } - default: - return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct) } } } From 3a488b79b3658ef0f7f65e663cde7b07385d2108 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 6 Nov 2025 12:26:40 +0100 Subject: [PATCH 07/14] Fixups Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 95 ++++++++++++++------------- core/http/endpoints/openai/chat.go | 68 +++++++------------ 2 files changed, 72 insertions(+), 91 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index c08925a44e49..7ed736f49fc3 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -91,7 +91,7 @@ static void start_llama_server(server_context& ctx_server) { ctx_server.queue_tasks.start_loop(); } -json parse_options(bool streaming, const backend::PredictOptions* predict, const server_context& ctx_server, bool use_llama_grammar = false) +json parse_options(bool streaming, const backend::PredictOptions* predict, const server_context& ctx_server) { // Create now a json data from the prediction options instead @@ -116,38 +116,55 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const // Handle grammar/json_schema based on use_llama_grammar flag // Priority: JsonSchema field > grammar field (when use_llama_grammar is enabled) + // IMPORTANT: server.cpp requires: if json_schema exists, grammar must NOT exist + // See server.cpp line 420: if (data.contains("json_schema") && !data.contains("grammar")) std::string json_schema_str = predict->jsonschema(); std::string grammar_str = predict->grammar(); + // Debug logging + if (!json_schema_str.empty()) { + SRV_INF("Received JsonSchema field: %s\n", json_schema_str.c_str()); + } + if (!grammar_str.empty()) { + SRV_INF("Received Grammar field: %s\n", grammar_str.c_str()); + } + if (!json_schema_str.empty()) { // JsonSchema field is set - use it directly (highest priority) try { json json_schema_obj = json::parse(json_schema_str); - data["json_schema"] = json_schema_obj; - // Don't set grammar when json_schema is provided (llama.cpp requirement) + // Ensure json_schema is a JSON object (not a string) + // json_schema_to_grammar expects a JSON object representing the schema + if (json_schema_obj.is_object() || json_schema_obj.is_array()) { + data["json_schema"] = json_schema_obj; + SRV_INF("Set json_schema in data: %s\n", json_schema_obj.dump(2).c_str()); + // Explicitly ensure grammar is NOT set when json_schema is provided + // This matches server.cpp's requirement: !data.contains("grammar") + // Do NOT set data["grammar"] here + } else { + // If it's not an object/array, it's invalid - fall back to grammar + SRV_INF("%s", "JsonSchema is not a valid JSON object/array, falling back to grammar\n"); + if (!grammar_str.empty()) { + data["grammar"] = grammar_str; + } + } } catch 
(const json::parse_error& e) { // If json_schema is invalid JSON, fall back to grammar + SRV_INF("Failed to parse JsonSchema as JSON: %s, falling back to grammar\n", e.what()); if (!grammar_str.empty()) { data["grammar"] = grammar_str; } } - } else if (use_llama_grammar && !grammar_str.empty()) { - // use_llama_grammar is enabled and no JsonSchema field - try to parse grammar as JSON - // This is a fallback for backward compatibility - try { - json test_json = json::parse(grammar_str); - // If parsing succeeds, it's JSON - pass as json_schema - data["json_schema"] = test_json; - // Don't set grammar when json_schema is provided (llama.cpp requirement) - } catch (const json::parse_error&) { - // Not valid JSON, use as regular grammar + } else if (!grammar_str.empty()) { data["grammar"] = grammar_str; - } - } else { - // Normal behavior: use grammar as-is - if (!grammar_str.empty()) { - data["grammar"] = grammar_str; - } + SRV_INF("Using grammar as-is: %s\n", grammar_str.c_str()); + } + + // Final check: ensure we don't have both json_schema and grammar set + // This should never happen with the logic above, but double-check for safety + if (data.contains("json_schema") && data.contains("grammar")) { + SRV_WRN("%s", "Both json_schema and grammar are set - removing grammar to match server.cpp requirement\n"); + data.erase("grammar"); } // Only set prompt if UseTokenizerTemplate is false or if no Messages are provided @@ -267,7 +284,7 @@ static void add_rpc_devices(std::string servers) { } static void params_parse(server_context& ctx_server, const backend::ModelOptions* request, - common_params & params, bool& use_llama_grammar_out) { + common_params & params) { // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809 @@ -316,6 +333,12 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions } else if (!strcmp(optval, "false") || !strcmp(optval, "0") || !strcmp(optval, "no") || !strcmp(optval, "off") || !strcmp(optval, "disabled")) { params.ctx_shift = false; } + } else if (!strcmp(optname, "use_jinja") || !strcmp(optname, "jinja")) { + if (!strcmp(optval, "true") || !strcmp(optval, "1") || !strcmp(optval, "yes") || !strcmp(optval, "on") || !strcmp(optval, "enabled")) { + params.use_jinja = true; + } else if (!strcmp(optval, "false") || !strcmp(optval, "0") || !strcmp(optval, "no") || !strcmp(optval, "off") || !strcmp(optval, "disabled")) { + params.use_jinja = false; + } } else if (!strcmp(optname, "cache_ram")) { if (optval != NULL) { try { @@ -342,25 +365,6 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions } } - // Parse use_llama_grammar option separately since we need to store it in BackendServiceImpl - // We'll set it in LoadModel after parsing options - bool use_llama_grammar_option = false; - for (int i = 0; i < request->options_size(); i++) { - std::string opt = request->options(i); - char *optname = strtok(&opt[0], ":"); - char *optval = strtok(NULL, ":"); - if (optval == NULL) { - optval = "true"; - } - - if (!strcmp(optname, "use_llama_grammar") || !strcmp(optname, "llama_grammar")) { - if (!strcmp(optval, "true") || !strcmp(optval, "1") || !strcmp(optval, "yes") || !strcmp(optval, "on") || !strcmp(optval, "enabled")) { - use_llama_grammar_option = true; - } - } - } - use_llama_grammar_out = use_llama_grammar_option; - // Set params.n_parallel from environment variable if not set via options (fallback) if (params.n_parallel == 1) { 
const char *env_parallel = std::getenv("LLAMACPP_PARALLEL"); @@ -491,7 +495,6 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions class BackendServiceImpl final : public backend::Backend::Service { private: server_context& ctx_server; - bool use_llama_grammar = false; // Flag to enable llama.cpp grammar generation from json_schema public: BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {} @@ -505,9 +508,7 @@ class BackendServiceImpl final : public backend::Backend::Service { grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) { // Implement LoadModel RPC common_params params; - bool use_llama_grammar_flag = false; - params_parse(ctx_server, request, params, use_llama_grammar_flag); - use_llama_grammar = use_llama_grammar_flag; + params_parse(ctx_server, request, params); common_init(); @@ -569,7 +570,7 @@ class BackendServiceImpl final : public backend::Backend::Service { } grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter* writer) override { - json data = parse_options(true, request, ctx_server, use_llama_grammar); + json data = parse_options(true, request, ctx_server); //Raise error if embeddings is set to true @@ -764,7 +765,7 @@ class BackendServiceImpl final : public backend::Backend::Service { } grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) { - json data = parse_options(true, request, ctx_server, use_llama_grammar); + json data = parse_options(true, request, ctx_server); data["stream"] = false; //Raise error if embeddings is set to true @@ -943,7 +944,7 @@ class BackendServiceImpl final : public backend::Backend::Service { grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) { - json body = parse_options(false, request, ctx_server, use_llama_grammar); + json body = parse_options(false, request, ctx_server); body["stream"] = false; @@ -1124,7 +1125,7 @@ class BackendServiceImpl final : public backend::Backend::Service { } grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) { - json body = parse_options(false, request, ctx_server, use_llama_grammar); + json body = parse_options(false, request, ctx_server); body["stream"] = false; json tokens_response = json::array(); diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index d7cd6d83554b..cdb0807d828c 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -282,6 +282,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator } switch { + // Generates grammar with internal's LocalAI engine case (!config.FunctionsConfig.GrammarConfig.NoGrammar || strictMode) && shouldUseFn: noActionGrammar := functions.Function{ Name: noActionName, @@ -307,54 +308,33 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator // Update input grammar or json_schema based on use_llama_grammar option jsStruct := funcs.ToJSONStructure(config.FunctionsConfig.FunctionNameKey, config.FunctionsConfig.FunctionNameKey) - if config.HasOption("use_llama_grammar") || config.HasOption("llama_grammar") { - // Pass json_schema directly to llama.cpp for grammar generation - schemaBytes, err := json.Marshal(jsStruct) - if err == nil { - input.JSONSchema = string(schemaBytes) 
- } else { - log.Error().Err(err).Msg("Failed marshaling json_schema for functions") - // Fallback to generating grammar - g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - config.Grammar = g - } else { - log.Error().Err(err).Msg("Failed generating grammar") - } - } + g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + config.Grammar = g } else { - // Generate grammar using LocalAI's implementation - g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - config.Grammar = g - } else { - log.Error().Err(err).Msg("Failed generating grammar") - } + log.Error().Err(err).Msg("Failed generating grammar") } case input.JSONFunctionGrammarObject != nil: - if config.HasOption("use_llama_grammar") || config.HasOption("llama_grammar") { - // Pass json_schema directly to llama.cpp for grammar generation - schemaBytes, err := json.Marshal(input.JSONFunctionGrammarObject) - if err == nil { - input.JSONSchema = string(schemaBytes) - } else { - log.Error().Err(err).Msg("Failed marshaling json_schema from JSONFunctionGrammarObject") - // Fallback to generating grammar - g, err := input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - config.Grammar = g - } else { - log.Error().Err(err).Msg("Failed generating grammar") - } - } + g, err := input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + config.Grammar = g } else { - // Generate grammar using LocalAI's implementation - g, err := input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - config.Grammar = g - } else { - log.Error().Err(err).Msg("Failed generating grammar") - } + log.Error().Err(err).Msg("Failed generating grammar") + } + // Pass jsonschema to the backend for grammar generation + case config.FunctionsConfig.GrammarConfig.NoGrammar && shouldUseFn && config.TemplateConfig.UseTokenizerTemplate: + if config.FunctionToCall() != "" { + funcs = funcs.Select(config.FunctionToCall()) + } + + // Update input grammar or json_schema based on use_llama_grammar option + jsStruct := funcs.ToJSONStructure(config.FunctionsConfig.FunctionNameKey, config.FunctionsConfig.FunctionNameKey) + schemaBytes, err := json.Marshal(jsStruct) + if err == nil { + log.Debug().Msgf("JSONSchema: %s", string(schemaBytes)) + input.JSONSchema = string(schemaBytes) + } else { + log.Error().Err(err).Msg("Failed marshaling json_schema for functions") } default: // Force picking one of the functions by the request From ca0aec4267ab422c89e9632cfc1427358affa4a2 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 6 Nov 2025 22:02:48 +0100 Subject: [PATCH 08/14] Drop template guessing, fix passing tools to tokenizer Signed-off-by: Ettore Di Giacinto --- backend/backend.proto | 5 + backend/cpp/llama-cpp/grpc-server.cpp | 76 ++++++++- core/backend/llm.go | 41 +---- core/config/gguf.go | 214 +------------------------- core/config/model_config.go | 20 --- core/http/endpoints/openai/chat.go | 39 ++--- core/schema/openai.go | 33 ---- core/templates/cache.go | 87 +---------- core/templates/evaluator.go | 69 --------- core/templates/evaluator_test.go | 32 ---- pkg/functions/functions.go | 6 + pkg/functions/parse.go | 2 +- 12 files changed, 102 insertions(+), 522 deletions(-) diff --git a/backend/backend.proto b/backend/backend.proto index a1788f46d80e..c2cf02b74817 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ 
-383,6 +383,11 @@ message StatusResponse { message Message { string role = 1; string content = 2; + // Optional fields for OpenAI-compatible message format + string name = 3; // Tool name (for tool messages) + string tool_call_id = 4; // Tool call ID (for tool messages) + string reasoning_content = 5; // Reasoning content (for thinking models) + string tool_calls = 6; // Tool calls as JSON string (for assistant messages with tool calls) } message DetectOptions { diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 7ed736f49fc3..5e75125164f7 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -27,8 +27,6 @@ using grpc::Status; // END LocalAI - - ///////////////////////////////// //////////////////////////////// //////// LOCALAI code starts below here @@ -37,6 +35,14 @@ using grpc::Status; bool loaded_model; // TODO: add a mutex for this, but happens only once loading the model +// Forward declarations +static void start_llama_server(server_context& ctx_server); +static json parse_options(bool streaming, const backend::PredictOptions* predict, const server_context& ctx_server); +static ggml_type kv_cache_type_from_str(const std::string & s); +static std::string get_all_kv_cache_types(); +static void add_rpc_devices(std::string servers); +static void params_parse(server_context& ctx_server, const backend::ModelOptions* request, common_params & params); + static void start_llama_server(server_context& ctx_server) { LOG_INF("%s: starting llama server\n", __func__); @@ -307,6 +313,9 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions params.cpuparams.n_threads = request->threads(); params.n_gpu_layers = request->ngpulayers(); params.n_batch = request->nbatch(); + params.verbosity = INT_MAX; + // Enable all debug logs by setting verbosity threshold to maximum + common_log_set_verbosity_thold(INT_MAX); params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. 
increase the physical batch size" // Initialize ctx_shift to false by default (can be overridden by options) @@ -511,6 +520,8 @@ class BackendServiceImpl final : public backend::Backend::Service { params_parse(ctx_server, request, params); common_init(); + // Ensure debug logs are enabled after common_init() sets up logging + common_log_set_verbosity_thold(params.verbosity); llama_backend_init(); llama_numa_init(params.numa); @@ -587,13 +598,38 @@ class BackendServiceImpl final : public backend::Backend::Service { std::string prompt_str; // Handle chat templates when UseTokenizerTemplate is enabled and Messages are provided if (request->usetokenizertemplate() && request->messages_size() > 0 && ctx_server.chat_templates != nullptr) { - // Convert proto Messages to JSON format + // Convert proto Messages to JSON format compatible with common_chat_msgs_parse_oaicompat json messages_json = json::array(); for (int i = 0; i < request->messages_size(); i++) { const auto& msg = request->messages(i); json msg_json; msg_json["role"] = msg.role(); - msg_json["content"] = msg.content(); + + // Handle content - can be string, null, or array + if (!msg.content().empty()) { + msg_json["content"] = msg.content(); + } + + // Add optional fields for OpenAI-compatible message format + if (!msg.name().empty()) { + msg_json["name"] = msg.name(); + } + if (!msg.tool_call_id().empty()) { + msg_json["tool_call_id"] = msg.tool_call_id(); + } + if (!msg.reasoning_content().empty()) { + msg_json["reasoning_content"] = msg.reasoning_content(); + } + if (!msg.tool_calls().empty()) { + // Parse tool_calls JSON string and add to message + try { + json tool_calls = json::parse(msg.tool_calls()); + msg_json["tool_calls"] = tool_calls; + } catch (const json::parse_error& e) { + SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what()); + } + } + messages_json.push_back(msg_json); } @@ -781,13 +817,41 @@ class BackendServiceImpl final : public backend::Backend::Service { std::string prompt_str; // Handle chat templates when UseTokenizerTemplate is enabled and Messages are provided if (request->usetokenizertemplate() && request->messages_size() > 0 && ctx_server.chat_templates != nullptr) { - // Convert proto Messages to JSON format + // Convert proto Messages to JSON format compatible with common_chat_msgs_parse_oaicompat json messages_json = json::array(); for (int i = 0; i < request->messages_size(); i++) { const auto& msg = request->messages(i); json msg_json; msg_json["role"] = msg.role(); - msg_json["content"] = msg.content(); + + // Handle content - can be string, null, or array + if (msg.content().empty() && !msg.tool_calls().empty()) { + // Tool call messages may have null content + msg_json["content"] = json(); + } else { + msg_json["content"] = msg.content(); + } + + // Add optional fields for OpenAI-compatible message format + if (!msg.name().empty()) { + msg_json["name"] = msg.name(); + } + if (!msg.tool_call_id().empty()) { + msg_json["tool_call_id"] = msg.tool_call_id(); + } + if (!msg.reasoning_content().empty()) { + msg_json["reasoning_content"] = msg.reasoning_content(); + } + if (!msg.tool_calls().empty()) { + // Parse tool_calls JSON string and add to message + try { + json tool_calls = json::parse(msg.tool_calls()); + msg_json["tool_calls"] = tool_calls; + } catch (const json::parse_error& e) { + SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what()); + } + } + messages_json.push_back(msg_json); } diff --git a/core/backend/llm.go b/core/backend/llm.go index f743190a648f..b2d06fbd34cc 100644 --- 
a/core/backend/llm.go +++ b/core/backend/llm.go @@ -2,7 +2,6 @@ package backend import ( "context" - "encoding/json" "regexp" "slices" "strings" @@ -34,7 +33,7 @@ type TokenUsage struct { TimingTokenGeneration float64 } -func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, jsonSchema string) (func() (LLMResponse, error), error) { +func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, jsonSchema string) (func() (LLMResponse, error), error) { modelFile := c.Model // Check if the modelFile exists, if it doesn't try to load it from the gallery @@ -64,42 +63,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im var protoMessages []*proto.Message // if we are using the tokenizer template, we need to convert the messages to proto messages // unless the prompt has already been tokenized (non-chat endpoints + functions) - if c.TemplateConfig.UseTokenizerTemplate && s == "" { - protoMessages = make([]*proto.Message, len(messages)) - for i, message := range messages { - protoMessages[i] = &proto.Message{ - Role: message.Role, - } - // Handle message content - can be nil for tool call messages - if message.Content == nil { - // Tool call messages might have nil content, use empty string - protoMessages[i].Content = "" - } else { - switch ct := message.Content.(type) { - case string: - protoMessages[i].Content = ct - case []interface{}: - // If using the tokenizer template, in case of multimodal we want to keep the multimodal content as and return only strings here - data, _ := json.Marshal(ct) - resultData := []struct { - Text string `json:"text"` - }{} - json.Unmarshal(data, &resultData) - for _, r := range resultData { - protoMessages[i].Content += r.Text - } - default: - // For other types, try to convert to string or use empty string - if str, ok := ct.(string); ok { - protoMessages[i].Content = str - } else { - // Log warning but don't fail - use empty string for unsupported types - log.Warn().Msgf("unsupported type for schema.Message.Content for inference: %T, using empty string", ct) - protoMessages[i].Content = "" - } - } - } - } + if c.TemplateConfig.UseTokenizerTemplate && len(messages) > 0 { + protoMessages = messages.ToProto() } // in GRPC, the backend is supposed to answer to 1 single token if stream is not supported diff --git a/core/config/gguf.go b/core/config/gguf.go index edc7d523083f..9b379763dd02 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -1,151 +1,17 @@ package config import ( - "strings" - "github.com/mudler/LocalAI/pkg/xsysinfo" "github.com/rs/zerolog/log" gguf "github.com/gpustack/gguf-parser-go" ) -type familyType uint8 - -const ( - Unknown familyType = iota - LLaMa3 - CommandR - Phi3 - ChatML - Mistral03 - Gemma - DeepSeek2 -) - const ( defaultContextSize = 1024 defaultNGPULayers = 99999999 ) -type settingsConfig struct { - StopWords []string - TemplateConfig TemplateConfig - RepeatPenalty float64 -} - -// default settings to adopt with a given model family -var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{ - Gemma: { - RepeatPenalty: 1.0, - StopWords: []string{"<|im_end|>", "", 
""}, - TemplateConfig: TemplateConfig{ - Chat: "{{.Input }}\nmodel\n", - ChatMessage: "{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}", - Completion: "{{.Input}}", - }, - }, - DeepSeek2: { - StopWords: []string{"<|end▁of▁sentence|>"}, - TemplateConfig: TemplateConfig{ - ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }} -{{ end -}} -{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}} -{{if eq .RoleName "system" -}}{{.Content}} -{{end -}}`, - Chat: "{{.Input -}}\nAssistant: ", - }, - }, - LLaMa3: { - StopWords: []string{"<|eot_id|>"}, - TemplateConfig: TemplateConfig{ - Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>", - ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>", - }, - }, - CommandR: { - TemplateConfig: TemplateConfig{ - Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", - Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> -You are a function calling AI model, you can call the following functions: -## Available Tools -{{range .Functions}} -- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }} -{{end}} -When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}} -<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`, - ChatMessage: `{{if eq .RoleName "user" -}} -<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> -{{- else if eq .RoleName "system" -}} -<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> -{{- else if eq .RoleName "assistant" -}} -<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> -{{- else if eq .RoleName "tool" -}} -<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> -{{- else if .FunctionCall -}} -<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|> -{{- end -}}`, - }, - StopWords: []string{"<|END_OF_TURN_TOKEN|>"}, - }, - Phi3: { - TemplateConfig: TemplateConfig{ - Chat: "{{.Input}}\n<|assistant|>", - ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>", - Completion: "{{.Input}}", - }, - StopWords: []string{"<|end|>", "<|endoftext|>"}, - }, - ChatML: { - TemplateConfig: TemplateConfig{ - Chat: "{{.Input -}}\n<|im_start|>assistant", - Functions: `<|im_start|>system -You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: -{{range .Functions}} -{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} -{{end}} -For each function call return a json object with function name and arguments -<|im_end|> -{{.Input -}} -<|im_start|>assistant`, - ChatMessage: `<|im_start|>{{ .RoleName }} -{{ if .FunctionCall -}} -Function call: -{{ else if eq .RoleName "tool" -}} -Function response: -{{ end -}} -{{ if .Content -}} -{{.Content }} -{{ end -}} -{{ if .FunctionCall -}} -{{toJson .FunctionCall}} -{{ end -}}<|im_end|>`, - }, - StopWords: []string{"<|im_end|>", "", ""}, - }, - Mistral03: { - TemplateConfig: TemplateConfig{ - Chat: "{{.Input -}}", - Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`, - ChatMessage: `{{if eq .RoleName "user" -}} -[INST] {{.Content }} [/INST] -{{- else if .FunctionCall -}} -[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS] -{{- else if eq .RoleName "tool" -}} -[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS] -{{- else -}} -{{ .Content -}} -{{ end -}}`, - }, - StopWords: []string{"<|im_end|>", "", "", "<|eot_id|>", "<|end_of_text|>", "", "[/TOOL_CALLS]", "[/ACTIONS]"}, - }, -} - -// this maps well known template used in HF to model families defined above -var knownTemplates = map[string]familyType{ - `{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML, - `{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03, -} - func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { if defaultCtx == 0 && cfg.ContextSize == nil { @@ -216,81 +82,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { cfg.Name = f.Metadata().Name } - family := identifyFamily(f) - - if family == Unknown { - log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified") - return - } - - // identify template - settings, ok := defaultsSettings[family] - if ok { - cfg.TemplateConfig = settings.TemplateConfig - log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig) - if len(cfg.StopWords) == 0 { - cfg.StopWords = settings.StopWords - } - if cfg.RepeatPenalty == 0.0 { - cfg.RepeatPenalty = settings.RepeatPenalty - } - } else { - log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family") - } - - if cfg.HasTemplate() { - return - } - - // identify from well known templates first, otherwise use the raw jinja template - chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template") - if found { - // try to use 
the jinja template - cfg.TemplateConfig.JinjaTemplate = true - cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString() - } - -} - -func identifyFamily(f *gguf.GGUFFile) familyType { - - // identify from well known templates first - chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template") - if found && chatTemplate.ValueString() != "" { - if family, ok := knownTemplates[chatTemplate.ValueString()]; ok { - return family - } - } - - // otherwise try to identify from the model properties - arch := f.Architecture().Architecture - eosTokenID := f.Tokenizer().EOSTokenID - bosTokenID := f.Tokenizer().BOSTokenID - - isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2 - // WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID - - llama3 := arch == "llama" && eosTokenID == 128009 - commandR := arch == "command-r" && eosTokenID == 255001 - qwen2 := arch == "qwen2" - phi3 := arch == "phi-3" - gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma") - deepseek2 := arch == "deepseek2" - - switch { - case deepseek2: - return DeepSeek2 - case gemma: - return Gemma - case llama3: - return LLaMa3 - case commandR: - return CommandR - case phi3: - return Phi3 - case qwen2, isYI: - return ChatML - default: - return Unknown - } + // Instruct to use template from llama.cpp + cfg.TemplateConfig.UseTokenizerTemplate = true + //cfg.FunctionsConfig.GrammarConfig.NoGrammar = true } diff --git a/core/config/model_config.go b/core/config/model_config.go index 3c75a9148497..921b808e7678 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -265,8 +265,6 @@ type TemplateConfig struct { Multimodal string `yaml:"multimodal" json:"multimodal"` - JinjaTemplate bool `yaml:"jinja_template" json:"jinja_template"` - ReplyPrefix string `yaml:"reply_prefix" json:"reply_prefix"` } @@ -499,24 +497,6 @@ func (c *ModelConfig) GetModelConfigFile() string { return c.modelConfigFile } -// HasOption checks if a specific option is enabled in the Options slice -// Option format can be "option_name" or "option_name:value" -func (c *ModelConfig) HasOption(optionName string) bool { - for _, opt := range c.Options { - // Split by colon to get option name and value - parts := strings.SplitN(opt, ":", 2) - if len(parts) > 0 && parts[0] == optionName { - // Check if value is truthy (if provided) - if len(parts) == 1 { - return true // No value means enabled - } - value := strings.ToLower(strings.TrimSpace(parts[1])) - return value == "true" || value == "1" || value == "yes" || value == "on" || value == "enabled" - } - } - return false -} - type ModelConfigUsecases int const ( diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index cdb0807d828c..16c8bb37bf8e 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -217,6 +217,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator noActionDescription = config.FunctionsConfig.NoActionDescriptionName } + // If we are using a response format, we need to generate a grammar for it if config.ResponseFormatMap != nil { d := schema.ChatCompletionResponseFormat{} dat, err := json.Marshal(config.ResponseFormatMap) @@ -241,36 +242,14 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator if err != nil { return err } - // Check if use_llama_grammar option is enabled - if config.HasOption("use_llama_grammar") || config.HasOption("llama_grammar") { - // Pass json_schema 
directly to llama.cpp for grammar generation - schemaBytes, err := json.Marshal(d.JsonSchema.Schema) - if err == nil { - input.JSONSchema = string(schemaBytes) - } else { - log.Error().Err(err).Msg("Failed marshaling json_schema") - // Fallback to generating grammar - fs := &functions.JSONFunctionStructure{ - AnyOf: []functions.Item{d.JsonSchema.Schema}, - } - g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - input.Grammar = g - } else { - log.Error().Err(err).Msg("Failed generating grammar") - } - } + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{d.JsonSchema.Schema}, + } + g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + input.Grammar = g } else { - // Generate grammar using LocalAI's implementation - fs := &functions.JSONFunctionStructure{ - AnyOf: []functions.Item{d.JsonSchema.Schema}, - } - g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) - if err == nil { - input.Grammar = g - } else { - log.Error().Err(err).Msg("Failed generating grammar") - } + log.Error().Err(err).Msg("Failed generating grammar") } } } @@ -354,7 +333,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator // If we are using the tokenizer template, we don't need to process the messages // unless we are processing functions - if !config.TemplateConfig.UseTokenizerTemplate || shouldUseFn { + if !config.TemplateConfig.UseTokenizerTemplate { predInput = evaluator.TemplateMessages(*input, input.Messages, config, funcs, shouldUseFn) log.Debug().Msgf("Prompt (after templating): %s", predInput) diff --git a/core/schema/openai.go b/core/schema/openai.go index 8c1441d343c2..8fb0b740c488 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -76,39 +76,6 @@ type InputAudio struct { Data string `json:"data" yaml:"data"` } -type Message struct { - // The message role - Role string `json:"role,omitempty" yaml:"role"` - - // The message name (used for tools calls) - Name string `json:"name,omitempty" yaml:"name"` - - // The message content - Content interface{} `json:"content" yaml:"content"` - - StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"` - StringImages []string `json:"string_images,omitempty" yaml:"string_images,omitempty"` - StringVideos []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"` - StringAudios []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"` - - // A result of a function call - FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"` - - ToolCalls []ToolCall `json:"tool_calls,omitempty" yaml:"tool_call,omitempty"` -} - -type ToolCall struct { - Index int `json:"index"` - ID string `json:"id"` - Type string `json:"type"` - FunctionCall FunctionCall `json:"function"` -} - -type FunctionCall struct { - Name string `json:"name,omitempty"` - Arguments string `json:"arguments"` -} - type OpenAIModel struct { ID string `json:"id"` Object string `json:"object"` diff --git a/core/templates/cache.go b/core/templates/cache.go index 1efce6606e8f..a9780284a784 100644 --- a/core/templates/cache.go +++ b/core/templates/cache.go @@ -11,9 +11,6 @@ import ( "github.com/mudler/LocalAI/pkg/utils" "github.com/Masterminds/sprig/v3" - - "github.com/nikolalohinski/gonja/v2" - "github.com/nikolalohinski/gonja/v2/exec" ) // Keep this in sync with config.TemplateConfig. Is there a more idiomatic way to accomplish this in go? 
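Note: with the gonja/Jinja path dropped above, a response_format JSON schema is always converted to a grammar on the LocalAI side before the request reaches the backend. Below is a minimal Go sketch of that conversion, using only the calls visible in the core/http/endpoints/openai/chat.go hunk earlier in this patch; the package name, helper name, and the omitted json_schema type checks are illustrative, not part of the patch.

```go
// Sketch only: names are illustrative and the response_format type checks of
// the real endpoint are omitted for brevity.
package openai

import (
	"encoding/json"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/schema"
	"github.com/mudler/LocalAI/pkg/functions"
)

// grammarFromResponseFormat mirrors the simplified chat.go path: decode the
// response_format map and, when it carries a JSON schema, build a grammar
// with LocalAI's own generator so it can be attached to the request.
func grammarFromResponseFormat(cfg *config.ModelConfig) (string, error) {
	if cfg.ResponseFormatMap == nil {
		return "", nil
	}
	dat, err := json.Marshal(cfg.ResponseFormatMap)
	if err != nil {
		return "", err
	}
	d := schema.ChatCompletionResponseFormat{}
	if err := json.Unmarshal(dat, &d); err != nil {
		return "", err
	}
	// d.JsonSchema.Schema is the same accessor used in the chat.go hunk; the
	// real endpoint only reaches this point for json_schema response formats.
	fs := &functions.JSONFunctionStructure{
		AnyOf: []functions.Item{d.JsonSchema.Schema},
	}
	return fs.Grammar(cfg.FunctionsConfig.GrammarOptions()...)
}
```

The returned grammar string is what the endpoint assigns to input.Grammar before inference; when grammar generation fails, the endpoint only logs the error, exactly as in the hunk above.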
@@ -21,17 +18,15 @@ import ( type TemplateType int type templateCache struct { - mu sync.Mutex - templatesPath string - templates map[TemplateType]map[string]*template.Template - jinjaTemplates map[TemplateType]map[string]*exec.Template + mu sync.Mutex + templatesPath string + templates map[TemplateType]map[string]*template.Template } func newTemplateCache(templatesPath string) *templateCache { tc := &templateCache{ - templatesPath: templatesPath, - templates: make(map[TemplateType]map[string]*template.Template), - jinjaTemplates: make(map[TemplateType]map[string]*exec.Template), + templatesPath: templatesPath, + templates: make(map[TemplateType]map[string]*template.Template), } return tc } @@ -85,78 +80,6 @@ func (tc *templateCache) loadTemplateIfExists(templateType TemplateType, templat return nil } -func (tc *templateCache) initializeJinjaTemplateMapKey(tt TemplateType) { - if _, ok := tc.jinjaTemplates[tt]; !ok { - tc.jinjaTemplates[tt] = make(map[string]*exec.Template) - } -} - -func (tc *templateCache) loadJinjaTemplateIfExists(templateType TemplateType, templateName string) error { - // Check if the template was already loaded - if _, ok := tc.jinjaTemplates[templateType][templateName]; ok { - return nil - } - - // Check if the model path exists - // skip any error here - we run anyway if a template does not exist - modelTemplateFile := fmt.Sprintf("%s.tmpl", templateName) - - dat := "" - file := filepath.Join(tc.templatesPath, modelTemplateFile) - - // Security check - if err := utils.VerifyPath(modelTemplateFile, tc.templatesPath); err != nil { - return fmt.Errorf("template file outside path: %s", file) - } - - // can either be a file in the system or a string with the template - if utils.ExistsInPath(tc.templatesPath, modelTemplateFile) { - d, err := os.ReadFile(file) - if err != nil { - return err - } - dat = string(d) - } else { - dat = templateName - } - - tmpl, err := gonja.FromString(dat) - if err != nil { - return err - } - tc.jinjaTemplates[templateType][templateName] = tmpl - - return nil -} - -func (tc *templateCache) evaluateJinjaTemplate(templateType TemplateType, templateNameOrContent string, in map[string]interface{}) (string, error) { - tc.mu.Lock() - defer tc.mu.Unlock() - - tc.initializeJinjaTemplateMapKey(templateType) - m, ok := tc.jinjaTemplates[templateType][templateNameOrContent] - if !ok { - // return "", fmt.Errorf("template not loaded: %s", templateName) - loadErr := tc.loadJinjaTemplateIfExists(templateType, templateNameOrContent) - if loadErr != nil { - return "", loadErr - } - m = tc.jinjaTemplates[templateType][templateNameOrContent] // ok is not important since we check m on the next line, and wealready checked - } - if m == nil { - return "", fmt.Errorf("failed loading a template for %s", templateNameOrContent) - } - - var buf bytes.Buffer - - data := exec.NewContext(in) - - if err := m.Execute(&buf, data); err != nil { - return "", err - } - return buf.String(), nil -} - func (tc *templateCache) evaluateTemplate(templateType TemplateType, templateNameOrContent string, in interface{}) (string, error) { tc.mu.Lock() defer tc.mu.Unlock() diff --git a/core/templates/evaluator.go b/core/templates/evaluator.go index 12c2080555f1..a3b46a1aa0ff 100644 --- a/core/templates/evaluator.go +++ b/core/templates/evaluator.go @@ -86,10 +86,6 @@ func (e *Evaluator) EvaluateTemplateForPrompt(templateType TemplateType, config return in.Input, nil } - if config.TemplateConfig.JinjaTemplate { - return e.evaluateJinjaTemplateForPrompt(templateType, template, in) - } - 
return e.cache.evaluateTemplate(templateType, template, in) } @@ -97,72 +93,7 @@ func (e *Evaluator) evaluateTemplateForChatMessage(templateName string, messageD return e.cache.evaluateTemplate(ChatMessageTemplate, templateName, messageData) } -func (e *Evaluator) templateJinjaChat(templateName string, messageData []ChatMessageTemplateData, funcs []functions.Function) (string, error) { - - conversation := make(map[string]interface{}) - messages := make([]map[string]interface{}, len(messageData)) - - // convert from ChatMessageTemplateData to what the jinja template expects - - for _, message := range messageData { - // TODO: this seems to cover minimum text templates. Can be expanded to cover more complex interactions - var data []byte - data, _ = json.Marshal(message.FunctionCall) - messages = append(messages, map[string]interface{}{ - "role": message.RoleName, - "content": message.Content, - "tool_call": string(data), - }) - } - - conversation["messages"] = messages - - // if tools are detected, add these - if len(funcs) > 0 { - conversation["tools"] = funcs - } - - return e.cache.evaluateJinjaTemplate(ChatMessageTemplate, templateName, conversation) -} - -func (e *Evaluator) evaluateJinjaTemplateForPrompt(templateType TemplateType, templateName string, in PromptTemplateData) (string, error) { - - conversation := make(map[string]interface{}) - - conversation["system_prompt"] = in.SystemPrompt - conversation["content"] = in.Input - - return e.cache.evaluateJinjaTemplate(templateType, templateName, conversation) -} - func (e *Evaluator) TemplateMessages(input schema.OpenAIRequest, messages []schema.Message, config *config.ModelConfig, funcs []functions.Function, shouldUseFn bool) string { - - if config.TemplateConfig.JinjaTemplate { - var messageData []ChatMessageTemplateData - for messageIndex, i := range messages { - fcall := i.FunctionCall - if len(i.ToolCalls) > 0 { - fcall = i.ToolCalls - } - messageData = append(messageData, ChatMessageTemplateData{ - SystemPrompt: config.SystemPrompt, - Role: config.Roles[i.Role], - RoleName: i.Role, - Content: i.StringContent, - FunctionCall: fcall, - FunctionName: i.Name, - LastMessage: messageIndex == (len(messages) - 1), - Function: config.Grammar != "" && (messageIndex == (len(messages) - 1)), - MessageIndex: messageIndex, - }) - } - - templatedInput, err := e.templateJinjaChat(config.TemplateConfig.ChatMessage, messageData, funcs) - if err == nil { - return templatedInput - } - } - var predInput string suppressConfigSystemPrompt := false mess := []string{} diff --git a/core/templates/evaluator_test.go b/core/templates/evaluator_test.go index 6d29c876b519..91a750a3514e 100644 --- a/core/templates/evaluator_test.go +++ b/core/templates/evaluator_test.go @@ -191,25 +191,6 @@ var chatMLTestMatch map[string]map[string]interface{} = map[string]map[string]in }, } -var jinjaTest map[string]map[string]interface{} = map[string]map[string]interface{}{ - "user": { - "expected": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - "config": &config.ModelConfig{ - TemplateConfig: config.TemplateConfig{ - ChatMessage: toolCallJinja, - JinjaTemplate: true, - }, - }, - "functions": []functions.Function{}, - "shouldUseFn": false, - "messages": []schema.Message{ - { - Role: "user", - StringContent: "A long time ago in a galaxy far, far away...", - }, - }, - }, -} var _ = Describe("Templates", func() { Context("chat message ChatML", func() { var 
evaluator *Evaluator @@ -237,17 +218,4 @@ var _ = Describe("Templates", func() { }) } }) - Context("chat message jinja", func() { - var evaluator *Evaluator - BeforeEach(func() { - evaluator = NewEvaluator("") - }) - for key := range jinjaTest { - foo := jinjaTest[key] - It("renders correctly `"+key+"`", func() { - templated := evaluator.TemplateMessages(schema.OpenAIRequest{}, foo["messages"].([]schema.Message), foo["config"].(*config.ModelConfig), foo["functions"].([]functions.Function), foo["shouldUseFn"].(bool)) - Expect(templated).To(Equal(foo["expected"]), templated) - }) - } - }) }) diff --git a/pkg/functions/functions.go b/pkg/functions/functions.go index 477a43bb7260..aa509a82251d 100644 --- a/pkg/functions/functions.go +++ b/pkg/functions/functions.go @@ -79,6 +79,12 @@ func (f Functions) ToJSONStructure(name, args string) JSONFunctionStructure { Type: "object", Properties: property, }) + /* + js.AnyOf = append(js.OneOf, Item{ + Type: "object", + Properties: property, + }) + */ } return js } diff --git a/pkg/functions/parse.go b/pkg/functions/parse.go index 48efb819ac91..49c4970a7609 100644 --- a/pkg/functions/parse.go +++ b/pkg/functions/parse.go @@ -53,7 +53,7 @@ type GrammarConfig struct { type GrammarTrigger struct { // Trigger is the string that triggers the grammar - Word string `yaml:"word"` + Word string `yaml:"word"` } // FunctionsConfig is the configuration for the tool/function call. From c988a1601614f260793513cb490dd708f318edc6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 6 Nov 2025 22:51:05 +0100 Subject: [PATCH 09/14] Extract grammar and other options from chat template, add schema struct Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 118 ++++++++++++ core/config/gguf.go | 2 +- core/schema/message.go | 85 +++++++++ core/schema/message_test.go | 265 ++++++++++++++++++++++++++ core/schema/schema_suite_test.go | 13 ++ 5 files changed, 482 insertions(+), 1 deletion(-) create mode 100644 core/schema/message.go create mode 100644 core/schema/message_test.go create mode 100644 core/schema/schema_suite_test.go diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 5e75125164f7..76169173b2d3 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -660,6 +660,65 @@ class BackendServiceImpl final : public backend::Backend::Service { // Apply chat template auto chat_params = common_chat_templates_apply(ctx_server.chat_templates.get(), inputs); prompt_str = chat_params.prompt; + + // Update data with grammar-related fields from chat_params + // These may include additional grammar info from the template (e.g., for tool calls) + if (!chat_params.grammar.empty()) { + // Only set grammar if json_schema is not already set (llama.cpp requirement) + if (!data.contains("json_schema")) { + data["grammar"] = chat_params.grammar; + } + } + data["grammar_lazy"] = chat_params.grammar_lazy; + + // Merge grammar triggers from chat_params + if (!chat_params.grammar_triggers.empty()) { + json grammar_triggers = json::array(); + for (const auto& trigger : chat_params.grammar_triggers) { + json trigger_json; + trigger_json["value"] = trigger.value; + // Always serialize as WORD type since upstream converts WORD to TOKEN internally + trigger_json["type"] = static_cast(COMMON_GRAMMAR_TRIGGER_TYPE_WORD); + grammar_triggers.push_back(trigger_json); + } + // Merge with existing triggers if any + if (data.contains("grammar_triggers") && data["grammar_triggers"].is_array()) 
{ + for (const auto& existing_trigger : data["grammar_triggers"]) { + grammar_triggers.push_back(existing_trigger); + } + } + data["grammar_triggers"] = grammar_triggers; + } + + // Merge preserved tokens from chat_params + if (!chat_params.preserved_tokens.empty()) { + json preserved_tokens = json::array(); + for (const auto& token_str : chat_params.preserved_tokens) { + preserved_tokens.push_back(token_str); + } + // Merge with existing preserved tokens if any + if (data.contains("preserved_tokens") && data["preserved_tokens"].is_array()) { + for (const auto& existing_token : data["preserved_tokens"]) { + preserved_tokens.push_back(existing_token); + } + } + data["preserved_tokens"] = preserved_tokens; + } + + // Add additional stops from chat_params + if (!chat_params.additional_stops.empty()) { + if (!data.contains("stop") || !data["stop"].is_array()) { + data["stop"] = json::array(); + } + for (const auto& stop : chat_params.additional_stops) { + data["stop"].push_back(stop); + } + } + + // Set thinking_forced_open if present + if (chat_params.thinking_forced_open) { + data["thinking_forced_open"] = chat_params.thinking_forced_open; + } } else { // Use prompt directly from data if (data.contains("prompt") && data["prompt"].is_string()) { @@ -882,6 +941,65 @@ class BackendServiceImpl final : public backend::Backend::Service { // Apply chat template auto chat_params = common_chat_templates_apply(ctx_server.chat_templates.get(), inputs); prompt_str = chat_params.prompt; + + // Update data with grammar-related fields from chat_params + // These may include additional grammar info from the template (e.g., for tool calls) + if (!chat_params.grammar.empty()) { + // Only set grammar if json_schema is not already set (llama.cpp requirement) + if (!data.contains("json_schema")) { + data["grammar"] = chat_params.grammar; + } + } + data["grammar_lazy"] = chat_params.grammar_lazy; + + // Merge grammar triggers from chat_params + if (!chat_params.grammar_triggers.empty()) { + json grammar_triggers = json::array(); + for (const auto& trigger : chat_params.grammar_triggers) { + json trigger_json; + trigger_json["value"] = trigger.value; + // Always serialize as WORD type since upstream converts WORD to TOKEN internally + trigger_json["type"] = static_cast(COMMON_GRAMMAR_TRIGGER_TYPE_WORD); + grammar_triggers.push_back(trigger_json); + } + // Merge with existing triggers if any + if (data.contains("grammar_triggers") && data["grammar_triggers"].is_array()) { + for (const auto& existing_trigger : data["grammar_triggers"]) { + grammar_triggers.push_back(existing_trigger); + } + } + data["grammar_triggers"] = grammar_triggers; + } + + // Merge preserved tokens from chat_params + if (!chat_params.preserved_tokens.empty()) { + json preserved_tokens = json::array(); + for (const auto& token_str : chat_params.preserved_tokens) { + preserved_tokens.push_back(token_str); + } + // Merge with existing preserved tokens if any + if (data.contains("preserved_tokens") && data["preserved_tokens"].is_array()) { + for (const auto& existing_token : data["preserved_tokens"]) { + preserved_tokens.push_back(existing_token); + } + } + data["preserved_tokens"] = preserved_tokens; + } + + // Add additional stops from chat_params + if (!chat_params.additional_stops.empty()) { + if (!data.contains("stop") || !data["stop"].is_array()) { + data["stop"] = json::array(); + } + for (const auto& stop : chat_params.additional_stops) { + data["stop"].push_back(stop); + } + } + + // Set thinking_forced_open if present + if 
(chat_params.thinking_forced_open) { + data["thinking_forced_open"] = chat_params.thinking_forced_open; + } } else { // Use prompt directly from data if (data.contains("prompt") && data["prompt"].is_string()) { diff --git a/core/config/gguf.go b/core/config/gguf.go index 9b379763dd02..b4d51936c1ea 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -84,5 +84,5 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { // Instruct to use template from llama.cpp cfg.TemplateConfig.UseTokenizerTemplate = true - //cfg.FunctionsConfig.GrammarConfig.NoGrammar = true + cfg.FunctionsConfig.GrammarConfig.NoGrammar = true } diff --git a/core/schema/message.go b/core/schema/message.go new file mode 100644 index 000000000000..793f5fca234b --- /dev/null +++ b/core/schema/message.go @@ -0,0 +1,85 @@ +package schema + +import ( + "encoding/json" + + "github.com/rs/zerolog/log" + + "github.com/mudler/LocalAI/pkg/grpc/proto" +) + +type Message struct { + // The message role + Role string `json:"role,omitempty" yaml:"role"` + + // The message name (used for tools calls) + Name string `json:"name,omitempty" yaml:"name"` + + // The message content + Content interface{} `json:"content" yaml:"content"` + + StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"` + StringImages []string `json:"string_images,omitempty" yaml:"string_images,omitempty"` + StringVideos []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"` + StringAudios []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"` + + // A result of a function call + FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"` + + ToolCalls []ToolCall `json:"tool_calls,omitempty" yaml:"tool_call,omitempty"` +} + +type ToolCall struct { + Index int `json:"index"` + ID string `json:"id"` + Type string `json:"type"` + FunctionCall FunctionCall `json:"function"` +} + +type FunctionCall struct { + Name string `json:"name,omitempty"` + Arguments string `json:"arguments"` +} + +type Messages []Message + +// MessagesToProto converts schema.Message slice to proto.Message slice +// It handles content conversion, tool_calls serialization, and optional fields +func (messages Messages) ToProto() []*proto.Message { + protoMessages := make([]*proto.Message, len(messages)) + for i, message := range messages { + protoMessages[i] = &proto.Message{ + Role: message.Role, + Name: message.Name, // needed by function calls + } + + switch ct := message.Content.(type) { + case string: + protoMessages[i].Content = ct + case []interface{}: + // If using the tokenizer template, in case of multimodal we want to keep the multimodal content as and return only strings here + data, _ := json.Marshal(ct) + resultData := []struct { + Text string `json:"text"` + }{} + json.Unmarshal(data, &resultData) + for _, r := range resultData { + protoMessages[i].Content += r.Text + } + } + + // Serialize tool_calls to JSON string if present + if len(message.ToolCalls) > 0 { + toolCallsJSON, err := json.Marshal(message.ToolCalls) + if err != nil { + log.Warn().Err(err).Msg("failed to marshal tool_calls to JSON") + } else { + protoMessages[i].ToolCalls = string(toolCallsJSON) + } + } + + // Note: tool_call_id and reasoning_content are not in schema.Message yet + // They may need to be added to schema.Message if needed in the future + } + return protoMessages +} diff --git a/core/schema/message_test.go b/core/schema/message_test.go new file mode 100644 index 
000000000000..1dd586f7685b --- /dev/null +++ b/core/schema/message_test.go @@ -0,0 +1,265 @@ +package schema_test + +import ( + "encoding/json" + + . "github.com/mudler/LocalAI/core/schema" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("LLM tests", func() { + + Context("ToProtoMessages conversion", func() { + It("should convert basic message with string content", func() { + messages := Messages{ + { + Role: "user", + Content: "Hello, world!", + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("user")) + Expect(protoMessages[0].Content).To(Equal("Hello, world!")) + Expect(protoMessages[0].Name).To(BeEmpty()) + Expect(protoMessages[0].ToolCalls).To(BeEmpty()) + }) + + It("should convert message with nil content to empty string", func() { + messages := Messages{ + { + Role: "assistant", + Content: nil, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("assistant")) + Expect(protoMessages[0].Content).To(Equal("")) + }) + + It("should convert message with array content (multimodal)", func() { + messages := Messages{ + { + Role: "user", + Content: []interface{}{ + map[string]interface{}{ + "type": "text", + "text": "Hello", + }, + map[string]interface{}{ + "type": "text", + "text": " World", + }, + }, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("user")) + Expect(protoMessages[0].Content).To(Equal("Hello World")) + }) + + It("should convert message with tool_calls", func() { + messages := Messages{ + { + Role: "assistant", + Content: "I'll call a function", + ToolCalls: []ToolCall{ + { + Index: 0, + ID: "call_123", + Type: "function", + FunctionCall: FunctionCall{ + Name: "get_weather", + Arguments: `{"location": "San Francisco"}`, + }, + }, + }, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("assistant")) + Expect(protoMessages[0].Content).To(Equal("I'll call a function")) + Expect(protoMessages[0].ToolCalls).NotTo(BeEmpty()) + + // Verify tool_calls JSON is valid + var toolCalls []ToolCall + err := json.Unmarshal([]byte(protoMessages[0].ToolCalls), &toolCalls) + Expect(err).NotTo(HaveOccurred()) + Expect(toolCalls).To(HaveLen(1)) + Expect(toolCalls[0].ID).To(Equal("call_123")) + Expect(toolCalls[0].FunctionCall.Name).To(Equal("get_weather")) + }) + + It("should convert message with name field", func() { + messages := Messages{ + { + Role: "tool", + Content: "Function result", + Name: "get_weather", + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("tool")) + Expect(protoMessages[0].Content).To(Equal("Function result")) + Expect(protoMessages[0].Name).To(Equal("get_weather")) + }) + + It("should convert message with tool_calls and nil content", func() { + messages := Messages{ + { + Role: "assistant", + Content: nil, + ToolCalls: []ToolCall{ + { + Index: 0, + ID: "call_456", + Type: "function", + FunctionCall: FunctionCall{ + Name: "search", + Arguments: `{"query": "test"}`, + }, + }, + }, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("assistant")) + Expect(protoMessages[0].Content).To(Equal("")) + Expect(protoMessages[0].ToolCalls).NotTo(BeEmpty()) + + var toolCalls 
[]ToolCall + err := json.Unmarshal([]byte(protoMessages[0].ToolCalls), &toolCalls) + Expect(err).NotTo(HaveOccurred()) + Expect(toolCalls).To(HaveLen(1)) + Expect(toolCalls[0].FunctionCall.Name).To(Equal("search")) + }) + + It("should convert multiple messages", func() { + messages := Messages{ + { + Role: "user", + Content: "Hello", + }, + { + Role: "assistant", + Content: "Hi there!", + }, + { + Role: "user", + Content: "How are you?", + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(3)) + Expect(protoMessages[0].Role).To(Equal("user")) + Expect(protoMessages[0].Content).To(Equal("Hello")) + Expect(protoMessages[1].Role).To(Equal("assistant")) + Expect(protoMessages[1].Content).To(Equal("Hi there!")) + Expect(protoMessages[2].Role).To(Equal("user")) + Expect(protoMessages[2].Content).To(Equal("How are you?")) + }) + + It("should handle empty messages slice", func() { + messages := Messages{} + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(0)) + }) + + It("should handle message with all optional fields", func() { + messages := Messages{ + { + Role: "assistant", + Content: "I'll help you", + Name: "test_tool", + ToolCalls: []ToolCall{ + { + Index: 0, + ID: "call_789", + Type: "function", + FunctionCall: FunctionCall{ + Name: "test_function", + Arguments: `{"param": "value"}`, + }, + }, + }, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("assistant")) + Expect(protoMessages[0].Content).To(Equal("I'll help you")) + Expect(protoMessages[0].Name).To(Equal("test_tool")) + Expect(protoMessages[0].ToolCalls).NotTo(BeEmpty()) + + var toolCalls []ToolCall + err := json.Unmarshal([]byte(protoMessages[0].ToolCalls), &toolCalls) + Expect(err).NotTo(HaveOccurred()) + Expect(toolCalls).To(HaveLen(1)) + }) + + It("should handle message with empty string content", func() { + messages := Messages{ + { + Role: "user", + Content: "", + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("user")) + Expect(protoMessages[0].Content).To(Equal("")) + }) + + It("should handle message with array content containing non-text parts", func() { + messages := Messages{ + { + Role: "user", + Content: []interface{}{ + map[string]interface{}{ + "type": "text", + "text": "Hello", + }, + map[string]interface{}{ + "type": "image", + "url": "https://example.com/image.jpg", + }, + }, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Role).To(Equal("user")) + // Should only extract text parts + Expect(protoMessages[0].Content).To(Equal("Hello")) + }) + }) +}) diff --git a/core/schema/schema_suite_test.go b/core/schema/schema_suite_test.go new file mode 100644 index 000000000000..685a23309451 --- /dev/null +++ b/core/schema/schema_suite_test.go @@ -0,0 +1,13 @@ +package schema_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestSchema(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "LocalAI Schema test suite") +} From 7c5ccba5ae0e14c41c6e02ed55e4b05a0eda90f3 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 7 Nov 2025 16:37:15 +0100 Subject: [PATCH 10/14] WIP Signed-off-by: Ettore Di Giacinto --- backend/backend.proto | 2 + backend/cpp/llama-cpp/grpc-server.cpp | 437 +++++++++++++----------- core/backend/llm.go | 4 +- core/http/endpoints/openai/chat.go | 18 +- core/http/endpoints/openai/inference.go | 20 +- 5 files changed, 283 insertions(+), 198 deletions(-) diff --git a/backend/backend.proto b/backend/backend.proto index c2cf02b74817..08bf0ebda5bc 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -155,6 +155,8 @@ message PredictOptions { repeated string Audios = 46; string CorrelationId = 47; string JsonSchema = 48; // JSON schema for grammar generation (when use_llama_grammar is enabled) + string Tools = 49; // JSON array of available tools/functions for tool calling + string ToolChoice = 50; // JSON string or object specifying tool choice behavior } // The response message containing the result diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 76169173b2d3..d6f155378ed0 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -178,6 +178,37 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const if (!predict->usetokenizertemplate() || predict->messages_size() == 0) { data["prompt"] = predict->prompt(); } + + // Extract tools and tool_choice from proto and add to data JSON + if (!predict->tools().empty()) { + try { + // Parse tools JSON string and add to data + json tools_json = json::parse(predict->tools()); + data["tools"] = tools_json; + SRV_INF("Extracted tools from proto: %s\n", predict->tools().c_str()); + } catch (const json::parse_error& e) { + SRV_WRN("Failed to parse tools JSON from proto: %s\n", e.what()); + } + } + if (!predict->toolchoice().empty()) { + try { + // Parse tool_choice JSON string + json tool_choice_json = json::parse(predict->toolchoice()); + // tool_choice can be a string ("auto", "none", "required") or an object + // Store it as-is (string or object) so we can convert object to "required" later when adding to body_json + if (tool_choice_json.is_string()) { + data["tool_choice"] = tool_choice_json.get(); + } else { + // Store object as-is so we can detect it later and convert to "required" + data["tool_choice"] = tool_choice_json; + } + SRV_INF("Extracted tool_choice from proto: %s\n", predict->toolchoice().c_str()); + } catch (const json::parse_error& e) { + // If parsing fails, treat as string + data["tool_choice"] = predict->toolchoice(); + SRV_INF("Extracted tool_choice as string: %s\n", predict->toolchoice().c_str()); + } + } data["ignore_eos"] = predict->ignoreeos(); data["embeddings"] = predict->embeddings(); @@ -313,9 +344,9 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions params.cpuparams.n_threads = request->threads(); params.n_gpu_layers = request->ngpulayers(); params.n_batch = request->nbatch(); - params.verbosity = INT_MAX; + //params.verbosity = INT_MAX; // Enable all debug logs by setting verbosity threshold to maximum - common_log_set_verbosity_thold(INT_MAX); + //common_log_set_verbosity_thold(INT_MAX); params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows 
for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. increase the physical batch size" // Initialize ctx_shift to false by default (can be overridden by options) @@ -596,9 +627,11 @@ class BackendServiceImpl final : public backend::Backend::Service { std::vector tasks; std::string prompt_str; + std::vector files; // Declare files early so it's accessible in both branches // Handle chat templates when UseTokenizerTemplate is enabled and Messages are provided if (request->usetokenizertemplate() && request->messages_size() > 0 && ctx_server.chat_templates != nullptr) { - // Convert proto Messages to JSON format compatible with common_chat_msgs_parse_oaicompat + // Convert proto Messages to JSON format compatible with oaicompat_chat_params_parse + json body_json; json messages_json = json::array(); for (int i = 0; i < request->messages_size(); i++) { const auto& msg = request->messages(i); @@ -606,8 +639,34 @@ class BackendServiceImpl final : public backend::Backend::Service { msg_json["role"] = msg.role(); // Handle content - can be string, null, or array + // For multimodal content, we'll embed images/audio from separate fields if (!msg.content().empty()) { msg_json["content"] = msg.content(); + } else if (request->images_size() > 0 || request->audios_size() > 0) { + // If no content but has images/audio, create content array + json content_array = json::array(); + if (request->images_size() > 0) { + for (int j = 0; j < request->images_size(); j++) { + json image_chunk; + image_chunk["type"] = "image_url"; + json image_url; + image_url["url"] = "data:image/jpeg;base64," + request->images(j); + image_chunk["image_url"] = image_url; + content_array.push_back(image_chunk); + } + } + if (request->audios_size() > 0) { + for (int j = 0; j < request->audios_size(); j++) { + json audio_chunk; + audio_chunk["type"] = "input_audio"; + json input_audio; + input_audio["data"] = request->audios(j); + input_audio["format"] = "wav"; // default, could be made configurable + audio_chunk["input_audio"] = input_audio; + content_array.push_back(audio_chunk); + } + } + msg_json["content"] = content_array; } // Add optional fields for OpenAI-compatible message format @@ -633,91 +692,69 @@ class BackendServiceImpl final : public backend::Backend::Service { messages_json.push_back(msg_json); } - // Parse messages using llama.cpp's chat message parser - // This will automatically extract tool_calls from messages if they are embedded in the JSON - // (tool_calls are typically embedded in assistant messages in OpenAI format) - auto chat_messages = common_chat_msgs_parse_oaicompat(messages_json); - - // Prepare chat template inputs - common_chat_templates_inputs inputs; - inputs.messages = chat_messages; - // Grammars are fully supported - passed from request->grammar() or request->jsonschema() - // When json_schema is provided, it takes precedence over grammar - if (data.contains("json_schema")) { - // json_schema is already a JSON object in data, convert to string for inputs - inputs.json_schema = data["json_schema"].dump(); - inputs.grammar = ""; // Don't set grammar when json_schema is provided (llama.cpp requirement) + body_json["messages"] = messages_json; + body_json["stream"] = true; // PredictStream is always streaming + + // Copy other relevant fields from data that oaicompat_chat_params_parse expects + // Tools and tool_choice are required for tool call grammar generation + if (data.contains("tools")) { + body_json["tools"] = data["tools"]; + std::string 
tools_str = data["tools"].dump(); + SRV_INF("Using tools from data: %s\n", tools_str.c_str()); } else { - inputs.grammar = data.value("grammar", ""); - inputs.json_schema = ""; // Not provided, use empty string + SRV_WRN("%s", "No tools found in data - tool calls will not work without tools field\n"); } - inputs.use_jinja = ctx_server.params_base.use_jinja; - inputs.add_generation_prompt = true; - inputs.chat_template_kwargs = ctx_server.params_base.default_template_kwargs; - // Tool calls are embedded in messages and will be parsed by common_chat_msgs_parse_oaicompat - // tools and tool_choice use defaults (empty tools vector, COMMON_CHAT_TOOL_CHOICE_AUTO) - - // Apply chat template - auto chat_params = common_chat_templates_apply(ctx_server.chat_templates.get(), inputs); - prompt_str = chat_params.prompt; - - // Update data with grammar-related fields from chat_params - // These may include additional grammar info from the template (e.g., for tool calls) - if (!chat_params.grammar.empty()) { - // Only set grammar if json_schema is not already set (llama.cpp requirement) - if (!data.contains("json_schema")) { - data["grammar"] = chat_params.grammar; + if (data.contains("tool_choice")) { + // tool_choice can be a string or object, but oaicompat_chat_params_parse expects a string + // Convert object tool_choice to "required" (since a specific function is requested) + if (data["tool_choice"].is_string()) { + body_json["tool_choice"] = data["tool_choice"].get(); + } else if (data["tool_choice"].is_object()) { + // Object tool_choice means a specific function is requested, use "required" + body_json["tool_choice"] = "required"; + std::string tool_choice_obj_str = data["tool_choice"].dump(); + SRV_INF("Converted object tool_choice to 'required': %s\n", tool_choice_obj_str.c_str()); + } else { + // Fallback: convert to string + body_json["tool_choice"] = data["tool_choice"].dump(); } + std::string tool_choice_str = body_json["tool_choice"].get(); + SRV_INF("Using tool_choice: %s\n", tool_choice_str.c_str()); + } else { + // Default to "auto" if not specified + body_json["tool_choice"] = "auto"; } - data["grammar_lazy"] = chat_params.grammar_lazy; - - // Merge grammar triggers from chat_params - if (!chat_params.grammar_triggers.empty()) { - json grammar_triggers = json::array(); - for (const auto& trigger : chat_params.grammar_triggers) { - json trigger_json; - trigger_json["value"] = trigger.value; - // Always serialize as WORD type since upstream converts WORD to TOKEN internally - trigger_json["type"] = static_cast(COMMON_GRAMMAR_TRIGGER_TYPE_WORD); - grammar_triggers.push_back(trigger_json); - } - // Merge with existing triggers if any - if (data.contains("grammar_triggers") && data["grammar_triggers"].is_array()) { - for (const auto& existing_trigger : data["grammar_triggers"]) { - grammar_triggers.push_back(existing_trigger); - } - } - data["grammar_triggers"] = grammar_triggers; + if (data.contains("json_schema")) { + body_json["json_schema"] = data["json_schema"]; } - - // Merge preserved tokens from chat_params - if (!chat_params.preserved_tokens.empty()) { - json preserved_tokens = json::array(); - for (const auto& token_str : chat_params.preserved_tokens) { - preserved_tokens.push_back(token_str); - } - // Merge with existing preserved tokens if any - if (data.contains("preserved_tokens") && data["preserved_tokens"].is_array()) { - for (const auto& existing_token : data["preserved_tokens"]) { - preserved_tokens.push_back(existing_token); - } - } - data["preserved_tokens"] = 
preserved_tokens; + // Don't copy grammar when using chat templates - the template will generate grammar + // for tool calls if tools are present. oaicompat_chat_params_parse throws an error + // if both grammar and tools are provided (see utils.hpp line 700-701) + // Grammar from templates will be merged into data after parsing + if (data.contains("response_format")) { + body_json["response_format"] = data["response_format"]; } - - // Add additional stops from chat_params - if (!chat_params.additional_stops.empty()) { - if (!data.contains("stop") || !data["stop"].is_array()) { - data["stop"] = json::array(); - } - for (const auto& stop : chat_params.additional_stops) { - data["stop"].push_back(stop); - } + if (data.contains("chat_template_kwargs")) { + body_json["chat_template_kwargs"] = data["chat_template_kwargs"]; } + + // Use the same approach as server.cpp: call oaicompat_chat_params_parse + // This handles all template application, grammar merging, etc. automatically + // Files extracted from multimodal content in messages will be added to the files vector + // Create parser options with current chat_templates to ensure tmpls is not null + oaicompat_parser_options parser_opt = ctx_server.oai_parser_opt; + parser_opt.tmpls = ctx_server.chat_templates.get(); // Ensure tmpls is set to current chat_templates + json parsed_data = oaicompat_chat_params_parse(body_json, parser_opt, files); + + // Extract the prompt from parsed data + prompt_str = parsed_data.at("prompt").get(); - // Set thinking_forced_open if present - if (chat_params.thinking_forced_open) { - data["thinking_forced_open"] = chat_params.thinking_forced_open; + // Merge all fields from parsed_data into data (grammar, grammar_triggers, preserved_tokens, etc.) + // This ensures all template-generated fields are included + for (const auto& item : parsed_data.items()) { + if (item.key() != "prompt") { // Don't overwrite prompt_str, we already extracted it + data[item.key()] = item.value(); + } } } else { // Use prompt directly from data @@ -733,26 +770,29 @@ class BackendServiceImpl final : public backend::Backend::Service { // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ? 
prompt.get().c_str() : prompt.dump(2).c_str()); - std::vector files; - const auto &images_data = data.find("image_data"); - if (images_data != data.end() && images_data->is_array()) - { - for (const auto &img : *images_data) + // If not using chat templates, extract files from image_data/audio_data fields + // (If using chat templates, files were already extracted by oaicompat_chat_params_parse) + //if (!request->usetokenizertemplate() || request->messages_size() == 0 || ctx_server.chat_templates == nullptr) { + const auto &images_data = data.find("image_data"); + if (images_data != data.end() && images_data->is_array()) { - auto decoded_data = base64_decode(img["data"].get()); - files.push_back(decoded_data); + for (const auto &img : *images_data) + { + auto decoded_data = base64_decode(img["data"].get()); + files.push_back(decoded_data); + } } - } - const auto &audio_data = data.find("audio_data"); - if (audio_data != data.end() && audio_data->is_array()) - { - for (const auto &audio : *audio_data) + const auto &audio_data = data.find("audio_data"); + if (audio_data != data.end() && audio_data->is_array()) { - auto decoded_data = base64_decode(audio["data"].get()); - files.push_back(decoded_data); + for (const auto &audio : *audio_data) + { + auto decoded_data = base64_decode(audio["data"].get()); + files.push_back(decoded_data); + } } - } + // } const bool has_mtmd = ctx_server.mctx != nullptr; @@ -874,9 +914,11 @@ class BackendServiceImpl final : public backend::Backend::Service { std::vector tasks; std::string prompt_str; + std::vector files; // Declare files early so it's accessible in both branches // Handle chat templates when UseTokenizerTemplate is enabled and Messages are provided if (request->usetokenizertemplate() && request->messages_size() > 0 && ctx_server.chat_templates != nullptr) { - // Convert proto Messages to JSON format compatible with common_chat_msgs_parse_oaicompat + // Convert proto Messages to JSON format compatible with oaicompat_chat_params_parse + json body_json; json messages_json = json::array(); for (int i = 0; i < request->messages_size(); i++) { const auto& msg = request->messages(i); @@ -884,11 +926,37 @@ class BackendServiceImpl final : public backend::Backend::Service { msg_json["role"] = msg.role(); // Handle content - can be string, null, or array - if (msg.content().empty() && !msg.tool_calls().empty()) { + // For multimodal content, we'll embed images/audio from separate fields + if (!msg.content().empty()) { + msg_json["content"] = msg.content(); + } else if (request->images_size() > 0 || request->audios_size() > 0) { + // If no content but has images/audio, create content array + json content_array = json::array(); + if (request->images_size() > 0) { + for (int j = 0; j < request->images_size(); j++) { + json image_chunk; + image_chunk["type"] = "image_url"; + json image_url; + image_url["url"] = "data:image/jpeg;base64," + request->images(j); + image_chunk["image_url"] = image_url; + content_array.push_back(image_chunk); + } + } + if (request->audios_size() > 0) { + for (int j = 0; j < request->audios_size(); j++) { + json audio_chunk; + audio_chunk["type"] = "input_audio"; + json input_audio; + input_audio["data"] = request->audios(j); + input_audio["format"] = "wav"; // default, could be made configurable + audio_chunk["input_audio"] = input_audio; + content_array.push_back(audio_chunk); + } + } + msg_json["content"] = content_array; + } else if (!msg.tool_calls().empty()) { // Tool call messages may have null content msg_json["content"] = 
json(); - } else { - msg_json["content"] = msg.content(); } // Add optional fields for OpenAI-compatible message format @@ -914,91 +982,69 @@ class BackendServiceImpl final : public backend::Backend::Service { messages_json.push_back(msg_json); } - // Parse messages using llama.cpp's chat message parser - // This will automatically extract tool_calls from messages if they are embedded in the JSON - // (tool_calls are typically embedded in assistant messages in OpenAI format) - auto chat_messages = common_chat_msgs_parse_oaicompat(messages_json); - - // Prepare chat template inputs - common_chat_templates_inputs inputs; - inputs.messages = chat_messages; - // Grammars are fully supported - passed from request->grammar() or request->jsonschema() - // When json_schema is provided, it takes precedence over grammar - if (data.contains("json_schema")) { - // json_schema is already a JSON object in data, convert to string for inputs - inputs.json_schema = data["json_schema"].dump(); - inputs.grammar = ""; // Don't set grammar when json_schema is provided (llama.cpp requirement) + body_json["messages"] = messages_json; + body_json["stream"] = false; + + // Copy other relevant fields from data that oaicompat_chat_params_parse expects + // Tools and tool_choice are required for tool call grammar generation + if (data.contains("tools")) { + body_json["tools"] = data["tools"]; + std::string tools_str = data["tools"].dump(); + SRV_INF("Using tools from data: %s\n", tools_str.c_str()); } else { - inputs.grammar = data.value("grammar", ""); - inputs.json_schema = ""; // Not provided, use empty string + SRV_WRN("%s", "No tools found in data - tool calls will not work without tools field\n"); } - inputs.use_jinja = ctx_server.params_base.use_jinja; - inputs.add_generation_prompt = true; - inputs.chat_template_kwargs = ctx_server.params_base.default_template_kwargs; - // Tool calls are embedded in messages and will be parsed by common_chat_msgs_parse_oaicompat - // tools and tool_choice use defaults (empty tools vector, COMMON_CHAT_TOOL_CHOICE_AUTO) - - // Apply chat template - auto chat_params = common_chat_templates_apply(ctx_server.chat_templates.get(), inputs); - prompt_str = chat_params.prompt; - - // Update data with grammar-related fields from chat_params - // These may include additional grammar info from the template (e.g., for tool calls) - if (!chat_params.grammar.empty()) { - // Only set grammar if json_schema is not already set (llama.cpp requirement) - if (!data.contains("json_schema")) { - data["grammar"] = chat_params.grammar; + if (data.contains("tool_choice")) { + // tool_choice can be a string or object, but oaicompat_chat_params_parse expects a string + // Convert object tool_choice to "required" (since a specific function is requested) + if (data["tool_choice"].is_string()) { + body_json["tool_choice"] = data["tool_choice"].get(); + } else if (data["tool_choice"].is_object()) { + // Object tool_choice means a specific function is requested, use "required" + body_json["tool_choice"] = "required"; + std::string tool_choice_obj_str = data["tool_choice"].dump(); + SRV_INF("Converted object tool_choice to 'required': %s\n", tool_choice_obj_str.c_str()); + } else { + // Fallback: convert to string + body_json["tool_choice"] = data["tool_choice"].dump(); } + std::string tool_choice_str = body_json["tool_choice"].get(); + SRV_INF("Using tool_choice: %s\n", tool_choice_str.c_str()); + } else { + // Default to "auto" if not specified + body_json["tool_choice"] = "auto"; } - data["grammar_lazy"] = 
chat_params.grammar_lazy; - - // Merge grammar triggers from chat_params - if (!chat_params.grammar_triggers.empty()) { - json grammar_triggers = json::array(); - for (const auto& trigger : chat_params.grammar_triggers) { - json trigger_json; - trigger_json["value"] = trigger.value; - // Always serialize as WORD type since upstream converts WORD to TOKEN internally - trigger_json["type"] = static_cast(COMMON_GRAMMAR_TRIGGER_TYPE_WORD); - grammar_triggers.push_back(trigger_json); - } - // Merge with existing triggers if any - if (data.contains("grammar_triggers") && data["grammar_triggers"].is_array()) { - for (const auto& existing_trigger : data["grammar_triggers"]) { - grammar_triggers.push_back(existing_trigger); - } - } - data["grammar_triggers"] = grammar_triggers; + if (data.contains("json_schema")) { + body_json["json_schema"] = data["json_schema"]; } - - // Merge preserved tokens from chat_params - if (!chat_params.preserved_tokens.empty()) { - json preserved_tokens = json::array(); - for (const auto& token_str : chat_params.preserved_tokens) { - preserved_tokens.push_back(token_str); - } - // Merge with existing preserved tokens if any - if (data.contains("preserved_tokens") && data["preserved_tokens"].is_array()) { - for (const auto& existing_token : data["preserved_tokens"]) { - preserved_tokens.push_back(existing_token); - } - } - data["preserved_tokens"] = preserved_tokens; + // Don't copy grammar when using chat templates - the template will generate grammar + // for tool calls if tools are present. oaicompat_chat_params_parse throws an error + // if both grammar and tools are provided (see utils.hpp line 700-701) + // Grammar from templates will be merged into data after parsing + if (data.contains("response_format")) { + body_json["response_format"] = data["response_format"]; } - - // Add additional stops from chat_params - if (!chat_params.additional_stops.empty()) { - if (!data.contains("stop") || !data["stop"].is_array()) { - data["stop"] = json::array(); - } - for (const auto& stop : chat_params.additional_stops) { - data["stop"].push_back(stop); - } + if (data.contains("chat_template_kwargs")) { + body_json["chat_template_kwargs"] = data["chat_template_kwargs"]; } + + // Use the same approach as server.cpp: call oaicompat_chat_params_parse + // This handles all template application, grammar merging, etc. automatically + // Files extracted from multimodal content in messages will be added to the files vector + // Create parser options with current chat_templates to ensure tmpls is not null + oaicompat_parser_options parser_opt = ctx_server.oai_parser_opt; + parser_opt.tmpls = ctx_server.chat_templates.get(); // Ensure tmpls is set to current chat_templates + json parsed_data = oaicompat_chat_params_parse(body_json, parser_opt, files); - // Set thinking_forced_open if present - if (chat_params.thinking_forced_open) { - data["thinking_forced_open"] = chat_params.thinking_forced_open; + // Extract the prompt from parsed data + prompt_str = parsed_data.at("prompt").get(); + + // Merge all fields from parsed_data into data (grammar, grammar_triggers, preserved_tokens, etc.) 
+ // This ensures all template-generated fields are included + for (const auto& item : parsed_data.items()) { + if (item.key() != "prompt") { // Don't overwrite prompt_str, we already extracted it + data[item.key()] = item.value(); + } } } else { // Use prompt directly from data @@ -1014,30 +1060,31 @@ class BackendServiceImpl final : public backend::Backend::Service { // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); - std::vector files; - const auto &images_data = data.find("image_data"); - // std::cout << "[PREDICT] Images data: " << images_data->dump(2) << std::endl; - - if (images_data != data.end() && images_data->is_array()) - { - std::cout << "[PREDICT] Processing " << images_data->size() << " images" << std::endl; - for (const auto &img : *images_data) + // If not using chat templates, extract files from image_data/audio_data fields + // (If using chat templates, files were already extracted by oaicompat_chat_params_parse) + // if (!request->usetokenizertemplate() || request->messages_size() == 0 || ctx_server.chat_templates == nullptr) { + const auto &images_data = data.find("image_data"); + if (images_data != data.end() && images_data->is_array()) { - std::cout << "[PREDICT] Processing image" << std::endl; - auto decoded_data = base64_decode(img["data"].get()); - files.push_back(decoded_data); + std::cout << "[PREDICT] Processing " << images_data->size() << " images" << std::endl; + for (const auto &img : *images_data) + { + std::cout << "[PREDICT] Processing image" << std::endl; + auto decoded_data = base64_decode(img["data"].get()); + files.push_back(decoded_data); + } } - } - const auto &audio_data = data.find("audio_data"); - if (audio_data != data.end() && audio_data->is_array()) - { - for (const auto &audio : *audio_data) + const auto &audio_data = data.find("audio_data"); + if (audio_data != data.end() && audio_data->is_array()) { - auto decoded_data = base64_decode(audio["data"].get()); - files.push_back(decoded_data); + for (const auto &audio : *audio_data) + { + auto decoded_data = base64_decode(audio["data"].get()); + files.push_back(decoded_data); + } } - } + // } // process files const bool has_mtmd = ctx_server.mctx != nullptr; diff --git a/core/backend/llm.go b/core/backend/llm.go index b2d06fbd34cc..9b461e616ff3 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -33,7 +33,7 @@ type TokenUsage struct { TimingTokenGeneration float64 } -func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, jsonSchema string) (func() (LLMResponse, error), error) { +func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, jsonSchema string, tools string, toolChoice string) (func() (LLMResponse, error), error) { modelFile := c.Model // Check if the modelFile exists, if it doesn't try to load it from the gallery @@ -77,6 +77,8 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima opts.Images = images opts.Videos = videos opts.Audios = audios + opts.Tools = tools + opts.ToolChoice = toolChoice 
tokenUsage := TokenUsage{} diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 16c8bb37bf8e..d84518bdca6b 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -614,7 +614,23 @@ func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, in audios = append(audios, m.StringAudios...) } - predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, input.JSONSchema) + // Serialize tools and tool_choice to JSON strings + toolsJSON := "" + if len(input.Tools) > 0 { + toolsBytes, err := json.Marshal(input.Tools) + if err == nil { + toolsJSON = string(toolsBytes) + } + } + toolChoiceJSON := "" + if input.ToolsChoice != nil { + toolChoiceBytes, err := json.Marshal(input.ToolsChoice) + if err == nil { + toolChoiceJSON = string(toolChoiceBytes) + } + } + + predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, input.JSONSchema, toolsJSON, toolChoiceJSON) if err != nil { log.Error().Err(err).Msg("model inference failed") return "", err diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go index 1a168c5acb7a..65a9f6962a4f 100644 --- a/core/http/endpoints/openai/inference.go +++ b/core/http/endpoints/openai/inference.go @@ -1,6 +1,8 @@ package openai import ( + "encoding/json" + "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" @@ -37,9 +39,25 @@ func ComputeChoices( audios = append(audios, m.StringAudios...) } + // Serialize tools and tool_choice to JSON strings + toolsJSON := "" + if len(req.Tools) > 0 { + toolsBytes, err := json.Marshal(req.Tools) + if err == nil { + toolsJSON = string(toolsBytes) + } + } + toolChoiceJSON := "" + if req.ToolsChoice != nil { + toolChoiceBytes, err := json.Marshal(req.ToolsChoice) + if err == nil { + toolChoiceJSON = string(toolChoiceBytes) + } + } + // get the model function to call for the result predFunc, err := backend.ModelInference( - req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, req.JSONSchema) + req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, req.JSONSchema, toolsJSON, toolChoiceJSON) if err != nil { return result, backend.TokenUsage{}, err } From 270f70adc48fde4de735c089c92ea772d4973784 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 7 Nov 2025 16:56:15 +0100 Subject: [PATCH 11/14] WIP Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 174 +++++++++++++++++--------- core/http/endpoints/openai/chat.go | 14 --- 2 files changed, 116 insertions(+), 72 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index d6f155378ed0..0723866162de 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -695,42 +695,59 @@ class BackendServiceImpl final : public backend::Backend::Service { body_json["messages"] = messages_json; body_json["stream"] = true; // PredictStream is always streaming + // Check if grammar is provided from Go layer (NoGrammar=false) + // If grammar is provided, we must use it and NOT let template generate grammar from tools + // oaicompat_chat_params_parse throws an error if both grammar and tools are provided + bool has_grammar_from_go = data.contains("grammar") && + data["grammar"].is_string() && + 
!data["grammar"].get().empty(); + // Copy other relevant fields from data that oaicompat_chat_params_parse expects - // Tools and tool_choice are required for tool call grammar generation - if (data.contains("tools")) { - body_json["tools"] = data["tools"]; - std::string tools_str = data["tools"].dump(); - SRV_INF("Using tools from data: %s\n", tools_str.c_str()); - } else { - SRV_WRN("%s", "No tools found in data - tool calls will not work without tools field\n"); - } - if (data.contains("tool_choice")) { - // tool_choice can be a string or object, but oaicompat_chat_params_parse expects a string - // Convert object tool_choice to "required" (since a specific function is requested) - if (data["tool_choice"].is_string()) { - body_json["tool_choice"] = data["tool_choice"].get(); - } else if (data["tool_choice"].is_object()) { - // Object tool_choice means a specific function is requested, use "required" - body_json["tool_choice"] = "required"; - std::string tool_choice_obj_str = data["tool_choice"].dump(); - SRV_INF("Converted object tool_choice to 'required': %s\n", tool_choice_obj_str.c_str()); + // Tools and tool_choice are only passed when NoGrammar is true (grammar not provided) + // When grammar is provided from Go layer, we use it instead of template-generated grammar + if (!has_grammar_from_go) { + // NoGrammar=true: pass tools and let template generate grammar + if (data.contains("tools")) { + body_json["tools"] = data["tools"]; + std::string tools_str = data["tools"].dump(); + SRV_INF("Using tools from data (NoGrammar=true): %s\n", tools_str.c_str()); + } else { + SRV_WRN("%s", "No tools found in data - tool calls will not work without tools field\n"); + } + if (data.contains("tool_choice")) { + // tool_choice can be a string or object, but oaicompat_chat_params_parse expects a string + // Convert object tool_choice to "required" (since a specific function is requested) + if (data["tool_choice"].is_string()) { + body_json["tool_choice"] = data["tool_choice"].get(); + } else if (data["tool_choice"].is_object()) { + // Object tool_choice means a specific function is requested, use "required" + body_json["tool_choice"] = "required"; + std::string tool_choice_obj_str = data["tool_choice"].dump(); + SRV_INF("Converted object tool_choice to 'required': %s\n", tool_choice_obj_str.c_str()); + } else { + // Fallback: convert to string + body_json["tool_choice"] = data["tool_choice"].dump(); + } + std::string tool_choice_str = body_json["tool_choice"].get(); + SRV_INF("Using tool_choice: %s\n", tool_choice_str.c_str()); } else { - // Fallback: convert to string - body_json["tool_choice"] = data["tool_choice"].dump(); + // Default to "auto" if not specified + body_json["tool_choice"] = "auto"; } - std::string tool_choice_str = body_json["tool_choice"].get(); - SRV_INF("Using tool_choice: %s\n", tool_choice_str.c_str()); } else { - // Default to "auto" if not specified - body_json["tool_choice"] = "auto"; + // Grammar is provided from Go layer (NoGrammar=false) - use it, don't pass tools + SRV_INF("%s", "Grammar provided from Go layer - using it instead of template-generated grammar\n"); + // Grammar will be copied from data after parsing (it's already in data) } + if (data.contains("json_schema")) { body_json["json_schema"] = data["json_schema"]; } - // Don't copy grammar when using chat templates - the template will generate grammar - // for tool calls if tools are present. 
oaicompat_chat_params_parse throws an error - // if both grammar and tools are provided (see utils.hpp line 700-701) - // Grammar from templates will be merged into data after parsing + // If grammar is provided from Go layer, copy it to body_json so it's preserved + // (though oaicompat_chat_params_parse may not use it if tools are present) + if (has_grammar_from_go) { + body_json["grammar"] = data["grammar"]; + } if (data.contains("response_format")) { body_json["response_format"] = data["response_format"]; } @@ -749,11 +766,23 @@ class BackendServiceImpl final : public backend::Backend::Service { // Extract the prompt from parsed data prompt_str = parsed_data.at("prompt").get(); + // Preserve grammar from Go layer if it was provided (NoGrammar=false) + // Otherwise, use grammar from parsed_data (template-generated when NoGrammar=true) + json preserved_grammar; + if (has_grammar_from_go && data.contains("grammar")) { + preserved_grammar = data["grammar"]; + } + // Merge all fields from parsed_data into data (grammar, grammar_triggers, preserved_tokens, etc.) // This ensures all template-generated fields are included for (const auto& item : parsed_data.items()) { if (item.key() != "prompt") { // Don't overwrite prompt_str, we already extracted it - data[item.key()] = item.value(); + // If grammar was provided from Go layer, preserve it instead of template-generated grammar + if (item.key() == "grammar" && has_grammar_from_go && !preserved_grammar.is_null()) { + data["grammar"] = preserved_grammar; + } else { + data[item.key()] = item.value(); + } } } } else { @@ -985,42 +1014,59 @@ class BackendServiceImpl final : public backend::Backend::Service { body_json["messages"] = messages_json; body_json["stream"] = false; + // Check if grammar is provided from Go layer (NoGrammar=false) + // If grammar is provided, we must use it and NOT let template generate grammar from tools + // oaicompat_chat_params_parse throws an error if both grammar and tools are provided + bool has_grammar_from_go = data.contains("grammar") && + data["grammar"].is_string() && + !data["grammar"].get().empty(); + // Copy other relevant fields from data that oaicompat_chat_params_parse expects - // Tools and tool_choice are required for tool call grammar generation - if (data.contains("tools")) { - body_json["tools"] = data["tools"]; - std::string tools_str = data["tools"].dump(); - SRV_INF("Using tools from data: %s\n", tools_str.c_str()); - } else { - SRV_WRN("%s", "No tools found in data - tool calls will not work without tools field\n"); - } - if (data.contains("tool_choice")) { - // tool_choice can be a string or object, but oaicompat_chat_params_parse expects a string - // Convert object tool_choice to "required" (since a specific function is requested) - if (data["tool_choice"].is_string()) { - body_json["tool_choice"] = data["tool_choice"].get(); - } else if (data["tool_choice"].is_object()) { - // Object tool_choice means a specific function is requested, use "required" - body_json["tool_choice"] = "required"; - std::string tool_choice_obj_str = data["tool_choice"].dump(); - SRV_INF("Converted object tool_choice to 'required': %s\n", tool_choice_obj_str.c_str()); + // Tools and tool_choice are only passed when NoGrammar is true (grammar not provided) + // When grammar is provided from Go layer, we use it instead of template-generated grammar + if (!has_grammar_from_go) { + // NoGrammar=true: pass tools and let template generate grammar + if (data.contains("tools")) { + body_json["tools"] = data["tools"]; + 
std::string tools_str = data["tools"].dump(); + SRV_INF("Using tools from data (NoGrammar=true): %s\n", tools_str.c_str()); + } else { + SRV_WRN("%s", "No tools found in data - tool calls will not work without tools field\n"); + } + if (data.contains("tool_choice")) { + // tool_choice can be a string or object, but oaicompat_chat_params_parse expects a string + // Convert object tool_choice to "required" (since a specific function is requested) + if (data["tool_choice"].is_string()) { + body_json["tool_choice"] = data["tool_choice"].get(); + } else if (data["tool_choice"].is_object()) { + // Object tool_choice means a specific function is requested, use "required" + body_json["tool_choice"] = "required"; + std::string tool_choice_obj_str = data["tool_choice"].dump(); + SRV_INF("Converted object tool_choice to 'required': %s\n", tool_choice_obj_str.c_str()); + } else { + // Fallback: convert to string + body_json["tool_choice"] = data["tool_choice"].dump(); + } + std::string tool_choice_str = body_json["tool_choice"].get(); + SRV_INF("Using tool_choice: %s\n", tool_choice_str.c_str()); } else { - // Fallback: convert to string - body_json["tool_choice"] = data["tool_choice"].dump(); + // Default to "auto" if not specified + body_json["tool_choice"] = "auto"; } - std::string tool_choice_str = body_json["tool_choice"].get(); - SRV_INF("Using tool_choice: %s\n", tool_choice_str.c_str()); } else { - // Default to "auto" if not specified - body_json["tool_choice"] = "auto"; + // Grammar is provided from Go layer (NoGrammar=false) - use it, don't pass tools + SRV_INF("%s", "Grammar provided from Go layer - using it instead of template-generated grammar\n"); + // Grammar will be copied from data after parsing (it's already in data) } + if (data.contains("json_schema")) { body_json["json_schema"] = data["json_schema"]; } - // Don't copy grammar when using chat templates - the template will generate grammar - // for tool calls if tools are present. oaicompat_chat_params_parse throws an error - // if both grammar and tools are provided (see utils.hpp line 700-701) - // Grammar from templates will be merged into data after parsing + // If grammar is provided from Go layer, copy it to body_json so it's preserved + // (though oaicompat_chat_params_parse may not use it if tools are present) + if (has_grammar_from_go) { + body_json["grammar"] = data["grammar"]; + } if (data.contains("response_format")) { body_json["response_format"] = data["response_format"]; } @@ -1039,11 +1085,23 @@ class BackendServiceImpl final : public backend::Backend::Service { // Extract the prompt from parsed data prompt_str = parsed_data.at("prompt").get(); + // Preserve grammar from Go layer if it was provided (NoGrammar=false) + // Otherwise, use grammar from parsed_data (template-generated when NoGrammar=true) + json preserved_grammar; + if (has_grammar_from_go && data.contains("grammar")) { + preserved_grammar = data["grammar"]; + } + // Merge all fields from parsed_data into data (grammar, grammar_triggers, preserved_tokens, etc.) 
// This ensures all template-generated fields are included for (const auto& item : parsed_data.items()) { if (item.key() != "prompt") { // Don't overwrite prompt_str, we already extracted it - data[item.key()] = item.value(); + // If grammar was provided from Go layer, preserve it instead of template-generated grammar + if (item.key() == "grammar" && has_grammar_from_go && !preserved_grammar.is_null()) { + data["grammar"] = preserved_grammar; + } else { + data[item.key()] = item.value(); + } } } } else { diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index d84518bdca6b..d7b80bd8cb55 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -300,21 +300,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator } else { log.Error().Err(err).Msg("Failed generating grammar") } - // Pass jsonschema to the backend for grammar generation - case config.FunctionsConfig.GrammarConfig.NoGrammar && shouldUseFn && config.TemplateConfig.UseTokenizerTemplate: - if config.FunctionToCall() != "" { - funcs = funcs.Select(config.FunctionToCall()) - } - // Update input grammar or json_schema based on use_llama_grammar option - jsStruct := funcs.ToJSONStructure(config.FunctionsConfig.FunctionNameKey, config.FunctionsConfig.FunctionNameKey) - schemaBytes, err := json.Marshal(jsStruct) - if err == nil { - log.Debug().Msgf("JSONSchema: %s", string(schemaBytes)) - input.JSONSchema = string(schemaBytes) - } else { - log.Error().Err(err).Msg("Failed marshaling json_schema for functions") - } default: // Force picking one of the functions by the request if config.FunctionToCall() != "" { From 0238dcfaf5d2e204c7d44b12650108f0c50bfb73 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 7 Nov 2025 18:31:48 +0100 Subject: [PATCH 12/14] Automatically set use_jinja Signed-off-by: Ettore Di Giacinto --- core/config/gguf.go | 1 + 1 file changed, 1 insertion(+) diff --git a/core/config/gguf.go b/core/config/gguf.go index b4d51936c1ea..ca1a651a3a51 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -85,4 +85,5 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { // Instruct to use template from llama.cpp cfg.TemplateConfig.UseTokenizerTemplate = true cfg.FunctionsConfig.GrammarConfig.NoGrammar = true + cfg.Options = append(cfg.Options, "use_jinja:true") } From c7379aa8553fc1660fb91292ad578c2c45c89ff5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 7 Nov 2025 18:44:06 +0100 Subject: [PATCH 13/14] Cleanups, identify by default gguf models for chat Signed-off-by: Ettore Di Giacinto --- backend/backend.proto | 5 ++-- backend/cpp/llama-cpp/grpc-server.cpp | 39 +------------------------ core/backend/llm.go | 3 +- core/config/gguf.go | 1 + core/config/model_config.go | 21 ++++++++----- core/http/endpoints/openai/chat.go | 2 +- core/http/endpoints/openai/inference.go | 2 +- core/schema/openai.go | 2 -- 8 files changed, 20 insertions(+), 55 deletions(-) diff --git a/backend/backend.proto b/backend/backend.proto index 08bf0ebda5bc..a367523de5c6 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -154,9 +154,8 @@ message PredictOptions { repeated string Videos = 45; repeated string Audios = 46; string CorrelationId = 47; - string JsonSchema = 48; // JSON schema for grammar generation (when use_llama_grammar is enabled) - string Tools = 49; // JSON array of available tools/functions for tool calling - string ToolChoice = 50; // JSON string or object specifying tool 
choice behavior + string Tools = 48; // JSON array of available tools/functions for tool calling + string ToolChoice = 49; // JSON string or object specifying tool choice behavior } // The response message containing the result diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 0723866162de..aa6275e7f3b6 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -124,55 +124,18 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const // Priority: JsonSchema field > grammar field (when use_llama_grammar is enabled) // IMPORTANT: server.cpp requires: if json_schema exists, grammar must NOT exist // See server.cpp line 420: if (data.contains("json_schema") && !data.contains("grammar")) - std::string json_schema_str = predict->jsonschema(); std::string grammar_str = predict->grammar(); // Debug logging - if (!json_schema_str.empty()) { - SRV_INF("Received JsonSchema field: %s\n", json_schema_str.c_str()); - } if (!grammar_str.empty()) { SRV_INF("Received Grammar field: %s\n", grammar_str.c_str()); } - if (!json_schema_str.empty()) { - // JsonSchema field is set - use it directly (highest priority) - try { - json json_schema_obj = json::parse(json_schema_str); - // Ensure json_schema is a JSON object (not a string) - // json_schema_to_grammar expects a JSON object representing the schema - if (json_schema_obj.is_object() || json_schema_obj.is_array()) { - data["json_schema"] = json_schema_obj; - SRV_INF("Set json_schema in data: %s\n", json_schema_obj.dump(2).c_str()); - // Explicitly ensure grammar is NOT set when json_schema is provided - // This matches server.cpp's requirement: !data.contains("grammar") - // Do NOT set data["grammar"] here - } else { - // If it's not an object/array, it's invalid - fall back to grammar - SRV_INF("%s", "JsonSchema is not a valid JSON object/array, falling back to grammar\n"); - if (!grammar_str.empty()) { - data["grammar"] = grammar_str; - } - } - } catch (const json::parse_error& e) { - // If json_schema is invalid JSON, fall back to grammar - SRV_INF("Failed to parse JsonSchema as JSON: %s, falling back to grammar\n", e.what()); - if (!grammar_str.empty()) { - data["grammar"] = grammar_str; - } - } - } else if (!grammar_str.empty()) { + if (!grammar_str.empty()) { data["grammar"] = grammar_str; SRV_INF("Using grammar as-is: %s\n", grammar_str.c_str()); } - // Final check: ensure we don't have both json_schema and grammar set - // This should never happen with the logic above, but double-check for safety - if (data.contains("json_schema") && data.contains("grammar")) { - SRV_WRN("%s", "Both json_schema and grammar are set - removing grammar to match server.cpp requirement\n"); - data.erase("grammar"); - } - // Only set prompt if UseTokenizerTemplate is false or if no Messages are provided // When UseTokenizerTemplate is true and Messages are provided, prompt will be set via chat templates in Predict/PredictStream if (!predict->usetokenizertemplate() || predict->messages_size() == 0) { diff --git a/core/backend/llm.go b/core/backend/llm.go index 9b461e616ff3..d6c7bc736e93 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -33,7 +33,7 @@ type TokenUsage struct { TimingTokenGeneration float64 } -func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, 
TokenUsage) bool, jsonSchema string, tools string, toolChoice string) (func() (LLMResponse, error), error) { +func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string) (func() (LLMResponse, error), error) { modelFile := c.Model // Check if the modelFile exists, if it doesn't try to load it from the gallery @@ -73,7 +73,6 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima opts.Prompt = s opts.Messages = protoMessages opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate - opts.JsonSchema = jsonSchema opts.Images = images opts.Videos = videos opts.Audios = audios diff --git a/core/config/gguf.go b/core/config/gguf.go index ca1a651a3a51..6d67d798bd9b 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -86,4 +86,5 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { cfg.TemplateConfig.UseTokenizerTemplate = true cfg.FunctionsConfig.GrammarConfig.NoGrammar = true cfg.Options = append(cfg.Options, "use_jinja:true") + cfg.KnownUsecaseStrings = append(cfg.KnownUsecaseStrings, "FLAG_CHAT") } diff --git a/core/config/model_config.go b/core/config/model_config.go index 921b808e7678..87fa05fce8ca 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -268,14 +268,7 @@ type TemplateConfig struct { ReplyPrefix string `yaml:"reply_prefix" json:"reply_prefix"` } -func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error { - type BCAlias ModelConfig - var aux BCAlias - if err := value.Decode(&aux); err != nil { - return err - } - *c = ModelConfig(aux) - +func (c *ModelConfig) syncKnownUsecasesFromString() { c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings) // Make sure the usecases are valid, we rewrite with what we identified c.KnownUsecaseStrings = []string{} @@ -284,6 +277,17 @@ func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error { c.KnownUsecaseStrings = append(c.KnownUsecaseStrings, k) } } +} + +func (c *ModelConfig) UnmarshalYAML(value *yaml.Node) error { + type BCAlias ModelConfig + var aux BCAlias + if err := value.Decode(&aux); err != nil { + return err + } + *c = ModelConfig(aux) + + c.syncKnownUsecasesFromString() return nil } @@ -460,6 +464,7 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { } guessDefaultsFromFile(cfg, lo.modelPath, ctx) + cfg.syncKnownUsecasesFromString() } func (c *ModelConfig) Validate() bool { diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index d7b80bd8cb55..d1ce156215c4 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -616,7 +616,7 @@ func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, in } } - predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, input.JSONSchema, toolsJSON, toolChoiceJSON) + predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, toolsJSON, toolChoiceJSON) if err != nil { log.Error().Err(err).Msg("model inference failed") return "", err diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go index 65a9f6962a4f..95d3ee24671d 100644 --- a/core/http/endpoints/openai/inference.go +++ 
b/core/http/endpoints/openai/inference.go @@ -57,7 +57,7 @@ func ComputeChoices( // get the model function to call for the result predFunc, err := backend.ModelInference( - req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, req.JSONSchema, toolsJSON, toolChoiceJSON) + req.Context, predInput, req.Messages, images, videos, audios, loader, config, bcl, o, tokenCallback, toolsJSON, toolChoiceJSON) if err != nil { return result, backend.TokenUsage{}, err } diff --git a/core/schema/openai.go b/core/schema/openai.go index 8fb0b740c488..49e18642f541 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -145,8 +145,6 @@ type OpenAIRequest struct { // A grammar to constrain the LLM output Grammar string `json:"grammar" yaml:"grammar"` - JSONSchema string `json:"json_schema" yaml:"json_schema"` - JSONFunctionGrammarObject *functions.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"` Backend string `json:"backend" yaml:"backend"` From 234c2aef88ae130e2a83ee259ced8872a77833f8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 7 Nov 2025 18:52:05 +0100 Subject: [PATCH 14/14] Update docs Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 14 +++----- docs/content/docs/features/text-generation.md | 32 +++++++++++++++++-- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index aa6275e7f3b6..a33dc5c20da3 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -120,20 +120,14 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const data["n_keep"] = predict->nkeep(); data["seed"] = predict->seed(); - // Handle grammar/json_schema based on use_llama_grammar flag - // Priority: JsonSchema field > grammar field (when use_llama_grammar is enabled) - // IMPORTANT: server.cpp requires: if json_schema exists, grammar must NOT exist - // See server.cpp line 420: if (data.contains("json_schema") && !data.contains("grammar")) + std::string grammar_str = predict->grammar(); - // Debug logging - if (!grammar_str.empty()) { - SRV_INF("Received Grammar field: %s\n", grammar_str.c_str()); - } + if (!grammar_str.empty()) { - data["grammar"] = grammar_str; - SRV_INF("Using grammar as-is: %s\n", grammar_str.c_str()); + data["grammar"] = grammar_str; + SRV_INF("Using grammar: %s\n", grammar_str.c_str()); } // Only set prompt if UseTokenizerTemplate is false or if no Messages are provided diff --git a/docs/content/docs/features/text-generation.md b/docs/content/docs/features/text-generation.md index c4e637f7040c..70c2c7524b2e 100644 --- a/docs/content/docs/features/text-generation.md +++ b/docs/content/docs/features/text-generation.md @@ -128,16 +128,44 @@ Models can be also preloaded or downloaded on demand. To learn about model galle #### YAML configuration -To use the `llama.cpp` backend, specify `llama` as the backend in the YAML file: +To use the `llama.cpp` backend, specify `llama-cpp` as the backend in the YAML file: ```yaml name: llama -backend: llama +backend: llama-cpp parameters: # Relative to the models path model: file.gguf ``` +#### Backend Options + +The `llama.cpp` backend supports additional configuration options that can be specified in the `options` field of your model YAML configuration. 
These options allow fine-tuning of the backend behavior:
+
+| Option | Type | Description | Example |
+|--------|------|-------------|---------|
+| `use_jinja` or `jinja` | boolean | Enable Jinja2 template processing for chat templates. When enabled, the backend uses Jinja2-based chat templates from the model for formatting messages. | `use_jinja:true` |
+| `context_shift` | boolean | Enable context shifting, which allows the model to dynamically adjust context window usage. | `context_shift:true` |
+| `cache_ram` | integer | Set the maximum RAM cache size in MiB for the KV cache. Use `-1` for unlimited (default). | `cache_ram:2048` |
+| `parallel` or `n_parallel` | integer | Enable parallel request processing. When set to a value greater than 1, enables continuous batching for handling multiple requests concurrently. | `parallel:4` |
+| `grpc_servers` or `rpc_servers` | string | Comma-separated list of gRPC server addresses for distributed inference. Allows distributing workload across multiple llama.cpp workers. | `grpc_servers:localhost:50051,localhost:50052` |
+
+**Example configuration with options:**
+
+```yaml
+name: llama-model
+backend: llama-cpp
+parameters:
+  model: model.gguf
+options:
+  - use_jinja:true
+  - context_shift:true
+  - cache_ram:4096
+  - parallel:2
+```
+
+**Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables.
+
 #### Reference
 
 - [llama](https://github.com/ggerganov/llama.cpp)
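As a sketch of how the options documented above compose for distributed inference: `grpc_servers` points the backend at external llama.cpp RPC workers, while a `parallel` value greater than 1 turns on continuous batching. The model name, model file, and worker addresses below are placeholders, not values taken from this patch.

```yaml
# Hypothetical distributed setup: two llama.cpp RPC workers plus local parallelism.
name: llama-distributed            # placeholder model name
backend: llama-cpp
parameters:
  model: model.gguf                # placeholder model file
options:
  - use_jinja:true
  - parallel:4                                            # >1 enables continuous batching
  - grpc_servers:192.168.1.10:50052,192.168.1.11:50052    # placeholder worker addresses
```

Because options in the YAML take precedence, this configuration behaves the same whether or not `LLAMACPP_PARALLEL` or `LLAMACPP_GRPC_SERVERS` are set in the environment.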