From 97167eb633955bc8bd9daf36f67b76c1968dab3f Mon Sep 17 00:00:00 2001
From: corebonts
Date: Fri, 14 Mar 2025 06:14:50 +0100
Subject: [PATCH 1/3] Add stream_options parameter for OpenAI compatible endpoints

---
 llamafile/server/doc/v1_chat_completions.md | 13 +++++++++
 llamafile/server/v1_chat_completions.cpp    | 29 +++++++++++++++++++++
 llamafile/server/v1_completions.cpp         | 29 +++++++++++++++++++++
 3 files changed, 71 insertions(+)

diff --git a/llamafile/server/doc/v1_chat_completions.md b/llamafile/server/doc/v1_chat_completions.md
index 4eda4aa77a..14d8df93d1 100644
--- a/llamafile/server/doc/v1_chat_completions.md
+++ b/llamafile/server/doc/v1_chat_completions.md
@@ -78,6 +78,19 @@ This endpoint supports the following features:
   will be named delta instead. It's assumed the client will reconstruct
   the full conversation.
 
+- `stream_options`: `object|null`
+
+  Options for streaming the API response. This parameter is only
+  applicable when `stream: true` is also specified. Default is `null`.
+
+  - `include_usage`: `boolean|null`
+
+    Whether to include usage statistics in the streaming response. Default is `false`.
+
+    If set to `true`, a `usage` field with the usage information will be
+    included in an additional empty chunk. Note that all other chunks will
+    also contain this field, but with a `null` value.
+
 - `max_tokens`: `integer|null`
 
   Specifies an upper bound for the number of tokens that can be
diff --git a/llamafile/server/v1_chat_completions.cpp b/llamafile/server/v1_chat_completions.cpp
index 659815ee62..b362341fea 100644
--- a/llamafile/server/v1_chat_completions.cpp
+++ b/llamafile/server/v1_chat_completions.cpp
@@ -46,6 +46,7 @@ namespace server {
 struct V1ChatCompletionParams
 {
     bool stream = false;
+    bool stream_include_usage = false;
     long max_tokens = -1;
     long seed = _rand64();
     double top_p = 1;
@@ -276,6 +277,26 @@ Client::get_v1_chat_completions_params(V1ChatCompletionParams* params)
         if (!stream.isBool())
             return send_error(400, "stream field must be boolean");
         params->stream = stream.getBool();
+
+        // stream_options: object|null
+        //
+        // Options for the streaming response.
+        Json& stream_options = json["stream_options"];
+        if (!stream_options.isNull()) {
+            if (!stream_options.isObject())
+                return send_error(400, "stream_options field must be object");
+
+            // include_usage: bool|null
+            //
+            // Also include usage for streaming responses. The actual usage will be reported
+            // before the [DONE] message, but all earlier chunks contain a null usage field.
+            Json& include_usage = stream_options["include_usage"];
+            if (!include_usage.isNull()) {
+                if (!include_usage.isBool())
+                    return send_error(400, "include_usage field must be boolean");
+                params->stream_include_usage = include_usage.getBool();
+            }
+        }
     }
 
     // max_tokens: integer|null
@@ -570,6 +591,8 @@ Client::v1_chat_completions()
             return false;
         choice["delta"]["role"] = "assistant";
         choice["delta"]["content"] = "";
+        if (params->stream_include_usage)
+            response->json["usage"] = nullptr;
     }
 
     // prefill time
@@ -661,6 +684,12 @@ Client::v1_chat_completions()
     if (params->stream) {
         choice["delta"]["content"] = "";
         response->json["created"] = timespec_real().tv_sec;
+        if (params->stream_include_usage) {
+            Json& usage = response->json["usage"];
+            usage["prompt_tokens"] = prompt_tokens;
+            usage["completion_tokens"] = completion_tokens;
+            usage["total_tokens"] = completion_tokens + prompt_tokens;
+        }
         response->content = make_event(response->json);
         choice.getObject().erase("delta");
         if (!send_response_chunk(response->content))
diff --git a/llamafile/server/v1_completions.cpp b/llamafile/server/v1_completions.cpp
index 5e11042000..b07fb8c8f9 100644
--- a/llamafile/server/v1_completions.cpp
+++ b/llamafile/server/v1_completions.cpp
@@ -46,6 +46,7 @@ struct V1CompletionParams
 {
     bool echo = false;
     bool stream = false;
+    bool stream_include_usage = false;
     long max_tokens = -1;
     long seed = _rand64();
     double top_p = 1;
@@ -248,6 +249,26 @@ Client::get_v1_completions_params(V1CompletionParams* params)
         if (!stream.isBool())
             return send_error(400, "stream field must be boolean");
         params->stream = stream.getBool();
+
+        // stream_options: object|null
+        //
+        // Options for the streaming response.
+        Json& stream_options = json["stream_options"];
+        if (!stream_options.isNull()) {
+            if (!stream_options.isObject())
+                return send_error(400, "stream_options field must be object");
+
+            // include_usage: bool|null
+            //
+            // Also include usage for streaming responses. The actual usage will be reported
+            // before the [DONE] message, but all earlier chunks contain a null usage field.
+            Json& include_usage = stream_options["include_usage"];
+            if (!include_usage.isNull()) {
+                if (!include_usage.isBool())
+                    return send_error(400, "include_usage field must be boolean");
+                params->stream_include_usage = include_usage.getBool();
+            }
+        }
     }
 
     // max_tokens: integer|null
@@ -441,6 +462,8 @@ Client::v1_completions()
         choice["delta"]["role"] = "assistant";
         choice["delta"]["content"] = "";
         response->json["created"] = timespec_real().tv_sec;
+        if (params->stream_include_usage)
+            response->json["usage"] = nullptr;
         response->content = make_event(response->json);
         choice.getObject().erase("delta");
         if (!send_response_chunk(response->content))
@@ -494,6 +517,12 @@ Client::v1_completions()
     if (params->stream) {
         choice["text"] = "";
         response->json["created"] = timespec_real().tv_sec;
+        if (params->stream_include_usage) {
+            Json& usage = response->json["usage"];
+            usage["prompt_tokens"] = prompt_tokens;
+            usage["completion_tokens"] = completion_tokens;
+            usage["total_tokens"] = completion_tokens + prompt_tokens;
+        }
         response->content = make_event(response->json);
         if (!send_response_chunk(response->content))
             return false;

From 82b9831919515a9072dc7cc0f3f4ee46fa40caf8 Mon Sep 17 00:00:00 2001
From: corebonts
Date: Fri, 14 Mar 2025 06:15:35 +0100
Subject: [PATCH 2/3] Add /v1/models endpoint

This is usually used by OpenAI clients, like OpenWebUI, for discovery
and health checks.
---
 llamafile/server/client.cpp    |  2 ++
 llamafile/server/client.h      |  2 ++
 llamafile/server/v1_models.cpp | 49 ++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)
 create mode 100644 llamafile/server/v1_models.cpp

diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
index 2335997e3d..e142a5a219 100644
--- a/llamafile/server/client.cpp
+++ b/llamafile/server/client.cpp
@@ -699,6 +699,8 @@ Client::dispatcher()
         return v1_completions();
     if (p1 == "v1/chat/completions")
         return v1_chat_completions();
+    if (p1 == "v1/models")
+        return v1_models();
     if (p1 == "slotz")
         return slotz();
     if (p1 == "flagz")
diff --git a/llamafile/server/client.h b/llamafile/server/client.h
index 3b5e69cf2c..b9e00da41b 100644
--- a/llamafile/server/client.h
+++ b/llamafile/server/client.h
@@ -117,6 +117,8 @@ struct Client
     bool v1_chat_completions() __wur;
     bool get_v1_chat_completions_params(V1ChatCompletionParams*) __wur;
 
+    bool v1_models() __wur;
+
     bool slotz() __wur;
     bool flagz() __wur;
     bool db_chat(int64_t) __wur;
diff --git a/llamafile/server/v1_models.cpp b/llamafile/server/v1_models.cpp
new file mode 100644
index 0000000000..14972f0d2e
--- /dev/null
+++ b/llamafile/server/v1_models.cpp
@@ -0,0 +1,49 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "client.h"
+#include "llama.cpp/llama.h"
+#include "llamafile/json.h"
+#include "llamafile/llamafile.h"
+#include "llamafile/string.h"
+#include <time.h>
+
+using jt::Json;
+
+namespace lf {
+namespace server {
+
+// Used as the reported model creation time.
+static const time_t model_creation_time = time(0);
+
+bool
+Client::v1_models()
+{
+    jt::Json json;
+    json["object"] = "list";
+    Json& model = json["data"][0];
+    model["id"] = stripext(basename(FLAG_model));
+    model["object"] = "model";
+    model["created"] = model_creation_time;
+    model["owned_by"] = "llamafile";
+    char* p = append_http_response_message(obuf_.p, 200);
+    p = stpcpy(p, "Content-Type: application/json\r\n");
+    return send_response(obuf_.p, p, json.toString());
+}
+
+} // namespace server
+} // namespace lf
\ No newline at end of file

From 01cc848a23154a603431b884a8f67d433d574eb4 Mon Sep 17 00:00:00 2001
From: corebonts
Date: Fri, 14 Mar 2025 06:16:39 +0100
Subject: [PATCH 3/3] Add missing documentation for OpenAI compatible endpoints

---
 llamafile/server/doc/endpoints.md      |  10 +-
 llamafile/server/doc/v1_completions.md | 122 +++++++++++++++++++++++++
 2 files changed, 129 insertions(+), 3 deletions(-)
 create mode 100644 llamafile/server/doc/v1_completions.md

diff --git a/llamafile/server/doc/endpoints.md b/llamafile/server/doc/endpoints.md
index e01cd1df79..06ec513f93 100644
--- a/llamafile/server/doc/endpoints.md
+++ b/llamafile/server/doc/endpoints.md
@@ -1,5 +1,9 @@
 # LLaMAfiler Endpoints Reference
 
-- [`/tokenize`](tokenize.md)
-- [`/embedding`](embedding.md)
-- [`/v1/chat/completions`](v1_chat_completions.md)
+- The [`/v1/tokenize`](tokenize.md) endpoint provides a robust interface for
+converting text prompts into tokens.
+- The [`/v1/embedding`](embedding.md) endpoint provides a way to
+transform textual prompts into numerical representations.
+- The [`/v1/chat/completions`](v1_chat_completions.md) endpoint lets you build a chatbot.
+- The [`/v1/completions`](v1_completions.md) endpoint returns a predicted completion for a given prompt.
+- The `/v1/models` endpoint returns basic model info, which is usually used by OpenAI clients for discovery and health checks.
diff --git a/llamafile/server/doc/v1_completions.md b/llamafile/server/doc/v1_completions.md
new file mode 100644
index 0000000000..d62ba32b85
--- /dev/null
+++ b/llamafile/server/doc/v1_completions.md
@@ -0,0 +1,122 @@
+# LLaMAfiler Completions Endpoint
+
+The `/v1/completions` endpoint generates text completions based on a
+given prompt. It provides a flexible interface for text generation,
+allowing customization of parameters such as temperature, top-p
+sampling, and maximum tokens.
+
+This endpoint supports the following features:
+
+1. Deterministic outputs using a fixed seed
+2. Streaming responses for real-time token generation
+3. Configurable stopping criteria for token generation
+
+## Request URIs
+
+- `/v1/completions` (OpenAI API compatible)
+
+## Request Methods
+
+- `POST`
+
+## Request Content Types
+
+- `application/json` must be used.
+
+## Request Parameters
+
+- `model`: `string`
+
+  Specifies name of model to run.
+
+  Only a single model is currently supported, so this field is simply
+  copied along to the response. In the future, this will matter.
+
+  This field is required in the request.
+
+- `prompt`: `string`
+
+  The input text that the model will generate a completion for.
+
+  This field is required.
+
+- `stream`: `boolean|null`
+
+  If this field is optionally set to true, then this endpoint will
+  return a text/event-stream using HTTP chunked transfer encoding. This
+  allows your chatbot to rapidly show text as it's being generated. The
+  standard JSON response is slightly modified so that its message field
+  will be named delta instead. It's assumed the client will reconstruct
+  the full conversation.
+
+- `stream_options`: `object|null`
+
+  Options for streaming the API response. This parameter is only
+  applicable when `stream: true` is also specified. Default is `null`.
+
+  - `include_usage`: `boolean|null`
+
+    Whether to include usage statistics in the streaming response. Default is `false`.
+
+    If set to `true`, a `usage` field with the usage information will be
+    included in an additional empty chunk. Note that all other chunks will
+    also contain this field, but with a `null` value.
+
+- `max_tokens`: `integer|null`
+
+  Specifies an upper bound for the number of tokens that can be
+  generated for this completion. This can be used to control compute
+  and/or latency costs.
+
+- `top_p`: `number|null`
+
+  May optionally be used to set the `top_p` sampling parameter. This
+  should be a floating point number. Setting this to 1.0 (the default)
+  will disable this feature. Setting this to, for example, 0.1, would
+  mean that only the top 10% probability tokens are considered.
+
+  We generally recommend altering this or temperature but not both.
+
+- `temperature`: `number|null`
+
+  Configures the randomness level of generated text.
+
+  This field may be set to a value between 0.0 and 2.0 inclusive. It
+  defaults to 1.0. Lower numbers are more deterministic. Higher numbers
+  mean more randomness.
+
+  We generally recommend altering this or top_p but not both.
+
+- `seed`: `integer|null`
+
+  If specified, llamafiler will make its best effort to sample
+  deterministically, even when temperature is non-zero. This means that
+  repeated requests with the same seed and parameters should return the
+  same result.
+
+- `presence_penalty`: `number|null`
+
+  Number between -2.0 and 2.0. Positive values penalize new tokens based
+  on whether they appear in the text so far, increasing the model's
+  likelihood to talk about new topics.
+
+- `frequency_penalty`: `number|null`
+
+  Number between -2.0 and 2.0. Positive values penalize new tokens based
+  on their existing frequency in the text so far, decreasing the model's
+  likelihood to repeat the same line verbatim.
+
+- `user`: `string|null`
+
+  A unique identifier representing your end-user, which can help
+  llamafiler to monitor and detect abuse.
+
+- `stop`: `string|array|null`
+
+  Specifies up to 4 stop sequences where the API will cease text generation.
+
+## See Also
+
+- [LLaMAfiler Documentation Index](index.md)
+- [LLaMAfiler Endpoints Reference](endpoints.md)
+- [LLaMAfiler Technical Details](technical_details.md)
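
For reference, the sketch below shows one way a client might exercise the streaming behavior these patches document: it POSTs to `/v1/completions` with `stream` and `stream_options.include_usage` enabled, prints the streamed text, and reads the final `usage` chunk that arrives before `[DONE]`. The base URL, port, model name, and the `data: ` SSE framing are assumptions for illustration, not something the patches themselves guarantee.

```python
# Hypothetical client sketch for the stream_options.include_usage behavior
# described above. Assumes a llamafile server listening on localhost:8080 and
# the usual OpenAI-style "data: ..." / "data: [DONE]" server-sent-event framing.
import json

import requests  # third-party; pip install requests

payload = {
    "model": "any-loaded-model",  # currently just copied back into the response
    "prompt": "Once upon a time",
    "max_tokens": 32,
    "stream": True,
    "stream_options": {"include_usage": True},
}

with requests.post(
    "http://localhost:8080/v1/completions", json=payload, stream=True
) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines():
        if not raw or not raw.startswith(b"data: "):
            continue  # skip blank separators between events
        data = raw[len(b"data: "):]
        if data == b"[DONE]":
            break
        chunk = json.loads(data)
        # Ordinary chunks carry text and a null "usage"; the additional final
        # chunk carries the real prompt/completion/total token counts.
        usage = chunk.get("usage")
        if usage:
            print("\nusage:", usage)
        else:
            for choice in chunk.get("choices", []):
                print(choice.get("text", ""), end="", flush=True)
```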