From 97167eb633955bc8bd9daf36f67b76c1968dab3f Mon Sep 17 00:00:00 2001
From: corebonts
Date: Fri, 14 Mar 2025 06:14:50 +0100
Subject: [PATCH 1/3] Add stream_options parameter for OpenAI compatible endpoints

---
 llamafile/server/doc/v1_chat_completions.md | 13 +++++++++
 llamafile/server/v1_chat_completions.cpp    | 29 +++++++++++++++++++++
 llamafile/server/v1_completions.cpp         | 29 +++++++++++++++++++++
 3 files changed, 71 insertions(+)

diff --git a/llamafile/server/doc/v1_chat_completions.md b/llamafile/server/doc/v1_chat_completions.md
index 4eda4aa77a..14d8df93d1 100644
--- a/llamafile/server/doc/v1_chat_completions.md
+++ b/llamafile/server/doc/v1_chat_completions.md
@@ -78,6 +78,19 @@ This endpoint supports the following features:
   will be named delta instead. It's assumed the client will reconstruct
   the full conversation.
 
+- `stream_options`: `object|null`
+
+  Options for streaming the API response. This parameter is only
+  applicable when `stream: true` is also specified. Default is `null`.
+
+  - `include_usage`: `boolean|null`
+
+    Whether to include usage statistics in the streaming response. Default is `false`.
+
+    If set to `true`, a `usage` field with the usage information will be
+    included in an additional empty chunk. Note that all other chunks will
+    also contain this field, but with a `null` value.
+
 - `max_tokens`: `integer|null`
 
   Specifies an upper bound for the number of tokens that can be
diff --git a/llamafile/server/v1_chat_completions.cpp b/llamafile/server/v1_chat_completions.cpp
index 659815ee62..b362341fea 100644
--- a/llamafile/server/v1_chat_completions.cpp
+++ b/llamafile/server/v1_chat_completions.cpp
@@ -46,6 +46,7 @@ namespace server {
 struct V1ChatCompletionParams
 {
     bool stream = false;
+    bool stream_include_usage = false;
     long max_tokens = -1;
     long seed = _rand64();
     double top_p = 1;
@@ -276,6 +277,26 @@ Client::get_v1_chat_completions_params(V1ChatCompletionParams* params)
         if (!stream.isBool())
             return send_error(400, "stream field must be boolean");
         params->stream = stream.getBool();
+
+        // stream_options: object|null
+        //
+        // Options for the streaming response.
+        Json& stream_options = json["stream_options"];
+        if (!stream_options.isNull()) {
+            if (!stream_options.isObject())
+                return send_error(400, "stream_options field must be object");
+
+            // include_usage: bool|null
+            //
+            // Also include usage for streaming responses. The actual usage will be reported
+            // before the [DONE] message, but all earlier chunks contain a null usage field.
+            Json& include_usage = stream_options["include_usage"];
+            if (!include_usage.isNull()) {
+                if (!include_usage.isBool())
+                    return send_error(400, "include_usage field must be boolean");
+                params->stream_include_usage = include_usage.getBool();
+            }
+        }
     }
 
     // max_tokens: integer|null
@@ -570,6 +591,8 @@ Client::v1_chat_completions()
             return false;
         choice["delta"]["role"] = "assistant";
         choice["delta"]["content"] = "";
+        if (params->stream_include_usage)
+            response->json["usage"] = nullptr;
     }
 
     // prefill time
@@ -661,6 +684,12 @@ Client::v1_chat_completions()
     if (params->stream) {
         choice["delta"]["content"] = "";
         response->json["created"] = timespec_real().tv_sec;
+        if (params->stream_include_usage) {
+            Json& usage = response->json["usage"];
+            usage["prompt_tokens"] = prompt_tokens;
+            usage["completion_tokens"] = completion_tokens;
+            usage["total_tokens"] = completion_tokens + prompt_tokens;
+        }
         response->content = make_event(response->json);
         choice.getObject().erase("delta");
         if (!send_response_chunk(response->content))
diff --git a/llamafile/server/v1_completions.cpp b/llamafile/server/v1_completions.cpp
index 5e11042000..b07fb8c8f9 100644
--- a/llamafile/server/v1_completions.cpp
+++ b/llamafile/server/v1_completions.cpp
@@ -46,6 +46,7 @@ struct V1CompletionParams
 {
     bool echo = false;
     bool stream = false;
+    bool stream_include_usage = false;
     long max_tokens = -1;
     long seed = _rand64();
     double top_p = 1;
@@ -248,6 +249,26 @@ Client::get_v1_completions_params(V1CompletionParams* params)
         if (!stream.isBool())
             return send_error(400, "stream field must be boolean");
         params->stream = stream.getBool();
+
+        // stream_options: object|null
+        //
+        // Options for the streaming response.
+        Json& stream_options = json["stream_options"];
+        if (!stream_options.isNull()) {
+            if (!stream_options.isObject())
+                return send_error(400, "stream_options field must be object");
+
+            // include_usage: bool|null
+            //
+            // Also include usage for streaming responses. The actual usage will be reported
+            // before the [DONE] message, but all earlier chunks contain a null usage field.
+            Json& include_usage = stream_options["include_usage"];
+            if (!include_usage.isNull()) {
+                if (!include_usage.isBool())
+                    return send_error(400, "include_usage field must be boolean");
+                params->stream_include_usage = include_usage.getBool();
+            }
+        }
     }
 
     // max_tokens: integer|null
@@ -441,6 +462,8 @@ Client::v1_completions()
         choice["delta"]["role"] = "assistant";
         choice["delta"]["content"] = "";
         response->json["created"] = timespec_real().tv_sec;
+        if (params->stream_include_usage)
+            response->json["usage"] = nullptr;
         response->content = make_event(response->json);
         choice.getObject().erase("delta");
         if (!send_response_chunk(response->content))
@@ -494,6 +517,12 @@ Client::v1_completions()
     if (params->stream) {
         choice["text"] = "";
         response->json["created"] = timespec_real().tv_sec;
+        if (params->stream_include_usage) {
+            Json& usage = response->json["usage"];
+            usage["prompt_tokens"] = prompt_tokens;
+            usage["completion_tokens"] = completion_tokens;
+            usage["total_tokens"] = completion_tokens + prompt_tokens;
+        }
         response->content = make_event(response->json);
         if (!send_response_chunk(response->content))
             return false;

From 82b9831919515a9072dc7cc0f3f4ee46fa40caf8 Mon Sep 17 00:00:00 2001
From: corebonts
Date: Fri, 14 Mar 2025 06:15:35 +0100
Subject: [PATCH 2/3] Add /v1/models endpoint

This is usually used by OpenAI clients, like OpenWebUI, for discovery
and health checks.
---
 llamafile/server/client.cpp    |  2 ++
 llamafile/server/client.h      |  2 ++
 llamafile/server/v1_models.cpp | 49 ++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)
 create mode 100644 llamafile/server/v1_models.cpp

diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
index 2335997e3d..e142a5a219 100644
--- a/llamafile/server/client.cpp
+++ b/llamafile/server/client.cpp
@@ -699,6 +699,8 @@ Client::dispatcher()
         return v1_completions();
     if (p1 == "v1/chat/completions")
         return v1_chat_completions();
+    if (p1 == "v1/models")
+        return v1_models();
     if (p1 == "slotz")
         return slotz();
     if (p1 == "flagz")
diff --git a/llamafile/server/client.h b/llamafile/server/client.h
index 3b5e69cf2c..b9e00da41b 100644
--- a/llamafile/server/client.h
+++ b/llamafile/server/client.h
@@ -117,6 +117,8 @@ struct Client
     bool v1_chat_completions() __wur;
     bool get_v1_chat_completions_params(V1ChatCompletionParams*) __wur;
 
+    bool v1_models() __wur;
+
     bool slotz() __wur;
     bool flagz() __wur;
     bool db_chat(int64_t) __wur;
diff --git a/llamafile/server/v1_models.cpp b/llamafile/server/v1_models.cpp
new file mode 100644
index 0000000000..14972f0d2e
--- /dev/null
+++ b/llamafile/server/v1_models.cpp
@@ -0,0 +1,49 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "client.h"
+#include "llama.cpp/llama.h"
+#include "llamafile/json.h"
+#include "llamafile/llamafile.h"
+#include "llamafile/string.h"
+#include <time.h>
+
+using jt::Json;
+
+namespace lf {
+namespace server {
+
+// Used as the reported model creation time.
+static const time_t model_creation_time = time(0);
+
+bool
+Client::v1_models()
+{
+    jt::Json json;
+    json["object"] = "list";
+    Json& model = json["data"][0];
+    model["id"] = stripext(basename(FLAG_model));
+    model["object"] = "model";
+    model["created"] = model_creation_time;
+    model["owned_by"] = "llamafile";
+    char* p = append_http_response_message(obuf_.p, 200);
+    p = stpcpy(p, "Content-Type: application/json\r\n");
+    return send_response(obuf_.p, p, json.toString());
+}
+
+} // namespace server
+} // namespace lf
\ No newline at end of file

From 01cc848a23154a603431b884a8f67d433d574eb4 Mon Sep 17 00:00:00 2001
From: corebonts
Date: Fri, 14 Mar 2025 06:16:39 +0100
Subject: [PATCH 3/3] Add missing documentation for OpenAI compatible endpoints

---
 llamafile/server/doc/endpoints.md      |  10 +-
 llamafile/server/doc/v1_completions.md | 122 +++++++++++++++++++++++++
 2 files changed, 129 insertions(+), 3 deletions(-)
 create mode 100644 llamafile/server/doc/v1_completions.md

diff --git a/llamafile/server/doc/endpoints.md b/llamafile/server/doc/endpoints.md
index e01cd1df79..06ec513f93 100644
--- a/llamafile/server/doc/endpoints.md
+++ b/llamafile/server/doc/endpoints.md
@@ -1,5 +1,9 @@
 # LLaMAfiler Endpoints Reference
 
-- [`/tokenize`](tokenize.md)
-- [`/embedding`](embedding.md)
-- [`/v1/chat/completions`](v1_chat_completions.md)
+- The [`/v1/tokenize`](tokenize.md) endpoint provides a robust interface for
+converting text prompts into tokens.
+- The [`/v1/embedding`](embedding.md) endpoint provides a way to
+transform textual prompts into numerical representations.
+- The [`/v1/chat/completions`](v1_chat_completions.md) endpoint lets you build a chatbot.
+- The [`/v1/completions`](v1_completions.md) endpoint returns a predicted completion for a given prompt.
+- The `/v1/models` endpoint returns basic model info, which is usually used by OpenAI clients for discovery and health checks.
diff --git a/llamafile/server/doc/v1_completions.md b/llamafile/server/doc/v1_completions.md
new file mode 100644
index 0000000000..d62ba32b85
--- /dev/null
+++ b/llamafile/server/doc/v1_completions.md
@@ -0,0 +1,122 @@
+# LLaMAfiler Completions Endpoint
+
+The `/v1/completions` endpoint generates text completions based on a
+given prompt. It provides a flexible interface for text generation,
+allowing customization of parameters such as temperature, top-p
+sampling, and maximum tokens.
+
+This endpoint supports the following features:
+
+1. Deterministic outputs using a fixed seed
+2. Streaming responses for real-time token generation
+3. Configurable stopping criteria for token generation
+
+## Request URIs
+
+- `/v1/completions` (OpenAI API compatible)
+
+## Request Methods
+
+- `POST`
+
+## Request Content Types
+
+- `application/json` must be used.
+
+## Request Parameters
+
+- `model`: `string`
+
+  Specifies name of model to run.
+
+  Only a single model is currently supported, so this field is simply
+  copied along to the response. In the future, this will matter.
+
+  This field is required in the request.
+
+- `prompt`: `string`
+
+  The input text that the model will generate a completion for.
+
+  This field is required.
+
+- `stream`: `boolean|null`
+
+  If this field is optionally set to true, then this endpoint will
+  return a text/event-stream using HTTP chunked transfer encoding. This
+  allows your chatbot to rapidly show text as it's being generated. The
+  standard JSON response is slightly modified so that its message field
+  will be named delta instead. It's assumed the client will reconstruct
+  the full conversation.
+
+- `stream_options`: `object|null`
+
+  Options for streaming the API response. This parameter is only
+  applicable when `stream: true` is also specified. Default is `null`.
+
+  - `include_usage`: `boolean|null`
+
+    Whether to include usage statistics in the streaming response. Default is `false`.
+
+    If set to `true`, a `usage` field with the usage information will be
+    included in an additional empty chunk. Note that all other chunks will
+    also contain this field, but with a `null` value.
+
+- `max_tokens`: `integer|null`
+
+  Specifies an upper bound for the number of tokens that can be
+  generated for this completion. This can be used to control compute
+  and/or latency costs.
+
+- `top_p`: `number|null`
+
+  May optionally be used to set the `top_p` sampling parameter. This
+  should be a floating point number. Setting this to 1.0 (the default)
+  will disable this feature. Setting this to, for example, 0.1, would
+  mean that only the top 10% probability tokens are considered.
+
+  We generally recommend altering this or temperature but not both.
+
+- `temperature`: `number|null`
+
+  Configures the randomness level of generated text.
+
+  This field may be set to a value between 0.0 and 2.0 inclusive. It
+  defaults to 1.0. Lower numbers are more deterministic. Higher numbers
+  mean more randomness.
+
+  We generally recommend altering this or top_p but not both.
+
+- `seed`: `integer|null`
+
+  If specified, llamafiler will make its best effort to sample
+  deterministically, even when temperature is non-zero. This means that
+  repeated requests with the same seed and parameters should return the
+  same result.
+
+- `presence_penalty`: `number|null`
+
+  Number between -2.0 and 2.0. Positive values penalize new tokens based
+  on whether they appear in the text so far, increasing the model's
+  likelihood to talk about new topics.
+
+- `frequency_penalty`: `number|null`
+
+  Number between -2.0 and 2.0. Positive values penalize new tokens based
+  on their existing frequency in the text so far, decreasing the model's
+  likelihood to repeat the same line verbatim.
+
+- `user`: `string|null`
+
+  A unique identifier representing your end-user, which can help
+  llamafiler to monitor and detect abuse.
+
+- `stop`: `string|array|null`
+
+  Specifies up to 4 stop sequences where the API will cease text generation.
+
+## See Also
+
+- [LLaMAfiler Documentation Index](index.md)
+- [LLaMAfiler Endpoints Reference](endpoints.md)
+- [LLaMAfiler Technical Details](technical_details.md)
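
For reference, the sketch below shows one way a client might exercise the streaming behavior these patches document: it POSTs to `/v1/completions` with `stream` and `stream_options.include_usage` enabled, prints the streamed text, and reads the final `usage` chunk that arrives before `[DONE]`. The base URL, port, model name, and the `data: ` SSE framing are assumptions for illustration, not something the patches themselves guarantee.

```python
# Hypothetical client sketch for the stream_options.include_usage behavior
# described above. Assumes a llamafile server listening on localhost:8080 and
# the usual OpenAI-style "data: ..." / "data: [DONE]" server-sent-event framing.
import json

import requests  # third-party; pip install requests

payload = {
    "model": "any-loaded-model",  # currently just copied back into the response
    "prompt": "Once upon a time",
    "max_tokens": 32,
    "stream": True,
    "stream_options": {"include_usage": True},
}

with requests.post(
    "http://localhost:8080/v1/completions", json=payload, stream=True
) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines():
        if not raw or not raw.startswith(b"data: "):
            continue  # skip blank separators between events
        data = raw[len(b"data: "):]
        if data == b"[DONE]":
            break
        chunk = json.loads(data)
        # Ordinary chunks carry text and a null "usage"; the additional final
        # chunk carries the real prompt/completion/total token counts.
        usage = chunk.get("usage")
        if usage:
            print("\nusage:", usage)
        else:
            for choice in chunk.get("choices", []):
                print(choice.get("text", ""), end="", flush=True)
```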