diff --git a/.air.toml b/.air.toml
index 123d7272c2a4..1e8442249dc8 100644
--- a/.air.toml
+++ b/.air.toml
@@ -2,6 +2,7 @@
 [build]
 cmd = "make build"
 bin = "./local-ai"
+args_bin = [ "--debug" ]
 include_ext = ["go", "html", "yaml", "toml", "json", "txt", "md"]
 exclude_dir = ["pkg/grpc/proto"]
 delay = 1000
diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go
index 4ec9613711c2..06647ea57891 100644
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -15,6 +15,7 @@ import (
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
+	"github.com/valyala/fasthttp"
 
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
@@ -167,11 +168,12 @@ func (re *RequestExtractor) SetOpenAIRequest(ctx *fiber.Ctx) error {
 	c1, cancel := context.WithCancel(re.applicationConfig.Context)
 	// Monitor the Fiber context and cancel our context when it's canceled
 	// This ensures we respect request cancellation without causing panics
-	go func() {
-		<-ctx.Context().Done()
-		// Fiber context was canceled (request completed or client disconnected)
-		cancel()
-	}()
+	go func(fiberCtx *fasthttp.RequestCtx) {
+		if fiberCtx != nil {
+			<-fiberCtx.Done()
+			cancel()
+		}
+	}(ctx.Context())
 
 	// Add the correlation ID to the new context
 	ctxWithCorrelationID := context.WithValue(c1, CorrelationIDKey, correlationID)
diff --git a/core/http/routes/ui.go b/core/http/routes/ui.go
index c781bd88b021..8d8b2cebaf7f 100644
--- a/core/http/routes/ui.go
+++ b/core/http/routes/ui.go
@@ -91,11 +91,15 @@ func RegisterUIRoutes(app *fiber.App,
 	}
 
 	title := "LocalAI - Chat"
 
+	var modelContextSize *int
 	for _, b := range modelConfigs {
 		if b.HasUsecases(config.FLAG_CHAT) {
 			modelThatCanBeUsed = b.Name
 			title = "LocalAI - Chat with " + modelThatCanBeUsed
+			if b.LLMConfig.ContextSize != nil {
+				modelContextSize = b.LLMConfig.ContextSize
+			}
 			break
 		}
 	}
@@ -107,6 +111,7 @@ func RegisterUIRoutes(app *fiber.App,
 		"GalleryConfig": galleryConfigs,
 		"ModelsConfig":  modelConfigs,
 		"Model":         modelThatCanBeUsed,
+		"ContextSize":   modelContextSize,
 		"Version":       internal.PrintableVersion(),
 	}
 
@@ -120,6 +125,8 @@ func RegisterUIRoutes(app *fiber.App,
 		modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
 
 		galleryConfigs := map[string]*gallery.ModelConfig{}
+		modelName := c.Params("model")
+		var modelContextSize *int
 
 		for _, m := range modelConfigs {
 			cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
@@ -127,15 +134,19 @@ func RegisterUIRoutes(app *fiber.App,
 				continue
 			}
 			galleryConfigs[m.Name] = cfg
+			if m.Name == modelName && m.LLMConfig.ContextSize != nil {
+				modelContextSize = m.LLMConfig.ContextSize
+			}
 		}
 
 		summary := fiber.Map{
-			"Title":               "LocalAI - Chat with " + c.Params("model"),
+			"Title":               "LocalAI - Chat with " + modelName,
 			"BaseURL":             utils.BaseURL(c),
 			"ModelsConfig":        modelConfigs,
 			"GalleryConfig":       galleryConfigs,
 			"ModelsWithoutConfig": modelsWithoutConfig,
-			"Model":               c.Params("model"),
+			"Model":               modelName,
+			"ContextSize":         modelContextSize,
 			"Version":             internal.PrintableVersion(),
 		}
 
diff --git a/core/http/static/chat.js b/core/http/static/chat.js
index 4517b0baf5bf..993c956ac91a 100644
--- a/core/http/static/chat.js
+++ b/core/http/static/chat.js
@@ -30,21 +30,63 @@ SOFTWARE.
 // Global variable to store the current AbortController
 let currentAbortController = null;
 let currentReader = null;
+let requestStartTime = null;
+let tokensReceived = 0;
+let tokensPerSecondInterval = null;
+let lastTokensPerSecond = null; // Store the last calculated rate
 
 function toggleLoader(show) {
   const sendButton = document.getElementById('send-button');
   const stopButton = document.getElementById('stop-button');
+  const headerLoadingIndicator = document.getElementById('header-loading-indicator');
+  const tokensPerSecondDisplay = document.getElementById('tokens-per-second');
 
   if (show) {
     sendButton.style.display = 'none';
     stopButton.style.display = 'block';
-    document.getElementById("input").disabled = true;
+    if (headerLoadingIndicator) headerLoadingIndicator.style.display = 'block';
+    // Reset token tracking
+    requestStartTime = Date.now();
+    tokensReceived = 0;
+
+    // Start updating tokens/second display
+    if (tokensPerSecondDisplay) {
+      tokensPerSecondDisplay.textContent = '-';
+      updateTokensPerSecond();
+      tokensPerSecondInterval = setInterval(updateTokensPerSecond, 500); // Update every 500ms
+    }
   } else {
-    document.getElementById("input").disabled = false;
     sendButton.style.display = 'block';
     stopButton.style.display = 'none';
+    if (headerLoadingIndicator) headerLoadingIndicator.style.display = 'none';
+    // Stop updating but keep the last value visible
+    if (tokensPerSecondInterval) {
+      clearInterval(tokensPerSecondInterval);
+      tokensPerSecondInterval = null;
+    }
+    // Keep the last calculated rate visible
+    if (tokensPerSecondDisplay && lastTokensPerSecond !== null) {
+      tokensPerSecondDisplay.textContent = lastTokensPerSecond;
+    }
     currentAbortController = null;
     currentReader = null;
+    requestStartTime = null;
+    tokensReceived = 0;
   }
 }
 
+function updateTokensPerSecond() {
+  const tokensPerSecondDisplay = document.getElementById('tokens-per-second');
+  if (!tokensPerSecondDisplay || !requestStartTime) return;
+
+  const elapsedSeconds = (Date.now() - requestStartTime) / 1000;
+  if (elapsedSeconds > 0 && tokensReceived > 0) {
+    const rate = tokensReceived / elapsedSeconds;
+    const formattedRate = `${rate.toFixed(1)} tokens/s`;
+    tokensPerSecondDisplay.textContent = formattedRate;
+    lastTokensPerSecond = formattedRate; // Store the last calculated rate
+  } else if (elapsedSeconds > 0) {
+    tokensPerSecondDisplay.textContent = '-';
+  }
+}
+
@@ -171,9 +213,30 @@ function readInputFile() {
 function submitPrompt(event) {
   event.preventDefault();
+
+  const input = document.getElementById("input");
+  if (!input) return;
+
+  const inputValue = input.value;
+  if (!inputValue.trim()) return; // Don't send empty messages
+
+  // If already processing, abort the current request and send the new one
+  if (currentAbortController || currentReader) {
+    // Abort current request
+    stopRequest();
+    // Small delay to ensure cleanup completes
+    setTimeout(() => {
+      // Continue with new request
+      processAndSendMessage(inputValue);
+    }, 100);
+    return;
+  }
+
+  processAndSendMessage(inputValue);
+}
 
-  const input = document.getElementById("input").value;
-  let fullInput = input;
+function processAndSendMessage(inputValue) {
+  let fullInput = inputValue;
 
   // If there are file contents, append them to the input for the LLM
   if (fileContents.length > 0) {
@@ -184,7 +247,7 @@ function submitPrompt(event) {
   }
 
   // Show file icons in chat if there are files
-  let displayContent = input;
+  let displayContent = inputValue;
   if (currentFileNames.length > 0) {
     displayContent += "\n\n";
     currentFileNames.forEach(fileName => {
@@ -201,9 +264,15 @@ function submitPrompt(event) {
     history[history.length - 1].content = fullInput;
   }
 
-  document.getElementById("input").value = "";
+  const input = document.getElementById("input");
+  if (input) input.value = "";
   const systemPrompt = localStorage.getItem("system_prompt");
   Alpine.nextTick(() => { document.getElementById('messages').scrollIntoView(false); });
+
+  // Reset token tracking before starting new request
+  requestStartTime = Date.now();
+  tokensReceived = 0;
+
   promptGPT(systemPrompt, fullInput);
 
   // Reset file contents and names after sending
@@ -242,6 +311,12 @@ function readInputAudio() {
 async function promptGPT(systemPrompt, input) {
   const model = document.getElementById("chat-model").value;
   const mcpMode = Alpine.store("chat").mcpMode;
+
+  // Reset current request usage tracking for new request
+  if (Alpine.store("chat")) {
+    Alpine.store("chat").tokenUsage.currentRequest = null;
+  }
+
   toggleLoader(true);
 
   messages = Alpine.store("chat").messages();
@@ -373,10 +448,35 @@ async function promptGPT(systemPrompt, input) {
     // Handle MCP non-streaming response
     try {
       const data = await response.json();
-      // MCP endpoint returns content in choices[0].text, not choices[0].message.content
-      const content = data.choices[0]?.text || "";
+
+      // Update token usage if present
+      if (data.usage) {
+        Alpine.store("chat").updateTokenUsage(data.usage);
+      }
+
+      // MCP endpoint returns content in choices[0].message.content (chat completion format)
+      // Fallback to choices[0].text for backward compatibility (completion format)
+      const content = data.choices[0]?.message?.content || data.choices[0]?.text || "";
+
+      if (!content && (!data.choices || data.choices.length === 0)) {
+        Alpine.store("chat").add(
+          "assistant",
+          `Error: Empty response from MCP endpoint`,
+        );
+        toggleLoader(false);
+        return;
+      }
 
       if (content) {
+        // Count tokens for rate calculation (MCP mode - full content at once)
+        // Prefer actual token count from API if available
+        if (data.usage && data.usage.completion_tokens) {
+          tokensReceived = data.usage.completion_tokens;
+        } else {
+          tokensReceived += Math.ceil(content.length / 4);
+        }
+        updateTokensPerSecond();
+
         // Process thinking tags using shared function
         const { regularContent, thinkingContent } = processThinkingTags(content);
 
@@ -426,6 +526,9 @@ async function promptGPT(systemPrompt, input) {
   const addToChat = (token) => {
     const chatStore = Alpine.store("chat");
     chatStore.add("assistant", token);
+    // Count tokens for rate calculation (rough estimate: count characters/4)
+    tokensReceived += Math.ceil(token.length / 4);
+    updateTokensPerSecond();
     // Efficiently scroll into view without triggering multiple reflows
     // const messages = document.getElementById('messages');
     // messages.scrollTop = messages.scrollHeight;
@@ -456,6 +559,12 @@ async function promptGPT(systemPrompt, input) {
         if (line.startsWith("data: ")) {
           try {
             const jsonData = JSON.parse(line.substring(6));
+
+            // Update token usage if present
+            if (jsonData.usage) {
+              Alpine.store("chat").updateTokenUsage(jsonData.usage);
+            }
+
             const token = jsonData.choices[0].delta.content;
 
             if (token) {
@@ -480,6 +589,9 @@ async function promptGPT(systemPrompt, input) {
             // Handle content based on thinking state
             if (isThinking) {
               thinkingContent += token;
+              // Count tokens for rate calculation
+              tokensReceived += Math.ceil(token.length / 4);
+              updateTokensPerSecond();
               // Update the last thinking message or create a new one
               if (lastThinkingMessageIndex === -1) {
                 // Create new thinking message
@@ -568,14 +680,71 @@ marked.setOptions({
   },
 });
 
+// Alpine store is now initialized in chat.html inline script to ensure it's available before Alpine processes the DOM
+// Only initialize if not already initialized (to avoid duplicate initialization)
 document.addEventListener("alpine:init", () => {
-  Alpine.store("chat", {
+  // Check if store already exists (initialized in chat.html)
+  if (!Alpine.store("chat")) {
+    // Fallback initialization (should not be needed if chat.html loads correctly)
+    Alpine.store("chat", {
     history: [],
     languages: [undefined],
     systemPrompt: "",
     mcpMode: false,
+    contextSize: null,
+    tokenUsage: {
+      promptTokens: 0,
+      completionTokens: 0,
+      totalTokens: 0,
+      currentRequest: null
+    },
     clear() {
       this.history.length = 0;
+      this.tokenUsage = {
+        promptTokens: 0,
+        completionTokens: 0,
+        totalTokens: 0,
+        currentRequest: null
+      };
+    },
+    updateTokenUsage(usage) {
+      // Usage values in streaming responses are cumulative totals for the current request
+      // We track session totals separately and only update when we see new (higher) values
+      if (usage) {
+        const currentRequest = this.tokenUsage.currentRequest || {
+          promptTokens: 0,
+          completionTokens: 0,
+          totalTokens: 0
+        };
+
+        // Check if this is a new/updated usage (values increased)
+        const isNewUsage =
+          (usage.prompt_tokens !== undefined && usage.prompt_tokens > currentRequest.promptTokens) ||
+          (usage.completion_tokens !== undefined && usage.completion_tokens > currentRequest.completionTokens) ||
+          (usage.total_tokens !== undefined && usage.total_tokens > currentRequest.totalTokens);
+
+        if (isNewUsage) {
+          // Update session totals: subtract old request usage, add new
+          this.tokenUsage.promptTokens = this.tokenUsage.promptTokens - currentRequest.promptTokens + (usage.prompt_tokens || 0);
+          this.tokenUsage.completionTokens = this.tokenUsage.completionTokens - currentRequest.completionTokens + (usage.completion_tokens || 0);
+          this.tokenUsage.totalTokens = this.tokenUsage.totalTokens - currentRequest.totalTokens + (usage.total_tokens || 0);
+
+          // Store current request usage
+          this.tokenUsage.currentRequest = {
+            promptTokens: usage.prompt_tokens || 0,
+            completionTokens: usage.completion_tokens || 0,
+            totalTokens: usage.total_tokens || 0
+          };
+        }
+      }
+    },
+    getRemainingTokens() {
+      if (!this.contextSize) return null;
+      return Math.max(0, this.contextSize - this.tokenUsage.totalTokens);
+    },
+    getContextUsagePercent() {
+      if (!this.contextSize) return null;
+      return Math.min(100, (this.tokenUsage.totalTokens / this.contextSize) * 100);
    },
    add(role, content, image, audio) {
      const N = this.history.length - 1;
@@ -640,5 +809,6 @@ document.addEventListener("alpine:init", () => {
         audio: message.audio,
       }));
     },
-  });
+    });
+  }
 });
diff --git a/core/http/views/backends.html b/core/http/views/backends.html
index 754108bdf7cd..8e807b2151ff 100644
--- a/core/http/views/backends.html
+++ b/core/http/views/backends.html
@@ -305,7 +305,7 @@
 Tags
@@ -439,6 +439,42 @@
-Non-streaming Mode Active
-Responses will be processed in full before display. This may take significantly longer (up to 5 minutes), especially on CPU-only systems.
+Non-streaming Mode Active
+Responses will be processed in full before display. This may take significantly longer (up to 5 minutes), especially on CPU-only systems.
+
 Start chatting with the AI by typing a prompt in the input field below and pressing Enter.
-