1 change: 1 addition & 0 deletions .air.toml
@@ -2,6 +2,7 @@
[build]
cmd = "make build"
bin = "./local-ai"
args_bin = [ "--debug" ]
include_ext = ["go", "html", "yaml", "toml", "json", "txt", "md"]
exclude_dir = ["pkg/grpc/proto"]
delay = 1000
12 changes: 7 additions & 5 deletions core/http/middleware/request.go
@@ -15,6 +15,7 @@ import (
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/valyala/fasthttp"

"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
@@ -167,11 +168,12 @@ func (re *RequestExtractor) SetOpenAIRequest(ctx *fiber.Ctx) error {
c1, cancel := context.WithCancel(re.applicationConfig.Context)
// Monitor the Fiber context and cancel our context when it's canceled
// This ensures we respect request cancellation without causing panics
go func() {
<-ctx.Context().Done()
// Fiber context was canceled (request completed or client disconnected)
cancel()
}()
go func(fiberCtx *fasthttp.RequestCtx) {
if fiberCtx != nil {
<-fiberCtx.Done()
cancel()
}
}(ctx.Context())
// Add the correlation ID to the new context
ctxWithCorrelationID := context.WithValue(c1, CorrelationIDKey, correlationID)

15 changes: 13 additions & 2 deletions core/http/routes/ui.go
@@ -91,11 +91,15 @@ func RegisterUIRoutes(app *fiber.App,
}

title := "LocalAI - Chat"
var modelContextSize *int

for _, b := range modelConfigs {
if b.HasUsecases(config.FLAG_CHAT) {
modelThatCanBeUsed = b.Name
title = "LocalAI - Chat with " + modelThatCanBeUsed
if b.LLMConfig.ContextSize != nil {
modelContextSize = b.LLMConfig.ContextSize
}
break
}
}
@@ -107,6 +111,7 @@
"GalleryConfig": galleryConfigs,
"ModelsConfig": modelConfigs,
"Model": modelThatCanBeUsed,
"ContextSize": modelContextSize,
"Version": internal.PrintableVersion(),
}

@@ -120,22 +125,28 @@
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)

galleryConfigs := map[string]*gallery.ModelConfig{}
modelName := c.Params("model")
var modelContextSize *int

for _, m := range modelConfigs {
cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
if err != nil {
continue
}
galleryConfigs[m.Name] = cfg
if m.Name == modelName && m.LLMConfig.ContextSize != nil {
modelContextSize = m.LLMConfig.ContextSize
}
}

summary := fiber.Map{
"Title": "LocalAI - Chat with " + c.Params("model"),
"Title": "LocalAI - Chat with " + modelName,
"BaseURL": utils.BaseURL(c),
"ModelsConfig": modelConfigs,
"GalleryConfig": galleryConfigs,
"ModelsWithoutConfig": modelsWithoutConfig,
"Model": c.Params("model"),
"Model": modelName,
"ContextSize": modelContextSize,
"Version": internal.PrintableVersion(),
}
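Note: the ContextSize value passed into these template maps is surfaced on the chat page, where the Alpine store (see the chat.js changes below) uses it to derive remaining-context figures. A minimal sketch of that arithmetic, with assumed example values:

```js
// Sketch (assumed values): remaining-context math mirroring
// getRemainingTokens()/getContextUsagePercent() further down.
const contextSize = 8192;   // e.g. ContextSize rendered into the page
const totalTokens = 3072;   // session-wide token usage so far
const remaining = Math.max(0, contextSize - totalTokens);           // 5120
const usagePct  = Math.min(100, (totalTokens / contextSize) * 100); // 37.5
```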

190 changes: 180 additions & 10 deletions core/http/static/chat.js
@@ -30,21 +30,63 @@ SOFTWARE.
// Global variable to store the current AbortController
let currentAbortController = null;
let currentReader = null;
let requestStartTime = null;
let tokensReceived = 0;
let tokensPerSecondInterval = null;
let lastTokensPerSecond = null; // Store the last calculated rate

function toggleLoader(show) {
const sendButton = document.getElementById('send-button');
const stopButton = document.getElementById('stop-button');
const headerLoadingIndicator = document.getElementById('header-loading-indicator');
const tokensPerSecondDisplay = document.getElementById('tokens-per-second');

if (show) {
sendButton.style.display = 'none';
stopButton.style.display = 'block';
document.getElementById("input").disabled = true;
if (headerLoadingIndicator) headerLoadingIndicator.style.display = 'block';
// Reset token tracking
requestStartTime = Date.now();
tokensReceived = 0;

// Start updating tokens/second display
if (tokensPerSecondDisplay) {
tokensPerSecondDisplay.textContent = '-';
updateTokensPerSecond();
tokensPerSecondInterval = setInterval(updateTokensPerSecond, 500); // Update every 500ms
}
} else {
document.getElementById("input").disabled = false;
sendButton.style.display = 'block';
stopButton.style.display = 'none';
if (headerLoadingIndicator) headerLoadingIndicator.style.display = 'none';
// Stop updating but keep the last value visible
if (tokensPerSecondInterval) {
clearInterval(tokensPerSecondInterval);
tokensPerSecondInterval = null;
}
// Keep the last calculated rate visible
if (tokensPerSecondDisplay && lastTokensPerSecond !== null) {
tokensPerSecondDisplay.textContent = lastTokensPerSecond;
}
currentAbortController = null;
currentReader = null;
requestStartTime = null;
tokensReceived = 0;
}
}

function updateTokensPerSecond() {
const tokensPerSecondDisplay = document.getElementById('tokens-per-second');
if (!tokensPerSecondDisplay || !requestStartTime) return;

const elapsedSeconds = (Date.now() - requestStartTime) / 1000;
if (elapsedSeconds > 0 && tokensReceived > 0) {
const rate = tokensReceived / elapsedSeconds;
const formattedRate = `${rate.toFixed(1)} tokens/s`;
tokensPerSecondDisplay.textContent = formattedRate;
lastTokensPerSecond = formattedRate; // Store the last calculated rate
} else if (elapsedSeconds > 0) {
tokensPerSecondDisplay.textContent = '-';
}
}
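Note: streaming deltas carry no per-chunk token counts, so the rate display falls back to a characters-divided-by-four estimate. A standalone sketch of that heuristic (the ~4 chars/token constant is a rough rule of thumb for English text, not a tokenizer guarantee):

```js
// Sketch: approximate tokens/s from streamed character count.
function estimateRate(charsStreamed, startMs, nowMs) {
  const tokens = Math.ceil(charsStreamed / 4); // ~4 chars per token
  const seconds = (nowMs - startMs) / 1000;
  return seconds > 0 ? `${(tokens / seconds).toFixed(1)} tokens/s` : '-';
}

console.log(estimateRate(2000, 0, 5000)); // "100.0 tokens/s"
```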

@@ -171,9 +213,30 @@ function readInputFile() {

function submitPrompt(event) {
event.preventDefault();

const input = document.getElementById("input");
if (!input) return;

const inputValue = input.value;
if (!inputValue.trim()) return; // Don't send empty messages

// If already processing, abort the current request and send the new one
if (currentAbortController || currentReader) {
// Abort current request
stopRequest();
// Small delay to ensure cleanup completes
setTimeout(() => {
// Continue with new request
processAndSendMessage(inputValue);
}, 100);
return;
}

processAndSendMessage(inputValue);
}

const input = document.getElementById("input").value;
let fullInput = input;
function processAndSendMessage(inputValue) {
let fullInput = inputValue;

// If there are file contents, append them to the input for the LLM
if (fileContents.length > 0) {
@@ -184,7 +247,7 @@
}

// Show file icons in chat if there are files
let displayContent = input;
let displayContent = inputValue;
if (currentFileNames.length > 0) {
displayContent += "\n\n";
currentFileNames.forEach(fileName => {
@@ -201,9 +264,15 @@
history[history.length - 1].content = fullInput;
}

document.getElementById("input").value = "";
const input = document.getElementById("input");
if (input) input.value = "";
const systemPrompt = localStorage.getItem("system_prompt");
Alpine.nextTick(() => { document.getElementById('messages').scrollIntoView(false); });

// Reset token tracking before starting new request
requestStartTime = Date.now();
tokensReceived = 0;

promptGPT(systemPrompt, fullInput);

// Reset file contents and names after sending
@@ -242,6 +311,12 @@ function readInputAudio() {
async function promptGPT(systemPrompt, input) {
const model = document.getElementById("chat-model").value;
const mcpMode = Alpine.store("chat").mcpMode;

// Reset current request usage tracking for new request
if (Alpine.store("chat")) {
Alpine.store("chat").tokenUsage.currentRequest = null;
}

toggleLoader(true);

messages = Alpine.store("chat").messages();
@@ -373,10 +448,35 @@ async function promptGPT(systemPrompt, input) {
// Handle MCP non-streaming response
try {
const data = await response.json();
// MCP endpoint returns content in choices[0].text, not choices[0].message.content
const content = data.choices[0]?.text || "";

// Update token usage if present
if (data.usage) {
Alpine.store("chat").updateTokenUsage(data.usage);
}

// MCP endpoint returns content in choices[0].message.content (chat completion format)
// Fallback to choices[0].text for backward compatibility (completion format)
const content = data.choices[0]?.message?.content || data.choices[0]?.text || "";

if (!content && (!data.choices || data.choices.length === 0)) {
Alpine.store("chat").add(
"assistant",
`<span class='error'>Error: Empty response from MCP endpoint</span>`,
);
toggleLoader(false);
return;
}

if (content) {
// Count tokens for rate calculation (MCP mode - full content at once)
// Prefer actual token count from API if available
if (data.usage && data.usage.completion_tokens) {
tokensReceived = data.usage.completion_tokens;
} else {
tokensReceived += Math.ceil(content.length / 4);
}
updateTokensPerSecond();

// Process thinking tags using shared function
const { regularContent, thinkingContent } = processThinkingTags(content);

@@ -426,6 +526,9 @@
const addToChat = (token) => {
const chatStore = Alpine.store("chat");
chatStore.add("assistant", token);
// Count tokens for rate calculation (rough estimate: count characters/4)
tokensReceived += Math.ceil(token.length / 4);
updateTokensPerSecond();
// Efficiently scroll into view without triggering multiple reflows
// const messages = document.getElementById('messages');
// messages.scrollTop = messages.scrollHeight;
@@ -456,6 +559,12 @@
if (line.startsWith("data: ")) {
try {
const jsonData = JSON.parse(line.substring(6));

// Update token usage if present
if (jsonData.usage) {
Alpine.store("chat").updateTokenUsage(jsonData.usage);
}

const token = jsonData.choices[0].delta.content;

if (token) {
@@ -480,6 +589,9 @@
// Handle content based on thinking state
if (isThinking) {
thinkingContent += token;
// Count tokens for rate calculation
tokensReceived += Math.ceil(token.length / 4);
updateTokensPerSecond();
// Update the last thinking message or create a new one
if (lastThinkingMessageIndex === -1) {
// Create new thinking message
@@ -568,14 +680,71 @@ marked.setOptions({
},
});

// Alpine store is now initialized in chat.html inline script to ensure it's available before Alpine processes the DOM
// Only initialize if not already initialized (to avoid duplicate initialization)
document.addEventListener("alpine:init", () => {
Alpine.store("chat", {
// Check if store already exists (initialized in chat.html)
if (!Alpine.store("chat")) {
// Fallback initialization (should not be needed if chat.html loads correctly)
Alpine.store("chat", {
history: [],
languages: [undefined],
systemPrompt: "",
mcpMode: false,
contextSize: null,
tokenUsage: {
promptTokens: 0,
completionTokens: 0,
totalTokens: 0,
currentRequest: null
},
clear() {
this.history.length = 0;
this.tokenUsage = {
promptTokens: 0,
completionTokens: 0,
totalTokens: 0,
currentRequest: null
};
},
updateTokenUsage(usage) {
// Usage values in streaming responses are cumulative totals for the current request
// We track session totals separately and only update when we see new (higher) values
if (usage) {
const currentRequest = this.tokenUsage.currentRequest || {
promptTokens: 0,
completionTokens: 0,
totalTokens: 0
};

// Check if this is a new/updated usage (values increased)
const isNewUsage =
(usage.prompt_tokens !== undefined && usage.prompt_tokens > currentRequest.promptTokens) ||
(usage.completion_tokens !== undefined && usage.completion_tokens > currentRequest.completionTokens) ||
(usage.total_tokens !== undefined && usage.total_tokens > currentRequest.totalTokens);

if (isNewUsage) {
// Update session totals: subtract old request usage, add new
this.tokenUsage.promptTokens = this.tokenUsage.promptTokens - currentRequest.promptTokens + (usage.prompt_tokens || 0);
this.tokenUsage.completionTokens = this.tokenUsage.completionTokens - currentRequest.completionTokens + (usage.completion_tokens || 0);
this.tokenUsage.totalTokens = this.tokenUsage.totalTokens - currentRequest.totalTokens + (usage.total_tokens || 0);

// Store current request usage
this.tokenUsage.currentRequest = {
promptTokens: usage.prompt_tokens || 0,
completionTokens: usage.completion_tokens || 0,
totalTokens: usage.total_tokens || 0
};
}
}
},
getRemainingTokens() {
if (!this.contextSize) return null;
return Math.max(0, this.contextSize - this.tokenUsage.totalTokens);
},
getContextUsagePercent() {
if (!this.contextSize) return null;
return Math.min(100, (this.tokenUsage.totalTokens / this.contextSize) * 100);
},
add(role, content, image, audio) {
const N = this.history.length - 1;
@@ -640,5 +809,6 @@ document.addEventListener("alpine:init", () => {
audio: message.audio,
}));
},
});
});
}
});
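Note: since streaming responses report usage as cumulative totals for the in-flight request, updateTokenUsage keeps session totals by subtracting the previous per-request snapshot before adding the newest one. A self-contained sketch of that accounting under an assumed event sequence (names hypothetical, logic mirrors the store above):

```js
// Sketch: streaming usage chunks are cumulative per request, so session
// totals are maintained by replacing the current request's contribution.
const session = { prompt: 0, completion: 0, total: 0, current: null };

function applyUsage(u) {
  const cur = session.current || { prompt: 0, completion: 0, total: 0 };
  // Only apply when the cumulative counters actually grew.
  const grew = (u.prompt_tokens ?? 0) > cur.prompt ||
               (u.completion_tokens ?? 0) > cur.completion ||
               (u.total_tokens ?? 0) > cur.total;
  if (!grew) return;
  session.prompt     += (u.prompt_tokens     ?? 0) - cur.prompt;
  session.completion += (u.completion_tokens ?? 0) - cur.completion;
  session.total      += (u.total_tokens      ?? 0) - cur.total;
  session.current = {
    prompt: u.prompt_tokens ?? 0,
    completion: u.completion_tokens ?? 0,
    total: u.total_tokens ?? 0,
  };
}

// First request streams two cumulative updates…
applyUsage({ prompt_tokens: 10, completion_tokens: 5,  total_tokens: 15 });
applyUsage({ prompt_tokens: 10, completion_tokens: 20, total_tokens: 30 });
// …then a new request starts (currentRequest is reset) and reports once.
session.current = null;
applyUsage({ prompt_tokens: 8, completion_tokens: 4, total_tokens: 12 });

console.log(session.prompt, session.completion, session.total); // 18 24 42
```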