From ac130ffe69275e6b188b239da28e5c4cb843c2ec Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 15 Nov 2025 14:48:45 +0100 Subject: [PATCH 1/6] feat(mcp): add LocalAI endpoint to stream live results of the agent Signed-off-by: Ettore Di Giacinto --- core/config/model_config.go | 66 +++++++ core/http/app.go | 2 +- core/http/endpoints/localai/mcp.go | 268 +++++++++++++++++++++++++++++ core/http/endpoints/openai/mcp.go | 7 + core/http/routes/localai.go | 23 ++- core/http/static/chat.js | 181 ++++++++++++++----- core/http/views/chat.html | 45 ++++- 7 files changed, 540 insertions(+), 52 deletions(-) create mode 100644 core/http/endpoints/localai/mcp.go diff --git a/core/config/model_config.go b/core/config/model_config.go index 41664f2a3dd4..1fcc13fa9754 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -1,14 +1,17 @@ package config import ( + "context" "os" "regexp" "slices" "strings" + "github.com/modelcontextprotocol/go-sdk/mcp" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/downloader" "github.com/mudler/LocalAI/pkg/functions" + "github.com/mudler/cogito" "gopkg.in/yaml.v3" ) @@ -668,3 +671,66 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecases) bool { return true } + +// BuildCogitoOptions generates cogito options from the model configuration +// It accepts a context, MCP sessions, and optional callback functions for status, reasoning, tool calls, and tool results +func (c *ModelConfig) BuildCogitoOptions( + ctx context.Context, + sessions []*mcp.ClientSession, + statusCallback func(string), + reasoningCallback func(string), + toolCallCallback func(*cogito.ToolChoice) bool, + toolCallResultCallback func(cogito.ToolStatus), +) []cogito.Option { + cogitoOpts := []cogito.Option{ + cogito.WithContext(ctx), + cogito.WithMCPs(sessions...), + cogito.WithIterations(3), // default to 3 iterations + cogito.WithMaxAttempts(3), // default to 3 attempts + cogito.WithForceReasoning(), + } + + // Add optional callbacks if provided + if statusCallback != nil { + cogitoOpts = append(cogitoOpts, cogito.WithStatusCallback(statusCallback)) + } + + if reasoningCallback != nil { + cogitoOpts = append(cogitoOpts, cogito.WithReasoningCallback(reasoningCallback)) + } + + if toolCallCallback != nil { + cogitoOpts = append(cogitoOpts, cogito.WithToolCallBack(toolCallCallback)) + } + + if toolCallResultCallback != nil { + cogitoOpts = append(cogitoOpts, cogito.WithToolCallResultCallback(toolCallResultCallback)) + } + + // Apply agent configuration options + if c.Agent.EnableReasoning { + cogitoOpts = append(cogitoOpts, cogito.EnableToolReasoner) + } + + if c.Agent.EnablePlanning { + cogitoOpts = append(cogitoOpts, cogito.EnableAutoPlan) + } + + if c.Agent.EnableMCPPrompts { + cogitoOpts = append(cogitoOpts, cogito.EnableMCPPrompts) + } + + if c.Agent.EnablePlanReEvaluator { + cogitoOpts = append(cogitoOpts, cogito.EnableAutoPlanReEvaluator) + } + + if c.Agent.MaxIterations != 0 { + cogitoOpts = append(cogitoOpts, cogito.WithIterations(c.Agent.MaxIterations)) + } + + if c.Agent.MaxAttempts != 0 { + cogitoOpts = append(cogitoOpts, cogito.WithMaxAttempts(c.Agent.MaxAttempts)) + } + + return cogitoOpts +} diff --git a/core/http/app.go b/core/http/app.go index 731e69df565c..7497a5d611fa 100644 --- a/core/http/app.go +++ b/core/http/app.go @@ -205,7 +205,7 @@ func API(application *application.Application) (*echo.Echo, error) { opcache = services.NewOpCache(application.GalleryService()) } - routes.RegisterLocalAIRoutes(e, requestExtractor, 
application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.GalleryService(), opcache) + routes.RegisterLocalAIRoutes(e, requestExtractor, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.GalleryService(), opcache, application.TemplatesEvaluator()) routes.RegisterOpenAIRoutes(e, requestExtractor, application) if !application.ApplicationConfig().DisableWebUI { routes.RegisterUIAPIRoutes(e, application.ModelConfigLoader(), application.ApplicationConfig(), application.GalleryService(), opcache) diff --git a/core/http/endpoints/localai/mcp.go b/core/http/endpoints/localai/mcp.go new file mode 100644 index 000000000000..b4d86af74918 --- /dev/null +++ b/core/http/endpoints/localai/mcp.go @@ -0,0 +1,268 @@ +package localai + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strings" + "time" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/config" + mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp" + "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/core/templates" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/cogito" + "github.com/rs/zerolog/log" +) + +// MCP SSE Event Types +type MCPReasoningEvent struct { + Type string `json:"type"` + Content string `json:"content"` +} + +type MCPToolCallEvent struct { + Type string `json:"type"` + Name string `json:"name"` + Arguments map[string]interface{} `json:"arguments"` + Reasoning string `json:"reasoning"` +} + +type MCPToolResultEvent struct { + Type string `json:"type"` + Name string `json:"name"` + Result string `json:"result"` +} + +type MCPStatusEvent struct { + Type string `json:"type"` + Message string `json:"message"` +} + +type MCPAssistantEvent struct { + Type string `json:"type"` + Content string `json:"content"` +} + +type MCPErrorEvent struct { + Type string `json:"type"` + Message string `json:"message"` +} + +// MCPStreamEndpoint is the SSE streaming endpoint for MCP chat completions +// @Summary Stream MCP chat completions with reasoning, tool calls, and results +// @Param request body schema.OpenAIRequest true "query params" +// @Success 200 {object} schema.OpenAIResponse "Response" +// @Router /v1/mcp/chat/completions [post] +func MCPStreamEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + ctx := c.Request().Context() + + // Handle Correlation + id := c.Request().Header.Get("X-Correlation-ID") + if id == "" { + id = fmt.Sprintf("mcp-%d", time.Now().UnixNano()) + } + + input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest) + if !ok || input.Model == "" { + return echo.ErrBadRequest + } + + config, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) + if !ok || config == nil { + return echo.ErrBadRequest + } + + if config.MCP.Servers == "" && config.MCP.Stdio == "" { + return fmt.Errorf("no MCP servers configured") + } + + // Get MCP config from model config + remote, stdio, err := config.MCP.MCPConfigFromYAML() + if err != nil { + return fmt.Errorf("failed to get MCP config: %w", err) + } + + // Check if we have tools in cache, or we have to have an initial connection + sessions, err := mcpTools.SessionsFromMCPConfig(config.Name, remote, stdio) + if err != nil { + return fmt.Errorf("failed to get MCP sessions: %w", 
err) + } + + if len(sessions) == 0 { + return fmt.Errorf("no working MCP servers found") + } + + // Set up SSE headers + c.Response().Header().Set("Content-Type", "text/event-stream") + c.Response().Header().Set("Cache-Control", "no-cache") + c.Response().Header().Set("Connection", "keep-alive") + c.Response().Header().Set("X-Correlation-ID", id) + + // Create channel for streaming events + events := make(chan interface{}) + ended := make(chan error, 1) + + ctxWithCancellation, cancel := context.WithCancel(ctx) + defer cancel() + + // Build fragment from messages + fragment := cogito.NewEmptyFragment() + for _, message := range input.Messages { + fragment = fragment.AddMessage(message.Role, message.StringContent) + } + + port := appConfig.APIAddress[strings.LastIndex(appConfig.APIAddress, ":")+1:] + apiKey := "" + if appConfig.ApiKeys != nil && len(appConfig.ApiKeys) > 0 { + apiKey = appConfig.ApiKeys[0] + } + + // TODO: instead of connecting to the API, we should just wire this internally + // and act like completion.go. + // We can do this as cogito expects an interface and we can create one that + // we satisfy to just call internally ComputeChoices + defaultLLM := cogito.NewOpenAILLM(config.Name, apiKey, "http://127.0.0.1:"+port) + + // Set up callbacks for streaming + statusCallback := func(s string) { + events <- MCPStatusEvent{ + Type: "status", + Message: s, + } + } + + reasoningCallback := func(s string) { + events <- MCPReasoningEvent{ + Type: "reasoning", + Content: s, + } + } + + toolCallCallback := func(t *cogito.ToolChoice) bool { + events <- MCPToolCallEvent{ + Type: "tool_call", + Name: t.Name, + Arguments: t.Arguments, + Reasoning: t.Reasoning, + } + return true + } + + toolCallResultCallback := func(t cogito.ToolStatus) { + events <- MCPToolResultEvent{ + Type: "tool_result", + Name: t.Name, + Result: t.Result, + } + } + + // Build cogito options using the consolidated method + cogitoOpts := config.BuildCogitoOptions( + ctxWithCancellation, + sessions, + statusCallback, + reasoningCallback, + toolCallCallback, + toolCallResultCallback, + ) + + // Execute tools in a goroutine + go func() { + defer close(events) + + f, err := cogito.ExecuteTools( + defaultLLM, fragment, + cogitoOpts..., + ) + if err != nil && !errors.Is(err, cogito.ErrNoToolSelected) { + events <- MCPErrorEvent{ + Type: "error", + Message: fmt.Sprintf("Failed to execute tools: %v", err), + } + ended <- err + return + } + + // Get final response + f, err = defaultLLM.Ask(ctx, f) + if err != nil { + events <- MCPErrorEvent{ + Type: "error", + Message: fmt.Sprintf("Failed to get response: %v", err), + } + ended <- err + return + } + + // Stream final assistant response + content := f.LastMessage().Content + events <- MCPAssistantEvent{ + Type: "assistant", + Content: content, + } + + ended <- nil + }() + + // Stream events to client + LOOP: + for { + select { + case <-ctx.Done(): + // Context was cancelled (client disconnected or request cancelled) + log.Debug().Msgf("Request context cancelled, stopping stream") + cancel() + break LOOP + case event := <-events: + if event == nil { + // Channel closed + break LOOP + } + eventData, err := json.Marshal(event) + if err != nil { + log.Debug().Msgf("Failed to marshal event: %v", err) + continue + } + log.Debug().Msgf("Sending event: %s", string(eventData)) + _, err = fmt.Fprintf(c.Response().Writer, "data: %s\n\n", string(eventData)) + if err != nil { + log.Debug().Msgf("Sending event failed: %v", err) + cancel() + return err + } + c.Response().Flush() + case err := 
<-ended: + if err == nil { + // Send done signal + fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n") + c.Response().Flush() + break LOOP + } + log.Error().Msgf("Stream ended with error: %v", err) + errorEvent := MCPErrorEvent{ + Type: "error", + Message: err.Error(), + } + errorData, marshalErr := json.Marshal(errorEvent) + if marshalErr != nil { + fmt.Fprintf(c.Response().Writer, "data: {\"type\":\"error\",\"message\":\"Internal error\"}\n\n") + } else { + fmt.Fprintf(c.Response().Writer, "data: %s\n\n", string(errorData)) + } + fmt.Fprintf(c.Response().Writer, "data: [DONE]\n\n") + c.Response().Flush() + return nil + } + } + + log.Debug().Msgf("Stream ended") + return nil + } +} + diff --git a/core/http/endpoints/openai/mcp.go b/core/http/endpoints/openai/mcp.go index a91706f51d10..6bff942c486a 100644 --- a/core/http/endpoints/openai/mcp.go +++ b/core/http/endpoints/openai/mcp.go @@ -95,6 +95,13 @@ func MCPCompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, log.Debug().Msgf("[model agent] [model: %s] Status: %s", config.Name, s) }), cogito.WithContext(ctxWithCancellation), + cogito.WithToolCallBack(func(t *cogito.ToolChoice) bool { + log.Debug().Msgf("[model agent] [model: %s] Tool call: %s, reasoning: %s, arguments: %+v", t.Name, t.Reasoning, t.Arguments) + return true + }), + cogito.WithToolCallResultCallback(func(t cogito.ToolStatus) { + log.Debug().Msgf("[model agent] [model: %s] Tool call result: %s, tool arguments: %+v", t.Name, t.Result, t.ToolArguments) + }), cogito.WithMCPs(sessions...), cogito.WithIterations(3), // default to 3 iterations cogito.WithMaxAttempts(3), // default to 3 attempts diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go index 7b1c003ca021..bf8a7bfb8f16 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -7,6 +7,7 @@ import ( "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/services" + "github.com/mudler/LocalAI/core/templates" "github.com/mudler/LocalAI/internal" "github.com/mudler/LocalAI/pkg/model" echoswagger "github.com/swaggo/echo-swagger" @@ -18,7 +19,8 @@ func RegisterLocalAIRoutes(router *echo.Echo, ml *model.ModelLoader, appConfig *config.ApplicationConfig, galleryService *services.GalleryService, - opcache *services.OpCache) { + opcache *services.OpCache, + evaluator *templates.Evaluator) { router.GET("/swagger/*", echoswagger.WrapHandler) // default @@ -133,4 +135,23 @@ func RegisterLocalAIRoutes(router *echo.Echo, requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TOKENIZE)), requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TokenizeRequest) })) + // MCP Stream endpoint + if evaluator != nil { + mcpStreamHandler := localai.MCPStreamEndpoint(cl, ml, evaluator, appConfig) + mcpStreamMiddleware := []echo.MiddlewareFunc{ + requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)), + requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + func(next echo.HandlerFunc) echo.HandlerFunc { + return func(c echo.Context) error { + if err := requestExtractor.SetOpenAIRequest(c); err != nil { + return err + } + return next(c) + } + }, + } + router.POST("/v1/mcp/chat/completions", mcpStreamHandler, mcpStreamMiddleware...) + router.POST("/mcp/v1/chat/completions", mcpStreamHandler, mcpStreamMiddleware...) 
+ } + } diff --git a/core/http/static/chat.js b/core/http/static/chat.js index 993c956ac91a..634255227380 100644 --- a/core/http/static/chat.js +++ b/core/http/static/chat.js @@ -379,16 +379,14 @@ async function promptGPT(systemPrompt, input) { document.getElementById("fileName").innerHTML = ""; // Choose endpoint based on MCP mode - const endpoint = mcpMode ? "mcp/v1/chat/completions" : "v1/chat/completions"; + const endpoint = mcpMode ? "v1/mcp/chat/completions" : "v1/chat/completions"; const requestBody = { model: model, messages: messages, }; - // Only add stream parameter for regular chat (MCP doesn't support streaming) - if (!mcpMode) { - requestBody.stream = true; - } + // Add stream parameter for both regular chat and MCP (MCP now supports SSE streaming) + requestBody.stream = true; let response; try { @@ -444,64 +442,153 @@ async function promptGPT(systemPrompt, input) { return; } + // Handle streaming response (both regular and MCP mode now use SSE) if (mcpMode) { - // Handle MCP non-streaming response + // Handle MCP SSE streaming with new event types + const reader = response.body + ?.pipeThrough(new TextDecoderStream()) + .getReader(); + + if (!reader) { + Alpine.store("chat").add( + "assistant", + `Error: Failed to decode MCP API response`, + ); + toggleLoader(false); + return; + } + + // Store reader globally so stop button can cancel it + currentReader = reader; + + let buffer = ""; + let assistantContent = ""; + let lastAssistantMessageIndex = -1; + try { - const data = await response.json(); - - // Update token usage if present - if (data.usage) { - Alpine.store("chat").updateTokenUsage(data.usage); - } - - // MCP endpoint returns content in choices[0].message.content (chat completion format) - // Fallback to choices[0].text for backward compatibility (completion format) - const content = data.choices[0]?.message?.content || data.choices[0]?.text || ""; - - if (!content && (!data.choices || data.choices.length === 0)) { - Alpine.store("chat").add( - "assistant", - `Error: Empty response from MCP endpoint`, - ); - toggleLoader(false); - return; + while (true) { + const { value, done } = await reader.read(); + if (done) break; + + buffer += value; + + let lines = buffer.split("\n"); + buffer = lines.pop(); // Retain any incomplete line in the buffer + + lines.forEach((line) => { + if (line.length === 0 || line.startsWith(":")) return; + if (line === "data: [DONE]") { + return; + } + + if (line.startsWith("data: ")) { + try { + const eventData = JSON.parse(line.substring(6)); + + // Handle different event types + switch (eventData.type) { + case "reasoning": + if (eventData.content) { + Alpine.store("chat").add("reasoning", eventData.content); + } + break; + + case "tool_call": + if (eventData.name) { + const toolCallContent = `**Tool:** ${eventData.name}\n\n` + + (eventData.reasoning ? 
`**Reasoning:** ${eventData.reasoning}\n\n` : '') + + `**Arguments:**\n\`\`\`json\n${JSON.stringify(eventData.arguments, null, 2)}\n\`\`\``; + Alpine.store("chat").add("tool_call", toolCallContent); + } + break; + + case "tool_result": + if (eventData.name) { + const toolResultContent = `**Tool:** ${eventData.name}\n\n` + + `**Result:**\n\`\`\`\n${eventData.result}\n\`\`\``; + Alpine.store("chat").add("tool_result", toolResultContent); + } + break; + + case "status": + // Status messages can be logged but not necessarily displayed + console.log("[MCP Status]", eventData.message); + break; + + case "assistant": + if (eventData.content) { + assistantContent += eventData.content; + // Count tokens for rate calculation + tokensReceived += Math.ceil(eventData.content.length / 4); + updateTokensPerSecond(); + + // Process thinking tags in assistant content + const { regularContent, thinkingContent } = processThinkingTags(assistantContent); + + // Update or create assistant message + if (lastAssistantMessageIndex === -1) { + Alpine.store("chat").add("assistant", regularContent || assistantContent); + lastAssistantMessageIndex = Alpine.store("chat").history.length - 1; + } else { + const chatStore = Alpine.store("chat"); + const lastMessage = chatStore.history[lastAssistantMessageIndex]; + if (lastMessage && lastMessage.role === "assistant") { + lastMessage.content = regularContent || assistantContent; + lastMessage.html = DOMPurify.sanitize(marked.parse(lastMessage.content)); + } + } + + // Add thinking content if present + if (thinkingContent) { + Alpine.store("chat").add("thinking", thinkingContent); + } + } + break; + + case "error": + Alpine.store("chat").add( + "assistant", + `MCP Error: ${eventData.message}`, + ); + break; + } + } catch (error) { + console.error("Failed to parse MCP event:", line, error); + } + } + }); } - - if (content) { - // Count tokens for rate calculation (MCP mode - full content at once) - // Prefer actual token count from API if available - if (data.usage && data.usage.completion_tokens) { - tokensReceived = data.usage.completion_tokens; - } else { - tokensReceived += Math.ceil(content.length / 4); + + // Final assistant content flush if any data remains + if (assistantContent.trim() && lastAssistantMessageIndex !== -1) { + const { regularContent, thinkingContent } = processThinkingTags(assistantContent); + const chatStore = Alpine.store("chat"); + const lastMessage = chatStore.history[lastAssistantMessageIndex]; + if (lastMessage && lastMessage.role === "assistant") { + lastMessage.content = regularContent || assistantContent; + lastMessage.html = DOMPurify.sanitize(marked.parse(lastMessage.content)); } - updateTokensPerSecond(); - - // Process thinking tags using shared function - const { regularContent, thinkingContent } = processThinkingTags(content); - - // Add thinking content if present if (thinkingContent) { Alpine.store("chat").add("thinking", thinkingContent); } - - // Add regular content if present - if (regularContent) { - Alpine.store("chat").add("assistant", regularContent); - } } - - // Highlight all code blocks + + // Highlight all code blocks once at the end hljs.highlightAll(); } catch (error) { // Don't show error if request was aborted by user - if (error.name !== 'AbortError' || currentAbortController) { + if (error.name !== 'AbortError' || !currentAbortController) { Alpine.store("chat").add( "assistant", - `Error: Failed to parse MCP response`, + `Error: Failed to process MCP stream`, ); } } finally { + // Perform any cleanup if necessary + if 
(reader) { + reader.releaseLock(); + } + currentReader = null; currentAbortController = null; } } else { diff --git a/core/http/views/chat.html b/core/http/views/chat.html index 5aa45e3e30c5..acb98f1bca35 100644 --- a/core/http/views/chat.html +++ b/core/http/views/chat.html @@ -111,8 +111,8 @@ }, add(role, content, image, audio) { const N = this.history.length - 1; - // For thinking messages, always create a new message - if (role === "thinking") { + // For thinking, reasoning, tool_call, and tool_result messages, always create a new message + if (role === "thinking" || role === "reasoning" || role === "tool_call" || role === "tool_result") { let c = ""; const lines = content.split("\n"); lines.forEach((line) => { @@ -527,7 +527,46 @@

-
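
Example client for the new streaming endpoint, as a minimal sketch rather than part of the patch: the route (`/v1/mcp/chat/completions`), the `data: ...` SSE framing, the `[DONE]` terminator, and the event `type` values (`status`, `reasoning`, `tool_call`, `tool_result`, `assistant`, `error`) are taken from the diff above, while the host, port, model name, and prompt are placeholder assumptions that will differ per deployment. If API keys are configured, an `Authorization: Bearer <key>` header is presumably needed, as on the other OpenAI-compatible routes.

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

// Minimal event envelope: every event emitted by the endpoint carries a
// "type" field; the remaining fields are populated depending on that type.
type mcpEvent struct {
	Type    string `json:"type"`
	Content string `json:"content,omitempty"`
	Message string `json:"message,omitempty"`
	Name    string `json:"name,omitempty"`
	Result  string `json:"result,omitempty"`
}

func main() {
	// Placeholder model and prompt; adjust to your LocalAI instance.
	body, _ := json.Marshal(map[string]any{
		"model": "my-agent-model",
		"messages": []map[string]string{
			{"role": "user", "content": "What is the weather in Rome today?"},
		},
	})

	resp, err := http.Post("http://localhost:8080/v1/mcp/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "data: ") {
			continue // skip blank separator lines between events
		}
		payload := strings.TrimPrefix(line, "data: ")
		if payload == "[DONE]" {
			break
		}
		var ev mcpEvent
		if err := json.Unmarshal([]byte(payload), &ev); err != nil {
			continue
		}
		switch ev.Type {
		case "assistant":
			fmt.Print(ev.Content)
		case "tool_call", "tool_result":
			fmt.Printf("\n[%s] %s\n", ev.Type, ev.Name)
		case "error":
			fmt.Printf("\n[error] %s\n", ev.Message)
		}
	}
	fmt.Println()
}
```

The per-event handling above mirrors what chat.js in this patch does in the web UI, just reduced to printing the assistant text and noting tool activity.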