1 change: 1 addition & 0 deletions .air.toml
@@ -2,6 +2,7 @@
[build]
cmd = "make build"
bin = "./local-ai"
args_bin = [ "--debug" ]
include_ext = ["go", "html", "yaml", "toml", "json", "txt", "md"]
exclude_dir = ["pkg/grpc/proto"]
delay = 1000
12 changes: 7 additions & 5 deletions core/http/middleware/request.go
@@ -15,6 +15,7 @@ import (
"github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/valyala/fasthttp"

"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
@@ -167,11 +168,12 @@ func (re *RequestExtractor) SetOpenAIRequest(ctx *fiber.Ctx) error {
c1, cancel := context.WithCancel(re.applicationConfig.Context)
// Monitor the Fiber context and cancel our context when it's canceled
// This ensures we respect request cancellation without causing panics
go func() {
<-ctx.Context().Done()
// Fiber context was canceled (request completed or client disconnected)
cancel()
}()
go func(fiberCtx *fasthttp.RequestCtx) {
if fiberCtx != nil {
<-fiberCtx.Done()
cancel()
}
}(ctx.Context())
// Add the correlation ID to the new context
ctxWithCorrelationID := context.WithValue(c1, CorrelationIDKey, correlationID)

15 changes: 13 additions & 2 deletions core/http/routes/ui.go
@@ -91,11 +91,15 @@ func RegisterUIRoutes(app *fiber.App,
}

title := "LocalAI - Chat"
var modelContextSize *int

for _, b := range modelConfigs {
if b.HasUsecases(config.FLAG_CHAT) {
modelThatCanBeUsed = b.Name
title = "LocalAI - Chat with " + modelThatCanBeUsed
if b.LLMConfig.ContextSize != nil {
modelContextSize = b.LLMConfig.ContextSize
}
break
}
}
@@ -107,6 +111,7 @@
"GalleryConfig": galleryConfigs,
"ModelsConfig": modelConfigs,
"Model": modelThatCanBeUsed,
"ContextSize": modelContextSize,
"Version": internal.PrintableVersion(),
}

@@ -120,22 +125,28 @@
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)

galleryConfigs := map[string]*gallery.ModelConfig{}
modelName := c.Params("model")
var modelContextSize *int

for _, m := range modelConfigs {
cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
if err != nil {
continue
}
galleryConfigs[m.Name] = cfg
if m.Name == modelName && m.LLMConfig.ContextSize != nil {
modelContextSize = m.LLMConfig.ContextSize
}
}

summary := fiber.Map{
"Title": "LocalAI - Chat with " + c.Params("model"),
"Title": "LocalAI - Chat with " + modelName,
"BaseURL": utils.BaseURL(c),
"ModelsConfig": modelConfigs,
"GalleryConfig": galleryConfigs,
"ModelsWithoutConfig": modelsWithoutConfig,
"Model": c.Params("model"),
"Model": modelName,
"ContextSize": modelContextSize,
"Version": internal.PrintableVersion(),
}
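Note: the ContextSize value passed into these template maps is surfaced on the chat page, where the Alpine store (see the chat.js changes below) uses it to derive remaining-context figures. A minimal sketch of that arithmetic, with assumed example values:

```js
// Sketch (assumed values): remaining-context math mirroring
// getRemainingTokens()/getContextUsagePercent() further down.
const contextSize = 8192;   // e.g. ContextSize rendered into the page
const totalTokens = 3072;   // session-wide token usage so far
const remaining = Math.max(0, contextSize - totalTokens);           // 5120
const usagePct  = Math.min(100, (totalTokens / contextSize) * 100); // 37.5
```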

190 changes: 180 additions & 10 deletions core/http/static/chat.js
@@ -30,21 +30,63 @@ SOFTWARE.
// Global variable to store the current AbortController
let currentAbortController = null;
let currentReader = null;
let requestStartTime = null;
let tokensReceived = 0;
let tokensPerSecondInterval = null;
let lastTokensPerSecond = null; // Store the last calculated rate

function toggleLoader(show) {
const sendButton = document.getElementById('send-button');
const stopButton = document.getElementById('stop-button');
const headerLoadingIndicator = document.getElementById('header-loading-indicator');
const tokensPerSecondDisplay = document.getElementById('tokens-per-second');

if (show) {
sendButton.style.display = 'none';
stopButton.style.display = 'block';
document.getElementById("input").disabled = true;
if (headerLoadingIndicator) headerLoadingIndicator.style.display = 'block';
// Reset token tracking
requestStartTime = Date.now();
tokensReceived = 0;

// Start updating tokens/second display
if (tokensPerSecondDisplay) {
tokensPerSecondDisplay.textContent = '-';
updateTokensPerSecond();
tokensPerSecondInterval = setInterval(updateTokensPerSecond, 500); // Update every 500ms
}
} else {
document.getElementById("input").disabled = false;
sendButton.style.display = 'block';
stopButton.style.display = 'none';
if (headerLoadingIndicator) headerLoadingIndicator.style.display = 'none';
// Stop updating but keep the last value visible
if (tokensPerSecondInterval) {
clearInterval(tokensPerSecondInterval);
tokensPerSecondInterval = null;
}
// Keep the last calculated rate visible
if (tokensPerSecondDisplay && lastTokensPerSecond !== null) {
tokensPerSecondDisplay.textContent = lastTokensPerSecond;
}
currentAbortController = null;
currentReader = null;
requestStartTime = null;
tokensReceived = 0;
}
}

function updateTokensPerSecond() {
const tokensPerSecondDisplay = document.getElementById('tokens-per-second');
if (!tokensPerSecondDisplay || !requestStartTime) return;

const elapsedSeconds = (Date.now() - requestStartTime) / 1000;
if (elapsedSeconds > 0 && tokensReceived > 0) {
const rate = tokensReceived / elapsedSeconds;
const formattedRate = `${rate.toFixed(1)} tokens/s`;
tokensPerSecondDisplay.textContent = formattedRate;
lastTokensPerSecond = formattedRate; // Store the last calculated rate
} else if (elapsedSeconds > 0) {
tokensPerSecondDisplay.textContent = '-';
}
}
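Note: streaming deltas carry no per-chunk token counts, so the rate display falls back to a characters-divided-by-four estimate. A standalone sketch of that heuristic (the ~4 chars/token constant is a rough rule of thumb for English text, not a tokenizer guarantee):

```js
// Sketch: approximate tokens/s from streamed character count.
function estimateRate(charsStreamed, startMs, nowMs) {
  const tokens = Math.ceil(charsStreamed / 4); // ~4 chars per token
  const seconds = (nowMs - startMs) / 1000;
  return seconds > 0 ? `${(tokens / seconds).toFixed(1)} tokens/s` : '-';
}

console.log(estimateRate(2000, 0, 5000)); // "100.0 tokens/s"
```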

@@ -171,9 +213,30 @@ function readInputFile() {

function submitPrompt(event) {
event.preventDefault();

const input = document.getElementById("input");
if (!input) return;

const inputValue = input.value;
if (!inputValue.trim()) return; // Don't send empty messages

// If already processing, abort the current request and send the new one
if (currentAbortController || currentReader) {
// Abort current request
stopRequest();
// Small delay to ensure cleanup completes
setTimeout(() => {
// Continue with new request
processAndSendMessage(inputValue);
}, 100);
return;
}

processAndSendMessage(inputValue);
}

const input = document.getElementById("input").value;
let fullInput = input;
function processAndSendMessage(inputValue) {
let fullInput = inputValue;

// If there are file contents, append them to the input for the LLM
if (fileContents.length > 0) {
@@ -184,7 +247,7 @@
}

// Show file icons in chat if there are files
let displayContent = input;
let displayContent = inputValue;
if (currentFileNames.length > 0) {
displayContent += "\n\n";
currentFileNames.forEach(fileName => {
@@ -201,9 +264,15 @@
history[history.length - 1].content = fullInput;
}

document.getElementById("input").value = "";
const input = document.getElementById("input");
if (input) input.value = "";
const systemPrompt = localStorage.getItem("system_prompt");
Alpine.nextTick(() => { document.getElementById('messages').scrollIntoView(false); });

// Reset token tracking before starting new request
requestStartTime = Date.now();
tokensReceived = 0;

promptGPT(systemPrompt, fullInput);

// Reset file contents and names after sending
@@ -242,6 +311,12 @@ function readInputAudio() {
async function promptGPT(systemPrompt, input) {
const model = document.getElementById("chat-model").value;
const mcpMode = Alpine.store("chat").mcpMode;

// Reset current request usage tracking for new request
if (Alpine.store("chat")) {
Alpine.store("chat").tokenUsage.currentRequest = null;
}

toggleLoader(true);

messages = Alpine.store("chat").messages();
@@ -373,10 +448,35 @@ async function promptGPT(systemPrompt, input) {
// Handle MCP non-streaming response
try {
const data = await response.json();
// MCP endpoint returns content in choices[0].text, not choices[0].message.content
const content = data.choices[0]?.text || "";

// Update token usage if present
if (data.usage) {
Alpine.store("chat").updateTokenUsage(data.usage);
}

// MCP endpoint returns content in choices[0].message.content (chat completion format)
// Fallback to choices[0].text for backward compatibility (completion format)
const content = data.choices[0]?.message?.content || data.choices[0]?.text || "";

if (!content && (!data.choices || data.choices.length === 0)) {
Alpine.store("chat").add(
"assistant",
`<span class='error'>Error: Empty response from MCP endpoint</span>`,
);
toggleLoader(false);
return;
}

if (content) {
// Count tokens for rate calculation (MCP mode - full content at once)
// Prefer actual token count from API if available
if (data.usage && data.usage.completion_tokens) {
tokensReceived = data.usage.completion_tokens;
} else {
tokensReceived += Math.ceil(content.length / 4);
}
updateTokensPerSecond();

// Process thinking tags using shared function
const { regularContent, thinkingContent } = processThinkingTags(content);

@@ -426,6 +526,9 @@
const addToChat = (token) => {
const chatStore = Alpine.store("chat");
chatStore.add("assistant", token);
// Count tokens for rate calculation (rough estimate: count characters/4)
tokensReceived += Math.ceil(token.length / 4);
updateTokensPerSecond();
// Efficiently scroll into view without triggering multiple reflows
// const messages = document.getElementById('messages');
// messages.scrollTop = messages.scrollHeight;
@@ -456,6 +559,12 @@
if (line.startsWith("data: ")) {
try {
const jsonData = JSON.parse(line.substring(6));

// Update token usage if present
if (jsonData.usage) {
Alpine.store("chat").updateTokenUsage(jsonData.usage);
}

const token = jsonData.choices[0].delta.content;

if (token) {
@@ -480,6 +589,9 @@
// Handle content based on thinking state
if (isThinking) {
thinkingContent += token;
// Count tokens for rate calculation
tokensReceived += Math.ceil(token.length / 4);
updateTokensPerSecond();
// Update the last thinking message or create a new one
if (lastThinkingMessageIndex === -1) {
// Create new thinking message
@@ -568,14 +680,71 @@ marked.setOptions({
},
});

// Alpine store is now initialized in chat.html inline script to ensure it's available before Alpine processes the DOM
// Only initialize if not already initialized (to avoid duplicate initialization)
document.addEventListener("alpine:init", () => {
Alpine.store("chat", {
// Check if store already exists (initialized in chat.html)
if (!Alpine.store("chat")) {
// Fallback initialization (should not be needed if chat.html loads correctly)
Alpine.store("chat", {
history: [],
languages: [undefined],
systemPrompt: "",
mcpMode: false,
contextSize: null,
tokenUsage: {
promptTokens: 0,
completionTokens: 0,
totalTokens: 0,
currentRequest: null
},
clear() {
this.history.length = 0;
this.tokenUsage = {
promptTokens: 0,
completionTokens: 0,
totalTokens: 0,
currentRequest: null
};
},
updateTokenUsage(usage) {
// Usage values in streaming responses are cumulative totals for the current request
// We track session totals separately and only update when we see new (higher) values
if (usage) {
const currentRequest = this.tokenUsage.currentRequest || {
promptTokens: 0,
completionTokens: 0,
totalTokens: 0
};

// Check if this is a new/updated usage (values increased)
const isNewUsage =
(usage.prompt_tokens !== undefined && usage.prompt_tokens > currentRequest.promptTokens) ||
(usage.completion_tokens !== undefined && usage.completion_tokens > currentRequest.completionTokens) ||
(usage.total_tokens !== undefined && usage.total_tokens > currentRequest.totalTokens);

if (isNewUsage) {
// Update session totals: subtract old request usage, add new
this.tokenUsage.promptTokens = this.tokenUsage.promptTokens - currentRequest.promptTokens + (usage.prompt_tokens || 0);
this.tokenUsage.completionTokens = this.tokenUsage.completionTokens - currentRequest.completionTokens + (usage.completion_tokens || 0);
this.tokenUsage.totalTokens = this.tokenUsage.totalTokens - currentRequest.totalTokens + (usage.total_tokens || 0);

// Store current request usage
this.tokenUsage.currentRequest = {
promptTokens: usage.prompt_tokens || 0,
completionTokens: usage.completion_tokens || 0,
totalTokens: usage.total_tokens || 0
};
}
}
},
getRemainingTokens() {
if (!this.contextSize) return null;
return Math.max(0, this.contextSize - this.tokenUsage.totalTokens);
},
getContextUsagePercent() {
if (!this.contextSize) return null;
return Math.min(100, (this.tokenUsage.totalTokens / this.contextSize) * 100);
},
add(role, content, image, audio) {
const N = this.history.length - 1;
@@ -640,5 +809,6 @@ document.addEventListener("alpine:init", () => {
audio: message.audio,
}));
},
});
});
}
});
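Note: since streaming responses report usage as cumulative totals for the in-flight request, updateTokenUsage keeps session totals by subtracting the previous per-request snapshot before adding the newest one. A self-contained sketch of that accounting under an assumed event sequence (names hypothetical, logic mirrors the store above):

```js
// Sketch: streaming usage chunks are cumulative per request, so session
// totals are maintained by replacing the current request's contribution.
const session = { prompt: 0, completion: 0, total: 0, current: null };

function applyUsage(u) {
  const cur = session.current || { prompt: 0, completion: 0, total: 0 };
  // Only apply when the cumulative counters actually grew.
  const grew = (u.prompt_tokens ?? 0) > cur.prompt ||
               (u.completion_tokens ?? 0) > cur.completion ||
               (u.total_tokens ?? 0) > cur.total;
  if (!grew) return;
  session.prompt     += (u.prompt_tokens     ?? 0) - cur.prompt;
  session.completion += (u.completion_tokens ?? 0) - cur.completion;
  session.total      += (u.total_tokens      ?? 0) - cur.total;
  session.current = {
    prompt: u.prompt_tokens ?? 0,
    completion: u.completion_tokens ?? 0,
    total: u.total_tokens ?? 0,
  };
}

// First request streams two cumulative updates…
applyUsage({ prompt_tokens: 10, completion_tokens: 5,  total_tokens: 15 });
applyUsage({ prompt_tokens: 10, completion_tokens: 20, total_tokens: 30 });
// …then a new request starts (currentRequest is reset) and reports once.
session.current = null;
applyUsage({ prompt_tokens: 8, completion_tokens: 4, total_tokens: 12 });

console.log(session.prompt, session.completion, session.total); // 18 24 42
```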