From b529293a3d0221f761f468cb4c663adb6d46daa3 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 15 May 2026 10:31:27 +0000 Subject: [PATCH 1/3] realtime: honor output_modalities to skip TTS in text-only mode The emulated realtime pipeline previously ignored the OpenAI Realtime spec field output_modalities and always synthesized TTS. Add resolveOutputModalities + modalitiesContainAudio helpers and gate the TTS / ResponseOutputAudio* emission so a client requesting ["text"] gets only ResponseOutputText* events. This lets thin clients (e.g. thing5-poc) cache TTS on the client side while still using the realtime WS for VAD + STT + LLM + tool-call parsing. Assisted-by: Claude:claude-opus-4-7 --- core/http/endpoints/openai/realtime.go | 204 +++++++++++------- .../openai/realtime_modality_test.go | 39 ++++ 2 files changed, 167 insertions(+), 76 deletions(-) create mode 100644 core/http/endpoints/openai/realtime_modality_test.go diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index b7a0be6acb5d..b83d52e02c6b 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -54,6 +54,30 @@ const ( "Avoid parenthetical asides, URLs, and anything that cannot be clearly vocalized." ) +// resolveOutputModalities returns the effective output modalities for a +// response: response-level overrides session-level, and the OpenAI Realtime +// spec default is ["audio"] when neither is set. +func resolveOutputModalities(session, response []types.Modality) []types.Modality { + if len(response) > 0 { + return response + } + if len(session) > 0 { + return session + } + return []types.Modality{types.ModalityAudio} +} + +// modalitiesContainAudio reports whether the resolved modalities include audio +// output. +func modalitiesContainAudio(m []types.Modality) bool { + for _, x := range m { + if x == types.ModalityAudio { + return true + } + } + return false +} + // A model can be "emulated" that is: transcribe audio to text -> feed text to the LLM -> generate audio as result // If the model support instead audio-to-audio, we will use the specific gRPC calls instead @@ -82,6 +106,10 @@ type Session struct { InputSampleRate int OutputSampleRate int MaxOutputTokens types.IntOrInf + // OutputModalities mirrors the OpenAI Realtime spec field of the same + // name. Empty means "use the spec default" (audio). ["text"] suppresses + // TTS so the client receives only response.output_text.* events. + OutputModalities []types.Modality // MaxHistoryItems caps the number of MessageItems passed to the LLM each // turn (0 = unlimited). Small models — especially the LFM2.5-Audio 1.5B // served via the liquid-audio backend — degrade quickly past a handful @@ -1015,6 +1043,10 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode session.MaxOutputTokens = rt.MaxOutputTokens } + if len(rt.OutputModalities) > 0 { + session.OutputModalities = rt.OutputModalities + } + return nil } @@ -1654,106 +1686,126 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa }) } - // Check for cancellation before TTS - if ctx.Err() != nil { - xlog.Debug("Response cancelled before TTS (barge-in)") - sendCancelledResponse() - return - } - - audioFilePath, res, err := session.ModelInterface.TTS(ctx, finalSpeech, session.Voice, session.InputAudioTranscription.Language) - if err != nil { + var audioString string + _, isWebRTC := t.(*WebRTCTransport) + modalities := resolveOutputModalities(session.OutputModalities, nil) + if modalitiesContainAudio(modalities) { + // Check for cancellation before TTS if ctx.Err() != nil { - xlog.Debug("TTS cancelled (barge-in)") + xlog.Debug("Response cancelled before TTS (barge-in)") sendCancelledResponse() return } - xlog.Error("TTS failed", "error", err) - sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID) - return - } - if !res.Success { - xlog.Error("TTS failed", "message", res.Message) - sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID) - return - } - defer os.Remove(audioFilePath) - audioBytes, err := os.ReadFile(audioFilePath) - if err != nil { - xlog.Error("failed to read TTS file", "error", err) - sendError(t, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID) - return - } - - // Parse WAV header to get raw PCM and the actual sample rate from the TTS backend. - pcmData, ttsSampleRate := laudio.ParseWAV(audioBytes) - if ttsSampleRate == 0 { - ttsSampleRate = localSampleRate - } - xlog.Debug("TTS audio parsed", "raw_bytes", len(audioBytes), "pcm_bytes", len(pcmData), "sample_rate", ttsSampleRate) + audioFilePath, res, err := session.ModelInterface.TTS(ctx, finalSpeech, session.Voice, session.InputAudioTranscription.Language) + if err != nil { + if ctx.Err() != nil { + xlog.Debug("TTS cancelled (barge-in)") + sendCancelledResponse() + return + } + xlog.Error("TTS failed", "error", err) + sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID) + return + } + if !res.Success { + xlog.Error("TTS failed", "message", res.Message) + sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID) + return + } + defer os.Remove(audioFilePath) - // SendAudio (WebRTC) passes PCM at the TTS sample rate directly to the - // Opus encoder, which resamples to 48kHz internally. This avoids a - // lossy intermediate resample through 16kHz. - // XXX: This is a noop in websocket mode; it's included in the JSON instead - if err := t.SendAudio(ctx, pcmData, ttsSampleRate); err != nil { - if ctx.Err() != nil { - xlog.Debug("Audio playback cancelled (barge-in)") - sendCancelledResponse() + audioBytes, err := os.ReadFile(audioFilePath) + if err != nil { + xlog.Error("failed to read TTS file", "error", err) + sendError(t, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID) return } - xlog.Error("failed to send audio via transport", "error", err) - } - _, isWebRTC := t.(*WebRTCTransport) + // Parse WAV header to get raw PCM and the actual sample rate from the TTS backend. + pcmData, ttsSampleRate := laudio.ParseWAV(audioBytes) + if ttsSampleRate == 0 { + ttsSampleRate = localSampleRate + } + xlog.Debug("TTS audio parsed", "raw_bytes", len(audioBytes), "pcm_bytes", len(pcmData), "sample_rate", ttsSampleRate) + + // SendAudio (WebRTC) passes PCM at the TTS sample rate directly to the + // Opus encoder, which resamples to 48kHz internally. This avoids a + // lossy intermediate resample through 16kHz. + // XXX: This is a noop in websocket mode; it's included in the JSON instead + if err := t.SendAudio(ctx, pcmData, ttsSampleRate); err != nil { + if ctx.Err() != nil { + xlog.Debug("Audio playback cancelled (barge-in)") + sendCancelledResponse() + return + } + xlog.Error("failed to send audio via transport", "error", err) + } - // For WebSocket clients, resample to the session's output rate and - // deliver audio as base64 in JSON events. WebRTC clients already - // received audio over the RTP track, so skip the base64 payload. - var audioString string - if !isWebRTC { - wsPCM := pcmData - if ttsSampleRate != session.OutputSampleRate { - samples := sound.BytesToInt16sLE(pcmData) - resampled := sound.ResampleInt16(samples, ttsSampleRate, session.OutputSampleRate) - wsPCM = sound.Int16toBytesLE(resampled) + // For WebSocket clients, resample to the session's output rate and + // deliver audio as base64 in JSON events. WebRTC clients already + // received audio over the RTP track, so skip the base64 payload. + if !isWebRTC { + wsPCM := pcmData + if ttsSampleRate != session.OutputSampleRate { + samples := sound.BytesToInt16sLE(pcmData) + resampled := sound.ResampleInt16(samples, ttsSampleRate, session.OutputSampleRate) + wsPCM = sound.Int16toBytesLE(resampled) + } + audioString = base64.StdEncoding.EncodeToString(wsPCM) } - audioString = base64.StdEncoding.EncodeToString(wsPCM) - } - sendEvent(t, types.ResponseOutputAudioTranscriptDeltaEvent{ - ServerEventBase: types.ServerEventBase{}, - ResponseID: responseID, - ItemID: item.Assistant.ID, - OutputIndex: 0, - ContentIndex: 0, - Delta: finalSpeech, - }) - sendEvent(t, types.ResponseOutputAudioTranscriptDoneEvent{ - ServerEventBase: types.ServerEventBase{}, - ResponseID: responseID, - ItemID: item.Assistant.ID, - OutputIndex: 0, - ContentIndex: 0, - Transcript: finalSpeech, - }) + sendEvent(t, types.ResponseOutputAudioTranscriptDeltaEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + Delta: finalSpeech, + }) + sendEvent(t, types.ResponseOutputAudioTranscriptDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + Transcript: finalSpeech, + }) - if !isWebRTC { - sendEvent(t, types.ResponseOutputAudioDeltaEvent{ + if !isWebRTC { + sendEvent(t, types.ResponseOutputAudioDeltaEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + Delta: audioString, + }) + sendEvent(t, types.ResponseOutputAudioDoneEvent{ + ServerEventBase: types.ServerEventBase{}, + ResponseID: responseID, + ItemID: item.Assistant.ID, + OutputIndex: 0, + ContentIndex: 0, + }) + } + } else { + // Text-only mode: skip TTS, emit only the text events. + sendEvent(t, types.ResponseOutputTextDeltaEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: item.Assistant.ID, OutputIndex: 0, ContentIndex: 0, - Delta: audioString, + Delta: finalSpeech, }) - sendEvent(t, types.ResponseOutputAudioDoneEvent{ + sendEvent(t, types.ResponseOutputTextDoneEvent{ ServerEventBase: types.ServerEventBase{}, ResponseID: responseID, ItemID: item.Assistant.ID, OutputIndex: 0, ContentIndex: 0, + Text: finalSpeech, }) } diff --git a/core/http/endpoints/openai/realtime_modality_test.go b/core/http/endpoints/openai/realtime_modality_test.go new file mode 100644 index 000000000000..0a17d7a03b11 --- /dev/null +++ b/core/http/endpoints/openai/realtime_modality_test.go @@ -0,0 +1,39 @@ +package openai + +import ( + "github.com/mudler/LocalAI/core/http/endpoints/openai/types" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("resolveOutputModalities", func() { + It("defaults to audio when neither session nor response specify", func() { + got := resolveOutputModalities(nil, nil) + Expect(got).To(ConsistOf(types.ModalityAudio)) + }) + + It("uses session modalities when response omits them", func() { + sess := []types.Modality{types.ModalityText} + got := resolveOutputModalities(sess, nil) + Expect(got).To(ConsistOf(types.ModalityText)) + }) + + It("response modalities override session", func() { + sess := []types.Modality{types.ModalityAudio} + resp := []types.Modality{types.ModalityText} + got := resolveOutputModalities(sess, resp) + Expect(got).To(ConsistOf(types.ModalityText)) + }) + + It("returns false from modalitiesContainAudio for text-only", func() { + Expect(modalitiesContainAudio([]types.Modality{types.ModalityText})).To(BeFalse()) + }) + + It("returns true from modalitiesContainAudio for audio (default)", func() { + Expect(modalitiesContainAudio([]types.Modality{types.ModalityAudio})).To(BeTrue()) + }) + + It("returns true when both audio and text are present", func() { + Expect(modalitiesContainAudio([]types.Modality{types.ModalityText, types.ModalityAudio})).To(BeTrue()) + }) +}) From 7ed6c4b44e46803030deb191c3736f83c261fcfb Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 15 May 2026 10:31:35 +0000 Subject: [PATCH 2/3] realtime: plumb response-level output_modalities and echo on session Follow-up to the previous commit: - Resolve response.create's output_modalities at the gate so a per-response override of an audio session is honored (the test asserted this contract but the production call site was passing nil). - Mirror OutputModalities in the RealtimeSession echo so session.update round-trips the client-supplied value, matching MaxOutputTokens's pattern. Assisted-by: Claude:claude-opus-4-7 --- core/http/endpoints/openai/realtime.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index b83d52e02c6b..4f4456b316bb 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -190,13 +190,14 @@ func (s *Session) ToServer() types.SessionUnion { } else { return types.SessionUnion{ Realtime: &types.RealtimeSession{ - ID: s.ID, - Object: "realtime.session", - Model: s.Model, - Instructions: s.Instructions, - Tools: s.Tools, - ToolChoice: s.ToolChoice, - MaxOutputTokens: s.MaxOutputTokens, + ID: s.ID, + Object: "realtime.session", + Model: s.Model, + Instructions: s.Instructions, + Tools: s.Tools, + ToolChoice: s.ToolChoice, + MaxOutputTokens: s.MaxOutputTokens, + OutputModalities: s.OutputModalities, Audio: &types.RealtimeSessionAudio{ Input: &types.SessionAudioInput{ TurnDetection: s.TurnDetection, @@ -1688,7 +1689,11 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa var audioString string _, isWebRTC := t.(*WebRTCTransport) - modalities := resolveOutputModalities(session.OutputModalities, nil) + var respMods []types.Modality + if overrides != nil { + respMods = overrides.OutputModalities + } + modalities := resolveOutputModalities(session.OutputModalities, respMods) if modalitiesContainAudio(modalities) { // Check for cancellation before TTS if ctx.Err() != nil { From 49027ee34431d15715b8af10d2e471f33dbae504 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 15 May 2026 10:31:45 +0000 Subject: [PATCH 3/3] realtime: silence errcheck on deferred os.Remove of TTS file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI's errcheck flagged the pre-existing `defer os.Remove(audioFilePath)` inside the audio-emission block (now wrapped by the modality gate). Wrap the call in a closure that explicitly discards the error — the canonical Go pattern for "I want to defer a cleanup whose error I genuinely don't care about." Assisted-by: Claude:claude-opus-4-7 golangci-lint --- core/http/endpoints/openai/realtime.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 4f4456b316bb..9a416719d96b 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -1718,7 +1718,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID) return } - defer os.Remove(audioFilePath) + defer func() { _ = os.Remove(audioFilePath) }() audioBytes, err := os.ReadFile(audioFilePath) if err != nil {