From b529293a3d0221f761f468cb4c663adb6d46daa3 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Fri, 15 May 2026 10:31:27 +0000
Subject: [PATCH 1/3] realtime: honor output_modalities to skip TTS in
 text-only mode

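The emulated realtime pipeline previously ignored the OpenAI Realtime spec
field output_modalities and always synthesized TTS. Add resolveOutputModalities
+ modalitiesContainAudio helpers and gate the TTS / ResponseOutputAudio*
emission so a client requesting ["text"] gets only ResponseOutputText* events.
Only the session-level value is consulted at the gate in this commit: the
helper accepts a response-level override, but the call site passes nil for it.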

This lets thin clients (e.g. thing5-poc) cache TTS on the client side while
still using the realtime WS for VAD + STT + LLM + tool-call parsing.

Assisted-by: Claude:claude-opus-4-7
---
 core/http/endpoints/openai/realtime.go        | 204 +++++++++++-------
 .../openai/realtime_modality_test.go          |  39 ++++
 2 files changed, 167 insertions(+), 76 deletions(-)
 create mode 100644 core/http/endpoints/openai/realtime_modality_test.go

diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index b7a0be6acb5d..b83d52e02c6b 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -54,6 +54,30 @@ const (
 		"Avoid parenthetical asides, URLs, and anything that cannot be clearly vocalized."
 )
 
+// resolveOutputModalities returns the effective output modalities for a
+// response: response-level overrides session-level, and the OpenAI Realtime
+// spec default is ["audio"] when neither is set.
+func resolveOutputModalities(session, response []types.Modality) []types.Modality {
+	if len(response) > 0 {
+		return response
+	}
+	if len(session) > 0 {
+		return session
+	}
+	return []types.Modality{types.ModalityAudio}
+}
+
+// modalitiesContainAudio reports whether the resolved modalities include audio
+// output.
+func modalitiesContainAudio(m []types.Modality) bool {
+	for _, x := range m {
+		if x == types.ModalityAudio {
+			return true
+		}
+	}
+	return false
+}
+
 // A model can be "emulated" that is: transcribe audio to text -> feed text to the LLM -> generate audio as result
 // If the model support instead audio-to-audio, we will use the specific gRPC calls instead
 
@@ -82,6 +106,10 @@ type Session struct {
 	InputSampleRate  int
 	OutputSampleRate int
 	MaxOutputTokens  types.IntOrInf
+	// OutputModalities mirrors the OpenAI Realtime spec field of the same
+	// name. Empty means "use the spec default" (audio). ["text"] suppresses
+	// TTS so the client receives only response.output_text.* events.
+	OutputModalities []types.Modality
 	// MaxHistoryItems caps the number of MessageItems passed to the LLM each
 	// turn (0 = unlimited). Small models — especially the LFM2.5-Audio 1.5B
 	// served via the liquid-audio backend — degrade quickly past a handful
@@ -1015,6 +1043,10 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 		session.MaxOutputTokens = rt.MaxOutputTokens
 	}
 
+	if len(rt.OutputModalities) > 0 {
+		session.OutputModalities = rt.OutputModalities
+	}
+
 	return nil
 }
 
@@ -1654,106 +1686,126 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			})
 		}
 
-		// Check for cancellation before TTS
-		if ctx.Err() != nil {
-			xlog.Debug("Response cancelled before TTS (barge-in)")
-			sendCancelledResponse()
-			return
-		}
-
-		audioFilePath, res, err := session.ModelInterface.TTS(ctx, finalSpeech, session.Voice, session.InputAudioTranscription.Language)
-		if err != nil {
+		var audioString string
+		_, isWebRTC := t.(*WebRTCTransport)
+		modalities := resolveOutputModalities(session.OutputModalities, nil)
+		if modalitiesContainAudio(modalities) {
+			// Check for cancellation before TTS
 			if ctx.Err() != nil {
-				xlog.Debug("TTS cancelled (barge-in)")
+				xlog.Debug("Response cancelled before TTS (barge-in)")
 				sendCancelledResponse()
 				return
 			}
-			xlog.Error("TTS failed", "error", err)
-			sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
-			return
-		}
-		if !res.Success {
-			xlog.Error("TTS failed", "message", res.Message)
-			sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID)
-			return
-		}
-		defer os.Remove(audioFilePath)
 
-		audioBytes, err := os.ReadFile(audioFilePath)
-		if err != nil {
-			xlog.Error("failed to read TTS file", "error", err)
-			sendError(t, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID)
-			return
-		}
-
-		// Parse WAV header to get raw PCM and the actual sample rate from the TTS backend.
-		pcmData, ttsSampleRate := laudio.ParseWAV(audioBytes)
-		if ttsSampleRate == 0 {
-			ttsSampleRate = localSampleRate
-		}
-		xlog.Debug("TTS audio parsed", "raw_bytes", len(audioBytes), "pcm_bytes", len(pcmData), "sample_rate", ttsSampleRate)
+			audioFilePath, res, err := session.ModelInterface.TTS(ctx, finalSpeech, session.Voice, session.InputAudioTranscription.Language)
+			if err != nil {
+				if ctx.Err() != nil {
+					xlog.Debug("TTS cancelled (barge-in)")
+					sendCancelledResponse()
+					return
+				}
+				xlog.Error("TTS failed", "error", err)
+				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %v", err), "", item.Assistant.ID)
+				return
+			}
+			if !res.Success {
+				xlog.Error("TTS failed", "message", res.Message)
+				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID)
+				return
+			}
+			defer os.Remove(audioFilePath)
 
-		// SendAudio (WebRTC) passes PCM at the TTS sample rate directly to the
-		// Opus encoder, which resamples to 48kHz internally. This avoids a
-		// lossy intermediate resample through 16kHz.
-		// XXX: This is a noop in websocket mode; it's included in the JSON instead
-		if err := t.SendAudio(ctx, pcmData, ttsSampleRate); err != nil {
-			if ctx.Err() != nil {
-				xlog.Debug("Audio playback cancelled (barge-in)")
-				sendCancelledResponse()
+			audioBytes, err := os.ReadFile(audioFilePath)
+			if err != nil {
+				xlog.Error("failed to read TTS file", "error", err)
+				sendError(t, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID)
 				return
 			}
-			xlog.Error("failed to send audio via transport", "error", err)
-		}
 
-		_, isWebRTC := t.(*WebRTCTransport)
+			// Parse WAV header to get raw PCM and the actual sample rate from the TTS backend.
+			pcmData, ttsSampleRate := laudio.ParseWAV(audioBytes)
+			if ttsSampleRate == 0 {
+				ttsSampleRate = localSampleRate
+			}
+			xlog.Debug("TTS audio parsed", "raw_bytes", len(audioBytes), "pcm_bytes", len(pcmData), "sample_rate", ttsSampleRate)
+
+			// SendAudio (WebRTC) passes PCM at the TTS sample rate directly to the
+			// Opus encoder, which resamples to 48kHz internally. This avoids a
+			// lossy intermediate resample through 16kHz.
+			// XXX: This is a noop in websocket mode; it's included in the JSON instead
+			if err := t.SendAudio(ctx, pcmData, ttsSampleRate); err != nil {
+				if ctx.Err() != nil {
+					xlog.Debug("Audio playback cancelled (barge-in)")
+					sendCancelledResponse()
+					return
+				}
+				xlog.Error("failed to send audio via transport", "error", err)
+			}
 
-		// For WebSocket clients, resample to the session's output rate and
-		// deliver audio as base64 in JSON events. WebRTC clients already
-		// received audio over the RTP track, so skip the base64 payload.
-		var audioString string
-		if !isWebRTC {
-			wsPCM := pcmData
-			if ttsSampleRate != session.OutputSampleRate {
-				samples := sound.BytesToInt16sLE(pcmData)
-				resampled := sound.ResampleInt16(samples, ttsSampleRate, session.OutputSampleRate)
-				wsPCM = sound.Int16toBytesLE(resampled)
+			// For WebSocket clients, resample to the session's output rate and
+			// deliver audio as base64 in JSON events. WebRTC clients already
+			// received audio over the RTP track, so skip the base64 payload.
+			if !isWebRTC {
+				wsPCM := pcmData
+				if ttsSampleRate != session.OutputSampleRate {
+					samples := sound.BytesToInt16sLE(pcmData)
+					resampled := sound.ResampleInt16(samples, ttsSampleRate, session.OutputSampleRate)
+					wsPCM = sound.Int16toBytesLE(resampled)
+				}
+				audioString = base64.StdEncoding.EncodeToString(wsPCM)
 			}
-			audioString = base64.StdEncoding.EncodeToString(wsPCM)
-		}
 
-		sendEvent(t, types.ResponseOutputAudioTranscriptDeltaEvent{
-			ServerEventBase: types.ServerEventBase{},
-			ResponseID:      responseID,
-			ItemID:          item.Assistant.ID,
-			OutputIndex:     0,
-			ContentIndex:    0,
-			Delta:           finalSpeech,
-		})
-		sendEvent(t, types.ResponseOutputAudioTranscriptDoneEvent{
-			ServerEventBase: types.ServerEventBase{},
-			ResponseID:      responseID,
-			ItemID:          item.Assistant.ID,
-			OutputIndex:     0,
-			ContentIndex:    0,
-			Transcript:      finalSpeech,
-		})
+			sendEvent(t, types.ResponseOutputAudioTranscriptDeltaEvent{
+				ServerEventBase: types.ServerEventBase{},
+				ResponseID:      responseID,
+				ItemID:          item.Assistant.ID,
+				OutputIndex:     0,
+				ContentIndex:    0,
+				Delta:           finalSpeech,
+			})
+			sendEvent(t, types.ResponseOutputAudioTranscriptDoneEvent{
+				ServerEventBase: types.ServerEventBase{},
+				ResponseID:      responseID,
+				ItemID:          item.Assistant.ID,
+				OutputIndex:     0,
+				ContentIndex:    0,
+				Transcript:      finalSpeech,
+			})
 
-		if !isWebRTC {
-			sendEvent(t, types.ResponseOutputAudioDeltaEvent{
+			if !isWebRTC {
+				sendEvent(t, types.ResponseOutputAudioDeltaEvent{
+					ServerEventBase: types.ServerEventBase{},
+					ResponseID:      responseID,
+					ItemID:          item.Assistant.ID,
+					OutputIndex:     0,
+					ContentIndex:    0,
+					Delta:           audioString,
+				})
+				sendEvent(t, types.ResponseOutputAudioDoneEvent{
+					ServerEventBase: types.ServerEventBase{},
+					ResponseID:      responseID,
+					ItemID:          item.Assistant.ID,
+					OutputIndex:     0,
+					ContentIndex:    0,
+				})
+			}
+		} else {
+			// Text-only mode: skip TTS, emit only the text events.
+			sendEvent(t, types.ResponseOutputTextDeltaEvent{
 				ServerEventBase: types.ServerEventBase{},
 				ResponseID:      responseID,
 				ItemID:          item.Assistant.ID,
 				OutputIndex:     0,
 				ContentIndex:    0,
-				Delta:           audioString,
+				Delta:           finalSpeech,
 			})
-			sendEvent(t, types.ResponseOutputAudioDoneEvent{
+			sendEvent(t, types.ResponseOutputTextDoneEvent{
 				ServerEventBase: types.ServerEventBase{},
 				ResponseID:      responseID,
 				ItemID:          item.Assistant.ID,
 				OutputIndex:     0,
 				ContentIndex:    0,
+				Text:            finalSpeech,
 			})
 		}
 
diff --git a/core/http/endpoints/openai/realtime_modality_test.go b/core/http/endpoints/openai/realtime_modality_test.go
new file mode 100644
index 000000000000..0a17d7a03b11
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_modality_test.go
@@ -0,0 +1,39 @@
+package openai
+
+import (
+	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("resolveOutputModalities", func() {
+	It("defaults to audio when neither session nor response specify", func() {
+		got := resolveOutputModalities(nil, nil)
+		Expect(got).To(ConsistOf(types.ModalityAudio))
+	})
+
+	It("uses session modalities when response omits them", func() {
+		sess := []types.Modality{types.ModalityText}
+		got := resolveOutputModalities(sess, nil)
+		Expect(got).To(ConsistOf(types.ModalityText))
+	})
+
+	It("response modalities override session", func() {
+		sess := []types.Modality{types.ModalityAudio}
+		resp := []types.Modality{types.ModalityText}
+		got := resolveOutputModalities(sess, resp)
+		Expect(got).To(ConsistOf(types.ModalityText))
+	})
+
+	It("returns false from modalitiesContainAudio for text-only", func() {
+		Expect(modalitiesContainAudio([]types.Modality{types.ModalityText})).To(BeFalse())
+	})
+
+	It("returns true from modalitiesContainAudio for audio (default)", func() {
+		Expect(modalitiesContainAudio([]types.Modality{types.ModalityAudio})).To(BeTrue())
+	})
+
+	It("returns true when both audio and text are present", func() {
+		Expect(modalitiesContainAudio([]types.Modality{types.ModalityText, types.ModalityAudio})).To(BeTrue())
+	})
+})

From 7ed6c4b44e46803030deb191c3736f83c261fcfb Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Fri, 15 May 2026 10:31:35 +0000
Subject: [PATCH 2/3] realtime: plumb response-level output_modalities and echo
 on session

Follow-up to the previous commit:
- Resolve response.create's output_modalities at the gate so a per-response
  override of an audio session is honored (the unit test asserted this
  contract on the resolveOutputModalities helper, but the production call
  site was passing nil for the response-level modalities).
- Mirror OutputModalities in the RealtimeSession echo so session.update
  round-trips the client-supplied value, matching MaxOutputTokens's pattern.

Assisted-by: Claude:claude-opus-4-7
---
 core/http/endpoints/openai/realtime.go | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index b83d52e02c6b..4f4456b316bb 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -190,13 +190,14 @@ func (s *Session) ToServer() types.SessionUnion {
 	} else {
 		return types.SessionUnion{
 			Realtime: &types.RealtimeSession{
-				ID:              s.ID,
-				Object:          "realtime.session",
-				Model:           s.Model,
-				Instructions:    s.Instructions,
-				Tools:           s.Tools,
-				ToolChoice:      s.ToolChoice,
-				MaxOutputTokens: s.MaxOutputTokens,
+				ID:               s.ID,
+				Object:           "realtime.session",
+				Model:            s.Model,
+				Instructions:     s.Instructions,
+				Tools:            s.Tools,
+				ToolChoice:       s.ToolChoice,
+				MaxOutputTokens:  s.MaxOutputTokens,
+				OutputModalities: s.OutputModalities,
 				Audio: &types.RealtimeSessionAudio{
 					Input: &types.SessionAudioInput{
 						TurnDetection: s.TurnDetection,
@@ -1688,7 +1689,11 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 
 		var audioString string
 		_, isWebRTC := t.(*WebRTCTransport)
-		modalities := resolveOutputModalities(session.OutputModalities, nil)
+		var respMods []types.Modality
+		if overrides != nil {
+			respMods = overrides.OutputModalities
+		}
+		modalities := resolveOutputModalities(session.OutputModalities, respMods)
 		if modalitiesContainAudio(modalities) {
 			// Check for cancellation before TTS
 			if ctx.Err() != nil {

From 49027ee34431d15715b8af10d2e471f33dbae504 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Fri, 15 May 2026 10:31:45 +0000
Subject: [PATCH 3/3] realtime: silence errcheck on deferred os.Remove of TTS
 file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI's errcheck flagged the pre-existing `defer os.Remove(audioFilePath)`
inside the audio-emission block (now wrapped by the modality gate). Wrap
the call in a closure that explicitly discards the error — the canonical
Go pattern for "I want to defer a cleanup whose error I genuinely don't
care about."

Assisted-by: Claude:claude-opus-4-7 golangci-lint
---
 core/http/endpoints/openai/realtime.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 4f4456b316bb..9a416719d96b 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1718,7 +1718,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 				sendError(t, "tts_error", fmt.Sprintf("TTS generation failed: %s", res.Message), "", item.Assistant.ID)
 				return
 			}
-			defer os.Remove(audioFilePath)
+			defer func() { _ = os.Remove(audioFilePath) }()
 
 			audioBytes, err := os.ReadFile(audioFilePath)
 			if err != nil {