Commit aed77fc

fix(stage-ui): text to speech wasn't working
1 parent: 29c0fed

5 files changed: +177 -76

packages/stage-ui/src/components/scenarios/providers/SpeechStreamingPlayground.vue

Lines changed: 8 additions & 3 deletions
```diff
@@ -2,9 +2,10 @@
 import type { TTSInputChunk } from '../../../utils/tts'

 import { animate } from 'animejs'
+import { storeToRefs } from 'pinia'
 import { ref } from 'vue'

-import { useMessageContentQueue } from '../../../composables/queues'
+import { usePipelineWorkflowTextSegmentationStore } from '../../../composables/queues'
 import { useAudioContext } from '../../../stores/audio'
 import { createQueue } from '../../../utils/queue'
 import { chunkTTSInput } from '../../../utils/tts'
@@ -16,6 +17,8 @@ const props = defineProps<{
   voice: string
 }>()

+const { onTextSegmented } = usePipelineWorkflowTextSegmentationStore()
+const { textSegmentationQueue } = storeToRefs(usePipelineWorkflowTextSegmentationStore())
 const { audioContext } = useAudioContext()
 const nowSpeaking = ref(false)
 const ttsInputChunks = ref<TTSInputChunk[]>([])
@@ -63,10 +66,12 @@ async function handleSpeechGeneration(ctx: { data: string }) {

 const ttsQueue = createQueue<string>({ handlers: [handleSpeechGeneration] })

-const messageContentQueue = useMessageContentQueue(ttsQueue)
+onTextSegmented((chunk) => {
+  ttsQueue.enqueue(chunk)
+})

 async function testStreaming() {
-  messageContentQueue.enqueue(props.text)
+  textSegmentationQueue.value.enqueue(props.text)
 }

 async function testChunking() {
```
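Before this commit, the playground built a private `useMessageContentQueue(ttsQueue)`; segmentation now lives in a shared Pinia store that fans chunks out through hooks. A minimal sketch of the new flow, using the names from this diff (`synthesize` is a hypothetical stand-in for the playground's `handleSpeechGeneration`):

```ts
import { storeToRefs } from 'pinia'

import { usePipelineWorkflowTextSegmentationStore } from '../../../composables/queues'
import { createQueue } from '../../../utils/queue'

// Hypothetical stand-in for the playground's handleSpeechGeneration.
async function synthesize(text: string) {
  console.log('would synthesize:', text)
}

const segmentationStore = usePipelineWorkflowTextSegmentationStore()
const { onTextSegmented } = segmentationStore
const { textSegmentationQueue } = storeToRefs(segmentationStore)

// Downstream: every segmented chunk is forwarded to the TTS queue.
const ttsQueue = createQueue<string>({ handlers: [async ctx => synthesize(ctx.data)] })
onTextSegmented(chunk => ttsQueue.enqueue(chunk))

// Upstream: raw (possibly streaming) text goes into the segmentation queue.
textSegmentationQueue.value.enqueue('Hello! This will be chunked, then spoken.')
```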

packages/stage-ui/src/components/scenes/Stage.vue

Lines changed: 20 additions & 55 deletions
```diff
@@ -17,7 +17,7 @@ import { onMounted, onUnmounted, ref } from 'vue'

 import Live2DScene from './Live2D.vue'

-import { useDelayMessageQueue, useEmotionsMessageQueue, useMessageContentQueue } from '../../composables/queues'
+import { useDelayMessageQueue, useEmotionsMessageQueue, usePipelineCharacterSpeechPlaybackQueueStore, usePipelineWorkflowTextSegmentationStore } from '../../composables/queues'
 import { llmInferenceEndToken } from '../../constants'
 import { EMOTION_EmotionMotionName_value, EMOTION_VRMExpressionName_value, EmotionThinkMotionName } from '../../constants/emotions'
 import { useAudioContext, useSpeakingStore } from '../../stores/audio'
@@ -44,13 +44,22 @@ const db = ref<DuckDBWasmDrizzleDatabase>()
 const vrmViewerRef = ref<InstanceType<typeof ThreeScene>>()
 const live2dSceneRef = ref<InstanceType<typeof Live2DScene>>()

+const textSegmentationStore = usePipelineWorkflowTextSegmentationStore()
+const { onTextSegmented } = textSegmentationStore
+const { textSegmentationQueue } = storeToRefs(textSegmentationStore)
+
+const characterSpeechPlaybackQueue = usePipelineCharacterSpeechPlaybackQueueStore()
+const { connectAudioContext, connectAudioAnalyser, clearAll } = characterSpeechPlaybackQueue
+const { playbackQueue } = storeToRefs(characterSpeechPlaybackQueue)
+
 const settingsStore = useSettings()
 const { stageModelRenderer, stageViewControlsEnabled, live2dDisableFocus, stageModelSelectedUrl } = storeToRefs(settingsStore)
 const { mouthOpenSize } = storeToRefs(useSpeakingStore())
 const { audioContext, calculateVolume } = useAudioContext()
+connectAudioContext(audioContext)
+
 const { onBeforeMessageComposed, onBeforeSend, onTokenLiteral, onTokenSpecial, onStreamEnd, onAssistantResponseEnd } = useChatStore()
 const providersStore = useProvidersStore()
-
 const live2dStore = useLive2d()

 const showStage = ref(true)
@@ -79,45 +88,6 @@ live2dStore.onShouldUpdateView(async () => {
 const audioAnalyser = ref<AnalyserNode>()
 const nowSpeaking = ref(false)
 const lipSyncStarted = ref(false)
-let currentAudioSource: AudioBufferSourceNode | null = null
-
-const audioQueue = createQueue<{ audioBuffer: AudioBuffer, text: string }>({
-  handlers: [
-    (ctx) => {
-      return new Promise((resolve) => {
-        // Stop any currently playing audio
-        if (currentAudioSource) {
-          try {
-            currentAudioSource.stop()
-            currentAudioSource.disconnect()
-          }
-          catch {}
-          currentAudioSource = null
-        }
-        // Create an AudioBufferSourceNode
-        const source = audioContext.createBufferSource()
-        source.buffer = ctx.data.audioBuffer
-
-        // Connect the source to the AudioContext's destination (the speakers)
-        source.connect(audioContext.destination)
-        // Connect the source to the analyzer
-        source.connect(audioAnalyser.value!)
-
-        // Start playing the audio
-        nowSpeaking.value = true
-        currentAudioSource = source
-        source.start(0)
-        source.onended = () => {
-          nowSpeaking.value = false
-          if (currentAudioSource === source) {
-            currentAudioSource = null
-          }
-          resolve()
-        }
-      })
-    },
-  ],
-})

 const speechStore = useSpeechStore()
 const { ssmlEnabled, activeSpeechProvider, activeSpeechModel, activeSpeechVoice, pitch } = storeToRefs(speechStore)
@@ -155,7 +125,7 @@ async function handleSpeechGeneration(ctx: { data: string }) {

     // Decode the ArrayBuffer into an AudioBuffer
     const audioBuffer = await audioContext.decodeAudioData(res)
-    audioQueue.enqueue({ audioBuffer, text: ctx.data })
+    playbackQueue.value.enqueue({ audioBuffer, text: ctx.data })
   }
   catch (error) {
     console.error('Speech generation failed:', error)
@@ -168,7 +138,9 @@ const ttsQueue = createQueue<string>({
   ],
 })

-const messageContentQueue = useMessageContentQueue(ttsQueue)
+onTextSegmented((chunk) => {
+  ttsQueue.enqueue(chunk)
+})

 const { currentMotion } = storeToRefs(useLive2d())

@@ -218,21 +190,14 @@ function setupLipSync() {
 }

 function setupAnalyser() {
-  if (!audioAnalyser.value)
+  if (!audioAnalyser.value) {
     audioAnalyser.value = audioContext.createAnalyser()
+    connectAudioAnalyser(audioAnalyser.value)
+  }
 }

 onBeforeMessageComposed(async () => {
-  // Stop any currently playing audio and clear the audio queue
-  if (currentAudioSource) {
-    try {
-      currentAudioSource.stop()
-      currentAudioSource.disconnect()
-    }
-    catch {}
-    currentAudioSource = null
-  }
-  audioQueue.clear()
+  clearAll()
   setupAnalyser()
   setupLipSync()
 })
@@ -242,7 +207,7 @@ onBeforeSend(async () => {
 })

 onTokenLiteral(async (literal) => {
-  messageContentQueue.enqueue(literal)
+  textSegmentationQueue.value.enqueue(literal)
 })

 onTokenSpecial(async (special) => {
```
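Note that the removed inline handler was also what flipped `nowSpeaking`; the hunks above don't show its replacement, but the new store exposes `onPlaybackStarted`/`onPlaybackFinished` hooks for exactly this. A sketch of how a component could drive its speaking state from them (an assumption, not wiring shown in this diff):

```ts
import { ref } from 'vue'

import { usePipelineCharacterSpeechPlaybackQueueStore } from '../../composables/queues'

const playback = usePipelineCharacterSpeechPlaybackQueueStore()
const nowSpeaking = ref(false)

// The store fires these hooks around source.start() and source.onended.
playback.onPlaybackStarted(() => { nowSpeaking.value = true })
playback.onPlaybackFinished(() => { nowSpeaking.value = false })
```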

packages/stage-ui/src/composables/queues.ts

Lines changed: 146 additions & 13 deletions
```diff
@@ -2,11 +2,14 @@ import type { Emotion } from '../constants/emotions'
 import type { UseQueueReturn } from '../utils/queue'

 import { sleep } from '@moeru/std'
+import { invoke } from '@vueuse/core'
+import { defineStore } from 'pinia'
+import { ref } from 'vue'

 import { EMOTION_VALUES } from '../constants/emotions'
 import { createQueue } from '../utils/queue'
 import { createControllableStream } from '../utils/stream'
-import { chunkToTTSQueue } from '../utils/tts'
+import { chunkEmitter } from '../utils/tts'

 export function useEmotionsMessageQueue(emotionsQueue: UseQueueReturn<Emotion>) {
   function splitEmotion(content: string) {
@@ -100,17 +103,147 @@ export function useDelayMessageQueue() {
   })
 }

-export function useMessageContentQueue(ttsQueue: UseQueueReturn<string>) {
-  const encoder = new TextEncoder()
-  const { stream, controller } = createControllableStream<Uint8Array>()
+export const usePipelineCharacterSpeechPlaybackQueueStore = defineStore('pipelines:character:speech', () => {
+  // Hooks
+  const onPlaybackStartedHooks = ref<Array<() => Promise<void> | void>>([])
+  const onPlaybackFinishedHooks = ref<Array<() => Promise<void> | void>>([])

-  chunkToTTSQueue(stream.getReader(), ttsQueue)
+  // Hooks registers
+  function onPlaybackStarted(hook: () => Promise<void> | void) {
+    onPlaybackStartedHooks.value.push(hook)
+  }
+  function onPlaybackFinished(hook: () => Promise<void> | void) {
+    onPlaybackFinishedHooks.value.push(hook)
+  }

-  return createQueue<string>({
-    handlers: [
-      async (ctx) => {
-        controller.enqueue(encoder.encode(ctx.data))
-      },
-    ],
-  })
-}
+  let currentAudioSource: AudioBufferSourceNode | null = null
+
+  const audioContext = ref<AudioContext>()
+  const audioAnalyser = ref<AnalyserNode>()
+
+  function connectAudioContext(context: AudioContext) {
+    audioContext.value = context
+  }
+
+  function connectAudioAnalyser(analyser: AnalyserNode) {
+    audioAnalyser.value = analyser
+  }
+
+  function clearPlaying() {
+    if (currentAudioSource) {
+      try {
+        currentAudioSource.stop()
+        currentAudioSource.disconnect()
+      }
+      catch {}
+      currentAudioSource = null
+    }
+  }
+
+  const playbackQueue = ref(invoke(() => {
+    return createQueue<{ audioBuffer: AudioBuffer, text: string }>({
+      handlers: [
+        (ctx) => {
+          return new Promise((resolve) => {
+            clearPlaying()
+
+            if (!audioContext.value) {
+              resolve()
+              return
+            }
+
+            // Create an AudioBufferSourceNode
+            const source = audioContext.value.createBufferSource()
+            source.buffer = ctx.data.audioBuffer
+
+            // Connect the source to the AudioContext's destination (the speakers)
+            source.connect(audioContext.value.destination)
+            // Connect the source to the analyzer
+            source.connect(audioAnalyser.value!)
+
+            // Start playing the audio
+            for (const hook of onPlaybackStartedHooks.value) {
+              hook()
+            }
+
+            currentAudioSource = source
+            source.start(0)
+            source.onended = () => {
+              for (const hook of onPlaybackFinishedHooks.value) {
+                hook()
+              }
+              if (currentAudioSource === source) {
+                currentAudioSource = null
+              }
+
+              resolve()
+            }
+          })
+        },
+      ],
+    })
+  }))
+
+  function clearQueue() {
+    playbackQueue.value.clear()
+  }
+
+  function clearAll() {
+    clearPlaying()
+    clearQueue()
+  }
+
+  return {
+    onPlaybackStarted,
+    onPlaybackFinished,
+
+    connectAudioContext,
+    connectAudioAnalyser,
+    clearPlaying,
+    clearQueue,
+    clearAll,
+
+    playbackQueue,
+  }
+})
+
+export const usePipelineWorkflowTextSegmentationStore = defineStore('pipelines:workflows:text-segmentation', () => {
+  // Hooks
+  const onTextSegmentedHooks = ref<Array<(segment: string) => Promise<void> | void>>([])
+
+  // Hooks registers
+  function onTextSegmented(hook: (segment: string) => Promise<void> | void) {
+    onTextSegmentedHooks.value.push(hook)
+  }
+
+  const textSegmentationQueue = ref(invoke(() => {
+    const textSegmentationStream = ref()
+    const textSegmentationStreamController = ref<ReadableStreamDefaultController<Uint8Array>>()
+
+    const encoder = new TextEncoder()
+
+    const { stream, controller } = createControllableStream<Uint8Array>()
+    textSegmentationStream.value = stream
+    textSegmentationStreamController.value = controller
+
+    chunkEmitter(stream.getReader(), async (chunk) => {
+      for (const hook of onTextSegmentedHooks.value) {
+        await hook(chunk)
+      }
+    })
+
+    return createQueue<string>({
+      handlers: [
+        async (ctx) => {
+          controller.enqueue(encoder.encode(ctx.data))
+        },
+      ],
+    })
+  }))
+
+  return {
+    onTextSegmented,
+
+    textSegmentationQueue,
+  }
+})
```
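Taken together, the two stores form the pipeline: text in, segmentation, synthesis, playback. An end-to-end wiring sketch under this commit's design (`fetchSpeech` is hypothetical, standing in for whatever provider call returns audio as an `ArrayBuffer`; the analyser must be connected because the playback handler calls `source.connect(audioAnalyser.value!)`):

```ts
import { storeToRefs } from 'pinia'

import {
  usePipelineCharacterSpeechPlaybackQueueStore,
  usePipelineWorkflowTextSegmentationStore,
} from './queues'

// Hypothetical provider call that synthesizes speech for one text segment.
declare function fetchSpeech(text: string): Promise<ArrayBuffer>

const segmentation = usePipelineWorkflowTextSegmentationStore()
const playback = usePipelineCharacterSpeechPlaybackQueueStore()
const { textSegmentationQueue } = storeToRefs(segmentation)
const { playbackQueue } = storeToRefs(playback)

const audioContext = new AudioContext()
playback.connectAudioContext(audioContext)
playback.connectAudioAnalyser(audioContext.createAnalyser())

// Segment -> synthesized audio -> playback queue.
segmentation.onTextSegmented(async (segment) => {
  const audioBuffer = await audioContext.decodeAudioData(await fetchSpeech(segment))
  playbackQueue.value.enqueue({ audioBuffer, text: segment })
})

// LLM tokens stream in here; chunkTTSInput decides the segment boundaries.
textSegmentationQueue.value.enqueue('Hello, ')
textSegmentationQueue.value.enqueue('world!')
```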

packages/stage-ui/src/stores/audio.ts

Lines changed: 1 addition & 1 deletion
```diff
@@ -71,7 +71,7 @@ export const useAudioContext = defineStore('audio-context', () => {
   onUnmounted(async () => {
     // Close audio context
     if (audioContext) {
-      await audioContext.value.close()
+      await audioContext.value.suspend()
     }
   })
```

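This one-line change is likely the heart of the fix: `useAudioContext` is a shared Pinia store, so the first component to unmount would `close()` the context for everyone, and a closed `AudioContext` can never produce sound again. `suspend()` is reversible. A quick illustration of the difference (standard Web Audio API behavior, not code from this repo):

```ts
const ctx = new AudioContext()

await ctx.suspend() // pauses the audio clock; ctx.state === 'suspended'
await ctx.resume()  // allowed: the same context keeps working

await ctx.close()   // releases the audio device; ctx.state === 'closed'
// ctx.resume() would now reject: a closed AudioContext cannot be restarted
```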
packages/stage-ui/src/utils/tts.ts

Lines changed: 2 additions & 4 deletions
```diff
@@ -1,7 +1,5 @@
 import type { ReaderLike } from 'clustr'

-import type { UseQueueReturn } from './queue'
-
 import { readGraphemeClusters } from 'clustr'

 // A special character to instruct the TTS pipeline to flush
@@ -201,13 +199,13 @@ export async function* chunkTTSInput(input: string | ReaderLike, options?: TTSIn
   }
 }

-export async function chunkToTTSQueue(reader: ReaderLike, queue: UseQueueReturn<string>) {
+export async function chunkEmitter(reader: ReaderLike, handler: (chunk: string) => Promise<void> | void) {
   try {
     for await (const chunk of chunkTTSInput(reader)) {
       // TODO: remove later
       // eslint-disable-next-line no-console
       console.debug('chunk to be pushed: ', chunk)
-      queue.enqueue(chunk.text)
+      await handler(chunk.text)
     }
   }
   catch (e) {
```