Commit aed77fc

fix(stage-ui): text to speech wasn't working
1 parent: 29c0fed

5 files changed: +177 -76

packages/stage-ui/src/components/scenarios/providers/SpeechStreamingPlayground.vue

Lines changed: 8 additions & 3 deletions
```diff
@@ -2,9 +2,10 @@
 import type { TTSInputChunk } from '../../../utils/tts'

 import { animate } from 'animejs'
+import { storeToRefs } from 'pinia'
 import { ref } from 'vue'

-import { useMessageContentQueue } from '../../../composables/queues'
+import { usePipelineWorkflowTextSegmentationStore } from '../../../composables/queues'
 import { useAudioContext } from '../../../stores/audio'
 import { createQueue } from '../../../utils/queue'
 import { chunkTTSInput } from '../../../utils/tts'
@@ -16,6 +17,8 @@ const props = defineProps<{
   voice: string
 }>()

+const { onTextSegmented } = usePipelineWorkflowTextSegmentationStore()
+const { textSegmentationQueue } = storeToRefs(usePipelineWorkflowTextSegmentationStore())
 const { audioContext } = useAudioContext()
 const nowSpeaking = ref(false)
 const ttsInputChunks = ref<TTSInputChunk[]>([])
@@ -63,10 +66,12 @@ async function handleSpeechGeneration(ctx: { data: string }) {

 const ttsQueue = createQueue<string>({ handlers: [handleSpeechGeneration] })

-const messageContentQueue = useMessageContentQueue(ttsQueue)
+onTextSegmented((chunk) => {
+  ttsQueue.enqueue(chunk)
+})

 async function testStreaming() {
-  messageContentQueue.enqueue(props.text)
+  textSegmentationQueue.value.enqueue(props.text)
 }

 async function testChunking() {
```
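Before this commit, the playground built a private `useMessageContentQueue(ttsQueue)`; segmentation now lives in a shared Pinia store that fans chunks out through hooks. A minimal sketch of the new flow, using the names from this diff (`synthesize` is a hypothetical stand-in for the playground's `handleSpeechGeneration`):

```ts
import { storeToRefs } from 'pinia'

import { usePipelineWorkflowTextSegmentationStore } from '../../../composables/queues'
import { createQueue } from '../../../utils/queue'

// Hypothetical stand-in for the playground's handleSpeechGeneration.
async function synthesize(text: string) {
  console.log('would synthesize:', text)
}

const segmentationStore = usePipelineWorkflowTextSegmentationStore()
const { onTextSegmented } = segmentationStore
const { textSegmentationQueue } = storeToRefs(segmentationStore)

// Downstream: every segmented chunk is forwarded to the TTS queue.
const ttsQueue = createQueue<string>({ handlers: [async ctx => synthesize(ctx.data)] })
onTextSegmented(chunk => ttsQueue.enqueue(chunk))

// Upstream: raw (possibly streaming) text goes into the segmentation queue.
textSegmentationQueue.value.enqueue('Hello! This will be chunked, then spoken.')
```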

packages/stage-ui/src/components/scenes/Stage.vue

Lines changed: 20 additions & 55 deletions
```diff
@@ -17,7 +17,7 @@ import { onMounted, onUnmounted, ref } from 'vue'

 import Live2DScene from './Live2D.vue'

-import { useDelayMessageQueue, useEmotionsMessageQueue, useMessageContentQueue } from '../../composables/queues'
+import { useDelayMessageQueue, useEmotionsMessageQueue, usePipelineCharacterSpeechPlaybackQueueStore, usePipelineWorkflowTextSegmentationStore } from '../../composables/queues'
 import { llmInferenceEndToken } from '../../constants'
 import { EMOTION_EmotionMotionName_value, EMOTION_VRMExpressionName_value, EmotionThinkMotionName } from '../../constants/emotions'
 import { useAudioContext, useSpeakingStore } from '../../stores/audio'
@@ -44,13 +44,22 @@ const db = ref<DuckDBWasmDrizzleDatabase>()
 const vrmViewerRef = ref<InstanceType<typeof ThreeScene>>()
 const live2dSceneRef = ref<InstanceType<typeof Live2DScene>>()

+const textSegmentationStore = usePipelineWorkflowTextSegmentationStore()
+const { onTextSegmented } = textSegmentationStore
+const { textSegmentationQueue } = storeToRefs(textSegmentationStore)
+
+const characterSpeechPlaybackQueue = usePipelineCharacterSpeechPlaybackQueueStore()
+const { connectAudioContext, connectAudioAnalyser, clearAll } = characterSpeechPlaybackQueue
+const { playbackQueue } = storeToRefs(characterSpeechPlaybackQueue)
+
 const settingsStore = useSettings()
 const { stageModelRenderer, stageViewControlsEnabled, live2dDisableFocus, stageModelSelectedUrl } = storeToRefs(settingsStore)
 const { mouthOpenSize } = storeToRefs(useSpeakingStore())
 const { audioContext, calculateVolume } = useAudioContext()
+connectAudioContext(audioContext)
+
 const { onBeforeMessageComposed, onBeforeSend, onTokenLiteral, onTokenSpecial, onStreamEnd, onAssistantResponseEnd } = useChatStore()
 const providersStore = useProvidersStore()
-
 const live2dStore = useLive2d()

 const showStage = ref(true)
@@ -79,45 +88,6 @@ live2dStore.onShouldUpdateView(async () => {
 const audioAnalyser = ref<AnalyserNode>()
 const nowSpeaking = ref(false)
 const lipSyncStarted = ref(false)
-let currentAudioSource: AudioBufferSourceNode | null = null
-
-const audioQueue = createQueue<{ audioBuffer: AudioBuffer, text: string }>({
-  handlers: [
-    (ctx) => {
-      return new Promise((resolve) => {
-        // Stop any currently playing audio
-        if (currentAudioSource) {
-          try {
-            currentAudioSource.stop()
-            currentAudioSource.disconnect()
-          }
-          catch {}
-          currentAudioSource = null
-        }
-        // Create an AudioBufferSourceNode
-        const source = audioContext.createBufferSource()
-        source.buffer = ctx.data.audioBuffer
-
-        // Connect the source to the AudioContext's destination (the speakers)
-        source.connect(audioContext.destination)
-        // Connect the source to the analyzer
-        source.connect(audioAnalyser.value!)
-
-        // Start playing the audio
-        nowSpeaking.value = true
-        currentAudioSource = source
-        source.start(0)
-        source.onended = () => {
-          nowSpeaking.value = false
-          if (currentAudioSource === source) {
-            currentAudioSource = null
-          }
-          resolve()
-        }
-      })
-    },
-  ],
-})

 const speechStore = useSpeechStore()
 const { ssmlEnabled, activeSpeechProvider, activeSpeechModel, activeSpeechVoice, pitch } = storeToRefs(speechStore)
@@ -155,7 +125,7 @@ async function handleSpeechGeneration(ctx: { data: string }) {

     // Decode the ArrayBuffer into an AudioBuffer
     const audioBuffer = await audioContext.decodeAudioData(res)
-    audioQueue.enqueue({ audioBuffer, text: ctx.data })
+    playbackQueue.value.enqueue({ audioBuffer, text: ctx.data })
   }
   catch (error) {
     console.error('Speech generation failed:', error)
@@ -168,7 +138,9 @@ const ttsQueue = createQueue<string>({
   ],
 })

-const messageContentQueue = useMessageContentQueue(ttsQueue)
+onTextSegmented((chunk) => {
+  ttsQueue.enqueue(chunk)
+})

 const { currentMotion } = storeToRefs(useLive2d())

@@ -218,21 +190,14 @@ function setupLipSync() {
 }

 function setupAnalyser() {
-  if (!audioAnalyser.value)
+  if (!audioAnalyser.value) {
     audioAnalyser.value = audioContext.createAnalyser()
+    connectAudioAnalyser(audioAnalyser.value)
+  }
 }

 onBeforeMessageComposed(async () => {
-  // Stop any currently playing audio and clear the audio queue
-  if (currentAudioSource) {
-    try {
-      currentAudioSource.stop()
-      currentAudioSource.disconnect()
-    }
-    catch {}
-    currentAudioSource = null
-  }
-  audioQueue.clear()
+  clearAll()
   setupAnalyser()
   setupLipSync()
 })
@@ -242,7 +207,7 @@ onBeforeSend(async () => {
 })

 onTokenLiteral(async (literal) => {
-  messageContentQueue.enqueue(literal)
+  textSegmentationQueue.value.enqueue(literal)
 })

 onTokenSpecial(async (special) => {
```
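Note that the removed inline handler was also what flipped `nowSpeaking`; the hunks above don't show its replacement, but the new store exposes `onPlaybackStarted`/`onPlaybackFinished` hooks for exactly this. A sketch of how a component could drive its speaking state from them (an assumption, not wiring shown in this diff):

```ts
import { ref } from 'vue'

import { usePipelineCharacterSpeechPlaybackQueueStore } from '../../composables/queues'

const playback = usePipelineCharacterSpeechPlaybackQueueStore()
const nowSpeaking = ref(false)

// The store fires these hooks around source.start() and source.onended.
playback.onPlaybackStarted(() => { nowSpeaking.value = true })
playback.onPlaybackFinished(() => { nowSpeaking.value = false })
```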

packages/stage-ui/src/composables/queues.ts

Lines changed: 146 additions & 13 deletions
```diff
@@ -2,11 +2,14 @@ import type { Emotion } from '../constants/emotions'
 import type { UseQueueReturn } from '../utils/queue'

 import { sleep } from '@moeru/std'
+import { invoke } from '@vueuse/core'
+import { defineStore } from 'pinia'
+import { ref } from 'vue'

 import { EMOTION_VALUES } from '../constants/emotions'
 import { createQueue } from '../utils/queue'
 import { createControllableStream } from '../utils/stream'
-import { chunkToTTSQueue } from '../utils/tts'
+import { chunkEmitter } from '../utils/tts'

 export function useEmotionsMessageQueue(emotionsQueue: UseQueueReturn<Emotion>) {
   function splitEmotion(content: string) {
@@ -100,17 +103,147 @@ export function useDelayMessageQueue() {
   })
 }

-export function useMessageContentQueue(ttsQueue: UseQueueReturn<string>) {
-  const encoder = new TextEncoder()
-  const { stream, controller } = createControllableStream<Uint8Array>()
+export const usePipelineCharacterSpeechPlaybackQueueStore = defineStore('pipelines:character:speech', () => {
+  // Hooks
+  const onPlaybackStartedHooks = ref<Array<() => Promise<void> | void>>([])
+  const onPlaybackFinishedHooks = ref<Array<() => Promise<void> | void>>([])

-  chunkToTTSQueue(stream.getReader(), ttsQueue)
+  // Hooks registers
+  function onPlaybackStarted(hook: () => Promise<void> | void) {
+    onPlaybackStartedHooks.value.push(hook)
+  }
+  function onPlaybackFinished(hook: () => Promise<void> | void) {
+    onPlaybackFinishedHooks.value.push(hook)
+  }

-  return createQueue<string>({
-    handlers: [
-      async (ctx) => {
-        controller.enqueue(encoder.encode(ctx.data))
-      },
-    ],
-  })
-}
+  let currentAudioSource: AudioBufferSourceNode | null = null
+
+  const audioContext = ref<AudioContext>()
+  const audioAnalyser = ref<AnalyserNode>()
+
+  function connectAudioContext(context: AudioContext) {
+    audioContext.value = context
+  }
+
+  function connectAudioAnalyser(analyser: AnalyserNode) {
+    audioAnalyser.value = analyser
+  }
+
+  function clearPlaying() {
+    if (currentAudioSource) {
+      try {
+        currentAudioSource.stop()
+        currentAudioSource.disconnect()
+      }
+      catch {}
+      currentAudioSource = null
+    }
+  }
+
+  const playbackQueue = ref(invoke(() => {
+    return createQueue<{ audioBuffer: AudioBuffer, text: string }>({
+      handlers: [
+        (ctx) => {
+          return new Promise((resolve) => {
+            clearPlaying()
+
+            if (!audioContext.value) {
+              resolve()
+              return
+            }
+
+            // Create an AudioBufferSourceNode
+            const source = audioContext.value.createBufferSource()
+            source.buffer = ctx.data.audioBuffer
+
+            // Connect the source to the AudioContext's destination (the speakers)
+            source.connect(audioContext.value.destination)
+            // Connect the source to the analyzer
+            source.connect(audioAnalyser.value!)
+
+            // Start playing the audio
+            for (const hook of onPlaybackStartedHooks.value) {
+              hook()
+            }
+
+            currentAudioSource = source
+            source.start(0)
+            source.onended = () => {
+              for (const hook of onPlaybackFinishedHooks.value) {
+                hook()
+              }
+              if (currentAudioSource === source) {
+                currentAudioSource = null
+              }
+
+              resolve()
+            }
+          })
+        },
+      ],
+    })
+  }))
+
+  function clearQueue() {
+    playbackQueue.value.clear()
+  }
+
+  function clearAll() {
+    clearPlaying()
+    clearQueue()
+  }
+
+  return {
+    onPlaybackStarted,
+    onPlaybackFinished,
+
+    connectAudioContext,
+    connectAudioAnalyser,
+    clearPlaying,
+    clearQueue,
+    clearAll,
+
+    playbackQueue,
+  }
+})
+
+export const usePipelineWorkflowTextSegmentationStore = defineStore('pipelines:workflows:text-segmentation', () => {
+  // Hooks
+  const onTextSegmentedHooks = ref<Array<(segment: string) => Promise<void> | void>>([])
+
+  // Hooks registers
+  function onTextSegmented(hook: (segment: string) => Promise<void> | void) {
+    onTextSegmentedHooks.value.push(hook)
+  }
+
+  const textSegmentationQueue = ref(invoke(() => {
+    const textSegmentationStream = ref()
+    const textSegmentationStreamController = ref<ReadableStreamDefaultController<Uint8Array>>()
+
+    const encoder = new TextEncoder()
+
+    const { stream, controller } = createControllableStream<Uint8Array>()
+    textSegmentationStream.value = stream
+    textSegmentationStreamController.value = controller
+
+    chunkEmitter(stream.getReader(), async (chunk) => {
+      for (const hook of onTextSegmentedHooks.value) {
+        await hook(chunk)
+      }
+    })
+
+    return createQueue<string>({
+      handlers: [
+        async (ctx) => {
+          controller.enqueue(encoder.encode(ctx.data))
+        },
+      ],
+    })
+  }))
+
+  return {
+    onTextSegmented,
+
+    textSegmentationQueue,
+  }
+})
```
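Taken together, the two stores form the pipeline: text in, segmentation, synthesis, playback. An end-to-end wiring sketch under this commit's design (`fetchSpeech` is hypothetical, standing in for whatever provider call returns audio as an `ArrayBuffer`; the analyser must be connected because the playback handler calls `source.connect(audioAnalyser.value!)`):

```ts
import { storeToRefs } from 'pinia'

import {
  usePipelineCharacterSpeechPlaybackQueueStore,
  usePipelineWorkflowTextSegmentationStore,
} from './queues'

// Hypothetical provider call that synthesizes speech for one text segment.
declare function fetchSpeech(text: string): Promise<ArrayBuffer>

const segmentation = usePipelineWorkflowTextSegmentationStore()
const playback = usePipelineCharacterSpeechPlaybackQueueStore()
const { textSegmentationQueue } = storeToRefs(segmentation)
const { playbackQueue } = storeToRefs(playback)

const audioContext = new AudioContext()
playback.connectAudioContext(audioContext)
playback.connectAudioAnalyser(audioContext.createAnalyser())

// Segment -> synthesized audio -> playback queue.
segmentation.onTextSegmented(async (segment) => {
  const audioBuffer = await audioContext.decodeAudioData(await fetchSpeech(segment))
  playbackQueue.value.enqueue({ audioBuffer, text: segment })
})

// LLM tokens stream in here; chunkTTSInput decides the segment boundaries.
textSegmentationQueue.value.enqueue('Hello, ')
textSegmentationQueue.value.enqueue('world!')
```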

packages/stage-ui/src/stores/audio.ts

Lines changed: 1 addition & 1 deletion
```diff
@@ -71,7 +71,7 @@ export const useAudioContext = defineStore('audio-context', () => {
   onUnmounted(async () => {
     // Close audio context
     if (audioContext) {
-      await audioContext.value.close()
+      await audioContext.value.suspend()
     }
   })
```

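This one-line change is likely the heart of the fix: `useAudioContext` is a shared Pinia store, so the first component to unmount would `close()` the context for everyone, and a closed `AudioContext` can never produce sound again. `suspend()` is reversible. A quick illustration of the difference (standard Web Audio API behavior, not code from this repo):

```ts
const ctx = new AudioContext()

await ctx.suspend() // pauses the audio clock; ctx.state === 'suspended'
await ctx.resume()  // allowed: the same context keeps working

await ctx.close()   // releases the audio device; ctx.state === 'closed'
// ctx.resume() would now reject: a closed AudioContext cannot be restarted
```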
packages/stage-ui/src/utils/tts.ts

Lines changed: 2 additions & 4 deletions
```diff
@@ -1,7 +1,5 @@
 import type { ReaderLike } from 'clustr'

-import type { UseQueueReturn } from './queue'
-
 import { readGraphemeClusters } from 'clustr'

 // A special character to instruct the TTS pipeline to flush
@@ -201,13 +199,13 @@ export async function* chunkTTSInput(input: string | ReaderLike, options?: TTSIn
   }
 }

-export async function chunkToTTSQueue(reader: ReaderLike, queue: UseQueueReturn<string>) {
+export async function chunkEmitter(reader: ReaderLike, handler: (chunk: string) => Promise<void> | void) {
   try {
     for await (const chunk of chunkTTSInput(reader)) {
       // TODO: remove later
       // eslint-disable-next-line no-console
       console.debug('chunk to be pushed: ', chunk)
-      queue.enqueue(chunk.text)
+      await handler(chunk.text)
     }
   }
   catch (e) {
```