
Commit c13ada5

Authored by Lilia-Chen, with autofix-ci[bot] and gemini-code-assist[bot]
feat(stage-ui): Synchronised emotion expression and delay animation with the AIRI TTS speaking (#741)
* feat(stage-ui): Synchronised emotion expression and delay animation with the tts speaking

* [autofix.ci] apply automated fixes

* Update packages/stage-ui/src/utils/tts.ts

  Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* feat(stage-ui): Synchronised emotion expression and delay animation with the tts speaking - code review

* feat(stage-ui): Synchronised emotion expression and delay animation with the tts speaking - code review

* feat(stage-ui): Synchronised emotion expression and delay animation with the tts speaking - code review

* feat(stage-ui): Synchronised emotion expression and delay animation with the tts speaking - code review

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent: 3240276 · commit: c13ada5

File tree

5 files changed: +420 −320 lines
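At a high level, this commit makes emotion and delay tokens travel through the text-segmentation pipeline alongside the spoken text, so they take effect when the corresponding audio finishes rather than as soon as they are parsed. A minimal sketch of the idea, using the TextSegmentationItem and TTSChunkItem shapes introduced in the diffs below; the segmentation logic here is deliberately simplified and is not the repository's implementation:

interface TextSegmentationItem {
  type: 'literal' | 'special'
  value: string
}

interface TTSChunkItem {
  chunk: string // text to synthesise
  special: string | null // token to act on once this chunk finishes playing
}

function segmentForTTS(items: TextSegmentationItem[]): TTSChunkItem[] {
  const out: TTSChunkItem[] = []
  let buffer = ''
  for (const item of items) {
    if (item.type === 'literal') {
      buffer += item.value
    }
    else {
      // A special token flushes the text before it, so the emotion/delay
      // is applied only after that text has been spoken.
      out.push({ chunk: buffer, special: item.value })
      buffer = ''
    }
  }
  if (buffer)
    out.push({ chunk: buffer, special: null })
  return out
}

// '<|EMOTE_HAPPY|>' is attached to 'Hello!' and fires after its audio ends:
console.log(segmentForTTS([
  { type: 'literal', value: 'Hello!' },
  { type: 'special', value: '<|EMOTE_HAPPY|>' },
  { type: 'literal', value: ' Nice to meet you.' },
]))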

packages/stage-ui/src/components/scenarios/providers/SpeechStreamingPlayground.vue

Lines changed: 2 additions & 2 deletions

@@ -67,11 +67,11 @@ async function handleSpeechGeneration(ctx: { data: string }) {
 const ttsQueue = createQueue<string>({ handlers: [handleSpeechGeneration] })
 
 onTextSegmented((chunk) => {
-  ttsQueue.enqueue(chunk)
+  ttsQueue.enqueue(chunk.chunk)
 })
 
 async function testStreaming() {
-  textSegmentationQueue.value.enqueue(props.text)
+  textSegmentationQueue.value.enqueue({ type: 'literal', value: props.text })
 }
 
 async function testChunking() {

packages/stage-ui/src/components/scenes/Stage.vue

Lines changed: 66 additions & 42 deletions

@@ -3,7 +3,9 @@ import type { DuckDBWasmDrizzleDatabase } from '@proj-airi/drizzle-duckdb-wasm'
 import type { SpeechProviderWithExtraOptions } from '@xsai-ext/shared-providers'
 import type { UnElevenLabsOptions } from 'unspeech'
 
+import type { TextSegmentationItem } from '../../composables/queues'
 import type { Emotion } from '../../constants/emotions'
+import type { TTSChunkItem } from '../../utils/tts'
 
 import { drizzle } from '@proj-airi/drizzle-duckdb-wasm'
 import { getImportUrlBundles } from '@proj-airi/drizzle-duckdb-wasm/bundles/import-url-browser'
@@ -65,7 +67,7 @@ const { textSegmentationQueue } = storeToRefs(textSegmentationStore)
 clearTextSegmentationHooks()
 
 const characterSpeechPlaybackQueue = usePipelineCharacterSpeechPlaybackQueueStore()
-const { connectAudioContext, connectAudioAnalyser, clearAll, onPlaybackStarted } = characterSpeechPlaybackQueue
+const { connectAudioContext, connectAudioAnalyser, clearAll, onPlaybackStarted, onPlaybackFinished } = characterSpeechPlaybackQueue
 const { currentAudioSource, playbackQueue } = storeToRefs(characterSpeechPlaybackQueue)
 
 const settingsStore = useSettings()
@@ -123,7 +125,48 @@ const lipSyncStarted = ref(false)
 const speechStore = useSpeechStore()
 const { ssmlEnabled, activeSpeechProvider, activeSpeechModel, activeSpeechVoice, pitch } = storeToRefs(speechStore)
 
-async function handleSpeechGeneration(ctx: { data: string }) {
+const { currentMotion } = storeToRefs(useLive2d())
+
+const emotionsQueue = createQueue<Emotion>({
+  handlers: [
+    async (ctx) => {
+      if (stageModelRenderer.value === 'vrm') {
+        // console.debug("VRM emotion anime: ", ctx.data)
+        const value = EMOTION_VRMExpressionName_value[ctx.data]
+        if (!value)
+          return
+
+        await vrmViewerRef.value!.setExpression(value)
+      }
+      else if (stageModelRenderer.value === 'live2d') {
+        currentMotion.value = { group: EMOTION_EmotionMotionName_value[ctx.data] }
+      }
+    },
+  ],
+})
+
+const emotionMessageContentQueue = useEmotionsMessageQueue(emotionsQueue)
+emotionMessageContentQueue.onHandlerEvent('emotion', (emotion) => {
+  // eslint-disable-next-line no-console
+  console.debug('emotion detected', emotion)
+})
+
+const delaysQueue = useDelayMessageQueue()
+delaysQueue.onHandlerEvent('delay', (delay) => {
+  // eslint-disable-next-line no-console
+  console.debug('delay detected', delay)
+})
+
+// Play special token: delay or emotion
+function playSpecialToken(special: string) {
+  delaysQueue.enqueue(special)
+  emotionMessageContentQueue.enqueue(special)
+}
+onPlaybackFinished(({ special }) => {
+  playSpecialToken(special)
+})
+
+async function handleSpeechGeneration(ctx: { data: TTSChunkItem }) {
   try {
     if (!activeSpeechProvider.value) {
       console.warn('No active speech provider configured')
@@ -141,11 +184,21 @@ async function handleSpeechGeneration(ctx: { data: string }) {
       return
     }
 
+    // console.debug("ctx.data.chunk is empty? ", ctx.data.chunk === "")
+    // console.debug("ctx.data.special: ", ctx.data.special)
+    if (ctx.data.chunk === '' && !ctx.data.special)
+      return
+    // If special token only and chunk = ""
+    if (ctx.data.chunk === '' && ctx.data.special) {
+      playSpecialToken(ctx.data.special)
+      return
+    }
+
    const providerConfig = providersStore.getProviderConfig(activeSpeechProvider.value)
 
    const input = ssmlEnabled.value
-      ? speechStore.generateSSML(ctx.data, activeSpeechVoice.value, { ...providerConfig, pitch: pitch.value })
-      : ctx.data
+      ? speechStore.generateSSML(ctx.data.chunk, activeSpeechVoice.value, { ...providerConfig, pitch: pitch.value })
+      : ctx.data.chunk
 
    const res = await generateSpeech({
      ...provider.speech(activeSpeechModel.value, providerConfig),
@@ -154,52 +207,21 @@ async function handleSpeechGeneration(ctx: { data: string }) {
    })
 
    const audioBuffer = await audioContext.decodeAudioData(res)
-    playbackQueue.value.enqueue({ audioBuffer, text: ctx.data })
+    playbackQueue.value.enqueue({ audioBuffer, text: ctx.data.chunk, special: ctx.data.special })
  }
  catch (error) {
    console.error('Speech generation failed:', error)
  }
}

-const ttsQueue = createQueue<string>({
+const ttsQueue = createQueue<TTSChunkItem>({
   handlers: [
     handleSpeechGeneration,
   ],
 })
 
-onTextSegmented((chunk) => {
-  ttsQueue.enqueue(chunk)
-})
-
-const { currentMotion } = storeToRefs(useLive2d())
-
-const emotionsQueue = createQueue<Emotion>({
-  handlers: [
-    async (ctx) => {
-      if (stageModelRenderer.value === 'vrm') {
-        const value = EMOTION_VRMExpressionName_value[ctx.data]
-        if (!value)
-          return
-
-        await vrmViewerRef.value!.setExpression(value)
-      }
-      else if (stageModelRenderer.value === 'live2d') {
-        currentMotion.value = { group: EMOTION_EmotionMotionName_value[ctx.data] }
-      }
-    },
-  ],
-})
-
-const emotionMessageContentQueue = useEmotionsMessageQueue(emotionsQueue)
-emotionMessageContentQueue.onHandlerEvent('emotion', (emotion) => {
-  // eslint-disable-next-line no-console
-  console.debug('emotion detected', emotion)
-})
-
-const delaysQueue = useDelayMessageQueue()
-delaysQueue.onHandlerEvent('delay', (delay) => {
-  // eslint-disable-next-line no-console
-  console.debug('delay detected', delay)
+onTextSegmented((chunkItem) => {
+  ttsQueue.enqueue(chunkItem)
 })
 
 function getVolumeWithMinMaxNormalizeWithFrameUpdates() {
@@ -241,12 +263,14 @@ onBeforeSend(async () => {
 
 onTokenLiteral(async (literal) => {
   // Only push to segmentation; visual presentation happens on playback start
-  textSegmentationQueue.value.enqueue(literal)
+  textSegmentationQueue.value.enqueue({ type: 'literal', value: literal } as TextSegmentationItem)
 })
 
 onTokenSpecial(async (special) => {
-  delaysQueue.enqueue(special)
-  emotionMessageContentQueue.enqueue(special)
+  // delaysQueue.enqueue(special)
+  // emotionMessageContentQueue.enqueue(special)
+  // Also push special token to the queue for emotion animation/delay and TTS playback synchronisation
+  textSegmentationQueue.value.enqueue({ type: 'special', value: special } as TextSegmentationItem)
 })
 
 onStreamEnd(async () => {
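The net effect of the Stage.vue rewiring above: a special token with no spoken text is applied immediately, while one paired with a text chunk is deferred until that chunk's audio ends. A condensed, self-contained restatement of the dispatch rule; playSpecialToken and playAudio are stand-ins for the real store hooks, not the repository's code:

interface TTSChunkItem { chunk: string, special: string | null }

function playSpecialToken(special: string) {
  console.log('apply emotion/delay:', special) // stands in for the delays/emotions queues
}

function dispatchChunk(item: TTSChunkItem, playAudio: (text: string, onEnded: () => void) => void) {
  if (item.chunk === '' && !item.special)
    return // nothing to speak, nothing to apply
  if (item.chunk === '' && item.special) {
    playSpecialToken(item.special) // no audio: fire immediately
    return
  }
  // Audio present: the special rides the playback item and fires on `onended`.
  playAudio(item.chunk, () => {
    if (item.special)
      playSpecialToken(item.special)
  })
}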

packages/stage-ui/src/composables/queues.ts

Lines changed: 31 additions & 11 deletions

@@ -1,5 +1,6 @@
 import type { Emotion } from '../constants/emotions'
 import type { UseQueueReturn } from '../utils/queue'
+import type { TTSChunkItem } from '../utils/tts'
 
 import { sleep } from '@moeru/std'
 import { invoke } from '@vueuse/core'
@@ -9,7 +10,12 @@ import { ref, shallowRef } from 'vue'
 import { EMOTION_VALUES } from '../constants/emotions'
 import { createQueue } from '../utils/queue'
 import { createControllableStream } from '../utils/stream'
-import { chunkEmitter } from '../utils/tts'
+import { chunkEmitter, TTS_SPECIAL_TOKEN } from '../utils/tts'
+
+export interface TextSegmentationItem {
+  type: 'literal' | 'special'
+  value: string
+}
 
 export function useEmotionsMessageQueue(emotionsQueue: UseQueueReturn<Emotion>) {
   function splitEmotion(content: string) {
@@ -106,13 +112,13 @@ export function useDelayMessageQueue() {
 export const usePipelineCharacterSpeechPlaybackQueueStore = defineStore('pipelines:character:speech', () => {
   // Hooks
   const onPlaybackStartedHooks = ref<Array<(payload: { text: string }) => Promise<void> | void>>([])
-  const onPlaybackFinishedHooks = ref<Array<(payload: { text: string }) => Promise<void> | void>>([])
+  const onPlaybackFinishedHooks = ref<Array<(payload: { special: string }) => Promise<void> | void>>([])
 
   // Hooks registers
   function onPlaybackStarted(hook: (payload: { text: string }) => Promise<void> | void) {
     onPlaybackStartedHooks.value.push(hook)
   }
-  function onPlaybackFinished(hook: (payload: { text: string }) => Promise<void> | void) {
+  function onPlaybackFinished(hook: (payload: { special: string }) => Promise<void> | void) {
     onPlaybackFinishedHooks.value.push(hook)
   }
 
@@ -141,7 +147,7 @@ export const usePipelineCharacterSpeechPlaybackQueueStore = defineStore('pipelin
   }
 
   const playbackQueue = ref(invoke(() => {
-    return createQueue<{ audioBuffer: AudioBuffer, text: string }>({
+    return createQueue<{ audioBuffer: AudioBuffer, text: string, special: string | null }>({
       handlers: [
         (ctx) => {
           return new Promise((resolve) => {
@@ -167,11 +173,15 @@ export const usePipelineCharacterSpeechPlaybackQueueStore = defineStore('pipelin
            currentAudioSource.value = source
            source.start(0)
            source.onended = () => {
-              for (const hook of onPlaybackFinishedHooks.value) hook({ text: ctx.data.text })
+              // Play special token: delay or emotion
+              if (ctx.data.special) {
+                for (const hook of onPlaybackFinishedHooks.value)
+                  hook({ special: ctx.data.special })
+              }
+
              if (currentAudioSource.value === source) {
                currentAudioSource.value = undefined
              }
-
              resolve()
            }
          })
@@ -206,10 +216,10 @@ export const usePipelineCharacterSpeechPlaybackQueueStore = defineStore('pipelin
 
 export const usePipelineWorkflowTextSegmentationStore = defineStore('pipelines:workflows:text-segmentation', () => {
   // Hooks
-  const onTextSegmentedHooks = ref<Array<(segment: string) => Promise<void> | void>>([])
+  const onTextSegmentedHooks = ref<Array<(segment: TTSChunkItem) => Promise<void> | void>>([])
 
   // Hooks registers
-  function onTextSegmented(hook: (segment: string) => Promise<void> | void) {
+  function onTextSegmented(hook: (segment: TTSChunkItem) => Promise<void> | void) {
     onTextSegmentedHooks.value.push(hook)
   }
 
@@ -226,17 +236,27 @@ export const usePipelineWorkflowTextSegmentationStore = defineStore('pipelines:w
    const { stream, controller } = createControllableStream<Uint8Array>()
    textSegmentationStream.value = stream
    textSegmentationStreamController.value = controller
+    // This is the queue for pending special tokens
+    const pendingSpecials: string[] = []
 
-    chunkEmitter(stream.getReader(), async (chunk) => {
+    chunkEmitter(stream.getReader(), pendingSpecials, async (chunk) => {
      for (const hook of onTextSegmentedHooks.value) {
        await hook(chunk)
      }
    })
 
-    return createQueue<string>({
+    return createQueue<TextSegmentationItem>({
      handlers: [
        async (ctx) => {
-          controller.enqueue(encoder.encode(ctx.data))
+          if (ctx.data.type === 'literal') {
+            controller.enqueue(encoder.encode(ctx.data.value))
+          }
+          else {
+            // Special literal, need to be flushed in tts rechunking
+            // console.debug("TextSegmentationQueue: Special enqueue", encoder.encode(TTS_SPECIAL_TOKEN))
+            pendingSpecials.push(ctx.data.value)
+            controller.enqueue(encoder.encode(TTS_SPECIAL_TOKEN))
+          }
        },
      ],
    })
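The fifth changed file, packages/stage-ui/src/utils/tts.ts, does not appear in this view. From the call sites above, its exports plausibly have the following contract; this is inferred from usage, not taken from the actual diff, and the TTS_SPECIAL_TOKEN value in particular is a placeholder:

export interface TTSChunkItem {
  chunk: string
  special: string | null
}

// Marker written into the segmentation byte stream in place of a special
// token; the real value is whatever utils/tts.ts defines.
export const TTS_SPECIAL_TOKEN = '<|TTS_SPECIAL|>'

// Re-chunks the segmentation stream into speakable pieces; when it meets a
// TTS_SPECIAL_TOKEN marker it flushes the buffered text and pairs it with
// the next pending special token (FIFO order).
export declare function chunkEmitter(
  reader: ReadableStreamDefaultReader<Uint8Array>,
  pendingSpecials: string[],
  onChunk: (chunk: TTSChunkItem) => Promise<void> | void,
): Promise<void>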
packages/stage-ui/src/constants/emotions.ts (file path inferred from the imports in the diffs above)

Lines changed: 56 additions & 51 deletions

@@ -1,51 +1,56 @@
-export const EMOTION_HAPPY = '<|EMOTE_HAPPY|>'
-export const EMOTION_SAD = '<|EMOTE_SAD|>'
-export const EMOTION_ANGRY = '<|EMOTE_ANGRY|>'
-export const EMOTION_THINK = '<|EMOTE_THINK|>'
-export const EMOTION_SURPRISE = '<|EMOTE_SURPRISE|>'
-export const EMOTION_AWKWARD = '<|EMOTE_AWKWARD|>'
-export const EMOTION_QUESTION = '<|EMOTE_QUESTION|>'
-
-export enum Emotion {
-  Idle = '<|EMOTE_NEUTRAL|>',
-  Happy = '<|EMOTE_HAPPY|>',
-  Sad = '<|EMOTE_SAD|>',
-  Angry = '<|EMOTE_ANGRY|>',
-  Think = '<|EMOTE_THINK|>',
-  Surprise = '<|EMOTE_SURPRISE|>',
-  Awkward = '<|EMOTE_AWKWARD|>',
-  Question = '<|EMOTE_QUESTION|>',
-}
-
-export const EMOTION_VALUES = Object.values(Emotion)
-
-export const EmotionHappyMotionName = 'Happy'
-export const EmotionSadMotionName = 'Sad'
-export const EmotionAngryMotionName = 'Angry'
-export const EmotionAwkwardMotionName = 'Awkward'
-export const EmotionThinkMotionName = 'Think'
-export const EmotionSurpriseMotionName = 'Surprise'
-export const EmotionQuestionMotionName = 'Question'
-export const EmotionNeutralMotionName = 'Idle'
-
-export const EMOTION_EmotionMotionName_value = {
-  [Emotion.Happy]: EmotionHappyMotionName,
-  [Emotion.Sad]: EmotionSadMotionName,
-  [Emotion.Angry]: EmotionAngryMotionName,
-  [Emotion.Think]: EmotionThinkMotionName,
-  [Emotion.Surprise]: EmotionSurpriseMotionName,
-  [Emotion.Awkward]: EmotionAwkwardMotionName,
-  [Emotion.Question]: EmotionQuestionMotionName,
-  [Emotion.Idle]: EmotionNeutralMotionName,
-}
-
-export const EMOTION_VRMExpressionName_value = {
-  [Emotion.Happy]: 'happy',
-  [Emotion.Sad]: 'sad',
-  [Emotion.Angry]: 'angry',
-  [Emotion.Think]: undefined,
-  [Emotion.Surprise]: 'surprised',
-  [Emotion.Awkward]: undefined,
-  [Emotion.Question]: undefined,
-  [Emotion.Idle]: undefined,
-} satisfies Record<Emotion, string | undefined>
+export const EMOTION_HAPPY = '<|EMOTE_HAPPY|>'
+export const EMOTION_SAD = '<|EMOTE_SAD|>'
+export const EMOTION_ANGRY = '<|EMOTE_ANGRY|>'
+export const EMOTION_THINK = '<|EMOTE_THINK|>'
+export const EMOTION_SURPRISE = '<|EMOTE_SURPRISED|>'
+export const EMOTION_AWKWARD = '<|EMOTE_AWKWARD|>'
+export const EMOTION_QUESTION = '<|EMOTE_QUESTION|>'
+export const EMOTION_CURIOUS = '<|EMOTE_CURIOUS|>'
+
+export enum Emotion {
+  Idle = '<|EMOTE_NEUTRAL|>',
+  Happy = '<|EMOTE_HAPPY|>',
+  Sad = '<|EMOTE_SAD|>',
+  Angry = '<|EMOTE_ANGRY|>',
+  Think = '<|EMOTE_THINK|>',
+  Surprise = '<|EMOTE_SURPRISED|>',
+  Awkward = '<|EMOTE_AWKWARD|>',
+  Question = '<|EMOTE_QUESTION|>',
+  Curious = '<|EMOTE_CURIOUS|>',
+}
+
+export const EMOTION_VALUES = Object.values(Emotion)
+
+export const EmotionHappyMotionName = 'Happy'
+export const EmotionSadMotionName = 'Sad'
+export const EmotionAngryMotionName = 'Angry'
+export const EmotionAwkwardMotionName = 'Awkward'
+export const EmotionThinkMotionName = 'Think'
+export const EmotionSurpriseMotionName = 'Surprise'
+export const EmotionQuestionMotionName = 'Question'
+export const EmotionNeutralMotionName = 'Idle'
+export const EmotionCuriousMotionName = 'Curious'
+
+export const EMOTION_EmotionMotionName_value = {
+  [Emotion.Happy]: EmotionHappyMotionName,
+  [Emotion.Sad]: EmotionSadMotionName,
+  [Emotion.Angry]: EmotionAngryMotionName,
+  [Emotion.Think]: EmotionThinkMotionName,
+  [Emotion.Surprise]: EmotionSurpriseMotionName,
+  [Emotion.Awkward]: EmotionAwkwardMotionName,
+  [Emotion.Question]: EmotionQuestionMotionName,
+  [Emotion.Idle]: EmotionNeutralMotionName,
+  [Emotion.Curious]: EmotionCuriousMotionName,
+}
+
+export const EMOTION_VRMExpressionName_value = {
+  [Emotion.Happy]: 'happy',
+  [Emotion.Sad]: 'sad',
+  [Emotion.Angry]: 'angry',
+  [Emotion.Think]: undefined,
+  [Emotion.Surprise]: 'surprised',
+  [Emotion.Awkward]: undefined,
+  [Emotion.Question]: undefined,
+  [Emotion.Idle]: undefined,
+  [Emotion.Curious]: 'surprised',
+} satisfies Record<Emotion, string | undefined>
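For reference, how the new Curious token resolves per renderer, as an illustrative check (the relative import path is assumed):

import { Emotion, EMOTION_EmotionMotionName_value, EMOTION_VRMExpressionName_value } from '../constants/emotions'

// Live2D plays a dedicated 'Curious' motion group...
console.log(EMOTION_EmotionMotionName_value[Emotion.Curious]) // 'Curious'
// ...while VRM has no dedicated expression yet and reuses 'surprised'.
console.log(EMOTION_VRMExpressionName_value[Emotion.Curious]) // 'surprised'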
