diff --git a/.changeset/all-mangos-send.md b/.changeset/all-mangos-send.md new file mode 100644 index 00000000..bec58241 --- /dev/null +++ b/.changeset/all-mangos-send.md @@ -0,0 +1,5 @@ +--- +'@openai/agents-realtime': patch +--- + +fix: #495 Realtime session config falls back to legacy format when voice is set diff --git a/packages/agents-realtime/src/clientMessages.ts b/packages/agents-realtime/src/clientMessages.ts index 3b68a36f..14bc0d55 100644 --- a/packages/agents-realtime/src/clientMessages.ts +++ b/packages/agents-realtime/src/clientMessages.ts @@ -133,6 +133,12 @@ export type RealtimeSessionConfigCommon = { export type RealtimeSessionConfigDefinition = RealtimeSessionConfigCommon & { outputModalities?: ('text' | 'audio')[]; audio?: RealtimeAudioConfig; + /** + * TODO: We'll eventually migrate to audio.output.voice instead of this property. + * Until we fully migrate to audio.output.voice for all session implementations, + * using this top-level voice property helps with backwards compatibility. + */ + voice?: string; }; // Deprecated config (legacy) — cannot be mixed with new fields @@ -175,7 +181,6 @@ function isDeprecatedConfig( ): config is Partial { return ( isDefined('modalities', config) || - isDefined('voice', config) || isDefined('inputAudioFormat', config) || isDefined('outputAudioFormat', config) || isDefined('inputAudioTranscription', config) || @@ -193,6 +198,25 @@ export function toNewSessionConfig( config: Partial, ): Partial { if (!isDeprecatedConfig(config)) { + const inputConfig = config.audio?.input + ? { + format: normalizeAudioFormat(config.audio.input.format), + noiseReduction: config.audio.input.noiseReduction ?? null, + transcription: config.audio.input.transcription, + turnDetection: config.audio.input.turnDetection, + } + : undefined; + + const requestedOutputVoice = config.audio?.output?.voice ?? config.voice; + const outputConfig = + config.audio?.output || typeof requestedOutputVoice !== 'undefined' + ? { + format: normalizeAudioFormat(config.audio?.output?.format), + voice: requestedOutputVoice, + speed: config.audio?.output?.speed, + } + : undefined; + return { model: config.model, instructions: config.instructions, @@ -202,25 +226,13 @@ export function toNewSessionConfig( providerData: config.providerData, prompt: config.prompt, outputModalities: config.outputModalities, - audio: config.audio - ? { - input: config.audio.input - ? { - format: normalizeAudioFormat(config.audio.input.format), - noiseReduction: config.audio.input.noiseReduction ?? null, - transcription: config.audio.input.transcription, - turnDetection: config.audio.input.turnDetection, - } - : undefined, - output: config.audio.output - ? { - format: normalizeAudioFormat(config.audio.output.format), - voice: config.audio.output.voice, - speed: config.audio.output.speed, - } - : undefined, - } - : undefined, + audio: + inputConfig || outputConfig + ? { + input: inputConfig, + output: outputConfig, + } + : undefined, }; } diff --git a/packages/agents-realtime/test/realtimeVoiceConfigRegression.test.ts b/packages/agents-realtime/test/realtimeVoiceConfigRegression.test.ts new file mode 100644 index 00000000..b8432afb --- /dev/null +++ b/packages/agents-realtime/test/realtimeVoiceConfigRegression.test.ts @@ -0,0 +1,82 @@ +import { describe, it, expect } from 'vitest'; +import { toNewSessionConfig } from '../src/clientMessages'; +import { RealtimeAgent } from '../src/realtimeAgent'; +import { RealtimeSession } from '../src/realtimeSession'; +import { OpenAIRealtimeBase } from '../src/openaiRealtimeBase'; +import type { RealtimeClientMessage } from '../src/clientMessages'; + +const TELEPHONY_AUDIO_FORMAT = { type: 'audio/pcmu' as const }; + +class CapturingTransport extends OpenAIRealtimeBase { + status: 'connected' | 'disconnected' | 'connecting' | 'disconnecting' = + 'disconnected'; + mergedConfig: any = null; + events: RealtimeClientMessage[] = []; + + async connect(options: { initialSessionConfig?: any }) { + this.mergedConfig = (this as any)._getMergedSessionConfig( + options.initialSessionConfig ?? {}, + ); + } + + sendEvent(event: RealtimeClientMessage) { + this.events.push(event); + } + + mute() {} + close() {} + interrupt() {} + + get muted() { + return false; + } +} + +describe('Realtime session voice config regression', () => { + it('drops GA audio formats when top-level voice is present', () => { + const converted = toNewSessionConfig({ + voice: 'alloy', + audio: { + input: { format: TELEPHONY_AUDIO_FORMAT }, + output: { format: TELEPHONY_AUDIO_FORMAT }, + }, + }); + + expect(converted.audio?.input?.format).toEqual(TELEPHONY_AUDIO_FORMAT); + expect(converted.audio?.output?.format).toEqual(TELEPHONY_AUDIO_FORMAT); + expect(converted.audio?.output?.voice).toBe('alloy'); + }); + + it('resets audio formats when connecting a session for an agent with voice configured', async () => { + const transport = new CapturingTransport(); + const agent = new RealtimeAgent({ + name: 'voice-agent', + instructions: 'Respond cheerfully.', + voice: 'alloy', + }); + + const session = new RealtimeSession(agent, { + transport, + model: 'gpt-realtime', + config: { + audio: { + input: { format: TELEPHONY_AUDIO_FORMAT }, + output: { + format: TELEPHONY_AUDIO_FORMAT, + voice: 'marin', + }, + }, + }, + }); + + await session.connect({ apiKey: 'dummy-key' }); + + expect(transport.mergedConfig?.audio?.input?.format).toEqual( + TELEPHONY_AUDIO_FORMAT, + ); + expect(transport.mergedConfig?.audio?.output?.format).toEqual( + TELEPHONY_AUDIO_FORMAT, + ); + expect(transport.mergedConfig?.audio?.output?.voice).toBe('marin'); + }); +});