5 changes: 5 additions & 0 deletions .changeset/all-mangos-send.md
@@ -0,0 +1,5 @@
---
'@openai/agents-realtime': patch
---

fix: #495 Realtime session config falls back to legacy format when voice is set
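
For context, a minimal sketch of the #495 scenario this patch addresses (not part of the diff; the import path and format literal mirror the regression test added below): a GA-style session config that also sets the legacy top-level voice should keep its audio formats instead of being routed through the legacy conversion.

import { toNewSessionConfig } from '../src/clientMessages';

const converted = toNewSessionConfig({
  voice: 'alloy', // legacy top-level property
  audio: {
    input: { format: { type: 'audio/pcmu' as const } },
    output: { format: { type: 'audio/pcmu' as const } },
  },
});

// Before this patch, the presence of `voice` pushed the whole config down the
// deprecated branch and the GA audio formats were lost. After it, the formats
// survive and the voice maps onto audio.output.voice:
// converted.audio?.input?.format  -> { type: 'audio/pcmu' }
// converted.audio?.output?.voice  -> 'alloy'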
52 changes: 32 additions & 20 deletions packages/agents-realtime/src/clientMessages.ts
@@ -133,6 +133,12 @@ export type RealtimeSessionConfigCommon = {
export type RealtimeSessionConfigDefinition = RealtimeSessionConfigCommon & {
  outputModalities?: ('text' | 'audio')[];
  audio?: RealtimeAudioConfig;
  /**
   * TODO: We'll eventually migrate to audio.output.voice instead of this property.
   * Until we fully migrate to audio.output.voice for all session implementations,
   * using this top-level voice property helps with backwards compatibility.
   */
  voice?: string;
};
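
A small sketch of what this addition permits (type name taken from the diff above; values are illustrative): both the backwards-compatible top-level voice and the newer audio.output.voice now type-check against the GA config shape.

// Legacy-compatible spelling, accepted until the migration noted in the TODO:
const legacyVoiceConfig: Partial<RealtimeSessionConfigDefinition> = {
  voice: 'alloy',
};

// Long-term target spelling:
const gaVoiceConfig: Partial<RealtimeSessionConfigDefinition> = {
  audio: { output: { voice: 'alloy' } },
};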

// Deprecated config (legacy) — cannot be mixed with new fields
@@ -175,7 +181,6 @@ function isDeprecatedConfig(
): config is Partial<RealtimeSessionConfigDeprecated> {
  return (
    isDefined('modalities', config) ||
    isDefined('voice', config) ||
    isDefined('inputAudioFormat', config) ||
    isDefined('outputAudioFormat', config) ||
    isDefined('inputAudioTranscription', config) ||
@@ -193,6 +198,25 @@ export function toNewSessionConfig(
  config: Partial<RealtimeSessionConfig>,
): Partial<RealtimeSessionConfigDefinition> {
  if (!isDeprecatedConfig(config)) {
    const inputConfig = config.audio?.input
      ? {
          format: normalizeAudioFormat(config.audio.input.format),
          noiseReduction: config.audio.input.noiseReduction ?? null,
          transcription: config.audio.input.transcription,
          turnDetection: config.audio.input.turnDetection,
        }
      : undefined;

    const requestedOutputVoice = config.audio?.output?.voice ?? config.voice;
    const outputConfig =
      config.audio?.output || typeof requestedOutputVoice !== 'undefined'
        ? {
            format: normalizeAudioFormat(config.audio?.output?.format),
            voice: requestedOutputVoice,
            speed: config.audio?.output?.speed,
          }
        : undefined;

    return {
      model: config.model,
      instructions: config.instructions,
@@ -202,25 +226,13 @@
      providerData: config.providerData,
      prompt: config.prompt,
      outputModalities: config.outputModalities,
      audio: config.audio
        ? {
            input: config.audio.input
              ? {
                  format: normalizeAudioFormat(config.audio.input.format),
                  noiseReduction: config.audio.input.noiseReduction ?? null,
                  transcription: config.audio.input.transcription,
                  turnDetection: config.audio.input.turnDetection,
                }
              : undefined,
            output: config.audio.output
              ? {
                  format: normalizeAudioFormat(config.audio.output.format),
                  voice: config.audio.output.voice,
                  speed: config.audio.output.speed,
                }
              : undefined,
          }
        : undefined,
      audio:
        inputConfig || outputConfig
          ? {
              input: inputConfig,
              output: outputConfig,
            }
          : undefined,
    };
  }
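
As a quick illustration of the fallback above (a sketch, not part of the diff): when only the legacy top-level voice is supplied and no GA audio block is present, the converter still emits a GA-shaped output block.

// Only the legacy top-level voice is set, no audio block at all.
const onlyVoice = toNewSessionConfig({ voice: 'alloy' });

// requestedOutputVoice falls back to config.voice, so an output block is
// still produced with the voice carried over.
// onlyVoice.audio?.output?.voice -> 'alloy'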

@@ -0,0 +1,82 @@
import { describe, it, expect } from 'vitest';
import { toNewSessionConfig } from '../src/clientMessages';
import { RealtimeAgent } from '../src/realtimeAgent';
import { RealtimeSession } from '../src/realtimeSession';
import { OpenAIRealtimeBase } from '../src/openaiRealtimeBase';
import type { RealtimeClientMessage } from '../src/clientMessages';

const TELEPHONY_AUDIO_FORMAT = { type: 'audio/pcmu' as const };

class CapturingTransport extends OpenAIRealtimeBase {
  status: 'connected' | 'disconnected' | 'connecting' | 'disconnecting' =
    'disconnected';
  mergedConfig: any = null;
  events: RealtimeClientMessage[] = [];

  async connect(options: { initialSessionConfig?: any }) {
    this.mergedConfig = (this as any)._getMergedSessionConfig(
      options.initialSessionConfig ?? {},
    );
  }

  sendEvent(event: RealtimeClientMessage) {
    this.events.push(event);
  }

  mute() {}
  close() {}
  interrupt() {}

  get muted() {
    return false;
  }
}

describe('Realtime session voice config regression', () => {
  it('does not drop GA audio formats when top-level voice is present', () => {
    const converted = toNewSessionConfig({
      voice: 'alloy',
      audio: {
        input: { format: TELEPHONY_AUDIO_FORMAT },
        output: { format: TELEPHONY_AUDIO_FORMAT },
      },
    });

    expect(converted.audio?.input?.format).toEqual(TELEPHONY_AUDIO_FORMAT);
    expect(converted.audio?.output?.format).toEqual(TELEPHONY_AUDIO_FORMAT);
    expect(converted.audio?.output?.voice).toBe('alloy');
  });

  it('does not reset audio formats when connecting a session for an agent with voice configured', async () => {
    const transport = new CapturingTransport();
    const agent = new RealtimeAgent({
      name: 'voice-agent',
      instructions: 'Respond cheerfully.',
      voice: 'alloy',
    });

    const session = new RealtimeSession(agent, {
      transport,
      model: 'gpt-realtime',
      config: {
        audio: {
          input: { format: TELEPHONY_AUDIO_FORMAT },
          output: {
            format: TELEPHONY_AUDIO_FORMAT,
            voice: 'marin',
          },
        },
      },
    });

    await session.connect({ apiKey: 'dummy-key' });

    expect(transport.mergedConfig?.audio?.input?.format).toEqual(
      TELEPHONY_AUDIO_FORMAT,
    );
    expect(transport.mergedConfig?.audio?.output?.format).toEqual(
      TELEPHONY_AUDIO_FORMAT,
    );
    expect(transport.mergedConfig?.audio?.output?.voice).toBe('marin');
  });
});