diff --git a/.changeset/wet-snakes-stare.md b/.changeset/wet-snakes-stare.md new file mode 100644 index 00000000..5471efbe --- /dev/null +++ b/.changeset/wet-snakes-stare.md @@ -0,0 +1,5 @@ +--- +'@openai/agents-realtime': patch +--- + +fix: #494 Voice input transcription failing in realtime-demo diff --git a/packages/agents-realtime/src/realtimeSession.ts b/packages/agents-realtime/src/realtimeSession.ts index 1e4fd899..e4ca173f 100644 --- a/packages/agents-realtime/src/realtimeSession.ts +++ b/packages/agents-realtime/src/realtimeSession.ts @@ -29,7 +29,10 @@ import { RealtimeOutputGuardrailSettings, } from './guardrail'; import { RealtimeItem } from './items'; -import { OpenAIRealtimeModels } from './openaiRealtimeBase'; +import { + DEFAULT_OPENAI_REALTIME_SESSION_CONFIG, + OpenAIRealtimeModels, +} from './openaiRealtimeBase'; import { OpenAIRealtimeWebRTC } from './openaiRealtimeWebRtc'; import { OpenAIRealtimeWebSocket } from './openaiRealtimeWebsocket'; import { RealtimeAgent } from './realtimeAgent'; @@ -147,6 +150,12 @@ export type RealtimeSessionConnectOptions = { url?: string; }; +function cloneDefaultSessionConfig(): Partial<RealtimeSessionConfig> { + return JSON.parse( + JSON.stringify(DEFAULT_OPENAI_REALTIME_SESSION_CONFIG), + ) as Partial<RealtimeSessionConfig>; +} + /** * A `RealtimeSession` is the cornerstone of building Voice Agents. It's the equivalent of a * Runner in text-based agents except that it automatically handles multiple turns by maintaining a @@ -206,7 +215,8 @@ export class RealtimeSession< // modalities, speed, toolChoice, turnDetection, etc.). Without this, updating // the agent would drop audio format overrides (e.g. g711_ulaw) and revert to // transport defaults causing issues for integrations like Twilio. 
- #lastSessionConfig: Partial<RealtimeSessionConfig> | null = null; + #lastSessionConfig: Partial<RealtimeSessionConfig> | null = + cloneDefaultSessionConfig(); #automaticallyTriggerResponseForMcpToolCalls: boolean = true; constructor( diff --git a/packages/agents-realtime/test/realtimeSession.test.ts b/packages/agents-realtime/test/realtimeSession.test.ts index 59c49bae..1631b1cd 100644 --- a/packages/agents-realtime/test/realtimeSession.test.ts +++ b/packages/agents-realtime/test/realtimeSession.test.ts @@ -11,7 +11,11 @@ import { } from '@openai/agents-core'; import * as utils from '../src/utils'; import type { TransportToolCallEvent } from '../src/transportLayerEvents'; -import { OpenAIRealtimeBase } from '../src/openaiRealtimeBase'; +import { + DEFAULT_OPENAI_REALTIME_SESSION_CONFIG, + OpenAIRealtimeBase, +} from '../src/openaiRealtimeBase'; +import { toNewSessionConfig } from '../src/clientMessages'; function createMessage(id: string, text: string): RealtimeItem { return { @@ -122,6 +126,21 @@ describe('RealtimeSession', () => { expect(t.connectCalls[0]?.url).toBe('ws://example'); }); + it('includes default transcription config when connecting', async () => { + const t = new FakeTransport(); + const agent = new RealtimeAgent({ name: 'A', handoffs: [] }); + const s = new RealtimeSession(agent, { transport: t }); + await s.connect({ apiKey: 'test' }); + + const normalizedConfig = toNewSessionConfig( + t.connectCalls[0]?.initialSessionConfig ?? {}, + ); + + expect(normalizedConfig.audio?.input?.transcription).toEqual( + DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.input?.transcription, + ); + }); + it('updateHistory accepts callback', () => { const item = createMessage('1', 'hi'); session.updateHistory([item]);