From 06549eb3cd35b8170c3da17844996c0e85ace2e7 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Sat, 27 Sep 2025 07:49:08 +0900 Subject: [PATCH 1/2] fix: #494 Voice input transcription failing in realtime-demo --- .changeset/wet-snakes-stare.md | 5 +++++ .../agents-realtime/src/realtimeSession.ts | 14 ++++++++++++-- .../test/realtimeSession.test.ts | 18 +++++++++++++++++- 3 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 .changeset/wet-snakes-stare.md diff --git a/.changeset/wet-snakes-stare.md b/.changeset/wet-snakes-stare.md new file mode 100644 index 00000000..5471efbe --- /dev/null +++ b/.changeset/wet-snakes-stare.md @@ -0,0 +1,5 @@ +--- +'@openai/agents-realtime': patch +--- + +fix: #494 Voice input transcription failing in realtime-demo diff --git a/packages/agents-realtime/src/realtimeSession.ts b/packages/agents-realtime/src/realtimeSession.ts index 1e4fd899..e4ca173f 100644 --- a/packages/agents-realtime/src/realtimeSession.ts +++ b/packages/agents-realtime/src/realtimeSession.ts @@ -29,7 +29,10 @@ import { RealtimeOutputGuardrailSettings, } from './guardrail'; import { RealtimeItem } from './items'; -import { OpenAIRealtimeModels } from './openaiRealtimeBase'; +import { + DEFAULT_OPENAI_REALTIME_SESSION_CONFIG, + OpenAIRealtimeModels, +} from './openaiRealtimeBase'; import { OpenAIRealtimeWebRTC } from './openaiRealtimeWebRtc'; import { OpenAIRealtimeWebSocket } from './openaiRealtimeWebsocket'; import { RealtimeAgent } from './realtimeAgent'; @@ -147,6 +150,12 @@ export type RealtimeSessionConnectOptions = { url?: string; }; +function cloneDefaultSessionConfig(): Partial { + return JSON.parse( + JSON.stringify(DEFAULT_OPENAI_REALTIME_SESSION_CONFIG), + ) as Partial; +} + /** * A `RealtimeSession` is the cornerstone of building Voice Agents. It's the equivalent of a * Runner in text-based agents except that it automatically handles multiple turns by maintaining a @@ -206,7 +215,8 @@ export class RealtimeSession< // modalities, speed, toolChoice, turnDetection, etc.). Without this, updating // the agent would drop audio format overrides (e.g. g711_ulaw) and revert to // transport defaults causing issues for integrations like Twilio. - #lastSessionConfig: Partial | null = null; + #lastSessionConfig: Partial | null = + cloneDefaultSessionConfig(); #automaticallyTriggerResponseForMcpToolCalls: boolean = true; constructor( diff --git a/packages/agents-realtime/test/realtimeSession.test.ts b/packages/agents-realtime/test/realtimeSession.test.ts index 59c49bae..40104298 100644 --- a/packages/agents-realtime/test/realtimeSession.test.ts +++ b/packages/agents-realtime/test/realtimeSession.test.ts @@ -11,7 +11,10 @@ import { } from '@openai/agents-core'; import * as utils from '../src/utils'; import type { TransportToolCallEvent } from '../src/transportLayerEvents'; -import { OpenAIRealtimeBase } from '../src/openaiRealtimeBase'; +import { + DEFAULT_OPENAI_REALTIME_SESSION_CONFIG, + OpenAIRealtimeBase, +} from '../src/openaiRealtimeBase'; function createMessage(id: string, text: string): RealtimeItem { return { @@ -122,6 +125,19 @@ describe('RealtimeSession', () => { expect(t.connectCalls[0]?.url).toBe('ws://example'); }); + it('includes default transcription config when connecting', async () => { + const t = new FakeTransport(); + const agent = new RealtimeAgent({ name: 'A', handoffs: [] }); + const s = new RealtimeSession(agent, { transport: t }); + await s.connect({ apiKey: 'test' }); + + expect( + t.connectCalls[0]?.initialSessionConfig?.audio?.input?.transcription, + ).toEqual( + DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.input?.transcription, + ); + }); + it('updateHistory accepts callback', () => { const item = createMessage('1', 'hi'); session.updateHistory([item]); From 6001c53352eff5a09c14ac6b9a36c6e6936a4d7e Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Sat, 27 Sep 2025 08:20:25 +0900 Subject: [PATCH 2/2] fix --- packages/agents-realtime/test/realtimeSession.test.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/packages/agents-realtime/test/realtimeSession.test.ts b/packages/agents-realtime/test/realtimeSession.test.ts index 40104298..1631b1cd 100644 --- a/packages/agents-realtime/test/realtimeSession.test.ts +++ b/packages/agents-realtime/test/realtimeSession.test.ts @@ -15,6 +15,7 @@ import { DEFAULT_OPENAI_REALTIME_SESSION_CONFIG, OpenAIRealtimeBase, } from '../src/openaiRealtimeBase'; +import { toNewSessionConfig } from '../src/clientMessages'; function createMessage(id: string, text: string): RealtimeItem { return { @@ -131,9 +132,11 @@ describe('RealtimeSession', () => { const s = new RealtimeSession(agent, { transport: t }); await s.connect({ apiKey: 'test' }); - expect( - t.connectCalls[0]?.initialSessionConfig?.audio?.input?.transcription, - ).toEqual( + const normalizedConfig = toNewSessionConfig( + t.connectCalls[0]?.initialSessionConfig ?? {}, + ); + + expect(normalizedConfig.audio?.input?.transcription).toEqual( DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.input?.transcription, ); });