From 252d6d9a2ed1d689e784bb347da82aa70060292e Mon Sep 17 00:00:00 2001 From: lionel-oai Date: Wed, 10 Dec 2025 08:30:59 -0600 Subject: [PATCH] Fix demo energy threshold --- examples/realtime/cli/demo.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/examples/realtime/cli/demo.py b/examples/realtime/cli/demo.py index 4dfaf2c5a..6fc5a7967 100644 --- a/examples/realtime/cli/demo.py +++ b/examples/realtime/cli/demo.py @@ -25,6 +25,7 @@ ENERGY_THRESHOLD = 0.015 # RMS threshold for barge‑in while assistant is speaking PREBUFFER_CHUNKS = 3 # initial jitter buffer (~120ms with 40ms chunks) FADE_OUT_MS = 12 # short fade to avoid clicks when interrupting +PLAYBACK_ECHO_MARGIN = 0.002 # extra energy above playback echo required to count as speech # Set up logging for OpenAI agents SDK # logging.basicConfig( @@ -78,6 +79,7 @@ def __init__(self) -> None: self.fade_total_samples = 0 self.fade_done_samples = 0 self.fade_samples = int(SAMPLE_RATE * (FADE_OUT_MS / 1000.0)) + self.playback_rms = 0.0 # smoothed playback energy to filter out echo def _output_callback(self, outdata, frames: int, time, status) -> None: """Callback for audio output - handles continuous audio stream from server.""" @@ -123,6 +125,7 @@ def _output_callback(self, outdata, frames: int, time, status) -> None: gain = 1.0 - (idx / float(self.fade_total_samples)) ramped = np.clip(src * gain, -32768.0, 32767.0).astype(np.int16) outdata[samples_filled : samples_filled + n, 0] = ramped + self._update_playback_rms(ramped) # Optionally report played bytes (ramped) to playback tracker try: @@ -183,6 +186,7 @@ def _output_callback(self, outdata, frames: int, time, status) -> None: chunk_data = samples[self.chunk_position : self.chunk_position + samples_to_copy] # More efficient: direct assignment for mono audio instead of reshape outdata[samples_filled : samples_filled + samples_to_copy, 0] = chunk_data + self._update_playback_rms(chunk_data) samples_filled += samples_to_copy self.chunk_position += samples_to_copy @@ -273,14 +277,6 @@ async def capture_audio(self) -> None: read_size = int(SAMPLE_RATE * CHUNK_LENGTH_S) try: - # Simple energy-based barge-in: if user speaks while audio is playing, interrupt. - def rms_energy(samples: np.ndarray[Any, np.dtype[Any]]) -> float: - if samples.size == 0: - return 0.0 - # Normalize int16 to [-1, 1] - x = samples.astype(np.float32) / 32768.0 - return float(np.sqrt(np.mean(x * x))) - while self.recording: # Check if there's enough data to read if self.audio_stream.read_available < read_size: @@ -300,7 +296,13 @@ def rms_energy(samples: np.ndarray[Any, np.dtype[Any]]) -> float: if assistant_playing: # Compute RMS energy to detect speech while assistant is talking samples = data.reshape(-1) - if rms_energy(samples) >= ENERGY_THRESHOLD: + mic_rms = self._compute_rms(samples) + # Require the mic to be louder than the echo of the assistant playback. + playback_gate = max( + ENERGY_THRESHOLD, + self.playback_rms * 0.6 + PLAYBACK_ECHO_MARGIN, + ) + if mic_rms >= playback_gate: # Locally flush queued assistant audio for snappier interruption. self.interrupt_event.set() await self.session.send_audio(audio_bytes) @@ -356,6 +358,18 @@ async def _on_event(self, event: RealtimeSessionEvent) -> None: except Exception as e: print(f"Error processing event: {_truncate_str(str(e), 200)}") + def _compute_rms(self, samples: np.ndarray[Any, np.dtype[Any]]) -> float: + """Compute RMS energy for int16 samples normalized to [-1, 1].""" + if samples.size == 0: + return 0.0 + x = samples.astype(np.float32) / 32768.0 + return float(np.sqrt(np.mean(x * x))) + + def _update_playback_rms(self, samples: np.ndarray[Any, np.dtype[Any]]) -> None: + """Keep a smoothed estimate of playback energy to filter out echo feedback.""" + sample_rms = self._compute_rms(samples) + self.playback_rms = 0.9 * self.playback_rms + 0.1 * sample_rms + if __name__ == "__main__": demo = NoUIDemo()