Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions examples/realtime/cli/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
ENERGY_THRESHOLD = 0.015 # RMS threshold for barge‑in while assistant is speaking
PREBUFFER_CHUNKS = 3 # initial jitter buffer (~120ms with 40ms chunks)
FADE_OUT_MS = 12 # short fade to avoid clicks when interrupting
PLAYBACK_ECHO_MARGIN = 0.002 # extra energy above playback echo required to count as speech

# Set up logging for OpenAI agents SDK
# logging.basicConfig(
Expand Down Expand Up @@ -78,6 +79,7 @@ def __init__(self) -> None:
self.fade_total_samples = 0
self.fade_done_samples = 0
self.fade_samples = int(SAMPLE_RATE * (FADE_OUT_MS / 1000.0))
self.playback_rms = 0.0 # smoothed playback energy to filter out echo

def _output_callback(self, outdata, frames: int, time, status) -> None:
"""Callback for audio output - handles continuous audio stream from server."""
Expand Down Expand Up @@ -123,6 +125,7 @@ def _output_callback(self, outdata, frames: int, time, status) -> None:
gain = 1.0 - (idx / float(self.fade_total_samples))
ramped = np.clip(src * gain, -32768.0, 32767.0).astype(np.int16)
outdata[samples_filled : samples_filled + n, 0] = ramped
self._update_playback_rms(ramped)

# Optionally report played bytes (ramped) to playback tracker
try:
Expand Down Expand Up @@ -183,6 +186,7 @@ def _output_callback(self, outdata, frames: int, time, status) -> None:
chunk_data = samples[self.chunk_position : self.chunk_position + samples_to_copy]
# More efficient: direct assignment for mono audio instead of reshape
outdata[samples_filled : samples_filled + samples_to_copy, 0] = chunk_data
self._update_playback_rms(chunk_data)
samples_filled += samples_to_copy
self.chunk_position += samples_to_copy

Expand Down Expand Up @@ -273,14 +277,6 @@ async def capture_audio(self) -> None:
read_size = int(SAMPLE_RATE * CHUNK_LENGTH_S)

try:
# Simple energy-based barge-in: if user speaks while audio is playing, interrupt.
def rms_energy(samples: np.ndarray[Any, np.dtype[Any]]) -> float:
if samples.size == 0:
return 0.0
# Normalize int16 to [-1, 1]
x = samples.astype(np.float32) / 32768.0
return float(np.sqrt(np.mean(x * x)))

while self.recording:
# Check if there's enough data to read
if self.audio_stream.read_available < read_size:
Expand All @@ -300,7 +296,13 @@ def rms_energy(samples: np.ndarray[Any, np.dtype[Any]]) -> float:
if assistant_playing:
# Compute RMS energy to detect speech while assistant is talking
samples = data.reshape(-1)
if rms_energy(samples) >= ENERGY_THRESHOLD:
mic_rms = self._compute_rms(samples)
# Require the mic to be louder than the echo of the assistant playback.
playback_gate = max(
ENERGY_THRESHOLD,
self.playback_rms * 0.6 + PLAYBACK_ECHO_MARGIN,
)
if mic_rms >= playback_gate:
# Locally flush queued assistant audio for snappier interruption.
self.interrupt_event.set()
await self.session.send_audio(audio_bytes)
Expand Down Expand Up @@ -356,6 +358,18 @@ async def _on_event(self, event: RealtimeSessionEvent) -> None:
except Exception as e:
print(f"Error processing event: {_truncate_str(str(e), 200)}")

def _compute_rms(self, samples: np.ndarray[Any, np.dtype[Any]]) -> float:
"""Compute RMS energy for int16 samples normalized to [-1, 1]."""
if samples.size == 0:
return 0.0
x = samples.astype(np.float32) / 32768.0
return float(np.sqrt(np.mean(x * x)))

def _update_playback_rms(self, samples: np.ndarray[Any, np.dtype[Any]]) -> None:
"""Keep a smoothed estimate of playback energy to filter out echo feedback."""
sample_rms = self._compute_rms(samples)
self.playback_rms = 0.9 * self.playback_rms + 0.1 * sample_rms


if __name__ == "__main__":
demo = NoUIDemo()
Expand Down