From 7b2f4e09c0f13ee9ca83c32aa580d6c0a31227bc Mon Sep 17 00:00:00 2001
From: Lucas Wang
Date: Sun, 19 Oct 2025 01:33:59 +0800
Subject: [PATCH 1/2] fix: Twilio audio jittering by buffering outgoing audio chunks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #1906

The Twilio realtime example was experiencing jittering/skipping sounds at the
beginning of every word. This was caused by sending small audio chunks from
OpenAI to Twilio too frequently, without buffering.

Changes:
- Added an outgoing audio buffer to accumulate audio chunks from OpenAI
- Buffer audio until 50ms worth of data has accumulated before sending to Twilio
- Flush remaining buffered audio on audio_end and audio_interrupted events
- Updated the periodic flush loop to handle both incoming and outgoing buffers
- Added documentation about audio buffering to the troubleshooting section

Technical details:
- Incoming audio (Twilio → OpenAI) was already buffered
- Now outgoing audio (OpenAI → Twilio) is buffered symmetrically as well
- Buffer size: 50ms chunks (400 bytes at the 8kHz sample rate; see the sizing
  sketch after the patch series)
- Prevents choppy playback by sending larger, consistent audio packets

Tested with:
- Linting: ruff check ✓
- Formatting: ruff format ✓
- Type checking: mypy ✓

Generated with Lucas Wang
---
 examples/realtime/twilio/README.md         |  1 +
 examples/realtime/twilio/twilio_handler.py | 90 ++++++++++++++++------
 2 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/examples/realtime/twilio/README.md b/examples/realtime/twilio/README.md
index e92f0681a..845330f3a 100644
--- a/examples/realtime/twilio/README.md
+++ b/examples/realtime/twilio/README.md
@@ -70,6 +70,7 @@ This example demonstrates how to connect the OpenAI Realtime API to a phone call
 
 - **WebSocket connection issues**: Ensure your ngrok URL is correct and publicly accessible
 - **Audio quality**: Twilio streams audio in mulaw format at 8kHz, which may affect quality
+- **Audio jittering/skipping**: The implementation includes audio buffering (50ms chunks) to reduce jittering at word boundaries. This buffers both incoming (Twilio → OpenAI) and outgoing (OpenAI → Twilio) audio for smoother playback.
 - **Latency**: Network latency between Twilio, your server, and OpenAI affects response time
 - **Logs**: Check the console output for detailed connection and error logs
 
diff --git a/examples/realtime/twilio/twilio_handler.py b/examples/realtime/twilio/twilio_handler.py
index 567015dfc..70b961c4b 100644
--- a/examples/realtime/twilio/twilio_handler.py
+++ b/examples/realtime/twilio/twilio_handler.py
@@ -52,9 +52,15 @@ def __init__(self, twilio_websocket: WebSocket):
         self.BUFFER_SIZE_BYTES = int(self.SAMPLE_RATE * self.CHUNK_LENGTH_S)  # 50ms worth of audio
 
         self._stream_sid: str | None = None
+
+        # Incoming audio buffer (from Twilio to OpenAI)
         self._audio_buffer: bytearray = bytearray()
         self._last_buffer_send_time = time.time()
 
+        # Outgoing audio buffer (from OpenAI to Twilio) - NEW
+        self._outgoing_audio_buffer: bytearray = bytearray()
+        self._last_outgoing_send_time = time.time()
+
         # Mark event tracking for playback
         self._mark_counter = 0
         self._mark_data: dict[
@@ -122,18 +128,10 @@ async def _twilio_message_loop(self) -> None:
     async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
         """Handle events from the realtime session."""
         if event.type == "audio":
-            base64_audio = base64.b64encode(event.audio.data).decode("utf-8")
-            await self.twilio_websocket.send_text(
-                json.dumps(
-                    {
-                        "event": "media",
-                        "streamSid": self._stream_sid,
-                        "media": {"payload": base64_audio},
-                    }
-                )
-            )
+            # Buffer outgoing audio to reduce jittering
+            self._outgoing_audio_buffer.extend(event.audio.data)
 
-            # Send mark event for playback tracking
+            # Store metadata for this audio chunk
             self._mark_counter += 1
             mark_id = str(self._mark_counter)
             self._mark_data[mark_id] = (
@@ -142,23 +140,24 @@ async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
                 len(event.audio.data),
             )
 
-            await self.twilio_websocket.send_text(
-                json.dumps(
-                    {
-                        "event": "mark",
-                        "streamSid": self._stream_sid,
-                        "mark": {"name": mark_id},
-                    }
-                )
-            )
+            # Send buffered audio if we have enough data (reduces jittering)
+            if len(self._outgoing_audio_buffer) >= self.BUFFER_SIZE_BYTES:
+                await self._flush_outgoing_audio_buffer(mark_id)
 
         elif event.type == "audio_interrupted":
             print("Sending audio interrupted to Twilio")
+            # Flush any remaining buffered audio before clearing
+            if self._outgoing_audio_buffer:
+                await self._flush_outgoing_audio_buffer(None)
             await self.twilio_websocket.send_text(
                 json.dumps({"event": "clear", "streamSid": self._stream_sid})
             )
+            self._outgoing_audio_buffer.clear()
         elif event.type == "audio_end":
-            print("Audio end")
+            print("Audio end - flushing remaining buffered audio")
+            # Flush remaining audio at the end
+            if self._outgoing_audio_buffer:
+                await self._flush_outgoing_audio_buffer(None)
         elif event.type == "raw_model_event":
             pass
         else:
@@ -246,19 +245,64 @@ async def _flush_audio_buffer(self) -> None:
         except Exception as e:
             print(f"Error sending buffered audio to OpenAI: {e}")
 
+    async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
+        """Send buffered audio to Twilio to reduce jittering."""
+        if not self._outgoing_audio_buffer:
+            return
+
+        try:
+            # Encode and send the buffered audio to Twilio
+            base64_audio = base64.b64encode(bytes(self._outgoing_audio_buffer)).decode("utf-8")
+            await self.twilio_websocket.send_text(
+                json.dumps(
+                    {
+                        "event": "media",
+                        "streamSid": self._stream_sid,
+                        "media": {"payload": base64_audio},
+                    }
+                )
+            )
+
+            # Send mark event for playback tracking (if provided)
+            if mark_id is not None:
+                await self.twilio_websocket.send_text(
+                    json.dumps(
+                        {
+                            "event": "mark",
+                            "streamSid": self._stream_sid,
+                            "mark": {"name": mark_id},
+                        }
+                    )
+                )
+
+            # Clear the buffer
+            self._outgoing_audio_buffer.clear()
+            self._last_outgoing_send_time = time.time()
+
+        except Exception as e:
+            print(f"Error sending buffered audio to Twilio: {e}")
+
     async def _buffer_flush_loop(self) -> None:
-        """Periodically flush audio buffer to prevent stale data."""
+        """Periodically flush audio buffers to prevent stale data."""
         try:
             while True:
                 await asyncio.sleep(self.CHUNK_LENGTH_S)  # Check every 50ms
 
-                # If buffer has data and it's been too long since last send, flush it
                 current_time = time.time()
+
+                # Flush incoming audio buffer (from Twilio to OpenAI) if stale
                 if (
                     self._audio_buffer
                     and current_time - self._last_buffer_send_time > self.CHUNK_LENGTH_S * 2
                 ):
                     await self._flush_audio_buffer()
 
+                # Flush outgoing audio buffer (from OpenAI to Twilio) if stale
+                if (
+                    self._outgoing_audio_buffer
+                    and current_time - self._last_outgoing_send_time > self.CHUNK_LENGTH_S * 2
+                ):
+                    await self._flush_outgoing_audio_buffer(None)
+
         except Exception as e:
             print(f"Error in buffer flush loop: {e}")

From ecf2c5718a555a62fb6274112536b6b099059fca Mon Sep 17 00:00:00 2001
From: Lucas Wang
Date: Sun, 19 Oct 2025 02:26:53 +0800
Subject: [PATCH 2/2] fix: prevent mark metadata leak in Twilio buffering (addresses Codex P1)

Critical fix for a memory leak identified by chatgpt-codex-connector:

Problem:
- Each audio chunk created a mark entry in _mark_data
- But only the last mark_id was sent to Twilio when flushing the buffer
- Earlier marks were never acknowledged, causing a memory leak
- The playback tracker couldn't track all sent audio

Solution:
- Track all mark_ids for buffered chunks in a _buffered_marks list
- Send mark events for ALL buffered chunks when flushing
- Clear _buffered_marks after flush to prevent reuse
- Extract mark creation logic into a _create_mark() method (addresses Copilot nitpick)

Additional improvements:
- Remove the '- NEW' comment suffix (Copilot suggestion)
- _flush_outgoing_audio_buffer now handles the empty-buffer check internally

This ensures proper playback tracking and prevents _mark_data from growing
indefinitely.
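
Illustrative sketch of the intended bookkeeping (simplified and standalone,
not the handler itself; send() below stands in for the Twilio websocket
send_text call, and streamSid is omitted):

    import asyncio
    import base64

    class MarkedBuffer:
        def __init__(self, send):
            self._send = send  # stand-in for the Twilio websocket send
            self._mark_counter = 0
            self._mark_data: dict[str, tuple[str, int, int]] = {}
            self._buffer = bytearray()
            self._buffered_marks: list[str] = []

        def _create_mark(self, item_id: str, content_index: int, byte_count: int) -> str:
            self._mark_counter += 1
            mark_id = str(self._mark_counter)
            self._mark_data[mark_id] = (item_id, content_index, byte_count)
            return mark_id

        def add_chunk(self, item_id: str, content_index: int, data: bytes) -> None:
            # Every buffered chunk keeps its own mark until the next flush.
            self._buffer.extend(data)
            self._buffered_marks.append(self._create_mark(item_id, content_index, len(data)))

        async def flush(self) -> None:
            if not self._buffer:
                return
            payload = base64.b64encode(bytes(self._buffer)).decode("utf-8")
            await self._send({"event": "media", "media": {"payload": payload}})
            # One mark event per buffered chunk, so no mark is left unacknowledged.
            for mark_id in self._buffered_marks:
                await self._send({"event": "mark", "mark": {"name": mark_id}})
            self._buffer.clear()
            self._buffered_marks.clear()

    async def demo() -> None:
        sent: list[dict] = []

        async def send(message: dict) -> None:
            sent.append(message)

        buffer = MarkedBuffer(send)
        buffer.add_chunk("item-1", 0, b"\x00" * 160)
        buffer.add_chunk("item-1", 0, b"\x00" * 240)
        await buffer.flush()
        print([m["event"] for m in sent])  # ['media', 'mark', 'mark']

    asyncio.run(demo())

Sending one mark per buffered chunk is what lets every _mark_data entry
eventually be acknowledged and cleaned up.
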
Generated with Lucas Wang
Co-Authored-By: Claude
---
 examples/realtime/twilio/twilio_handler.py | 40 +++++++++++++---------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/examples/realtime/twilio/twilio_handler.py b/examples/realtime/twilio/twilio_handler.py
index 70b961c4b..fee7b91a3 100644
--- a/examples/realtime/twilio/twilio_handler.py
+++ b/examples/realtime/twilio/twilio_handler.py
@@ -57,7 +57,7 @@ def __init__(self, twilio_websocket: WebSocket):
         self._audio_buffer: bytearray = bytearray()
         self._last_buffer_send_time = time.time()
 
-        # Outgoing audio buffer (from OpenAI to Twilio) - NEW
+        # Outgoing audio buffer (from OpenAI to Twilio)
         self._outgoing_audio_buffer: bytearray = bytearray()
         self._last_outgoing_send_time = time.time()
 
@@ -66,6 +66,8 @@ def __init__(self, twilio_websocket: WebSocket):
         self._mark_data: dict[
             str, tuple[str, int, int]
         ] = {}  # mark_id -> (item_id, content_index, byte_count)
+        # Track marks for buffered audio chunks
+        self._buffered_marks: list[str] = []  # mark_ids for chunks in current buffer
 
     async def start(self) -> None:
         """Start the session."""
@@ -132,32 +134,28 @@ async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
             self._outgoing_audio_buffer.extend(event.audio.data)
 
             # Store metadata for this audio chunk
-            self._mark_counter += 1
-            mark_id = str(self._mark_counter)
-            self._mark_data[mark_id] = (
-                event.audio.item_id,
-                event.audio.content_index,
-                len(event.audio.data),
+            mark_id = self._create_mark(
+                event.audio.item_id, event.audio.content_index, len(event.audio.data)
             )
+            self._buffered_marks.append(mark_id)
 
             # Send buffered audio if we have enough data (reduces jittering)
             if len(self._outgoing_audio_buffer) >= self.BUFFER_SIZE_BYTES:
-                await self._flush_outgoing_audio_buffer(mark_id)
+                await self._flush_outgoing_audio_buffer()
 
         elif event.type == "audio_interrupted":
             print("Sending audio interrupted to Twilio")
             # Flush any remaining buffered audio before clearing
-            if self._outgoing_audio_buffer:
-                await self._flush_outgoing_audio_buffer(None)
+            await self._flush_outgoing_audio_buffer()
             await self.twilio_websocket.send_text(
                 json.dumps({"event": "clear", "streamSid": self._stream_sid})
            )
             self._outgoing_audio_buffer.clear()
+            self._buffered_marks.clear()
         elif event.type == "audio_end":
             print("Audio end - flushing remaining buffered audio")
             # Flush remaining audio at the end
-            if self._outgoing_audio_buffer:
-                await self._flush_outgoing_audio_buffer(None)
+            await self._flush_outgoing_audio_buffer()
         elif event.type == "raw_model_event":
             pass
         else:
@@ -245,7 +243,14 @@ async def _flush_audio_buffer(self) -> None:
         except Exception as e:
             print(f"Error sending buffered audio to OpenAI: {e}")
 
-    async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
+    def _create_mark(self, item_id: str, content_index: int, byte_count: int) -> str:
+        """Create a new mark for tracking audio playback."""
+        self._mark_counter += 1
+        mark_id = str(self._mark_counter)
+        self._mark_data[mark_id] = (item_id, content_index, byte_count)
+        return mark_id
+
+    async def _flush_outgoing_audio_buffer(self) -> None:
         """Send buffered audio to Twilio to reduce jittering."""
         if not self._outgoing_audio_buffer:
             return
@@ -263,8 +268,8 @@ async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
                 )
             )
 
-            # Send mark event for playback tracking (if provided)
-            if mark_id is not None:
+            # Send mark events for all buffered chunks (for playback tracking)
+            for mark_id in self._buffered_marks:
                 await self.twilio_websocket.send_text(
                     json.dumps(
                         {
@@ -275,8 +280,9 @@ async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
                     )
                 )
 
-            # Clear the buffer
+            # Clear the buffer and marks
             self._outgoing_audio_buffer.clear()
+            self._buffered_marks.clear()
             self._last_outgoing_send_time = time.time()
 
         except Exception as e:
@@ -302,7 +308,7 @@ async def _buffer_flush_loop(self) -> None:
                     self._outgoing_audio_buffer
                     and current_time - self._last_outgoing_send_time > self.CHUNK_LENGTH_S * 2
                 ):
-                    await self._flush_outgoing_audio_buffer(None)
+                    await self._flush_outgoing_audio_buffer()
 
         except Exception as e:
             print(f"Error in buffer flush loop: {e}")
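
A minimal standalone sketch of the buffer sizing and staleness rules from
patch 1 (assumes 8kHz mulaw with one byte per sample, as in the Twilio
example; this is an illustration, not the handler itself):

    import time

    CHUNK_LENGTH_S = 0.05  # 50ms chunks
    SAMPLE_RATE = 8000     # Twilio mulaw: 8kHz, one byte per sample
    BUFFER_SIZE_BYTES = int(SAMPLE_RATE * CHUNK_LENGTH_S)  # 400 bytes per flush

    buffer = bytearray()
    last_send_time = time.time()

    def should_flush(now: float) -> bool:
        # Flush when a full 50ms chunk has accumulated, or when partial audio
        # has been waiting longer than two chunk lengths (the "stale" case
        # handled by the periodic flush loop).
        full = len(buffer) >= BUFFER_SIZE_BYTES
        stale = bool(buffer) and (now - last_send_time) > CHUNK_LENGTH_S * 2
        return full or stale

    # Five 80-byte deltas (10ms each) accumulate into one 400-byte send.
    for _ in range(5):
        buffer.extend(b"\x00" * 80)
    print(BUFFER_SIZE_BYTES, len(buffer), should_flush(time.time()))  # 400 400 True

At 8kHz mulaw, 400 bytes is exactly 50ms of audio, which is why the incoming
and outgoing paths can share the same BUFFER_SIZE_BYTES constant.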