From 7b2f4e09c0f13ee9ca83c32aa580d6c0a31227bc Mon Sep 17 00:00:00 2001
From: Lucas Wang
Date: Sun, 19 Oct 2025 01:33:59 +0800
Subject: [PATCH 1/2] fix: Twilio audio jittering by buffering outgoing audio chunks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #1906

The Twilio realtime example was experiencing jittering/skipping sounds at the
beginning of every word. This was caused by sending small audio chunks from
OpenAI to Twilio too frequently, without buffering.

Changes:
- Added an outgoing audio buffer to accumulate audio chunks from OpenAI
- Buffer audio until 50ms worth of data has accumulated before sending to Twilio
- Flush remaining buffered audio on audio_end and audio_interrupted events
- Updated the periodic flush loop to handle both incoming and outgoing buffers
- Added documentation about audio buffering to the troubleshooting section

Technical details:
- Incoming audio (Twilio → OpenAI) was already buffered
- Now outgoing audio (OpenAI → Twilio) is buffered symmetrically as well
- Buffer size: 50ms chunks (400 bytes at the 8kHz sample rate; see the sizing
  sketch after the patch series)
- Prevents choppy playback by sending larger, consistent audio packets

Tested with:
- Linting: ruff check ✓
- Formatting: ruff format ✓
- Type checking: mypy ✓

Generated with Lucas Wang
---
 examples/realtime/twilio/README.md         |  1 +
 examples/realtime/twilio/twilio_handler.py | 90 ++++++++++++++++------
 2 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/examples/realtime/twilio/README.md b/examples/realtime/twilio/README.md
index e92f0681a..845330f3a 100644
--- a/examples/realtime/twilio/README.md
+++ b/examples/realtime/twilio/README.md
@@ -70,6 +70,7 @@ This example demonstrates how to connect the OpenAI Realtime API to a phone call
 
 - **WebSocket connection issues**: Ensure your ngrok URL is correct and publicly accessible
 - **Audio quality**: Twilio streams audio in mulaw format at 8kHz, which may affect quality
+- **Audio jittering/skipping**: The implementation includes audio buffering (50ms chunks) to reduce jittering at word boundaries. This buffers both incoming (Twilio → OpenAI) and outgoing (OpenAI → Twilio) audio for smoother playback.
 - **Latency**: Network latency between Twilio, your server, and OpenAI affects response time
 - **Logs**: Check the console output for detailed connection and error logs
 
diff --git a/examples/realtime/twilio/twilio_handler.py b/examples/realtime/twilio/twilio_handler.py
index 567015dfc..70b961c4b 100644
--- a/examples/realtime/twilio/twilio_handler.py
+++ b/examples/realtime/twilio/twilio_handler.py
@@ -52,9 +52,15 @@ def __init__(self, twilio_websocket: WebSocket):
         self.BUFFER_SIZE_BYTES = int(self.SAMPLE_RATE * self.CHUNK_LENGTH_S)  # 50ms worth of audio
 
         self._stream_sid: str | None = None
+
+        # Incoming audio buffer (from Twilio to OpenAI)
         self._audio_buffer: bytearray = bytearray()
         self._last_buffer_send_time = time.time()
 
+        # Outgoing audio buffer (from OpenAI to Twilio) - NEW
+        self._outgoing_audio_buffer: bytearray = bytearray()
+        self._last_outgoing_send_time = time.time()
+
         # Mark event tracking for playback
         self._mark_counter = 0
         self._mark_data: dict[
@@ -122,18 +128,10 @@ async def _twilio_message_loop(self) -> None:
     async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
         """Handle events from the realtime session."""
         if event.type == "audio":
-            base64_audio = base64.b64encode(event.audio.data).decode("utf-8")
-            await self.twilio_websocket.send_text(
-                json.dumps(
-                    {
-                        "event": "media",
-                        "streamSid": self._stream_sid,
-                        "media": {"payload": base64_audio},
-                    }
-                )
-            )
+            # Buffer outgoing audio to reduce jittering
+            self._outgoing_audio_buffer.extend(event.audio.data)
 
-            # Send mark event for playback tracking
+            # Store metadata for this audio chunk
             self._mark_counter += 1
             mark_id = str(self._mark_counter)
             self._mark_data[mark_id] = (
@@ -142,23 +140,24 @@ async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
                 len(event.audio.data),
             )
 
-            await self.twilio_websocket.send_text(
-                json.dumps(
-                    {
-                        "event": "mark",
-                        "streamSid": self._stream_sid,
-                        "mark": {"name": mark_id},
-                    }
-                )
-            )
+            # Send buffered audio if we have enough data (reduces jittering)
+            if len(self._outgoing_audio_buffer) >= self.BUFFER_SIZE_BYTES:
+                await self._flush_outgoing_audio_buffer(mark_id)
 
         elif event.type == "audio_interrupted":
             print("Sending audio interrupted to Twilio")
+            # Flush any remaining buffered audio before clearing
+            if self._outgoing_audio_buffer:
+                await self._flush_outgoing_audio_buffer(None)
             await self.twilio_websocket.send_text(
                 json.dumps({"event": "clear", "streamSid": self._stream_sid})
             )
+            self._outgoing_audio_buffer.clear()
         elif event.type == "audio_end":
-            print("Audio end")
+            print("Audio end - flushing remaining buffered audio")
+            # Flush remaining audio at the end
+            if self._outgoing_audio_buffer:
+                await self._flush_outgoing_audio_buffer(None)
         elif event.type == "raw_model_event":
             pass
         else:
@@ -246,19 +245,64 @@ async def _flush_audio_buffer(self) -> None:
         except Exception as e:
             print(f"Error sending buffered audio to OpenAI: {e}")
 
+    async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
+        """Send buffered audio to Twilio to reduce jittering."""
+        if not self._outgoing_audio_buffer:
+            return
+
+        try:
+            # Encode and send the buffered audio to Twilio
+            base64_audio = base64.b64encode(bytes(self._outgoing_audio_buffer)).decode("utf-8")
+            await self.twilio_websocket.send_text(
+                json.dumps(
+                    {
+                        "event": "media",
+                        "streamSid": self._stream_sid,
+                        "media": {"payload": base64_audio},
+                    }
+                )
+            )
+
+            # Send mark event for playback tracking (if provided)
+            if mark_id is not None:
+                await self.twilio_websocket.send_text(
+                    json.dumps(
+                        {
+                            "event": "mark",
+                            "streamSid": self._stream_sid,
+                            "mark": {"name": mark_id},
+                        }
+                    )
+                )
+
+            # Clear the buffer
+            self._outgoing_audio_buffer.clear()
+            self._last_outgoing_send_time = time.time()
+
+        except Exception as e:
+            print(f"Error sending buffered audio to Twilio: {e}")
+
     async def _buffer_flush_loop(self) -> None:
-        """Periodically flush audio buffer to prevent stale data."""
+        """Periodically flush audio buffers to prevent stale data."""
         try:
             while True:
                 await asyncio.sleep(self.CHUNK_LENGTH_S)  # Check every 50ms
 
-                # If buffer has data and it's been too long since last send, flush it
                 current_time = time.time()
+
+                # Flush incoming audio buffer (from Twilio to OpenAI) if stale
                 if (
                     self._audio_buffer
                     and current_time - self._last_buffer_send_time > self.CHUNK_LENGTH_S * 2
                 ):
                     await self._flush_audio_buffer()
 
+                # Flush outgoing audio buffer (from OpenAI to Twilio) if stale
+                if (
+                    self._outgoing_audio_buffer
+                    and current_time - self._last_outgoing_send_time > self.CHUNK_LENGTH_S * 2
+                ):
+                    await self._flush_outgoing_audio_buffer(None)
+
         except Exception as e:
             print(f"Error in buffer flush loop: {e}")

From ecf2c5718a555a62fb6274112536b6b099059fca Mon Sep 17 00:00:00 2001
From: Lucas Wang
Date: Sun, 19 Oct 2025 02:26:53 +0800
Subject: [PATCH 2/2] fix: prevent mark metadata leak in Twilio buffering (addresses Codex P1)

Critical fix for a memory leak identified by chatgpt-codex-connector:

Problem:
- Each audio chunk created a mark entry in _mark_data
- But only the last mark_id was sent to Twilio when flushing the buffer
- Earlier marks were never acknowledged, causing a memory leak
- The playback tracker couldn't track all sent audio

Solution:
- Track all mark_ids for buffered chunks in a _buffered_marks list
- Send mark events for ALL buffered chunks when flushing
- Clear _buffered_marks after flush to prevent reuse
- Extract mark creation logic into a _create_mark() method (addresses Copilot nitpick)

Additional improvements:
- Remove the '- NEW' comment suffix (Copilot suggestion)
- _flush_outgoing_audio_buffer now handles the empty-buffer check internally

This ensures proper playback tracking and prevents _mark_data from growing
indefinitely.
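
Illustrative sketch of the intended bookkeeping (simplified and standalone,
not the handler itself; send() below stands in for the Twilio websocket
send_text call, and streamSid is omitted):

    import asyncio
    import base64

    class MarkedBuffer:
        def __init__(self, send):
            self._send = send  # stand-in for the Twilio websocket send
            self._mark_counter = 0
            self._mark_data: dict[str, tuple[str, int, int]] = {}
            self._buffer = bytearray()
            self._buffered_marks: list[str] = []

        def _create_mark(self, item_id: str, content_index: int, byte_count: int) -> str:
            self._mark_counter += 1
            mark_id = str(self._mark_counter)
            self._mark_data[mark_id] = (item_id, content_index, byte_count)
            return mark_id

        def add_chunk(self, item_id: str, content_index: int, data: bytes) -> None:
            # Every buffered chunk keeps its own mark until the next flush.
            self._buffer.extend(data)
            self._buffered_marks.append(self._create_mark(item_id, content_index, len(data)))

        async def flush(self) -> None:
            if not self._buffer:
                return
            payload = base64.b64encode(bytes(self._buffer)).decode("utf-8")
            await self._send({"event": "media", "media": {"payload": payload}})
            # One mark event per buffered chunk, so no mark is left unacknowledged.
            for mark_id in self._buffered_marks:
                await self._send({"event": "mark", "mark": {"name": mark_id}})
            self._buffer.clear()
            self._buffered_marks.clear()

    async def demo() -> None:
        sent: list[dict] = []

        async def send(message: dict) -> None:
            sent.append(message)

        buffer = MarkedBuffer(send)
        buffer.add_chunk("item-1", 0, b"\x00" * 160)
        buffer.add_chunk("item-1", 0, b"\x00" * 240)
        await buffer.flush()
        print([m["event"] for m in sent])  # ['media', 'mark', 'mark']

    asyncio.run(demo())

Sending one mark per buffered chunk is what lets every _mark_data entry
eventually be acknowledged and cleaned up.
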
Generated with Lucas Wang
Co-Authored-By: Claude
---
 examples/realtime/twilio/twilio_handler.py | 40 +++++++++++++---------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/examples/realtime/twilio/twilio_handler.py b/examples/realtime/twilio/twilio_handler.py
index 70b961c4b..fee7b91a3 100644
--- a/examples/realtime/twilio/twilio_handler.py
+++ b/examples/realtime/twilio/twilio_handler.py
@@ -57,7 +57,7 @@ def __init__(self, twilio_websocket: WebSocket):
         self._audio_buffer: bytearray = bytearray()
         self._last_buffer_send_time = time.time()
 
-        # Outgoing audio buffer (from OpenAI to Twilio) - NEW
+        # Outgoing audio buffer (from OpenAI to Twilio)
         self._outgoing_audio_buffer: bytearray = bytearray()
         self._last_outgoing_send_time = time.time()
 
@@ -66,6 +66,8 @@ def __init__(self, twilio_websocket: WebSocket):
         self._mark_data: dict[
             str, tuple[str, int, int]
         ] = {}  # mark_id -> (item_id, content_index, byte_count)
+        # Track marks for buffered audio chunks
+        self._buffered_marks: list[str] = []  # mark_ids for chunks in current buffer
 
     async def start(self) -> None:
         """Start the session."""
@@ -132,32 +134,28 @@ async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
             self._outgoing_audio_buffer.extend(event.audio.data)
 
             # Store metadata for this audio chunk
-            self._mark_counter += 1
-            mark_id = str(self._mark_counter)
-            self._mark_data[mark_id] = (
-                event.audio.item_id,
-                event.audio.content_index,
-                len(event.audio.data),
+            mark_id = self._create_mark(
+                event.audio.item_id, event.audio.content_index, len(event.audio.data)
             )
+            self._buffered_marks.append(mark_id)
 
             # Send buffered audio if we have enough data (reduces jittering)
             if len(self._outgoing_audio_buffer) >= self.BUFFER_SIZE_BYTES:
-                await self._flush_outgoing_audio_buffer(mark_id)
+                await self._flush_outgoing_audio_buffer()
 
         elif event.type == "audio_interrupted":
             print("Sending audio interrupted to Twilio")
             # Flush any remaining buffered audio before clearing
-            if self._outgoing_audio_buffer:
-                await self._flush_outgoing_audio_buffer(None)
+            await self._flush_outgoing_audio_buffer()
             await self.twilio_websocket.send_text(
                 json.dumps({"event": "clear", "streamSid": self._stream_sid})
            )
             self._outgoing_audio_buffer.clear()
+            self._buffered_marks.clear()
         elif event.type == "audio_end":
             print("Audio end - flushing remaining buffered audio")
             # Flush remaining audio at the end
-            if self._outgoing_audio_buffer:
-                await self._flush_outgoing_audio_buffer(None)
+            await self._flush_outgoing_audio_buffer()
         elif event.type == "raw_model_event":
             pass
         else:
@@ -245,7 +243,14 @@ async def _flush_audio_buffer(self) -> None:
         except Exception as e:
             print(f"Error sending buffered audio to OpenAI: {e}")
 
-    async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
+    def _create_mark(self, item_id: str, content_index: int, byte_count: int) -> str:
+        """Create a new mark for tracking audio playback."""
+        self._mark_counter += 1
+        mark_id = str(self._mark_counter)
+        self._mark_data[mark_id] = (item_id, content_index, byte_count)
+        return mark_id
+
+    async def _flush_outgoing_audio_buffer(self) -> None:
         """Send buffered audio to Twilio to reduce jittering."""
         if not self._outgoing_audio_buffer:
             return
@@ -263,8 +268,8 @@ async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
                 )
             )
 
-            # Send mark event for playback tracking (if provided)
-            if mark_id is not None:
+            # Send mark events for all buffered chunks (for playback tracking)
+            for mark_id in self._buffered_marks:
                 await self.twilio_websocket.send_text(
                     json.dumps(
                         {
@@ -275,8 +280,9 @@ async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
                     )
                 )
 
-            # Clear the buffer
+            # Clear the buffer and marks
             self._outgoing_audio_buffer.clear()
+            self._buffered_marks.clear()
             self._last_outgoing_send_time = time.time()
 
         except Exception as e:
@@ -302,7 +308,7 @@ async def _buffer_flush_loop(self) -> None:
                     self._outgoing_audio_buffer
                     and current_time - self._last_outgoing_send_time > self.CHUNK_LENGTH_S * 2
                 ):
-                    await self._flush_outgoing_audio_buffer(None)
+                    await self._flush_outgoing_audio_buffer()
 
         except Exception as e:
             print(f"Error in buffer flush loop: {e}")
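
A minimal standalone sketch of the buffer sizing and staleness rules from
patch 1 (assumes 8kHz mulaw with one byte per sample, as in the Twilio
example; this is an illustration, not the handler itself):

    import time

    CHUNK_LENGTH_S = 0.05  # 50ms chunks
    SAMPLE_RATE = 8000     # Twilio mulaw: 8kHz, one byte per sample
    BUFFER_SIZE_BYTES = int(SAMPLE_RATE * CHUNK_LENGTH_S)  # 400 bytes per flush

    buffer = bytearray()
    last_send_time = time.time()

    def should_flush(now: float) -> bool:
        # Flush when a full 50ms chunk has accumulated, or when partial audio
        # has been waiting longer than two chunk lengths (the "stale" case
        # handled by the periodic flush loop).
        full = len(buffer) >= BUFFER_SIZE_BYTES
        stale = bool(buffer) and (now - last_send_time) > CHUNK_LENGTH_S * 2
        return full or stale

    # Five 80-byte deltas (10ms each) accumulate into one 400-byte send.
    for _ in range(5):
        buffer.extend(b"\x00" * 80)
    print(BUFFER_SIZE_BYTES, len(buffer), should_flush(time.time()))  # 400 400 True

At 8kHz mulaw, 400 bytes is exactly 50ms of audio, which is why the incoming
and outgoing paths can share the same BUFFER_SIZE_BYTES constant.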