From a0c221dd81775914abebfb9cb01e02a050fd4174 Mon Sep 17 00:00:00 2001
From: Lucas Wang
Date: Sun, 19 Oct 2025 01:49:24 +0800
Subject: [PATCH] fix: handle odd-length audio chunks in voice streaming (fixes #1824)

This change fixes a ValueError that occurred when audio chunks from TTS
providers (e.g., ElevenLabs MP3 streams) had an odd number of bytes. The
issue was in StreamedAudioResult._transform_audio_buffer which used
np.frombuffer with dtype=np.int16. Since int16 requires 2 bytes per
element, buffers with odd byte lengths would cause:
ValueError: buffer size must be a multiple of element size

Solution:
- Pad the combined buffer with a zero byte if it has odd length
- This ensures the buffer size is always a multiple of 2 bytes
- The padding has minimal audio impact (< 1 sample)

The fix applies to all TTS providers that may produce odd-length chunks,
not just ElevenLabs.

Testing:
- Linting (ruff check) - passed
- Type checking (mypy) - passed
- Formatting (ruff format) - passed

Generated with Lucas Wang

Co-Authored-By: Claude
---
 src/agents/voice/result.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/agents/voice/result.py b/src/agents/voice/result.py
index fea79902e..e2d24f50c 100644
--- a/src/agents/voice/result.py
+++ b/src/agents/voice/result.py
@@ -88,7 +88,16 @@ async def _add_error(self, error: Exception):
     def _transform_audio_buffer(
         self, buffer: list[bytes], output_dtype: npt.DTypeLike
     ) -> npt.NDArray[np.int16 | np.float32]:
-        np_array = np.frombuffer(b"".join(buffer), dtype=np.int16)
+        # Combine all chunks
+        combined_buffer = b"".join(buffer)
+
+        # Pad with a zero byte if the buffer length is odd
+        # This is needed because np.frombuffer with dtype=np.int16 requires
+        # the buffer size to be a multiple of 2 bytes
+        if len(combined_buffer) % 2 != 0:
+            combined_buffer += b"\x00"
+
+        np_array = np.frombuffer(combined_buffer, dtype=np.int16)
 
         if output_dtype == np.int16:
             return np_array