-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Open
Labels
Description
Describe the bug
When using the Agents SDK with a custom text-to-speech model provider (ElevenLabs) to stream audio in MP3 format, I encounter a ValueError whenever the length of an audio chunk is odd.
The error stems from the `_transform_audio_buffer` method of the `StreamedAudioResult` class.
Debug information
- Agents SDK version: 0.3.2
- Python version: 3.13
- NumPy version: 2.3.3
- elevenlabs version: 2.16.0
Repro steps
To run this script, you will need an ElevenLabs API key.
Audio format requested: MP3 with a sample rate of 44100 Hz and a bit rate of 96 kbps.
If one extra byte is appended whenever the chunk length is odd, the error disappears.
# voice.py
import asyncio
from collections.abc import AsyncIterator
import os
import numpy as np
from typing import Final
from agents.voice import (
TTSModelSettings,
VoicePipelineConfig
)
from agents.voice.events import (
VoiceStreamEventAudio,
VoiceStreamEventError,
VoiceStreamEventLifecycle,
)
from agents.voice import TTSModel, TTSModelSettings
from agents.voice.result import StreamedAudioResult
from elevenlabs.client import ElevenLabs
from datetime import datetime
from dotenv import load_dotenv
# NOTE(review): presumably loads variables such as ELEVENLABS_API_KEY from a
# local .env file into the environment before the model reads it - confirm.
load_dotenv()
# Model identifier: returned by ElevenlabsModel.model_name and sent as the
# `model_id` of the streaming text-to-speech request.
ELEVEN_MULTILINGUAL_V2: Final[str] = "eleven_multilingual_v2"
# just a random voice to use for the example (sent as `voice_id`)
DEFAULT_VOICE: Final[str] = "21m00Tcm4TlvDq8ikWAM"
class ElevenlabsModel(TTSModel):
    """TTS model that streams audio bytes from the ElevenLabs client."""

    def __init__(self) -> None:
        """Create the ElevenLabs client using the ELEVENLABS_API_KEY env var."""
        super().__init__()
        # Provide your ElevenLabs API key through the environment.
        self._elevenlabs = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

    @property
    def model_name(self) -> str:
        """Name of the TTS model used for every request."""
        return ELEVEN_MULTILINGUAL_V2

    def run(self, text: str, settings: TTSModelSettings) -> AsyncIterator[bytes]:
        """Convert ``text`` to speech and return the audio as an async stream.

        Args:
            text: The text to convert to audio.
            settings: TTS settings; accepted for interface compatibility but
                not read by this implementation.

        Returns:
            An async iterator yielding the audio chunks as ``bytes``.
        """
        voice = DEFAULT_VOICE
        # MP3 with sample rate 44100 and bit rate of 96
        audio_format = "mp3_44100_96"

        async def _async_stream() -> AsyncIterator[bytes]:
            chunks = self._elevenlabs.text_to_speech.stream(
                text=text,
                output_format=audio_format,
                voice_id=voice,
                model_id=self.model_name,
                language_code=None,
            )
            # One-chunk lookahead: each chunk is held back until its successor
            # arrives, so the last chunk is still in hand once the loop ends.
            pending = b""
            for raw in chunks:
                if pending:
                    yield pending
                    # Hand control back to the event loop between chunks.
                    await asyncio.sleep(0)
                pending = bytes(raw)

            if not pending:
                return

            if len(pending) % 2:
                # Despite the message, nothing is padded here: the chunk is
                # forwarded unchanged so the reported error still reproduces.
                # Appending b"\x00" at this point makes the error disappear.
                print(f"Warning: Final chunk has odd length {len(pending)}, padding")
            yield pending

        return _async_stream()
async def main():
    """Synthesize one sentence and write the streamed audio to an .mp3 file.

    Lifecycle events and errors are printed; the loop stops at the first error
    event or at any event of an unrecognized type.
    """
    result = StreamedAudioResult(
        ElevenlabsModel(),
        TTSModelSettings(),
        VoicePipelineConfig(),
    )

    # Timestamped file name in the current directory, to avoid overwrites.
    out_name = f"tts_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
    out_path = os.path.join(os.getcwd(), out_name)

    # Feed the text and close the turn/stream before draining the events.
    # NOTE: the literal below starts with a stray double quote (four quotes
    # open it), so the synthesized text begins with a '"' character.
    await result._add_text(""""Hey, how are you doing today?""")
    await result._turn_done()
    await result._done()

    with open(out_path, "wb") as sink:
        async for event in result.stream():
            if isinstance(event, VoiceStreamEventAudio):
                # raw MP3 bytes
                payload = np.ascontiguousarray(event.data).tobytes()
                sink.write(payload)
            elif isinstance(event, VoiceStreamEventLifecycle):
                print(event.event)
            elif isinstance(event, VoiceStreamEventError):
                print(event.error)
                break
            else:
                break
if __name__ == "__main__":
    # Script entry point: run the async repro to completion on a new event loop.
    asyncio.run(main())
Error Stack
Error streaming audio: buffer size must be a multiple of element size
Traceback (most recent call last):
File "voice.py", line 107, in <module>
asyncio.run(main())
File "<python stdlib>/asyncio/runners.py", line 195, in run
return runner.run(main)
File "<python stdlib>/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
File "<python stdlib>/asyncio/base_events.py", line 719, in run_until_complete
return future.result()
File "voice.py", line 92, in main
await output._turn_done()
File "<site-packages>/agents/voice/result.py", line 201, in _turn_done
await asyncio.gather(*self._tasks)
File "<site-packages>/agents/voice/result.py", line 168, in _stream_audio
raise e
File "<site-packages>/agents/voice/result.py", line 139, in _stream_audio
audio_np = self._transform_audio_buffer(buffer, self.tts_settings.dtype)
File "<site-packages>/agents/voice/result.py", line 91, in _transform_audio_buffer
np_array = np.frombuffer(b"".join(buffer), dtype=np.int16)
ValueError: buffer size must be a multiple of element size
Expected behavior
Audio should play without errors regardless of whether the chunk length in bytes is odd or even.