Skip to content

'ValueError: buffer size must be a multiple of element size' when MP3 audio chunks have odd byte length #1824

@ritec03

Description

@ritec03

Describe the bug

When using the Agents SDK with a custom Text-to-Speech model provider (ElevenLabs) for audio streaming in MP3 format, I encounter a ValueError when the length of an audio chunk is odd.

The error stems from _transform_audio_buffer in the StreamedAudioResult class.

Debug information

  • Agents SDK version: 0.3.2
  • Python version: 3.13
  • NumPy version: 2.3.3
  • elevenlabs version: 2.16.0

Repro steps

To run this script, you will need an ElevenLabs API key.
Audio format requested: MP3 with a sample rate of 44100 Hz and a bit rate of 96 kbps.
When a padding byte is appended to a chunk whose length is odd, the error disappears.

# voice.py
import asyncio
from collections.abc import AsyncIterator
import os
import numpy as np
from typing import Final
from agents.voice import (
    TTSModelSettings,
    VoicePipelineConfig
)
from agents.voice.events import (
    VoiceStreamEventAudio,
    VoiceStreamEventError,
    VoiceStreamEventLifecycle,
)
from agents.voice import TTSModel, TTSModelSettings
from agents.voice.result import StreamedAudioResult
from elevenlabs.client import ElevenLabs
from datetime import datetime
from dotenv import load_dotenv

# Load environment variables via python-dotenv (presumably from a local .env
# file) before ElevenlabsModel.__init__ reads ELEVENLABS_API_KEY with os.getenv.
load_dotenv()

# ElevenLabs model identifier; returned by ElevenlabsModel.model_name and
# passed as model_id to the streaming call.
ELEVEN_MULTILINGUAL_V2: Final[str] = "eleven_multilingual_v2"
# just a random voice to use for the example
DEFAULT_VOICE: Final[str] = "21m00Tcm4TlvDq8ikWAM"

class ElevenlabsModel(TTSModel):
    """Custom TTS provider that streams audio from the ElevenLabs client.

    The bytes received from ElevenLabs are forwarded unchanged; in particular
    the final chunk is NOT padded, which is what reproduces the reported error.
    """

    def __init__(self) -> None:
        super().__init__()
        # provide your ElevenLabs API key via the ELEVENLABS_API_KEY env variable
        api_key = os.getenv("ELEVENLABS_API_KEY")
        self._elevenlabs = ElevenLabs(api_key=api_key)

    @property
    def model_name(self) -> str:
        """Identifier of the ElevenLabs model used for synthesis."""
        return ELEVEN_MULTILINGUAL_V2

    def run(self, text: str, settings: TTSModelSettings) -> AsyncIterator[bytes]:
        """Stream the synthesized audio for ``text`` as raw byte chunks.

        Args:
            text: The text to convert to audio.
            settings: TTS settings passed by the caller; not consulted here.

        Returns:
            An async iterator of audio bytes.
        """
        voice = DEFAULT_VOICE
        # MP3 with sample rate 44100 and bit rate of 96
        audio_format = "mp3_44100_96"

        async def _async_stream() -> AsyncIterator[bytes]:
            source = self._elevenlabs.text_to_speech.stream(
                text=text,
                output_format=audio_format,
                voice_id=voice,
                model_id=self.model_name,
                language_code=None,
            )

            # Chunks are emitted one iteration late so that the last one is
            # still in hand after the loop and can be inspected separately.
            held: bytes = b""
            for piece in source:
                if held:
                    yield held
                # Hand control back to the event loop between chunks.
                await asyncio.sleep(0)
                held = bytes(piece)

            if not held:
                return

            if len(held) % 2:
                print(f"Warning: Final chunk has odd length {len(held)}, padding")
                # Padding is intentionally left disabled to reproduce the bug:
                # held += b"\x00"  # If we pad with zero byte, the error disappears
            yield held

        return _async_stream()

async def main():
    """Synthesize one sentence through StreamedAudioResult and dump the audio to a file.

    Lifecycle events are printed; the first error event (or any unrecognised
    event) stops consumption of the stream.
    """
    output = StreamedAudioResult(
        ElevenlabsModel(),
        TTSModelSettings(),
        VoicePipelineConfig(),
    )

    # Timestamped file in the current working directory, so reruns never overwrite.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"tts_{stamp}.mp3"
    path = os.path.join(os.getcwd(), filename)

    # Feed the text and close the turn/stream before reading any events.
    # NOTE(review): the literal opens with four quote characters, so the text
    # sent to the model starts with a '"' — confirm whether that is intended.
    await output._add_text(""""Hey, how are you doing today?""")
    await output._turn_done()
    await output._done()

    with open(path, "wb") as f:
        async for event in output.stream():
            if isinstance(event, VoiceStreamEventAudio):
                # raw MP3 bytes
                f.write(np.ascontiguousarray(event.data).tobytes())
            elif isinstance(event, VoiceStreamEventLifecycle):
                print(event.event)
            elif isinstance(event, VoiceStreamEventError):
                print(event.error)
                break
            else:
                break

# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

Error Stack

Error streaming audio: buffer size must be a multiple of element size
Traceback (most recent call last):
  File "voice.py", line 107, in <module>
    asyncio.run(main())
  File "<python stdlib>/asyncio/runners.py", line 195, in run
    return runner.run(main)
  File "<python stdlib>/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
  File "<python stdlib>/asyncio/base_events.py", line 719, in run_until_complete
    return future.result()
  File "voice.py", line 92, in main
    await output._turn_done()
  File "<site-packages>/agents/voice/result.py", line 201, in _turn_done
    await asyncio.gather(*self._tasks)
  File "<site-packages>/agents/voice/result.py", line 168, in _stream_audio
    raise e
  File "<site-packages>/agents/voice/result.py", line 139, in _stream_audio
    audio_np = self._transform_audio_buffer(buffer, self.tts_settings.dtype)
  File "<site-packages>/agents/voice/result.py", line 91, in _transform_audio_buffer
    np_array = np.frombuffer(b"".join(buffer), dtype=np.int16)
ValueError: buffer size must be a multiple of element size

Expected behavior

Audio should play without errors regardless of whether a chunk's byte length is odd or even.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions