In [2]:
from dotenv import load_dotenv
load_dotenv()

True

### Direct REST call

Write stream to `.wav` file

In [4]:
import os
import requests
import io
import wave
import pyaudio

url = "https://api.openai.com/v1/audio/speech"
headers = {
    "Authorization": f'Bearer {os.getenv("OPENAI_API_KEY")}',
}

data = {
    "model": "tts-1",
    "input": "This is a test of the speech synthesis API",
    "voice": "shimmer",
    "response_format": "wav",
}

response = requests.post(url, headers=headers, json=data, stream=True)
if response.status_code == 200:
    buffer = io.BytesIO()
    for chunk in response.iter_content(chunk_size=1024):
        buffer.write(chunk)

with open("speech.wav", "wb") as f:
    f.write(buffer.getvalue())

Analyze `.wav` file

In [11]:
with wave.open('speech.wav', 'rb') as wf:
    p = pyaudio.PyAudio()

    print('format', p.get_format_from_width(wf.getsampwidth()))
    print('channels', wf.getnchannels())
    print('rate', wf.getframerate())

format 8
channels 1
rate 24000


Play `.wav` file

In [5]:
import wave
import pyaudio

CHUNK_SIZE = 1024

with wave.open('speech.wav', 'rb') as wf:
    # Instantiate PyAudio and initialize PortAudio system resources (1)
    p = pyaudio.PyAudio()

    print('format', p.get_format_from_width(wf.getsampwidth()))
    print('channels', wf.getnchannels())
    print('rate', wf.getframerate())

    # Open stream (2)
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # Play samples from the wave file (3)
    while len(data := wf.readframes(CHUNK_SIZE)):  # Requires Python 3.8+ for :=
        stream.write(data)

    # Close stream (4)
    stream.close()

    # Release PortAudio system resources (5)
    p.terminate()

format 8
channels 1
rate 24000


Real-time playback

In [14]:
import requests
import os
import pyaudio
from time import sleep

url = "https://api.openai.com/v1/audio/speech"
headers = {
    "Authorization": f'Bearer {os.getenv("OPENAI_API_KEY")}',
}

data = {
    "model": "tts-1",
    "input": "This is a test of the speech synthesis API",
    "voice": "shimmer",
    "response_format": "wav",
}

response = requests.post(url, headers=headers, json=data, stream=True)


# Author: @aaronepperly

my_buffer = bytes()                 # "my_buffer" gets loaded up from the http stream
my_1024 = bytes()                    # when "my_buffer" has enough, it gets sliced off into "my_1024"

if response.status_code == 200:
	p = pyaudio.PyAudio()
	is_first_chunk = True
	stream = p.open(format=8, channels=1, rate=24000, output=True)
	for chunk in response.iter_content(chunk_size=1024):
		if is_first_chunk:                                     # skip the header
			is_first_chunk = False
			continue
		my_buffer += chunk
		if len(my_buffer) >= 1024:
			my_1024 = my_buffer[0:1024]
			my_buffer = my_buffer[1024:]
		if len(my_1024):
			stream.write(my_1024)
			my_1024 = bytes()
	if len(my_buffer):                                           # Whatever is left in my_buffer, because it will likely
		stream.write(my_buffer)                    # be less than 1024 samples long
	sleep(1)
	stream.close()
	p.terminate()

In [20]:
import os
import requests
from time import sleep
import pyaudio
import wave

url = "https://api.openai.com/v1/audio/speech"
headers = {
    "Authorization": f'Bearer {os.getenv("OPENAI_API_KEY")}',
}

data = {
    "model": "tts-1",
    "input": "For longer inputs, there may be some noise which is caused by variable chunk sizes",
    "voice": "shimmer",
    "response_format": "wav",
}

response = requests.post(
    "https://api.openai.com/v1/audio/speech", headers=headers, json=data, stream=True
)

CHUNK_SIZE = 1024

if response.ok:
    with wave.open(response.raw, "rb") as wf:
        # Define callback for playback (1)
        def callback(in_data, frame_count, time_info, status):
            data = wf.readframes(frame_count)
            # If len(data) is less than requested frame_count, PyAudio automatically
            # assumes the stream is finished, and the stream stops.
            return (data, pyaudio.paContinue)

        # Instantiate PyAudio and initialize PortAudio system resources (2)
        p = pyaudio.PyAudio()

        # Open stream using callback (3)
        stream = p.open(
            format=p.get_format_from_width(wf.getsampwidth()),
            channels=wf.getnchannels(),
            rate=wf.getframerate(),
            output=True,
            stream_callback=callback,
        )

        # Wait for stream to finish (4)
        while stream.is_active():
            sleep(0.1)

        # Close the stream (5)
        stream.close()

        # Release PortAudio system resources (6)
        p.terminate()
else:
    response.raise_for_status()

In [49]:
it = response.iter_content(chunk_size=1024)

In [50]:
print(next(it))

b'RIFF\xff\xff\xff\xffWAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\xc0]\x00\x00\x80\xbb\x00\x00\x02\x00\x10\x00data\xff\xff\xff\xff'


In [51]:
reader = io.BufferedReader(response.raw, buffer_size=1024)

In [53]:
reader.read()

TypeError: object of type 'HTTPResponse' has no len()

In [None]:

# if response.status_code == 200:
#     p = pyaudio.PyAudio()
#     stream = p.open(format=8, channels=1, rate=24000, output=True)
#     for chunk in response.iter_content(chunk_size=1024):
#         if (len(chunk) == 44):
#             # header chunk, skip to prevent pop
#             continue
#         stream.write(chunk)

my_buffer = bytes()                 # "my_buffer" gets loaded up from the http stream
my_1024 = bytes()                    # when "my_buffer" has enough, it gets sliced off into "my_1024"

if response.status_code == 200:
	is_first_chunk = True
	stream = p.open(format=8, channels=1, rate=24000, output=True)
	for chunk in response.iter_content(chunk_size=1024):
		if is_first_chunk:                                     # skip the header
			is_first_chunk = False
			continue
		my_buffer += chunk
		if len(my_buffer) >= 1024:
			my_1024 = my_buffer[0:1024]
			my_buffer = my_buffer[1024:]
		if len(my_1024):
			stream.write(my_1024)
			my_1024 = bytes()
	# if len(my_buffer):                                           # Whatever is left in my_buffer, because it will likely
	# 	stream.write(my_buffer)                    # be less than 1024 samples long
	stream.close()

In [2]:
# Buffers
import io

class ByteBuffer:
    def __init__(self, capacity):
        self.buffer = io.BytesIO()
        self.capacity = capacity

    def write(self, data):
        remaining_space = self.capacity - self.buffer.tell()
        remaining_data = data

        while len(remaining_data) > remaining_space:
            self.buffer.write(remaining_data[:remaining_space])
            self.print_and_clear()
            remaining_data = remaining_data[remaining_space:]
            remaining_space = self.capacity

        self.buffer.write(remaining_data)

    def print_and_clear(self):
        print(self.buffer.getvalue())
        self.buffer = io.BytesIO()

# Usage:
buf = ByteBuffer(8)
buf.write(b"Hello, World!")
buf.print_and_clear()

b'Hello, W'
b'orld!'


### OpenAI libary

In [2]:
from openai import OpenAI
client = OpenAI()

response = client.audio.speech.create(
  model="tts-1",
  voice="alloy",
  input="Today is a wonderful day to build something people love!"
)

response.stream_to_file("speech.mp3")

  response.stream_to_file("speech.mp3")


In [28]:
from openai import OpenAI

client = OpenAI()

with client.audio.speech.with_streaming_response.create(
    model="tts-1",
    voice="alloy",
    input="""I see skies of blue and clouds of white
             The bright blessed days, the dark sacred nights
             And I think to myself
             What a wonderful world""",
) as response:
    # This doesn't seem to be *actually* streaming, it just creates the file
    # and then doesn't update it until the whole generation is finished
    response.stream_to_file("speech.mp3")

22:14:49
