Test a tweaked example from the [documentation](https://platform.openai.com/docs/guides/audio) to ensure that authentication works and that the generated file can be saved to a distinct folder.


In [1]:
import base64
import hashlib
import os
import subprocess

from dotenv import load_dotenv
from openai import OpenAI
from rich.console import Console
from rich.table import Table

load_dotenv()  # OPENAI_API_KEY="sk-[...]"

client = OpenAI()

# Folder that every generated audio file is written into. Create it up front
# so that later open(..., "wb") calls do not fail on a fresh checkout.
output_path = "outputs"
os.makedirs(output_path, exist_ok=True)

In [2]:
# Minimal round trip: request a spoken answer, report the token usage, and
# write the decoded audio into the output folder.
dog_question = {
    "role": "user",
    "content": "Is a golden retriever a good family dog?",
}
audio_options = {"voice": "alloy", "format": "wav"}

completion = client.chat.completions.create(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio=audio_options,
    messages=[dog_question],
)

print(completion.usage)

# The audio payload is base64-encoded on the response message.
wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)
dog_wav_path = os.path.join(output_path, "dog.wav")
with open(dog_wav_path, "wb") as f:
    f.write(wav_bytes)

CompletionUsage(completion_tokens=357, prompt_tokens=17, total_tokens=374, completion_tokens_details=CompletionTokensDetails(audio_tokens=284, reasoning_tokens=0, text_tokens=73), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0, text_tokens=17, image_tokens=0))


## Functionalize

Given a `system` prompt, a `text` input, and a `temperature`, generate audio with those parameters. Due to the high cost of generation, we should also track spend.

Additionally, compress the audio into an `MP3` with `ffmpeg` for future web display.


In [12]:
# https://openai.com/api/pricing/
# Per-token rates in dollars: each is a listed price divided by one million.
_PRICE_DIVISOR = 1e6

# Prompt (input) side.
TEXT_INPUT_COST = 2.50 / _PRICE_DIVISOR
AUDIO_INPUT_COST = 100.00 / _PRICE_DIVISOR

# Completion (output) side.
TEXT_OUTPUT_COST = 10.00 / _PRICE_DIVISOR
AUDIO_OUTPUT_COST = 200.00 / _PRICE_DIVISOR


def query_cost(completion, console):
    """Print a token-count and cost breakdown table for one completion.

    Args:
        completion: Chat completion response. Its ``usage`` must expose
            ``prompt_tokens_details`` and ``completion_tokens_details``,
            each with ``text_tokens`` and ``audio_tokens``.
        console: Object whose ``print`` method renders the table (a rich
            ``Console`` in this notebook).
    """
    usage = completion.usage
    prompt_details = usage.prompt_tokens_details
    output_details = usage.completion_tokens_details

    # (I/O direction, token type, token count, price per token).
    # A count reported as None is treated as zero so the multiplication
    # below cannot raise a TypeError.
    breakdown = [
        ("Input", "Text", prompt_details.text_tokens or 0, TEXT_INPUT_COST),
        ("Input", "Audio", prompt_details.audio_tokens or 0, AUDIO_INPUT_COST),
        ("Output", "Text", output_details.text_tokens or 0, TEXT_OUTPUT_COST),
        ("Output", "Audio", output_details.audio_tokens or 0, AUDIO_OUTPUT_COST),
    ]

    # Compute every line cost first: the total is needed for the column
    # footer, which must be set before any rows are added.
    costed_rows = []
    total_cost = 0.0
    for direction, token_type, count, price in breakdown:
        cost = count * price
        total_cost += cost
        costed_rows.append((direction, token_type, str(count), f"${cost:.3f}"))

    table = Table(title="Token/Cost Breakdown", show_footer=True)
    # The closing tag is "[/b]"; a second "[b]" would open another bold span.
    table.add_column("I/O", footer="[b]Total[/b]")
    table.add_column("Token Type")
    table.add_column("Count")
    table.add_column(
        "Cost", footer=f"[b bright_red]${total_cost:.3f}[/b bright_red]"
    )

    for row in costed_rows:
        table.add_row(*row)

    console.print(table)

In [13]:
# Default system prompt for gen_audio. The leading/trailing newlines from the
# triple-quoted literal are removed with .strip() before the prompt is sent.
BASE_SYSTEM = """
You are an expert voice actor specializing in silly voices. Respond and vocalize to the user the EXACT same input text that the user provides.
"""


def gen_audio(
    system: str = BASE_SYSTEM,
    text: str = "I am a teapot!",
    temperature: float = 0.8,
    voice: str = "alloy",
):
    """Generate speech for ``text`` and save it as a 64 kbps MP3.

    Prints the transcript and a token/cost table, writes the returned WAV
    audio into ``output_path``, converts it to MP3 with ffmpeg, and removes
    the intermediate WAV once the conversion has succeeded.

    Args:
        system: System prompt; stripped of surrounding whitespace before
            being sent.
        text: User message; stripped of surrounding whitespace before
            being sent.
        temperature: Sampling temperature forwarded to the API.
        voice: Voice name forwarded to the API.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status. The ``.wav`` file is left on disk in that case.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": voice, "format": "wav"},
        messages=[
            {"role": "system", "content": system.strip()},
            {"role": "user", "content": text.strip()},
        ],
        temperature=temperature,
    )
    audio = completion.choices[0].message.audio

    c = Console(width=60)
    c.print(f"[b]Transcript[/b]: {audio.transcript}")
    query_cost(completion, c)

    # Create output file name: this will be a) distinct and
    # b) deterministic with the given parameters.
    # The hash is taken over the arguments exactly as passed (unstripped).
    m = hashlib.sha256()
    m.update(f"{system}_{text}_{temperature}_{voice}".encode())
    filename = m.hexdigest()[:16]
    out_filename_base = os.path.join(output_path, filename)
    wav_path = out_filename_base + ".wav"
    mp3_path = out_filename_base + ".mp3"

    # Make sure the destination folder exists before writing into it.
    os.makedirs(output_path, exist_ok=True)

    wav_bytes = base64.b64decode(audio.data)
    with open(wav_path, "wb") as f:
        f.write(wav_bytes)

    # Save a compressed MP3:
    # MP3 is required for displaying on the web on all browsers
    # 64kbps is enough compression since source audio is lower-quality
    #
    # check=True raises on a failed conversion, so the delete below is never
    # reached and the (expensive to regenerate) WAV is preserved.
    subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-i",
            wav_path,
            "-acodec",
            "libmp3lame",
            "-b:a",
            "64k",
            mp3_path,
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
        check=True,
    )

    # delete original .wav to save space after conversion
    os.remove(wav_path)

    # The path is built outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a syntax error before Python 3.12.
    c.print(f"Audio file saved at [b]{mp3_path}[/b].")

In [14]:
# Smoke test: every parameter left at its default.
gen_audio()

In [15]:
# NOTE(review): the URL below does not appear to be the source of this text;
# the question is the same one used in the first test cell. Confirm and update.
# https://openai.com/index/better-language-models/
BASE_TEXT = """
Is a golden retriever a good family dog?
"""

# Only the text is overridden; system prompt, temperature and voice are defaults.
gen_audio(text=BASE_TEXT)

In [16]:
# Same text at a temperature above the 0.8 default.
gen_audio(text=BASE_TEXT, temperature=1.5)

In [17]:
# Same text at a temperature below the 0.8 default.
gen_audio(text=BASE_TEXT, temperature=0.6)

In [18]:
# Same text with the "echo" voice instead of the default "alloy".
gen_audio(text=BASE_TEXT, voice="echo")

In [19]:
# Same text with the "shimmer" voice instead of the default "alloy".
gen_audio(text=BASE_TEXT, voice="shimmer")

## Advanced Tests


In [29]:
# Custom system prompt: still asks for the exact input text back, but
# additionally instructs a specific accent and vocal quality.
system = """
You are an expert voice actor specializing in silly voices. Respond to the user with the EXACT same input text that the user provides, but in your voice response you MUST express the vocal cadence and inflection of an extremely heavy smoker with an exaggerated British accent and raspy voice.
"""

# Multi-line input text to be vocalized.
text = """
I love you
You love me
We're a happy family
With a great big hug
And a kiss from me to you
Won't you say you love me too?
"""

temperature = 0.8
voice = "echo"

# Positional arguments follow gen_audio's order: system, text, temperature, voice.
gen_audio(system, text, temperature, voice)

In [32]:
# Re-run with a higher temperature; system, text and voice are the globals
# assigned in the earlier advanced-test cell.
temperature = 1.2

gen_audio(system, text, temperature, voice)

In [33]:
# Re-run with a lower temperature; system, text and voice are the globals
# assigned in the earlier advanced-test cell.
temperature = 0.6

gen_audio(system, text, temperature, voice)