# Audio Generation Test

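Experimenting with text-to-audio models: generating speech with Kokoro-82M, encoding it to MP3 with TorchCodec, then plotting and playing back the results.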

In [None]:
# Load the third-party and project libraries used by the cells below.
# Failures are only printed, not re-raised, so a library that fails to import
# here shows up later as a NameError in the first cell that uses it.
try:
    # audio encoding / decoding
    from torchcodec.encoders import AudioEncoder
    from torchcodec.decoders import AudioDecoder
    # audio generation pipeline
    from kokoro import KPipeline
    # model download
    import huggingface_hub as hf
    # helpers called in the visualization cell
    from audio_io import waveform, spectrum
except ImportError as e:
    # a package (or one of the imported names) is not available
    print(f"Cannot import lib: {e}")
except RuntimeError as r:
    # a package was found but raised while being imported
    print(f"Error during library import: {r}")

## Define Variables

In [None]:
# Hugging Face repository that hosts the model used by this notebook.
HF_REPO_ID: str = "hexgrad/Kokoro-82M"

# Sample rate (Hz) passed to the encoder and the audio player.
# Kokoro-82M produces 24 kHz audio; labelling those samples as 22050 Hz
# makes the rendered speech play back slower and lower-pitched than intended.
SAMPLE_RATE: int = 24000

# Download Model

In [None]:
# Fetch a snapshot of the model repository from the Hugging Face Hub.
model_path = hf.snapshot_download(
    repo_id=HF_REPO_ID,
)

# Report which repository was fetched and the path that was returned.
print(f"Downloaded model {HF_REPO_ID} @ {model_path}")

# Tweak Parameters

In [None]:
# Prompt text shared by every test case.
_PROMPT: str = "Programming isn’t just about coaxing a machine to obey commands; it’s a dialogue between imagination and logic, where each line of code is a brushstroke on a digital canvas, turning abstract ideas into tangible experiences that shape the future."

# One test case per (language id, voice id) pair, all using the same prompt.
TEST_CASES: list = [
    {"lang_id": lang_id, "voice_id": voice_id, "text": _PROMPT}
    for lang_id, voice_id in (("a", "af_heart"), ("b", "bf_emma"))
]

# Generate Audio

In [None]:
# Run the audio pipeline once per test case and collect what each call returns.
OUTPUTS: list = []
for case in TEST_CASES:
    # build a pipeline for this case's language id
    pipeline = KPipeline(repo_id=HF_REPO_ID, lang_code=case.get("lang_id"))
    result = pipeline(voice=case.get("voice_id"), text=case.get("text"))
    OUTPUTS.append(result)
    # drop the loop-local names; OUTPUTS keeps the result itself
    del pipeline, result

# summarize how many results were collected
print(f"Produced {len(OUTPUTS)} audio waveforms.")

In [None]:
# Encode every generated audio chunk to its own MP3 file and keep the encoder
# together with its metadata for the plotting and playback cells.
ENCODED_AUDIO: list = []
for i, output in enumerate(OUTPUTS):
    # each item yielded by an output unpacks to (graphemes, phonemes, audio);
    # one test case may yield more than one such chunk
    for index, (gs, ps, audio) in enumerate(output):
        print(f"Grapheme: {gs}, phoneme: {ps}")
        print(f"Tensor Shape: {audio.shape}")

        # Include the chunk index in the filename: with only the test-case
        # index, later chunks of the same case overwrote earlier ones and all
        # of their entries ended up pointing at the last chunk's file.
        filename: str = f"/tmp/output_{i}_{index}.mp3"
        encoder: AudioEncoder = AudioEncoder(samples=audio, sample_rate=SAMPLE_RATE)
        encoder.to_file(filename)
        ENCODED_AUDIO.append({
            "audioencoder": encoder,
            "graphemes": gs,
            "phonemes": ps,
            "filename": filename,
        })
        

# Visualize Waveforms

In [None]:
# Decode each encoded clip and pass it to the waveform and spectrum helpers.
for entry in ENCODED_AUDIO:
    # re-encode the clip as WAV in memory and hand that to the decoder
    wav_data = entry.get("audioencoder").to_tensor(format="wav")
    decoder: AudioDecoder = AudioDecoder(wav_data)
    waveform(decoder)
    spectrum(decoder)
    # release this clip's objects before moving on to the next one
    del wav_data, decoder

# Hear Audio Rendering

In [None]:
from IPython.display import Audio

# Show an audio player for each encoded file, preceded by its phonemes.
for clip in ENCODED_AUDIO:
    print(f"Rendered Phonemes: {clip.get('phonemes')}")
    player = Audio(clip.get("filename"), rate=SAMPLE_RATE)
    display(player)