In [None]:
import torch
from IPython.display import display, Audio

from torchcodec.decoders import AudioDecoder
from neucodec import DistillNeuCodec

from zerosyl import ZeroSylCollapsed, ULM, AcousticModel

Load the default models:

(These are all trained to operate on the collapsed units with vocabulary size 9116.)

In [None]:
# Fetch the default ZeroSyl checkpoints and place each model on the GPU.
encoder = ZeroSylCollapsed.from_remote().cuda()
ulm = ULM.from_remote().cuda()
acoustic = AcousticModel.from_remote().cuda()

# Codec used further down to turn acoustic units back into a waveform.
# Load first, then move to the GPU, then switch to eval mode; the trailing
# semicolon keeps the notebook from echoing the return value of `eval()`.
neucodec = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
neucodec = neucodec.cuda()
neucodec.eval();

Load a waveform

In [None]:
# Decode the sample file, asking the decoder for 16 kHz single-channel output.
audio = AudioDecoder(
    "data/sample.flac", sample_rate=16000, num_channels=1
).get_all_samples()
wav = audio.data

# Play back at the rate reported by the decoded samples rather than repeating
# the literal, so changing the decode rate above cannot silently produce
# sped-up or slowed-down playback.
display(Audio(wav, rate=audio.sample_rate))

## Encode into syllabic segments


Extract semantic segments

In [None]:
# Run the encoder on a GPU copy of the waveform. The call returns three values,
# unpacked here as `starts`, `ends` and `semantic_units`
# (presumably per-segment boundaries and unit IDs -- confirm against the zerosyl docs).
starts, ends, semantic_units = encoder.encode(wav.cuda())

# Bare last expression so the notebook renders the extracted units.
semantic_units

In [None]:
# Unit rate = number of extracted semantic units / duration of the decoded clip.
n_semantic_units = len(semantic_units)
semantic_units_per_second = n_semantic_units / audio.duration_seconds
print(f"{semantic_units_per_second:.2f} semantic units per second")

## Synthesize speech from the semantic units

### First generate acoustic units from the semantic tokens

(This is not deterministic: if you give the same prompt multiple times, you will get different acoustics each time.)

In [None]:
# Build a batch that repeats the same semantic sequence, so the sampled
# outputs can be compared against each other.
prompts = [semantic_units] * 3  # pass 3 identical prompts

acoustic_units_list = acoustic.generate(
    prompts,
    temperature=1.0,
    top_p=0.85,
    max_tokens_per_semantic_unit=20,
    max_tokens=2500,
    show_progress=True,
)

# 50 tokens will generate one second of speech
for acoustic_units in acoustic_units_list:
    print(acoustic_units.shape)

Now vocode each set of acoustic units to a waveform

In [None]:
# Decode every sampled acoustic sequence with the codec and play the result.
for acoustic_units in acoustic_units_list:
    with torch.inference_mode():
        # Prepend two singleton axes before handing the codes to the codec.
        codes = acoustic_units[None, None, :]
        decoded = neucodec.decode_code(codes)
        # Drop the leading axis again and bring the audio back to the CPU.
        waveform = decoded.squeeze(0).cpu()

    # Played back at 24 kHz (presumably the codec's output rate -- confirm).
    display(Audio(waveform, rate=24000))

## Language modeling

### Probe log likelihood from the language model

In [None]:
# Score the extracted semantic unit sequence with the unit language model.
# As the cell's last expression, the returned value is shown as the cell output.
ulm.loglikelihood(semantic_units)