# Quickstart

Install and download a sample waveform:

In [None]:
!pip install zerosyl
!wget https://storage.googleapis.com/zerospeech-checkpoints/5895-34629-0010.flac

In [None]:
import matplotlib.pyplot as plt
import torch
import torchaudio
from IPython.display import Audio

from zerosyl import LanguageModel, ZeroSylCollapsed, ZeroSylContinuous, ZeroSylDiscrete

Define a helper function to plot a spectrogram with segment annotations:

In [None]:
def view_on_melspec(wav, starts=None, ends=None, ids=None):
    """Plot a dB-scaled mel spectrogram of ``wav`` with optional segment marks.

    Args:
        wav: Waveform tensor. It is squeezed before the transform, so a
            single-channel ``(1, T)`` tensor is treated as ``(T,)``.
        starts: Optional sequence of segment start positions, interpreted as
            x coordinates on the spectrogram's frame axis.
        ends: Optional sequence of segment end positions; must have the same
            length as ``starts``.
        ids: Optional per-segment labels, each convertible with ``int``.
            Defaults to the running segment index ``0, 1, 2, ...``.

    Segments are drawn only when both ``starts`` and ``ends`` are given: each
    one gets a white vertical line at its start and end and its label centred
    between them. The function returns ``None`` so that a cell ending with
    this call shows only the figure.
    """
    hop_length = 320
    mel_transform = torchaudio.transforms.MelSpectrogram(
        n_fft=1024,
        win_length=400,
        hop_length=hop_length,
    )
    to_db = torchaudio.transforms.AmplitudeToDB(top_db=80)
    melspec = to_db(mel_transform(wav.squeeze()))

    # Draw on an explicit Axes rather than the implicit pyplot "current axes".
    _, ax = plt.subplots(figsize=(10, 4))
    ax.imshow(melspec, aspect="auto", origin="lower")
    ax.set_xlabel(f"Frame (hop = {hop_length} samples)")
    ax.set_ylabel("Mel bin")

    # Nothing to annotate unless both boundary lists were supplied.
    if starts is None or ends is None:
        return

    assert len(starts) == len(ends)
    if ids is None:
        ids = range(len(starts))

    # Labels sit at mid-height of the spectrogram.
    label_y = mel_transform.n_mels / 2
    for start, end, segment_id in zip(starts, ends, ids):
        ax.axvline(start, c="w")
        ax.axvline(end, c="w")
        ax.text(
            x=(start + end) / 2,
            y=label_y,
            s=str(int(segment_id)),
            c="w",
            ha="center",
            va="center",
            fontsize=16,
            rotation=90,
        )

Load a waveform:

In [None]:
# Load the sample utterance fetched in the install cell.
wav, sr = torchaudio.load("5895-34629-0010.flac")
# The rest of the notebook assumes 16 kHz audio; fail early and say what was found.
assert sr == 16000, f"expected a 16 kHz waveform, got {sr} Hz"
view_on_melspec(wav)
# Last expression of the cell, so the audio player is rendered as its output.
Audio(wav, rate=sr)

Segment and encode **continuous embeddings** (silences may be fragmented):

In [None]:
# Build the encoder whose `encode` call (next cell) yields segment boundaries and embeddings.
# NOTE(review): `from_remote()` presumably fetches pretrained weights over the network — confirm.
zerosyl_continuous = ZeroSylContinuous.from_remote()

In [None]:
# Segment the waveform; `encode` returns the boundaries together with the embeddings.
starts, ends, embeddings = zerosyl_continuous.encode(wav)

# Boundaries and the embedding tensor's shape, one item per line.
print(starts, ends, embeddings.shape, sep="\n")

# Overlay the segments on the spectrogram (labels default to the segment index).
view_on_melspec(wav, starts, ends)

Segment and encode **cluster IDs** (silences may be fragmented):

In [None]:
# Build the encoder whose `encode` call (next cell) yields segment boundaries and cluster IDs.
# NOTE(review): `from_remote()` presumably fetches pretrained weights over the network — confirm.
zerosyl_discrete = ZeroSylDiscrete.from_remote()

In [None]:
# Segment the waveform; `encode` returns the boundaries together with one cluster ID per segment.
starts, ends, cluster_ids = zerosyl_discrete.encode(wav)

# Boundaries and cluster IDs, one item per line.
print(starts, ends, cluster_ids, sep="\n")

# Overlay the segments on the spectrogram, labelled with their cluster IDs.
view_on_melspec(wav, starts, ends, cluster_ids)

Segment and encode **language modeling IDs** (silences are merged):

In [None]:
# Build the encoder whose `encode` call (next cell) yields merged boundaries and token IDs;
# its `SIL` attribute is reused later when building language-model inputs.
# NOTE(review): `from_remote()` presumably fetches pretrained weights over the network — confirm.
zerosyl_collapsed = ZeroSylCollapsed.from_remote()

In [None]:
# Segment the waveform; `encode` returns the merged boundaries together with the token IDs.
starts_merged, ends_merged, ids = zerosyl_collapsed.encode(wav)

# Boundaries and token IDs, one item per line.
print(starts_merged, ends_merged, ids, sep="\n")

# Overlay the merged segments on the spectrogram, labelled with their token IDs.
view_on_melspec(wav, starts_merged, ends_merged, ids)

Load the language model that was trained on 60K hours of `ZeroSylCollapsed` tokens.

Test whether "began" or "gantheir" (both are 2-syllable segments) is more likely to appear in isolation:

In [None]:
# Load the language model and pick up the two special tokens that frame each window.
lm = LanguageModel.from_remote()
BOS = lm.config.bos_token_id
SIL = zerosyl_collapsed.SIL

# Each context window wraps a two-syllable candidate in [BOS] [SIL] ... [SIL]:
#
# item 1: [BOS] [SIL] BE   GAN   [SIL]     "began" should have a higher likelihood
# item 2: [BOS] [SIL] GAN  THEIR [SIL]     "gantheir"
#
# The pairs below are the token IDs of (BE, GAN) and (GAN, THEIR) respectively.
ids_list = [
    torch.tensor([BOS, SIL, *syllable_pair, SIL])
    for syllable_pair in ((9052, 1924), (1924, 1773))
]

# Last expression of the cell, so the scores are shown as its output.
# NOTE(review): `normalize=False` presumably means the scores are not length-normalized — confirm.
lm.loglikelihoods(ids_list, normalize=False)