In [None]:
!pip install git+https://github.com/nicolvisser/ZeroSyl
!pip install matplotlib

!wget https://storage.googleapis.com/zerospeech-checkpoints/5895-34629-0010.flac

# ZeroSyl Quickstart

In [None]:
import matplotlib.pyplot as plt
import torch
import torchaudio
from IPython.display import Audio

In [None]:
def plot_segments_on_melspec(wav, starts, ends, ids=None):
    """Draw segment boundaries and labels on top of a log-mel spectrogram.

    A new 10x4 inch matplotlib figure is created. The spectrogram is computed
    with a hop of 320 samples, and every segment is marked by two white
    vertical lines (start and end) plus a rotated text label at its midpoint.

    Args:
        wav: Waveform tensor. For a ``(channels, samples)`` tensor only the
            first channel is plotted; a 1-D ``(samples,)`` tensor is also
            accepted.
        starts: Iterable of segment start positions, interpreted as x
            positions on the spectrogram (i.e. in spectrogram frames).
        ends: Iterable of segment end positions, same convention as ``starts``.
        ids: Optional iterable of per-segment labels. Elements may be 0-d
            tensors or plain Python values. Defaults to ``0 .. len(starts)-1``.

    Returns:
        None. The plot is drawn on the current matplotlib figure.
    """
    # A mono waveform passed as (samples,) would otherwise lose its mel axis
    # when the first channel is selected below.
    if wav.dim() == 1:
        wav = wav.unsqueeze(0)

    mel_transform = torchaudio.transforms.MelSpectrogram(
        n_fft=1024,
        win_length=400,
        hop_length=320,
    )
    to_db = torchaudio.transforms.AmplitudeToDB(top_db=80)

    # Keep only the first channel; detach so a gradient-tracking input can
    # still be converted to an array by imshow.
    melspec = to_db(mel_transform(wav))[0].detach()

    plt.figure(figsize=(10, 4))
    plt.imshow(melspec, aspect="auto", origin="lower")
    plt.axis("off")

    if ids is None:
        ids = range(len(starts))

    # Labels are vertically centred on the mel axis.
    label_y = mel_transform.n_mels / 2

    for start, end, label in zip(starts, ends, ids):
        # Accept both 0-d tensors and plain Python numbers.
        start, end = float(start), float(end)
        if hasattr(label, "item"):
            label = label.item()

        plt.axvline(start, c="w")
        plt.axvline(end, c="w")
        plt.text(
            (start + end) / 2,
            label_y,
            str(label),
            fontsize=16,
            c="w",
            ha="center",
            va="center",
            rotation=90,
        )

In [None]:
# Read the example recording fetched by the setup cell; the result is unpacked
# into the waveform (`wav`) and its sample rate (`sr`).
wav, sr = torchaudio.load("5895-34629-0010.flac")

# Left as the last expression of the cell so the notebook renders the player.
Audio(data=wav, rate=sr)

In [None]:
from zerosyl import ZeroSylContinuous

# Fetch the pretrained continuous model and encode the waveform loaded earlier.
# `encode` yields three values: segment starts, segment ends, and embeddings.
model = ZeroSylContinuous.from_remote()
starts, ends, embeddings = model.encode(wav)

# One value per line: the start positions, the end positions, and the shape of
# the embeddings.
print(starts, ends, embeddings.shape, sep="\n")

# No ids are passed, so the segments are labelled by their running index.
plot_segments_on_melspec(wav, starts, ends)

In [None]:
from zerosyl import ZeroSylDiscrete

model = ZeroSylDiscrete.from_remote()

# The recording is read again here, so this cell does not rely on the `wav`
# variable left behind by an earlier cell.
wav, sr = torchaudio.load("5895-34629-0010.flac")

# `encode` yields segment starts, segment ends, and one id per segment.
starts, ends, ids = model.encode(wav)

# One value per line: starts, ends, ids.
print(starts, ends, ids, sep="\n")

# Each segment is labelled with its id on the spectrogram.
plot_segments_on_melspec(wav, starts, ends, ids)

In [None]:
from zerosyl import ZeroSylCollapsed

model = ZeroSylCollapsed.from_remote()

# The recording is read again here, so this cell does not rely on the `wav`
# variable left behind by an earlier cell.
wav, sr = torchaudio.load("5895-34629-0010.flac")

# Same call pattern as the discrete model: starts, ends, and one id per segment.
# NOTE(review): how the "collapsed" ids differ from the discrete ones is not
# visible here -- see the ZeroSyl documentation.
starts, ends, ids = model.encode(wav)

# One value per line: starts, ends, ids.
print(starts, ends, ids, sep="\n")

plot_segments_on_melspec(wav, starts, ends, ids)

In [None]:
from zerosyl import LanguageModel

# `lm` is also used by the generation cell that follows.
lm = LanguageModel.from_remote()

# Two four-token sequences that differ only in their third token
# (3045 vs 5041). Presumably these are the unit sequences for the word "brick"
# and the non-word "blick" -- confirm against the ZeroSyl documentation.
brick, blick = (
    torch.tensor([9116, 9115, 3045, 9115]),
    torch.tensor([9116, 9115, 5041, 9115]),
)

# Score both sequences in a single call.
print(lm.loglikelihoods([brick, blick]))

In [None]:
# Seed PyTorch's global RNG so that a fresh Restart & Run All prints the same
# sequence every time.
# NOTE(review): assumes `lm.generate` samples using torch's RNG -- confirm; if
# generation is deterministic the seed is simply a no-op for this cell.
torch.manual_seed(0)
print(lm.generate(max_length=10))