# Sylber features

This notebook records how we obtained features from Sylber.

Running the code directly in this notebook will not work.

## Install instructions

```bash
pip install sylber
```

## Extract continuous features and train a K-means model

In [None]:
from pathlib import Path

import faiss
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchaudio
from sklearn.cluster import kmeans_plusplus
from torchcodec.decoders import AudioDecoder
from tqdm import tqdm

from sylber import Segmenter

# Collect the per-segment features Sylber returns for every training utterance.
segmenter = Segmenter(model_ckpt="sylber", device="cuda")
wav_dir = Path("/mnt/wsl/hermione/datasets/LibriSpeech")
wav_paths = list(wav_dir.glob("train-clean-100/**/*.flac"))
if not wav_paths:
    # Fail early with a clear message rather than an opaque concatenate error.
    raise FileNotFoundError(f"No .flac files found under {wav_dir / 'train-clean-100'}")

all_embeddings = []
for wav_path in tqdm(wav_paths):
    wav, sr = torchaudio.load(wav_path)
    outputs = segmenter(
        wav=wav, in_second=False
    )  # in_second can be False to output segments in frame numbers.
    all_embeddings.append(outputs["segment_features"])

# np.concatenate is available in every NumPy release (np.concat is 2.0+ only).
all_embeddings = np.concatenate(all_embeddings, axis=0)

num_clusters = 10000

num_points, num_dims = all_embeddings.shape
print(f"Found {num_points} points in {num_dims}D")

# Spherical k-means: L2-normalise the points (in place) and the k-means++
# seeds before handing them to faiss.
faiss.normalize_L2(all_embeddings)
init_centroids, _ = kmeans_plusplus(all_embeddings, num_clusters)
faiss.normalize_L2(init_centroids)

step = 0
STEPS_PER_CHECKPOINT = 10
kmeans = faiss.Kmeans(
    d=init_centroids.shape[1],
    k=init_centroids.shape[0],
    niter=STEPS_PER_CHECKPOINT,
    verbose=True,
    spherical=True,
)

# Train in chunks of STEPS_PER_CHECKPOINT iterations, warm-starting each chunk
# from the previous centroids and writing a checkpoint plus a loss plot.
while step < 300:
    kmeans.train(all_embeddings, init_centroids=init_centroids)
    step += STEPS_PER_CHECKPOINT
    init_centroids = kmeans.centroids.copy()
    # The file name uses the "sylber" prefix so that it matches the name the
    # segment-extraction cell loads (km-centroids-sylber-k-<k>-step-<n>.npy).
    np.save(f"km-centroids-sylber-k-{init_centroids.shape[0]}-step-{step}.npy", kmeans.centroids)
    fig = plt.figure()
    plt.plot(kmeans.obj)
    plt.title(f"step-{step}")
    plt.savefig(f"step-{step}.png")
    # Close (not just clear) the figure so figures do not pile up in memory.
    plt.close(fig)

## Extract segments using the discovered centroids

In [None]:
from pathlib import Path

import faiss
import numpy as np
import torch
import torchaudio
from tqdm import tqdm
from functools import reduce

from sylber import Segmenter

# Sylber segmenter used to split each utterance and embed its segments.
segmenter = Segmenter(model_ckpt="sylber", device="cuda")

# Input utterances (all LibriSpeech dev splits) and the root of the output tree.
wav_dir = Path("/mnt/wsl/hermione/datasets/LibriSpeech")
wav_paths = [*wav_dir.glob("dev*/**/*.flac")]
out_dir = Path("output/segments/sylber-custom-centroids-k-10000-plus-sil/LibriSpeech")

# Load the trained centroids, L2-normalise them in place and put them in a
# flat inner-product index for nearest-centroid lookup.
centroids = np.load("km-centroids-sylber-k-10000-step-110.npy")
k = len(centroids)  # number of centroids; also reused as the id of the SIL unit
faiss.normalize_L2(centroids)
index = faiss.IndexFlatIP(centroids.shape[-1])
index.add(centroids)


def reducer(segments, segment):
    """Fold step: append one segment, preceded by a SIL entry if there is a gap.

    ``segments`` is the ``(starts, ends, units)`` accumulator built so far and
    ``segment`` is the next ``(start, end, unit)`` triple. The three lists are
    extended in place and handed back as a tuple, which makes the function
    usable with ``functools.reduce``.

    When more than 7 frames separate the incoming segment from the end of the
    previous one (or from frame 0 for the very first segment), an explicit
    silence entry covering the gap is inserted with the module-level ``k`` as
    its unit id. This creates one extra vocabulary item; see Section A.2.3 on
    coding efficiency in https://arxiv.org/pdf/2410.07168
    """
    acc_starts, acc_ends, acc_units = segments
    seg_start, seg_end, seg_unit = segment

    # The gap is measured from the last recorded end, or from frame 0 if
    # nothing has been added yet.
    gap_begin = acc_ends[-1] if acc_ends else 0
    if seg_start - gap_begin > 7:
        acc_starts.append(gap_begin)
        acc_ends.append(seg_start)
        acc_units.append(k)

    acc_starts.append(seg_start)
    acc_ends.append(seg_end)
    acc_units.append(seg_unit)
    return acc_starts, acc_ends, acc_units


# Encode every utterance as rows of (start_frame, end_frame, unit) and save the
# result as a LongTensor, mirroring the dataset's directory layout under out_dir.
for wav_path in tqdm(wav_paths):
    wav, sr = torchaudio.load(wav_path)
    outputs = segmenter(wav=wav, in_second=False)
    num_hidden_states = len(outputs["hidden_states"])

    # Force an (n, 2) layout so an utterance without any detected segment
    # still unpacks cleanly.
    # NOTE(review): how Sylber represents "no segments" is not visible here — confirm.
    segments = np.asarray(outputs["segments"]).reshape(-1, 2)
    if len(segments) > 0:
        embeddings = outputs["segment_features"]
        faiss.normalize_L2(embeddings)  # in place, to match the normalised centroids
        _, units = index.search(embeddings, 1)
        units = units[:, 0]  # id of the single nearest centroid per segment
    else:
        units = []

    starts, ends, units = reduce(
        reducer, zip(segments[:, 0], segments[:, 1], units), ([], [], [])
    )

    # Trailing silence: measured from the last segment end, or from frame 0
    # when nothing was detected (the original indexed ends[-1] unconditionally
    # and raised IndexError on an empty list).
    last_end = ends[-1] if ends else 0
    if num_hidden_states - last_end > 7:
        starts.append(last_end)
        ends.append(num_hidden_states)
        units.append(k)

    output = np.stack([starts, ends, units], axis=1)
    output = torch.from_numpy(output).long()

    rel_path = wav_path.relative_to(wav_dir).with_suffix(".pt")
    out_path = out_dir / rel_path
    out_path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(output, out_path)
