In [1]:
from pathlib import Path

import faiss
import numpy as np
from tqdm import tqdm

## Load the features you want to cluster

We will use the MFCCs of the `train-clean-100` set of LibriSpeech and use 100 clusters.

In [2]:
# Number of k-means centroids to fit.
num_clusters = 100
# Directory holding the MFCC feature files used to fit the centroids.
feature_dir = Path("/media/SSD/data/LibriSpeech/mfcc/train-clean-100")

In [3]:
# Collect every .npy file under feature_dir, recursively. rglob yields paths in
# an arbitrary, filesystem-dependent order, so sort them: the row order of the
# concatenated training matrix (and therefore any subset sampled from it) is
# then the same on every run and on every machine.
feature_paths = sorted(feature_dir.rglob("*.npy"))
print(f"Found {len(feature_paths)} feature files")

Found 28539 feature files


In [4]:
# Read every feature file into memory: one array per file, in feature_paths order.
features = list(map(np.load, feature_paths))

In [5]:
# Stack the per-file arrays into a single matrix along axis 0.
# The isinstance guard makes this cell safe to re-run: `features` is rebound from
# a list to an array here, and concatenating an already-stacked 2-D array a
# second time would silently flatten it to 1-D.
if isinstance(features, list):
    features = np.concatenate(features, axis=0)

In [6]:
# Display the shape of the stacked feature matrix as a sanity check.
features.shape

(36205660, 39)

In [7]:
# Length of a single feature vector (number of columns of the feature matrix).
d = features.shape[1]

# Options forwarded unchanged to faiss.Kmeans as keyword arguments.
kmeans_options = dict(
    niter=200,
    nredo=1,
    max_points_per_centroid=50000,
    spherical=False,
    verbose=True,
)

# Fit k-means with num_clusters centroids on the stacked feature matrix.
kmeans = faiss.Kmeans(d, num_clusters, **kmeans_options)
kmeans.train(features)

# Keep a handle on the learned centroids so a later cell can save them.
cluster_centers = kmeans.centroids


Sampling a subset of 5000000 / 36205660 for training
Clustering 5000000 points in 39D to 100 clusters, redo 1 times, 200 iterations
  Preprocessing in 1.86 s
  Iteration 199 (140.96 s, search 114.81 s): objective=7.66662e+09 imbalance=1.195 nsplit=0       


In [8]:
# Save the learned centroids. The file name is derived from num_clusters so it
# always matches the model that was actually trained, and the target directory
# is created first because np.save does not create missing parent directories.
cluster_centers_path = (
    Path("/media/SSD/data/LibriSpeech/mfcc-clustered")
    / f"cluster_centers_{num_clusters}.npy"
)
cluster_centers_path.parent.mkdir(parents=True, exist_ok=True)
np.save(cluster_centers_path, cluster_centers)

## Dump the cluster indices

We will use these as targets to train HuBERT. Therefore we need them for the full LibriSpeech dataset.

In [9]:
# Destination for the per-file cluster ids, and the directory whose feature
# files will be labelled (this rebinds feature_dir and feature_paths).
output_dir = Path("/media/SSD/data/LibriSpeech/mfcc-clustered/km100-cluster-ids")
feature_dir = Path("/media/SSD/data/LibriSpeech/mfcc")

# Re-scan recursively for .npy feature files under the new feature_dir.
feature_paths = [path for path in feature_dir.rglob("*.npy")]
print(f"Found {len(feature_paths)} feature files")

Found 292367 feature files


In [10]:
# Assign every row of every feature file to its nearest centroid and save the
# resulting ids under output_dir, mirroring the directory layout of feature_dir.
# Loop-local names are used so the training matrix `features` is not clobbered
# and re-running the training cell afterwards still sees the full data.
for feature_path in tqdm(feature_paths):
    # faiss expects C-contiguous float32 input; no copy is made when the loaded
    # array already satisfies that.
    file_features = np.ascontiguousarray(np.load(feature_path), dtype=np.float32)

    # Search for the single nearest centroid (k=1) of each row. The distances
    # are not needed; only the first (and only) column of ids is kept.
    _distances, nearest = kmeans.index.search(file_features, 1)
    cluster_ids = nearest[:, 0]

    output_path = output_dir / feature_path.relative_to(feature_dir)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    np.save(output_path, cluster_ids)

100%|██████████| 292367/292367 [03:09<00:00, 1541.90it/s]
