In [11]:
from pathlib import Path

import faiss
import numpy as np
from tqdm import tqdm

In [8]:
# --- Configuration -----------------------------------------------------------
# Input: directory that is searched recursively for per-file feature matrices (*.npy).
feature_dir: Path = Path("./data/mfcc/dev-clean")
# Output: cluster-id arrays are written here, mirroring the layout of feature_dir.
output_dir: Path = Path("./data/mfcc-cluster-ids/dev-clean")
# Number of k-means centroids to fit.
num_clusters: int = 500

In [3]:
# Collect every feature file below feature_dir. rglob does not guarantee an
# order, and the path order determines the row order of the training matrix
# built from these files, so sort to keep runs reproducible across machines.
feature_paths = sorted(feature_dir.rglob("*.npy"))
if not feature_paths:
    # Fail here with a clear message rather than later inside np.concatenate.
    raise FileNotFoundError(f"No .npy feature files found under {feature_dir}")
print(f"Found {len(feature_paths)} feature files")

Found 2703 feature files


In [4]:
# Read every per-file feature matrix, then stack them row-wise into a single
# training matrix. The intermediate list has its own name so that `features`
# is only ever the stacked array: concatenating an already-stacked 2-D array
# along axis 0 would silently flatten it to 1-D if this step were re-run.
feature_chunks = [np.load(path) for path in feature_paths]
features = np.concatenate(feature_chunks, axis=0)
# The per-file arrays are no longer needed once copied into `features`.
del feature_chunks

In [6]:
# Sanity check of the stacked matrix; axis 1 is the feature size later passed to faiss as `d`.
features.shape

(1939016, 39)

In [7]:
# Fit a k-means codebook with `num_clusters` centroids on the stacked features.
# Hyperparameters are gathered in one place so they are easy to tweak.
kmeans_options = {
    "niter": 100,
    "nredo": 1,
    "max_points_per_centroid": 50000,
    "spherical": False,
    "verbose": True,
}

# Dimensionality of a single feature row.
d = features.shape[1]

kmeans = faiss.Kmeans(d, num_clusters, **kmeans_options)
kmeans.train(features)

# The learned centroids are only held in memory here; they are not written to
# disk (e.g. via np.save) by this cell.
cluster_centers = kmeans.centroids



Clustering 1939016 points in 39D to 500 clusters, redo 1 times, 100 iterations
  Preprocessing in 0.06 s
  Iteration 99 (66.78 s, search 62.89 s): objective=2.3356e+09 imbalance=1.152 nsplit=0        


In [12]:
# Inference: assign each row of every feature file to its nearest centroid and
# save the resulting 1-D array of cluster ids under output_dir, mirroring the
# file's location relative to feature_dir.
for feature_path in tqdm(feature_paths):
    # Use a dedicated name so the stacked training matrix `features` is not
    # overwritten (re-running the training cell would otherwise fit on a single
    # file). faiss operates on C-contiguous float32 input; this is a no-op when
    # the stored array already has that layout.
    file_features = np.ascontiguousarray(np.load(feature_path), dtype=np.float32)

    # Search for the single nearest centroid; the distances are not needed.
    _, nearest_ids = kmeans.index.search(file_features, 1)

    output_path = output_dir / feature_path.relative_to(feature_dir)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # search() returns one column per requested neighbour; keep the only one.
    np.save(output_path, nearest_ids[:, 0])

  0%|          | 0/2703 [00:00<?, ?it/s]

100%|██████████| 2703/2703 [00:02<00:00, 1056.51it/s]
