In [136]:
import boto3
import json
import numpy as np
import io
import IPython.display
import scipy.io.wavfile as sciwav
from collections import Counter
from s3fs.core import S3FileSystem

from sklearn.cluster import KMeans

In [138]:
len(s3.walk('s3://music-ml-gigioli/data/nsynth/nsynth-train/embeddings/'))

7376

In [87]:
bucket = 'music-ml-gigioli'

In [88]:
s3 = S3FileSystem()

embeddings = []
for f in s3.walk('s3://music-ml-gigioli/data/nsynth/nsynth-train/embeddings/'):
    if 'acoustic' in f:
        embeddings.append(np.load(s3.open(f)))

In [89]:
len(embeddings)

2548

In [90]:
embeddings[0].shape

(125, 16)

In [115]:
time_features = []
mfcc_features = []
concat_features = []
for embd in embeddings:
    avg_time_features = np.mean(embd, axis=0)
    avg_mfcc_features = np.mean(embd, axis=1)
    
    time_features.append(avg_time_features)
    mfcc_features.append(avg_mfcc_features)
    concat_features.append(np.concatenate([avg_time_features, avg_mfcc_features]))

In [125]:
kmeans = KMeans(n_clusters=11)

In [126]:
clusters = kmeans.fit_predict(time_features)

In [127]:
results = sorted(list(zip(fnames, clusters)), key=lambda x : x[1])

In [128]:
Counter(list(zip(*results))[1])

Counter({0: 212,
         1: 223,
         2: 227,
         3: 164,
         4: 263,
         5: 202,
         6: 325,
         7: 127,
         8: 361,
         9: 366,
         10: 60})

In [130]:
[x for x in results if x[1] == 10][:5]

[('data/nsynth/nsynth-train/embeddings/brass_acoustic_007-065-050_embeddings.npy',
  10),
 ('data/nsynth/nsynth-train/embeddings/brass_acoustic_014-052-100_embeddings.npy',
  10),
 ('data/nsynth/nsynth-train/embeddings/brass_acoustic_018-073-127_embeddings.npy',
  10),
 ('data/nsynth/nsynth-train/embeddings/brass_acoustic_032-073-127_embeddings.npy',
  10),
 ('data/nsynth/nsynth-train/embeddings/brass_acoustic_040-079-127_embeddings.npy',
  10)]

In [134]:
[x for x in results if x[1] == 0][:5]

[('data/nsynth/nsynth-train/embeddings/brass_acoustic_000-037-100_embeddings.npy',
  0),
 ('data/nsynth/nsynth-train/embeddings/brass_acoustic_000-050-100_embeddings.npy',
  0),
 ('data/nsynth/nsynth-train/embeddings/brass_acoustic_001-037-050_embeddings.npy',
  0),
 ('data/nsynth/nsynth-train/embeddings/brass_acoustic_001-055-050_embeddings.npy',
  0),
 ('data/nsynth/nsynth-train/embeddings/brass_acoustic_003-069-100_embeddings.npy',
  0)]

In [135]:
obj = boto3.resource('s3').Object(bucket, 'data/nsynth/nsynth-train/audio/brass_acoustic_000-037-100.wav')
sample_rate, X = sciwav.read(io.BytesIO(obj.get()['Body'].read()))
X = X.astype(np.float32)
IPython.display.Audio(X, rate=16000)

In [131]:
obj = boto3.resource('s3').Object(bucket, 'data/nsynth/nsynth-train/audio/brass_acoustic_032-073-127.wav')
sample_rate, X = sciwav.read(io.BytesIO(obj.get()['Body'].read()))
X = X.astype(np.float32)
IPython.display.Audio(X, rate=16000)

In [70]:
[x for x in results if x[1] == 5]

[('data/nsynth/nsynth-train/embeddings/bass_electronic_000-022-100_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_electronic_000-023-025_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_electronic_007-022-050_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_electronic_022-034-075_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_synthetic_004-076-050_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_synthetic_005-026-025_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_synthetic_005-039-127_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_synthetic_017-070-127_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_synthetic_017-087-127_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_synthetic_018-046-075_embeddings.npy',
  5),
 ('data/nsynth/nsynth-train/embeddings/bass_synthetic_018-098-100_embeddings.npy',
  5),
 ('data/nsynth/ns