In [1]:
!pip install librosa
!pip install matplotlib

Collecting librosa
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting pooch>=1.0 (from librosa)
  Downloading pooch-1.8.0-py3-none-any.whl.metadata (9.9 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.3.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Downloading librosa-0.10.1-py3-none-any.whl (253 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.7/253.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading audioread-3.0.1-py3-none-any.whl (23 kB)
Downloading pooch-1.8.0-py3-none-any.whl (62 kB)
[2K 

In [48]:
import librosa
import sklearn
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [58]:
df = pd.read_json('nsynth-valid/examples.json').transpose()
df.head(10)

Unnamed: 0,note_str,sample_rate,qualities_str,instrument_source,instrument_family_str,instrument_family,note,instrument_source_str,qualities,pitch,instrument_str,instrument,velocity
keyboard_acoustic_004-060-025,keyboard_acoustic_004-060-025,16000,"[dark, reverb]",0,keyboard,4,278915,acoustic,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0]",60,keyboard_acoustic_004,327,25
bass_synthetic_033-050-100,bass_synthetic_033-050-100,16000,[dark],2,bass,0,270361,synthetic,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",50,bass_synthetic_033,417,100
bass_synthetic_009-052-050,bass_synthetic_009-052-050,16000,"[bright, distortion, long_release]",2,bass,0,270001,synthetic,"[1, 0, 1, 0, 1, 0, 0, 0, 0, 0]",52,bass_synthetic_009,150,50
keyboard_electronic_003-064-127,keyboard_electronic_003-064-127,16000,[],1,keyboard,4,50978,electronic,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",64,keyboard_electronic_003,65,127
bass_synthetic_034-030-050,bass_synthetic_034-030-050,16000,"[distortion, tempo-synced]",2,bass,0,265159,synthetic,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 1]",30,bass_synthetic_034,420,50
bass_synthetic_098-040-025,bass_synthetic_098-040-025,16000,[dark],2,bass,0,254242,synthetic,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",40,bass_synthetic_098,803,25
bass_synthetic_009-083-075,bass_synthetic_009-083-075,16000,"[bright, distortion, long_release]",2,bass,0,25301,synthetic,"[1, 0, 1, 0, 1, 0, 0, 0, 0, 0]",83,bass_synthetic_009,150,75
mallet_acoustic_062-033-075,mallet_acoustic_062-033-075,16000,"[dark, multiphonic]",0,mallet,5,119907,acoustic,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0]",33,mallet_acoustic_062,644,75
organ_electronic_113-028-075,organ_electronic_113-028-075,16000,[dark],1,organ,6,245891,electronic,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",28,organ_electronic_113,958,75
keyboard_electronic_098-072-100,keyboard_electronic_098-072-100,16000,[],1,keyboard,4,261635,electronic,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",72,keyboard_electronic_098,905,100


In [71]:
def extract_features(audio_file_path):
    y, sr = librosa.load(audio_file_path, sr=None)

    # Harmonic / Percussive
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    harmonic_magnitude = np.mean(y_harmonic)
    percussive_magnitude = np.mean(y_percussive)
    hpss = [harmonic_magnitude, percussive_magnitude]

    # Mel-frequency cepstral coefficients (MFCCs) averaged over time
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),axis=1)

    # mel-scaled spectrogram averaged over time
    spectrogram = np.mean(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,fmax=8000), axis=1)

    # compute chroma energy averaged over time
    chroma = np.mean(librosa.feature.chroma_cens(y=y, sr=sr), axis=1)

    # compute spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)

    return np.hstack([hpss, mfcc, spectrogram, chroma, contrast])

In [186]:
def load_dataset(dataset_path, label_name, limit=None, onehot=False, shuffle=True, label_encoder=None, feature_scaler=None):
    print(f"Loading {dataset_path} examples info...")
    df = pd.read_json(f'{dataset_path}/examples.json').transpose()
    df["audio_file_path"] = df["note_str"].apply(lambda x: f'{dataset_path}/audio/' + x + '.wav')
    if shuffle:
        print("Shuffling rows...")
        df = sklearn.utils.shuffle(df)

    N = limit if limit is not None else len(df)

    print("Loading labels...")
    encoder = None
    if label_encoder is not None:
        encoder = label_encoder
        labels = encoder.transform(df[[label_name]][:N]).toarray()
    elif onehot:
        encoder = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')
        labels = encoder.fit_transform(df[[label_name]][:N]).toarray()
    else:
        labels = df[label_name].values

    print(f"Extracting features for {N} examples...")
    raw_features = np.array([extract_features(fp) for fp in df["audio_file_path"].values[:N]])

    if feature_scaler is None:
        print("Fitting feature scaler...")
        feature_scaler = sklearn.preprocessing.StandardScaler()
        feature_scaler.fit(raw_features)
        
    print("Applying standard scaling...")
    features = feature_scaler.transform(raw_features)

    print("Done!")
    return features, labels, df, encoder, feature_scaler

In [185]:
# The dataset path will be `nsynth-train`, `nsynth-valid` or `nsynth-test` if you've downloaded it with the Makefile.
# Othersize this needs to point to the directory with the extracted archives.
features, labels, df, *_ = load_dataset("nsynth-test", "instrument_family_str", limit=10, onehot=True)

Loading nsynth-test examples info...
Shuffling rows...
Loading labels...
Extracting features for 10 examples...
Applying standard scaling...
Done!


In [79]:
# All the librosa features are concatenated into one long array per example.
features

array([[-0.33230003,  0.29328011, -0.82040979, ..., -1.04692131,
        -0.88444762, -1.13654426],
       [-0.33242278,  0.34337244, -0.29672258, ..., -0.15192529,
        -0.33718003, -1.06768653],
       [-0.33248534,  0.3815953 , -0.30907994, ...,  0.85611916,
         0.28320446, -0.62706768],
       ...,
       [-0.3410816 , -2.99896025,  2.80647452, ..., -1.17163004,
        -0.8286847 ,  0.98057759],
       [-0.33218897,  0.30723564, -0.30822448, ..., -0.06169011,
        -0.35359835, -0.69066209],
       [-0.33257202,  0.37058232, -0.75502056, ..., -0.90678082,
        -0.73752751, -0.62017645]])

In [122]:
# The labels are 1-Hot encoded, meaning the integer class i is represented by element y_i = 1 in the labels array.
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [81]:
# Sample the dataset metadata
df.head()

Unnamed: 0,qualities,pitch,note,instrument_source_str,velocity,instrument_str,instrument,sample_rate,qualities_str,instrument_source,note_str,instrument_family,instrument_family_str,audio_file_path
bass_synthetic_068-049-025,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",49,217499,synthetic,25,bass_synthetic_068,656,16000,[dark],2,bass_synthetic_068-049-025,0,bass,nsynth-test/audio/bass_synthetic_068-049-025.wav
keyboard_electronic_001-021-127,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",21,299359,electronic,127,keyboard_electronic_001,40,16000,[],1,keyboard_electronic_001-021-127,4,keyboard,nsynth-test/audio/keyboard_electronic_001-021-...
guitar_acoustic_010-066-100,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",66,72288,acoustic,100,guitar_acoustic_010,219,16000,[],0,guitar_acoustic_010-066-100,3,guitar,nsynth-test/audio/guitar_acoustic_010-066-100.wav
reed_acoustic_037-068-127,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",68,22259,acoustic,127,reed_acoustic_037,387,16000,[reverb],0,reed_acoustic_037-068-127,7,reed,nsynth-test/audio/reed_acoustic_037-068-127.wav
flute_acoustic_002-077-100,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",77,72001,acoustic,100,flute_acoustic_002,86,16000,[reverb],0,flute_acoustic_002-077-100,2,flute,nsynth-test/audio/flute_acoustic_002-077-100.wav


## Tinker with Model Choice on Tiny Dataset

In [37]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [85]:
features, labels, df, *_ = load_dataset("nsynth-test", "instrument_family_str", limit=200, onehot=True)

Loading dataset examples info...
Loading labels...
Extracting features for 200 examples...


  return pitch_tuning(


Applying standard scaling...
Done!


In [86]:
# The number of examples in each class. There needs to be >= 2 for sklearn accept it.
labels.sum(axis=0)

array([41., 12., 10., 37., 40.,  6., 30.,  9.,  8.,  7.])

In [87]:
# Split tiny dataset to make sure we're not making a memorizing algo
X_train, X_test, y_train, y_test = train_test_split(features, labels, stratify=labels, random_state=3)

In [93]:
X_train.shape

(150, 162)

In [107]:
# A Multi-Layer Perception, i.e. a neural network should work for this.
model = MLPClassifier(
    random_state=1, 
    max_iter=1000, 
    hidden_layer_sizes=(X_train.shape[-1], 128, 64), 
    activation='logistic', 
    tol=1e-3, 
    verbose=True,
)
# Train for a bit then see if we do better than random (1/N-classes) on the holdout data.
# If so, we may want to proceed to using the full dataset.
model.fit(X_train, y_train)
model.score(X_test, y_test)

## Preprocess Full Dataset

Pre-process the train, validation, and test datasets by reading each audio file and computing feature vectors and labels. This will take a while (~8 hours) so we will save the pre-processed NumPy feature/label arrays so we can train models in the future without re-processing the raw data.

In [181]:
dataset_version_key = 'v0'

In [134]:
# If you've already pre-processed that dataset, run this then skip the reset of this section.

# with open(f'nsynth-train-{dataset_version_key}.npy', 'rb') as f:
#     features_train = np.load(f)
#     labels_train = np.load(f)

# with open(f'nsynth-valid-{dataset_version_key}.npy', 'rb') as f:
#     features_valid = np.load(f)
#     labels_valid = np.load(f)

# with open(f'nsynth-test-{dataset_version_key}.npy', 'rb') as f:
#     features_test = np.load(f)
#     labels_test = np.load(f)

In [125]:
df = pd.read_json(f'nsynth-train/examples.json').transpose()
len(df)

289205

In [None]:
features_train, labels_train, df_train, label_encoder, feature_scaler = load_dataset("nsynth-train", "instrument_family_str", onehot=True, limit=5000)

encoder_train.categories_

Loading nsynth-train examples info...
Shuffling rows...
Loading labels...
Extracting features for 5000 examples...


  return pitch_tuning(


In [None]:
with open(f'nsynth-train-{dataset_version_key}.npy', 'wb') as f:
    np.save(f, features_train)
    np.save(f, labels_train)

In [None]:
features_valid, labels_valid, df_valid, *_ = load_dataset("nsynth-valid", "instrument_family_str", label_encoder=label_encoder, feature_scaler=feature_scaler, limit=1000)

In [None]:
with open(f'nsynth-valid-{dataset_version_key}.npy', 'wb') as f:
    np.save(f, features_valid)
    np.save(f, labels_valid)

In [None]:
features_test, labels_test, df_test, *_ = load_dataset("nsynth-test", "instrument_family_str", label_encoder=label_encoder, feature_scaler=feature_scaler, limit=500)

In [None]:
with open(f'nsynth-test-{dataset_version_key}.npy', 'wb') as f:
    np.save(f, features_test)
    np.save(f, labels_test)

## Train a Real Model

In [None]:
print("Training...")
model = MLPClassifier(
    random_state=1, 
    max_iter=5000, 
    hidden_layer_sizes=(features_train.shape[-1], 128, 64), 
    activation='logistic', 
    tol=1e-5, 
    verbose=True,
)
model.fit(features_train, labels_train)

print("Train      Accuracy:", model.score(features_train, labels_train))
print("Validation Accuracy:", model.score(features_valid, labels_valid))
print("Test       Accuracy:", model.score(features_test, labels_test))

In [None]:
import pickle

with open(f'model-{dataset_version_key}.pkl', 'wb') as f:
    pickle.dump(model, f)