# INIT COMMIT DATE : October 11th 2022
# Authors: Matthew Starkey
# References:
* Humphrey, Eric J., Durand, Simon, and McFee, Brian. "OpenMIC-2018: An Open Dataset for Multiple Instrument Recognition." in Proceedings of the 19th International Society for Music Information Retrieval Conference (ISMIR), 2018.

# Instrument Recognition Model Using OpenMic-2018
This notebook trains instrument-recognition models with the scikit-learn library (intended to be transferred to iOS using CoreML) on the OpenMIC-2018 training and testing data.

In [2]:
#imports - necessary requirements for loading data
import pandas as pd
import json
import numpy as np
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# Root directory of the OpenMIC-2018 dataset, relative to the working directory.
# Every dataset path used in this notebook (npz archive, class map, partitions,
# audio) is built by joining onto this value.
DATA_ROOT = 'openmic-2018'
# Optional fail-fast check for a missing dataset directory (currently disabled):
# if not os.path.exists(DATA_ROOT):
#     raise ValueError('Invalid path root')

`openmic-2018.npz` is a Python-friendly archive containing the VGGish features `X`, the label confidences `Y_true`, the annotation mask `Y_mask`, and the sample keys `sample_key`.

* X : [20000 x 10 x 128]
    * index 1: (cardinality=20000) is the sample
    * index 2: (cardinality=10) is the time slice within the 10-second clip (VGGish feature extraction)
    * index 3: (cardinality=128) is the VGGish feature vector at that time slice
    * ex: X[40, 8] gives the 128-dimensional feature vector of the 9th time slice of the 41st sample.
* Y_true : [20000 x 20]
    * index 1: is the sample
    * index 2: is the instrument class (see the class map below)
    * ex: Y_true[40, 4] is the confidence that the 41st sample contains instrument #5
* Y_mask : [20000 x 20]
    * index 1: is the sample
    * index 2: is the instrument class
    * ex: Y_mask[40, 4] indicates whether or not we have an annotation for the 5th instrument in the 41st sample.
* sample_key : [20000]
    * sample_key[40] gives you the sample key (identifier) of the 41st sample.

In [3]:
# Open the dataset archive. allow_pickle=True lets NumPy unpickle object arrays
# (presumably needed for the string sample keys — TODO confirm). Unpickling can
# execute arbitrary code, so only use this on a trusted copy of the dataset.
OPENMIC = np.load(os.path.join(DATA_ROOT, 'openmic-2018.npz'), allow_pickle=True)

In [4]:
print(list(OPENMIC.keys()))


['X', 'Y_true', 'Y_mask', 'sample_key']


In [5]:
# Pull the four arrays out of the archive and bind them to short names
X, Y_true, Y_mask, sample_key = (
    OPENMIC[array_name] for array_name in ('X', 'Y_true', 'Y_mask', 'sample_key')
)



In [6]:
X.shape

(20000, 10, 128)

In [7]:
X[40, 8]

array([173,  11, 163, 105, 188,  95, 133, 108, 135, 167, 129, 122,  77,
       134, 100, 114,  73, 127, 102, 168,  51, 221, 106,  99,  78, 187,
       165, 170, 179, 181, 101, 114, 132, 170,  76, 118, 179,  67, 103,
        60, 169, 199,  86, 240, 149, 181,  88,  86, 129, 160, 134, 128,
        83, 169, 171, 139, 182, 152, 109, 193, 169, 143,   2, 216, 199,
       195, 197,  66, 159, 185,  76, 255, 188, 255,  45, 125, 112, 220,
       201,  42, 176, 160, 174, 193,  64, 137,  66, 212, 176, 146, 203,
       203,  86, 124,  68, 240,   0, 106, 138, 130, 177, 124, 121,  98,
        46, 154, 123, 243, 161, 255, 148, 151,  98,  78, 147,  78,  32,
        97,   0,  36, 198, 104,  31, 130, 210,   0,  94, 255])

In [8]:
print(Y_true[0])

[0.5     0.5     0.5     0.5     0.17105 0.5     0.5     0.      0.5
 0.5     0.5     0.5     0.5     0.5     0.5     0.5     0.      0.5
 0.5     0.5    ]


In [9]:
Y_mask[40]

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False])

In [10]:
sample_key.shape

(20000,)

In [11]:
sample_key[40]

'000385_249600'

### Loading Class Map
The dataset provides a JSON file that maps each instrument name to its class index.

In [12]:
# Read the instrument-name -> class-index mapping shipped with the dataset
class_map_path = os.path.join(DATA_ROOT, 'class-map.json')
with open(class_map_path, 'r') as class_map_file:
    class_map = json.load(class_map_file)

In [13]:
class_map

{'accordion': 0,
 'banjo': 1,
 'bass': 2,
 'cello': 3,
 'clarinet': 4,
 'cymbals': 5,
 'drums': 6,
 'flute': 7,
 'guitar': 8,
 'mallet_percussion': 9,
 'mandolin': 10,
 'organ': 11,
 'piano': 12,
 'saxophone': 13,
 'synthesizer': 14,
 'trombone': 15,
 'trumpet': 16,
 'ukulele': 17,
 'violin': 18,
 'voice': 19}

### Loading train/test data
OpenMic was gracious enough to give us data already split 75%/25% so we will load those .csv files

In [14]:
# Load the sample keys of the provided train/test partition (split01).
# Each CSV is read without a header row and holds one sample key per line.
# NOTE: at this point X_train / X_test hold sample KEYS, not features; the same
# names are rebound to the feature arrays once the split indices are computed.
X_train = pd.read_csv(os.path.join(DATA_ROOT, 'partitions/split01_train.csv'), 
                          header=None).squeeze("columns")
X_test = pd.read_csv(os.path.join(DATA_ROOT, 'partitions/split01_test.csv'), 
                         header=None).squeeze("columns")
# Note: squeeze("columns") collapses the one-column DataFrame into a pandas Series

In [15]:
X_train.head(5)

0      000046_3840
1    000135_483840
2    000139_119040
3    000141_153600
4     000144_30720
Name: 0, dtype: object

In [16]:
# Summarise how many sample keys fall in each partition and the training share
n_train, n_test = len(X_train), len(X_test)
train_pct = n_train / len(sample_key) * 100
print('{} Train samples, {} Test samples'.format(n_train, n_test))
print('{}% samples are for training'.format(train_pct))

14915 Train samples, 5085 Test samples
74.575% samples are for training


In [17]:
# The dataset creators recommend storing the partition keys as sets.
# The split loop further down tests every sample key with `in train_set` /
# `in test_set`, which is a hash lookup on a set.
train_set = set(X_train)
test_set = set(X_test)

# Removing "VGG-ish" Model and using raw mono data

In [18]:
# audio_root = '/Users/starkey/Desktop/machine learning/Music Modeling/openmic-2018/audio'
# print(audio_root)

In [19]:
# os.chdir(audio_root)
# os.getcwd()

Here we store the audio files in a numpy array. <br>
For simplicity, the model only uses audio sampled at 44.1 kHz (some files are 48 kHz).

In [20]:
# import soundfile as sf

# for dir in os.scandir(audio_root):
#     # get each directory in audio dir
#     if dir.is_dir():
#         for audio_file in os.scandir(dir):
#             if audio_file.is_file():
#                 audio, rate = sf.read(audio_file)
#                 if rate == 44100: # exclude non 44.1kHz audio
#                     #turn any dual channel audio to mono
#                     try: # if audio isn't dual channel this try will catch the error
#                         if audio.shape[1] == 2:
#                             audio = np.mean(audio, axis=1)
#                     except:
#                         print("not dual channel")
#                     #write data to pickle file
#                     pickle.dump(audio, open("audio.data.pkl", 'ab'))

In [21]:
# audio_data = np.array([])
# f = open("audio.data.pkl", 'rb')
# audio_data = np.append([audio_data], [pickle.load(f)])

# while 1:
#     try:
#         audio_data = np.vstack([audio_data, pickle.load(f)])
#     except EOFError:
#         print("end of file")
#         break
# f.close()
# audio_data.shape



# Splitting data (as the OpenMIC authors propose)

In [22]:
# Walk over all sample keys and record each one's row number in either
# idx_train or idx_test, depending on which partition the key belongs to.
#
# The row numbers are used in the next step to slice the feature, label and
# mask arrays, which are all ordered like sample_key.
idx_train, idx_test = [], []

for idx, key in enumerate(sample_key):
    if key in train_set:
        idx_train.append(idx)
    elif key in test_set:
        idx_test.append(idx)
    else:
        # This should never happen, but better safe than sorry.
        # Report the key itself: it is a string, so it cannot be used to
        # index sample_key (doing so would raise IndexError and hide this error).
        raise RuntimeError('Unknown sample key={}!'.format(key))

# Finally, cast the idx_* lists to numpy arrays so they can be used for fancy indexing
idx_train = np.asarray(idx_train)
idx_test = np.asarray(idx_test)

In [23]:
# Slice features, label confidences and annotation masks with the split
# indices; each pair is (train rows, test rows) of the same source array.
X_train, X_test = X[idx_train], X[idx_test]
Y_true_train, Y_true_test = Y_true[idx_train], Y_true[idx_test]
Y_mask_train, Y_mask_test = Y_mask[idx_train], Y_mask[idx_test]

In [24]:
# Print out the sliced shapes as a sanity check
print(X_train.shape)
print(X_test.shape)

(14915, 10, 128)
(5085, 10, 128)


## Fit Models
Iterate through the instrument classes, fitting a RandomForest model for each one.
For each instrument:
1. Find the subset of training (and testing) data that has been annotated for the current instrument.
2. Simplify the features to one observation per clip (the mean over time) instead of one observation per time slice within each clip.
3. Initialize a classifier.
4. Fit the classifier to the training data. A random forest is an ensemble of decision trees whose votes are combined into the final prediction.
(Starkey Note: I am initially worried that this model will overfit.)
5. Evaluate the classifier on the test data and print a report.

In [25]:
# Trained classifiers, keyed by instrument name
models = dict()

# We'll iterate over all instrument classes, and fit a model for each one.
# After training, we'll print a classification report for each instrument.
# NOTE: the loop variables stay defined after the loop finishes, holding the
# values from the last instrument processed.
for instrument in class_map:
    
    # Map the instrument name to its column number
    inst_num = class_map[instrument]
        
    # Step 1: sub-sample the data
    
    # First, we need to select down to the data for which we have annotations
    # This is what the mask arrays are for
    train_inst = Y_mask_train[:, inst_num]
    test_inst = Y_mask_test[:, inst_num]
    
    # Here, we're using the Y_mask_train column to slice out only the training examples
    # for which we have annotations for the given class
    X_train_inst = X_train[train_inst]
    
    # Step 2: simplify the data by averaging over time
    
    # This simplification could lead to poorer results in the future

    # Let's arrange the data for a sklearn Random Forest model 
    # Instead of having time-varying features, we'll summarize each clip by its
    # mean feature vector over time (axis 1 is the time-slice axis)
    X_train_inst_sklearn = np.mean(X_train_inst, axis=1)
    
    # Again, we slice the labels to the annotated examples
    # We threshold the label confidences at 0.5 to get binary labels
    Y_true_train_inst = Y_true_train[train_inst, inst_num] >= 0.5

    
    # Repeat the above slicing and dicing but for the test set
    X_test_inst = X_test[test_inst]
    X_test_inst_sklearn = np.mean(X_test_inst, axis=1)
    Y_true_test_inst = Y_true_test[test_inst, inst_num] >= 0.5

    # Step 3.
    # Initialize a new classifier (fixed random_state so reruns give the same forest)
    clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
    
    # Step 4.
    # Fit the classifier on the annotated training clips for this instrument
    clf.fit(X_train_inst_sklearn, Y_true_train_inst)

    # Step 5.
    # Finally, we'll evaluate the model on both train and test;
    # comparing the two reports shows how well the model generalizes
    Y_pred_train = clf.predict(X_train_inst_sklearn)
    Y_pred_test = clf.predict(X_test_inst_sklearn)
    
    print('-' * 52)
    print(instrument)
    print('\tTRAIN')
    print(classification_report(Y_true_train_inst, Y_pred_train))
    print('\tTEST')
    print(classification_report(Y_true_test_inst, Y_pred_test))
    
    # Store the classifier in our dictionary
    models[instrument] = clf

----------------------------------------------------
accordion
	TRAIN
              precision    recall  f1-score   support

       False       0.96      1.00      0.98      1159
        True       1.00      0.88      0.94       374

    accuracy                           0.97      1533
   macro avg       0.98      0.94      0.96      1533
weighted avg       0.97      0.97      0.97      1533

	TEST
              precision    recall  f1-score   support

       False       0.84      0.97      0.90       423
        True       0.77      0.32      0.45       115

    accuracy                           0.83       538
   macro avg       0.81      0.65      0.68       538
weighted avg       0.83      0.83      0.81       538

----------------------------------------------------
banjo
	TRAIN
              precision    recall  f1-score   support

       False       0.98      0.98      0.98      1148
        True       0.97      0.97      0.97       592

    accuracy                           0

# CNN OpenMic Model

In [40]:
# from keras.models import Sequential
# from keras.layers import Dense, Conv2D, Flatten

In [75]:
X_train.shape

(14915, 10, 128)

In [72]:
Y_true_train.shape

(14915, 20)

In [77]:
# NOTE(review): this experiment is disabled. If it is revived, two things look
# inconsistent with the arrays shown above and should be confirmed first:
#   * X_train has shape (14915, 10, 128), while input_shape=(10,128,1) expects
#     an extra trailing channel axis;
#   * a clip can contain several instruments, whereas a softmax layer makes the
#     20 outputs sum to 1 — a sigmoid output is the usual multi-label choice.
# #create the model
# CNN_model = Sequential()
# #add model layers
# CNN_model.add(Conv2D(64, kernel_size=3, activation='relu', input_shape=(10,128,1)))
# CNN_model.add(Conv2D(32, kernel_size=3, activation='relu'))
# CNN_model.add(Flatten())
# CNN_model.add(Dense(20, activation='softmax'))

In [79]:
# #compile model
# CNN_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Train the model with the OpenMIC data

In [80]:
# CNN_model.fit(X_train, Y_true_train, validation_data=(X_test, Y_true_test), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100

KeyboardInterrupt: 

In [26]:
print(models) # model for each instrument class

{'accordion': RandomForestClassifier(max_depth=8, random_state=0), 'banjo': RandomForestClassifier(max_depth=8, random_state=0), 'bass': RandomForestClassifier(max_depth=8, random_state=0), 'cello': RandomForestClassifier(max_depth=8, random_state=0), 'clarinet': RandomForestClassifier(max_depth=8, random_state=0), 'cymbals': RandomForestClassifier(max_depth=8, random_state=0), 'drums': RandomForestClassifier(max_depth=8, random_state=0), 'flute': RandomForestClassifier(max_depth=8, random_state=0), 'guitar': RandomForestClassifier(max_depth=8, random_state=0), 'mallet_percussion': RandomForestClassifier(max_depth=8, random_state=0), 'mandolin': RandomForestClassifier(max_depth=8, random_state=0), 'organ': RandomForestClassifier(max_depth=8, random_state=0), 'piano': RandomForestClassifier(max_depth=8, random_state=0), 'saxophone': RandomForestClassifier(max_depth=8, random_state=0), 'synthesizer': RandomForestClassifier(max_depth=8, random_state=0), 'trombone': RandomForestClassifier(

# Sample prediction example
This runs every per-instrument model on a single clip's features to determine whether each instrument is present in that clip.

In [27]:
# Pick one concrete example clip: the first test clip that carries an
# annotation for 'voice'. Selecting it explicitly from the test partition keeps
# this cell independent of temporary variables left behind by other cells.
example_instrument = 'voice'
example_mask = Y_mask_test[:, class_map[example_instrument]]
example_idx = np.flatnonzero(example_mask)[0]

# Average the clip's time slices into one feature vector (the same per-clip
# representation the models were fitted on). A single sample must be reshaped
# to (1, -1) before it is passed to a sklearn estimator.
sample = np.mean(X_test[example_idx], axis=0).reshape(1, -1)

# Ask every per-instrument model whether its instrument is present in the clip;
# each value is the one-element array returned by predict()
sample_predictions = {
    instrument: models[instrument].predict(sample)
    for instrument in class_map
}

print(sample_predictions)
    

{'accordion': array([False]), 'banjo': array([False]), 'bass': array([False]), 'cello': array([False]), 'clarinet': array([False]), 'cymbals': array([False]), 'drums': array([False]), 'flute': array([False]), 'guitar': array([False]), 'mallet_percussion': array([False]), 'mandolin': array([False]), 'organ': array([False]), 'piano': array([False]), 'saxophone': array([False]), 'synthesizer': array([ True]), 'trombone': array([False]), 'trumpet': array([False]), 'ukulele': array([False]), 'violin': array([False]), 'voice': array([ True])}


In [28]:
# # code used to transfer from sklearn model to coreML model

# import coremltools as ct
# import tensorflow as tf
# from tensorflow import keras

# # core_models = dict()

# # for instrument in class_map:
# #     new_model = ct.converters.sklearn.convert(models[instrument])
# #     core_models[instrument] = new_model

# # for instrument in class_map:
# #     coreStr = "RTC_" + instrument +  ".mlmodel"
# #     core_models[instrument].save(coreStr)


# vggish_model = tf.keras.models.Sequential()
# vggish_model.load_weights("./openmic/vggish/_model/vggish_model.ckpt")
# #vggish_model = tf.keras.models.load_model("./openmic/vggish/_model")
# #vggish_model = ct.convert(, source='tensorflow')

In [29]:
# core_sample_pred = dict()
# for instrument in class_map:
#     inst_model = core_models[instrument]
#     core_pred = inst_model.predict()
#     core_sample_pred[instrument] = core_pred

# Using New Data with Model :

In [30]:
# We need soundfile to load audio data
import soundfile as sf

# For audio playback
from IPython.display import Audio

In [31]:
# And the openmic-vggish preprocessor feature extraction
import openmic.vggish

2022-12-08 19:35:24.808255: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [32]:
# Read one clip from the dataset's audio folder (waveform plus its sampling rate)
example_clip_path = os.path.join(DATA_ROOT, 'audio/000/000046_3840.ogg')
audio, rate = sf.read(example_clip_path)

# Inline player for the clip; the waveform is passed transposed
Audio(data=audio.T, rate=rate)

In [33]:
# Run openmic's VGGish feature extraction on the raw waveform; the call returns
# the frame time stamps together with the per-frame feature rows.
time_points, features = openmic.vggish.waveform_to_features(audio, rate)

2022-12-08 19:35:34.989402: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:Restoring parameters from /Users/starkey/Desktop/machine learning/Music Modeling/openmic/vggish/_model/vggish_model.ckpt


2022-12-08 19:35:35.451239: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled


In [34]:
time_points

array([0.  , 0.96, 1.92, 2.88, 3.84, 4.8 , 5.76, 6.72, 7.68, 8.64])

In [35]:
features.shape

(10, 128)

In [36]:
def report_instrument_probabilities(models, features, threshold=0.5):
    """Print and return each model's probability that its instrument is present.

    Parameters
    ----------
    models : dict
        Maps instrument name -> fitted classifier exposing ``predict_proba``.
    features : array-like
        Per-frame feature rows of one clip. They are averaged over axis 0 so
        the clip is scored as a single row.
    threshold : float, optional
        An instrument is reported as present when its probability is strictly
        greater than this value (default 0.5).

    Returns
    -------
    dict
        Maps instrument name -> probability taken from column 1 of
        ``predict_proba`` (assumes column 1 is the "present" class, i.e. the
        model was fitted on False/True labels — confirm for other models).
    """
    # keepdims=True keeps a leading axis of length 1, i.e. one sample row
    feature_mean = np.mean(features, axis=0, keepdims=True)

    probabilities = {}
    for instrument, clf in models.items():
        prob = clf.predict_proba(feature_mean)[0, 1]
        probabilities[instrument] = prob

        print('Prob[{:20s} =1] = {:.2f}'.format(instrument, prob))
        if prob > threshold:
            print("{} is classified to exist in this sample".format(instrument))

    return probabilities


# Apply every instrument classifier to the extracted clip features
clip_probabilities = report_instrument_probabilities(models, features)

Prob[accordion            =1] = 0.14
Prob[banjo                =1] = 0.03
Prob[bass                 =1] = 0.05
Prob[cello                =1] = 0.65
cello is classified to exist in this sample
Prob[clarinet             =1] = 0.19
Prob[cymbals              =1] = 0.17
Prob[drums                =1] = 0.06
Prob[flute                =1] = 0.29
Prob[guitar               =1] = 0.09
Prob[mallet_percussion    =1] = 0.27
Prob[mandolin             =1] = 0.11
Prob[organ                =1] = 0.23
Prob[piano                =1] = 0.68
piano is classified to exist in this sample
Prob[saxophone            =1] = 0.41
Prob[synthesizer          =1] = 0.37
Prob[trombone             =1] = 0.20
Prob[trumpet              =1] = 0.16
Prob[ukulele              =1] = 0.03
Prob[violin               =1] = 0.62
violin is classified to exist in this sample
Prob[voice                =1] = 0.58
voice is classified to exist in this sample


In [37]:
# Read a clip referenced by a bare file name (resolved against the working directory)
audio, rate = sf.read('deftones_sample.ogg')

# Inline player for the clip; the waveform is passed transposed
Audio(data=audio.T, rate=rate)

In [38]:
# New audio has to go through openmic's "VGGish" feature extraction before
# the classifiers can score it.
# NOTE(review): compress=True is passed explicitly here but not in the earlier
# extraction call — presumably it is the default; confirm so both clips are
# processed identically.
time_points, features = openmic.vggish.waveform_to_features(audio, rate, compress=True)



INFO:tensorflow:Restoring parameters from /Users/starkey/Desktop/machine learning/Music Modeling/openmic/vggish/_model/vggish_model.ckpt


In [39]:
# Collapse the clip's frames into one mean feature row (kept 2-D for sklearn)
feature_mean = np.mean(features, axis=0, keepdims=True)

# Score the clip with every per-instrument model
for instrument, clf in models.items():
    # Row 0 is our single clip; column 1 is read as the "present" probability
    perc = clf.predict_proba(feature_mean)[0][1]

    print('Prob[{:20s} =1] = {:.2f}'.format(instrument, perc))
    if not perc > 0.5:
        continue
    print("{} is classified to exist in this sample".format(instrument))

Prob[accordion            =1] = 0.05
Prob[banjo                =1] = 0.14
Prob[bass                 =1] = 0.42
Prob[cello                =1] = 0.04
Prob[clarinet             =1] = 0.04
Prob[cymbals              =1] = 0.76
cymbals is classified to exist in this sample
Prob[drums                =1] = 0.85
drums is classified to exist in this sample
Prob[flute                =1] = 0.05
Prob[guitar               =1] = 0.84
guitar is classified to exist in this sample
Prob[mallet_percussion    =1] = 0.07
Prob[mandolin             =1] = 0.09
Prob[organ                =1] = 0.05
Prob[piano                =1] = 0.03
Prob[saxophone            =1] = 0.13
Prob[synthesizer          =1] = 0.35
Prob[trombone             =1] = 0.11
Prob[trumpet              =1] = 0.18
Prob[ukulele              =1] = 0.03
Prob[violin               =1] = 0.10
Prob[voice                =1] = 0.62
voice is classified to exist in this sample
