In [43]:
from collections import defaultdict
import datetime
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Activation, Concatenate, Conv2D, Dense, Dropout, Flatten, MaxPooling2D
from keras.models import Input, Model, load_model, model_from_json
from librosa import cqt
import numpy as np
import os
import pickle
import shutil
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import sys
from warnings import warn
from zipfile import ZipFile

module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
from music_transcription.pitch_detection.cnn_cqt_pitch_detection import CnnCqtFeatureExtractor
from music_transcription.pitch_detection.read_data import get_wav_and_truth_files, read_data_y

In [4]:
# --- Dataset selection -------------------------------------------------------
# Files of DATASETS_CV are split into cross-validation folds; files of
# DATASETS_ADDITIONAL are appended to the training part of every fold.
DATASETS_CV = {1, 2}
DATASETS_ADDITIONAL = {3, 9, 10, 11}

# --- Audio / label parameters (forwarded to read_data_y) ---------------------
sample_rate = 44100
subsampling_step = 1
# Inclusive pitch range; the models get max_pitch - min_pitch + 1 output units.
# NOTE(review): presumably MIDI note numbers - confirm against read_data_y.
min_pitch, max_pitch = 40, 88
onset_group_threshold_seconds = 0.05

# --- Feature extraction (forwarded to CnnCqtFeatureExtractor) ----------------
image_data_format = 'channels_first'
# One dict per spectrogram; the model builders create one input branch per entry
# of the resulting feature list.
cqt_configs = [
    dict(hop_length=512, fmin=55.0, n_bins=180, bins_per_octave=36, scale=False),
]

# --- Training settings used by model.compile() / model.fit() -----------------
LOSS, OPTIMIZER, METRICS = 'binary_crossentropy', 'adam', None
BATCH_SIZE = 256

In [5]:
# Locate the audio files and their ground-truth descriptors for both dataset groups.
# Each call returns two lists that are indexed together further down.
# The 'Skipping ...' warnings in this cell's output are raised for files that cannot be used.
(wav_file_paths_cv,
 truth_dataset_format_tuples_cv) = get_wav_and_truth_files(DATASETS_CV)
(wav_file_paths_additional,
 truth_dataset_format_tuples_additional) = get_wav_and_truth_files(DATASETS_ADDITIONAL)

  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + path_to_wav + ', not a .wav file.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')
  warn('Skipping ' + wav_file + ', no truth found.')


In [6]:
# Build the cross-validation folds once so every experiment below reuses the same features.
# Each entry of `folds` is a tuple (list_of_X_train, y_train, list_of_X_test, y_test).
folds = []
# Fixed random_state: the shuffled 5-way split is reproducible and can be regenerated
# later by calling k_fold.split() again on the same list.
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for k, (train_indices, test_indices) in enumerate(k_fold.split(wav_file_paths_cv)):
    # if k > 0:
    #     print('Skipping split {}'.format(k))
    #     continue
    
    # Training set = CV files selected for training + ALL files of the additional datasets.
    # Test set = CV files only, so the additional datasets never influence evaluation.
    wav_file_paths_train = [wav_file_paths_cv[i] for i in train_indices] + wav_file_paths_additional
    truth_dataset_format_tuples_train = [truth_dataset_format_tuples_cv[i] for i in train_indices] + truth_dataset_format_tuples_additional
    wav_file_paths_test = [wav_file_paths_cv[i] for i in test_indices]
    truth_dataset_format_tuples_test = [truth_dataset_format_tuples_cv[i] for i in test_indices]
    
    # NOTE(review): the *_valid return values are presumably the inputs that survived
    # read_data_y's filtering (see the 'Skipping ...' warnings) - they are unused here.
    data_train, y_train, wav_file_paths_train_valid, truth_dataset_format_tuples_train_valid = read_data_y(
        wav_file_paths_train, truth_dataset_format_tuples_train,
        sample_rate, subsampling_step,
        min_pitch, max_pitch,
        onset_group_threshold_seconds=onset_group_threshold_seconds
    )
    
    # The extractor is fitted on the training data of this fold only ...
    feature_extractor = CnnCqtFeatureExtractor(image_data_format, sample_rate, cqt_configs)
    list_of_X_train, sample_file_indexes_train = feature_extractor.fit_transform(data_train)

    data_test, y_test, wav_file_paths_test_valid, truth_dataset_format_tuples_test_valid = read_data_y(
        wav_file_paths_test, truth_dataset_format_tuples_test,
        sample_rate, subsampling_step,
        min_pitch, max_pitch,
        onset_group_threshold_seconds=onset_group_threshold_seconds
    )
    # ... and then only applied (not re-fitted) to the held-out test data.
    list_of_X_test, sample_file_indexes_test = feature_extractor.transform(data_test, verbose=True)

    # Leftover from a class-based implementation (references `self`); sample weights
    # are not used in this notebook.
    # if self.config['sample_weights'] == 'balanced':
        # validation_data = (list_of_X_test, y_test, self._get_sample_weights(sample_file_indexes_test,
        #                                                                     truth_dataset_format_tuples_test_valid))
    # else:
    
    folds.append((list_of_X_train, y_train, list_of_X_test, y_test))

  warn('Skipping {}, pitch {} is out of range.'.format(path_to_xml, pitch))
  warn('Skipping {}, pitch {} is out of range.'.format(path_to_xml, pitch))
  warn('Skipping {}, pitch {} is out of range.'.format(path_to_xml, pitch))
  warn('Skipping {}, pitch {} is out of range.'.format(path_to_xml, pitch))
  warn('Skipping {}, pitch {} is out of range.'.format(path_to_xml, pitch))
  warn('Skipping ' + path_to_wav + ', cannot handle stereo signal.')
  warn('Skipping {}, pitch {} is out of range.'.format(path_to_xml, pitch))


Creating spectrograms
Fitting standard scalers for each spectrogram and bin
(515965, 180)
3.63677319949
22.0932928692
Standardizing for each spectrogram and bin
-2.02342757837e-16
1.0
(4466, 16, 180)
Reshaping data
(4466, 1, 16, 180)


  warn('Skipping {}, pitch {} is out of range.'.format(path_to_xml, pitch))


Creating spectrograms
(73188, 180)
5.06737530463
27.5702815749
Standardizing for each spectrogram and bin
0.0824521316465
1.19121556558
(633, 16, 180)
Reshaping data
(633, 1, 16, 180)
Creating spectrograms
Fitting standard scalers for each spectrogram and bin
(527000, 180)
3.56672113525
21.6869298092
Standardizing for each spectrogram and bin
-2.03703898456e-16
1.0
(4404, 16, 180)
Reshaping data
(4404, 1, 16, 180)
Creating spectrograms
(62153, 180)
5.91534937004
30.9263472148
Standardizing for each spectrogram and bin
0.149010008662
1.42025840175
(695, 16, 180)
Reshaping data
(695, 1, 16, 180)
Creating spectrograms
Fitting standard scalers for each spectrogram and bin
(504083, 180)
3.68388396785
22.797992656
Standardizing for each spectrogram and bin
1.54527309082e-16
1.0
(4167, 16, 180)
Reshaping data
(4167, 1, 16, 180)
Creating spectrograms
(85070, 180)
4.58840326211
23.1412538061
Standardizing for each spectrogram and bin
0.0728228072821
1.1014277565
(932, 16, 180)
Reshaping data
(9

In [46]:
# Tally the cross-validation files by the second element of their truth tuple
# (presumably the dataset id - the printed keys match DATASETS_CV).
counts = defaultdict(int)
for ds in (truth_tuple[1] for truth_tuple in truth_dataset_format_tuples_cv):
    counts[ds] += 1
print(counts)

# Repeat the tally for the test part of every fold to see how evenly the plain
# (non-stratified) KFold split distributes the datasets.
for k, (train_indices, test_indices) in enumerate(k_fold.split(wav_file_paths_cv)):
    print(k)
    counts_test_k = defaultdict(int)
    for ds in (truth_dataset_format_tuples_cv[i][1] for i in test_indices):
        counts_test_k[ds] += 1
    print(counts_test_k)

defaultdict(<class 'int'>, {1: 400, 2: 252})
0
defaultdict(<class 'int'>, {1: 83, 2: 48})
1
defaultdict(<class 'int'>, {1: 89, 2: 42})
2
defaultdict(<class 'int'>, {1: 77, 2: 53})
3
defaultdict(<class 'int'>, {1: 72, 2: 58})
4
defaultdict(<class 'int'>, {1: 79, 2: 51})


In [37]:
def predict(model, proba_threshold, list_of_X, y, epsilon=1e-7):
    """Turn the model's per-pitch probabilities into a binary label matrix.

    Args:
        model: Object with a ``predict(list_of_X)`` method returning a 2-D
            array of probabilities (one row per sample, one column per pitch).
        proba_threshold: A pitch is labelled 1 when its probability is
            strictly greater than this value.
        list_of_X: Model input, passed to ``model.predict`` unchanged.
        y: Unused. Kept only so existing callers keep working; it is neither
            read nor modified.
        epsilon: Tolerance used by the fallback below.

    Returns:
        ``np.int8`` array with the same shape as the probability matrix.
        Every row contains at least one 1: if no probability exceeds the
        threshold, the first column whose probability lies within ``epsilon``
        of the row maximum is set.
    """
    proba_matrix = model.predict(list_of_X)
    labels = (proba_matrix > proba_threshold).astype(np.int8)

    # Make sure at least one pitch is returned for every sample.
    empty_rows = np.flatnonzero(labels.sum(axis=1) == 0)
    if empty_rows.size:
        candidates = proba_matrix[empty_rows]
        near_max = candidates > candidates.max(axis=1, keepdims=True) - epsilon
        # argmax on a boolean row yields the first True, i.e. the first near-maximal pitch.
        labels[empty_rows, near_max.argmax(axis=1)] = 1

    return labels

def print_metrics(y, y_predicted):
    """Print the accuracy and a per-pitch classification report.

    Args:
        y: Ground-truth labels.
        y_predicted: Predicted labels, same layout as ``y``.

    The report rows are named after the pitches ``min_pitch`` .. ``max_pitch``
    (inclusive), taken from the notebook's configuration cell.
    """
    pitch_names = list(map(str, range(min_pitch, max_pitch + 1)))

    accuracy = accuracy_score(y, y_predicted)
    print('Accuracy: {}'.format(accuracy))

    report = classification_report(y, y_predicted, target_names=pitch_names)
    print(report)

In [47]:
def create_model_1(list_of_X, n_output_units, dropout_conv=0.25, dropout_dense=0.5):
    """Build and compile a CNN with two 20-filter conv/pool stages per spectrogram.

    One input branch is created per array in ``list_of_X``:
    Conv2D(20, 7x3) -> ReLU -> MaxPool(1x3) -> Conv2D(20, 3x3) -> ReLU ->
    MaxPool(1x3) -> Dropout -> Flatten. The branches are concatenated (when
    there is more than one) and fed through Dense(256) -> ReLU -> Dropout ->
    sigmoid output layer. The model summary is printed.

    Args:
        list_of_X: Feature arrays; ``X.shape[1:]`` of each defines an input shape.
        n_output_units: Number of sigmoid output units.
        dropout_conv: Dropout rate after the convolutional stages. The default
            equals the rate that used to be hard-coded.
        dropout_dense: Dropout rate after the dense layer. The default equals
            the rate that used to be hard-coded.

    Returns:
        The model, compiled with the module-level LOSS, OPTIMIZER and METRICS.

    The two dropout keywords exist so this builder can be passed to
    ``train_and_evaluate``, which always supplies them by keyword.
    """
    inputs = []
    conv_blocks = []
    for X in list_of_X:
        spectrogram = Input(shape=X.shape[1:])
        inputs.append(spectrogram)

        conv = Conv2D(20, (7, 3), padding='valid')(spectrogram)
        conv = Activation('relu')(conv)
        # Pool size 1 along the first spatial axis: only the second axis is reduced.
        conv = MaxPooling2D(pool_size=(1, 3))(conv)
        conv = Conv2D(20, (3, 3), padding='valid')(conv)
        conv = Activation('relu')(conv)
        conv = MaxPooling2D(pool_size=(1, 3))(conv)
        conv = Dropout(dropout_conv)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)

    # Concatenate is only applied when there are several branches to merge.
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    z = Dense(256)(z)
    z = Activation('relu')(z)
    z = Dropout(dropout_dense)(z)
    output = Dense(n_output_units, activation='sigmoid')(z)

    model = Model(inputs, output)
    model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)
    model.summary()

    return model

def create_model_2(list_of_X, n_output_units, dropout_conv=0.25, dropout_dense=0.5):
    """Build and compile a CNN with a 10-filter and a 20-filter conv/pool stage.

    Args:
        list_of_X: Feature arrays; one input branch is created per array, with
            input shape ``X.shape[1:]``.
        n_output_units: Number of sigmoid output units.
        dropout_conv: Dropout rate applied at the end of every conv branch.
        dropout_dense: Dropout rate applied after the dense layer.

    Returns:
        The model, compiled with the module-level LOSS, OPTIMIZER and METRICS.
    """
    def conv_branch(tensor):
        # Two Conv2D -> ReLU -> MaxPool(1x3) stages, then dropout and flatten.
        for n_filters, kernel_size in ((10, (7, 3)), (20, (3, 3))):
            tensor = Conv2D(n_filters, kernel_size, padding='valid')(tensor)
            tensor = Activation('relu')(tensor)
            tensor = MaxPooling2D(pool_size=(1, 3))(tensor)
        tensor = Dropout(dropout_conv)(tensor)
        return Flatten()(tensor)

    inputs, branches = [], []
    for X in list_of_X:
        spectrogram = Input(shape=X.shape[1:])
        inputs.append(spectrogram)
        branches.append(conv_branch(spectrogram))

    # Merge the branches only when there is more than one.
    if len(branches) > 1:
        hidden = Concatenate()(branches)
    else:
        hidden = branches[0]
    hidden = Dense(256)(hidden)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(dropout_dense)(hidden)
    output = Dense(n_output_units, activation='sigmoid')(hidden)

    model = Model(inputs, output)
    model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)
    # model.summary()

    return model

def create_model_3(list_of_X, n_output_units, dropout_conv=0.25, dropout_dense=0.5):
    """Build and compile a shallow CNN with one wide convolution per spectrogram.

    One input branch is created per array in ``list_of_X``:
    Conv2D(49, 16x6) -> ReLU -> MaxPool(1x29) -> Dropout -> Flatten. The
    branches are concatenated (when there is more than one) and fed through
    Dense(32) -> ReLU -> Dropout -> sigmoid output layer. The model summary is
    printed.

    Args:
        list_of_X: Feature arrays; ``X.shape[1:]`` of each defines an input shape.
        n_output_units: Number of sigmoid output units.
        dropout_conv: Dropout rate after the convolutional stage. The default
            equals the rate that used to be hard-coded.
        dropout_dense: Dropout rate after the dense layer. The default equals
            the rate that used to be hard-coded.

    Returns:
        The model, compiled with the module-level LOSS, OPTIMIZER and METRICS.

    The two dropout keywords exist so this builder can be passed to
    ``train_and_evaluate``, which always supplies them by keyword.
    """
    inputs = []
    conv_blocks = []
    for X in list_of_X:
        spectrogram = Input(shape=X.shape[1:])
        inputs.append(spectrogram)

        conv = Conv2D(49, (16, 6), padding='valid')(spectrogram)
        conv = Activation('relu')(conv)
        # Pool size 1 along the first spatial axis: only the second axis is reduced.
        conv = MaxPooling2D(pool_size=(1, 29))(conv)
        conv = Dropout(dropout_conv)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)

    # Concatenate is only applied when there are several branches to merge.
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    z = Dense(32)(z)
    z = Activation('relu')(z)
    z = Dropout(dropout_dense)(z)
    output = Dense(n_output_units, activation='sigmoid')(z)

    model = Model(inputs, output)
    model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)
    model.summary()

    return model

def create_model_4(list_of_X, n_output_units, dropout_conv=0.25, dropout_dense=0.5):
    """Build and compile a CNN with a single 10-filter conv/pool stage per spectrogram.

    Args:
        list_of_X: Feature arrays; one input branch is created per array, with
            input shape ``X.shape[1:]``.
        n_output_units: Number of sigmoid output units.
        dropout_conv: Dropout rate applied at the end of every conv branch.
        dropout_dense: Dropout rate applied after the dense layer.

    Returns:
        The model, compiled with the module-level LOSS, OPTIMIZER and METRICS.
    """
    inputs = []
    flattened_branches = []
    for X in list_of_X:
        spectrogram = Input(shape=X.shape[1:])
        inputs.append(spectrogram)

        # Apply the branch layers one after another to the input tensor.
        branch = spectrogram
        for layer in (Conv2D(10, (7, 3), padding='valid'),
                      Activation('relu'),
                      MaxPooling2D(pool_size=(1, 3)),
                      Dropout(dropout_conv),
                      Flatten()):
            branch = layer(branch)
        flattened_branches.append(branch)

    # Merge the branches only when there is more than one.
    if len(flattened_branches) > 1:
        head = Concatenate()(flattened_branches)
    else:
        head = flattened_branches[0]

    # Dense head; the last layer is the sigmoid output layer.
    for layer in (Dense(256),
                  Activation('relu'),
                  Dropout(dropout_dense),
                  Dense(n_output_units, activation='sigmoid')):
        head = layer(head)

    model = Model(inputs, head)
    model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)
    # model.summary()

    return model

def create_model_5(list_of_X, n_output_units, dropout_conv=0.25, dropout_dense=0.5):
    """Build the same network as ``create_model_2`` and additionally print its summary.

    The architecture used to be a line-for-line copy of ``create_model_2``; it
    is now built by delegating to that function so the two cannot drift apart
    unnoticed.

    Args:
        list_of_X: Feature arrays, forwarded to ``create_model_2``.
        n_output_units: Number of sigmoid output units.
        dropout_conv: Dropout rate applied at the end of every conv branch.
        dropout_dense: Dropout rate applied after the dense layer.

    Returns:
        The compiled model returned by ``create_model_2``.
    """
    model = create_model_2(list_of_X, n_output_units,
                           dropout_conv=dropout_conv, dropout_dense=dropout_dense)
    model.summary()

    return model

In [51]:
def train_and_evaluate(folds, create_model, dropout_conv, dropout_dense, proba_threshold):
    """Train a fresh model on every fold and print metrics pooled over all test folds.

    Args:
        folds: Iterable of ``(list_of_X_train, y_train, list_of_X_test, y_test)``
            tuples.
        create_model: Model builder called as
            ``create_model(list_of_X_train, n_output_units, dropout_conv=..., dropout_dense=...)``.
        dropout_conv: Forwarded to ``create_model``.
        dropout_dense: Forwarded to ``create_model``.
        proba_threshold: Forwarded to ``predict``.

    Raises:
        ValueError: If ``folds`` yields no fold.

    The predictions of all folds are concatenated and evaluated together by
    ``print_metrics``; nothing is returned.
    """
    n_output_units = max_pitch - min_pitch + 1
    y_test_per_fold = []
    y_test_predicted_per_fold = []
    for list_of_X_train, y_train, list_of_X_test, y_test in folds:
        model = create_model(list_of_X_train, n_output_units,
                             dropout_conv=dropout_conv, dropout_dense=dropout_dense)
        # No validation data is passed to fit(), so early stopping watches the
        # training loss; epochs=1000 is only an upper bound.
        # NOTE(review): a new Keras model is built for every fold - check memory
        # use if this is called many times in one kernel session.
        model.fit(list_of_X_train, y_train,
                  epochs=1000,
                  batch_size=BATCH_SIZE,
                  sample_weight=None,
                  class_weight=None,
                  callbacks=[EarlyStopping(monitor='loss', patience=6)],
                  verbose=0)

        y_test_per_fold.append(y_test)
        y_test_predicted_per_fold.append(predict(model, proba_threshold, list_of_X_test, y_test))

    if not y_test_per_fold:
        raise ValueError('folds is empty, nothing to train or evaluate.')

    # Concatenate once instead of growing the arrays inside the loop.
    print_metrics(np.concatenate(y_test_per_fold), np.concatenate(y_test_predicted_per_fold))

# Model builders to evaluate, as (display name, factory) pairs.
models = [('model_2', create_model_2)]

# Repeat the whole experiment five times; no random seed is set for training,
# so the repetitions show the run-to-run spread of the metrics.
for i in range(5):
    for model_name, create_model in models:
        print(model_name)
        # Hyper-parameter grid, iterated in the order conv dropout, dense dropout, threshold.
        for dropout_conv, dropout_dense, proba_threshold in (
                (conv_rate, dense_rate, threshold)
                for conv_rate in [0.2]
                for dense_rate in [0.4]
                for threshold in [0.5]):
            print('dropout_conv={}, dropout_dense={}, proba_threshold={}'.format(
                dropout_conv, dropout_dense, proba_threshold))
            train_and_evaluate(folds, create_model, dropout_conv, dropout_dense, proba_threshold)

model_2
dropout_conv=0.2, dropout_dense=0.4, proba_threshold=0.5
Accuracy: 0.8852672750977836
             precision    recall  f1-score   support

         40       0.95      0.95      0.95        56
         41       1.00      1.00      1.00        20
         42       1.00      0.91      0.95        11
         43       1.00      0.72      0.84        50
         44       1.00      1.00      1.00        11
         45       1.00      0.99      1.00       187
         46       1.00      1.00      1.00        22
         47       0.94      0.99      0.96       119
         48       0.99      0.92      0.95       332
         49       0.97      0.98      0.98       101
         50       1.00      0.99      0.99       267
         51       0.93      0.93      0.93        29
         52       0.91      0.98      0.94       322
         53       0.93      0.75      0.83       108
         54       0.97      0.91      0.94       246
         55       0.92      0.95      0.93       243
    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy: 0.8889178617992177
             precision    recall  f1-score   support

         40       1.00      0.96      0.98        56
         41       1.00      1.00      1.00        20
         42       1.00      1.00      1.00        11
         43       1.00      0.68      0.81        50
         44       1.00      1.00      1.00        11
         45       1.00      0.97      0.99       187
         46       1.00      0.95      0.98        22
         47       0.94      0.98      0.96       119
         48       0.98      0.93      0.96       332
         49       0.99      1.00      1.00       101
         50       1.00      0.99      0.99       267
         51       1.00      1.00      1.00        29
         52       0.92      0.98      0.95       322
         53       0.95      0.73      0.83       108
         54       0.97      0.90      0.93       246
         55       0.89      0.95      0.92       243
         56       0.99      0.96      0.98       169
         57     