In [12]:
import numpy as np
import pandas as pd

from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout, Dense, Activation
from keras.callbacks import ModelCheckpoint, TerminateOnNaN
from keras.optimizers import RMSprop

## Load the data we created in data_read_and_process.ipynb

In [13]:
# Read the four pre-split arrays (inputs and targets for training and validation)
# from disk, in the same order as the names on the left-hand side.
X_train, X_val, y_train, y_val = map(np.load, (
    'train_and_val/X_train.npy',
    'train_and_val/X_val.npy',
    'train_and_val/y_train.npy',
    'train_and_val/y_val.npy',
))

## Build a Model

In [14]:
def lstm(n_lstm_layers = 2, n_dense_layers = 3, n_lstm_nodes = 512, dropout_rate = 0.6,
         input_shape = (16, 89), n_outputs = 89):
    """Generate a keras Sequential model of the form as described in Figure 16 of
    https://www.tandfonline.com/doi/full/10.1080/25765299.2019.1649972

    The network is a stack of LSTM layers followed by a stack of ReLU dense layers
    and a sigmoid output layer. Every LSTM layer and every hidden dense layer is
    followed by Dropout(dropout_rate).

    Parameters
    ----------
    n_lstm_layers : int
        Number of stacked LSTM layers (must be >= 1). All but the last one return
        full sequences so that the next LSTM receives a 3D input.
    n_dense_layers : int
        Number of hidden dense layers (must be >= 0) placed between the last LSTM
        and the output layer. Each has n_lstm_nodes // 2 units.
    n_lstm_nodes : int
        Number of units in every LSTM layer.
    dropout_rate : float
        Dropout rate applied after every LSTM and hidden dense layer.
    input_shape : tuple
        Per-sample input shape handed to the first LSTM layer. Defaults to the
        previously hard-coded (16, 89).
    n_outputs : int
        Width of the sigmoid output layer. Defaults to the previously hard-coded 89.

    Returns
    -------
    A Sequential model compiled with binary cross-entropy and RMSProp. Callers are
    free to compile it again with a different loss/optimizer.

    Raises
    ------
    ValueError
        If n_lstm_layers < 1 or n_dense_layers < 0.
    """
    if n_lstm_layers < 1:
        raise ValueError('n_lstm_layers must be at least 1, got %r' % (n_lstm_layers,))
    if n_dense_layers < 0:
        raise ValueError('n_dense_layers must be non-negative, got %r' % (n_dense_layers,))

    model = Sequential()

    # Recurrent stack: only the first layer declares the input shape, and only the
    # last layer collapses the sequence into a single vector.
    for layer_idx in range(n_lstm_layers):
        lstm_kwargs = {'return_sequences': layer_idx < n_lstm_layers - 1}
        if layer_idx == 0:
            lstm_kwargs['input_shape'] = tuple(input_shape)
        model.add(LSTM(n_lstm_nodes, **lstm_kwargs))
        model.add(Dropout(dropout_rate))

    # Dense stack: every hidden layer gets a non-linearity (consecutive linear
    # layers would collapse into one) and honours the caller's dropout_rate.
    for _ in range(n_dense_layers):
        model.add(Dense(n_lstm_nodes // 2))
        model.add(Activation('relu'))
        model.add(Dropout(dropout_rate))

    model.add(Dense(n_outputs))
    model.add(Activation('sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'RMSProp')
    return model

## Custom Loss and Metrics

\begin{equation*}
bce\_loss = -\frac{1}{N} \sum_{i=1}^{N} \left[ y_i \log(p(y_i)) + (1 - y_i) \log(1 - p(y_i)) \right]
\end{equation*}
\begin{equation*}
maestro\_loss = 2 \cdot Harshness \cdot \left\lvert \frac{d_{true} - d_{pred}}{d_{true} + d_{pred}} \right\rvert
\end{equation*}
<br>
\begin{equation*}
total\_loss = MIN(2 * bce\_loss, bce\_loss + maestro\_loss)
\end{equation*}

where N = num_keys_piano, <b>Harshness</b> is a constant to be determined, and <b>d</b> is the normalized duration. I'll call the total loss the <b>Maestro Loss Function</b>, since it pays special attention to the timing of the notes. It is usually composed of a Binary Cross Entropy term plus an additional term proportional to the relative error in duration between $d_{true}$ and $d_{pred}$. However, we cap the total_loss at twice the bce_loss, so the duration term can never contribute more than the bce_loss itself. We also define custom metrics; see their docstrings for descriptions.

In [15]:
import keras.backend as K
import tensorflow as tf

def maestro_loss_wr(harshness):
    """Build the maestro loss for a given 'harshness'.

    The returned loss penalizes misclassification on every column except the last
    one (binary cross-entropy) and adds a term proportional to the relative error
    in the prediction of the last column (which represents the duration). The
    proportionality constant is the 'harshness' of the maestro in regards to
    timing.

    The total is min(2 * bce_loss, bce_loss + dur_loss), i.e. the duration term
    is never allowed to contribute more than the cross-entropy term itself.

    Parameters
    ----------
    harshness : float
        Weight of the duration term.

    Returns
    -------
    A function maestro_loss(ytrue, ypred) usable as a keras loss. Both arguments
    are indexed as 2D tensors whose last column is the duration.
    """
    def maestro_loss(ytrue, ypred):
        keys_true = ytrue[:, :-1]
        # Keep the probabilities strictly inside (0, 1): a saturated sigmoid
        # output of exactly 0 or 1 would otherwise make log() return -inf and
        # turn the whole loss into inf/NaN.
        keys_pred = K.clip(ypred[:, :-1], K.epsilon(), 1 - K.epsilon())

        # Standard binary cross-entropy
        bce_loss = - K.mean(keys_true * K.log(keys_pred) +
                            (1 - keys_true) * K.log(1 - keys_pred))

        # Duration error term
        dur_true = ytrue[:, -1]
        dur_pred = ypred[:, -1]
        dur_loss = 2 * harshness * K.mean(K.abs((dur_true - dur_pred) /
                                                (dur_true + dur_pred + K.epsilon())))

        # Often times, ytrue[:, -1] elements will be zero, which may spike
        # dur_loss. To control this, dur_loss is capped at bce_loss:
        #   bce + min(bce, dur) == min(2 * bce, bce + dur)
        # K.minimum is a tensor op, so this works in graph mode as well as under
        # AutoGraph, unlike a Python `if` on tensors.
        return K.minimum(2 * bce_loss, bce_loss + dur_loss)

    return maestro_loss

def precision_mod(ytrue, ypred):
    """Precision computed on every column except the last one (the last column
    is not a classification target).

    Returns rounded true positives divided by rounded predicted positives, with
    K.epsilon() added to the denominator to avoid division by zero."""
    keys_true = ytrue[:, :-1]
    keys_pred = ypred[:, :-1]

    hits = K.sum(K.round(keys_true * keys_pred))
    predicted = K.sum(K.round(keys_pred))
    return hits / (predicted + K.epsilon())

def recall_mod(ytrue, ypred):
    """Recall computed on every column except the last one (the last column is
    not a classification target).

    Returns rounded true positives divided by the sum of the true labels, with
    K.epsilon() added to the denominator to avoid division by zero."""
    keys_true = ytrue[:, :-1]
    keys_pred = ypred[:, :-1]

    hits = K.sum(K.round(keys_true * keys_pred))
    actual = K.sum(keys_true)
    return hits / (actual + K.epsilon())

def f1_score_mod(ytrue, ypred):
    """F1 score built from precision_mod and recall_mod, i.e. their harmonic
    mean, so the last column (which is not a classification) is excluded.
    K.epsilon() in the denominator guards against precision + recall == 0."""
    p = precision_mod(ytrue, ypred)
    r = recall_mod(ytrue, ypred)

    numerator = 2 * (p * r)
    denominator = p + r + K.epsilon()
    return numerator / denominator

def dur_error(ytrue, ypred):
    """Metric reporting only the error of the duration prediction (last column):
    twice the mean absolute value of (true - pred) / (true + pred + epsilon)."""
    dur_true = ytrue[:, -1]
    dur_pred = ypred[:, -1]

    relative_gap = (dur_true - dur_pred) / (dur_true + dur_pred + K.epsilon())
    return 2 * K.mean(K.abs(relative_gap))

def maestro_dur_loss_wr(harshness):
    """Build a metric equal to the duration term of the maestro loss (before any
    capping), weighted by `harshness`. Intended to be tracked next to the loss so
    that its components can be separated during analysis."""
    def maestro_dur_loss(ytrue, ypred):
        dur_true = ytrue[:, -1]
        dur_pred = ypred[:, -1]

        relative_gap = (dur_true - dur_pred) / (dur_true + dur_pred + K.epsilon())
        return 2 * harshness * K.mean(K.abs(relative_gap))

    return maestro_dur_loss

In [16]:
def generate_cols_dict(history):
    """Rename the entries of a keras history dictionary to the desired column names.

    `history` is the plain dictionary (history.history, where history is the
    return value of model.fit). The result maps each desired column name to the
    corresponding value taken from `history`; a missing key raises KeyError."""
    column_sources = (
        ('maestro_loss', 'loss'),
        ('f1_score', 'f1_score_mod'),
        ('precision', 'precision_mod'),
        ('recall', 'recall_mod'),
        ('dur_error', 'dur_error'),
        ('dur_loss', 'maestro_dur_loss'),
        ('val_maestro_loss', 'val_loss'),
        ('val_f1_score', 'val_f1_score_mod'),
        ('val_precision', 'val_precision_mod'),
        ('val_recall', 'val_recall_mod'),
        ('val_dur_error', 'val_dur_error'),
        ('val_dur_loss', 'val_maestro_dur_loss'),
    )
    return {column: history[source] for column, source in column_sources}

## Training

In [None]:
# Settings of this run, named once so that the loss and its companion metric, and
# the checkpoint and the CSV log, cannot silently drift apart.
HARSHNESS = 0.1
RUN_NAME = 'best_maestro_model_2_1_512_pt4'

model = lstm(n_lstm_layers = 2, n_dense_layers = 1, n_lstm_nodes = 512, dropout_rate = 0.4)
opt = RMSprop()
# lstm() already compiled the model with plain binary cross-entropy; compile again
# with the maestro loss and the custom metrics.
model.compile(loss = maestro_loss_wr(HARSHNESS), optimizer = opt,
              metrics = [f1_score_mod, recall_mod, precision_mod, dur_error,
                         maestro_dur_loss_wr(HARSHNESS)])
# Keep only the weights of the epoch with the lowest validation loss.
mc = ModelCheckpoint('models/' + RUN_NAME + '.h5', monitor = 'val_loss', mode = 'min',
                     save_best_only = True, verbose = 1)
history = model.fit(X_train, y_train, batch_size = 512, epochs = 100,
                    validation_data = (X_val, y_val), verbose = 2,
                    callbacks = [mc, TerminateOnNaN()])

# A NaN during training can leave some columns (the 'val_*' ones) shorter than the
# others. pd.DataFrame requires value lengths to be equal, so pad every short
# column with NaN up to the longest one, however many entries are missing.
n_epochs = max((len(value) for value in history.history.values()), default = 0)
for key, value in history.history.items():
    if len(value) < n_epochs:
        value.extend([np.nan] * (n_epochs - len(value)))

df = pd.DataFrame(generate_cols_dict(history.history))
df.index.name = 'Epochs'
df.to_csv('model_data/' + RUN_NAME + '.csv')

Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.17842, saving model to models/best_maestro_model_2_1_512_pt4.h5
50/50 - 316s - loss: 0.2826 - f1_score_mod: 0.0105 - recall_mod: 0.0211 - precision_mod: 0.0767 - dur_error: 1.0439 - maestro_dur_loss: 0.1044 - val_loss: 0.1784 - val_f1_score_mod: 0.0000e+00 - val_recall_mod: 0.0000e+00 - val_precision_mod: 0.0000e+00 - val_dur_error: 0.5199 - val_maestro_dur_loss: 0.0520
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.17842
50/50 - 330s - loss: 0.2047 - f1_score_mod: 2.6882e-05 - recall_mod: 1.3468e-05 - precision_mod: 0.0067 - dur_error: 0.7001 - maestro_dur_loss: 0.0700 - val_loss: 0.1869 - val_f1_score_mod: 0.0000e+00 - val_recall_mod: 0.0000e+00 - val_precision_mod: 0.0000e+00 - val_dur_error: 0.6552 - val_maestro_dur_loss: 0.0655
Epoch 3/100

Epoch 00003: val_loss improved from 0.17842 to 0.17361, saving model to models/best_maestro_model_2_1_512_pt4.h5
50/50 - 318s - loss: 0.1916 - f1_score_mod: 0.0017 - recall_mo

Epoch 22/100

Epoch 00022: val_loss did not improve from 0.12565
50/50 - 350s - loss: 0.1454 - f1_score_mod: 0.1732 - recall_mod: 0.0997 - precision_mod: 0.6720 - dur_error: 0.4394 - maestro_dur_loss: 0.0439 - val_loss: 0.1269 - val_f1_score_mod: 0.1830 - val_recall_mod: 0.1057 - val_precision_mod: 0.6950 - val_dur_error: 0.3084 - val_maestro_dur_loss: 0.0308
Epoch 23/100

Epoch 00023: val_loss did not improve from 0.12565
50/50 - 361s - loss: 0.1450 - f1_score_mod: 0.1827 - recall_mod: 0.1058 - precision_mod: 0.6834 - dur_error: 0.4421 - maestro_dur_loss: 0.0442 - val_loss: 0.1267 - val_f1_score_mod: 0.1782 - val_recall_mod: 0.1018 - val_precision_mod: 0.7334 - val_dur_error: 0.3151 - val_maestro_dur_loss: 0.0315
Epoch 24/100

Epoch 00024: val_loss improved from 0.12565 to 0.12464, saving model to models/best_maestro_model_2_1_512_pt4.h5
50/50 - 345s - loss: 0.1437 - f1_score_mod: 0.1868 - recall_mod: 0.1086 - precision_mod: 0.6789 - dur_error: 0.4357 - maestro_dur_loss: 0.0436 - val_