## Scaling Up to handle a dataset of size 1 GB

In [47]:
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sys import getsizeof

from keras.models import Sequential, load_model
from keras.layers import LSTM, Dropout, Dense, Activation
from keras.callbacks import ModelCheckpoint, TerminateOnNaN
from keras.optimizers import RMSprop

from decimal import Decimal

import keras.backend as K
import tensorflow as tf

### Read-in transposed_chopin_sequences

In [6]:
transposed_chopin_sequences = np.load('../train_and_val/transposed_chopin_sequences.npy', allow_pickle = True)

### Generate model inputs

In [12]:
def sequences_to_inputs(sequences, window_size = 16):
    """Apply a window function of size window_size across the dataset to create X. The next 
    vector in the sequence is appended to y for each window. Return X, y."""
    
    X = []
    y = []
    
    for i in range(len(sequences)):
        if (len(sequences[i]) < window_size + 1):
            print("Skipping index ", i, " because the song is too short. Try a shorter window_size to include it.")
            continue
        for j in range(len(sequences[i]) - window_size):
            X.append(sequences[i][j:j + window_size])
            y.append(sequences[i][j + window_size])

    return np.array(X), np.array(y)

First, we will generate them exactly as has been done in ./data_read_and_process.ipynb to verify that the checked statistics remain the same. This is meant to verify that transposed_chopin_sequences is unchanged by saving and loading. 

In [13]:
X, y = sequences_to_inputs(transposed_chopin_sequences)
X, y = shuffle(X, y, random_state = 42)

In [14]:
maximum_duration = max(X[:, :, -1].max(), y[:, -1].max())
print('Maximum Duration = {}'.format(maximum_duration))
# Keep variable so one can multiply the durations by this after music generation
# to convert back into seconds
X[:, :, -1] /= maximum_duration
y[:, -1] /= maximum_duration

Maximum Duration = 6.545454545454545


In [15]:
y.shape

(36153, 89)

In [16]:
X.shape

(36153, 16, 89)

In [19]:
print('Dataset size is {} MB'.format(getsizeof(X) / 1000000))

Dataset size is 411.855104 MB


In [22]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [23]:
print('Maximum Scaled Duration for X_train: {}'.format(X_train[:, :, -1].max()))
print('Maximum Scaled Duration for X_val: {}'.format(X_val[:, :, -1].max()))
print('Maximum Scaled Duration for y_train: {}'.format(y_train[:, -1].max()))
print('Maximum Scaled Duration for y_val: {}'.format(y_val[:, -1].max()))

Maximum Scaled Duration for X_train: 0.860215053763441
Maximum Scaled Duration for X_val: 0.5516975308641976
Maximum Scaled Duration for y_train: 1.0
Maximum Scaled Duration for y_val: 0.9166666666666667


In [24]:
print('Train-Validation Ratio of the Mean of the Scaled Duration: ', y_train[:, -1].mean() / y_val[:, -1].mean())
print('Train-Validation Ratio of the Stdv of the Scaled Duration: ', y_train[:, -1].std() / y_val[:, -1].std())

Train-Validation Ratio of the Mean of the Scaled Duration:  0.9898257021922111
Train-Validation Ratio of the Stdv of the Scaled Duration:  0.9209054376139967


Indeed, they are the same. Now, we want to scale to 1 GB.

In [37]:
X, y = sequences_to_inputs(transposed_chopin_sequences, window_size = 42)
X, y = shuffle(X, y, random_state = 42)

Skipping index  24  because the song is too short. Try a shorter window_size to include it.


In [38]:
maximum_duration = max(X[:, :, -1].max(), y[:, -1].max())
print('Maximum Duration = {}'.format(maximum_duration))
# Keep variable so one can multiply the durations by this after music generation
# to convert back into seconds
X[:, :, -1] /= maximum_duration
y[:, -1] /= maximum_duration

Maximum Duration = 6.545454545454545


In [42]:
y.shape

(34260, 89)

In [43]:
X.shape

(34260, 42, 89)

In [44]:
print('Dataset size is {} MB'.format(getsizeof(X) / 1000000))

Dataset size is 1024.511168 MB


## Build and Train a Model

The below functions are the same ones used in ./model_training.ipynb which are necessary to train and save a model (along with performance data). The only difference is in the base filename in the function 'train_lstm_model' is changed from 'best_maestro_model' to 'best_maestro_model_scaled'.

In [45]:
def lstm(n_lstm_layers = 2, n_dense_layers = 1, n_lstm_nodes = 512, dropout_rate = 0.4):
    """Generate a keras Sequential model of the form as described in Figure 16 of
    https://www.tandfonline.com/doi/full/10.1080/25765299.2019.1649972"""
    model = Sequential()
    model.add(LSTM(n_lstm_nodes, return_sequences = True, input_shape = (16, 89,)))
    model.add(Dropout(dropout_rate))
    for i in range(1, n_lstm_layers - 1):
        model.add(LSTM(n_lstm_nodes, return_sequences = True))
        model.add(Dropout(dropout_rate))
    model.add(LSTM(n_lstm_nodes))
    model.add(Dropout(dropout_rate))
    model.add(Dense(n_lstm_nodes // 2))
    model.add(Activation('relu'))
    model.add(Dropout(dropout_rate))
    for i in range(n_dense_layers - 1):
        model.add(Dense(n_lstm_nodes // 2))
        model.add(Dropout(0.6))
    model.add(Dense(89))
    model.add(Activation('sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'RMSProp')
    return model

In [46]:
def maestro_loss_wr(harshness): 
    """A loss function which, in addition to penalizing for misclassification on the 
    first n_keys_piano elements, includes a term proportional to the relative
    error in the prediction of the last element (which repesents the duration). 
    The proportionality constant is the 'harshness' of the maestro in regards to
    timing."""
    def maestro_loss(ytrue, ypred):
        # Standard binary cross-entropy
        bce_loss = - K.mean(ytrue[:, :-1] * K.log(ypred[:, :-1]) + (1 - ytrue[:, :-1]) * \
                     K.log(1 - ypred[:, :-1]))

        # Duration error term
        dur_loss = 2 * harshness * K.mean(K.abs((ytrue[:, -1] - ypred[:, -1]) / \
                                      (ytrue[:, -1] + ypred[:, -1] + K.epsilon())))
        
        if (dur_loss > bce_loss):   # Often times, ytrue[:, -1] elements will be zero
            return bce_loss * 2     # This may spike dur_loss. To control, I limit it
                                    # so that it never exceeds the bce_loss.
        return bce_loss + dur_loss
    
    return maestro_loss

def precision_mod(ytrue, ypred):
    """Just a modified precision excluding the last element (which is not a classification)"""

    true_positives = K.sum(K.round(ytrue[:, :-1] * ypred[:, :-1]))
    pred_positives = K.sum(K.round(ypred[:, :-1]))
    return true_positives / (pred_positives + K.epsilon())

def recall_mod(ytrue, ypred):
    """Just a modified recall excluding the last element (which is not a classification)"""

    true_positives = K.sum(K.round(ytrue[:, :-1] * ypred[:, :-1]))
    poss_positives = K.sum(ytrue[:, :-1])
    return true_positives / (poss_positives + K.epsilon())

def f1_score_mod(ytrue, ypred):
    """Just a modified f1_score excluding the last element (which is not a classification)"""

    precision = precision_mod(ytrue, ypred)
    recall = recall_mod(ytrue, ypred)   
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

def dur_error(ytrue, ypred):
    """A new metric that only gives information on the error in duration predictions"""
    
    return 2 * K.mean(K.abs((ytrue[:, -1] - ypred[:, -1]) / (ytrue[:, -1] + ypred[:, -1] + \
                                                         K.epsilon())))

def maestro_dur_loss_wr(harshness):
    """The second term of the maestro loss, based purely on error in duration predictions.
    To be used as a metric in order to decompose the loss components during analysis"""
    def maestro_dur_loss(ytrue, ypred):

        return 2 * harshness * K.mean(K.abs((ytrue[:, -1] - ypred[:, -1]) / \
                                      (ytrue[:, -1] + ypred[:, -1] + K.epsilon())))
    return maestro_dur_loss

In [48]:
def generate_cols_dict(history):
    """return a mapping of desired column names to the corresponding columns in the
    history dictionary (previously history.history where history is the return value
    of model.train)"""
    return {'maestro_loss': history['loss'], 'f1_score': history['f1_score_mod'], \
 'precision': history['precision_mod'], 'recall': history['recall_mod'], \
 'dur_error': history['dur_error'], 'dur_loss': history['maestro_dur_loss'], \
 'val_maestro_loss': history['val_loss'], 'val_f1_score': history['val_f1_score_mod'], \
 'val_precision': history['val_precision_mod'], 'val_recall': history['val_recall_mod'], \
 'val_dur_error': history['val_dur_error'], 'val_dur_loss': history['val_maestro_dur_loss']}

In [49]:
def train_lstm_model(n_lstm_layers = 2, n_dense_layers = 1, n_lstm_nodes = 512, dropout_rate = 0.4, \
                     batch_size = 512, harshness = 0.05, lr = None, clipnorm = None, clipvalue = None, \
                     epochs = 150):
    """Train a model using the passed parameters, the data, and using the RMSprop optimizer. Write the
    best model as a .h5 and a .csv containing columns for the training and validation custom loss and
    metrics. Returns nothing."""
    model = lstm(n_lstm_layers = n_lstm_layers, n_dense_layers = n_dense_layers, \
                 n_lstm_nodes = n_lstm_nodes, dropout_rate = dropout_rate)

    if (lr or clipnorm or clipvalue):
        if (lr):          # It's required that the first argument to RMSprop is not None
            opt = RMSprop(lr = lr, clipnorm = clipnorm, clipvalue = clipvalue)
        elif (clipnorm):
            opt = RMSprop(clipnorm = clipnorm, clipvalue = clipvalue)
        else: # clipvalue
            opt = RMSprop(clipvalue = clipvalue)
    else:
        opt = RMSprop()   # TypeError when all are None, so do this instead
        
    model.compile(loss = maestro_loss_wr(harshness), optimizer = opt, metrics = [f1_score_mod, recall_mod, \
                                                precision_mod, dur_error, maestro_dur_loss_wr(harshness)])
    
    filename = 'best_maestro_model_scaled_{0}_{1}_{2}_{3}'.format(n_lstm_layers, n_dense_layers, n_lstm_nodes, \
                                                          str(dropout_rate).replace('.', 'pt'))
    if (lr):
        filename += '_lr_{}'.format('%.0e' % Decimal(lr))
    if (clipnorm):
        filename += '_cn_{}'.format(str(clipnorm).replace('.', 'pt'))     
    if (clipvalue):
        filename += '_cv_{}'.format(str(clipvalue).replace('.', 'pt'))
                                   
    mc = ModelCheckpoint('../models/' + filename + '.h5', monitor = 'val_loss', mode = 'min', \
                                                         save_best_only = True, verbose = 1)
                                   
    history = model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, \
                    validation_data = (X_val, y_val), verbose = 2, callbacks = [mc, TerminateOnNaN()])
    
    # In most preliminary tests model training has failed at some point when the loss becomes NaN during
    # validation
    if (len(history.history['val_loss']) < len(history.history['loss'])):  # a NaN during training
        for key, value in history.history.items():
            if (key[:3] == 'val'):          # pd.DataFrame requires value lengths to be equal
                value.append(np.nan)
                
    df = pd.DataFrame(generate_cols_dict(history.history))
    df.index.name = 'Epochs'
    df.to_csv('../model_data/' + filename + '.csv')

Let's train a model on this scaled version of the dataset using the same hyperparameters that gave us our best model in ./model_training.ipynb.

In [50]:
train_lstm_model(lr = 0.0005, clipvalue = 0.2)

Epoch 1/150

Epoch 00001: val_loss improved from inf to 0.14477, saving model to ../models/best_maestro_model_scaled_2_1_512_0pt4_lr_5e-04_cv_0pt2.h5
50/50 - 150s - loss: 0.2414 - f1_score_mod: 0.0272 - recall_mod: 0.0392 - precision_mod: 0.0857 - dur_error: 1.0064 - maestro_dur_loss: 0.0503 - val_loss: 0.1448 - val_f1_score_mod: 0.0000e+00 - val_recall_mod: 0.0000e+00 - val_precision_mod: 0.0000e+00 - val_dur_error: 0.4532 - val_maestro_dur_loss: 0.0227
Epoch 2/150

Epoch 00002: val_loss improved from 0.14477 to 0.13937, saving model to ../models/best_maestro_model_scaled_2_1_512_0pt4_lr_5e-04_cv_0pt2.h5
50/50 - 165s - loss: 0.1663 - f1_score_mod: 0.0070 - recall_mod: 0.0036 - precision_mod: 0.2045 - dur_error: 0.7137 - maestro_dur_loss: 0.0357 - val_loss: 0.1394 - val_f1_score_mod: 0.0000e+00 - val_recall_mod: 0.0000e+00 - val_precision_mod: 0.0000e+00 - val_dur_error: 0.4507 - val_maestro_dur_loss: 0.0225
Epoch 3/150

Epoch 00003: val_loss did not improve from 0.13937
50/50 - 140s -

Epoch 21/150

Epoch 00021: val_loss improved from 0.10861 to 0.10718, saving model to ../models/best_maestro_model_scaled_2_1_512_0pt4_lr_5e-04_cv_0pt2.h5
50/50 - 160s - loss: 0.1169 - f1_score_mod: 0.2199 - recall_mod: 0.1310 - precision_mod: 0.7028 - dur_error: 0.4329 - maestro_dur_loss: 0.0216 - val_loss: 0.1072 - val_f1_score_mod: 0.2444 - val_recall_mod: 0.1459 - val_precision_mod: 0.7559 - val_dur_error: 0.3046 - val_maestro_dur_loss: 0.0152
Epoch 22/150

Epoch 00022: val_loss did not improve from 0.10718
50/50 - 157s - loss: 0.1160 - f1_score_mod: 0.2316 - recall_mod: 0.1388 - precision_mod: 0.7085 - dur_error: 0.4288 - maestro_dur_loss: 0.0214 - val_loss: 0.1073 - val_f1_score_mod: 0.2527 - val_recall_mod: 0.1517 - val_precision_mod: 0.7608 - val_dur_error: 0.3091 - val_maestro_dur_loss: 0.0155
Epoch 23/150

Epoch 00023: val_loss did not improve from 0.10718
50/50 - 158s - loss: 0.1154 - f1_score_mod: 0.2370 - recall_mod: 0.1426 - precision_mod: 0.7095 - dur_error: 0.4273 - mae

Epoch 42/150

Epoch 00042: val_loss improved from 0.10027 to 0.09955, saving model to ../models/best_maestro_model_scaled_2_1_512_0pt4_lr_5e-04_cv_0pt2.h5
50/50 - 134s - loss: 0.1047 - f1_score_mod: 0.3306 - recall_mod: 0.2136 - precision_mod: 0.7356 - dur_error: 0.3868 - maestro_dur_loss: 0.0193 - val_loss: 0.0996 - val_f1_score_mod: 0.3446 - val_recall_mod: 0.2235 - val_precision_mod: 0.7540 - val_dur_error: 0.2837 - val_maestro_dur_loss: 0.0142
Epoch 43/150

Epoch 00043: val_loss did not improve from 0.09955
50/50 - 143s - loss: 0.1042 - f1_score_mod: 0.3311 - recall_mod: 0.2139 - precision_mod: 0.7375 - dur_error: 0.3854 - maestro_dur_loss: 0.0193 - val_loss: 0.1005 - val_f1_score_mod: 0.3497 - val_recall_mod: 0.2291 - val_precision_mod: 0.7407 - val_dur_error: 0.3074 - val_maestro_dur_loss: 0.0154
Epoch 44/150

Epoch 00044: val_loss did not improve from 0.09955
50/50 - 138s - loss: 0.1037 - f1_score_mod: 0.3353 - recall_mod: 0.2174 - precision_mod: 0.7371 - dur_error: 0.3826 - mae

Epoch 63/150

Epoch 00063: val_loss did not improve from 0.09458
50/50 - 248s - loss: 0.0938 - f1_score_mod: 0.4152 - recall_mod: 0.2879 - precision_mod: 0.7471 - dur_error: 0.3576 - maestro_dur_loss: 0.0179 - val_loss: 0.0956 - val_f1_score_mod: 0.4171 - val_recall_mod: 0.2935 - val_precision_mod: 0.7217 - val_dur_error: 0.2868 - val_maestro_dur_loss: 0.0143
Epoch 64/150

Epoch 00064: val_loss did not improve from 0.09458
50/50 - 248s - loss: 0.0934 - f1_score_mod: 0.4193 - recall_mod: 0.2915 - precision_mod: 0.7485 - dur_error: 0.3591 - maestro_dur_loss: 0.0180 - val_loss: 0.0996 - val_f1_score_mod: 0.4179 - val_recall_mod: 0.2967 - val_precision_mod: 0.7077 - val_dur_error: 0.3675 - val_maestro_dur_loss: 0.0184
Epoch 65/150

Epoch 00065: val_loss improved from 0.09458 to 0.09402, saving model to ../models/best_maestro_model_scaled_2_1_512_0pt4_lr_5e-04_cv_0pt2.h5
50/50 - 250s - loss: 0.0928 - f1_score_mod: 0.4242 - recall_mod: 0.2973 - precision_mod: 0.7428 - dur_error: 0.3557 - mae