In [1]:
import glob
import pickle
import numpy
from music21 import converter, instrument, note, chord

In [2]:
def get_notes():
    """ Get all the notes and chords from the midi files in the ./midi_songs directory """
    notes = []

    for file in glob.glob("midi_songs/*.mid"):
        midi = converter.parse(file)

        print("Parsing %s" % file)

        notes_to_parse = None

        try: # file has instrument parts
            s2 = instrument.partitionByInstrument(midi)
            notes_to_parse = s2.parts[0].recurse() 
        except: # file has notes in a flat structure
            notes_to_parse = midi.flat.notes

        for element in notes_to_parse:
            if isinstance(element, note.Note):
                notes.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                notes.append('.'.join(str(n) for n in element.normalOrder))

    with open('data/notes', 'wb') as filepath:
        pickle.dump(notes, filepath)

    return notes


In [94]:
def prepare_sequences(notes, n_vocab):
    """ Prepare the sequences used by the Neural Network """
    sequence_length = 100

    # get all pitch names
    pitchnames = sorted(set(item for item in notes))

     # create a dictionary to map pitches to integers
    note_to_int = dict((note, number) for number, note in enumerate(pitchnames))

    network_input = []
    network_output = []

    # create input sequences and the corresponding outputs
    for i in range(0, len(notes) - sequence_length, 1):
        sequence_in = notes[i:i + sequence_length]
        sequence_out = notes[i + sequence_length]
        network_input.append([note_to_int[char] for char in sequence_in])
        network_output.append(note_to_int[sequence_out])

    n_patterns = len(network_input)

    # reshape the input into a format compatible with LSTM layers
    network_input = numpy.reshape(network_input, (n_patterns, sequence_length, 1))
    # normalize input
    # network_input = network_input / float(n_vocab)

    # network_input = np_utils.to_categorical(network_input)
    network_output = np_utils.to_categorical(network_output)

    return (network_input, network_output)

In [95]:
import keras
import keras.backend as K
from keras.models import Model, Input
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.layers import Lambda, RepeatVector, TimeDistributed, Layer
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [96]:
latent_dim = 3
max_length = 100

notes = get_notes()

# get amount of pitch names
n_vocab = len(set(notes))

network_input, network_output = prepare_sequences(notes, n_vocab)

encoder_input = Input(shape=(network_input.shape[1], network_input.shape[2]))

x = LSTM(512)(encoder_input)

z_mean = Dense(latent_dim)(x)
z_log_var = Dense(latent_dim)(x)

Parsing midi_songs/bwv782.mid


Parsing midi_songs/bwv783.mid


Parsing midi_songs/bwv781.mid


Parsing midi_songs/bwv780.mid


Parsing midi_songs/bwv784.mid


Parsing midi_songs/bwv785.mid


Parsing midi_songs/bwv778.mid


Parsing midi_songs/bwv786.mid
Parsing midi_songs/bwv779.mid


Parsing midi_songs/bwv774.mid
Parsing midi_songs/bwv775.mid


Parsing midi_songs/bwv777.mid


Parsing midi_songs/bwv776.mid


Parsing midi_songs/bwv772.mid
Parsing midi_songs/bwv773.mid


In [97]:
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim),
                              mean=0., stddev=1.)
    return z_mean + K.exp(z_log_var) * epsilon

z = Lambda(sampling)([z_mean, z_log_var])

In [98]:
import numpy as np

decoder_input = Input(K.int_shape(z)[1:])

repeated_context = RepeatVector(max_length)(decoder_input)

h = LSTM(512, return_sequences=True)(repeated_context)

decoded = TimeDistributed(Dense(n_vocab, activation='softmax'), name='decoded_mean')(h)

decoder = Model(decoder_input, decoded)

z_decoded = decoder(z)

In [99]:
class CustomVariationalLayer(Layer):
    def vae_loss(self, x, z_decoded):
        x = K.flatten(x)
        z_decoded = K.argmax(z_decoded, axis=-1)
        z_decoded = K.flatten(z_decoded)
        xend_loss = keras.metrics.sparse_categorical_crossentropy(x, z_decoded)
        kl_loss = -5e-4 * K.mean(
            1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1
        )
        return K.mean(xend_loss + kl_loss)
    
    def call(self, inputs):
        x = inputs[0]
        z_decoded = inputs[1]
        loss = self.vae_loss(x, z_decoded)
        self.add_loss(loss, inputs=inputs)
        return x

y = CustomVariationalLayer()([encoder_input, z_decoded])

TypeError: Expected int64, got 1e-07 of type 'float' instead.

In [49]:
vae = Model(encoder_input, y)
vae.compile(optimizer='rmsprop', loss=None)
vae.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 100, 1)       0                                            
__________________________________________________________________________________________________
lstm_11 (LSTM)                  (None, 512)          1052672     input_13[0][0]                   
__________________________________________________________________________________________________
dense_14 (Dense)                (None, 3)            1539        lstm_11[0][0]                    
__________________________________________________________________________________________________
dense_15 (Dense)                (None, 3)            1539        lstm_11[0][0]                    
__________________________________________________________________________________________________
lambda_6 (

In [50]:
vae.fit(x=network_input, y=None,
        shuffle=False,
        epochs=10,
        batch_size=32,)

Epoch 1/10


InvalidArgumentError: Incompatible shapes: [396800] vs. [3200]
	 [[Node: training_1/RMSprop/gradients/custom_variational_layer_4/logistic_loss/mul_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@train...ad/Reshape"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](training_1/RMSprop/gradients/custom_variational_layer_4/logistic_loss/mul_grad/Shape, training_1/RMSprop/gradients/custom_variational_layer_4/logistic_loss/mul_grad/Shape_1)]]

In [45]:
network_input.shape

(8327, 100, 1)

In [46]:
z_decoded.shape

TensorShape([Dimension(None), Dimension(100), Dimension(124)])

In [47]:
32 * 100 * 124


396800