In [22]:
batch_size = 1
max_length = 300
lstm_dim = 512
latent_dim = 2

In [23]:
import os
import glob
from music21 import converter, instrument, note, chord, stream

def parse_midi_files(dir):
    notes = []
    songs = []

    for file in glob.glob(os.path.join(dir, '*.mid')):
        song = []
        midi = converter.parse(file)

        print("Parsing %s" % file)

        notes_to_parse = None

        try: # file has instrument parts
            s2 = instrument.partitionByInstrument(midi)
            notes_to_parse = s2.parts[0].recurse() 
        except: # file has notes in a flat structure
            notes_to_parse = midi.flat.notes

        for element in notes_to_parse:
            if isinstance(element, note.Note):
                song.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                song.append('.'.join(str(n) for n in element.normalOrder))
        songs.append(song)
        notes += song

    return notes, songs

In [24]:
from keras.preprocessing.sequence import pad_sequences

notes, songs = parse_midi_files('./midi_songs')

pitchnames = sorted(set(notes))
n_vocab = len(pitchnames)

note_to_int = dict((note, number) for number, note in enumerate(pitchnames))
int_to_note = dict([[number, note] for note, number in note_to_int.items()])

encoded_songs = [[note_to_int[note] for note in song] for song in songs]

padded_songs = pad_sequences(encoded_songs, maxlen=max_length)

Parsing ./midi_songs/bwv782.mid
Parsing ./midi_songs/bwv783.mid
Parsing ./midi_songs/bwv781.mid
Parsing ./midi_songs/bwv780.mid
Parsing ./midi_songs/bwv784.mid
Parsing ./midi_songs/bwv785.mid
Parsing ./midi_songs/bwv778.mid
Parsing ./midi_songs/bwv786.mid
Parsing ./midi_songs/bwv779.mid
Parsing ./midi_songs/bwv774.mid
Parsing ./midi_songs/bwv775.mid
Parsing ./midi_songs/bwv777.mid
Parsing ./midi_songs/bwv776.mid
Parsing ./midi_songs/bwv772.mid
Parsing ./midi_songs/bwv773.mid


In [25]:
import numpy as np

temp = np.zeros((padded_songs.shape[0], max_length, n_vocab))
temp[np.expand_dims(np.arange(padded_songs.shape[0]), axis=0).reshape(padded_songs.shape[0], 1), 
           np.repeat(np.array([np.arange(max_length)]), padded_songs.shape[0], axis=0), padded_songs] = 1

one_hot_encoded_songs = temp

In [26]:
from keras.layers import Input, LSTM, CuDNNLSTM
from keras.layers.core import Dense

x = Input(batch_shape=(batch_size, max_length, n_vocab))

h = LSTM(lstm_dim, return_sequences=False, name='lstm_1')(x)

z_mean = Dense(latent_dim)(h) # 潜在変数の平均 μ
z_log_var = Dense(latent_dim)(h) #潜在変数の分散 σのlog

In [27]:
from keras import backend as K
from keras.layers import Lambda

# 潜在変数のサンプリング
# 平均と分散を受けて、潜在変数をサンプリングします。
# z = μ + εΣ
# εは標準正規分布
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=1.0)
    return z_mean + K.exp(z_log_var) * epsilon

In [28]:
# Lambdaを使って式をwrap
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

In [29]:
from keras.layers import Dense, RepeatVector, TimeDistributed

repeated_context = RepeatVector(max_length)(z)
h_decoded = LSTM(lstm_dim, return_sequences=True, name='dec_lstm_1')(repeated_context)
x_decoded = TimeDistributed(Dense(n_vocab, activation='softmax'), name='decoded_mean')(h_decoded)

In [30]:
from keras.layers import Layer
from keras import metrics

class CustomVariationalLayer(Layer): # Layer classの継承
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)

    def vae_loss(self, x, x_decoded):
        x = K.flatten(x)
        x_decoded = K.flatten(x_decoded)
        xent_loss = max_length * metrics.binary_crossentropy(x, x_decoded) # 復元誤差: Reconstruction Error
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) # 正則化項: KL Divergence
        return K.mean(xent_loss + kl_loss)

    def call(self, inputs):
        x = inputs[0]
        x_decoded = inputs[1]
        loss = self.vae_loss(x, x_decoded)
        self.add_loss(loss, inputs=inputs) # Layer class のadd_lossを利用
        return x # 実質的には出力は利用しない

In [36]:
from keras.models import Model

y = CustomVariationalLayer()([x, x_decoded])
vae = Model(x, y) # xをinputにyを出力, 出力は実質関係ない
vae.compile(optimizer='rmsprop', loss=None, metrics=['acc']) # CustomVariationalLayerで追加したLossを利用するのでここでのlossはNoneとする

In [37]:
from keras.callbacks import ModelCheckpoint

def create_model_checkpoint(dir, model_name):
    filepath = dir + '/' + \
               model_name + "-{epoch:02d}-{acc:.2f}-{loss:.2f}.h5"
    directory = os.path.dirname(filepath)

    try:
        os.stat(directory)
    except:
        os.mkdir(directory)

    checkpointer = ModelCheckpoint(filepath=filepath,
                                                              monitor='loss',
                                                              verbose=1,
                                                              save_best_only=True)

    return checkpointer

In [38]:
from keras.callbacks import CSVLogger

checkpointer = create_model_checkpoint('./vae_new_output_1', 'music_vae')
csv_logger = CSVLogger(os.path.join('./vae_new_output_1', 'music_vae_log.csv'))

In [40]:
history = vae.fit(one_hot_encoded_songs, 
                            shuffle=True,
                            batch_size=batch_size, 
                            epochs=100, 
                            callbacks=[csv_logger], 
                            verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
 3/15 [=====>........................] - ETA: 20s - loss: 12.2522

KeyboardInterrupt: 