In [1]:
# --- Data locations ----------------------------------------------------------
# The local paths are active; the Google Drive paths are the Colab equivalents.
# midi_dir =  '/content/gdrive/My Drive/Colab/midi_songs'
# out_dir = '/content/gdrive/My Drive/Colab/gan_new_output_1'
midi_dir = './midi_songs'       # directory searched for *.mid training files
out_dir = './gan_new_output_2'  # receives the CSV log, weights and generated MIDI

# --- Model shape -------------------------------------------------------------
max_length = 100  # time steps (notes) per training / generated sequence
latent_dim = 32   # noise features per time step fed to the generator
lstm_dim = 512    # units in the generator and discriminator LSTM layers

# --- Training schedule -------------------------------------------------------
batch_size = 128  # real samples (and generated samples) per training step
steps = 15001     # loop runs steps 0..15000, so step 15000 is still checkpointed

In [2]:
import os
import glob
from music21 import converter, instrument, note, chord, stream

def parse_midi_files(dir):
    """Extract note and chord tokens from every ``*.mid`` file in a directory.

    Each file is parsed with music21. Single notes become their pitch string
    (``str(element.pitch)``); chords become the dot-joined integers of their
    normal order (e.g. ``'0.4.7'``). Any other element is ignored.

    Args:
        dir: Directory to search (non-recursively) for ``*.mid`` files.

    Returns:
        A ``(notes, songs, file_list)`` tuple where
        ``notes`` is one flat list of tokens over all files,
        ``songs`` is a list with one token list per file, and
        ``file_list`` holds the base name of each file, in the same order
        as ``songs``.
    """
    notes = []
    songs = []
    file_list = []

    # Sort so the corpus order does not depend on the order in which the
    # filesystem happens to list the directory.
    files = sorted(glob.glob(os.path.join(dir, '*.mid')))

    for file in files:
        # Announce the file before parsing so a file that fails to parse
        # can be identified from the output.
        print("Parsing %s" % file)

        file_list.append(os.path.basename(file))
        midi = converter.parse(file)

        try:  # file has instrument parts
            partitioned = instrument.partitionByInstrument(midi)
            notes_to_parse = partitioned.parts[0].recurse()
        except Exception:  # file has notes in a flat structure
            # `Exception` rather than a bare except: Ctrl-C / SystemExit must
            # still be able to stop a long parse.
            notes_to_parse = midi.flat.notes

        song = []
        for element in notes_to_parse:
            if isinstance(element, note.Note):
                song.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                song.append('.'.join(str(n) for n in element.normalOrder))

        songs.append(song)
        notes.extend(song)

    return notes, songs, file_list

In [3]:
# Parse every MIDI file in midi_dir: `notes` is the flat token list over all
# files, `songs` the per-file token lists, `file_list` the file base names.
notes, songs, file_list = parse_midi_files(midi_dir)

Parsing ./midi_songs/bwv782.mid
Parsing ./midi_songs/bwv783.mid
Parsing ./midi_songs/bwv781.mid
Parsing ./midi_songs/bwv780.mid
Parsing ./midi_songs/bwv784.mid
Parsing ./midi_songs/bwv785.mid
Parsing ./midi_songs/bwv778.mid
Parsing ./midi_songs/bwv786.mid
Parsing ./midi_songs/bwv779.mid
Parsing ./midi_songs/bwv774.mid
Parsing ./midi_songs/bwv775.mid
Parsing ./midi_songs/bwv777.mid
Parsing ./midi_songs/bwv776.mid
Parsing ./midi_songs/bwv772.mid
Parsing ./midi_songs/bwv773.mid


In [4]:
# Vocabulary: every distinct token seen in the corpus, in sorted order.
pitchnames = sorted(set(notes))
n_vocab = len(pitchnames)

# Lookup tables between a token and its index in the sorted vocabulary.
note_to_int = {pitch: index for index, pitch in enumerate(pitchnames)}
int_to_note = {index: pitch for pitch, index in note_to_int.items()}

In [5]:
import numpy as np

def prepare_sequences(notes, sequence_length=100):
    """Turn a token list into one-hot sliding windows and their next tokens.

    The vocabulary is the sorted set of distinct tokens in ``notes``; a token's
    one-hot position is its index in that sorted vocabulary. The mapping is
    built here from ``notes`` itself, so the function does not depend on any
    notebook-level lookup table.

    Args:
        notes: Sequence of hashable, mutually sortable tokens (e.g. strings).
        sequence_length: Number of consecutive tokens per input window.

    Returns:
        ``(network_input, network_output)`` as float numpy arrays, where
        ``network_input`` has shape ``(n_patterns, sequence_length, n_vocab)``
        and ``network_output`` has shape ``(n_patterns, n_vocab)``, with
        ``n_patterns = max(len(notes) - sequence_length, 0)``.
        ``network_output[i]`` is the one-hot token that follows window ``i``.
    """
    # get all pitch names
    pitchnames = sorted(set(notes))
    n_vocab = len(pitchnames)
    pitch_to_index = {pitch: index for index, pitch in enumerate(pitchnames)}

    # convert notes to one-hot encoded rows
    indices = np.array([pitch_to_index[pitch] for pitch in notes], dtype=np.intp)
    n_notes = len(indices)
    one_hot_notes = np.zeros((n_notes, n_vocab))
    one_hot_notes[np.arange(n_notes), indices] = 1

    # A window needs `sequence_length` tokens plus one following token.
    n_patterns = max(n_notes - sequence_length, 0)

    # Row i of `window_rows` lists the note positions i .. i+sequence_length-1;
    # indexing with it yields the (n_patterns, sequence_length, n_vocab) input.
    window_rows = np.arange(n_patterns)[:, None] + np.arange(sequence_length)[None, :]
    network_input = one_hot_notes[window_rows]

    # The target of window i is the token right after it. Slicing keeps the
    # (0, n_vocab) shape even when there are no patterns at all.
    network_output = one_hot_notes[sequence_length:sequence_length + n_patterns].copy()

    return (network_input, network_output)

In [6]:
# One-hot windows of max_length tokens over the whole corpus; only
# network_input is used by the GAN training loop further down.
network_input, network_output = prepare_sequences(notes, sequence_length=max_length)

In [7]:
from keras.layers import Input
from keras.layers import RepeatVector, Dense, TimeDistributed
from keras.layers import LSTM, CuDNNLSTM 
from keras.optimizers import Adam
from keras.models import Model
from tqdm import tqdm

Using TensorFlow backend.
  return f(*args, **kwds)


In [8]:
def Generator(latent_dim=32, max_length=100, lstm_dim=512, n_vocab=None):
    """Build the (uncompiled) generator network.

    The model maps a noise tensor of shape ``(max_length, latent_dim)`` to a
    ``(max_length, n_vocab)`` tensor holding one softmax distribution over the
    vocabulary per time step.

    Args:
        latent_dim: Number of noise features per time step.
        max_length: Number of time steps in the input and output sequence.
        lstm_dim: Number of units in the LSTM layer.
        n_vocab: Size of the softmax output at each time step.

    Returns:
        A Keras ``Model``; it is not compiled here.
    """
    noise = Input(shape=(max_length, latent_dim,))
    # return_sequences=True keeps one hidden state per time step.
    hidden_states = LSTM(lstm_dim, return_sequences=True)(noise)
    # The same softmax Dense layer is applied to every time step.
    per_step_softmax = TimeDistributed(Dense(n_vocab, activation='softmax'))
    return Model(noise, per_step_softmax(hidden_states))

In [9]:
def Discriminator(max_length=100, n_vocab=None, lstm_dim=512, opt=None):
    """Build and compile the discriminator network.

    The model reads a ``(max_length, n_vocab)`` sequence, summarises it with
    an LSTM (final state only) and emits a 2-way softmax. It is compiled with
    ``binary_crossentropy`` against one-hot 2-column targets.

    Args:
        max_length: Number of time steps in the input sequence.
        n_vocab: Number of features (vocabulary size) per time step.
        lstm_dim: Number of units in the LSTM layer.
        opt: Keras optimizer to compile with. When ``None`` (the default) a
            fresh ``Adam(lr=1e-4)`` is created for this model.

    Returns:
        The compiled Keras ``Model``.
    """
    # Create the default optimizer per call: a default-argument instance would
    # be built once at definition time and shared by every model built with it.
    if opt is None:
        opt = Adam(lr=1e-4)

    model_input = Input(shape=(max_length, n_vocab))
    x = LSTM(lstm_dim)(model_input)
    model_output = Dense(2, activation='softmax')(x)
    model = Model(model_input, model_output)
    model.compile(loss='binary_crossentropy', optimizer=opt)

    return model

In [10]:
def combined_network(generator, discriminator, max_length=100, latent_dim=32, opt=None):
    """Stack generator and discriminator into one compiled model.

    The resulting model maps noise of shape ``(max_length, latent_dim)``
    through ``generator`` and then ``discriminator``, and is compiled with
    ``binary_crossentropy``.

    This function does not change any ``trainable`` flags. The notebook
    freezes the discriminator with ``make_trainable`` before calling it,
    presumably so that training this model updates only the generator.

    Args:
        generator: Model mapping noise to a generated sequence.
        discriminator: Model scoring a sequence.
        max_length: Number of time steps in the noise input.
        latent_dim: Number of noise features per time step.
        opt: Keras optimizer to compile with. When ``None`` (the default) a
            fresh ``Adam(lr=1e-3)`` is created for this model.

    Returns:
        The compiled Keras ``Model``.
    """
    # Create the default optimizer per call: a default-argument instance would
    # be built once at definition time and shared by every model built with it.
    if opt is None:
        opt = Adam(lr=1e-3)

    gan_input = Input(shape=(max_length, latent_dim))
    x = generator(gan_input)
    gan_output = discriminator(x)
    model = Model(gan_input, gan_output)
    model.compile(loss='binary_crossentropy', optimizer=opt)

    return model

In [11]:
def make_trainable(net, val):
    """Set the ``trainable`` flag on a network and on each of its layers.

    Args:
        net: Object with a ``trainable`` attribute and a ``layers`` iterable.
        val: Value assigned to every ``trainable`` attribute.
    """
    # The network itself comes first, then its layers in their listed order.
    targets = [net] + list(net.layers)
    for target in targets:
        target.trainable = val

In [12]:
import os
from music21 import instrument, note, stream, chord

def create_midi(prediction_output, file_path):
    """Write a sequence of note/chord tokens to a MIDI file.

    A token that contains a ``'.'`` or consists only of digits is treated as a
    chord: it is split on ``'.'`` and each piece is turned into a note via
    ``note.Note(int(piece))``. Any other token is passed to ``note.Note`` as
    is. Elements are placed 0.5 offset units apart, in input order.

    Args:
        prediction_output: Iterable of string tokens.
        file_path: Destination path handed to the stream's ``write('midi', ...)``.
    """
    output_notes = []
    offset = 0

    for pattern in prediction_output:
        is_chord = ('.' in pattern) or pattern.isdigit()

        if is_chord:
            # Build one Note per chord member, then wrap them in a Chord.
            members = []
            for pitch_number in pattern.split('.'):
                member = note.Note(int(pitch_number))
                member.storedInstrument = instrument.Piano()
                members.append(member)
            element = chord.Chord(members)
        else:
            element = note.Note(pattern)
            element.storedInstrument = instrument.Piano()

        element.offset = offset
        output_notes.append(element)

        # advance so that consecutive elements do not stack on one another
        offset += 0.5

    stream.Stream(output_notes).write('midi', fp=file_path)

In [13]:
import csv
from keras.optimizers import RMSprop

# Build all three models from the same config values so their input/output
# shapes stay in agreement when the configuration cell is edited.
generator = Generator(latent_dim=latent_dim, max_length=max_length,
                      lstm_dim=lstm_dim, n_vocab=n_vocab)
discriminator = Discriminator(max_length=max_length, n_vocab=n_vocab,
                              lstm_dim=lstm_dim, opt=RMSprop(lr=8e-4, clipvalue=1.0))

# Freeze the discriminator before it is wired into the combined model.
make_trainable(discriminator, False)
GAN = combined_network(generator, discriminator, max_length=max_length,
                       latent_dim=latent_dim, opt=RMSprop(lr=4e-4, clipvalue=1.0))
GAN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100, 32)           0         
_________________________________________________________________
model_1 (Model)              (None, 100, 124)          1179772   
_________________________________________________________________
model_2 (Model)              (None, 2)                 1305602   
Total params: 2,485,374
Trainable params: 1,179,772
Non-trainable params: 1,305,602
_________________________________________________________________


In [15]:
# The output directory is not created anywhere else; make sure it exists
# before the log, the weights and the MIDI samples are written into it.
os.makedirs(out_dir, exist_ok=True)

# `with` closes (and flushes) the log even when training is interrupted;
# newline='' is the file mode the csv module expects for writers.
with open(os.path.join(out_dir, 'gan_log.csv'), 'a', newline='') as f:
    writer = csv.writer(f)

    for step in range(steps):
        # Random batch of real sequences plus an equally sized generated batch.
        input_batch = network_input[np.random.randint(0, network_input.shape[0], size=batch_size),:,:]
        noise_gen = np.random.uniform(0,1,size=[batch_size, max_length, latent_dim])
        generated_melodies = generator.predict(noise_gen)

        make_trainable(discriminator,True)

        # Discriminator targets are one-hot over two columns:
        # column 1 marks the real half, column 0 the generated half.
        X = np.concatenate((input_batch, generated_melodies))
        y = np.zeros([2 * batch_size, 2])
        y[:batch_size,1] = 1
        y[batch_size:,0] = 1

        d_loss = discriminator.train_on_batch(X,y)

        make_trainable(discriminator,False)

        # Generator step: fresh noise, labelled as "real" (column 1) so the
        # combined model is pushed towards fooling the discriminator.
        noise_gen = np.random.uniform(0,1,size=[batch_size, max_length, latent_dim])
        y2 = np.zeros([batch_size, 2])
        y2[:,1] = 1

        a_loss = GAN.train_on_batch(noise_gen, y2 )

        writer.writerow([step, d_loss, a_loss])

        if step % 100 == 0:
            # Save model weights
            GAN.save_weights(os.path.join(out_dir, 'gan_{}.h5'.format(step)))

            # Print metrics
            print('discriminator loss at step %s: %s' % (step, d_loss))
            print('adversarial loss at step %s: %s' % (step, a_loss))

            # Most likely token per time step; only the first melody of the
            # batch generated at the top of this step is exported.
            generated_indices = np.argmax(generated_melodies, axis=2)

            generated_song = [int_to_note[index] for index in generated_indices[0]]

            # Save Generated Song Midi
            create_midi(generated_song, os.path.join(out_dir, 'generated_song_' + str(step) + '.mid'))

discriminator loss at step 0: 0.6417545
adversarial loss at step 0: 5.724183


KeyboardInterrupt: 