# AnimeLSTM
Basically, what I'm trying to do is train a NN to play anime piano music. All training data is in ./data, and below I encode each note of every song into an array (if it's a chord, I encode the chord as a string of its pitches separated by "."). Then I use an LSTM to predict which note or chord comes next, based on the preceding `sequence_length` notes (set to 300 below).

In [111]:
# imports
import glob
import os
from time import time

import numpy as np
from music21 import converter, instrument, note, chord, stream
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import BatchNormalization as BatchNorm
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tqdm.notebook import tqdm

In [123]:
# Flatten every MIDI file under data/ into one long list of string tokens:
# a single note becomes its pitch name, a chord becomes its normalOrder
# values joined by ".".
notes = []
allFiles = glob.glob("data/*.mid")
for midi_path in tqdm(allFiles):
    score = converter.parse(midi_path)
    by_instrument = instrument.partitionByInstrument(score)
    # With instrument parts, walk the first part; otherwise fall back to the
    # flat note list of the whole score.
    elements = by_instrument.parts[0].recurse() if by_instrument else score.flat.notes
    for element in elements:
        if isinstance(element, note.Note):
            notes.append(str(element.pitch))
        elif isinstance(element, chord.Chord):
            notes.append('.'.join(str(value) for value in element.normalOrder))

HBox(children=(IntProgress(value=0, max=112), HTML(value='')))




Convert the notes to a NumPy array and save them to notes.npy for easy reuse later on.

In [127]:
# Turn the token list into a NumPy array, then persist it to notes.npy.
notes = np.array(notes)
np.save(file="notes.npy", arr=notes)

Parse the data into LSTM-compatible arrays (each sample uses the past `sequence_length` notes, 300 here, to predict the next one).

In [126]:
# Number of consecutive tokens the network sees before predicting the next one.
sequence_length = 300

# Vocabulary: every distinct note/chord token, sorted so the token -> integer
# mapping is deterministic for a given data set.
pitchnames = sorted(set(notes))
vocab_size = len(pitchnames)
note_to_int = {name: number for number, name in enumerate(pitchnames)}

# Without at least one full window plus a target there is nothing to train on;
# fail here with a clear message instead of deep inside to_categorical.
if len(notes) <= sequence_length:
    raise ValueError(
        "Need more than {} notes to build a training sequence, got {}".format(
            sequence_length, len(notes)
        )
    )

# Encode every token once; the sliding windows below only slice this list.
encoded_notes = [note_to_int[name] for name in notes]

network_input = []
network_output = []
# Each sample is `sequence_length` consecutive tokens; its label is the token
# that immediately follows the window.
for i in range(len(encoded_notes) - sequence_length):
    network_input.append(encoded_notes[i:i + sequence_length])
    network_output.append(encoded_notes[i + sequence_length])
n_patterns = len(network_input)

# Reshape to (samples, timesteps, features), the layout LSTM layers expect.
network_input = np.reshape(network_input, (n_patterns, sequence_length, 1))
# Keep the integer-encoded windows: generation seeds itself from these.
non_normalized_input = network_input
# Scale the integer codes into [0, 1).
network_input = network_input / float(vocab_size)
# Pin the one-hot width to the vocabulary size. Left to infer it,
# to_categorical uses max(label) + 1, which is too narrow for the model's
# output layer whenever the highest-numbered token never occurs as a target.
network_output = to_categorical(network_output, num_classes=vocab_size)

In [129]:
# Persist the prepared inputs (x) and one-hot targets (y) next to the notebook.
np.save(file="x.npy", arr=network_input)
np.save(file="y.npy", arr=network_output)

In [130]:
# Three stacked LSTM layers followed by a dense head that ends in a softmax
# over the whole note/chord vocabulary.
timesteps, features = network_input.shape[1], network_input.shape[2]
model = Sequential([
    LSTM(400, input_shape=(timesteps, features), return_sequences=True),
    Dropout(0.3),
    LSTM(800, return_sequences=True),
    Dropout(0.3),
    LSTM(400),
    Dense(400),
    Dropout(0.3),
    Dense(len(pitchnames)),
    Activation('softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
# Checkpoints go to ./weights and TensorBoard logs to ./logs/<timestamp>.
# The paths are built with os.path.join rather than a hard-coded "\": in a
# normal string literal "\w" and "\{" are invalid escape sequences, and a
# backslash is only a path separator on Windows.
os.makedirs("weights", exist_ok=True)
filepath = os.path.join(
    "weights", "weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5"
)
# Write a checkpoint only when the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=0,
    save_best_only=True,
    mode='min'
)
# One log directory per run, named after the start timestamp.
tensorboard = TensorBoard(
    log_dir=os.path.join("logs", str(time())),
    profile_batch=0,
    update_freq='epoch'
)
callbacks_list = [checkpoint, tensorboard]
model.fit(network_input, network_output, epochs=300, batch_size=64, callbacks=callbacks_list)

Train on 77735 samples
Epoch 1/300








Epoch 2/300
  128/77735 [..............................] - ETA: 34:05 - loss: 4.70 - ETA: 34:15 - loss: 4.7485

## Generation

In [119]:
# Number of notes/chords to generate.
n_generate = 1000

# Pick a random training window as the seed. randint's upper bound is
# exclusive, so passing len(network_input) makes every window eligible
# (and also works when there is only a single window).
start = np.random.randint(0, len(network_input))
int_to_note = dict(enumerate(pitchnames))
# Integer-encoded seed window, flattened to 1-D so it can be slid forward.
pattern = non_normalized_input[start].flatten()
# Same scaling factor that was applied to the training inputs.
scale = float(len(pitchnames))
prediction_output = []
for step in tqdm(range(n_generate)):
    # The model expects (batch, timesteps, features) with normalised values.
    prediction_input = np.reshape(pattern, (1, pattern.shape[0], 1)) / scale
    prediction = model.predict(prediction_input, verbose=0)
    # Greedy decoding: always take the most probable next token.
    index = np.argmax(prediction)
    prediction_output.append(int_to_note[index])
    # Slide the window: drop the oldest token, append the new prediction.
    pattern = np.append(pattern[1:], index)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [120]:
# Turn the generated tokens back into music21 Note/Chord objects.
# The loop uses its own names (`token`, `chord_notes`) so that it does not
# overwrite the notebook-level `notes` array or the generation `pattern`;
# rebinding those would break earlier cells if they are re-run afterwards.
offset = 0
output_notes = []
for token in prediction_output:
    if ('.' in token) or token.isdigit():
        # Chord token: "."-separated integers (or a single integer).
        chord_notes = []
        for current_note in token.split('.'):
            new_note = note.Note(int(current_note))
            new_note.storedInstrument = instrument.Piano()
            chord_notes.append(new_note)
        new_chord = chord.Chord(chord_notes)
        new_chord.offset = offset
        output_notes.append(new_chord)
    else:
        # Note token: a pitch name as produced by str(pitch) during parsing.
        new_note = note.Note(token)
        new_note.offset = offset
        new_note.storedInstrument = instrument.Piano()
        output_notes.append(new_note)
    # Advance the offset each iteration so that notes do not stack.
    offset += 0.5

In [121]:
# Wrap the generated notes in a stream and write it to a timestamped MIDI file.
# The write call stays last so the cell still displays its return value.
midi_stream = stream.Stream(output_notes)
output_path = f'anime_music_{time()}.mid'
midi_stream.write('midi', fp=output_path)

'anime_music_1589394248.6651573.mid'