# AnimeLSTM
Author: [@neelr](https://github.com/neelr)  
Dataset: https://www.kaggle.com/programgeek01/anime-music-midi/  

In summary, this notebook trains an LSTM neural network on anime-music MIDI files and then uses it to generate new music.

In [None]:
from tqdm.notebook import tqdm
import numpy as np
import re
import glob
from time import time
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import BatchNormalization as BatchNorm
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from keras_tqdm import TQDMNotebookCallback
from music21 import converter, instrument, note, chord, stream, tempo
import music21

Check all devices attached, make sure there is a GPU

In [None]:
# Ask TensorFlow for every physical device it can see (no type filter);
# the cell output shows whether a GPU is among them.
tf.config.list_physical_devices(device_type=None)


Parse each midi file and store all notes/chords/rests/metronome/time signature/key in the `notes` array.

In [None]:
# Flat list of string tokens, one per recognised musical event, accumulated
# across every MIDI file found in data/.
notes = []
allFiles = glob.glob("data/*.mid")
for midi_path in tqdm(allFiles):
    score = converter.parse(midi_path)
    by_instrument = instrument.partitionByInstrument(score)
    # With instrument parts: walk the first part recursively.
    # Without: fall back to the notes of the flattened score.
    events = by_instrument.parts[0].recurse() if by_instrument else score.flat.notes
    for event in events:
        # Turn each supported element into its string token; anything else
        # is skipped.
        if isinstance(event, note.Note):
            token = str(event.pitch)
        elif isinstance(event, chord.Chord):
            # Chords are stored as their normal-order numbers joined by dots.
            token = '.'.join(str(n) for n in event.normalOrder)
        elif isinstance(event, note.Rest):
            token = "rest"
        elif isinstance(event, tempo.MetronomeMark):
            token = f"metronome-{event.number}-{event.text}"
        elif isinstance(event, music21.meter.TimeSignature):
            token = f"timesig-{event.ratioString}"
        elif isinstance(event, music21.key.Key):
            token = f"key-{str(event)}"
        else:
            continue
        notes.append(token)

Save the `notes` list to `numpy_array_saves/notes.npy` (NumPy converts it to an array when saving) so it is easy to reuse later on.

In [None]:
# Write the token list to disk; np.save converts the list to an array.
np.save(file="numpy_array_saves/notes.npy", arr=notes)

Convert the data (`notes` array) into LSTM-compatible training samples: each sample uses the previous 200 notes to predict the next one.

In [None]:
# Number of past tokens the network sees when predicting the next one.
sequence_length = 200

# Vocabulary: every distinct token, sorted so the integer ids are stable
# for a given set of tokens.
pitchnames = sorted(set(notes))
# Map each token to its integer id.
note_to_int = {name: number for number, name in enumerate(pitchnames)}

# One training pattern per position that has a full window plus a target.
n_patterns = len(notes) - sequence_length
if n_patterns <= 0:
    raise ValueError(
        f"Need more than {sequence_length} notes to build training "
        f"sequences, got {len(notes)}"
    )

# Encode the whole token stream once instead of once per window.
encoded_notes = [note_to_int[name] for name in notes]
network_input = [
    encoded_notes[i:i + sequence_length] for i in range(n_patterns)
]
network_output = [
    encoded_notes[i + sequence_length] for i in range(n_patterns)
]

# Reshape the input into the (samples, timesteps, features) layout used by
# the LSTM layers.
network_input = np.reshape(network_input, (n_patterns, sequence_length, 1))
# Keep the integer-id version; generation seeds itself from it.
non_normalized_input = network_input
# Normalize the ids to [0, 1).
network_input = network_input / float(len(pitchnames))
# Pin the one-hot width to the vocabulary size. Without num_classes it is
# inferred from the largest target id, which is too small whenever the
# highest-id token never appears as a target, and then no longer matches
# the model's output layer.
network_output = to_categorical(network_output, num_classes=len(pitchnames))

In [None]:
# Vocabulary size: the number of distinct tokens, which is also the width of
# the model's final Dense layer.
len(pitchnames) # number of outputs

### Decide model structure
Main differences between v1, v2, and v3 are that v1 had 100 note lookback, and 88 samples, while v2 had a 200 note lookback and ~130 samples. The big jump from v2 to v3 is that I added melodies, time signatures, and keys.

In [None]:
# Input shape is taken from the prepared training data: (timesteps, features).
timesteps, features = network_input.shape[1], network_input.shape[2]

# Three stacked LSTM layers with dropout in between, followed by a dense
# layer and a softmax over the whole vocabulary.
model = Sequential([
    LSTM(300, input_shape=(timesteps, features), return_sequences=True),
    Dropout(0.3),
    LSTM(512, return_sequences=True),
    Dropout(0.3),
    LSTM(700),
    Dense(850),
    Dropout(0.3),
    Dense(len(pitchnames)),
    Activation('softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
import os

# Checkpoint file name template; Keras fills in the epoch number and loss.
filepath = "v3-{epoch:02d}-{loss:.4f}-bigger.hdf5"
# Save a checkpoint only when the training loss improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=0,
    save_best_only=True,
    mode='min'
)
# One log directory per run, named after the current timestamp. Built with
# os.path.join so the separator is right on every OS; the previous
# "logs\{}" literal relied on an invalid escape sequence and only formed a
# nested path on Windows.
tensorboard = TensorBoard(
    log_dir=os.path.join("logs", str(time())),
    profile_batch=0,
    update_freq='epoch'
)
# Take out checkpoint if you don't want to save the best weights, and tensorboard if you don't have that running
callbacks_list = [checkpoint, TQDMNotebookCallback(), tensorboard]

# Verbose 0 because keras TQDM is a better UI + keras' loading bars don't work for me locally
model.fit(network_input, network_output, epochs=200, batch_size=64, callbacks=callbacks_list,verbose=0)

In [None]:
# Skip the whole training and load previously saved weights if you want.
# Keras models have no `load` method; `load_weights` restores the weights
# from the saved file into the model that is already built.
model.load_weights("models/Anime-LSTM-Model-v3.hdf5")

## Generation
Here we use the model to generate 1000 notes. Below we predict each note and use the `int_to_note` dictionary to map the outputs to the corresponding strings.

In [None]:
# Pick a random training pattern as the seed. randint's upper bound is
# exclusive, so passing len(network_input) makes every pattern eligible
# (and also works when there is exactly one pattern).
start = np.random.randint(0, len(network_input))
# Reverse lookup: integer id -> token string.
int_to_note = dict(enumerate(pitchnames))
# Same normalisation constant that was applied to the training input.
n_vocab = float(len(pitchnames))
pattern = non_normalized_input[start]
prediction_output = []
# generate 1000 notes
for note_index in tqdm(range(1000)):
    # Shape the current window as a single-sample batch and normalise it.
    prediction_input = np.reshape(pattern, (1, pattern.shape[0], 1)) / n_vocab
    prediction = model.predict(prediction_input, verbose=0)
    # Greedy decoding: take the most probable token.
    index = np.argmax(prediction)
    prediction_output.append(int_to_note[index])
    # Slide the window: append the new id and drop the oldest one.
    pattern = np.append(pattern, [index])[1:]

Here we take the output strings, parse them into the corresponding music21 classes, and add them to the `output_notes` array.

In [None]:
# Running position (in quarter lengths) of the next generated element.
offset = 0
output_notes = []
# create note, chord, rest, key, tempo and time-signature objects based on
# the values generated by the model
for pattern in prediction_output:
    # Pattern is a chord: dot-separated numbers (or a single number).
    # Metronome tokens may also contain a '.', so they are excluded here.
    if ('.' in pattern or pattern.isdigit()) and 'metronome' not in pattern:
        # Use a dedicated list name: `notes` is the notebook-level dataset
        # and must not be overwritten here.
        chord_notes = []
        for current_note in pattern.split('.'):
            new_note = note.Note(int(current_note))
            new_note.storedInstrument = instrument.Piano()
            chord_notes.append(new_note)
        new_chord = chord.Chord(chord_notes)
        new_chord.offset = offset
        output_notes.append(new_chord)
    # Pattern is a rest
    elif pattern == "rest":
        new_note = note.Rest()
        new_note.offset = offset
        new_note.storedInstrument = instrument.Piano()
        output_notes.append(new_note)
    # Pattern is a key: "key-<tonic> <mode>"
    elif 'key' in pattern:
        key = music21.key.Key(re.findall(r"-(.*) ",pattern)[0], re.findall(r" (.*)",pattern)[0])
        # Give the key the current position like every other element.
        key.offset = offset
        output_notes.append(key)
    # Pattern is a metronome: "metronome-<number>-<text>"
    elif 'metronome' in pattern:
        metronome = music21.tempo.MetronomeMark(re.findall(r"-.*?(-(.*))",pattern)[0][1], float(re.findall(r"-(.*)-",pattern)[0]))
        metronome.offset = offset
        output_notes.append(metronome)
    # Pattern is timesig: "timesig-<ratio>"
    elif 'timesig' in pattern:
        # findall returns a list; pass the matched ratio string itself.
        timesig = music21.meter.TimeSignature(re.findall(r"-(.*)",pattern)[0])
        timesig.offset = offset
        output_notes.append(timesig)
    # If none, then pattern is a note
    else:
        new_note = note.Note(pattern)
        new_note.offset = offset
        new_note.storedInstrument = instrument.Piano()
        output_notes.append(new_note)
    # increase offset each iteration so that notes do not stack
    # Can change to edit the speed, but 0.5 is the norm
    offset += 0.5

Lastly we take the output notes and create a MIDI stream and save it!

In [None]:
# Wrap the generated elements in a Stream and export it as a MIDI file whose
# name carries the current timestamp.
midi_stream = stream.Stream(output_notes)
output_path = f'anime_music_{time()}.mid'
midi_stream.write('midi', fp=output_path)