In [2]:
import collections
import datetime
import glob
import numpy as np
import pathlib
import pandas as pd
import random
import pretty_midi
import seaborn as sns
import tensorflow as tf
from model import *
import math
from IPython import display
from matplotlib import pyplot as plt
from typing import Dict, List, Optional, Sequence, Tuple

In [4]:
# Root folder of the training corpus.
data_dir = pathlib.Path('data/pokemon_rby')
# `**` only descends through nested folders when recursive=True; without it the
# pattern behaves like a single `*`. Sorting makes the file order (and thus the
# order in which notes are concatenated later) independent of the filesystem.
filenames = sorted(glob.glob(str(data_dir/'**/*.mid*'), recursive=True))
print(len(filenames))

114


In [5]:
def midi_to_notes(midi_file: str, max_instruments: Optional[int] = 5) -> pd.DataFrame:
  """Flatten a MIDI file into a table of notes ordered by onset time.

  Notes from the first `max_instruments` instruments are merged into a single
  stream and sorted by their start time.

  Args:
    midi_file: Path of the MIDI file to parse.
    max_instruments: How many leading instruments to read; `None` reads all
      of them. Defaults to 5.

  Returns:
    A DataFrame with one row per note and the columns `pitch`, `start`, `end`,
    `step` (onset minus the previous note's onset, 0 for the first note) and
    `duration` (`end - start`). A file without notes yields an empty frame
    with the same columns instead of raising.
  """
  pm = pretty_midi.PrettyMIDI(midi_file)
  merged = [n for instrument in pm.instruments[:max_instruments]
            for n in instrument.notes]
  # list.sort is stable, so simultaneous notes keep their instrument order.
  merged.sort(key=lambda n: n.start)

  # Explicit dtypes keep the columns typed even when there are no notes.
  pitch = np.array([n.pitch for n in merged], dtype=int)
  start = np.array([n.start for n in merged], dtype=float)
  end = np.array([n.end for n in merged], dtype=float)

  # Gap between consecutive onsets; the first note has no predecessor so its
  # step stays 0. Slicing makes this safe for empty and one-note inputs.
  step = np.zeros_like(start)
  step[1:] = start[1:] - start[:-1]

  return pd.DataFrame({
      'pitch': pitch,
      'start': start,
      'end': end,
      'step': step,
      'duration': end - start,
  })

In [6]:
def notes_to_midi(
  notes: pd.DataFrame,
  out_file: str, 
  instrument_name: str,
  velocity: int = 100,  # note loudness
) -> pretty_midi.PrettyMIDI:
  """Render a note table as a single-instrument MIDI file.

  Args:
    notes: Table with `pitch`, `step` and `duration` columns; each row's onset
      is the previous row's onset plus its `step` (the first is offset from 0).
    out_file: Path the MIDI file is written to.
    instrument_name: Instrument name, resolved to a program number through
      `pretty_midi.instrument_name_to_program`.
    velocity: Velocity assigned to every note.

  Returns:
    The `PrettyMIDI` object that was written to `out_file`.
  """
  program = pretty_midi.instrument_name_to_program(instrument_name)
  track = pretty_midi.Instrument(program=program)

  onset = 0
  for _, row in notes.iterrows():
    # Onsets are stored relative to the previous note, so accumulate them.
    onset = float(onset + row['step'])
    release = float(onset + row['duration'])
    track.notes.append(
        pretty_midi.Note(
            velocity=velocity,
            pitch=int(row['pitch']),
            start=onset,
            end=release,
        ))

  midi = pretty_midi.PrettyMIDI()
  midi.instruments.append(track)
  midi.write(out_file)
  return midi
    

In [7]:
def closest(numbers, target):
    """Return, as a string, the entry of `numbers` nearest to `target`.

    Ties go to the entry that appears first in `numbers`.
    """
    nearest = min(numbers, key=lambda candidate: math.fabs(candidate - target))
    return str(nearest)

def split_labels(sequences):
    """Split a sequence into next-item prediction inputs and targets.

    Returns a pair: everything but the last element (inputs) and everything
    but the first element (labels), so labels are the inputs shifted by one.
    """
    return sequences[:-1], sequences[1:]

In [8]:
# Columns, in this order, that make up one note token.
key_order = ['pitch', 'step', 'duration']

# Alternative bin sets, currently disabled:
#durations = [.001, .02, .03, .04, .05, .06, .08, .1, .35, 1]
#steps = [.0001, .001, .003, .005, .008, .01, .04, .06, .1, .3, .7, 1]

# Quantisation bins: measured steps/durations are snapped to the nearest entry.
durations = [2, 1, .5, .25, .125, .0625, .03125, .015625]
steps = [2, 1, .5, .25, .125, .0625, .03125, .015625]
pitches = 128
vocab_size = len(durations) * len(steps) * pitches

# Token table: the key is "pitch+step+duration", the value is an id counted
# from 1 in pitch-major, then step, then duration order.
# NOTE(review): ids run 1..vocab_size and pitches 1..128, whereas MIDI pitch
# numbers are 0..127 and a Dense(vocab_size) head indexes 0..vocab_size-1.
# Pitch 0 has no entry and id 0 has no reverse entry - confirm this is
# intended before retraining (renumbering would invalidate saved weights).
vocab = {
    f"{pitch}+{step_bin}+{duration_bin}": token_id
    for token_id, (pitch, step_bin, duration_bin) in enumerate(
        ((p, s, d)
         for p in range(1, pitches + 1)
         for s in steps
         for d in durations),
        start=1,
    )
}

# Inverse lookup: token id -> "pitch+step+duration" key.
rev_vocab = {token_id: key for key, token_id in vocab.items()}

seq_length = 30
batch_size = 128
num_files =  184

print(vocab_size)

8192


In [9]:
# Parse each training file into a note table and stack the tables into one
# long DataFrame; files are simply laid end to end.
per_file_notes = []
for f in filenames[:num_files]:
  notes = midi_to_notes(f)
  per_file_notes.append(notes)

all_notes = pd.concat(per_file_notes)

n_notes = len(all_notes)
print('Number of notes parsed:', n_notes)

# Turn every (pitch, step, duration) row into its vocabulary id. Steps and
# durations are snapped to the nearest bin before the lookup.
stacked_notes = np.stack([all_notes[key] for key in key_order], axis=1)
tokenized_notes = [
    vocab[str(int(pitch)) + '+' + closest(steps, step) + '+' + closest(durations, duration)]
    for pitch, step, duration in stacked_notes
]

notes_ds = tf.data.Dataset.from_tensor_slices(tokenized_notes)

# Sliding windows of seq_length + 1 tokens: the extra token lets split_labels
# produce inputs and one-step-shifted labels of length seq_length each.
expanded_seq = seq_length+1
windows = notes_ds.window(expanded_seq, shift=1, stride=1, drop_remainder=True)
flatten = lambda x: x.batch(expanded_seq, drop_remainder=True)
sequences = windows.flat_map(flatten)
    
seq_ds = sequences.map(split_labels, num_parallel_calls=tf.data.AUTOTUNE)

# One buffer slot per window gives a full shuffle; clamp to 1 so a tiny corpus
# does not hand shuffle() a non-positive buffer size.
buffer_size = max(n_notes-seq_length, 1)
# cache() must precede shuffle(): caching after shuffle/batch would replay the
# first epoch's order and batch composition on every later epoch.
train_dataset = (seq_ds
            .cache()
            .shuffle(buffer_size)
            .batch(batch_size, drop_remainder=True)
            .prefetch(tf.data.AUTOTUNE))



Number of notes parsed: 135124


In [10]:
# Model width; the embedding size and the value the LR scheduler reads are
# kept identical.
d_model = 300
embed_dim = d_model
num_heads = 8
warmup_steps = 4000

# Base rate handed to the optimizer.
learning_rate = 0.0005
        
def scheduler(epoch, lr):
    """Per-epoch learning-rate schedule.

    Epoch 0 keeps the incoming `lr`. Every later epoch ignores `lr` and returns
    d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5), where
    step = epoch * warmup_steps.

    NOTE(review): because step is a multiple of warmup_steps, the second term
    of the min is never the smaller one for epoch >= 1 - confirm that scaling
    the epoch by warmup_steps (rather than steps per epoch) is intended.
    """
    if epoch != 0:
        step_num = epoch * warmup_steps
        decay_term = step_num**(-0.5)
        warmup_term = step_num * warmup_steps**(-1.5)
        return d_model**(-0.5) * min(decay_term, warmup_term)
    return lr
        
# Adam starts from the configured base learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Targets are integer token ids and the model emits raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Keras calls `scheduler(epoch, lr)` to pick the learning rate for each epoch.
lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)

In [11]:
# Network: token ids -> embedding -> one encoder block -> per-position logits
# over the vocabulary.
# NOTE(review): `keras`, `layers`, `InputEmbedding` and `EncoderBlock` are not
# imported by name in this notebook; presumably they come from
# `from model import *` - confirm.

# shape must be a tuple: `(seq_length)` is just the int seq_length, which only
# some Keras versions tolerate; `(seq_length,)` is the documented form.
encoder_inputs = keras.Input(shape = (seq_length,), dtype='int32', name = "input_layer_new")
y = InputEmbedding(seq_length, vocab_size, embed_dim, name = "embed_layer")(encoder_inputs)
z = EncoderBlock(embed_dim, 2048, num_heads, name = "encoder_layer_new")(y) 
encoder_outputs = layers.Dense(vocab_size, name = "output_layer_new")(z)

# Full next-token model, plus a second model exposing the embedding output.
encoder = keras.Model(encoder_inputs, encoder_outputs)
embeddings = keras.Model(encoder_inputs, y)

encoder.summary()

encoder.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Save weights after each epoch, stop once the training loss has not improved
# for 5 epochs (restoring the best weights), and apply the epoch-wise LR
# schedule.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath='./training_checkpoints/ckpt_{epoch}',
        save_weights_only=True),
    tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        patience=5,
        verbose=1,
        restore_best_weights=True),
    lr_callback
]

# Training is disabled here; a later cell loads a saved model instead.
#encoder.fit(train_dataset, epochs=30, callbacks=callbacks)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer_new (InputLayer  [(None, 30)]             0         
 )                                                               
                                                                 
 input_embedding (InputEmbed  (None, 30, 300)          2466600   
 ding)                                                           
                                                                 
 encoder_block (EncoderBlock  (None, 30, 300)          4119848   
 )                                                               
                                                                 
 output_layer_new (Dense)    (None, 30, 8192)          2465792   
                                                                 
Total params: 9,052,240
Trainable params: 9,052,240
Non-trainable params: 0
___________________________________________________

In [174]:
# Replace the freshly built `encoder` with the model stored on disk.
encoder = keras.models.load_model(filepath='models/rby_model')

In [187]:
temperature = .001  # NOTE(review): never read below - decoding is greedy (argmax)
num_predictions = 1000

# Random index into `filenames`, only used by the commented-out seed file below.
# randrange keeps it within 0..len-1 (randint(1, 114) could never pick the first
# file and could run one past the last).
x = random.randrange(len(filenames)) if filenames else 0
#sample_file = filenames[x]
# Build the path from its parts: the previous literal relied on backslashes,
# which are invalid escape sequences (`\p`, `\P`, `\J`) and Windows-only.
sample_file = str(pathlib.Path('data', 'pokemon_dpp', 'Pokemon', 'Jubilife City.mid'))
pm = pretty_midi.PrettyMIDI(sample_file)
initial_notes = midi_to_notes(sample_file)
instrument_name = 'Acoustic Grand Piano'

# The first seq_length notes of the sample seed the generation.
stacked = np.stack([initial_notes[key] for key in key_order], axis=1)[:seq_length]

tokenized = [
    vocab[str(int(pitch)) + '+' + closest(steps, step) + '+' + closest(durations, duration)]
    for pitch, step, duration in stacked
]
if len(tokenized) < seq_length:
    raise ValueError(
        f'{sample_file} has only {len(tokenized)} notes; '
        f'{seq_length} are needed to seed the model')

generated_notes = []
prev_start = 0

for _ in range(num_predictions):
    inputs = tf.expand_dims(tokenized, 0)
    # Logits for the position after the last input token. verbose=0 stops
    # predict() from printing a progress bar on every iteration.
    predictions = encoder.predict(inputs, verbose=0)[0][-1]
    # Vocabulary ids start at 1, so output index 0 has no entry in rev_vocab;
    # take the arg-max over the remaining indices and shift back.
    token = int(np.argmax(predictions[1:])) + 1
    pitch_str, step_str, duration_str = rev_vocab[token].split('+')
    pitch, step, duration = int(pitch_str), float(step_str), float(duration_str)
    start = prev_start + step
    end = start + duration
    generated_notes.append((pitch, step, duration, start, end))
    prev_start = start
    # Slide the context window: drop the oldest token, append the new one.
    tokenized = tokenized[1:] + [token]

generated_notes = pd.DataFrame(
    generated_notes, columns=(*key_order, 'start', 'end'))









In [188]:
# Write the generated notes to disk as a MIDI file played by `instrument_name`.
out_file = 'output.mid'
out_pm = notes_to_midi(
    notes=generated_notes,
    out_file=out_file,
    instrument_name=instrument_name,
)