In [None]:
%load_ext autoreload
%autoreload 2

# Reinforcement learning for generating music

Inspired by [this DeepMind article](https://deepmind.com/blog/article/learning-to-generate-images), we want to train an agent to generate music.
When humans create music, they generate neither raw waveforms nor spectrograms; instead, they choose a couple of sounds or instruments and experiment with them on a higher level. MIDI data is a good abstraction for this.
We can try to mimic this process by having an agent generate the chords while simultaneously training a discriminator $D$. The agent's reward will be the log-likelihood, as predicted by $D$, that the generated sequence is real music.


Code structure:

Discriminator:
    - transform (midi -> mel spectro)
    - predict
    - train on batch
    - Models:
        - conv net
        - lstm
        - transformer
        
    

Environment:
    - step
    - reset
    - render
    - close
    
Agent:
    - next chord
    
    
midi -> wav -> mel -> D

Agent -> y -> midi 
    


## Code structure

In [None]:
class Discriminator:
    """Common interface for neural nets that learn to tell
    real midi data apart from generated midi data.

    All methods are placeholders for now and return None.
    """

    def predict(self, sequence):
        """Score a single midi sequence.

        Gets:
            sequence: List of midi events, single sample
        Returns:
            q: float, log likelihood of the sequence being real
               (intended contract; the placeholder returns None)
        """
        return None

    def train_on_batch(self, sequences):
        """Run one training step (placeholder, does nothing yet).

        Gets:
            sequences: List of midi sequences
        """
        return None

    @staticmethod
    def __transform(sequence):
        """Turn a sequence into a mel spectrogram that can be fed
        to the neural net (placeholder, returns None).
        """
        return None

In [None]:
class Agent:
    """Placeholder interface of the music generating agent."""

    def reset(self):
        """Forget everything remembered so far, so that the agent
        can start generating a new song (placeholder, returns None).
        """
        return None

    def next_action(self, observation):
        """Pick the next action for `observation` (placeholder, returns None)."""
        return None

In [None]:
import gym


class Environment(gym.Env):
    """Skeleton of the gym environment for the music generating agent.

    Only the gym.Env method signatures are laid out so far. `step` and
    `reset` raise NotImplementedError until they are written; previously
    they returned names that were never defined, which surfaced as a
    confusing NameError.
    """

    def __init__(self):
        super().__init__()
        # TODO: define action and observation space.
        # They must be gym.spaces objects.
        # Example when using discrete actions:
        #     self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
        # Example for using image as input:
        #     self.observation_space = spaces.Box(low=0, high=255,
        #                                         shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)

    def step(self, action):
        """Apply `action` to the environment.

        Intended to return the tuple (observation, reward, done, info).
        Raises:
            NotImplementedError: always, the dynamics are not written yet
        """
        raise NotImplementedError("Environment.step is not implemented yet")

    def reset(self):
        """Start a new episode.

        Intended to return only the initial observation
        (reward, done, info can't be included).
        Raises:
            NotImplementedError: always, the reset logic is not written yet
        """
        raise NotImplementedError("Environment.reset is not implemented yet")

    def render(self, mode='human'):
        """Render the current state (placeholder, does nothing)."""
        pass

    def close(self):
        """Release resources held by the environment (placeholder, does nothing)."""
        pass

In [None]:
class Training:
    """Placeholder for the training setup.

    The constructor accepts the data, the environment and the agent,
    but does not store or use them yet.
    """

    def __init__(self, data, env, agent):
        return None

# Getting and inspecting the dataset

In [None]:
# !wget https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip
# !unzip maestro-v2.0.0-midi.zip

In [None]:
ls maestro-v2.0.0

In [None]:
import pandas as pd

df = pd.read_csv('maestro-v2.0.0/maestro-v2.0.0.csv')

In [None]:
df

In [None]:
# from __future__ import print_function

# import numpy as np
# import pretty_midi
# import tensorflow as tf


# def piano_roll_sequences(filenames, batch_size, sequence_size, rate=100):
#     """Returns a dataset of piano roll sequences from the given files.."""

#     def _to_piano_roll(filename, sequence_size):
#         """Load a file and return consecutive piano roll sequences."""
#         try:
#             midi = pretty_midi.PrettyMIDI(tf.compat.as_text(filename))
#         except Exception:
#             print("Skipping corrupt MIDI file", filename)
#             return np.zeros([0, sequence_size, 128], dtype=np.bool)
#         roll = np.asarray(midi.get_piano_roll(rate).transpose(), dtype=np.bool)
#         assert roll.shape[1] == 128
#         # Pad the roll to a multiple of sequence_size
#         length = len(roll)
#         remainder = length % sequence_size
#         if remainder:
#             new_length = length + sequence_size - remainder
#             roll = np.resize(roll, (new_length, 128))
#             roll[length:, :] = False
#             length = new_length
#         return np.reshape(roll, (length // sequence_size, sequence_size, 128))

#     def _to_piano_roll_dataset(filename):
#         """Filename (string scalar) -> Dataset of piano roll sequences."""
#         sequences, = tf.py_function(_to_piano_roll,
#                                 [filename, sequence_size],
#                                 [tf.bool])
#         sequences.set_shape([None, None, 128])
#         return tf.data.Dataset.from_tensor_slices(sequences)

#     batch_size = tf.cast(batch_size, tf.int64)
#     return (tf.data.Dataset.from_tensor_slices(filenames)
#             .interleave(_to_piano_roll_dataset,
#                         cycle_length=batch_size * 5,
#                         block_length=1)
#             .repeat()
#             .shuffle(1000)
#             .batch(batch_size))


# def piano_roll_to_midi(piano_roll, sample_rate):
#     """Convert the piano roll to a PrettyMIDI object.
#     See: http://github.com/craffel/examples/reverse_pianoroll.py
#     """
#     midi = pretty_midi.PrettyMIDI()
#     instrument = pretty_midi.Instrument(0)
#     midi.instruments.append(instrument)
#     padded_roll = np.pad(piano_roll, [(1, 1), (0, 0)], mode='constant')
#     changes = np.diff(padded_roll, axis=0)
#     notes = np.full(piano_roll.shape[1], -1, dtype=np.int)
#     for tick, pitch in zip(*np.where(changes)):
#         prev = notes[pitch]
#         if prev == -1:
#             notes[pitch] = tick
#             continue
#         notes[pitch] = -1
#         instrument.notes.append(pretty_midi.Note(
#             velocity=100,
#             pitch=pitch,
#             start=prev / float(sample_rate),
#             end=tick / float(sample_rate)))
#     return midi


# def write_test_note(path, duration, note):
#     midi = pretty_midi.PrettyMIDI()
#     instrument = pretty_midi.Instrument(0)
#     instrument.notes.append(pretty_midi.Note(100, note, 0.0, duration))
#     midi.instruments.append(instrument)
#     midi.write(path)

# Pretty Midi

Resources:
https://nbviewer.jupyter.org/github/craffel/pretty-midi/blob/master/Tutorial.ipynb

Instruments
- How many instruments do the samples have? -> 1
- What information is there about each instrument?
    - Notes
    - Control changes
    - Pitch bends -> Our dataset doesn't contain any

In [None]:

import pretty_midi
from glob import glob
# Sort the paths: glob returns them in a filesystem dependent order, and
# samples[0] is used as the running example in the cells below, so the
# order has to be the same on every machine and every run.
samples = [pretty_midi.PrettyMIDI(path) for path in sorted(glob('maestro-v2.0.0/2008/**.midi'))]

In [None]:
samples[0].__dict__

In [None]:
# pyplot is imported here because this is the first cell that plots;
# without it the cell fails with a NameError on a fresh kernel.
from matplotlib import pyplot as plt

# Distribution of the estimated tempo over all loaded samples
plt.hist([sample.estimate_tempo() for sample in samples], bins=20)
plt.show()

## Visualizing

We can plot the notes being played per sample

In [None]:
import librosa.display
from matplotlib import pyplot as plt

def plot_piano_roll(pm, start_pitch, end_pitch, fs=100):
    """Draw the piano roll of `pm` for the pitches start_pitch..end_pitch-1.

    Gets:
        pm: object with a get_piano_roll(fs) method (a PrettyMIDI object)
        start_pitch: first pitch row that is shown
        end_pitch: pitch row at which the shown range stops (exclusive)
        fs: sampling rate passed to get_piano_roll and to specshow
    """
    # Only keep the rows of the requested pitch range
    roll_window = pm.get_piano_roll(fs)[start_pitch:end_pitch]
    # The lowest displayed row corresponds to the frequency of start_pitch
    lowest_hz = pretty_midi.note_number_to_hz(start_pitch)
    # librosa's specshow is used for displaying the piano roll
    librosa.display.specshow(
        roll_window,
        hop_length=1,
        sr=fs,
        x_axis='time',
        y_axis='cqt_note',
        fmin=lowest_hz,
    )

# Show the pitch rows 56 to 69 of the first sample's piano roll
plt.figure(figsize=(8, 4))
plot_piano_roll(samples[0], 56, 70)

## Listen to it

In [90]:
from IPython.display import display, Audio
import soundfile as sf


def midi2wav(sample, fs=44100):
    """Generate an in-memory waveform from a PrettyMIDI object
    Gets:
        sample: PrettyMIDI object
        fs: int, sample rate used for the synthesis (default 44100)
    Returns:
        data: np.array with 1 dimension, waveform
        rate: int, sample rate
    """
    # The same fs is used for synthesis and returned, so the reported
    # rate can never disagree with the rate of the data.
    return sample.synthesize(fs=fs), fs


def listen_to(sample):
    """Render a PrettyMidi object and show an audio player for it"""
    waveform, sample_rate = midi2wav(sample)
    player = Audio(data=waveform, rate=sample_rate)
    display(player)
    
def save_as_wav(sample, filename):
    """Render a PrettyMidi object and write the waveform to `filename`"""
    waveform, sample_rate = midi2wav(sample)
    sf.write(filename, waveform, sample_rate)

In [None]:
listen_to(samples[0])

## Instruments

In [None]:
# How many instruments per sample?
num_instruments = [len(sample.instruments) for sample in samples]
# Displayed as (min, max); equal values mean every sample has the same instrument count
min(num_instruments), max(num_instruments)

In [None]:
[sample.instruments[0].program for sample in samples]

## Resolution

In [None]:
[sample.resolution for sample in samples]

## Lyrics

In [None]:
[sample.lyrics for sample in samples if len(sample.lyrics)>0]

## Notes

In [None]:
# Notes of one instrument
sorted(samples[0].instruments[0].notes, key = lambda a: a.start)

In [None]:
# Number of notes per file
from matplotlib import pyplot as plt
plt.hist([len(sample.instruments[0].notes) for sample in samples], bins=20)
plt.show()

## Pitch bends
Since MIDI notes are all defined to have a specific integer pitch value, in order to represent arbitrary pitch frequencies we need to use pitch bends. A PitchBend class in pretty_midi holds a time (in seconds) and a pitch offset. The pitch offset is an integer in the range [-8192, 8191], which in General MIDI spans the range from -2 to +2 semitones. As with Notes, the Instrument class has a list for PitchBend class instances.

In [None]:
[len(sample.instruments[0].pitch_bends) for sample in samples]

Our dataset doesn't contain any pitch bends, so we will ignore them.

## Control Changes

In [None]:
# Number of control changes per file
plt.hist([len(sample.instruments[0].control_changes) for sample in samples], bins=20)
plt.show()

In [None]:
sorted(samples[0].instruments[0].control_changes, key=lambda a: a.time)

What does the value mean? Should we use one hot encoding or use the numerical value?
-> https://www.midi.org/specifications-old/item/table-3-control-change-messages-data-bytes-2

Number is a categorical feature, value is a numerical feature.

In [None]:
sample_control_changes_values = [i.value for i in samples[0].instruments[0].control_changes]
sorted(set(sample_control_changes_values))

In [None]:
# Control numbers of the first instrument, one list per sample
tmp = [[i.number for i in sample.instruments[0].control_changes] for sample in samples]
# Flatten the per-sample lists into a single list
sample_control_changes_number = [item for sublist in tmp for item in sublist]
# Displayed: the distinct control numbers in ascending order
sorted(set(sample_control_changes_number))

## Midi vector Mapper class

Summarizing, we got:
- One instrument per song in our piano dataset
- Notes that look like this: `Note(start=3.192708, end=3.227865, pitch=59, velocity=69)`
    - What does velocity mean? -> How fast you hit the keyboard key, i.e. volume
- Control changes that have a categorical feature (number) and a numerical value
    - In our dataset, only a few different control numbers occur. So for one hot encoding, we should use only as many dimensions as different numbers occur, which is why the mapper should be dataset dependent
- No pitch bends
- Resolution is always 384

How should we encode it?
- Sequence of events, ordered by time
- Notes:
    - Encode duration vs end
    - Encode absolute start vs offset since the last event vs offset relative to the rhythm?
    
Should the mapping from one-hot encoding to MIDI category be deterministic (`[0.1, 0.5, 0.4] -> 1`, i.e. the argmax) or probabilistic ($P(c \mid \text{onehot}) = \text{onehot}[c]$)?

In [None]:
s = samples[0].instruments[0]
s.notes[0].start, s.control_changes


In [None]:
import numpy as np


class MidiVectorMapper():
    """Map a PrettyMIDI object to a sequence of vectors and back.
    For single instrument midi tracks only: only `instruments[0]` is read.

    Every event (note or control change) becomes one row, rows are
    ordered by time. Columns:
        0: time
        1: is_note
        For notes only:
        2: pitch
        3: velocity
        4: duration
        For control changes only:
        5: value
        6 .. dims-1: one hot encoding for control number

    Gets:
        - samples: List of PrettyMIDI objects, to check for the categorical features, which features exist
    """

    # Number of columns in front of the one hot block
    # (time, is_note, pitch, velocity, duration, value).
    N_FIXED_DIMS = 6

    def __init__(self, samples):
        """Collect the control numbers that occur in `samples`; they
        define the one hot categories and therefore the vector width.
        """
        self.control_change_categories = sorted({
            control_change.number
            for sample in samples
            for control_change in sample.instruments[0].control_changes
        })
        # 6 fixed columns plus one column per control number. The former
        # `5 + ...` left no column for the last category.
        self.dims = self.N_FIXED_DIMS + len(self.control_change_categories)

    def _timeof(self, event):
        """Return the start time for notes or the time for control change events
        """
        return event.start if isinstance(event, pretty_midi.Note) else event.time

    def midi2vec(self, sample):
        """Map a PrettyMIDI object to a sequence of vectors

        Gets:
            sample: PrettyMIDI object, only its first instrument is encoded
        Returns:
            seq: np.array of shape [number of events, self.dims]
        Raises:
            ValueError: if a control number occurs that was not part of
                the samples given to the constructor
        """
        instrument = sample.instruments[0]
        events = sorted(
            instrument.notes + instrument.control_changes,
            key=self._timeof
        )
        seq = np.zeros([len(events), self.dims])
        for i, event in enumerate(events):
            seq[i, 0] = self._timeof(event)
            if isinstance(event, pretty_midi.Note):
                seq[i, 1:5] = 1, event.pitch, event.velocity, event.end - event.start
            else:
                # Index the row of this event (i); indexing with the
                # column alone would overwrite entire rows of other events.
                category = self.control_change_categories.index(event.number)
                seq[i, 5] = event.value
                seq[i, self.N_FIXED_DIMS + category] = 1

        return seq

    def vec2midi(self, seq):
        """Map a vector to a PrettyMIDI object with a single piano

        Gets:
            seq: sequence of event vectors with the column layout of midi2vec
        Returns:
            song: PrettyMIDI object holding one instrument with program 0
        """
        song = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=300)
        piano = pretty_midi.Instrument(program=0)
        for event_vec in seq:
            if event_vec[1] > 0.5:
                piano.notes.append(
                    pretty_midi.Note(
                        start=event_vec[0],
                        pitch=int(event_vec[2]),
                        velocity=int(event_vec[3]),
                        end=event_vec[0]+event_vec[4]
                    )
                )
            else:
                # The largest entry of the one hot block selects the control number
                category = int(np.argmax(event_vec[self.N_FIXED_DIMS:]))
                piano.control_changes.append(
                    pretty_midi.ControlChange(
                        time=event_vec[0],
                        value=int(event_vec[5]),
                        number=self.control_change_categories[category]
                    )
                )
        song.instruments.append(piano)
        return song
    
# Round trip: encode the first sample as vectors, decode it again
# and listen to the reconstruction
mapper = MidiVectorMapper(samples)
seq = mapper.midi2vec(samples[0])
reconstruction = mapper.vec2midi(seq)
listen_to(reconstruction)

In [None]:
listen_to(samples[0])

## Generating a random midi track

In [None]:
rng = np.random.default_rng(0) # fixed seed, so the random track is reproducible
n_events = 8000
# take the vector width from the mapper instead of hard coding it
rand = np.zeros([n_events, mapper.dims])
rand[:,0] = rng.uniform(0, 300, size=n_events) # start
rand[:,1] = rng.uniform(0, 1, size=n_events) < seq[:,1].mean() # is_note
is_note = rand[:,1] == 1
# split notes and control change events
rand_notes = rand[is_note] # select rows where is_note is true
rand_cc = rand[~is_note] # select rows where is_note is false
seq_notes = seq[seq[:,1]==1] # select rows where is_note is true
seq_cc = seq[seq[:,1]==0] # select rows where is_note is false
# notes: pitch, velocity, duration
# independent normal distributions: the diagonal of a covariance matrix
# holds variances, not standard deviations
rand_notes[:,2:5] = rng.multivariate_normal(seq_notes[:,2:5].mean(axis=0), np.diag(seq_notes[:,2:5].var(axis=0)), size=len(rand_notes))
# element-wise clamp to non negative durations; np.max(x, 0) would reduce
# the column to its maximum and give every note the same duration
rand_notes[:,4] = np.maximum(rand_notes[:,4], 0)
# events: value, one hot encodings for number
# it doesn't really make sense to use normal distributed values for one hot
# encoding - it should be a distribution where p(rand_one_hot.argmax()) is distributed
# like p(seq_one_hot.argmax()), but it doesn't really matter
rand_cc[:,5:] = rng.multivariate_normal(seq_cc[:,5:].mean(axis=0), np.diag(seq_cc[:,5:].var(axis=0)), size=len(rand_cc))
# copy back
rand[is_note] = rand_notes
rand[~is_note] = rand_cc
# pitch (2), velocity (3) and control value (5) are 7bit ints;
# the duration (4) is a float and must not be truncated
int_columns = [2, 3, 5]
rand[:,int_columns] = np.clip(rand[:,int_columns], 0, 127).astype(int)
# Done!
rand_midi = mapper.vec2midi(rand)
listen_to(rand_midi)

In [91]:
save_as_wav(rand_midi, 'random_sounds.wav')
!ls

exploration.ipynb       maestro-v2.0.0-midi.zip random_sounds.wav
[1m[36mmaestro-v2.0.0[m[m          maestro-v2.0.0.zip      requirements.txt
