# Imports and midi loading

Now that we have a trainable discriminator, it's time to build the environment.

TODO:
    - integrate the discriminator into the environment
        - reverse the postprocessing to set the valid range for actions
        - make it more efficient by making the discriminator stateful and always feeding it a single time step
    - convert the models to LSTMs
   

In [1]:
## Imports and data loading

%load_ext autoreload
%autoreload 2

import numpy as np
from matplotlib import pyplot as plt


from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras import metrics

from musicrl.midi2vec import MidiVectorMapper
from musicrl.render import *
from musicrl.data import RandomMidiDataGenerator

import pretty_midi
from glob import glob



import pprint
# Rebind `pprint` to the bound pretty-print method of a printer with a 4-space
# indent, so it can be called directly as `pprint(obj)`.
# NOTE: after this line the name `pprint` no longer refers to the module.
pprint = pprint.PrettyPrinter(indent=4).pprint


# Label values; presumably the discriminator targets (real = 1, generated = 0),
# matching the labels built in `train_discriminator` - TODO confirm.
REAL = 1
GEN = 0

In [2]:
REAL

1

In [3]:
# Sort the paths so that the order of `real_midis` (and therefore any index
# into it) is reproducible: glob returns matches in arbitrary,
# filesystem-dependent order.
filepaths = sorted(glob('maestro-v2.0.0/2008/**.midi'))
real_midis = [pretty_midi.PrettyMIDI(path) for path in filepaths]
mapper = MidiVectorMapper(real_midis)


In [4]:
# `mapper` was already built from `real_midis` in the previous cell, so it is
# reused here instead of being constructed a second time.
real_seq = mapper.midi2vec(real_midis[1])
real_seq.shape

(60867, 5)

In [5]:
# Vectorise every MIDI file, then crop each sequence to the length of the
# shortest one before packing them into a single array.
real_seqs = [mapper.midi2vec(midi) for midi in real_midis]
shortest = np.min(np.array([len(seq) for seq in real_seqs]))
real_seqs = np.array([seq[0:shortest] for seq in real_seqs])

# Load the discriminator

In [6]:
# Load the saved discriminator model from its HDF5 file.
discriminator = load_model("models/seq_lstm.h5")

In [7]:
# Show the model's input tensors followed by its layer-by-layer summary.
print(discriminator.inputs)
discriminator.summary()


[<tf.Tensor 'lstm_3_input:0' shape=(None, None, 5) dtype=float32>]
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, None, 128)         68608     
_________________________________________________________________
time_distributed_6 (TimeDist (None, None, 128)         16512     
_________________________________________________________________
time_distributed_7 (TimeDist (None, None, 1)           129       
Total params: 85,249
Trainable params: 85,249
Non-trainable params: 0
_________________________________________________________________


In [8]:
mapper.dims

5

# Environment

In [9]:
import gym
import pretty_midi



class SeqEnvironment(gym.Env):
    """Batched sequence-generation environment scored by a discriminator.

    Every call to :meth:`step` appends one action per batch element to the
    running sequences; the reward is the discriminator's prediction at the
    last time step of the sequences generated so far.

    Observations are not derived from the actions: they start as standard
    normal noise and drift by Gaussian noise with standard deviation
    ``change_rate`` on every step.

    We ignore control change events for now.

    NOTE: ``action_space`` and ``observation_space`` are not defined yet.
    """

    def __init__(
        self,
        discriminator,
        mapper,
        observation_shape=[128],
        change_rate=0.01,
        batch_size=32
    ):
        """Store the configuration and start the first episode.

        Args:
            discriminator: model exposing ``predict``; it is fed the generated
                sequences transposed to batch-first order
                (assumes it accepts (batch, time, dims) - TODO confirm).
            mapper: object exposing ``is_done(actions)``, used to update the
                per-sequence done flags.
            observation_shape: shape of a single observation (without the
                batch axis), given as a list or any other sequence of ints.
            change_rate: standard deviation of the noise added to the
                observations at every step.
            batch_size: number of sequences generated in parallel.
        """
        super().__init__()
        self.discriminator = discriminator
        self.mapper = mapper
        # Copy into a fresh list: the default argument is a mutable list shared
        # by every call, and `[batch] + observation_shape` below needs a list
        # (so tuples are accepted as well).
        self.observation_shape = list(observation_shape)
        self.change_rate = change_rate
        self.batch_size = batch_size
        self.reset()

    def step(self, actions):
        """Append one action per sequence and score the sequences so far.

        Args:
            actions: one action per batch element; its length must equal
                ``batch_size`` (assumes shape (batch, dims) - TODO confirm).

        Returns:
            Tuple ``(observations, reward, done, info)`` where ``reward`` is
            the discriminator output at the last time step for every batch
            element, ``done`` is the accumulated per-element done flag and
            ``info`` is an empty dict.
        """
        assert len(actions)==self.batch_size, f"Expected batch_size of {self.batch_size}"
        self.current_seqs.append(actions)

        # Random drift of the observations, independent of the actions taken.
        drift = np.random.normal(
            0, self.change_rate, size=[len(actions)] + self.observation_shape
        )
        self.observations = self.observations + drift

        # A sequence stays done once the mapper has reported it as done.
        self.done = self.done | self.mapper.is_done(actions)

        # `current_seqs` is ordered (step, batch, dims); move the batch axis
        # first before handing the sequences to the discriminator.
        batch_first = np.transpose(self.current_seqs, [1, 0, 2])
        reward = self.discriminator.predict(batch_first)[:,-1,:]

        # The gym API expects `info` to be a dict rather than None.
        return np.array(self.observations), reward, self.done, {}

    def reset(self):
        """Clear the generated sequences and done flags; return new observations."""
        self.current_seqs = []
        self.done = np.array([False]*self.batch_size)
        self.observations = np.random.normal(
            0, 1, size=[self.batch_size] + self.observation_shape
        )
        return self.observations  # reward, done, info can't be included

    def render(self, mode='human'):
        """No-op: nothing is rendered."""
        pass

    def close (self):
        """No-op: there are no resources to release."""
        pass

In [10]:
discriminator.fit

<bound method Model.fit of <tensorflow.python.keras.engine.sequential.Sequential object at 0x15c4d7eb8>>

# Training Loop

In [11]:
from musicrl.midi2vec import PostProcessor

def postprocess_and_synthesize(gen_seq, first_column_offset=0.4):
    """Post-process a generated sequence, convert it to MIDI and play it.

    Args:
        gen_seq: array of generated vectors, indexed as ``[step, feature]``.
            It is NOT modified; the function works on a copy.
        first_column_offset: constant added to the first feature column
            before post-processing.
    """
    # The post-processor is built from the vectorised first five real MIDI files.
    postprocess = PostProcessor([mapper.midi2vec(real_midi) for real_midi in real_midis[:5]])
    # Copy first: adding the offset in place would change the caller's array,
    # so calling this twice on the same array would shift it twice.
    gen_seq = np.array(gen_seq, copy=True)
    gen_seq[:,0] += first_column_offset
    gen_seq = postprocess(gen_seq)
    gen_midi = mapper.vec2midi(gen_seq)
    listen_to(gen_midi)
    
def gen_real_midis_batch(batch_size, real_seqs, nr_timestep):
    """Draw a random batch of real sequences cropped to ``nr_timestep`` steps.

    The sequences are shuffled with a random permutation, the first
    ``batch_size`` of them are kept, and each one is cut to its first
    ``nr_timestep`` entries.
    """
    order = np.random.permutation(len(real_seqs))
    chosen = real_seqs[order][0:batch_size]
    return np.array([seq[0:nr_timestep] for seq in chosen])
    
    
def train_discriminator(fake_seqs, real_seqs, batch_size):
    """Run one discriminator training step on generated plus real sequences.

    Args:
        fake_seqs: generated sequences, batch first; labelled 0.
        real_seqs: pool of real sequences to sample from; labelled 1.
        batch_size: number of real sequences to sample.

    Returns:
        Whatever ``discriminator.train_on_batch`` returns.
    """
    x_fake = np.asarray(fake_seqs)
    y_fake = np.zeros((len(x_fake), 1))  # fake = 0

    # Real sequences are cropped to the same number of steps as the fake ones.
    x_real = gen_real_midis_batch(batch_size, real_seqs, x_fake.shape[1])
    # Size the labels from the batch actually returned: it holds fewer than
    # `batch_size` rows when the pool of real sequences is smaller than that.
    y_real = np.ones((len(x_real), 1))  # real = 1

    x = np.concatenate((x_fake, x_real))
    y = np.concatenate((y_fake, y_real), axis=0)

    return discriminator.train_on_batch(x, y)

In [None]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")


from musicrl.agent import *
from musicrl.models import *
from tqdm import *
import pandas as pd

batch_size = 4

env = SeqEnvironment(discriminator, mapper, batch_size=batch_size)
agent = DDPG(mapper.dims,env.observation_shape,act_range=3)


print(agent.actor.train_model.summary())



# First, gather experience
config = {
    "nb_episodes" : 20
}

print(range(config["nb_episodes"]))

tqdm_e = tqdm(range(config["nb_episodes"]), desc='Score', leave=True, unit=" episodes")
metrics=pd.DataFrame(columns=['tid', 'disc_loss', 'disc_acc', 'actor_loss', 'critic_loss', 'q_values'])
for e in tqdm_e:

    # Reset episode
    cumul_reward, done = np.zeros((batch_size, 1)), 0
    old_state = env.reset()
    actions, states, rewards = [], [], []
    
    step = 0
    while not np.mean(done)>0.5:
        
        if step>1000:
            break;
        print("Step", step)
        step += 1
        env.render()
        # Actor picks an action (following the deterministic policy)   
        
        actions = agent.policy_action(old_state)
        states, rewards, dones, _ = env.step(actions) #new_states -> bs of new_state
        q_values = agent.critic.target_model.predict([states, agent.actor.target_model.predict(states)])
        # Compute critic target

        critic_target = agent.bellman(rewards, q_values, dones)
    
        # Train both networks on sampled batch, update target networks
        critic_loss, actor_loss = agent.update_models(states, actions, critic_target)
        # Update current state
        cumul_reward += rewards
    postprocess_and_synthesize(np.array([seq[0] for seq in env.current_seqs]))
            
    discriminator_history= train_discriminator(np.transpose(env.current_seqs, [1,0,2]),real_seqs,env.batch_size)
    
   
    metrics=metrics.append({
        "tid" : e,
        "disc_loss" : discriminator_history[0],
        "disc_acc" : discriminator_history[1],
        "actor_loss" : actor_loss,
        "critic_loss" :  critic_loss,
        "q_values" : q_values,
    }, ignore_index=True)
    print(metrics)
    
    

    # Display score
    tqdm_e.set_description("Score: " + str(cumul_reward))
    tqdm_e.refresh()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[128]
hola
[128]
hola


Score:   0%|          | 0/20 [00:00<?, ? episodes/s]

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, 5)            66565       input_3[0][0]                    
__________________________________________________________________________________________________
model (Model)                   (None, 1)            66689       input_3[0][0]                    
                                                                 model_2[1][0]                    
Total params: 133,254
Trainable params: 66,565
Non-trainable params: 66,689
__________________________________________________________________________________________________
None
range(0, 20

Step 16
Step 17
Step 18
Step 19
Step 20
Step 21
Step 22
Step 23
Step 24
Step 25
Step 26
Step 27
Step 28
Step 29


Step 30
Step 31
Step 32
Step 33
Step 34
Step 35
Step 36
Step 37
Step 38
Step 39
Step 40
Step 41
Step 42
Step 43


Step 44
Step 45
Step 46
Step 47
Step 48
Step 49
Step 50
Step 51
Step 52
Step 53
Step 54
Step 55
Step 56
Step 57


Step 58
Step 59
Step 60
Step 61
Step 62
Step 63
Step 64
Step 65
Step 66
Step 67
Step 68
Step 69
Step 70
Step 71


# Listen to it

In [None]:
from musicrl.midi2vec import PostProcessor

# NOTE: this redefines the function of the same name from the "Training Loop"
# section; keep the two in sync or remove one of them.
def postprocess_and_synthesize(gen_seq, first_column_offset=0.4):
    """Post-process a generated sequence, convert it to MIDI and play it.

    Args:
        gen_seq: array of generated vectors, indexed as ``[step, feature]``.
            It is NOT modified; the function works on a copy.
        first_column_offset: constant added to the first feature column
            before post-processing.
    """
    # The post-processor is built from the vectorised first five real MIDI files.
    postprocess = PostProcessor([mapper.midi2vec(real_midi) for real_midi in real_midis[:5]])
    # Copy first: adding the offset in place would change the caller's array,
    # so calling this twice on the same array would shift it twice.
    gen_seq = np.array(gen_seq, copy=True)
    gen_seq[:,0] += first_column_offset
    gen_seq = postprocess(gen_seq)
    gen_midi = mapper.vec2midi(gen_seq)
    listen_to(gen_midi)

In [None]:
# Synthesize every sequence of the batch in turn. The per-sequence array must
# be built inside the loop: `i` does not exist before the loop starts, and
# building it once would play the same sequence `batch_size` times.
for i in range(env.batch_size):
    gen_seq = np.array([seq[i] for seq in env.current_seqs])
    postprocess_and_synthesize(gen_seq)

In [None]:
# `env.current_seqs` holds one batch of actions per step; swap the first two
# axes so that the batch axis comes first.
gen_seqs = np.transpose(np.array(env.current_seqs), [1, 0, 2])
#reals = np.array([mapper.midi2vec(i) for i in real_midis])
# An all-zero array of the same shape is passed alongside the generated sequences.
plot_predictions_over_time(discriminator, np.zeros(gen_seqs.shape), gen_seqs)

In [None]:
# `tf` is not imported anywhere else in the notebook, and `tf.version` is a
# module; `tf.__version__` is the version string itself.
import tensorflow as tf

tf.__version__