# Environment

Now we have a trainable discriminator - it's time to build the environment

In [2]:
## Imports and data loading

%load_ext autoreload
%autoreload 2

import numpy as np
from matplotlib import pyplot as plt

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras import metrics

from musicrl.midi2vec import MidiVectorMapper
from musicrl.render import *
from musicrl.data import RandomMidiDataGenerator

import pretty_midi
from glob import glob


REAL = 1
GEN = 0

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
filepaths = list(glob('maestro-v2.0.0/2008/**.midi'))
real_midis = [pretty_midi.PrettyMIDI(i) for i in filepaths]
mapper = MidiVectorMapper(real_midis)

In [4]:
mapper = MidiVectorMapper(real_midis)
real_seq = mapper.midi2vec(real_midis[1])
real_seq.shape

(60867, 5)

In [5]:
import gym
import pretty_midi


class SeqEnvironment(gym.Env):
    """We ignore control change events for now
    """
    def __init__(
        self,
        discriminator,
        mapper,
        observation_shape=(1,128), #1 is batch_size
        change_rate=0.01
    ):
        super().__init__()
        # Define action and observation space
        # They must be gym.spaces objects
        # Example when using discrete actions:
        # self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)
        # Example for using image as input:
        # self.observation_space = spaces.Box(low=0, high=255,
        #                                    shape=(HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)
        self.discriminator = discriminator
        self.mapper = mapper
        self.observation_shape = observation_shape
        self.change_rate = change_rate
        self.reset()
        
    def step(self, action):
        self.current_seq.append(action)
        self.observation  = self.observation + np.random.normal(0, self.change_rate, size=self.observation_shape)
        reward, done, info = None, None, None
        reward, done, info = [1], [False], None
        return np.array(self.observation), reward, done, info
    
    def reset(self):
        self.current_seq = []
        self.observation = np.random.normal(0, 1, size=self.observation_shape)
        return self.observation  # reward, done, info can't be included

    def render(self, mode='human'):
        pass
    
    def close (self):
        pass

In [6]:


mapper.column_meaning
env = SeqEnvironment(None, mapper)

In [10]:
for action in real_seq:
    env.step(action)

wav = mapper.vec2midi(env.current_seq).synthesize(22050)
display(Audio(wav, rate=22050))

In [15]:
from musicrl.agent import *
from musicrl.models import *
from tqdm import *



midi_representation_dim=128
agent = DDPG(midi_representation_dim,midi_representation_dim,act_range=midi_representation_dim,k=0)


# First, gather experience
config = {
    "nb_episodes" : 10
}

print(range(config["nb_episodes"]))

tqdm_e = tqdm(range(config["nb_episodes"]), desc='Score', leave=True, unit=" episodes")
for e in tqdm_e:

    # Reset episode
    time, cumul_reward, done = 0, 0, False
    old_state = env.reset()
    actions, states, rewards = [], [], []

    while not done:
        env.render()
        # Actor picks an action (following the deterministic policy)   
        
        actions = agent.policy_action(old_state)

    
        
        # Clip continuous values to be valid w.r.t. environment
        #a = np.clip(a+noise.generate(time), -self.act_range, self.act_range)
        # Retrieve new state, reward, and whether the state is terminal
        
        states, rewards, dones, _ = env.step(actions) #new_states -> bs of new_state
        

        

        # Add outputs to memory buffer
        #agent.memorize(old_state, a, r, done, new_state)
        # Sample experience from buffer
        
        #states, actions, rewards, dones, new_states, _ = agent.sample_batch(config["batch_size"])
        # Predict target q-values using target networks
        
        

        
        q_values = agent.critic.target_model.predict([states, agent.actor.target_model.predict(states)])        
        # Compute critic target


        critic_target = agent.bellman(rewards, q_values, dones)

        
        print("##")
        import pdb
        pdb.set_trace()
        print(len(states))
        print(actions.shape)
        print(critic_target)
    
        # Train both networks on sampled batch, update target networks
        agent.update_models(states, actions, critic_target)
        # Update current state
        old_state = new_state
        cumul_reward += r
        time += 1

    # Gather stats every episode for plotting
    if(args.gather_stats):
        mean, stdev = gather_stats(self, env)
        results.append([e, mean, stdev])

    # Export results for Tensorboard
    score = tfSummary('score', cumul_reward)
    summary_writer.add_summary(score, global_step=e)
    summary_writer.flush()
    # Display score
    tqdm_e.set_description("Score: " + str(cumul_reward))
    tqdm_e.refresh()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
++
(?, 128)
6
<zip object at 0x16c3d4208>
++
++


Score:   0%|          | 0/10 [00:00<?, ? episodes/s]

range(0, 10)





NameError: name 'ptin' is not defined