In [91]:
import shutup
shutup.please()

In [140]:
import gym
from src.utils.gym_environment import GymEnvironment
from src.environments.discrete.mountain_car import environment

In [141]:
import numpy as np

from keras.layers import Dense, Activation
from keras.models import Sequential, load_model

from tensorflow.keras.optimizers import Adam


# Handles storage of state/action/reward and transitions
class ReplayBuffer():
    def __init__(
            self,
            environment,
            buffer_size,
            buffers=['state','new_state',"action","reward", "done"]
    ):
        
        self.buffer_position = 0
        self.buffer_size = buffer_size
        self.discrete = environment.action_space_mode == 'Discrete'
        
        self.__init_memory(environment)
        # if discrete, the action space must be a one-hot encoded 
        
    def __init_memory(self,environment):
        dtype = np.int8 if self.discrete else np.float32
        
        if len(environment.observation_shape) ==1:
            input_shape = environment.observation_shape[0]
        else:
            input_shape = environment.observation_shape[0]*environment.observation_shape[1]
            
        self.states = np.zeros((self.buffer_size,input_shape ))
        self.new_states = np.zeros((self.buffer_size,input_shape ))    
        self.actions = np.zeros((self.buffer_size, environment.n_actions), dtype=dtype)
        self.rewards = np.zeros(self.buffer_size)
        self.dones = np.zeros(self.buffer_size, dtype=np.float32)
        
    def remember(self,state,action,reward,state_, done):
        # once we hit buffer_size entries, we want to add it to the beginning 
        index = self.buffer_position % self.buffer_size
        
        self.states[index] = state
        self.new_states[index] = state_
        self.rewards[index] = reward
        self.dones[index] = 1- int(done)
        
        # When discrete store one hot
        if self.discrete:
            actions = np.zeros(self.actions.shape[1])
            actions[action] = 1.0 
            self.actions[index] = actions
        else:
            self.actions[index] = action
        
        self.buffer_position +=1
        
    def sample(self, batch_size=32):
        max_position = min(self.buffer_position, self.buffer_size)
        batch = np.random.choice(max_position, batch_size)
        
        states = self.states[batch]
        states_ = self.new_states[batch]
        rewards = self.rewards[batch]
        dones = self.dones[batch]
        actions = self.actions[batch]

        return states, actions, rewards, states_, dones

In [142]:
class DqnAgent():
    def __init__(
        self,
        environment,
        alpha = 0.0005,
        gamma = 0.99,
        epsilon = 1.0,
        epsilon_decay=0.9996,
        
        buffer_size=1000000,
        batch_size=64,
        optimizer = Adam(learning_rate=0.0005),
        fully_connected_layer_configuration = 'MlpPolicy',
    ):
        # Args
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.optimizer = optimizer
        
        # Environment
        env = GymEnvironment(environment)
        self.env = env.env
        self.n_actions = env.n_actions
        self.actions = env.actions
        self.observation_shape = env.observation_shape
        self.action_space_mode = env.action_space_mode
    
        # Boot
        self.__init_model() 
        self.__init_memory(env)
        
    def __init_model(self):
        model = Sequential([
            Dense(256, input_shape=self.observation_shape, activation="relu"),
            Dense(256, activation="relu"),
            Dense(self.n_actions)
        ])
        
        model.compile(optimizer=self.optimizer, loss="mse")
        
        #self.model = model
        self.q_eval = model
        
    def __init_memory(self,environment):
        self.memory = ReplayBuffer(environment,self.buffer_size)
        
    def decrement_eps(self):
        self.epsilon = self.epsilon * self.epsilon_decay if self.epsilon > 0.01 else 0.01
        
    def remember(self,state,action,reward,new_state,done):
        self.memory.remember(state,action,reward,new_state,done)
        
    def get_state(self,obs):
        return obs
        
    def choose_action(self,state):
        state = state[np.newaxis,:]
        rand = np.random.random()
        if rand < self.epsilon:
            if self.action_space_mode == "Discrete":
                action = np.random.choice(self.actions)
            else:
                action = self.env.action_space.sample()
        else:
            if self.action_space_mode == "Discrete":
                actions = self.q_eval.predict(state)
                action = np.argmax(actions)
            else:
                actions = self.q_eval.predict(state)
                action = actions
            
        return action
    def replay(self):
        if self.memory.buffer_position < self.batch_size:
            return
        

        state, action, reward, new_state, done = self.memory.sample(self.batch_size)

        # TODO - this is in discrete only
        action_values = np.array(self.actions, dtype=np.int8)
        action_indices = np.dot(action,action_values)
        
        q_eval = self.q_eval.predict(state)
        q_next = self.q_eval.predict(new_state)
        
        q_target = q_eval.copy()
        
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        
        q_target[batch_index,action_indices] = reward + self.gamma * np.max(q_next,axis=1)#* done
        
        q_target = reward + self.gamma * np.max(q_next,axis=1)* done
        _ = self.q_eval.fit(state,q_target,verbose=0)
        
        self.decrement_eps()
        
    def learn(self, timesteps=-1, success_threshold=150, plot_results=True):
        obs = self.env.reset()
        
        self.total_rewards = []
        self.avg_rewards = []
        
        score = 0
        timestep = 0
        episode = 0

        # Loop condition
        def learning_condition():
            if timesteps == -1:
                return True
            else: 
                return timesteps > timestep
        
        while learning_condition():

            # Choose action
            action = self.choose_action(obs)
               
            # Step
            obs_,reward,done, info = self.env.step(action)
            
            # Get next state
            score += reward
            
            self.remember(obs,action,reward,obs_,done)
            obs = obs_
            
            self.replay()

            
            if done:
                # Loop episode state
                if episode % 2 == 0 and episode > 0:
                    print('episode',episode,'score',score,'epsilon %:.3f',self.epsilon)
                
                # Update pointers
                self.total_rewards.append(score)
                
                # Track reward evolution
                if len(self.total_rewards) > 100:
                    avg_reward = np.mean(self.total_rewards[-100:])
                    self.avg_rewards.append(avg_reward)
                    
                    # Break loop if average reward is greater than success threshold
                    if avg_reward > success_threshold:
                        print('Agent solved environment at the episode {}'.format(episode))
                        break
                
                # Reset environment
                score = 0
                episode +=1
                obs = self.env.reset()
                #state = self.get_state(obs)
                
            # Update timestep counter
            timestep+=1
        
        if plot_results:
            plt.plot(self.avg_rewards)

In [144]:
agent = DqnAgent(environment,batch_size=1024)
agent.learn()


| ---------------------------------
| MountainCar-v0
| Action space:
|   * Discrete with low state-space
| Dev notes:
|   * Switched _max_episode_steps from 200 to 1000 so 
|     the agent can explore better.
| ----------------------------------------------------------   


episode 2 score -1000.0 epsilon %:.3f 0.45341012848884044
episode 4 score -1000.0 epsilon %:.3f 0.20369770041952698
episode 6 score -600.0 epsilon %:.3f 0.14439725488149371
episode 8 score -1000.0 epsilon %:.3f 0.08556390706986079
episode 10 score -699.0 epsilon %:.3f 0.043359576568349
episode 12 score -1000.0 epsilon %:.3f 0.021077019276257936
episode 14 score -1000.0 epsilon %:.3f 0.01


KeyboardInterrupt: 

In [None]:
def environment():
    return gym.make('LunarLanderContinuous-v2')

In [127]:
agent2 = DqnAgent(environment,batch_size=512)
agent2.learn()

[-0.0633368  -0.24367835]
[-0.05356975  0.6722458 ]
[-0.76528037  0.36938968]
[-0.9259187  0.7559831]
[0.44720107 0.75970054]
[-0.27903184  0.18431884]
[0.01356326 0.718217  ]
[0.5353247 0.8629306]
[-0.5461493  0.5685996]
[0.9518825 0.6007192]
[-0.1062305   0.79450095]
[0.9007106 0.8401727]
[-0.11708402  0.8448496 ]
[-0.5269093   0.53305244]
[0.5354025  0.69721764]
[0.35754716 0.13913973]
[ 0.0054669 -0.8371688]
[-0.8493767  0.8713312]
[-0.9293254   0.77524847]
[-0.57839215 -0.80573606]
[0.96759635 0.10845508]
[ 0.4084481  -0.91168207]
[-0.4664504 -0.0221516]
[-0.28357014 -0.39967752]
[-0.31281063 -0.7315502 ]
[ 0.62336594 -0.8415845 ]
[-0.5840137  -0.21036048]
[-0.6302923  -0.31064716]
[-0.3825194   0.26230457]
[0.02264549 0.5801822 ]
[-0.36797705 -0.23173308]
[-0.6495363   0.98836005]
[ 0.467967  -0.7819958]
[-0.68230194  0.21298589]
[-0.39990023  0.15090734]
[-0.05084972 -0.7535494 ]
[0.5978603  0.44560415]
[ 0.84613    -0.11139116]
[-0.5405891  0.5175358]
[ 0.599845   -0.92617303]


[-0.08823588  0.04737251]
[-0.29108357  0.36126688]
[-0.06398269 -0.29433268]
[-0.27063945  0.968677  ]
[ 0.54183125 -0.3955426 ]
[0.8702851  0.08683275]
[-0.15497018 -0.20516816]
[-0.96203834  0.9395652 ]
[-0.894422   0.2778901]
[-0.14765032 -0.44220752]
[-0.8250152 -0.6316387]
[-0.86882013  0.42715898]
[0.2798227  0.07811061]
[-0.28510845  0.8460584 ]
[0.86331856 0.20642674]
[-0.6045139  -0.19660573]
[-0.86337656  0.11353471]
[-0.35388586  0.2896628 ]
[ 0.18539    -0.38323602]
[-0.65724456  0.7050641 ]
[0.9031156  0.69637114]
[-0.11701062 -0.2720563 ]
[0.15340726 0.33954063]
[-0.3763001 -0.8923053]
[-0.9544976  -0.14179796]
[-0.7639739 -0.8687708]
[-0.3007412  0.4346523]
episode 4 score -210.12410759287064 epsilon %:.3f 0.9888602708591564
[-0.15633708  0.34083796]
[0.0335612 0.6538079]
[0.19526497 0.7073592 ]
[0.9232765  0.45447588]
[ 0.05069947 -0.6821521 ]
[-0.77955055 -0.9266228 ]
[-0.02339509 -0.26199716]
[-0.08718321  0.9983971 ]
[ 0.9572237  -0.37933457]
[-0.4100904  -0.3564774

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
gym.make('LunarLanderContinuous-v2').action_space.shape

In [16]:
agent.env.step([0.9,1])

(array([ 3.2882689e-04,  1.4102881e+00,  2.2365194e-02,  4.8845876e-02,
        -2.1515167e-03, -4.0259656e-02,  0.0000000e+00,  0.0000000e+00],
       dtype=float32),
 1.8001698641777761,
 False,
 {})

In [18]:
agent.q_eval.predict(np.expand_dims(agent.env.reset(),axis=0))

array([[ 0.04110719, -0.17545287]], dtype=float32)

In [19]:
agent.env.reset()[np.newaxis,:]

array([[-0.00695028,  1.4209193 , -0.7040039 ,  0.4443818 ,  0.00806044,
         0.1594675 ,  0.        ,  0.        ]], dtype=float32)

In [23]:
agent.env.action_space.sample()

array([-0.2968304 , -0.19083126], dtype=float32)