In [1]:
# NOTE(review): `shutup` is a third-party helper; `please()` presumably
# silences Python warnings for the rest of the session — confirm.
import shutup
shutup.please()

In [2]:
import gym
from src.utils.gym_environment import GymEnvironment
from src.environments.discrete.mountain_car import environment

In [3]:
import numpy as np

from keras.layers import Dense, Activation
from keras.models import Sequential, load_model

from tensorflow.keras.optimizers import Adam


# Handles storage of state/action/reward and transitions
class ReplayBuffer():
    """Fixed-size circular store of (state, action, reward, new_state, done).

    Observations are stored flattened, one transition per row. When the
    environment's ``action_space_mode`` is ``'Discrete'`` the action is stored
    as a one-hot ``int8`` row; otherwise the raw action vector is stored as
    ``float32``.

    Args:
        environment: Object exposing ``action_space_mode``,
            ``observation_shape`` and ``n_actions``.
        buffer_size: Maximum number of transitions kept; older entries are
            overwritten once the buffer is full.
        buffers: Names of the stored fields. Currently unused; kept for
            interface compatibility.
    """

    def __init__(
            self,
            environment,
            buffer_size,
            # Immutable default: a list here would be shared across instances.
            buffers=('state', 'new_state', "action", "reward", "done")
    ):
        self.buffer_position = 0
        self.buffer_size = buffer_size
        self.discrete = environment.action_space_mode == 'Discrete'

        self.__init_memory(environment)

    def __init_memory(self, environment):
        """Allocate the backing arrays for every stored field."""
        dtype = np.int8 if self.discrete else np.float32

        # Flatten observations of any rank (the product of all dimensions),
        # not only rank-1 and rank-2 shapes.
        input_shape = int(np.prod(environment.observation_shape))

        self.states = np.zeros((self.buffer_size, input_shape))
        self.new_states = np.zeros((self.buffer_size, input_shape))
        self.actions = np.zeros((self.buffer_size, environment.n_actions), dtype=dtype)
        self.rewards = np.zeros(self.buffer_size)
        self.dones = np.zeros(self.buffer_size, dtype=np.float32)

    def remember(self, state, action, reward, state_, done):
        """Store one transition, overwriting the oldest slot when full.

        Args:
            state: Observation before the action (any shape whose size
                matches the flattened observation size).
            action: Action index when discrete, action vector otherwise.
            reward: Scalar reward received.
            state_: Observation after the action.
            done: Whether the episode terminated on this transition.
        """
        # Once buffer_size entries were written, wrap around to the beginning.
        index = self.buffer_position % self.buffer_size

        # Rows are flat, so multi-dimensional observations must be flattened
        # before assignment.
        self.states[index] = np.reshape(state, -1)
        self.new_states[index] = np.reshape(state_, -1)
        self.rewards[index] = reward
        # Stored as a "not done" mask: 1.0 for non-terminal, 0.0 for terminal,
        # so it can be multiplied directly into the bootstrap term.
        self.dones[index] = 1 - int(done)

        if self.discrete:
            # Discrete actions are stored one-hot encoded.
            one_hot = np.zeros(self.actions.shape[1])
            one_hot[action] = 1.0
            self.actions[index] = one_hot
        else:
            self.actions[index] = action

        self.buffer_position += 1

    def sample(self, batch_size=32):
        """Return a random batch of stored transitions.

        Indices are drawn uniformly with replacement from the filled part of
        the buffer.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, new_states, dones)`` where
            ``dones`` is the "not done" mask written by ``remember``.

        Raises:
            ValueError: If the buffer holds no transitions yet.
        """
        max_position = min(self.buffer_position, self.buffer_size)
        if max_position == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        batch = np.random.choice(max_position, batch_size)

        states = self.states[batch]
        states_ = self.new_states[batch]
        rewards = self.rewards[batch]
        dones = self.dones[batch]
        actions = self.actions[batch]

        return states, actions, rewards, states_, dones

KeyboardInterrupt: 

In [None]:
class DqnAgent():
    """Deep Q-learning agent with a replay buffer and epsilon-greedy policy.

    A single fully connected network (``q_eval``) estimates one Q-value per
    action. Transitions are stored in a ``ReplayBuffer`` and the network is
    fitted on a random batch after every environment step.

    Args:
        environment: Passed to ``GymEnvironment``, which exposes the wrapped
            env and its action/observation metadata.
        alpha: Learning rate used when no ``optimizer`` is supplied.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after each
            training step (floored at 0.01).
        buffer_size: Capacity of the replay buffer.
        batch_size: Number of transitions sampled per training step.
        optimizer: Optimizer used to compile the network. Defaults to a fresh
            ``Adam(learning_rate=alpha)`` per agent.
        fully_connected_layer_configuration: Currently unused; kept for
            interface compatibility.
    """

    def __init__(
        self,
        environment,
        alpha = 0.0005,
        gamma = 0.99,
        epsilon = 1.0,
        epsilon_decay=0.9996,

        buffer_size=1000000,
        batch_size=64,
        optimizer = None,
        fully_connected_layer_configuration = 'MlpPolicy',
    ):
        # Args
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        # Build the optimizer per instance: an optimizer object created in the
        # signature would be evaluated once and shared by every agent, and it
        # would also ignore `alpha`.
        if optimizer is None:
            optimizer = Adam(learning_rate=alpha)
        self.optimizer = optimizer

        # Environment
        env = GymEnvironment(environment)
        self.env = env.env
        self.n_actions = env.n_actions
        self.actions = env.actions
        self.observation_shape = env.observation_shape
        self.action_space_mode = env.action_space_mode

        # Boot
        self.__init_model()
        self.__init_memory(env)

    def __init_model(self):
        """Build and compile the Q-network (two hidden layers of 256 units)."""
        model = Sequential([
            Dense(256, input_shape=self.observation_shape, activation="relu"),
            Dense(256, activation="relu"),
            # Linear output: one Q-value per action.
            Dense(self.n_actions)
        ])

        model.compile(optimizer=self.optimizer, loss="mse")

        self.q_eval = model

    def __init_memory(self, environment):
        """Create the replay buffer sized by ``buffer_size``."""
        self.memory = ReplayBuffer(environment, self.buffer_size)

    def decrement_eps(self):
        """Decay ``epsilon`` multiplicatively, never going below 0.01."""
        self.epsilon = max(self.epsilon * self.epsilon_decay, 0.01)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.remember(state, action, reward, new_state, done)

    def get_state(self, obs):
        """Return the observation unchanged (hook for state preprocessing)."""
        return obs

    def choose_action(self, state):
        """Pick an action epsilon-greedily for a single observation.

        Args:
            state: One observation (without batch axis).

        Returns:
            With probability ``epsilon`` a random action; otherwise the
            network's choice: the arg-max action index in discrete mode, or
            the raw network output vector in any other mode.
        """
        # The network expects a leading batch axis.
        state = state[np.newaxis, :]
        discrete = self.action_space_mode == "Discrete"

        if np.random.random() < self.epsilon:
            if discrete:
                return np.random.choice(self.actions)
            return self.env.action_space.sample()

        predictions = self.q_eval.predict(state, verbose=0)
        if discrete:
            return np.argmax(predictions)
        # Drop the batch axis so the action has the same rank as a sampled one.
        return predictions[0]

    def replay(self):
        """Run one training step on a random batch from the replay buffer.

        Does nothing until at least ``batch_size`` transitions were stored.
        Only the Q-value of the action actually taken is moved towards
        ``reward + gamma * max_a' Q(new_state, a')``; the bootstrap term is
        zeroed for terminal transitions. Epsilon is decayed afterwards.
        """
        if self.memory.buffer_position < self.batch_size:
            return

        # `not_done` is the buffer's mask: 1.0 non-terminal, 0.0 terminal.
        state, action, reward, new_state, not_done = self.memory.sample(self.batch_size)

        # TODO - this is in discrete only
        # Recover the index of the taken action from its one-hot row; this is
        # the same index `remember` used and the column of its Q-value.
        action_indices = np.argmax(action, axis=1)

        q_eval = self.q_eval.predict(state, verbose=0)
        q_next = self.q_eval.predict(new_state, verbose=0)

        # Start from the current predictions so that untaken actions
        # contribute zero error to the MSE loss.
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)

        q_target[batch_index, action_indices] = (
            reward + self.gamma * np.max(q_next, axis=1) * not_done
        )

        _ = self.q_eval.fit(state, q_target, verbose=0)

        self.decrement_eps()

    def learn(self, timesteps=-1, success_threshold=150, plot_results=True):
        """Interact with the environment and train after every step.

        Uses the classic gym API as the code unpacks it: ``reset()`` returns
        the observation and ``step()`` returns
        ``(obs, reward, done, info)``.

        Args:
            timesteps: Number of environment steps to run; ``-1`` runs until
                the success threshold is reached.
            success_threshold: Training stops once the mean score of the last
                100 episodes exceeds this value (checked after more than 100
                episodes were recorded).
            plot_results: Plot the running average rewards when finished.
        """
        obs = self.env.reset()

        self.total_rewards = []
        self.avg_rewards = []

        score = 0
        timestep = 0
        episode = 0

        while timesteps == -1 or timestep < timesteps:

            # Choose action
            action = self.choose_action(obs)

            # Step
            obs_, reward, done, info = self.env.step(action)

            score += reward

            self.remember(obs, action, reward, obs_, done)
            obs = obs_

            self.replay()

            if done:
                # Log every second episode.
                if episode % 2 == 0 and episode > 0:
                    print('episode', episode, 'score', score,
                          'epsilon {:.3f}'.format(self.epsilon))

                self.total_rewards.append(score)

                # Track reward evolution
                if len(self.total_rewards) > 100:
                    avg_reward = np.mean(self.total_rewards[-100:])
                    self.avg_rewards.append(avg_reward)

                    # Break loop if average reward is greater than success threshold
                    if avg_reward > success_threshold:
                        print('Agent solved environment at the episode {}'.format(episode))
                        break

                # Reset environment
                score = 0
                episode += 1
                obs = self.env.reset()

            # Update timestep counter
            timestep += 1

        if plot_results:
            # NOTE(review): `plt` is not imported in the visible cells;
            # presumably `matplotlib.pyplot` — it must be in scope before
            # this line runs, otherwise it raises NameError.
            plt.plot(self.avg_rewards)

In [None]:
# Build a DQN agent on whatever `environment` currently names, sampling
# batches of 1024 transitions, then train it with the default settings.
agent = DqnAgent(
    environment,
    batch_size=1024,
)
agent.learn()

In [None]:
def environment():
    """Create and return a new ``LunarLanderContinuous-v2`` gym environment."""
    lunar_lander = gym.make('LunarLanderContinuous-v2')
    return lunar_lander

In [None]:
# Second experiment: build an agent on the `environment` currently in scope,
# sampling batches of 512 transitions, then train it with default settings.
agent2 = DqnAgent(
    environment,
    batch_size=512,
)
agent2.learn()

In [None]:
# Scratch check: display the shape of the LunarLanderContinuous-v2 action space.
gym.make('LunarLanderContinuous-v2').action_space.shape

In [None]:
# Scratch check: step the agent's environment once with a two-element action.
# NOTE(review): a two-element action looks like it targets the continuous
# lunar-lander env (`agent2`) rather than `agent` — confirm which was meant.
agent.env.step([0.9,1])

In [None]:
# Scratch check: reset the environment and run the Q-network on the initial
# observation, with a leading batch axis added via expand_dims.
agent.q_eval.predict(np.expand_dims(agent.env.reset(),axis=0))

In [None]:
# Scratch check: display the reset observation with a leading axis added,
# i.e. the same indexing `choose_action` applies before predicting.
agent.env.reset()[np.newaxis,:]

In [None]:
# Scratch check: draw one random action from the environment's action space.
agent.env.action_space.sample()