In [2]:
# NOTE(review): `shutup.please()` presumably suppresses Python warnings for the
# whole kernel session (third-party `shutup` package) — confirm. Be aware this
# can also hide deprecation notices raised by gym / TensorFlow further down.
import shutup
shutup.please()

In [3]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras
from tensorflow.keras import layers


from src.utils.gym_environment import GymEnvironment


In [10]:
#sigma = tf.layers.dense(hidden2, n_outputs, None, init_xavier)
#sigma = tf.nn.softplus(sigma) + 1e-5
#norm_dist = tf.contrib.distributions.Normal(mu, sigma)

class Agent():
    """Actor-critic agent for environments with a discrete action space.

    A single Keras model with one shared hidden layer produces both the
    policy (softmax over ``n_actions``) and the state-value estimate. The
    model is trained once per episode on the discounted, normalized returns
    of that episode.

    Args:
        environment: Passed unchanged to ``GymEnvironment``; the wrapper is
            expected to expose ``env``, ``n_actions``, ``actions`` and
            ``observation_shape``.
        alpha: Stored on the instance; not used by this class.
        gamma: Discount factor applied to future rewards.
        eps: Small constant added to the standard deviation when normalizing
            returns, to avoid division by zero.
        optimizer: Keras optimizer used for the update step. When ``None``
            (default) a fresh ``Adam(learning_rate=0.01)`` is created for
            this instance.
    """

    # Upper bound on environment steps per episode (was ``range(1, 1000)``).
    MAX_STEPS_PER_EPISODE = 999

    def __init__(self,
                 environment,
                 alpha=0.1,
                 gamma=0.99,
                 eps=np.finfo(np.float32).eps.item(),
                 optimizer=None,
                 ):

        # Args
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps

        # Build the optimizer per instance. A default written in the
        # signature is evaluated once, so every Agent would share the same
        # stateful optimizer object (step counter, Adam moments).
        if optimizer is None:
            optimizer = keras.optimizers.Adam(learning_rate=0.01)
        self.optimizer = optimizer

        # Environment
        env = GymEnvironment(environment)
        self.env = env.env
        self.n_actions = env.n_actions
        self.actions = env.actions
        self.observation_shape = env.observation_shape

        self.__init_networks()

    def __init_networks(self):
        """Build the shared-trunk model with a softmax actor and a scalar critic."""
        num_inputs = self.observation_shape[0]
        num_actions = self.n_actions
        num_hidden = 128

        inputs = layers.Input(shape=(num_inputs,))
        common = layers.Dense(num_hidden, activation="relu")(inputs)
        action = layers.Dense(num_actions, activation="softmax")(common)
        critic = layers.Dense(1)(common)

        self.model = keras.Model(inputs=inputs, outputs=[action, critic])

    def choose_action(self, action_probs):
        """Sample an action from the policy output.

        Args:
            action_probs: Actor output for a single state; indexed as
                ``action_probs[0, action]``, i.e. a batch of one.

        Returns:
            Tuple ``(action, action_log_prob)``: the sampled action index and
            the log of its probability as a tensor (kept differentiable).
        """
        action = np.random.choice(self.n_actions, p=np.squeeze(action_probs))
        action_log_prob = tf.math.log(action_probs[0, action])

        return action, action_log_prob

    def _normalized_returns(self, rewards):
        """Return the discounted returns of one episode, standardized, as a list."""
        returns = np.zeros(len(rewards), dtype=np.float64)
        discounted_sum = 0.0
        # Walk backwards so each return reuses the one after it; filling a
        # pre-sized array avoids the quadratic cost of ``list.insert(0, ...)``.
        for index in range(len(rewards) - 1, -1, -1):
            discounted_sum = rewards[index] + self.gamma * discounted_sum
            returns[index] = discounted_sum

        returns = (returns - np.mean(returns)) / (np.std(returns) + self.eps)
        return returns.tolist()

    def _episode_loss(self, log_probs, values, returns, huber_loss):
        """Sum the actor and critic losses over every step of one episode."""
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in zip(log_probs, values, returns):
            # The critic estimated a future reward of `value`; the action
            # taken had log probability `log_prob` and the episode actually
            # yielded `ret`. The actor is pushed towards actions that beat
            # the critic's estimate.
            diff = ret - value
            actor_losses.append(-log_prob * diff)

            # The critic is pulled towards the observed return.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        return sum(actor_losses) + sum(critic_losses)

    def learn(self, max_episodes=None, solved_threshold=195):
        """Train until the running reward exceeds ``solved_threshold``.

        Args:
            max_episodes: Optional cap on the number of episodes. ``None``
                (default) keeps the original behaviour of running until the
                task is solved.
            solved_threshold: Running-reward level above which training
                stops. Defaults to the previously hard-coded 195.

        Progress is printed every 10 episodes. The running reward is an
        exponential moving average with weight 0.05 on the latest episode.
        """
        huber_loss = keras.losses.Huber()
        running_reward = 0
        episode_count = 0

        while max_episodes is None or episode_count < max_episodes:
            state = self.env.reset()
            episode_reward = 0
            log_probs, values, rewards = [], [], []

            # Everything that contributes to the loss must run under the tape.
            with tf.GradientTape() as tape:
                for _step in range(self.MAX_STEPS_PER_EPISODE):
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, 0)

                    # Predict action probabilities and the state value.
                    action_probs, critic_value = self.model(state)
                    values.append(critic_value[0, 0])

                    action, action_log_prob = self.choose_action(action_probs)
                    log_probs.append(action_log_prob)

                    # Apply the sampled action in the environment.
                    state, reward, done, _info = self.env.step(action)
                    rewards.append(reward)
                    episode_reward += reward

                    if done:
                        break

                returns = self._normalized_returns(rewards)
                loss_value = self._episode_loss(
                    log_probs, values, returns, huber_loss
                )

            # Backpropagation (outside the tape context: nothing here needs
            # to be recorded).
            grads = tape.gradient(loss_value, self.model.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.model.trainable_variables)
            )

            # Update running reward to check the condition for solving.
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

            # Log details
            episode_count += 1
            if episode_count % 10 == 0:
                template = "running reward: {:.2f} at episode {}"
                print(template.format(running_reward, episode_count))

            if running_reward > solved_threshold:
                print("Solved at episode {}!".format(episode_count))
                break

In [6]:
# Train an agent on the discrete CartPole environment definition.
# NOTE(review): `Agent` resolves to whichever class cell was executed last in
# the kernel — this notebook defines `Agent` twice, so run the discrete
# (softmax) definition right before this cell.
from src.environments.discrete.cartpole import environment
agent = Agent(environment)
agent.learn()
#{'action': 0, 'action_log_prob': <tf.Tensor: shape=(), dtype=float32, numpy=-0.6900733>}


| ---------------------------------
| CartPole-v1
| Action space:
|   * Discrete with high state-space
| Dev notes:
|   * Agents that track State/Action combinations like 
|     Q learning will fail due to high state space
| ----------------------------------------------------------   


running reward: 6.78 at episode 10
running reward: 12.80 at episode 20
running reward: 28.42 at episode 30
running reward: 80.46 at episode 40
running reward: 94.96 at episode 50
running reward: 73.45 at episode 60
running reward: 64.85 at episode 70
running reward: 67.75 at episode 80
running reward: 91.22 at episode 90
running reward: 151.50 at episode 100
running reward: 135.05 at episode 110
running reward: 125.50 at episode 120
running reward: 116.25 at episode 130
running reward: 108.29 at episode 140
running reward: 113.86 at episode 150
running reward: 122.69 at episode 160
running reward: 144.65 at episode 170
running reward: 169.91 at episode 180
running reward: 170.67 at episode 190
running

In [4]:
#sigma = tf.layers.dense(hidden2, n_outputs, None, init_xavier)
#sigma = tf.nn.softplus(sigma) + 1e-5
#norm_dist = tf.contrib.distributions.Normal(mu, sigma)

class Agent():
    """Actor-critic agent with a Gaussian policy for a 1-D continuous action.

    A single Keras model with one shared hidden layer outputs the policy
    parameters (``mu`` through tanh, ``sigma`` through softplus, concatenated
    along the last axis) and a scalar state-value estimate. The model is
    trained once per episode on discounted, normalized returns.

    Note: this class reuses the name ``Agent`` of the discrete-action agent
    defined earlier in the notebook and replaces it once this cell is run.

    Args:
        environment: Passed unchanged to ``GymEnvironment``; the wrapper is
            expected to expose ``env``, ``n_actions``, ``actions`` and
            ``observation_shape``.
        alpha: Stored on the instance; not used by this class.
        gamma: Discount factor applied to future rewards.
        eps: Small constant added to the standard deviation when normalizing
            returns, to avoid division by zero.
        optimizer: Keras optimizer used for the update step. When ``None``
            (default) a fresh ``Adam(learning_rate=0.01)`` is created for
            this instance.
    """

    # Upper bound on environment steps per episode (was ``range(1, 1000)``).
    MAX_STEPS_PER_EPISODE = 999
    # Floor added to the softplus output so the Normal scale is never zero.
    MIN_SIGMA = 1e-5

    def __init__(self,
                 environment,
                 alpha=0.1,
                 gamma=0.99,
                 eps=np.finfo(np.float32).eps.item(),
                 optimizer=None,
                 ):

        # Args
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps

        # Build the optimizer per instance. A default written in the
        # signature is evaluated once, so every Agent would share the same
        # stateful optimizer object (step counter, Adam moments).
        if optimizer is None:
            optimizer = keras.optimizers.Adam(learning_rate=0.01)
        self.optimizer = optimizer

        # Environment
        env = GymEnvironment(environment)
        self.env = env.env
        self.n_actions = env.n_actions
        self.actions = env.actions
        self.observation_shape = env.observation_shape

        self.__init_networks()

    def __init_networks(self):
        """Build the shared-trunk model: ``[mu, sigma]`` actor head and critic."""
        num_inputs = self.observation_shape[0]
        num_hidden = 128

        inputs = layers.Input(shape=(num_inputs,))
        common = layers.Dense(num_hidden, activation="relu")(inputs)
        sigma = layers.Dense(1, activation="softplus", name="sigma")(common)
        mu = layers.Dense(1, activation="tanh", name='mu')(common)
        critic = layers.Dense(1, activation=None, name='critic')(common)

        # Pack both policy parameters into one output; `learn` slices them
        # apart again as [:, 0:1] (mu) and [:, 1:] (sigma).
        actor = tf.keras.layers.Concatenate(axis=-1)([mu, sigma])
        self.model = keras.Model(inputs=inputs, outputs=[actor, critic])

    def choose_action(self, mu, sigma):
        """Sample an action from ``Normal(mu, sigma)``.

        Args:
            mu: Mean of the policy for a single state (batch of one).
            sigma: Non-negative scale of the policy for the same state.

        Returns:
            Tuple ``(action, action_log_prob)``: the sampled action clipped
            to the bounds of ``env.action_space`` as a float32 NumPy array,
            and the log-probability of the unclipped sample as a tensor that
            is differentiable with respect to ``mu`` and ``sigma``.
        """
        norm_dist = tfp.distributions.Normal(mu, sigma + self.MIN_SIGMA)

        # Normal samples are reparameterized (sample = mu + sigma * noise).
        # Without stop_gradient the dependence on mu cancels inside log_prob
        # and the mean would receive no policy-gradient signal at all.
        action = tf.stop_gradient(norm_dist.sample())

        # Return the true log-probability. `learn` applies the minus sign of
        # the policy-gradient loss itself; negating here as well inverted the
        # update direction.
        action_log_prob = norm_dist.log_prob(action)

        clipped_action = tf.clip_by_value(
            action,
            self.env.action_space.low[0],
            self.env.action_space.high[0])

        return np.array(clipped_action[0], dtype=np.float32), action_log_prob

    def _normalized_returns(self, rewards):
        """Return the discounted returns of one episode, standardized, as a list."""
        returns = np.zeros(len(rewards), dtype=np.float64)
        discounted_sum = 0.0
        # Walk backwards so each return reuses the one after it; filling a
        # pre-sized array avoids the quadratic cost of ``list.insert(0, ...)``.
        for index in range(len(rewards) - 1, -1, -1):
            discounted_sum = rewards[index] + self.gamma * discounted_sum
            returns[index] = discounted_sum

        returns = (returns - np.mean(returns)) / (np.std(returns) + self.eps)
        return returns.tolist()

    def _episode_loss(self, log_probs, values, returns, huber_loss):
        """Sum the actor and critic losses over every step of one episode."""
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in zip(log_probs, values, returns):
            # The critic estimated a future reward of `value`; the action
            # taken had log probability `log_prob` and the episode actually
            # yielded `ret`. The actor is pushed towards actions that beat
            # the critic's estimate.
            diff = ret - value
            actor_losses.append(tf.reduce_sum(-log_prob * diff))

            # The critic is pulled towards the observed return.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        return sum(actor_losses) + sum(critic_losses)

    def learn(self, max_episodes=None, solved_threshold=0):
        """Train until the running reward exceeds ``solved_threshold``.

        Args:
            max_episodes: Optional cap on the number of episodes. ``None``
                (default) keeps the original behaviour of running until the
                task is solved.
            solved_threshold: Running-reward level above which training
                stops. Defaults to the previously hard-coded 0.

        Progress is printed every 10 episodes. The running reward is an
        exponential moving average with weight 0.05 on the latest episode.
        """
        huber_loss = keras.losses.Huber()
        running_reward = 0
        episode_count = 0

        while max_episodes is None or episode_count < max_episodes:
            state = self.env.reset()
            episode_reward = 0
            log_probs, values, rewards = [], [], []

            # Everything that contributes to the loss must run under the tape.
            with tf.GradientTape() as tape:
                for _step in range(self.MAX_STEPS_PER_EPISODE):
                    state = tf.convert_to_tensor(state)
                    state = tf.expand_dims(state, 0)

                    # Predict policy parameters and the state value.
                    actor_value, critic_value = self.model(state)
                    values.append(critic_value[0, 0])

                    mu = actor_value[:, 0:1]
                    sigma = actor_value[:, 1:]
                    action, action_log_prob = self.choose_action(mu, sigma)
                    log_probs.append(action_log_prob)

                    # Apply the sampled action in the environment.
                    state, reward, done, _info = self.env.step(action)
                    rewards.append(reward)
                    episode_reward += reward

                    if done:
                        break

                returns = self._normalized_returns(rewards)
                loss_value = self._episode_loss(
                    log_probs, values, returns, huber_loss
                )

            # Backpropagation (outside the tape context: nothing here needs
            # to be recorded).
            grads = tape.gradient(loss_value, self.model.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.model.trainable_variables)
            )

            # Update running reward to check the condition for solving.
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

            # Log details
            episode_count += 1
            if episode_count % 10 == 0:
                template = "running reward: {:.2f} at episode {}"
                print(template.format(running_reward, episode_count))

            if running_reward > solved_threshold:
                print("Solved at episode {}!".format(episode_count))
                break

In [None]:
# Train an agent on the continuous MountainCar environment definition.
# NOTE(review): this rebinds the global `environment` (also imported by the
# CartPole cell) and `agent`; `Agent` must be the continuous (mu/sigma)
# definition, so run that class cell right before this one.
# `learn()` loops until its solved condition is met, so this cell may run long.
from src.environments.continuous.mountain_car import environment


agent = Agent(environment)
agent.learn()

#{'action': array([0.6679693], dtype=float32), 'action_log_prob': array([1.0068668], dtype=float32)}


| ---------------------------------
| MountainCarContinuous-v0
| Action space:
|   * Continuous with low state-space
| Dev notes:
|   * Switched _max_episode_steps from 200 to 1000 so 
|     the agent can explore better.
| ----------------------------------------------------------   




In [None]:
# Roll out a single rendered episode with the trained agent and print the
# total reward it collected.
state = agent.env.reset()
done, score = False, 0

while not done:
    agent.env.render()

    # The actor head packs [mu, sigma] along the last axis; split them again.
    actor_value, critic_value = agent.model(np.expand_dims(state, axis=0))
    mu, sigma = actor_value[:, 0:1], actor_value[:, 1:]

    # Sample from the policy (the log-probability is not needed here).
    action, action_log_prob = agent.choose_action(mu, sigma)

    # Step the environment, accumulate the reward and carry the new
    # observation into the next iteration.
    obs_, reward, done, info = agent.env.step(action)
    score += reward
    state = obs_

print(score)

In [None]:

# Call the method: the bare attribute `agent.env.close` only evaluates to the
# bound method and leaves the environment (and its render window) open.
agent.env.close()