In [2]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

"""
Value-based methods: estimate the value function to predict the expected future reward for a given state/action
                    More Efficient
Policy-based methods: directly map states to actions through policy. Policy updated through the policy gradient theorem.
                    Updates the policy in the direction to increase reward.
                    Better for continuous and stochastic envs
                    
Actor-Critic combines value-based and policy-based methods
    Critic: evaluate the action taken by the agent based on the value function. 
            Uses advantage values to capture how much better an action is compared to others at a given state.
            It learns how much better an action is. This reduces the high variance of policy networks.
    Actor: uses the policy gradient to control how the agent behaves. Learns the optimal policy by exploring and using
    feedback from the Critic to update the policy.
                      
"""

# Environment used by every later cell; render_mode="human" requests on-screen rendering.
env = gym.make(
    "CartPole-v1",
    render_mode="human",
)
# float32 machine epsilon as a plain Python float; added to the standard
# deviation when the returns are normalized so the division never hits zero.
eps = float(np.finfo(np.float32).eps)

In [3]:
# Network dimensions: size of the state vector fed to the network, number of
# discrete actions, and width of the shared hidden layer.
num_inputs, num_actions, num_hidden = 4, 2, 128

# Shared trunk: one ReLU hidden layer feeding both output heads.
inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(units=num_hidden, activation="relu")(inputs)

# Actor head: softmax over the action space, i.e. one probability per action.
action = layers.Dense(units=num_actions, activation="softmax")(common)

# Critic head: a single linear unit estimating the value of the input state.
critic = layers.Dense(units=1)(common)

# One model with two outputs, in the order [action probabilities, state value].
model = keras.Model(
    inputs=inputs,
    outputs=[action, critic],
)

In [4]:
# Train the shared actor-critic network: play one full episode while recording
# on a GradientTape, then apply a single gradient update from that episode.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []  # log-probability of each sampled action
critic_value_history = []  # critic's value estimate at each step
rewards_history = []       # reward received at each step
running_reward = 0         # exponential moving average of episode rewards
episode_count = 0

gamma = 0.99  # Discount factor for past rewards
max_steps_per_episode = 10000
# NOTE(review): 195 looks like the CartPole-v0 threshold while the environment
# above is CartPole-v1 -- confirm the intended "solved" criterion.
solved_threshold = 195

try:
    while True:
        state, _ = env.reset()
        episode_reward = 0
        with tf.GradientTape() as tape:
            for timestep in range(1, max_steps_per_episode):

                state = tf.convert_to_tensor(state)
                state = tf.expand_dims(state, 0)

                # Predict action probabilities and estimated future rewards from env state
                action_probs, critic_value = model(state)
                critic_value_history.append(critic_value[0, 0])

                # Sample action from action probability distribution.
                # Named `sampled_action` so it does not overwrite the model's
                # `action` output tensor defined in the model cell.
                sampled_action = np.random.choice(num_actions, p=np.squeeze(action_probs))
                action_probs_history.append(tf.math.log(action_probs[0, sampled_action]))

                # Apply the sampled action in our environment
                state, reward, terminated, truncated, _ = env.step(sampled_action)
                rewards_history.append(reward)
                episode_reward += reward

                # The episode is over when the environment terminates OR when
                # it is truncated (e.g. by a time limit); ignoring `truncated`
                # would keep stepping past the end of the episode.
                if terminated or truncated:
                    break

            # Update running reward to check condition for solving
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

            # Calculate expected value from rewards
            # - At each timestep what was the total reward received after that timestep
            # - Rewards in the past are discounted by multiplying them with gamma
            # - These are the labels for our critic
            # Built back-to-front with append + one reverse (linear time).
            returns = []
            discounted_sum = 0
            for r in reversed(rewards_history):
                discounted_sum = r + gamma * discounted_sum
                returns.append(discounted_sum)
            returns.reverse()

            # Normalize to zero mean / unit variance; eps guards against a zero std
            returns = np.array(returns)
            returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
            returns = returns.tolist()

            # Calculating loss values to update our network
            history = zip(action_probs_history, critic_value_history, returns)
            actor_losses, critic_losses = [], []
            for log_prob, value, ret in history:
                # At this point in history, the critic estimated that we would get a
                # total reward = `value` in the future. We took an action with log probability
                # of `log_prob` and ended up receiving a total reward = `ret`.
                # The actor must be updated so that it predicts an action that leads to
                # high rewards (compared to critic's estimate) with high probability.
                diff = ret - value
                actor_losses.append(-log_prob * diff)  # actor loss

                # The critic must be updated so that it predicts a better estimate of the future rewards.
                critic_losses.append(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

            # Backpropagation
            loss_value = sum(actor_losses) + sum(critic_losses)
            grads = tape.gradient(loss_value, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Clear the loss and reward history
            action_probs_history.clear()
            critic_value_history.clear()
            rewards_history.clear()

        # Log details
        episode_count += 1
        if episode_count % 10 == 0:
            template = "running reward: {:.2f} at episode {}"
            print(template.format(running_reward, episode_count))

        if running_reward > solved_threshold:  # Condition to consider the task solved
            print("Solved at episode {}!".format(episode_count))
            break
finally:
    # Always release the environment (and its render window), even if training
    # is interrupted or raises.
    env.close()

running reward: 10.17 at episode 10
running reward: 13.41 at episode 20
running reward: 12.54 at episode 30
running reward: 12.42 at episode 40
running reward: 12.87 at episode 50
running reward: 13.79 at episode 60
running reward: 20.04 at episode 70
running reward: 31.46 at episode 80
running reward: 38.11 at episode 90
running reward: 58.49 at episode 100
running reward: 44.99 at episode 110
running reward: 35.76 at episode 120
running reward: 29.78 at episode 130
running reward: 27.88 at episode 140
running reward: 25.51 at episode 150
running reward: 23.69 at episode 160
running reward: 24.09 at episode 170
running reward: 22.95 at episode 180
running reward: 26.72 at episode 190
running reward: 34.81 at episode 200
running reward: 46.34 at episode 210
running reward: 82.51 at episode 220
running reward: 105.82 at episode 230
running reward: 155.39 at episode 240
running reward: 165.62 at episode 250
running reward: 185.41 at episode 260
Solved at episode 264!
