Link to tutorial: https://keras.io/examples/rl/actor_critic_cartpole/

In [1]:
# importing libraries
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Configuration parameters for the whole setup
seed = 42
# gamma is the discount factor used when accumulating rewards into returns
gamma = 0.99
max_steps_per_episode = 10000
# setting up the environment
env = gym.make("CartPole-v0")
env.seed(seed)
# Seed NumPy and TensorFlow as well: actions are sampled with np.random.choice
# and the Keras layer weights are randomly initialised, so otherwise only the
# environment would honour `seed`.
np.random.seed(seed)
tf.random.set_seed(seed)
# smallest number such that 1.0 + eps != 1.0
eps = np.finfo(np.float32).eps.item()

In [2]:
# Actor-critic network: one shared hidden layer feeding two output heads.
num_inputs = 4
num_actions = 2
num_hidden = 128

inputs = layers.Input(shape=(num_inputs,))

# Hidden layer shared by both heads.
common = layers.Dense(units=num_hidden, activation="relu")(inputs)

# Actor head: softmax output with one entry per action.
action = layers.Dense(units=num_actions, activation="softmax")(common)

# Critic head: a single output with no activation.
critic = layers.Dense(units=1)(common)

model = keras.Model(inputs, [action, critic])

In [8]:
# Training the actor-critic network: one episode is played per loop iteration
# and a single gradient update is applied at the end of each episode.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

# Run until the task is solved
while True:
  state = env.reset()
  episode_reward = 0
  with tf.GradientTape() as tape:
    for timestep in range(1, max_steps_per_episode):
      # uncommenting the next line would show the attempts of the agent in a
      # pop up window
      # env.render()
      state = tf.convert_to_tensor(state)
      state = tf.expand_dims(state, 0)

      # predict action probabilities and estimated future rewards from
      # environment states
      action_probs, critic_value = model(state)
      critic_value_history.append(critic_value[0, 0])

      # sample action from action probability distribution; the log
      # probability of the chosen action is what the actor loss uses
      action = np.random.choice(num_actions, p=np.squeeze(action_probs))
      action_probs_history.append(tf.math.log(action_probs[0, action]))

      # apply the sampled action in our environment
      state, reward, done, _ = env.step(action)
      rewards_history.append(reward)
      episode_reward += reward

      if done:
        break

    # Update running reward (exponential moving average of episode rewards)
    # to check condition for solving the task
    running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

    # calculate expected value from rewards
    # at each time step what was the total reward received after that timestep
    # later rewards are discounted by multiplying them with gamma
    # these are the labels for our critic
    returns = []
    discounted_sum = 0
    # Walk the episode backwards, appending as we go, then reverse once;
    # this avoids the quadratic cost of inserting at the front of the list.
    for r in reversed(rewards_history):
      discounted_sum = r + gamma * discounted_sum
      returns.append(discounted_sum)
    returns.reverse()

    # normalize (eps guards against division by zero when all returns match)
    returns = np.array(returns)
    returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
    returns = returns.tolist()

    # calculating the loss values to update the network during training
    history = zip(action_probs_history, critic_value_history, returns)
    actor_losses = []
    critic_losses = []

    # at this point in history the critic estimated that we would get a total
    # reward = `value` in the future, we took an action with log probability
    # `log_prob` and ended up receiving a total reward = `ret`
    # the actor must be updated so that it predicts an action that leads to
    # high rewards (compared to critic estimate) with high probability
    for log_prob, value, ret in history:
      diff = ret - value
      # actor loss
      actor_losses.append(-log_prob * diff)

      # the critic must be updated so that it predicts a better estimate
      # of the future rewards
      critic_losses.append(
          huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
      )

    # backpropagation
    loss_value = sum(actor_losses) + sum(critic_losses)
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # clear the loss and reward history
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

  # log details
  episode_count += 1
  if episode_count % 10 == 0:
    template = "running reward: {:.2f} at episode {}"
    print(template.format(running_reward, episode_count))

  # condition to verify the task is solved
  if running_reward > 195:
    # str.format must be called on the string; passing format(...) as a second
    # print argument left the "{}" placeholder unfilled.
    print("Solved at episode {}!".format(episode_count))
    break


running reward: 70.54 at episode 10
running reward: 107.39 at episode 20
running reward: 125.21 at episode 30
running reward: 138.37 at episode 40
running reward: 144.97 at episode 50
running reward: 149.84 at episode 60
running reward: 158.03 at episode 70
running reward: 143.26 at episode 80
running reward: 145.11 at episode 90
running reward: 142.96 at episode 100
running reward: 131.73 at episode 110
running reward: 112.80 at episode 120
running reward: 108.42 at episode 130
running reward: 114.45 at episode 140
running reward: 122.98 at episode 150
running reward: 122.70 at episode 160
running reward: 115.66 at episode 170
running reward: 115.24 at episode 180
running reward: 124.63 at episode 190
running reward: 135.74 at episode 200
running reward: 138.10 at episode 210
running reward: 143.48 at episode 220
running reward: 152.06 at episode 230
running reward: 146.11 at episode 240
running reward: 139.36 at episode 250
running reward: 137.00 at episode 260
running reward: 140.88