In [1]:
import gym
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

from plot_utils import plot_animation
from cart_pole_utils import play_one_episode

In [2]:
SEED = 47

In [3]:
def play_one_step(env, obs, model, loss_fn):
    """Sample one action from the policy, step the environment, return gradients.

    The model output is read as the probability of taking action 0 ("left").
    The sampled action is used as its own target, so the returned gradients
    are the ones that would make the sampled action more likely.

    Args:
        env: environment whose ``step(action)`` returns ``(obs, reward, done, info)``.
        obs: current observation; a leading batch axis is added before the model call.
        model: callable model exposing ``trainable_variables``.
        loss_fn: loss invoked as ``loss_fn(target, prediction)``.

    Returns:
        Tuple ``(next_obs, reward, done, grads)``, where ``grads`` holds the
        gradients of the loss with respect to ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        left_prob = model(obs[np.newaxis])
        # True (action 1) is drawn with probability 1 - left_prob.
        took_action_one = tf.random.uniform([1, 1]) > left_prob
        # Target probability of "left" is 1 when action 0 was sampled, else 0.
        target_left_prob = tf.constant([[1.]]) - tf.cast(took_action_one, tf.float32)
        loss = tf.reduce_mean(loss_fn(target_left_prob, left_prob))
    grads = tape.gradient(loss, model.trainable_variables)
    chosen_action = int(tf.squeeze(took_action_one).numpy())
    next_obs, reward, done, _ = env.step(chosen_action)
    return next_obs, reward, done, grads

In [4]:
def play_episodes(env, num_episodes, max_steps, model, loss_fn):
    """Play several episodes and collect per-step rewards and gradients.

    Args:
        env: environment providing ``reset()`` and consumed by ``play_one_step``.
        num_episodes: number of episodes to play.
        max_steps: upper bound on the number of steps in a single episode.
        model: policy model forwarded to ``play_one_step``.
        loss_fn: loss function forwarded to ``play_one_step``.

    Returns:
        Tuple ``(all_rewards, all_grads)``: two lists with one entry per
        episode, each entry being the list of per-step rewards / gradients.
    """
    rewards_per_episode, grads_per_episode = [], []
    for _ in range(num_episodes):
        episode_rewards, episode_grads = [], []
        obs = env.reset()
        for _ in range(max_steps):
            obs, reward, done, step_grads = play_one_step(env, obs, model, loss_fn)
            episode_rewards.append(reward)
            episode_grads.append(step_grads)
            # Stop as soon as the environment reports the episode is over.
            if done:
                break
        rewards_per_episode.append(episode_rewards)
        grads_per_episode.append(episode_grads)
    return rewards_per_episode, grads_per_episode

In [5]:
# env = gym.make("CartPole-v1")
# obs = env.reset()
# play_episodes(env, 2, 200, model, loss_fn)

In [6]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted cumulative future reward for every step.

    Each entry becomes ``rewards[t] + discount_rate * result[t + 1]``, computed
    backwards from the last step (whose value is left as is).

    Args:
        rewards: sequence of per-step rewards for one episode.
        discount_rate: factor applied to the following step's discounted reward.

    Returns:
        A new float64 NumPy array of the same length as ``rewards``; the input
        is not modified.
    """
    # Force a float dtype: with integer rewards the array would be integer and
    # the in-place update below would silently truncate fractional values.
    discounted = np.array(rewards, dtype=np.float64)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount each episode's rewards, then standardize across all episodes.

    The mean and standard deviation are computed over the discounted rewards
    of every step of every episode, and the same two statistics are applied to
    each episode.

    Args:
        all_rewards: list of episodes, each a sequence of per-step rewards.
        discount_rate: discount factor forwarded to ``discount_rewards``.

    Returns:
        List with one NumPy array per episode holding the normalized
        discounted rewards. An empty ``all_rewards`` yields an empty list.
    """
    all_discounted = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    # np.concatenate rejects an empty list, so an empty batch is returned as is.
    if not all_discounted:
        return []
    flat_rewards = np.concatenate(all_discounted)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    # When every discounted reward is identical the std is 0; dividing by it
    # would produce NaN/inf, so fall back to centering only.
    scale = reward_std if reward_std > 0 else 1.0
    return [(discounted - reward_mean) / scale for discounted in all_discounted]

In [7]:
# Hand-checkable example: the first step is 10 + 0.8 * (0 + 0.8 * -50) = -22.
discount_rewards([10, 0, -50], discount_rate=0.8)

array([-22, -40, -50])

In [8]:
# Two episodes of different lengths are normalized with one shared mean and standard deviation.
discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)

[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

In [9]:
num_iterations = 150  # number of training iterations (one optimizer update each)
num_episodes_per_update = 10  # episodes played before each optimizer update
max_steps = 200  # cap on the number of steps played in a single episode
discount_rate = 0.95  # discount factor applied to future rewards

In [10]:
# Seed TensorFlow and NumPy so weight initialization and action sampling are repeatable.
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Small policy network: 4 observation inputs -> 5 hidden ELU units -> 1 sigmoid
# output, which the rest of the notebook reads as the probability of going left.
model = keras.models.Sequential([
    layers.Dense(5, activation='elu', input_shape=[4]),
    layers.Dense(1, activation='sigmoid')
])

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss_fn = keras.losses.binary_crossentropy

In [11]:
env = gym.make('CartPole-v1')

# try/finally guarantees the environment is closed even if training raises or
# the cell is interrupted part-way through.
try:
    env.seed(SEED)

    for iteration in range(num_iterations):
        # Play a batch of episodes, recording per-step rewards and gradients.
        all_rewards, all_grads = play_episodes(env, num_episodes_per_update, max_steps, model, loss_fn)
        total_rewards = sum(map(sum, all_rewards))
        print(f"\rIteration: {iteration}, mean rewards: {(total_rewards / num_episodes_per_update):.1f}", end="")
        all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_rate)

        # For each trainable variable, average the per-step gradients weighted
        # by the normalized discounted reward of the step that produced them.
        all_mean_grads = []
        for var_idx in range(len(model.trainable_variables)):
            weighted_grads = []
            for episode_idx, final_rewards in enumerate(all_final_rewards):
                for step, final_reward in enumerate(final_rewards):
                    weighted_grads.append(final_reward * all_grads[episode_idx][step][var_idx])
            mean_grads = tf.reduce_mean(weighted_grads, axis=0)
            all_mean_grads.append(mean_grads)
        optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))
finally:
    env.close()

Iteration: 149, mean rewards: 181.1

In [12]:
def policy(obs):
    """Sample an action for ``obs`` from the global ``model``.

    Args:
        obs: a single observation; a leading batch axis is added before prediction.

    Returns:
        ``0`` with probability equal to the model output (read as the
        probability of going left), otherwise ``1``.
    """
    # predict() returns a (1, 1) array for a single observation; pull out the
    # scalar so int() below is not applied to a multi-dimensional array
    # (array-to-scalar conversion for ndim > 0 is deprecated in NumPy).
    left_prob = model.predict(obs[np.newaxis])[0, 0]
    action = int(np.random.rand() > left_prob)
    return action

In [13]:
# Run one episode with the model-backed policy through the cart_pole_utils helper.
# NOTE(review): the second argument presumably toggles frame capture for
# plot_animation — confirm against cart_pole_utils.play_one_episode.
rewards, frames = play_one_episode(policy, True)
print(f"rewards = {rewards}")
# plot_animation(frames)

rewards = 200.0
