# REINFORCE

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
np.random.seed(42)

In [2]:
# Instantiate environment
env = gym.make("CartPole-v1")

observation_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

In [3]:
def build_policy_network(input_size, output_size):
    inputs = tf.keras.layers.Input(input_size, name='input')
    dense1 = tf.keras.layers.Dense(32, activation='relu', name='hidden')(inputs)
    outputs = tf.keras.layers.Dense(output_size, activation='softmax', name='prob_outputs')(dense1)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    optimizer=tf.keras.optimizers.Adam(lr=1e-3)
    return model, optimizer

In [4]:
def discount_rewards(rewards, discount_rate=0.99, normalize_rewards=False):
    rewards_to_go = list()
    cumulative_rewards = 0.0

    for r in rewards[::-1]:
        cumulative_rewards = r + discount_rate * cumulative_rewards
        rewards_to_go.append(cumulative_rewards)
    rewards_to_go.reverse()

    if normalize_rewards:
        mean = np.mean(rewards_to_go)
        std = np.std(rewards_to_go)
        rewards_to_go = (rewards_to_go - mean) / std
    return np.array(rewards_to_go)

In [5]:
def get_action(predict_model, state):
    state = state.reshape((1, -1))
    action_prob = predict_model.predict(state, batch_size=1).flatten()
    selected_action = np.random.choice(action_dim, 1, p=action_prob)[0]
    action_prob = np.array(action_prob[selected_action])
    return selected_action, action_prob.tolist()

In [12]:
def update_policy(model, rewards, states, actions, actions_prob):
    rewards_to_go = discount_rewards(rewards)
    states = np.vstack(states)
    actions = np.array(actions)
    actions = actions.reshape((len(actions), 1))  
    
    with tf.GradientTape() as tape:
        action_probs = model(states)
        pa = tf.gather_nd(action_probs, actions, batch_dims=1)
        rewards_to_go = tf.convert_to_tensor(tf.cast(rewards_to_go, dtype=tf.float32))
        loss = -tf.reduce_sum(tf.multiply(tf.math.log(tf.clip_by_value(pa, 1e-8, 1-1e-8)), rewards_to_go))
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [14]:
NUM_EPISODES = 1

policy_model, optimizer = build_policy_network(observation_dim, action_dim)

episode_rewards = list()
for episode in range(NUM_EPISODES):
    batch_rewards = []
    batch_obs = []
    batch_actions = []
    batch_actions_prob = []
    obs = env.reset()
    
    # Run policy to get one entire trajectory
    while True:
#         env.render()

        action, action_prob = get_action(policy_model, obs)
        new_obs, reward, done, _ = env.step(action)

        # Save data
        batch_obs.append(obs)
        batch_rewards.append(reward)
        batch_actions.append(action)
        batch_actions_prob.append(action_prob)
        if done:
            episode_reward = sum(batch_rewards)
            episode_rewards.append(episode_reward)

            update_policy(policy_model, batch_rewards, batch_obs, batch_actions, batch_actions_prob)

            print(f"Episode: {episode}, Reward: {episode_reward}")
            break

        obs = new_obs



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Episode: 0, Reward: 17.0


In [None]:
plt.plot([i for i in range(515)], episode_rewards)
plt.xlabel("episode number")
plt.ylabel("reward")
plt.title("Simple PG Training")
plt.show()