# REINFORCE
Note: training this agent takes quite a while to run.

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time
# Seed NumPy's global RNG, which np.random.choice draws from when sampling
# actions. TensorFlow's own RNG (e.g. weight initialisation) is not seeded here.
np.random.seed(42)

In [None]:
# Instantiate the CartPole-v1 environment through gym.
env = gym.make("CartPole-v1")

# Length of the first axis of the observation space's shape; later passed as
# the policy network's input size.
observation_dim = env.observation_space.shape[0]
# Value of the action space's `n` attribute; later passed as the policy
# network's output size (one probability per action).
action_dim = env.action_space.n

In [None]:
def build_policy_network(input_size, output_size):
    """Build the softmax policy network together with its optimizer.

    Args:
        input_size: Number of observation features (an int), or an already
            formed per-sample shape (tuple/list).
        output_size: Number of outputs of the softmax layer, i.e. one
            probability per action.

    Returns:
        A ``(model, optimizer)`` tuple: a ``tf.keras.Model`` with a single
        32-unit ReLU hidden layer followed by a softmax layer, and an Adam
        optimizer with a learning rate of 1e-3.
    """
    # Keras expects the per-sample shape as a sequence; a bare int is not
    # accepted by every Keras version, so wrap it in a 1-tuple.
    try:
        input_shape = tuple(input_size)
    except TypeError:
        input_shape = (input_size,)

    inputs = tf.keras.layers.Input(shape=input_shape, name='input')
    dense1 = tf.keras.layers.Dense(32, activation='relu', name='hidden')(inputs)
    outputs = tf.keras.layers.Dense(output_size, activation='softmax', name='prob_outputs')(dense1)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # `lr` is a deprecated alias that newer Keras releases reject;
    # `learning_rate` is the supported keyword.
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    return model, optimizer

In [None]:
def discount_rewards(rewards, discount_rate=0.99, normalize_rewards=False):
    """Compute the discounted reward-to-go for every step of an episode.

    For step ``t`` the result is
    ``rewards[t] + discount_rate * rewards[t+1] + discount_rate**2 * rewards[t+2] + ...``.

    Args:
        rewards: Sequence (list or 1-D array) of per-step rewards, in the
            order they were received.
        discount_rate: Multiplicative discount applied per step into the
            future.
        normalize_rewards: If true, shift the result to zero mean and scale
            it to unit standard deviation.

    Returns:
        A float64 ``np.ndarray`` with one entry per input reward. An empty
        input yields an empty array.
    """
    num_steps = len(rewards)
    rewards_to_go = np.zeros(num_steps, dtype=np.float64)

    # Walk the episode backwards so each step reuses the already discounted
    # sum of everything that follows it.
    cumulative_rewards = 0.0
    for step in reversed(range(num_steps)):
        cumulative_rewards = rewards[step] + discount_rate * cumulative_rewards
        rewards_to_go[step] = cumulative_rewards

    if normalize_rewards and num_steps:
        rewards_to_go = rewards_to_go - rewards_to_go.mean()
        std = rewards_to_go.std()
        # A constant sequence (e.g. a one-step episode) has zero spread;
        # dividing would produce NaNs, so leave the centred values as-is.
        if std > 0:
            rewards_to_go = rewards_to_go / std
    return rewards_to_go

In [None]:
def get_action(model, state):
    """Sample one action from the policy's distribution for ``state``.

    Args:
        model: Callable policy (e.g. a Keras model) that maps a batch of
            observations of shape ``(1, n_features)`` to a batch of action
            probabilities.
        state: A single observation; it is flattened into one row before
            being passed to ``model``.

    Returns:
        The index of the sampled action.
    """
    state = np.asarray(state).reshape((1, -1))

    # Call the model directly: `predict` is built for large batched inference
    # and adds noticeable overhead when invoked once per environment step.
    action_prob = np.asarray(model(state, training=False), dtype=np.float64).flatten()

    # float32 softmax outputs can miss a sum of 1.0 by more than
    # np.random.choice tolerates, so renormalise in float64.
    action_prob = action_prob / action_prob.sum()

    # The number of actions is taken from the policy output itself rather
    # than from a module-level constant.
    selected_action = np.random.choice(action_prob.size, 1, p=action_prob)[0]
    return selected_action

In [None]:
def generate_trajectories(env, model):
    """Roll out one full episode with the current policy.

    Works with both gym step/reset conventions: the older one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    newer one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        model: Policy passed to ``get_action`` to pick each action.

    Returns:
        A ``(batch_obs, batch_actions, batch_rewards)`` tuple of arrays:
        observations stacked row-wise, actions as a column of shape
        ``(n_steps, 1)``, and the per-step rewards as a 1-D array.
    """
    batch_rewards = []
    batch_obs = []
    batch_actions = []

    reset_result = env.reset()
    # Newer gym versions return (observation, info) from reset().
    obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    done = False
    while not done:
        action = get_action(model, obs)
        # Record the observation the action was chosen from, not the next one.
        batch_obs.append(obs)

        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer API splits episode end into terminated / truncated.
            obs, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            obs, reward, done, _ = step_result

        batch_actions.append(action)
        batch_rewards.append(reward)

    # postprocess
    batch_obs = np.vstack(batch_obs)
    # Column shape so each row can serve as an index into the policy output.
    batch_actions = np.array(batch_actions).reshape((len(batch_actions), 1))
    batch_rewards = np.array(batch_rewards)
    return batch_obs, batch_actions, batch_rewards

In [None]:
def train(env, model):
    """Run one episode and apply a single REINFORCE gradient step.

    The loss is the negative sum over the episode of
    ``log pi(a_t | s_t) * G_t``, where ``G_t`` is the discounted
    reward-to-go returned by ``discount_rewards``.

    Note: the update is applied with the module-level ``optimizer``, which
    must be defined before this function is called.

    Args:
        env: Environment to roll out in.
        model: Policy network that is both used for the rollout and updated.

    Returns:
        The undiscounted sum of rewards collected in the episode.
    """
    # Roll out with the model that is being trained.
    states, actions, rewards = generate_trajectories(env, model)
    discounted_rewards = tf.cast(discount_rewards(rewards), dtype=tf.float32)

    # Forward pass
    with tf.GradientTape() as tape:
        action_probs = model(states)
        # Keep only the probability of the action actually taken at each step.
        action_probs = tf.gather_nd(action_probs, actions, batch_dims=1)
        # Clip away exact 0 and 1 before taking the log.
        action_probs = tf.clip_by_value(action_probs, 1e-8, 1 - 1e-8)

        loss = -tf.reduce_sum(tf.multiply(tf.math.log(action_probs), discounted_rewards))

    # Backwards pass
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    episode_reward = np.sum(rewards)
    return episode_reward

In [None]:
# Number of episodes to train for; one gradient step is taken per episode.
NUM_EPISODES = 650

policy_model, optimizer = build_policy_network(observation_dim, action_dim)

# Undiscounted total reward of every episode, in training order.
episode_rewards = list()

start = time.time()
for episode in range(NUM_EPISODES):
    ep_rewards = train(env, policy_model)
    episode_rewards.append(ep_rewards)

    progress = f'episode {episode}/{NUM_EPISODES} - reward: {ep_rewards}'
    # The running average is only reported once more than 50 episodes exist.
    if episode > 50:
        progress += f' - last 50 avg: {np.mean(episode_rewards[-50:])}'
    print(progress)
end = time.time()
print(f'Training time: {int(round(end - start))} seconds.')

In [None]:
# One x value per recorded episode; a hard-coded length would not match
# `episode_rewards` and make both plot() and polyfit() fail.
x = list(range(len(episode_rewards)))
plt.plot(x, episode_rewards)

# Cubic least-squares fit as a smooth trend line over the noisy rewards.
smoothed_fn = np.poly1d(np.polyfit(x, episode_rewards, 3))
plt.plot(x, smoothed_fn(x), linestyle='-')

plt.xlabel("episode number")
plt.ylabel("reward")
plt.title("Simple PG Training")
plt.show()