Code adapted from Youness Mansar

Original Code: https://github.com/CVxTz/RL
Article discussing PPO: https://towardsdatascience.com/learning-to-play-cartpole-and-lunarlander-with-proximal-policy-optimization-dacbd6045417

In [None]:
from pathlib import Path

import gym
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

import matplotlib.pyplot as plt
from IPython.display import clear_output
import math

import time

from model import (
    PolicyNetwork,
    ValueNetwork,
    device,
    train_value_network,
    train_policy_network,
)
from replay import Episode, History

%matplotlib inline

In [None]:
def plot_res(values, title='', goal=195):
    """Redraw the reward curve and a histogram of the most recent scores.

    Args:
        values: Sequence of per-episode scores, oldest first.
        title: Text used as the figure's super-title.
        goal: Score at which the red reference line is drawn on both plots.
            Defaults to 195, the value that used to be hard-coded here.
    """
    # Clear the cell's previous output so the new figure replaces the old one
    # instead of stacking one figure per call.
    clear_output(wait=True)

    fig, (curve_ax, hist_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
    fig.suptitle(title)

    # Left panel: score of every episode so far, with the goal as a dashed line.
    curve_ax.plot(values, label='score per run')
    curve_ax.axhline(goal, c='red', ls='--', label='goal')
    curve_ax.set_xlabel('Episodes')
    curve_ax.set_ylabel('Reward')
    curve_ax.legend()

    # Right panel: distribution of the last 50 scores only.
    hist_ax.hist(values[-50:])
    hist_ax.axvline(goal, c='red', label='goal')
    hist_ax.set_xlabel('Scores per Last 50 Episodes')
    hist_ax.set_ylabel('Frequency')
    hist_ax.legend()

    plt.show()

In [None]:
def main(
    env_name="CartPole-v1",
    reward_scale=20.0,
    clip=0.2,
    log_dir="../logs",
    learning_rate=0.001,
    state_scale=1.0,
    n_epoch=1,
    max_episodes=20,
    max_timesteps=400,
    batch_size=32,
    max_iterations=15,
):
    """Alternate between collecting episodes and training the policy/value networks.

    Each iteration plays ``max_episodes`` episodes with the current policy,
    stores them in a ``History``, then runs ``train_policy_network`` and
    ``train_value_network`` on that data before discarding it.

    The code uses the gym API in which ``env.reset()`` returns only the
    observation and ``env.step()`` returns a 4-tuple.

    Args:
        env_name: Environment id passed to ``gym.make``. The environment must
            expose ``action_space.n``.
        reward_scale: Forwarded to ``Episode.append`` with every reward.
        clip: Forwarded to ``train_policy_network``.
        log_dir: Not used by this function; kept so existing callers that pass
            it keep working.
        learning_rate: Learning rate of both Adam optimizers.
        state_scale: Every observation is divided by this value before it is
            given to the networks or stored in the episode.
        n_epoch: ``epochs`` argument of both training calls.
        max_episodes: Number of episodes collected per iteration.
        max_timesteps: Maximum number of steps per episode. If the cap is hit
            before the environment reports ``done``, the episode is closed with
            the value network's estimate for the last observation.
        batch_size: Batch size of the ``DataLoader`` built over the history.
        max_iterations: Number of collect-then-train iterations.

    Returns:
        None. Progress is shown by calling ``plot_res`` after every episode.
    """
    title = env_name
    final = []  # unscaled total reward of every episode so far, for plotting
    env = gym.make(env_name)

    # The environment holds resources of its own; release them even if
    # training raises part-way through.
    try:
        # An initial observation is needed only to size the network inputs.
        observation = env.reset()

        n_actions = env.action_space.n
        feature_dim = observation.size

        value_model = ValueNetwork(in_dim=feature_dim).to(device)
        value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

        policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
        policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

        history = History()

        # NOTE(review): these two counters are incremented below but never read
        # in this function; looks like a leftover from removed logging.
        epoch_ite = 0
        episode_ite = 0

        for ite in range(max_iterations):

            for episode_i in range(max_episodes):

                observation = env.reset()
                episode = Episode()
                total = 0

                for timestep in range(max_timesteps):

                    action, log_probability = policy_model.sample_action(
                        observation / state_scale
                    )

                    value = value_model.state_value(observation / state_scale)

                    new_observation, reward, done, info = env.step(action)

                    total += reward

                    # Store the state the action was taken in, not the one it
                    # led to.
                    episode.append(
                        observation=observation / state_scale,
                        action=action,
                        reward=reward,
                        value=value,
                        log_probability=log_probability,
                        reward_scale=reward_scale,
                    )

                    observation = new_observation

                    if done:
                        # The environment ended the episode: close it with a
                        # last value of 0.
                        episode.end_episode(last_value=0)
                        break

                    if timestep == max_timesteps - 1:
                        # Step cap reached without `done`: close the episode
                        # with the value estimate of the state we stopped in.
                        value = value_model.state_value(observation / state_scale)
                        episode.end_episode(last_value=value)

                episode_ite += 1
                final.append(total)
                plot_res(final, title)

                history.add_episode(episode)

            history.build_dataset()
            data_loader = DataLoader(history, batch_size=batch_size, shuffle=True)

            policy_loss = train_policy_network(
                policy_model, policy_optimizer, data_loader, epochs=n_epoch, clip=clip
            )

            value_loss = train_value_network(
                value_model, value_optimizer, data_loader, epochs=n_epoch
            )

            # Only counts the returned loss pairs; the losses are not used.
            for p_l, v_l in zip(policy_loss, value_loss):
                epoch_ite += 1

            # Drop this iteration's episodes before collecting new ones.
            history.free_memory()
    finally:
        env.close()


In [None]:
# Run training with the default arguments (environment "CartPole-v1").
main()