This notebook contains implementations of a few basic policy gradient algorithms.

The implementations here use non-linear function approximation through neural networks.

The graphs for training loss and agent performance are written to TensorBoard.

Have Fun!

In [1]:
%matplotlib inline

import numpy as np
import gym
from tqdm import tqdm

from pg_agents import REINFORCE, ActorCritic
from plots import plot_var_history, ValuePlot_2D, FunctionPlot_3D
from torch.utils.tensorboard import SummaryWriter

import torch
import torch.nn as nn

from IPython.core.debugger import set_trace

## Available Agents
        - Policy Gradient with Baseline
        - Actor-Critic

## Hyperparameters:

### Neural Network:
        - Network Architecture
        - Learning Rate

### Algorithm
        - Advantage Function
        - Batch Size

In [2]:
def get_model_name(parameter_dict):
    """Build a run name that encodes an agent's hyperparameters.

    Args:
        parameter_dict: Mapping of hyperparameter names to values. The keys
            read are ``pol_hid_lyrs``, ``val_hid_lyrs``, ``pol_lr``,
            ``val_lr``, ``weight_decay`` and ``batch_size``; any other key is
            ignored and any missing key is rendered as ``None``.

    Returns:
        A string of the form
        ``actor_hid_<layers>_lr_<pol_lr>_critic_hid_<layers>_lr_<val_lr>_wd_<wd>_batch_<n>``
        where ``val_lr`` is written in ``.0e`` scientific notation.
    """
    def _layers_tag(layers):
        # str([16, 32]) is '[16, 32]'; dropping the commas gives '[16 32]'.
        return str(layers).replace(',', '')

    actor_layers = _layers_tag(parameter_dict.get("pol_hid_lyrs"))
    critic_layers = _layers_tag(parameter_dict.get("val_hid_lyrs"))
    pol_lr = parameter_dict.get("pol_lr")
    val_lr = parameter_dict.get("val_lr")
    batch_size = parameter_dict.get("batch_size")
    weight_decay = parameter_dict.get("weight_decay")

    # None does not support the '.0e' format spec, so a missing critic
    # learning rate is rendered as 'None' like every other missing key
    # instead of raising TypeError.
    val_lr_tag = 'None' if val_lr is None else '{:.0e}'.format(val_lr)

    return 'actor_hid_{0}_lr_{1}_critic_hid_{2}_lr_{3}_wd_{4}_batch_{5}'.format(
        actor_layers, pol_lr, critic_layers, val_lr_tag, weight_decay, batch_size)

In [3]:
def run_experient(env, num_runs, num_episodes, agent_args, train_writer=None,
            render_env = False, plot_value_func = False, plot_state_visit = False):
    """Train every agent setting on ``env`` and record the episode lengths.

    Args:
        env: Environment exposing ``observation_space``, ``action_space``,
            ``seed``, ``reset``, ``render`` and a ``step`` that returns
            ``(observation, reward, done, info)``.
        num_runs: Number of runs per setting. Run ``r`` (1-based) seeds both
            the environment and the agent with ``r``.
        num_episodes: Number of episodes per run.
        agent_args: Iterable of ``(AgentClass, kwargs)`` pairs. Each agent is
            built as ``AgentClass(state_dim, act_dim, seed=run, **kwargs)``;
            the caller's ``kwargs`` dict is left unmodified.
        train_writer: Optional object with an ``add_scalars`` method (e.g. a
            ``SummaryWriter``). When given, the actor loss, critic loss and
            episode length are logged at the end of every episode.
        render_env: Call ``env.render()`` before every environment step.
        plot_value_func: Refresh a value-function plot at fixed intervals.
        plot_state_visit: Maintain and refresh a state-visitation plot.

    Returns:
        ``(reward_history, agent)``. ``reward_history[i][run - 1]`` is the
        list of per-episode time-step counts for setting ``i``. ``agent`` is
        the last agent that was constructed, or ``None`` if none was.
    """
    # Episodes after which the diagnostic plots are refreshed.
    # NOTE(review): the state-visit list originally read "..., 1000, 200,
    # 3500, ..."; the out-of-order 200 looks like a typo for 2000 - confirm.
    state_visit_episodes = (1, 10, 25, 100, 500, 1000, 2000, 3500, 5000)
    value_plot_episodes = (10, 50, 100, 200, 500, 1000, 2000, 3500, 5000)

    reward_history = []
    # Defined up front so the return statement is valid even when
    # agent_args is empty or num_runs < 1.
    agent = None
    state_dim = env.observation_space.shape[0]
    if isinstance(env.action_space, gym.spaces.Box):
        act_dim = env.action_space.shape[0]
    else:
        act_dim = env.action_space.n

    for i, (Agent, agent_kwargs) in enumerate(agent_args):
        print("Agent Setting {}".format(i+1))
        setting_history = []
        reward_history.append(setting_history)
        # Start the runs for each setting
        for run in range(1, num_runs+1):
            episode_lengths = []
            setting_history.append(episode_lengths)
            env.seed(run)
            # Copy instead of writing "seed" into the caller's dict.
            run_kwargs = dict(agent_kwargs, seed=run)
            agent = Agent(state_dim, act_dim, **run_kwargs)
            # NOTE(review): ValuePlot and StateFrequencyPlot are not among the
            # names imported at the top of the notebook; enabling either plot
            # flag requires them to be defined elsewhere - confirm.
            # Instantiate plot for value function
            if plot_value_func:
                value_plot = ValuePlot(agent)
            # Instantiate plot for state visitation count
            if plot_state_visit:
                state_freq_plot = StateFrequencyPlot(agent)
            # Start the episodes
            for episode in tqdm(range(1, num_episodes+1)):
                observation = env.reset()
                time_step = 0
                action = agent.start(observation)
                done = False
                # Start interaction with environment
                while not done:
                    if render_env:
                        env.render()
                    observation, reward, done, _ = env.step(action)
                    time_step += 1
                    if done:
                        actor_loss, critic_loss = agent.end(reward)
                        # Log through the writer that was passed in rather
                        # than a notebook-level global.
                        if train_writer is not None:
                            train_writer.add_scalars('Run {}'.format(run), {'Actor Loss':actor_loss,
                                    'Critic Loss':critic_loss, 'Total Timesteps': time_step}, episode)
                    else:
                        action = agent.take_step(reward, observation)
                    # The plotting below runs before the loop exits so that
                    # the end-of-episode refreshes are actually reachable.
                    if plot_state_visit:
                        if not done:
                            # Update state visits (non-terminal observations only)
                            pos = int((observation[0]-state_freq_plot.left_limit[0]) / state_freq_plot.steps[0])
                            vel = int((observation[1]-state_freq_plot.left_limit[1]) / state_freq_plot.steps[1])
                            state_freq_plot.visits[vel, pos] += 1
                        elif episode in state_visit_episodes:
                            state_freq_plot.update("after {} episodes".format(episode))
                    # Plot the value function at fixed intervals
                    if plot_value_func:
                        if time_step % 200 == 0 and episode == 1:
                            value_plot.update("after {} timesteps".format(time_step))
                        elif done and episode in value_plot_episodes:
                            value_plot.update("after {} episodes".format(episode))
                episode_lengths.append(time_step)
    return reward_history, agent

In [4]:
# Setup Cart Pole environment
# `Environment` is defined here because it is used for the TensorBoard path
# below; without it this cell fails with NameError on a fresh kernel (the
# only other assignment is in the Mountain Car cell further down).
Environment = 'CartPole-v0'
# NOTE(review): `.env` presumably strips gym's outer wrapper (e.g. the episode
# time limit) - confirm this is intended for CartPole.
cart_pole_env = gym.make(Environment).env

# Experiment Setting
settings = [(
             REINFORCE,
             {'pol_hid_lyrs': [32], 'val_hid_lyrs': [16],
              'pol_lr': 0.001, 'val_lr': 0.005, 'batch_size': 5000, 'is_discrete': True}
              )]


# Kept as a literal so the TensorBoard run directory name stays the same;
# it has to be updated by hand whenever `settings` changes.
model_name = 'actor_hid_[32]_lr_1e-3_critic_hid_[16]_lr_5e-3_batch_5000'

# Setup Tensorboard path
writer = SummaryWriter('../runs/' + Environment + '/REINFORCE/' + model_name)


_, _ = run_experient(cart_pole_env, 1, 5000, settings, render_env= True, train_writer=writer)

In [5]:
# Setup Mountain Car environment
Environment = 'MountainCarContinuous-v0'
mountain_car_env = gym.make(Environment).env

# Experiment Setting: a single REINFORCE configuration with a continuous
# action space (`is_discrete` is False).
reinforce_kwargs = {
    'pol_hid_lyrs': [16, 32, 32, 32, 16],
    'val_hid_lyrs': [16, 32, 32, 16],
    'pol_lr': 5e-4,
    'val_lr': 5e-3,
    'weight_decay': 0.01,
    'batch_size': 500,
    'is_discrete': False,
}
settings = [(REINFORCE, reinforce_kwargs)]

# The run name is derived from the hyperparameters of the first (only) setting.
model_name = get_model_name(reinforce_kwargs)

# Setup Tensorboard path
log_dir = '../runs/' + Environment + '/REINFORCE/' + model_name
writer = SummaryWriter(log_dir)

# One run of 5000 episodes, logging to TensorBoard and without rendering.
rew_hist, ann_agent = run_experient(
    mountain_car_env, 1, 5000, settings,
    train_writer=writer, render_env=False,
)