In [1]:
%matplotlib inline

import numpy as np
import gym
from tqdm import tqdm

from neural_agents import ActorCritic
from plots import plot_var_history, ValuePlot_2D, FunctionPlot_3D
from torch.utils.tensorboard import SummaryWriter

from IPython.core.debugger import set_trace

In [2]:
def run_experient(env, num_runs, num_episodes, agent_args, train_writer=None,
            render_env = False, plot_value_func = False, plot_state_visit = False):
    """Train every agent setting on ``env`` and record the length of each episode.

    Args:
        env: Environment exposing ``observation_space``, ``action_space``,
            ``seed``, ``reset``, ``step`` and (when ``render_env`` is set)
            ``render``.
        num_runs: Number of independent runs per agent setting. Run ``r``
            (1-based) seeds both the environment and the agent with ``r``.
        num_episodes: Number of episodes per run.
        agent_args: Iterable of ``(AgentClass, kwargs)`` pairs. Each agent is
            built as ``AgentClass(state_dim, act_dim, seed=run, **kwargs)``.
            The ``kwargs`` dict is not modified.
        train_writer: Optional object with an ``add_scalars(tag, scalars, step)``
            method. When given, the losses returned by ``agent.end`` and the
            episode length are logged once per episode.
        render_env: Call ``env.render()`` before every environment step.
        plot_value_func: Build a ``ValuePlot`` per run and refresh it at fixed
            time-step / episode milestones.
        plot_state_visit: Build a ``StateFrequencyPlot`` per run, count visits
            to the (velocity, position) cell of each observation and refresh
            the plot at fixed episode milestones.

    Returns:
        ``(reward_history, agent)``. ``reward_history[i][run - 1]`` is the list
        of per-episode time-step counts for setting ``i``. ``agent`` is the
        last agent that was created, or ``None`` if no agent was created.
    """
    # Episodes after which a plot is refreshed once the episode has ended.
    # NOTE(review): 200 sits between 1000 and 3500 in the first tuple -- it may
    # have been meant as 2000; left unchanged pending confirmation.
    state_visit_milestones = (1, 10, 25, 100, 500, 1000, 200, 3500, 5000)
    value_plot_milestones = (10, 50, 100, 200, 500, 1000, 2000, 3500, 5000)

    reward_history = []
    agent = None  # keeps the return well-defined when nothing is run
    state_dim = env.observation_space.shape[0]
    if isinstance(env.action_space, gym.spaces.Box):
        act_dim = env.action_space.shape[0]
    else:
        act_dim = env.action_space.n

    for i, (Agent, agent_kwargs) in enumerate(agent_args):
        print("Agent Setting {}".format(i+1))
        setting_history = []
        reward_history.append(setting_history)
        # Start the runs for each setting
        for run in range(1, num_runs+1):
            run_history = []
            setting_history.append(run_history)
            env.seed(run)
            # Copy so the caller's settings dict is not mutated with the seed.
            run_kwargs = dict(agent_kwargs, seed=run)
            agent = Agent(state_dim, act_dim, **run_kwargs)
            # NOTE(review): ValuePlot / StateFrequencyPlot are not among the
            # names imported from `plots` in the import cell -- confirm they
            # are defined before enabling these flags.
            if plot_value_func:
                value_plot = ValuePlot(agent)
            if plot_state_visit:
                state_freq_plot = StateFrequencyPlot(agent)
            # Start the episodes
            for episode in tqdm(range(1, num_episodes+1)):
                observation = env.reset()
                done = False
                time_step = 0
                action = agent.start(observation)
                # Interact with the environment until the episode terminates.
                while not done:
                    if render_env:
                        env.render()
                    observation, reward, done, info = env.step(action)
                    time_step += 1
                    if done:
                        actor_loss, critic_loss = agent.end(reward)
                        if train_writer is not None:
                            # Log through the writer that was passed in, not
                            # through a notebook-level global.
                            train_writer.add_scalars(
                                'Run {}'.format(run),
                                {'Actor Loss': actor_loss,
                                 'Critic Loss': critic_loss,
                                 'Total Timesteps': time_step},
                                episode)
                    else:
                        action = agent.take_step(reward, observation)
                    # The bookkeeping below also runs on the terminal step so
                    # that the end-of-episode refreshes are reachable and the
                    # final observation is counted.
                    if plot_state_visit:
                        pos = int((observation[0]-state_freq_plot.left_limit[0]) / state_freq_plot.steps[0])
                        vel = int((observation[1]-state_freq_plot.left_limit[1]) / state_freq_plot.steps[1])
                        state_freq_plot.visits[vel, pos] += 1
                        if done and episode in state_visit_milestones:
                            state_freq_plot.update("after {} episodes".format(episode))
                    # Plot the value function at fixed intervals
                    if plot_value_func:
                        if time_step % 200 == 0 and episode == 1:
                            value_plot.update("after {} timesteps".format(time_step))
                        elif done and episode in value_plot_milestones:
                            value_plot.update("after {} episodes".format(episode))
                run_history.append(time_step)
    return reward_history, agent

In [3]:
# # Setup Cart Pole environment
# cart_pole_env = gym.make('CartPole-v0').env

# # Experiment Setting
# settings = [(
#              ActorCritic,
#              {'act_hid_lyrs': [16], 'critic_hid_lyrs': [16],
#               'actor_lr': 0.001, 'critic_lr': 0.005, 'batch_size': 5000, 'is_discrete': True}
#               )]

# # Setup Tensorboard path 
# writer = SummaryWriter('../runs/name')


# _, _ = run_experient(cart_pole_env, 1, 5000, settings, train_writer=writer, render_env=True)

In [4]:
# Environment: MountainCar-v0, taking the `.env` attribute of the object
# returned by gym.make.
# NOTE: despite its name, `cart_pole_env` holds the Mountain Car environment here.
cart_pole_env = gym.make('MountainCar-v0').env

# Experiment setting: a single ActorCritic configuration, given as an
# (agent class, constructor keyword arguments) pair.
settings = [
    (
        ActorCritic,
        dict(
            act_hid_lyrs=[16, 32, 16],
            critic_hid_lyrs=[16, 16],
            actor_lr=0.0001,
            critic_lr=0.001,
            batch_size=500,
            is_discrete=True,
        ),
    ),
]

# TensorBoard writer for this experiment's log directory.
writer = SummaryWriter('../runs/mountain_car_d_big')

# One run of 5000 episodes without rendering; keeps the episode-length
# history and the trained agent.
rew_hist, ann_agent = run_experient(
    env=cart_pole_env,
    num_runs=1,
    num_episodes=5000,
    agent_args=settings,
    train_writer=writer,
    render_env=False,
)

Agent Setting 1
 27%|██▋       | 136/500 [11:45<34:08,  5.63s/it]

KeyboardInterrupt: 