In [1]:
%matplotlib inline

import numpy as np
import gym
from tqdm import tqdm

from neural_agents import REINFORCE, ActorCritic
from plots import plot_var_history, ValuePlot_2D, FunctionPlot_3D
from torch.utils.tensorboard import SummaryWriter

import torch
import torch.nn as nn

from IPython.core.debugger import set_trace

In [2]:
def run_experient(env, num_runs, num_episodes, agent_args, train_writer=None,
            render_env = False, plot_value_func = False, plot_state_visit = False):
    """Train every agent setting on ``env`` and record the length of each episode.

    For each ``(Agent, agent_kwargs)`` pair, ``num_runs`` independent runs are
    executed. Run ``r`` (1-based) seeds the environment with ``env.seed(r)``
    and builds a fresh agent as ``Agent(state_dim, act_dim, seed=r, **agent_kwargs)``.
    Each run consists of ``num_episodes`` episodes driven through the agent's
    ``start`` / ``take_step`` / ``end`` methods.

    Args:
        env: Environment exposing ``observation_space``, ``action_space``,
            ``seed``, ``reset``, ``step`` (returning a 4-tuple) and ``render``.
        num_runs: Number of runs per agent setting.
        num_episodes: Number of episodes per run.
        agent_args: Iterable of ``(AgentClass, kwargs_dict)`` pairs. The
            dictionaries are copied before the seed is added, so the caller's
            objects are left untouched.
        train_writer: Optional object with an ``add_scalars(tag, dict, step)``
            method. When given, the two values returned by ``agent.end`` and
            the episode length are logged once per episode under the tag
            ``'Run <run>'``.
        render_env: Call ``env.render()`` before every environment step.
        plot_value_func: Refresh a value-function plot every 200 time steps of
            the first episode and at the end of selected episodes.
        plot_state_visit: Accumulate state-visit counts and refresh the
            visitation plot at the end of selected episodes.

    Returns:
        ``(reward_history, agent)`` where ``reward_history[i][run - 1]`` is the
        list of per-episode step counts (despite the name, step counts are what
        is stored) for setting ``i``, and ``agent`` is the last agent that was
        created, or ``None`` if no agent was created.
    """
    # Episodes after which the respective plots are refreshed.
    # The original visit list contained 200 between 1000 and 3500, which breaks
    # the ascending order; 2000 is the evident intent.
    state_visit_snapshots = (1, 10, 25, 100, 500, 1000, 2000, 3500, 5000)
    value_func_snapshots = (10, 50, 100, 200, 500, 1000, 2000, 3500, 5000)

    reward_history = []
    agent = None  # stays None when agent_args is empty or num_runs < 1
    state_dim = env.observation_space.shape[0]
    # Box action spaces report their size through shape; otherwise ``n`` is used.
    if isinstance(env.action_space, gym.spaces.Box):
        act_dim = env.action_space.shape[0]
    else:
        act_dim = env.action_space.n

    for i, (Agent, agent_kwargs) in enumerate(agent_args):
        print("Agent Setting {}".format(i+1))
        setting_history = []
        reward_history.append(setting_history)
        # Start the runs for each setting
        for run in range(1, num_runs+1):
            run_history = []
            setting_history.append(run_history)
            env.seed(run)
            # Work on a copy so the caller's settings dict is not mutated.
            run_kwargs = dict(agent_kwargs)
            run_kwargs["seed"] = run
            agent = Agent(state_dim, act_dim, **run_kwargs)
            # Instantiate plot for value function
            # NOTE(review): ValuePlot and StateFrequencyPlot are not among the
            # names imported in the visible import cell (ValuePlot_2D,
            # FunctionPlot_3D) -- confirm the intended classes before enabling
            # the plotting flags.
            if plot_value_func:
                value_plot = ValuePlot(agent)
            # Instantiate plot for state visitation count
            if plot_state_visit:
                state_freq_plot = StateFrequencyPlot(agent)
            # Start the episodes
            for episode in tqdm(range(1, num_episodes+1)):
                observation = env.reset()
                done = False
                time_step = 0
                action = agent.start(observation)
                # Interact with the environment until the episode terminates
                while not done:
                    if render_env:
                        env.render()
                    observation, reward, done, info = env.step(action)
                    time_step += 1
                    if done:
                        actor_loss, critic_loss = agent.end(reward)
                        if train_writer:
                            # Log through the writer that was passed in rather
                            # than a notebook-level global.
                            train_writer.add_scalars(
                                'Run {}'.format(run),
                                {'Actor Loss': actor_loss,
                                 'Critic Loss': critic_loss,
                                 'Total Timesteps': time_step},
                                episode)
                    else:
                        action = agent.take_step(reward, observation)
                    # Update state visits
                    if plot_state_visit:
                        if not done:
                            # Only non-terminal observations are binned, as before.
                            pos = int((observation[0]-state_freq_plot.left_limit[0]) / state_freq_plot.steps[0])
                            vel = int((observation[1]-state_freq_plot.left_limit[1]) / state_freq_plot.steps[1])
                            state_freq_plot.visits[vel, pos] += 1
                        elif episode in state_visit_snapshots:
                            # End-of-episode snapshot; this used to sit behind
                            # the loop's break and could never run.
                            state_freq_plot.update("after {} episodes".format(episode))
                    # Plot the value function at fixed intervals
                    if plot_value_func:
                        if time_step % 200 == 0 and episode == 1:
                            value_plot.update("after {} timesteps".format(time_step))
                        elif done and episode in value_func_snapshots:
                            value_plot.update("after {} episodes".format(episode))
                run_history.append(time_step)
    return reward_history, agent

In [3]:
# # Setup Cart Pole environment
# cart_pole_env = gym.make('CartPole-v0').env

# # Experiment Setting
# settings = [(
#              REINFORCE,
#              {'pol_hid_lyrs': [16], 'val_hid_lyrs': [16],
#               'pol_lr': 0.001, 'val_lr': 0.005, 'batch_size': 5000, 'is_discrete': True}
#               )]

# # Setup Tensorboard path 
# writer = SummaryWriter('../runs/CartPole-v0/REINFORCE')


# _, _ = run_experient(cart_pole_env, 1, 5000, settings, train_writer=writer)

In [4]:
# # Setup Mountain Car environment
# Environment = 'MountainCarContinuous-v0'
# mountain_car_env = gym.make(Environment).env

# # Experiment Setting
# settings = [(
#              REINFORCE,
#              {'pol_hid_lyrs': [16, 32, 16], 'val_hid_lyrs': [16, 16],
#               'pol_lr': 1e-4, 'val_lr': 1e-3, 'batch_size': 500, 'is_discrete': False}
#               )]

# model_name = 'actor_hid_[16 32 16]_lr_1e-4_critic_hid_[16 16]_lr_1e-3_batch_500'

# # Setup Tensorboard path 
# writer = SummaryWriter('../runs/' + Environment + '/REINFORCE/' + model_name)


# rew_hist, ann_agent = run_experient(mountain_car_env, 1, 5000, settings, train_writer=writer, render_env=False)

In [5]:
# Setup Mountain Car environment
Environment = 'MountainCarContinuous-v0'
# `.env` takes the environment held inside the object returned by gym.make
# (presumably to drop the episode time-limit wrapper -- confirm).
mountain_car_env = gym.make(Environment).env

# Experiment Setting: one (agent class, constructor kwargs) pair per configuration
settings = [(
             ActorCritic,
             {'pol_hid_lyrs': [16, 32, 16], 'val_hid_lyrs': [16, 16],
              'pol_act': nn.Tanh, 'val_act': nn.Tanh, 'discount': 1,
              'pol_lr': 1e-4, 'val_lr': 1e-3, 'batch_size': 500, 'is_discrete': False}
              )]

# Used as the leaf directory of the TensorBoard log path; mirrors the settings above
model_name = 'actor_hid_[16 32 16]_lr_1e-4_critic_hid_[16 16]_lr_1e-3_batch_500'

# Setup Tensorboard path
writer = SummaryWriter('../runs/' + Environment + '/Trial/' + model_name)

# A 5000-episode run is routinely stopped by hand; close the writer in every
# case so buffered scalars are written to disk and the event files are released.
try:
    rew_hist, ann_agent = run_experient(mountain_car_env, 1, 5000, settings, train_writer=writer, render_env=True)
finally:
    writer.close()

Agent Setting 1
  0%|          | 2/5000 [00:13<7:59:12,  5.75s/it]

KeyboardInterrupt: 

In [30]:
# Draw a 10x2 sample from the standard normal distribution and display it.
a = torch.randn((10, 2))
print(a)
# Splitting along dim 1 with chunk size 1 yields one (10, 1) tensor per column;
# note that `a` is rebound to the first column here.
a, b = a.split(1, dim=1)

tensor([[-0.5675, -0.6354],
        [ 0.6367,  2.2053],
        [-0.4758,  1.8802],
        [-0.5544, -2.2358],
        [-0.4321, -1.6297],
        [-0.0800, -0.0044],
        [ 1.4590, -0.4275],
        [ 0.3299, -0.8522],
        [ 0.1406, -0.2824],
        [ 0.6118,  2.0799]])
