# Exploration Test sandbox

In [1]:
%matplotlib notebook

import argparse
import sys
from tqdm import trange

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)

import gym
from gym import wrappers, logger

from multigoal_env.multigoal import MultiGoal

from spinup.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
from agents import RandomAgent

import torch
import tree

# Multi Goal Environment

## The code for multi goal is written in the `multigoal_env` folder
## The enviroment is registered in `__init__.py` and defined in `multigoal.py`

#### About multigoal environment: The multigoal environment depicts a 2D state space with multimodal rewards. In many applications, we want to learn a policy that achieves high reward via distinct trajectories. The multigoal enviroment is a nice and easy test bed to test various RL algorithms for such multimodal reward setting.



In [1]:
%matplotlib notebook

import argparse
import sys
from tqdm import trange

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)

import gym
from gym import wrappers, logger

from multigoal_env.multigoal import MultiGoal

from spinup.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
from agents import RandomAgent

import torch
import tree

env_id = 'MultiGoal-v0'
seed = 1234

In [3]:
def process_sample(observation, action,reward, 
                    terminal,next_observation,info):
    
    processed_observation = {
        'observations': observation,
        'actions': action,
        'rewards': np.atleast_1d(reward),
        'terminals': np.atleast_1d(terminal),
        'next_observations': next_observation,
        'infos': info,
    }

    return processed_observation

def test_policy(agent_fun):
    
    env = gym.make(env_id)
    env.seed(seed)
    reward = 0.
    done = False
    agent = agent_fun(env)
    current_path = []
    
    path_length = 0
    path_return = 0.
    num_episodes = 10
    max_steps = 50
    all_paths = []
    total_rewards = []
    
    for episode in range(1, num_episodes+1):

        current_ob = env.reset()
        rewards = []

        while True:
            ob_tensor = torch.as_tensor(current_ob, dtype=torch.float32)
            action = agent.act(ob_tensor)
            
            ob, reward, done, info = env.step(action)

            processed_sample = process_sample(observation=current_ob, action=action, reward=reward,
                                              terminal=done, next_observation=ob,info=info)

            rewards.append(reward)
            current_path.append(processed_sample)

            path_length += 1

            if done or path_length > max_steps:
                current_ob = env.reset()
                path_length = 0
                last_path = tree.map_structure(lambda *x: np.stack(x, axis=0), *current_path)
                all_paths.append(last_path)
                env.render_rollouts(all_paths)
                current_path = []

                break
            else:
                current_ob = ob    
            
        total_rewards.append((np.sum(rewards), np.mean(rewards)))
#         if episode % 5 == 0:
#             print ('Episode Number: {:d} | Total Reward: {:.4f} '.format(episode, np.sum(rewards)))

    env.close()
    total_rewards = tree.map_structure(lambda *x: np.stack(x, axis=0), *total_rewards)
    return total_rewards

# Random Agent

In [4]:
agent = lambda env: RandomAgent(env.action_space)
total_rewards = test_policy(agent)
print ('Average Reward across {:d} episodes is {:.2f}'.format(total_rewards[0].shape[0], np.mean(total_rewards[0])))
print ('Reward per step of the particle is {:.2f}'.format(np.mean(total_rewards[1])))



<IPython.core.display.Javascript object>

Average Reward across 10 episodes is -1418.61
Reward per step of the particle is -30.73


# Policy Gradient 

In [3]:
hid = 10
num_layers = 2
gamma = 0.9
num_epochs = 500
cpu = 2

MAX_STEPS_PER_EPISODE = 100
steps = MAX_STEPS_PER_EPISODE
seed = 37

In [4]:
from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs('vpg', seed, data_dir='model/')
from algos.vpg_source.custom_actor_critic import MLPActorCritic
from algos.vpg_source.vpg import vpg as vpg_pytorch

In [15]:
epoch_reward = vpg_pytorch(lambda : gym.make(env_id), actor_critic=MLPActorCritic,
                            ac_kwargs=dict(hidden_sizes=[hid]*num_layers), gamma=gamma, 
                            seed=seed, steps_per_epoch=steps, epochs=num_epochs,
                            logger_kwargs=logger_kwargs, max_ep_len=MAX_STEPS_PER_EPISODE)

[32;1mLogging data to model/vpg/vpg_s37/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            10,
            10
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "env_fn":	"<function <lambda> at 0x7f2ab2a13dd0>",
    "epochs":	500,
    "exp_name":	"vpg",
    "gamma":	0.9,
    "lam":	0.97,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f2ad44aa690>":	{
            "epoch_dict":	{},
            "exp_name":	"vpg",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"model/vpg/vpg_s37",
            "output_file":	{
                "<_io.TextIOWrapper name='model/vpg/vpg_s37/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                }
            }
        }
    },
    "logger_kwargs":	{
        "exp_name":	"vpg",
        "output_dir":	"model/vpg/vpg_s37"
    },
    "max_ep_len":	100,
    "pi_lr":	0.0003,
 



---------------------------------------
|             Epoch |              10 |
|      AverageEpRet |       -1.12e+03 |
|          StdEpRet |             913 |
|          MaxEpRet |            -382 |
|          MinEpRet |       -3.37e+03 |
|             EpLen |              37 |
|      AverageVVals |           -6.81 |
|          StdVVals |            3.89 |
|          MaxVVals |            2.31 |
|          MinVVals |           -12.6 |
| TotalEnvInteracts |         1.1e+03 |
|            LossPi |         -0.0757 |
|             LossV |        6.81e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -735 |
|           Entropy |           0.919 |
|                KL |       -2.62e-09 |
|              Time |            3.72 |
---------------------------------------




---------------------------------------
|             Epoch |              20 |
|      AverageEpRet |       -1.15e+03 |
|          StdEpRet |        1.06e+03 |
|          MaxEpRet |            -190 |
|          MinEpRet |       -3.31e+03 |
|             EpLen |            39.9 |
|      AverageVVals |           -17.5 |
|          StdVVals |            2.48 |
|          MaxVVals |           -13.5 |
|          MinVVals |           -21.5 |
| TotalEnvInteracts |         2.1e+03 |
|            LossPi |          -0.161 |
|             LossV |         5.6e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -386 |
|           Entropy |           0.917 |
|                KL |       -3.34e-09 |
|              Time |            7.26 |
---------------------------------------




---------------------------------------
|             Epoch |              30 |
|      AverageEpRet |       -1.28e+03 |
|          StdEpRet |             932 |
|          MaxEpRet |            -352 |
|          MinEpRet |       -3.49e+03 |
|             EpLen |            43.2 |
|      AverageVVals |           -25.8 |
|          StdVVals |            2.82 |
|          MaxVVals |           -7.48 |
|          MinVVals |           -29.7 |
| TotalEnvInteracts |         3.1e+03 |
|            LossPi |          -0.212 |
|             LossV |        5.75e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -487 |
|           Entropy |           0.916 |
|                KL |       -2.03e-09 |
|              Time |              11 |
---------------------------------------




---------------------------------------
|             Epoch |              40 |
|      AverageEpRet |            -945 |
|          StdEpRet |             692 |
|          MaxEpRet |            -211 |
|          MinEpRet |       -3.24e+03 |
|             EpLen |            31.6 |
|      AverageVVals |           -34.2 |
|          StdVVals |            2.24 |
|          MaxVVals |           -30.8 |
|          MinVVals |           -37.7 |
| TotalEnvInteracts |         4.1e+03 |
|            LossPi |          -0.134 |
|             LossV |        4.66e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -315 |
|           Entropy |           0.913 |
|                KL |        1.31e-09 |
|              Time |            14.1 |
---------------------------------------




---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |            -898 |
|          StdEpRet |             696 |
|          MaxEpRet |            -173 |
|          MinEpRet |       -3.03e+03 |
|             EpLen |            31.5 |
|      AverageVVals |             -42 |
|          StdVVals |            2.09 |
|          MaxVVals |           -38.8 |
|          MinVVals |           -45.6 |
| TotalEnvInteracts |         5.1e+03 |
|            LossPi |          -0.191 |
|             LossV |        3.86e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -270 |
|           Entropy |           0.911 |
|                KL |       -5.96e-10 |
|              Time |            17.4 |
---------------------------------------




---------------------------------------
|             Epoch |              60 |
|      AverageEpRet |       -1.28e+03 |
|          StdEpRet |        1.02e+03 |
|          MaxEpRet |            -351 |
|          MinEpRet |       -3.86e+03 |
|             EpLen |              43 |
|      AverageVVals |             -50 |
|          StdVVals |            2.25 |
|          MaxVVals |           -46.4 |
|          MinVVals |           -53.7 |
| TotalEnvInteracts |         6.1e+03 |
|            LossPi |          -0.182 |
|             LossV |        4.33e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -332 |
|           Entropy |           0.909 |
|                KL |        1.85e-09 |
|              Time |            20.5 |
---------------------------------------




---------------------------------------
|             Epoch |              70 |
|      AverageEpRet |            -973 |
|          StdEpRet |             791 |
|          MaxEpRet |            -175 |
|          MinEpRet |       -2.85e+03 |
|             EpLen |            34.7 |
|      AverageVVals |           -58.4 |
|          StdVVals |            2.18 |
|          MaxVVals |             -55 |
|          MinVVals |           -61.8 |
| TotalEnvInteracts |         7.1e+03 |
|            LossPi |          -0.156 |
|             LossV |        3.17e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -244 |
|           Entropy |           0.906 |
|                KL |       -3.16e-09 |
|              Time |            23.7 |
---------------------------------------




---------------------------------------
|             Epoch |              80 |
|      AverageEpRet |            -763 |
|          StdEpRet |             463 |
|          MaxEpRet |            -303 |
|          MinEpRet |       -1.78e+03 |
|             EpLen |            27.9 |
|      AverageVVals |           -66.1 |
|          StdVVals |             2.3 |
|          MaxVVals |           -62.6 |
|          MinVVals |           -69.7 |
| TotalEnvInteracts |         8.1e+03 |
|            LossPi |          -0.136 |
|             LossV |        3.34e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -268 |
|           Entropy |           0.904 |
|                KL |       -1.19e-10 |
|              Time |            26.9 |
---------------------------------------




---------------------------------------
|             Epoch |              90 |
|      AverageEpRet |       -1.11e+03 |
|          StdEpRet |             789 |
|          MaxEpRet |            -302 |
|          MinEpRet |       -3.67e+03 |
|             EpLen |              39 |
|      AverageVVals |           -74.2 |
|          StdVVals |            2.12 |
|          MaxVVals |           -70.8 |
|          MinVVals |           -77.6 |
| TotalEnvInteracts |         9.1e+03 |
|            LossPi |         -0.0898 |
|             LossV |        2.88e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -228 |
|           Entropy |           0.902 |
|                KL |        1.19e-10 |
|              Time |            30.2 |
---------------------------------------




---------------------------------------
|             Epoch |             100 |
|      AverageEpRet |       -1.13e+03 |
|          StdEpRet |             801 |
|          MaxEpRet |            -291 |
|          MinEpRet |       -3.35e+03 |
|             EpLen |            39.4 |
|      AverageVVals |             -82 |
|          StdVVals |            2.35 |
|          MaxVVals |           -78.3 |
|          MinVVals |           -85.6 |
| TotalEnvInteracts |        1.01e+04 |
|            LossPi |           -0.19 |
|             LossV |        2.97e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -247 |
|           Entropy |           0.901 |
|                KL |        2.21e-09 |
|              Time |            33.4 |
---------------------------------------




---------------------------------------
|             Epoch |             110 |
|      AverageEpRet |            -714 |
|          StdEpRet |             502 |
|          MaxEpRet |            -206 |
|          MinEpRet |       -2.59e+03 |
|             EpLen |            26.2 |
|      AverageVVals |             -89 |
|          StdVVals |            1.92 |
|          MaxVVals |           -86.4 |
|          MinVVals |           -92.3 |
| TotalEnvInteracts |        1.11e+04 |
|            LossPi |          -0.143 |
|             LossV |         1.8e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -150 |
|           Entropy |           0.899 |
|                KL |        1.25e-09 |
|              Time |            36.7 |
---------------------------------------




---------------------------------------
|             Epoch |             120 |
|      AverageEpRet |            -889 |
|          StdEpRet |             524 |
|          MaxEpRet |            -261 |
|          MinEpRet |       -2.92e+03 |
|             EpLen |              32 |
|      AverageVVals |           -96.5 |
|          StdVVals |            2.15 |
|          MaxVVals |           -93.1 |
|          MinVVals |           -99.8 |
| TotalEnvInteracts |        1.21e+04 |
|            LossPi |          -0.133 |
|             LossV |        1.78e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -163 |
|           Entropy |           0.896 |
|                KL |        3.52e-09 |
|              Time |            39.8 |
---------------------------------------




---------------------------------------
|             Epoch |             130 |
|      AverageEpRet |            -620 |
|          StdEpRet |             308 |
|          MaxEpRet |            -159 |
|          MinEpRet |       -1.49e+03 |
|             EpLen |            21.7 |
|      AverageVVals |            -104 |
|          StdVVals |            2.17 |
|          MaxVVals |            -101 |
|          MinVVals |            -107 |
| TotalEnvInteracts |        1.31e+04 |
|            LossPi |          -0.162 |
|             LossV |        1.66e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -144 |
|           Entropy |           0.894 |
|                KL |        2.98e-09 |
|              Time |              43 |
---------------------------------------




---------------------------------------
|             Epoch |             140 |
|      AverageEpRet |            -941 |
|          StdEpRet |             838 |
|          MaxEpRet |            -245 |
|          MinEpRet |       -3.61e+03 |
|             EpLen |              32 |
|      AverageVVals |            -111 |
|          StdVVals |            2.59 |
|          MaxVVals |            -108 |
|          MinVVals |            -116 |
| TotalEnvInteracts |        1.41e+04 |
|            LossPi |          -0.111 |
|             LossV |        2.19e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -229 |
|           Entropy |           0.892 |
|                KL |        4.17e-10 |
|              Time |            46.3 |
---------------------------------------




---------------------------------------
|             Epoch |             150 |
|      AverageEpRet |            -757 |
|          StdEpRet |             626 |
|          MaxEpRet |            -182 |
|          MinEpRet |       -3.27e+03 |
|             EpLen |            27.4 |
|      AverageVVals |            -119 |
|          StdVVals |            1.84 |
|          MaxVVals |            -116 |
|          MinVVals |            -122 |
| TotalEnvInteracts |        1.51e+04 |
|            LossPi |           -0.14 |
|             LossV |        1.47e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -134 |
|           Entropy |           0.891 |
|                KL |       -3.93e-09 |
|              Time |            49.4 |
---------------------------------------




---------------------------------------
|             Epoch |             160 |
|      AverageEpRet |            -565 |
|          StdEpRet |             297 |
|          MaxEpRet |            -194 |
|          MinEpRet |       -1.39e+03 |
|             EpLen |            21.1 |
|      AverageVVals |            -127 |
|          StdVVals |            1.87 |
|          MaxVVals |            -123 |
|          MinVVals |            -129 |
| TotalEnvInteracts |        1.61e+04 |
|            LossPi |          -0.143 |
|             LossV |        1.75e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -144 |
|           Entropy |           0.889 |
|                KL |       -1.37e-09 |
|              Time |            52.7 |
---------------------------------------




---------------------------------------
|             Epoch |             170 |
|      AverageEpRet |            -935 |
|          StdEpRet |             724 |
|          MaxEpRet |            -175 |
|          MinEpRet |       -2.72e+03 |
|             EpLen |            33.3 |
|      AverageVVals |            -133 |
|          StdVVals |            2.31 |
|          MaxVVals |            -130 |
|          MinVVals |            -137 |
| TotalEnvInteracts |        1.71e+04 |
|            LossPi |          -0.155 |
|             LossV |        1.38e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -153 |
|           Entropy |           0.887 |
|                KL |       -7.57e-09 |
|              Time |            55.9 |
---------------------------------------




---------------------------------------
|             Epoch |             180 |
|      AverageEpRet |            -546 |
|          StdEpRet |             418 |
|          MaxEpRet |            -192 |
|          MinEpRet |        -2.6e+03 |
|             EpLen |              19 |
|      AverageVVals |            -140 |
|          StdVVals |            1.92 |
|          MaxVVals |            -138 |
|          MinVVals |            -144 |
| TotalEnvInteracts |        1.81e+04 |
|            LossPi |          -0.158 |
|             LossV |        1.44e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -130 |
|           Entropy |           0.885 |
|                KL |        2.03e-09 |
|              Time |            59.2 |
---------------------------------------




---------------------------------------
|             Epoch |             190 |
|      AverageEpRet |            -662 |
|          StdEpRet |             527 |
|          MaxEpRet |            -215 |
|          MinEpRet |       -2.99e+03 |
|             EpLen |            24.4 |
|      AverageVVals |            -147 |
|          StdVVals |            1.69 |
|          MaxVVals |            -144 |
|          MinVVals |            -149 |
| TotalEnvInteracts |        1.91e+04 |
|            LossPi |           -0.18 |
|             LossV |        1.12e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -74.4 |
|           Entropy |           0.883 |
|                KL |       -1.49e-09 |
|              Time |            62.4 |
---------------------------------------




---------------------------------------
|             Epoch |             200 |
|      AverageEpRet |            -546 |
|          StdEpRet |             415 |
|          MaxEpRet |            -186 |
|          MinEpRet |       -2.45e+03 |
|             EpLen |            19.7 |
|      AverageVVals |            -152 |
|          StdVVals |            1.43 |
|          MaxVVals |            -150 |
|          MinVVals |            -154 |
| TotalEnvInteracts |        2.01e+04 |
|            LossPi |          -0.234 |
|             LossV |        1.05e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -58.4 |
|           Entropy |            0.88 |
|                KL |        1.19e-09 |
|              Time |            65.6 |
---------------------------------------




---------------------------------------
|             Epoch |             210 |
|      AverageEpRet |            -758 |
|          StdEpRet |             699 |
|          MaxEpRet |            -278 |
|          MinEpRet |       -3.29e+03 |
|             EpLen |            26.9 |
|      AverageVVals |            -158 |
|          StdVVals |            2.66 |
|          MaxVVals |            -154 |
|          MinVVals |            -162 |
| TotalEnvInteracts |        2.11e+04 |
|            LossPi |           -0.15 |
|             LossV |        1.14e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -150 |
|           Entropy |           0.877 |
|                KL |        4.17e-10 |
|              Time |            68.9 |
---------------------------------------




---------------------------------------
|             Epoch |             220 |
|      AverageEpRet |            -894 |
|          StdEpRet |             626 |
|          MaxEpRet |            -277 |
|          MinEpRet |       -2.41e+03 |
|             EpLen |            33.3 |
|      AverageVVals |            -164 |
|          StdVVals |            1.63 |
|          MaxVVals |            -162 |
|          MinVVals |            -167 |
| TotalEnvInteracts |        2.21e+04 |
|            LossPi |          -0.154 |
|             LossV |         7.8e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -49.5 |
|           Entropy |           0.875 |
|                KL |        6.08e-09 |
|              Time |            72.1 |
---------------------------------------




---------------------------------------
|             Epoch |             230 |
|      AverageEpRet |            -825 |
|          StdEpRet |             825 |
|          MaxEpRet |            -152 |
|          MinEpRet |       -2.96e+03 |
|             EpLen |            30.4 |
|      AverageVVals |            -170 |
|          StdVVals |            1.72 |
|          MaxVVals |            -167 |
|          MinVVals |            -172 |
| TotalEnvInteracts |        2.31e+04 |
|            LossPi |          -0.197 |
|             LossV |         8.6e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -97.7 |
|           Entropy |           0.872 |
|                KL |        5.96e-10 |
|              Time |            75.3 |
---------------------------------------




---------------------------------------
|             Epoch |             240 |
|      AverageEpRet |            -627 |
|          StdEpRet |             447 |
|          MaxEpRet |            -264 |
|          MinEpRet |        -2.6e+03 |
|             EpLen |            22.6 |
|      AverageVVals |            -175 |
|          StdVVals |             1.9 |
|          MaxVVals |            -173 |
|          MinVVals |            -179 |
| TotalEnvInteracts |        2.41e+04 |
|            LossPi |          -0.156 |
|             LossV |        8.41e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -73.1 |
|           Entropy |            0.87 |
|                KL |         2.5e-09 |
|              Time |            78.5 |
---------------------------------------




---------------------------------------
|             Epoch |             250 |
|      AverageEpRet |            -627 |
|          StdEpRet |             501 |
|          MaxEpRet |            -143 |
|          MinEpRet |        -2.6e+03 |
|             EpLen |            24.2 |
|      AverageVVals |            -179 |
|          StdVVals |           0.164 |
|          MaxVVals |            -179 |
|          MinVVals |            -179 |
| TotalEnvInteracts |        2.51e+04 |
|            LossPi |         -0.0861 |
|             LossV |        6.77e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -18.2 |
|           Entropy |           0.868 |
|                KL |       -1.19e-10 |
|              Time |            81.7 |
---------------------------------------




---------------------------------------
|             Epoch |             260 |
|      AverageEpRet |            -638 |
|          StdEpRet |             528 |
|          MaxEpRet |            -219 |
|          MinEpRet |       -2.24e+03 |
|             EpLen |              24 |
|      AverageVVals |            -181 |
|          StdVVals |            1.39 |
|          MaxVVals |            -179 |
|          MinVVals |            -184 |
| TotalEnvInteracts |        2.61e+04 |
|            LossPi |          -0.126 |
|             LossV |        7.15e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -42.7 |
|           Entropy |           0.866 |
|                KL |        1.07e-09 |
|              Time |            84.9 |
---------------------------------------




---------------------------------------
|             Epoch |             270 |
|      AverageEpRet |            -662 |
|          StdEpRet |             407 |
|          MaxEpRet |            -185 |
|          MinEpRet |       -2.06e+03 |
|             EpLen |            24.7 |
|      AverageVVals |            -184 |
|          StdVVals |           0.477 |
|          MaxVVals |            -184 |
|          MinVVals |            -185 |
| TotalEnvInteracts |        2.71e+04 |
|            LossPi |          -0.161 |
|             LossV |        7.64e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |             -75 |
|           Entropy |           0.864 |
|                KL |       -1.19e-09 |
|              Time |            88.1 |
---------------------------------------




---------------------------------------
|             Epoch |             280 |
|      AverageEpRet |            -787 |
|          StdEpRet |             699 |
|          MaxEpRet |            -193 |
|          MinEpRet |       -2.87e+03 |
|             EpLen |            29.3 |
|      AverageVVals |            -188 |
|          StdVVals |            1.27 |
|          MaxVVals |            -186 |
|          MinVVals |            -190 |
| TotalEnvInteracts |        2.81e+04 |
|            LossPi |           -0.15 |
|             LossV |         7.7e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -91.5 |
|           Entropy |           0.862 |
|                KL |        3.58e-10 |
|              Time |            91.3 |
---------------------------------------




---------------------------------------
|             Epoch |             290 |
|      AverageEpRet |            -466 |
|          StdEpRet |             260 |
|          MaxEpRet |            -169 |
|          MinEpRet |       -1.45e+03 |
|             EpLen |            17.9 |
|      AverageVVals |            -189 |
|          StdVVals |            1.17 |
|          MaxVVals |            -187 |
|          MinVVals |            -191 |
| TotalEnvInteracts |        2.91e+04 |
|            LossPi |          -0.113 |
|             LossV |        7.52e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -47.9 |
|           Entropy |            0.86 |
|                KL |        2.32e-09 |
|              Time |            94.6 |
---------------------------------------




---------------------------------------
|             Epoch |             300 |
|      AverageEpRet |            -597 |
|          StdEpRet |             397 |
|          MaxEpRet |            -222 |
|          MinEpRet |       -1.67e+03 |
|             EpLen |            22.1 |
|      AverageVVals |            -187 |
|          StdVVals |           0.983 |
|          MaxVVals |            -186 |
|          MinVVals |            -188 |
| TotalEnvInteracts |        3.01e+04 |
|            LossPi |           -0.12 |
|             LossV |        7.29e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -33.4 |
|           Entropy |           0.858 |
|                KL |       -3.81e-09 |
|              Time |            97.9 |
---------------------------------------




---------------------------------------
|             Epoch |             310 |
|      AverageEpRet |            -538 |
|          StdEpRet |             371 |
|          MaxEpRet |            -210 |
|          MinEpRet |       -2.02e+03 |
|             EpLen |            21.5 |
|      AverageVVals |            -187 |
|          StdVVals |           0.828 |
|          MaxVVals |            -186 |
|          MinVVals |            -189 |
| TotalEnvInteracts |        3.11e+04 |
|            LossPi |          -0.159 |
|             LossV |        6.66e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -58.9 |
|           Entropy |           0.857 |
|                KL |       -4.89e-09 |
|              Time |             101 |
---------------------------------------




---------------------------------------
|             Epoch |             320 |
|      AverageEpRet |            -605 |
|          StdEpRet |             521 |
|          MaxEpRet |            -186 |
|          MinEpRet |       -2.64e+03 |
|             EpLen |            22.4 |
|      AverageVVals |            -184 |
|          StdVVals |               1 |
|          MaxVVals |            -183 |
|          MinVVals |            -186 |
| TotalEnvInteracts |        3.21e+04 |
|            LossPi |          -0.129 |
|             LossV |        6.65e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -56.4 |
|           Entropy |           0.855 |
|                KL |       -4.53e-09 |
|              Time |             104 |
---------------------------------------




---------------------------------------
|             Epoch |             330 |
|      AverageEpRet |            -483 |
|          StdEpRet |             325 |
|          MaxEpRet |            -254 |
|          MinEpRet |       -1.88e+03 |
|             EpLen |            17.5 |
|      AverageVVals |            -185 |
|          StdVVals |           0.765 |
|          MaxVVals |            -183 |
|          MinVVals |            -186 |
| TotalEnvInteracts |        3.31e+04 |
|            LossPi |          -0.121 |
|             LossV |        7.46e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |             -52 |
|           Entropy |           0.853 |
|                KL |       -9.54e-10 |
|              Time |             108 |
---------------------------------------




---------------------------------------
|             Epoch |             340 |
|      AverageEpRet |            -737 |
|          StdEpRet |             654 |
|          MaxEpRet |            -228 |
|          MinEpRet |       -2.78e+03 |
|             EpLen |            27.3 |
|      AverageVVals |            -190 |
|          StdVVals |            1.42 |
|          MaxVVals |            -188 |
|          MinVVals |            -193 |
| TotalEnvInteracts |        3.41e+04 |
|            LossPi |          -0.163 |
|             LossV |        6.69e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -67.9 |
|           Entropy |           0.851 |
|                KL |        1.07e-09 |
|              Time |             111 |
---------------------------------------




---------------------------------------
|             Epoch |             350 |
|      AverageEpRet |            -557 |
|          StdEpRet |             285 |
|          MaxEpRet |            -232 |
|          MinEpRet |       -1.34e+03 |
|             EpLen |            20.6 |
|      AverageVVals |            -191 |
|          StdVVals |           0.777 |
|          MaxVVals |            -190 |
|          MinVVals |            -192 |
| TotalEnvInteracts |        3.51e+04 |
|            LossPi |          -0.152 |
|             LossV |        6.87e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -42.7 |
|           Entropy |           0.849 |
|                KL |        -3.4e-09 |
|              Time |             114 |
---------------------------------------




---------------------------------------
|             Epoch |             360 |
|      AverageEpRet |            -693 |
|          StdEpRet |             528 |
|          MaxEpRet |            -215 |
|          MinEpRet |       -2.57e+03 |
|             EpLen |            26.6 |
|      AverageVVals |            -191 |
|          StdVVals |            1.34 |
|          MaxVVals |            -189 |
|          MinVVals |            -193 |
| TotalEnvInteracts |        3.61e+04 |
|            LossPi |          -0.198 |
|             LossV |        5.99e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -36.1 |
|           Entropy |           0.847 |
|                KL |       -5.96e-10 |
|              Time |             117 |
---------------------------------------




---------------------------------------
|             Epoch |             370 |
|      AverageEpRet |            -798 |
|          StdEpRet |             667 |
|          MaxEpRet |            -254 |
|          MinEpRet |       -3.18e+03 |
|             EpLen |            29.2 |
|      AverageVVals |            -193 |
|          StdVVals |            2.47 |
|          MaxVVals |            -190 |
|          MinVVals |            -196 |
| TotalEnvInteracts |        3.71e+04 |
|            LossPi |          -0.148 |
|             LossV |        1.07e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -134 |
|           Entropy |           0.844 |
|                KL |       -2.74e-09 |
|              Time |             121 |
---------------------------------------




---------------------------------------
|             Epoch |             380 |
|      AverageEpRet |            -710 |
|          StdEpRet |             699 |
|          MaxEpRet |            -178 |
|          MinEpRet |       -2.74e+03 |
|             EpLen |            27.1 |
|      AverageVVals |            -195 |
|          StdVVals |           0.707 |
|          MaxVVals |            -194 |
|          MinVVals |            -196 |
| TotalEnvInteracts |        3.81e+04 |
|            LossPi |          -0.204 |
|             LossV |        6.18e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -51.9 |
|           Entropy |           0.842 |
|                KL |        3.52e-09 |
|              Time |             124 |
---------------------------------------




---------------------------------------
|             Epoch |             390 |
|      AverageEpRet |            -626 |
|          StdEpRet |             613 |
|          MaxEpRet |            -179 |
|          MinEpRet |       -3.19e+03 |
|             EpLen |            23.2 |
|      AverageVVals |            -196 |
|          StdVVals |           0.766 |
|          MaxVVals |            -195 |
|          MinVVals |            -198 |
| TotalEnvInteracts |        3.91e+04 |
|            LossPi |          -0.165 |
|             LossV |        8.63e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -84.3 |
|           Entropy |           0.839 |
|                KL |       -3.81e-09 |
|              Time |             127 |
---------------------------------------




---------------------------------------
|             Epoch |             400 |
|      AverageEpRet |            -637 |
|          StdEpRet |             586 |
|          MaxEpRet |            -178 |
|          MinEpRet |       -2.41e+03 |
|             EpLen |              25 |
|      AverageVVals |            -195 |
|          StdVVals |           0.817 |
|          MaxVVals |            -193 |
|          MinVVals |            -196 |
| TotalEnvInteracts |        4.01e+04 |
|            LossPi |          -0.183 |
|             LossV |        6.53e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -35.7 |
|           Entropy |           0.837 |
|                KL |       -7.03e-09 |
|              Time |             130 |
---------------------------------------




---------------------------------------
|             Epoch |             410 |
|      AverageEpRet |            -529 |
|          StdEpRet |             433 |
|          MaxEpRet |            -197 |
|          MinEpRet |       -2.59e+03 |
|             EpLen |            20.4 |
|      AverageVVals |            -192 |
|          StdVVals |           0.946 |
|          MaxVVals |            -190 |
|          MinVVals |            -194 |
| TotalEnvInteracts |        4.11e+04 |
|            LossPi |          -0.184 |
|             LossV |        7.92e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -46.3 |
|           Entropy |           0.834 |
|                KL |       -1.97e-09 |
|              Time |             134 |
---------------------------------------




---------------------------------------
|             Epoch |             420 |
|      AverageEpRet |            -526 |
|          StdEpRet |             427 |
|          MaxEpRet |            -141 |
|          MinEpRet |       -2.63e+03 |
|             EpLen |              20 |
|      AverageVVals |            -190 |
|          StdVVals |            0.99 |
|          MaxVVals |            -188 |
|          MinVVals |            -191 |
| TotalEnvInteracts |        4.21e+04 |
|            LossPi |          -0.155 |
|             LossV |         7.6e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -40.7 |
|           Entropy |           0.832 |
|                KL |       -8.94e-10 |
|              Time |             137 |
---------------------------------------




---------------------------------------
|             Epoch |             430 |
|      AverageEpRet |            -720 |
|          StdEpRet |             664 |
|          MaxEpRet |            -180 |
|          MinEpRet |       -2.65e+03 |
|             EpLen |            28.5 |
|      AverageVVals |            -187 |
|          StdVVals |           0.909 |
|          MaxVVals |            -185 |
|          MinVVals |            -188 |
| TotalEnvInteracts |        4.31e+04 |
|            LossPi |         -0.0959 |
|             LossV |        6.26e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -69.2 |
|           Entropy |            0.83 |
|                KL |       -2.32e-09 |
|              Time |             140 |
---------------------------------------




---------------------------------------
|             Epoch |             440 |
|      AverageEpRet |            -536 |
|          StdEpRet |             450 |
|          MaxEpRet |            -199 |
|          MinEpRet |       -2.66e+03 |
|             EpLen |            20.4 |
|      AverageVVals |            -186 |
|          StdVVals |           0.952 |
|          MaxVVals |            -184 |
|          MinVVals |            -187 |
| TotalEnvInteracts |        4.41e+04 |
|            LossPi |          -0.191 |
|             LossV |        7.15e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -60.8 |
|           Entropy |           0.828 |
|                KL |        -2.5e-09 |
|              Time |             143 |
---------------------------------------




---------------------------------------
|             Epoch |             450 |
|      AverageEpRet |            -520 |
|          StdEpRet |             315 |
|          MaxEpRet |            -192 |
|          MinEpRet |       -1.94e+03 |
|             EpLen |              20 |
|      AverageVVals |            -184 |
|          StdVVals |            1.15 |
|          MaxVVals |            -182 |
|          MinVVals |            -185 |
| TotalEnvInteracts |        4.51e+04 |
|            LossPi |          -0.151 |
|             LossV |        6.54e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -35.5 |
|           Entropy |           0.826 |
|                KL |       -3.76e-09 |
|              Time |             147 |
---------------------------------------




---------------------------------------
|             Epoch |             460 |
|      AverageEpRet |            -572 |
|          StdEpRet |             576 |
|          MaxEpRet |            -199 |
|          MinEpRet |       -2.63e+03 |
|             EpLen |            21.9 |
|      AverageVVals |            -181 |
|          StdVVals |           0.848 |
|          MaxVVals |            -179 |
|          MinVVals |            -182 |
| TotalEnvInteracts |        4.61e+04 |
|            LossPi |          -0.194 |
|             LossV |        7.47e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -88.1 |
|           Entropy |           0.824 |
|                KL |       -2.62e-09 |
|              Time |             150 |
---------------------------------------




---------------------------------------
|             Epoch |             470 |
|      AverageEpRet |            -438 |
|          StdEpRet |             266 |
|          MaxEpRet |            -217 |
|          MinEpRet |       -1.89e+03 |
|             EpLen |            16.9 |
|      AverageVVals |            -180 |
|          StdVVals |            1.31 |
|          MaxVVals |            -178 |
|          MinVVals |            -182 |
| TotalEnvInteracts |        4.71e+04 |
|            LossPi |          -0.159 |
|             LossV |        6.96e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -42.6 |
|           Entropy |           0.821 |
|                KL |       -1.31e-09 |
|              Time |             153 |
---------------------------------------




---------------------------------------
|             Epoch |             480 |
|      AverageEpRet |            -539 |
|          StdEpRet |             444 |
|          MaxEpRet |            -210 |
|          MinEpRet |       -2.58e+03 |
|             EpLen |            21.2 |
|      AverageVVals |            -180 |
|          StdVVals |            1.64 |
|          MaxVVals |            -178 |
|          MinVVals |            -182 |
| TotalEnvInteracts |        4.81e+04 |
|            LossPi |          -0.134 |
|             LossV |        6.88e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -64.7 |
|           Entropy |           0.819 |
|                KL |       -8.34e-10 |
|              Time |             156 |
---------------------------------------




---------------------------------------
|             Epoch |             490 |
|      AverageEpRet |            -587 |
|          StdEpRet |             489 |
|          MaxEpRet |            -232 |
|          MinEpRet |       -2.43e+03 |
|             EpLen |            23.5 |
|      AverageVVals |            -181 |
|          StdVVals |           0.938 |
|          MaxVVals |            -179 |
|          MinVVals |            -182 |
| TotalEnvInteracts |        4.91e+04 |
|            LossPi |          -0.164 |
|             LossV |        5.34e+03 |
|       DeltaLossPi |               0 |
|        DeltaLossV |           -52.7 |
|           Entropy |           0.817 |
|                KL |        1.37e-09 |
|              Time |             160 |
---------------------------------------






In [17]:
agent = lambda env: torch.load('/host/final_project/model/vpg/vpg_s37/pyt_save/model.pt')
total_rewards = test_policy(agent)
print ('Average Reward across {:d} episodes is {:.2f}'.format(total_rewards[0].shape[0], np.mean(total_rewards[0])))
print ('Reward per step of the particle is {:.2f}'.format(np.mean(total_rewards[1])))

<IPython.core.display.Javascript object>

Average Reward across 10 episodes is -679.50
Reward per step of the particle is -26.03


# Bayes Explore

In [5]:
%matplotlib notebook

import argparse
import sys
from tqdm import trange

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)

import gym
from gym import wrappers, logger

from multigoal_env.multigoal import MultiGoal

from spinup.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
from agents import RandomAgent

import torch
import tree

env_id = 'MultiGoal-v0'
seed = 1234

In [6]:
hid = 10
num_layers = 2
gamma = 0.9
num_epochs = 5
cpu = 2

MAX_STEPS_PER_EPISODE = 100
steps = MAX_STEPS_PER_EPISODE
seed = 37

In [7]:
from algos.vpg_source.custom_actor_critic import BayesMLPActorCritic
from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs('bayes_vpg', seed, data_dir='model/')
from algos.vpg_source.vpg import vpg as vpg_pytorch

In [8]:
epoch_reward = vpg_pytorch(lambda : gym.make(env_id), actor_critic=BayesMLPActorCritic,
                            ac_kwargs=dict(hidden_sizes=[hid]*num_layers), gamma=gamma, 
                            seed=seed, steps_per_epoch=steps, epochs=num_epochs,
                            logger_kwargs=logger_kwargs, max_ep_len=MAX_STEPS_PER_EPISODE)

[32;1mLogging data to model/bayes_vpg/bayes_vpg_s37/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            10,
            10
        ]
    },
    "actor_critic":	"BayesMLPActorCritic",
    "env_fn":	"<function <lambda> at 0x7f783973b9e0>",
    "epochs":	5,
    "exp_name":	"bayes_vpg",
    "gamma":	0.9,
    "lam":	0.97,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7f783979ddd0>":	{
            "epoch_dict":	{},
            "exp_name":	"bayes_vpg",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"model/bayes_vpg/bayes_vpg_s37",
            "output_file":	{
                "<_io.TextIOWrapper name='model/bayes_vpg/bayes_vpg_s37/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                }
            }
        }
    },
    "logger_kwargs":	{
        "exp_name":	"bayes_vpg",
        "output_dir":	"model/bayes_vp



torch.Size([100, 2])
[BayesLinear(prior_mu=0.0, prior_sigma=0.1, in_features=2, out_features=10, bias=True), Tanh(), BayesLinear(prior_mu=0.0, prior_sigma=0.1, in_features=10, out_features=10, bias=True), Tanh(), BayesLinear(prior_mu=0.0, prior_sigma=0.1, in_features=10, out_features=1, bias=True), Identity()]


NameError: name 'sys' is not defined

In [7]:
agent = lambda env: torch.load('/host/final_project/model/bayes_vpg/bayes_vpg_s37/pyt_save/model.pt')
total_rewards = test_policy(agent)
print ('Average Reward across {:d} episodes is {:.2f}'.format(total_rewards[0].shape[0], np.mean(total_rewards[0])))
print ('Reward per step of the particle is {:.2f}'.format(np.mean(total_rewards[1])))



<IPython.core.display.Javascript object>

Average Reward across 10 episodes is -1208.56
Reward per step of the particle is -30.33
