# Exploration Test sandbox

In [1]:
%matplotlib notebook

import argparse
import sys
from tqdm import trange

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)

import gym
from gym import wrappers, logger

from multigoal_env.multigoal import MultiGoal

from spinup.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
from agents import RandomAgent

import torch
import tree

# Multi Goal Environment

## The code for the multigoal environment lives in the `multigoal_env` folder
## The environment is registered in `__init__.py` and defined in `multigoal.py`

#### About the multigoal environment: The multigoal environment is a 2D state space with multimodal rewards. In many applications, we want to learn a policy that achieves high reward via distinct trajectories. The multigoal environment is a simple test bed for evaluating RL algorithms in such multimodal-reward settings.



In [2]:
# Shared experiment configuration used by the training and evaluation cells.
seed = 1234                # seed handed to the logger, VPG and the evaluation env
env_id = 'MultiGoal-v0'    # gym id passed to gym.make

In [3]:
# Testing VPG

In [3]:
# These two modules were adapted so that VPG works with a
# discrete observation space.
from custom_actor_critic import MLPActorCritic
from vpg import vpg as vpg_pytorch
from spinup.utils.run_utils import setup_logger_kwargs

# Logger settings for the 'vpg' experiment; results are written below 'model/'.
logger_kwargs = setup_logger_kwargs(
    'vpg',
    seed,
    data_dir='model/',
)

In [8]:
# Episode / epoch budget. `steps` is what the training cell passes as
# steps_per_epoch, so one epoch holds at most one full-length episode.
MAX_STEPS_PER_EPISODE = 1000
steps = MAX_STEPS_PER_EPISODE
num_epochs = 1000

# Actor-critic network shape: `num_layers` hidden layers of `hid` units each.
num_layers = 2
hid = 10

# Discount factor.
gamma = 0.9

# NOTE(review): not referenced by any cell visible here — presumably a
# process count for MPI forking; confirm before relying on it.
cpu = 2

In [9]:
# Train VPG on the multigoal environment with the hyperparameters above.
# NOTE(review): `epoch_reward` holds whatever the adapted vpg() returns —
# presumably per-epoch reward statistics; confirm in vpg.py.
epoch_reward = vpg_pytorch(
    lambda : gym.make(env_id),
    actor_critic=MLPActorCritic,
    ac_kwargs={'hidden_sizes': [hid] * num_layers},
    gamma=gamma,
    seed=seed,
    steps_per_epoch=steps,
    epochs=num_epochs,
    max_ep_len=MAX_STEPS_PER_EPISODE,
    logger_kwargs=logger_kwargs,
)

[32;1mLogging data to model/vpg/vpg_s1234/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            10,
            10
        ]
    },
    "actor_critic":	"MLPActorCritic",
    "env_fn":	"<function <lambda> at 0x7fc0b43303b0>",
    "epochs":	1000,
    "exp_name":	"vpg",
    "gamma":	0.9,
    "lam":	0.97,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7fc1441a1650>":	{
            "epoch_dict":	{},
            "exp_name":	"vpg",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"model/vpg/vpg_s1234",
            "output_file":	{
                "<_io.TextIOWrapper name='model/vpg/vpg_s1234/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                }
            }
        }
    },
    "logger_kwargs":	{
        "exp_name":	"vpg",
        "output_dir":	"model/vpg/vpg_s1234"
    },
    "max_ep_len":	1000,
    "pi_lr":



---------------------------------------
|             Epoch |               0 |
|      AverageEpRet |       -8.87e+03 |
|          StdEpRet |        1.31e+04 |
|          MaxEpRet |            -377 |
|          MinEpRet |       -3.14e+04 |
|             EpLen |             168 |
|      AverageVVals |           0.104 |
|          StdVVals |          0.0567 |
|          MaxVVals |           0.521 |
|          MinVVals |         -0.0193 |
| TotalEnvInteracts |           1e+03 |
|            LossPi |        -0.00361 |
|             LossV |        3.19e+05 |
|       DeltaLossPi |               0 |
|        DeltaLossV |       -1.29e+03 |
|           Entropy |           0.919 |
|                KL |        -3.1e-09 |
|              Time |             2.8 |
---------------------------------------








































































































































































































---------------------------------------
|             Epoch |             500 |
|      AverageEpRet |            -415 |
|          StdEpRet |             833 |
|          MaxEpRet |            -122 |
|          MinEpRet |       -3.89e+04 |
|             EpLen |            14.1 |
|      AverageVVals |            -103 |
|          StdVVals |            49.6 |
|          MaxVVals |            11.4 |
|          MinVVals |            -201 |
| TotalEnvInteracts |        5.01e+05 |
|            LossPi |          -0.203 |
|             LossV |        2.41e+04 |
|       DeltaLossPi |               0 |
|        DeltaLossV |            -210 |
|           Entropy |           0.835 |
|                KL |        1.33e-10 |
|              Time |        1.37e+03 |
---------------------------------------










































































































































































































In [17]:
def process_sample(observation, action,reward, 
                    terminal,next_observation,info):
    """Bundle one environment transition into a dictionary.

    The reward and terminal flag are passed through ``np.atleast_1d`` so they
    are always arrays with at least one dimension; every other field is stored
    exactly as given.

    Returns:
        dict with keys 'observations', 'actions', 'rewards', 'terminals',
        'next_observations' and 'infos'.
    """
    return dict(
        observations=observation,
        actions=action,
        rewards=np.atleast_1d(reward),
        terminals=np.atleast_1d(terminal),
        next_observations=next_observation,
        infos=info,
    )

def test_policy(agent_fun, num_episodes=4, max_steps=100):
    """Roll out an agent in the `env_id` environment and summarise its rewards.

    A fresh environment is created with ``gym.make(env_id)`` and seeded with
    the notebook-level ``seed``. After every episode the accumulated
    trajectories are handed to ``env.render_rollouts``.

    Args:
        agent_fun: Callable that receives the environment and returns an
            object exposing ``act(obs_tensor)``; the observation is passed as
            a float32 torch tensor.
        num_episodes: Number of episodes to roll out (default 4).
        max_steps: Maximum number of environment steps per episode
            (default 100). An episode ends when the environment reports
            ``done`` or after exactly ``max_steps`` steps.

    Returns:
        Tuple ``(episode_sums, episode_means)`` of NumPy arrays with one entry
        per episode: the summed reward and the mean per-step reward. Both are
        empty arrays when ``num_episodes`` is zero or negative.

    Raises:
        ValueError: If ``max_steps`` is smaller than 1.
    """
    if max_steps < 1:
        raise ValueError('max_steps must be at least 1, got {}'.format(max_steps))

    env = gym.make(env_id)
    all_paths = []
    total_rewards = []

    # Close the environment even if the agent or the renderer raises.
    try:
        env.seed(seed)
        agent = agent_fun(env)

        for _ in range(num_episodes):
            current_ob = env.reset()
            current_path = []
            rewards = []
            path_length = 0

            while True:
                ob_tensor = torch.as_tensor(current_ob, dtype=torch.float32)
                action = agent.act(ob_tensor)

                next_ob, reward, done, info = env.step(action)

                current_path.append(
                    process_sample(observation=current_ob, action=action,
                                   reward=reward, terminal=done,
                                   next_observation=next_ob, info=info))
                rewards.append(reward)
                path_length += 1

                # `>=` so that an episode never exceeds max_steps steps.
                if done or path_length >= max_steps:
                    break
                current_ob = next_ob

            # Stack the per-step dicts into one dict of arrays for this path.
            last_path = tree.map_structure(lambda *x: np.stack(x, axis=0), *current_path)
            all_paths.append(last_path)
            env.render_rollouts(all_paths)

            total_rewards.append((np.sum(rewards), np.mean(rewards)))
    finally:
        env.close()

    if not total_rewards:
        # tree.map_structure needs at least one structure to map over.
        return (np.array([]), np.array([]))

    return tree.map_structure(lambda *x: np.stack(x, axis=0), *total_rewards)

In [25]:
# Baseline: evaluate an agent that samples from the environment's action space.
def agent(env):
    return RandomAgent(env.action_space)

total_rewards = test_policy(agent)
episode_returns = total_rewards[0]
step_rewards = total_rewards[1]
print('Average Reward across {:d} episodes is {:.2f}'.format(
    episode_returns.shape[0], np.mean(episode_returns)))
print('Reward per step of the particle is {:.2f}'.format(np.mean(step_rewards)))

<IPython.core.display.Javascript object>

Average Reward across 283 episodes is -1252.84
Reward per step of the particle is -33.16


In [26]:
# Evaluate the saved VPG model.
# The checkpoint is located via the logger's output directory instead of a
# machine-specific absolute path, so the cell works from any checkout as long
# as it runs from the same working directory the training run wrote to.
vpg_model_path = logger_kwargs['output_dir'] + '/pyt_save/model.pt'

# NOTE(review): torch.load unpickles the file and can execute arbitrary code —
# only load checkpoints produced by a trusted training run.
agent = lambda env: torch.load(vpg_model_path)
total_rewards = test_policy(agent)
print ('Average Reward across {:d} episodes is {:.2f}'.format(total_rewards[0].shape[0], np.mean(total_rewards[0])))
print ('Reward per step of the particle is {:.2f}'.format(np.mean(total_rewards[1])))

<IPython.core.display.Javascript object>

Average Reward across 25 episodes is -131.10
Reward per step of the particle is -37.32
