In [1]:
import envpool
# import gymnasium as gym
import gym

In [2]:
import argparse
import os
import random
import time
from collections import deque
from distutils.util import strtobool

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

  if not hasattr(tensorboard, "__version__") or LooseVersion(


In [4]:
from typing import Any, Dict, Optional, Tuple
from gym.spaces import Box, Discrete


In [236]:
class FlattenObsWrapper(gym.Wrapper):
    def __init__(self, env):
        if isinstance(type(env.observation_space), Dict):
            size, highs, lows = self.get_obs_space(env.observation_space)
            self.observation_space = Box(low=lows, high=highs)
        
    def reset(self, **kwargs):
        observations = super().reset(**kwargs)
        if isinstance(type(observations, Dict)):
            observations =  self.flatten_dict(observations)
        return observations
    
    def step(self, action):
        observations, rewards, dones, infos = super().step(action)
        if isinstance(type(observations, Dict)):
            observations = self.flatten_dict(observations)
        return (
            observations,
            rewards,
            dones,
            infos,
        )
    
    def flatten_dict(self, obs):
        obs_pieces = []
        for v in obs.values():
            flat = np.array([v]) if np.isscalar(v) else v.ravel()
            obs_pieces.append(flat)
        return np.concatenate(obs_pieces, axis=-1)

    def get_obs_space(self, observation_space):
        shapes = []
        highs = []
        lows = []
        for key, box in observation_space.items():
            shapes.append(box.shape)
            highs.append(box.high)
            lows.append(box.low)
        size = (np.sum(shapes, dtype=np.int32))
        highs = np.concatenate(highs, axis=-1)
        lows = np.concatenate(lows, axis=-1)        
        return size, highs, lows
    
class RecordEpisodeStatistics(gym.Wrapper):
    def __init__(self, env, deque_size=100):
        super().__init__(env)
        self.num_envs = getattr(env, "num_envs", 1)
        self.episode_returns = None
        self.episode_lengths = None

    def reset(self, **kwargs):
        observations = super().reset(**kwargs)
        self.episode_returns = np.zeros(self.num_envs, dtype=np.float32)
        self.episode_lengths = np.zeros(self.num_envs, dtype=np.int32)
        self.lives = np.zeros(self.num_envs, dtype=np.int32)
        self.returned_episode_returns = np.zeros(self.num_envs, dtype=np.float32)
        self.returned_episode_lengths = np.zeros(self.num_envs, dtype=np.int32)
        return observations

    def step(self, action):
        observations, rewards, dones, infos = super().step(action)
        self.episode_returns += rewards #infos["reward"]
        self.episode_lengths += 1
        self.returned_episode_returns[:] = self.episode_returns
        self.returned_episode_lengths[:] = self.episode_lengths
        self.episode_returns *= 1 - dones # infos["terminated"]
        self.episode_lengths *= 1 - dones # infos["terminated"]
        infos["r"] = self.returned_episode_returns
        infos["l"] = self.returned_episode_lengths
        return (
            observations,
            rewards,
            dones,
            infos,
        )

In [237]:
from typing import Any, Dict, Optional, Tuple, OrderedDict
from gym.spaces.dict import Dict as GymDict

In [255]:
class FlattenRecordEpisodeStatistics(gym.Wrapper):
    def __init__(self, env, deque_size=100):
        super().__init__(env)
        # print(f"env obs space type:{type(env.observation_space)}")
        if isinstance(env.observation_space, GymDict) or \
            isinstance(env.observation_space, Dict) or \
            isinstance(env.observation_space, OrderedDict):
        # if type(env.observation_space) == GymDict or \
            # type(env.observation_space) == Dict or \
            # type(env.observation_space) == dict or \
            #  type(env.observation_space) == OrderedDict :
            size, highs, lows = self.get_obs_space(env.observation_space)
            self.observation_space = Box(low=lows, high=highs)
            # print("observation is changed")
        self.num_envs = getattr(env, "num_envs", 1)
        self.episode_returns = None
        self.episode_lengths = None

    def reset(self, **kwargs):
        observations = super().reset(**kwargs)
        # if type(observations) == dict or \
        #     type(observations) == Dict or \
        #     type(observations) == GymDict or \
        #           type(observations) == OrderedDict: 
        if isinstance(observations, Dict) or \
            isinstance(observations, GymDict) or \
                isinstance(observations, OrderedDict):
            observations =  self.flatten_dict(observations)
        self.episode_returns = np.zeros(self.num_envs, dtype=np.float32)
        self.episode_lengths = np.zeros(self.num_envs, dtype=np.int32)
        self.lives = np.zeros(self.num_envs, dtype=np.int32)
        self.returned_episode_returns = np.zeros(self.num_envs, dtype=np.float32)
        self.returned_episode_lengths = np.zeros(self.num_envs, dtype=np.int32)
        return observations

    def step(self, action):
        observations, rewards, dones, infos = super().step(action)
        # if type(observations) == dict or \
            # type(observations) == GymDict or \
            # type(observations) == Dict or \
            #  type(observations) == OrderedDict :
        if isinstance(observations, Dict) or \
            isinstance(observations, GymDict) or \
                isinstance(observations, OrderedDict):
            observations = self.flatten_dict(observations)
        self.episode_returns += rewards #infos["reward"]
        self.episode_lengths += 1
        self.returned_episode_returns[:] = self.episode_returns
        self.returned_episode_lengths[:] = self.episode_lengths
        self.episode_returns *= 1 - dones # infos["terminated"]
        self.episode_lengths *= 1 - dones # infos["terminated"]
        infos["r"] = self.returned_episode_returns
        infos["l"] = self.returned_episode_lengths
        return (
            observations,
            rewards,
            dones,
            infos,
        )
        
    def flatten_dict(self, obs):
        obs_pieces = []
        for v in obs.values():
            flat = np.array([v]) if np.isscalar(v) else v.ravel()
            obs_pieces.append(flat)
        return np.concatenate(obs_pieces, axis=-1)

    def get_obs_space(self, observation_space):
        shapes = []
        highs = []
        lows = []
        for key, box in observation_space.items():
            if len(box.shape) == 0:
                shapes.append(1)
                # highs.append(np.expand_dims(box.high, axis=-1))
                # lows.append(np.expand_dims(box.low, axis=-1))
                highs += [*box.high]
                lows += [*box.low]
            elif len(box.shape) == 1:
                shapes += [*box.shape]
                highs += [*box.high]
                lows += [*box.low]
            elif len(box.shape) > 1:
                print(box)
                shapes += [np.prod(box.shape)]
                highs += [*(box.high.reshape(-1))]
                lows += [*(box.low.reshape(-1))]
        print(shapes)
        print(highs)
        print(lows)
        size = (np.sum(shapes, dtype=np.int32))
        highs = np.array(highs)
        lows = np.array(lows)        
        return size, highs, lows



In [256]:
envpool.list_all_envs()

['Adventure-v5',
 'AirRaid-v5',
 'Alien-v5',
 'Amidar-v5',
 'Assault-v5',
 'Asterix-v5',
 'Asteroids-v5',
 'Atlantis-v5',
 'Atlantis2-v5',
 'Backgammon-v5',
 'BankHeist-v5',
 'BasicMath-v5',
 'BattleZone-v5',
 'BeamRider-v5',
 'Berzerk-v5',
 'Blackjack-v5',
 'Bowling-v5',
 'Boxing-v5',
 'Breakout-v5',
 'Carnival-v5',
 'Casino-v5',
 'Centipede-v5',
 'ChopperCommand-v5',
 'CrazyClimber-v5',
 'Crossbow-v5',
 'Darkchambers-v5',
 'Defender-v5',
 'DemonAttack-v5',
 'DonkeyKong-v5',
 'DoubleDunk-v5',
 'Earthworld-v5',
 'ElevatorAction-v5',
 'Enduro-v5',
 'Entombed-v5',
 'Et-v5',
 'FishingDerby-v5',
 'FlagCapture-v5',
 'Freeway-v5',
 'Frogger-v5',
 'Frostbite-v5',
 'Galaxian-v5',
 'Gopher-v5',
 'Gravitar-v5',
 'Hangman-v5',
 'HauntedHouse-v5',
 'Hero-v5',
 'HumanCannonball-v5',
 'IceHockey-v5',
 'Jamesbond-v5',
 'JourneyEscape-v5',
 'Kaboom-v5',
 'Kangaroo-v5',
 'KeystoneKapers-v5',
 'KingKong-v5',
 'Klax-v5',
 'Koolaid-v5',
 'Krull-v5',
 'KungFuMaster-v5',
 'LaserGates-v5',
 'LostLuggage-v5',

In [257]:
# env_ids = ["CartPole-v1", "Hopper-v4"]
env_ids = ['ManipulatorBringBall-v1', ]#'HumanoidCMUStand-v1']


In [258]:
def flatten_dict(dictionary, parent_key='', sep='_'):
    flattened_list = []
    for key, value in dictionary.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(value, dict):
            flattened_dict.update(flatten_dict(value, new_key, sep))
        else:
            flattened_dict[new_key] = value
    return flattened_dict

In [259]:
nested_dict = {
    'key1': 1,
    'key2': {
        'subkey1': 2,
        'subkey2': {
            'subsubkey1': 3
        }
    },
    'key3': 4
}

flattened_dict = flatten_dict(nested_dict)
print(flattened_dict)

{'key1': 1, 'key2_subkey1': 2, 'key2_subkey2_subsubkey1': 3, 'key3': 4}


In [260]:
original_env = envpool.make(
                        env_ids[0],
                        env_type="gym",
                        num_envs=10,
                        seed=3142,
                    )

In [261]:
original_env.observation_space

Dict(arm_pos:Box(2.2250738585072014e-308, 1.7976931348623157e+308, (8, 2), float64), arm_vel:Box(2.2250738585072014e-308, 1.7976931348623157e+308, (8,), float64), touch:Box(2.2250738585072014e-308, 1.7976931348623157e+308, (5,), float64), hand_pos:Box(2.2250738585072014e-308, 1.7976931348623157e+308, (4,), float64), object_pos:Box(2.2250738585072014e-308, 1.7976931348623157e+308, (4,), float64), object_vel:Box(2.2250738585072014e-308, 1.7976931348623157e+308, (3,), float64), target_pos:Box(2.2250738585072014e-308, 1.7976931348623157e+308, (4,), float64))

In [262]:
for key, box in original_env.observation_space.items():
    print(box)
    break

Box(2.2250738585072014e-308, 1.7976931348623157e+308, (8, 2), float64)


In [263]:
box.shape

(8, 2)

In [264]:
box.high

array([[1.79769313e+308, 1.79769313e+308],
       [1.79769313e+308, 1.79769313e+308],
       [1.79769313e+308, 1.79769313e+308],
       [1.79769313e+308, 1.79769313e+308],
       [1.79769313e+308, 1.79769313e+308],
       [1.79769313e+308, 1.79769313e+308],
       [1.79769313e+308, 1.79769313e+308],
       [1.79769313e+308, 1.79769313e+308]])

In [265]:
box.low

array([[2.22507386e-308, 2.22507386e-308],
       [2.22507386e-308, 2.22507386e-308],
       [2.22507386e-308, 2.22507386e-308],
       [2.22507386e-308, 2.22507386e-308],
       [2.22507386e-308, 2.22507386e-308],
       [2.22507386e-308, 2.22507386e-308],
       [2.22507386e-308, 2.22507386e-308],
       [2.22507386e-308, 2.22507386e-308]])

In [266]:
reshaped_low = box.low.reshape(-1)
reshaped_low

array([2.22507386e-308, 2.22507386e-308, 2.22507386e-308, 2.22507386e-308,
       2.22507386e-308, 2.22507386e-308, 2.22507386e-308, 2.22507386e-308,
       2.22507386e-308, 2.22507386e-308, 2.22507386e-308, 2.22507386e-308,
       2.22507386e-308, 2.22507386e-308, 2.22507386e-308, 2.22507386e-308])

In [267]:
print(*reshaped_low)

2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308 2.2250738585072014e-308


In [268]:
original_env.action_space

Box(-1.0, 1.0, (5,), float64)

In [269]:
test_different_envs = []
for env_id in env_ids:
    test_envs = envpool.make(
                        env_id,
                        env_type="gym",
                        num_envs=10,
                        seed=3142,
                    )
    test_envs.num_envs = 10
    # test_envs = RecordEpisodeStatistics(test_envs)
    test_envs = FlattenRecordEpisodeStatistics(test_envs)
    test_different_envs.append(test_envs)

Box(2.2250738585072014e-308, 1.7976931348623157e+308, (8, 2), float64)
[16, 8, 5, 4, 4, 3, 4]
[1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.7976931348623157e+308, 1.797

In [270]:
test_envs_returns = dict()
test_envs_lengths = dict()

In [271]:
test_envs_returns = dict()
for i, test_envs in enumerate(test_different_envs):
    ith_test_episodic_returns = []
    ith_test_episodic_lengths = []
    next_obs = test_envs.reset()
    next_obs = torch.Tensor(next_obs)
    dones = torch.zeros(1)
    while len(ith_test_episodic_returns) < 10:
        actions = np.array([test_envs.action_space.sample() for i in range(10)])
        next_obs, reward, dones, info = test_envs.step(actions)
        # print(info)
        if True in dones:
            done_env_indices = np.where(dones==True)
            total_return = info['r'][done_env_indices]
            total_length = info['l'][done_env_indices]
            ith_test_episodic_returns += [*total_return]
            ith_test_episodic_lengths += [*total_length]
    ith_test_episodic_returns = np.array(ith_test_episodic_returns)
    test_envs_returns[env_ids[i]] = ith_test_episodic_returns

In [272]:
test_envs_returns

{'ManipulatorBringBall-v1': array([0.0000000e+00, 2.1965659e-34, 4.9636367e-07, 6.0298610e+00,
        6.1756853e-22, 0.0000000e+00, 6.0298610e+00, 9.1594962e-35,
        0.0000000e+00, 0.0000000e+00], dtype=float32)}

: 

In [59]:
test_envs_returns["CartPole-v1"].mean()


20.9

In [55]:
ith_test_episodic_returns

[12.026357,
 13.051925,
 12.034881,
 13.6669035,
 11.130557,
 15.382595,
 23.922834,
 7.8448334,
 9.577665,
 22.375751]

In [56]:
len(ith_test_episodic_returns)

10

(array([0]),)

In [34]:
dones

array([False])