In [1]:
import gym
from gym.utils import seeding

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.tune.registry import register_env

### Environment Space Attributes
    Most environments have two special attributes: action_space, observation_space

    These contain instances of gym.spaces classes
    Makes it easy to find out what the valid states and actions are
    There is a convenient sample method to generate uniform random samples in the space.
#### gym.spaces
    Action spaces and State spaces are defined by instances of classes of the gym.spaces modules

    Included types are:
      gym.spaces.Discrete

      gym.spaces.MultiDiscrete

      gym.spaces.Box

      gym.spaces.Tuple
    All instances have a sample method which will sample random instances within the space
#### gym.spaces.Discrete
    The homework environments will use this type of space. Specifies a space containing n discrete points
    Each point is mapped to an integer in [0, n−1]
    Discrete(10): a space containing 10 items mapped to integers in [0, 9]; sample will return integers such as 0, 3, and 9.
#### gym.spaces.MultiDiscrete
    You will use this to implement an environment in the homework
    Specifies a space containing k dimensions, each with a separate number of discrete points.
    Each point in the space is represented by a vector of integers of length k
    MultiDiscrete([(1, 3), (0, 5)]): a space with k = 2 dimensions. The first dimension has 3 points mapped to integers in [1, 3]; the second dimension has 6 points mapped to integers in [0, 5]. sample will return a vector such as [2, 5] or [1, 3]
#### gym.spaces.Box
    Used for multidimensional continuous spaces with bounds
    You will see environments with these types of state and action spaces in future homeworks
    Box(np.array((-1.0, -2.0)), np.array((1.0, 2.0))): a 2D continuous state space. The first dimension has values in the range [−1.0, 1.0) and the second dimension has values in the range [−2.0, 2.0); sample will return a vector such as [−.55, 2.] or [.768, −1.55]

### Creating an Environment
#### gym.Env Class
    All environments should inherit from gym.Env
    At a minimum you must override a handful of methods:
    step()
    reset()
    At a minimum you must provide the following attributes action_space, observation_space
#### Subclass Methods
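    step has the same API as the step function used in the example
    reset has the same API as the reset function in the example
    You may also provide the following methods for additional functionality:

    render
    close
    seed
    (Older gym releases had subclasses implement underscore-prefixed hooks instead: _step, _reset, _render, _close, _configure and _seed. Current releases override the public methods directly, as the CustomEnv class in this notebook does.)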

#### Attributes
    observation_space represents the state space
    action_space represents the action space
    Both are instances of gym.spaces classes
    You can also provide a reward_range , but this defaults to (−∞,∞)
#### Registration
    How do you get your environment to work with gym.make()? You must register it
#### Registration Example
    from gym.envs.registration import register

    register(
      id='Deterministic-4x4-FrozenLake-v0',
      entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
      kwargs={'map_name': '4x4',
      'is_slippery': False})
    id: the environment name used with gym.make
    entry_point: module path and class name of environment
    kwargs: dictionary of keyword arguments to environment constructor
#### Discrete Environment Class
    A subclass of gym.Env which provides the following attributes
    nS: number of states
    nA: number of actions
    P: model of environment
    isd: initial state distribution


In [2]:
class CustomEnv(gym.Env):
    """One-dimensional corridor in which the agent walks left or right to a goal.

    Each episode starts on a random non-goal cell in ``[LF_MIN, RT_MAX - 1]``.
    Every step moves the agent one cell (or leaves it in place when the move
    would cross ``LF_MIN`` / ``RT_MAX``). The episode ends when the agent
    lands on ``goal`` or once ``MAX_STEPS`` actions have been taken,
    whichever comes first.

    The observation is simply the current position.
    """

    # possible actions
    MOVE_LF = 0
    MOVE_RT = 1

    # possible positions
    LF_MIN = 1
    RT_MAX = 10

    # land on the GOAL position within MAX_STEPS steps
    MAX_STEPS = 10

    # possible rewards
    REWARD_AWAY = -2
    REWARD_STEP = -1
    REWARD_GOAL = MAX_STEPS

    metadata = {
        "render.modes": ["human"]
    }

    def __init__(self, config=None):
        """Build the action/observation spaces and start the first episode.

        Parameters
        ----------
        config : object, optional
            Accepted so the class can be constructed with an environment
            config argument; this environment does not read it.
        """
        # the action space ranges [0, 1] where:
        #  `0` move left
        #  `1` move right
        self.action_space = gym.spaces.Discrete(2)

        # NB: Ray throws exceptions for any `0` value Discrete
        # observations so we'll make position a 1's based value
        self.observation_space = gym.spaces.Discrete(self.RT_MAX + 1)

        # the goal sits in the middle of the corridor
        self.goal = int((self.LF_MIN + self.RT_MAX - 1) / 2)

        # possible positions to choose on `reset()`: every cell in
        # [LF_MIN, RT_MAX - 1] except the goal itself
        self.init_positions = list(range(self.LF_MIN, self.RT_MAX))
        self.init_positions.remove(self.goal)

        # NB: pass a fixed seed here to guarantee the sequence of
        # pseudorandom numbers (e.g., for debugging)
        self.seed()

        # `reset()` needs `self.np_random`, so it must run after `seed()`
        self.reset()

    def reset(self):
        """Start a new episode and return its initial observation.

        Returns
        -------
        observation : int
            The starting position, drawn at random from ``init_positions``.
        """
        self.position = self.np_random.choice(self.init_positions)
        self.count = 0

        # for this environment, state is simply the position
        self.state = self.position
        self.reward = 0
        self.done = False
        self.info = {}

        return self.state

    def _move(self, delta):
        """Apply a one-cell move of ``delta`` (-1 or +1) and set the reward."""
        target = self.position + delta

        if target < self.LF_MIN or target > self.RT_MAX:
            # invalid: would leave the corridor, so stay put and penalise
            self.reward = self.REWARD_AWAY
            return

        dist_before = abs(self.goal - self.position)
        self.position = target

        if self.position == self.goal:
            # on goal now
            self.reward = self.REWARD_GOAL
            self.done = True
        elif abs(self.goal - self.position) > dist_before:
            # moving away from goal
            self.reward = self.REWARD_AWAY
        else:
            # moving toward goal
            self.reward = self.REWARD_STEP

    def step(self, action):
        """Advance the environment by one action.

        Parameters
        ----------
        action : int
            ``MOVE_LF`` (0) or ``MOVE_RT`` (1).

        Returns
        -------
        observation, reward, done, info : tuple
            observation (int) :
                the agent's position after the move.
            reward (int) :
                ``REWARD_GOAL`` on reaching the goal, ``REWARD_STEP`` when
                moving toward it, ``REWARD_AWAY`` when moving away from it
                or attempting to leave the corridor.
            done (bool) :
                True once the goal is reached or ``MAX_STEPS`` actions have
                been taken in this episode.
            info (dict) :
                ``{"dist": goal - position}``, for debugging only.

        Raises
        ------
        AssertionError
            If ``action`` is not contained in ``action_space``.
        """
        if self.done:
            # code should never reach this point: the caller stepped an
            # episode that has already finished. The last transition is
            # returned unchanged.
            print("EPISODE DONE!!!")
        else:
            assert self.action_space.contains(action), \
                "invalid action: {!r}".format(action)
            self.count += 1

            if action == self.MOVE_LF:
                self._move(-1)
            elif action == self.MOVE_RT:
                self._move(+1)

            self.state = self.position
            self.info["dist"] = self.goal - self.position

            # Time limit: the MAX_STEPS-th action itself ends the episode, so
            # no extra no-op step is needed and the final transition carries
            # the reward of the move that was actually taken.
            if not self.done and self.count >= self.MAX_STEPS:
                self.done = True

        # explicit check rather than `assert`, so it still runs under `-O`
        if not self.observation_space.contains(self.state):
            print("INVALID STATE", self.state)

        return (self.state, self.reward, self.done, self.info)

    def render(self, mode="human"):
        """Print the current position, last reward and info dict.

        Parameters
        ----------
        mode : str
            Only "human" is listed in ``metadata``; the value is not
            inspected and output always goes to stdout.
        """
        s = "position: {:2d}  reward: {:2d}  info: {}"
        print(s.format(self.state, self.reward, self.info))

    def seed(self, seed=None):
        """Seed this environment's random number generator.

        Parameters
        ----------
        seed : int, optional
            Seed forwarded to ``gym.utils.seeding.np_random``.

        Returns
        -------
        list
            A one-element list holding the seed value reported by
            ``seeding.np_random``.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        """Release resources held by the environment (nothing to do here)."""
        pass

In [3]:
# Start Ray for this notebook session; the returned dict (shown below)
# describes the node that was brought up.
ray.init()

{'node_ip_address': '172.16.35.94',
 'raylet_ip_address': '172.16.35.94',
 'redis_address': '172.16.35.94:46312',
 'object_store_address': '/tmp/ray/session_2021-10-15_14-41-51_900331_1008/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-10-15_14-41-51_900331_1008/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-10-15_14-41-51_900331_1008',
 'metrics_export_port': 50367,
 'node_id': '0164695434c0689e41765ad9152be00ecfb71aabe528847ddac76c8c'}

### Using train: 

In [7]:
def main(iters=20, path='checkpoint'):
    """Train PPO on ``CustomEnv`` for a fixed number of iterations.

    A checkpoint is written after every training iteration.

    Parameters
    ----------
    iters : int
        Number of ``trainer.train()`` iterations to run. Values <= 0 run
        no iterations.
    path : str
        Directory handed to ``trainer.save()`` for the checkpoints.

    Returns
    -------
    The value returned by the last ``trainer.save(path)`` call, or ``None``
    when no iteration was run.
    """
    config = {
        "env": CustomEnv,
        "num_gpus": 0,     # CPU-only training
        "num_workers": 5,  # parallelism
    }

    trainer = ppo.PPOTrainer(env=CustomEnv, config=config)

    # A bounded loop: the previous `n=+1` re-assigned n to 1 on every pass
    # instead of incrementing it, so the loop never ended for iters > 1.
    chkpt = None
    for _ in range(iters):
        trainer.train()
        chkpt = trainer.save(path)

    return chkpt
    
# Run the training loop when this cell/module is executed directly.
if __name__ == "__main__":
    main()

### Use Tune: 

In [5]:
# Stopping thresholds handed to Tune.
# NOTE(review): a single goal reward is REWARD_GOAL = 10, so a mean episode
# reward of 100 looks unreachable; in practice the iteration cap is what
# ends each trial. Confirm the intended threshold.
stop = dict(
    training_iteration=10,
    episode_reward_mean=100,
)

# Trainer settings shared by every sampled trial.
config = dict(
    env=CustomEnv,
    num_gpus=0,     # CPU only
    num_workers=1,  # parallelism
)

tune.run(
    "PPO",
    config=config,
    stop=stop,
    num_samples=2,
)

Trial name,status,loc
PPO_CustomEnv_114a8_00000,PENDING,
PPO_CustomEnv_114a8_00001,PENDING,


[2m[36m(pid=1248)[0m 2021-10-15 14:42:08,170	INFO ppo.py:165 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=1248)[0m 2021-10-15 14:42:08,171	INFO trainer.py:760 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1259)[0m 2021-10-15 14:42:08,147	INFO ppo.py:165 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=1259)[0m 2021-10-15 14:42:08,148	INFO trainer.py:760 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2021-10-15 14:42:10,049	ERROR trial_runner.py:846 -- Trial PPO_CustomEnv_114a8_00000: Error processing event.
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/p

Result for PPO_CustomEnv_114a8_00000:
  {}
  
Result for PPO_CustomEnv_114a8_00001:
  {}
  


Trial name,status,loc
PPO_CustomEnv_114a8_00000,ERROR,
PPO_CustomEnv_114a8_00001,ERROR,

Trial name,# failures,error file
PPO_CustomEnv_114a8_00000,1,/home/ec2-user/ray_results/PPO/PPO_CustomEnv_114a8_00000_0_2021-10-15_14-42-06/error.txt
PPO_CustomEnv_114a8_00001,1,/home/ec2-user/ray_results/PPO/PPO_CustomEnv_114a8_00001_1_2021-10-15_14-42-06/error.txt


[2m[36m(pid=1248)[0m 2021-10-15 14:42:10,050	ERROR worker.py:428 -- Exception raised in creation task: The actor died because of an error raised in its creation task, [36mray::PPO.__init__()[39m (pid=1248, ip=172.16.35.94)
[2m[36m(pid=1248)[0m   File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/ray/rllib/agents/trainer_template.py", line 137, in __init__
[2m[36m(pid=1248)[0m     Trainer.__init__(self, config, env, logger_creator)
[2m[36m(pid=1248)[0m   File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/ray/rllib/agents/trainer.py", line 611, in __init__
[2m[36m(pid=1248)[0m     super().__init__(config, logger_creator)
[2m[36m(pid=1248)[0m   File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/ray/tune/trainable.py", line 106, in __init__
[2m[36m(pid=1248)[0m     self.setup(copy.deepcopy(self.config))
[2m[36m(pid=1248)[0m   File "/home/ec2-user/anaconda3/envs/pytorch_p36/lib/python3.6/sit

TuneError: ('Trials did not complete', [PPO_CustomEnv_114a8_00000, PPO_CustomEnv_114a8_00001])