In [1]:
import gym
from gym.utils import seeding

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.tune.registry import register_env

Instructions for updating:
non-resource variables are not supported in the long term


### Environment Space Attributes
    Most environments have two special attributes: action_space, observation_space

    These contain instances of gym.spaces classes
    Makes it easy to find out what are valid states and actions I
    There is a convenient sample method to generate uniform random samples in the space.
#### gym.spaces
    Action spaces and State spaces are defined by instances of classes of the gym.spaces modules

    Included types are:
      gym.spaces.Discrete

      gym.spaces.MultiDiscrete

      gym.spaces.Box

      gym.spaces.Tuple
    All instances have a sample method which will sample random instances within the space
#### gym.spaces.Discrete
    The homework environments will use this type of space Specifies a space containing n discrete points
    Each point is mapped to an integer from [0 ,n−1]
    Discrete(10) A space containing 10 items mapped to integers in [0,9] sample will return integers such as 0, 3, and 9.
#### gym.spaces.MultiDiscrete
    You will use this to implement an environment in the homework
    Species a space containing k dimensions each with a separate number of discrete points.
    Each point in the space is represented by a vector of integers of length k
    MultiDiscrete([(1, 3), (0, 5)]) A space with k= 2 dimensions First dimension has 4 points mapped to integers in [1,3] Second dimension has 6 points mapped to integers in [0,5] sample will return a vector such as [2,5] and [1,3]
#### gym.spaces.Box
    Used for multidimensional continuous spaces with bounds
    You will see environments with these types of state and action spaces in future homeworks
    Box(np.array((-1.0, -2.0)), np.array((1.0, 2.0))) A 2D continous state spaceI First dimension has values in range [−1.0,1.0) Second dimension has values in range [−2.0,2.0) sample will return a vector such as [−.55,2.] and [.768,−1.55]

### Creating an Environment
#### gym.Env Class
    All environments should inherit from gym.Env
    At a minimum you must override a handful of methods:
    step()
    reset()
    At a minimum you must provide the following attributes action_space, observation_space
#### Subclass Methods
    _step is the same api as the step function used in the example
    _reset is the same api as the reset function in the example
    You may also provide the following methods for additionalfunctionality:

    _render
    _close
    _configure
    _seed

#### Attributes
    observation_space represents the state space
    action_space represents the action space
    Both are instances of gym.spaces classes
    You can also provide a reward_range , but this defaults to (−∞,∞)
#### Registration
    How do you get your environment to work with gym.make()? You must register it
#### Registration Example
    from gym.envs.registration import register

    register(
      id='Deterministic-4x4-FrozenLake-v0',
      entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
      kwargs={'map_name': '4x4',
      'is_slippery': False})
    id: the environment name used with gym.make
    entry_point: module path and class name of environment
    kwargs: dictionary of keyword arguments to environment constructor
#### Discrete Environment Class
    A subclass of the gym.Env which provides the followingattributes
    nS: number of states
    nA: number of actions
    P: model of environment
    isd: initial state distribution


In [2]:
class CustomEnv (gym.Env):
    # possible actions
    MOVE_LF = 0
    MOVE_RT = 1

    # possible positions
    LF_MIN = 1
    RT_MAX = 10

    # land on the GOAL position within MAX_STEPS steps
    MAX_STEPS = 10

    # possible rewards
    REWARD_AWAY = -2
    REWARD_STEP = -1
    REWARD_GOAL = MAX_STEPS

    metadata = {
        "render.modes": ["human"]
        }


    def __init__ (self, config):
        # the action space ranges [0, 1] where:
        #  `0` move left
        #  `1` move right
        self.action_space = gym.spaces.Discrete(2)

        # NB: Ray throws exceptions for any `0` value Discrete
        # observations so we'll make position a 1's based value
        self.observation_space = gym.spaces.Discrete(self.RT_MAX + 1)

        # possible positions to chose on `reset()`
        self.goal = int((self.LF_MIN + self.RT_MAX - 1) / 2)

        self.init_positions = list(range(self.LF_MIN, self.RT_MAX))
        self.init_positions.remove(self.goal)

        # NB: change to guarantee the sequence of pseudorandom numbers
        # (e.g., for debugging)
        self.seed()

        self.reset()


    def reset (self):
        """
        Reset the state of the environment and returns an initial observation.
        Returns
        -------
        observation (object): the initial observation of the space.
        """
        self.position = self.np_random.choice(self.init_positions)
        self.count = 0

        # for this environment, state is simply the position
        self.state = self.position
        self.reward = 0
        self.done = False
        self.info = {}

        return self.state


    def step (self, action):
        """
        The agent takes a step in the environment.
        Parameters
        ----------
        action : Discrete
        Returns
        -------
        observation, reward, done, info : tuple
            observation (object) :
                an environment-specific object representing your observation of
                the environment.
            reward (float) :
                amount of reward achieved by the previous action. The scale
                varies between environments, but the goal is always to increase
                your total reward.
            done (bool) :
                whether it's time to reset the environment again. Most (but not
                all) tasks are divided up into well-defined episodes, and done
                being True indicates the episode has terminated. (For example,
                perhaps the pole tipped too far, or you lost your last life.)
            info (dict) :
                 diagnostic information useful for debugging. It can sometimes
                 be useful for learning (for example, it might contain the raw
                 probabilities behind the environment's last state change).
                 However, official evaluations of your agent are not allowed to
                 use this for learning.
        """
        if self.done:
            # code should never reach this point
            print("EPISODE DONE!!!")

        elif self.count == self.MAX_STEPS:
            self.done = True;

        else:
            assert self.action_space.contains(action)
            self.count += 1

            if action == self.MOVE_LF:
                if self.position == self.LF_MIN:
                    # invalid
                    self.reward = self.REWARD_AWAY
                else:
                    self.position -= 1

                    if self.position == self.goal:
                        # on goal now
                        self.reward = self.REWARD_GOAL
                        self.done = 1
                    elif self.position < self.goal:
                        # moving away from goal
                        self.reward = self.REWARD_AWAY
                    else:
                        # moving toward goal
                        self.reward = self.REWARD_STEP

            elif action == self.MOVE_RT:
                if self.position == self.RT_MAX:
                    # invalid
                    self.reward = self.REWARD_AWAY
                else:
                    self.position += 1

                    if self.position == self.goal:
                        # on goal now
                        self.reward = self.REWARD_GOAL
                        self.done = 1
                    elif self.position > self.goal:
                        # moving away from goal
                        self.reward = self.REWARD_AWAY
                    else:
                        # moving toward goal
                        self.reward = self.REWARD_STEP

            self.state = self.position
            self.info["dist"] = self.goal - self.position

        try:
            assert self.observation_space.contains(self.state)
        except AssertionError:
            print("INVALID STATE", self.state)

        return [self.state, self.reward, self.done, self.info]


    def render (self, mode="human"):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.) By convention,
        if mode is:
        - human: render to the current display or terminal and
          return nothing. Usually for human consumption.
        - rgb_array: Return an numpy.ndarray with shape (x, y, 3),
          representing RGB values for an x-by-y pixel image, suitable
          for turning into a video.
        - ansi: Return a string (str) or StringIO.StringIO containing a
          terminal-style text representation. The text can include newlines
          and ANSI escape sequences (e.g. for colors).
        Note:
            Make sure that your class's metadata 'render.modes' key includes
              the list of supported modes. It's recommended to call super()
              in implementations to use the functionality of this method.
        Args:
            mode (str): the mode to render with
        """
        s = "position: {:2d}  reward: {:2d}  info: {}"
        print(s.format(self.state, self.reward, self.info))


    def seed (self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Note:
            Some environments use multiple pseudorandom number generators.
            We want to capture all such seeds used in order to ensure that
            there aren't accidental correlations between multiple generators.
        Returns:
            list<bigint>: Returns the list of seeds used in this env's random
              number generators. The first value in the list should be the
              "main" seed, or the value which a reproducer should pass to
              'seed'. Often, the main seed equals the provided 'seed', but
              this won't be true if seed=None, for example.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]


    def close (self):
        """Override close in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        pass

In [3]:
ray.init()

File descriptor limit 256 is too low for production servers and may result in connection errors. At least 8192 is recommended. --- Fix with 'ulimit -n 8192'
2021-02-16 14:15:12,225	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


{'node_ip_address': '192.168.0.23',
 'raylet_ip_address': '192.168.0.23',
 'redis_address': '192.168.0.23:39480',
 'object_store_address': '/tmp/ray/session_2021-02-16_14-15-11_633148_71934/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-02-16_14-15-11_633148_71934/sockets/raylet',
 'webui_url': '127.0.0.1:8266',
 'session_dir': '/tmp/ray/session_2021-02-16_14-15-11_633148_71934',
 'metrics_export_port': 60362,
 'node_id': 'effed84ac8f18d25be89ff088750e3aac2388404'}

### Using train: 

In [4]:
def main(iters =20, path='checkpoint'):
    config = {
        "env": CustomEnv,  # or "corridor" if registered above
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": 0,
        "num_workers": 5  # parallelism
    }
    
    # register the custom environment
    #register_env("customEnv", CustomEnv())
    #trainer = ppo.PPOTrainer(env="customEnv")
    
    #trainer = ppo.PPOTrainer(config=config, env=CustomEnv)
    trainer = ppo.PPOTrainer(env=CustomEnv, config=config)
    n=0
    while True:
        if n>=iters:
            break
        trainer.train()
        chkpt = trainer.save(path)
        
        n=+1
    
if __name__ == '__main__':
    pass
    #main()

### Use Tune: 

In [4]:
stop = {
        "training_iteration": 10,
        "episode_reward_mean": 100,
    }

config = {
        "env": CustomEnv, 
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": 0,
        "num_workers": 1  # parallelism
    }
    
tune.run('PPO',num_samples=2,
    stop=stop,
    config=config
)

Trial name,status,loc
PPO_CustomEnv_b3308_00000,RUNNING,


[2m[36m(pid=79384)[0m Instructions for updating:
[2m[36m(pid=79384)[0m non-resource variables are not supported in the long term
[2m[36m(pid=79383)[0m Instructions for updating:
[2m[36m(pid=79383)[0m non-resource variables are not supported in the long term
[2m[36m(pid=79384)[0m 2021-02-16 14:15:27,395	INFO trainer.py:591 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=79384)[0m 2021-02-16 14:15:27,395	INFO trainer.py:616 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=79383)[0m 2021-02-16 14:15:27,404	INFO trainer.py:591 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=79383)[0m 2021-02-16 14:15:27,404	INFO trainer.py:616 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=79381)[0m Instructions for updating:
[2m[

Result for PPO_CustomEnv_b3308_00001:
  custom_metrics: {}
  date: 2021-02-16_14-15-38
  done: false
  episode_len_mean: 7.487804878048781
  episode_reward_max: 10.0
  episode_reward_mean: -5.50844277673546
  episode_reward_min: -21.0
  episodes_this_iter: 533
  episodes_total: 533
  experiment_id: 0bd8aa1204fb4f70a63ae424f19a06ba
  hostname: Mingjuns-MacBook-Pro.local
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.6498297452926636
        entropy_coeff: 0.0
        kl: 0.044635090976953506
        model: {}
        policy_loss: -0.1090070977807045
        total_loss: 58.26771545410156
        vf_explained_var: 0.16147367656230927
        vf_loss: 58.36779022216797
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 1
  node_ip: 192.168.0.23
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 34.75
    ram_util_percent: 65.2
  pid:

[2m[36m(pid=79381)[0m Instructions for updating:
[2m[36m(pid=79381)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=79385)[0m Instructions for updating:
[2m[36m(pid=79385)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=79384)[0m Instructions for updating:
[2m[36m(pid=79384)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=79383)[0m Instructions for updating:
[2m[36m(pid=79383)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomEnv_b3308_00000,RUNNING,,,,,,,,
PPO_CustomEnv_b3308_00001,RUNNING,192.168.0.23:79383,1.0,4.1773,4000.0,-5.50844,10.0,-21.0,7.4878


Result for PPO_CustomEnv_b3308_00000:
  custom_metrics: {}
  date: 2021-02-16_14-15-38
  done: false
  episode_len_mean: 7.541509433962264
  episode_reward_max: 10.0
  episode_reward_mean: -5.943396226415095
  episode_reward_min: -21.0
  episodes_this_iter: 530
  episodes_total: 530
  experiment_id: 36699cfc061e4ba9a8db597236c9313f
  hostname: Mingjuns-MacBook-Pro.local
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.6474537253379822
        entropy_coeff: 0.0
        kl: 0.04711364209651947
        model: {}
        policy_loss: -0.11257157474756241
        total_loss: 56.84780502319336
        vf_explained_var: 0.1571224331855774
        vf_loss: 56.95095443725586
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 1
  node_ip: 192.168.0.23
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 34.71666666666667
    ram_util_percent:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomEnv_b3308_00000,RUNNING,192.168.0.23:79384,2,8.23408,8000,1.05991,10,-20,5.69757
PPO_CustomEnv_b3308_00001,RUNNING,192.168.0.23:79383,3,12.1047,12000,5.81268,10,-20,3.90244


Result for PPO_CustomEnv_b3308_00000:
  custom_metrics: {}
  date: 2021-02-16_14-15-46
  done: false
  episode_len_mean: 3.9332679097154073
  episode_reward_max: 10.0
  episode_reward_mean: 5.768400392541707
  episode_reward_min: -19.0
  episodes_this_iter: 1019
  episodes_total: 2250
  experiment_id: 36699cfc061e4ba9a8db597236c9313f
  hostname: Mingjuns-MacBook-Pro.local
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.44999998807907104
        cur_lr: 4.999999873689376e-05
        entropy: 0.3933897614479065
        entropy_coeff: 0.0
        kl: 0.019531626254320145
        model: {}
        policy_loss: -0.07263380289077759
        total_loss: 26.433692932128906
        vf_explained_var: 0.27956098318099976
        vf_loss: 26.497535705566406
    num_steps_sampled: 12000
    num_steps_trained: 12000
  iterations_since_restore: 3
  node_ip: 192.168.0.23
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 44.083333333333336
    ram_util

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomEnv_b3308_00000,RUNNING,192.168.0.23:79384,4,16.6038,16000,7.23342,10,-17,3.27518
PPO_CustomEnv_b3308_00001,RUNNING,192.168.0.23:79383,5,20.4138,20000,7.75501,10,-16,2.9703


Result for PPO_CustomEnv_b3308_00000:
  custom_metrics: {}
  date: 2021-02-16_14-15-54
  done: false
  episode_len_mean: 3.0
  episode_reward_max: 10.0
  episode_reward_mean: 7.706896551724138
  episode_reward_min: -16.0
  episodes_this_iter: 1334
  episodes_total: 4805
  experiment_id: 36699cfc061e4ba9a8db597236c9313f
  hostname: Mingjuns-MacBook-Pro.local
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.44999998807907104
        cur_lr: 4.999999873689376e-05
        entropy: 0.20682156085968018
        entropy_coeff: 0.0
        kl: 0.008061332628130913
        model: {}
        policy_loss: -0.05003109574317932
        total_loss: 4.269073963165283
        vf_explained_var: 0.3641972839832306
        vf_loss: 4.31547737121582
    num_steps_sampled: 20000
    num_steps_trained: 20000
  iterations_since_restore: 5
  node_ip: 192.168.0.23
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 32.25
    ram_util_percent: 65.15
  pid: 79384
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomEnv_b3308_00000,RUNNING,192.168.0.23:79384,6,24.6146,24000,8.01697,10,-1,2.82815
PPO_CustomEnv_b3308_00001,RUNNING,192.168.0.23:79383,7,28.2447,28000,8.16339,10,-15,2.71186


Result for PPO_CustomEnv_b3308_00000:
  custom_metrics: {}
  date: 2021-02-16_14-16-02
  done: false
  episode_len_mean: 2.6852348993288593
  episode_reward_max: 10.0
  episode_reward_mean: 8.206711409395973
  episode_reward_min: -2.0
  episodes_this_iter: 1490
  episodes_total: 7709
  experiment_id: 36699cfc061e4ba9a8db597236c9313f
  hostname: Mingjuns-MacBook-Pro.local
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.44999998807907104
        cur_lr: 4.999999873689376e-05
        entropy: 0.11157828569412231
        entropy_coeff: 0.0
        kl: 0.0022592744790017605
        model: {}
        policy_loss: -0.038646992295980453
        total_loss: 1.0654664039611816
        vf_explained_var: 0.5826941728591919
        vf_loss: 1.103096604347229
    num_steps_sampled: 28000
    num_steps_trained: 28000
  iterations_since_restore: 7
  node_ip: 192.168.0.23
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 41.3
    ram_util_percent: 64.5

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomEnv_b3308_00000,RUNNING,192.168.0.23:79384,8,32.6744,32000,8.34521,10,1,2.59003
PPO_CustomEnv_b3308_00001,RUNNING,192.168.0.23:79383,9,36.7687,36000,8.45066,10,3,2.51414


Result for PPO_CustomEnv_b3308_00000:
  custom_metrics: {}
  date: 2021-02-16_14-16-11
  done: false
  episode_len_mean: 2.5673940949935816
  episode_reward_max: 10.0
  episode_reward_mean: 8.398587933247754
  episode_reward_min: 2.0
  episodes_this_iter: 1558
  episodes_total: 10811
  experiment_id: 36699cfc061e4ba9a8db597236c9313f
  hostname: Mingjuns-MacBook-Pro.local
  info:
    learner:
      default_policy:
        cur_kl_coeff: 0.11249999701976776
        cur_lr: 4.999999873689376e-05
        entropy: 0.04854472726583481
        entropy_coeff: 0.0
        kl: 0.0020619549322873354
        model: {}
        policy_loss: -0.022537557408213615
        total_loss: 0.2924669682979584
        vf_explained_var: 0.8185659646987915
        vf_loss: 0.3147725462913513
    num_steps_sampled: 36000
    num_steps_trained: 36000
  iterations_since_restore: 9
  node_ip: 192.168.0.23
  num_healthy_workers: 1
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 41.65714285714285
    ram_util

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomEnv_b3308_00000,TERMINATED,,10,41.6054,40000,8.41357,10,4,2.56146
PPO_CustomEnv_b3308_00001,TERMINATED,,10,41.1396,40000,8.44099,10,4,2.53744


2021-02-16 14:16:16,088	INFO tune.py:448 -- Total run time: 52.72 seconds (52.41 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f8bb27b7b50>