In [44]:
import torch
import gym
import gym_conservation
import gym_fishing
from ray import tune
from ray.rllib import agents
import os



# Process-wide settings read by RLlib / Ray; set them before any trainer is built.
os.environ.update(
    {
        # Let RLlib use every CUDA device that torch can see.
        "RLLIB_NUM_GPUS": str(torch.cuda.device_count()),
        # Workaround flag for Ray's object store. Needing it may indicate a bug,
        # because the container's --shm-size is already large.
        "RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE": "1",
    }
)




In [45]:

## RLlib ignores names registered with gym, so each environment is registered
## with Tune by hand.
## Note: these envs were not written to take a single parameter dictionary, so
## every creator accepts the `config` argument it is given and ignores it.
env_creators = {
    "conservation-v6": lambda config: gym_conservation.envs.NonStationaryV6(),
    "conservation-v5": lambda config: gym_conservation.envs.NonStationaryV5(),
    "fishing-v0": lambda config: gym_fishing.envs.FishingEnv(),
    "fishing-v1": lambda config: gym_fishing.envs.FishingCtsEnv(),
}
for env_name, env_creator in env_creators.items():
    tune.register_env(env_name, env_creator)


In [46]:
# Build a PPO trainer with RLlib's default settings; this effectively checks that
# the manually registered "fishing-v1" environment can be constructed.
trainer = agents.ppo.PPOTrainer(env="fishing-v1")
# This instance is never trained, and `trainer` is rebound to the configured
# trainer further down. Stop it now so its workers release their resources
# instead of staying reserved for the rest of the session.
trainer.stop()




In [47]:
# Custom configuration for the PPO algorithm on the "fishing-v1" environment.
config = dict(
    env="fishing-v1",
    # Use 4 environment workers (aka "rollout workers") that collect samples in
    # parallel, each from its own environment clone(s).
    num_workers=4,
    framework="torch",
    # Use every CUDA device that torch can see (0 when none is available).
    num_gpus=torch.cuda.device_count(),
    # Fully connected policy network: two hidden layers of 64 units, ReLU.
    model=dict(
        fcnet_hiddens=[64, 64],
        fcnet_activation="relu",
    ),
    # One dedicated evaluation worker; evaluation interval of 2.
    evaluation_num_workers=1,
    evaluation_interval=2,
    # Overrides applied only during evaluation runs: do not render the env.
    evaluation_config=dict(render_env=False),
)

# Create the RLlib PPO trainer from the custom `config` above.
# NOTE(review): this rebinds `trainer`; whatever trainer was previously bound to
# that name is not stopped by this cell.
trainer = agents.ppo.PPOTrainer(config=config)





In [48]:
# Run 4 training iterations and keep what each `train()` call returns, so the
# per-iteration metrics can be inspected afterwards instead of being discarded.
# A comprehension is used on purpose: its `_` stays local, whereas a top-level
# `for _ in ...` would rebind `_` in the notebook namespace and stop IPython
# from updating its last-output shortcut.
N_TRAIN_ITERATIONS = 4
training_results = [trainer.train() for _ in range(N_TRAIN_ITERATIONS)]



[2m[1m[36m(scheduler +12m27s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.


Required resources for this actor or task: {CPU: 1.000000}
Available resources on this node: {0.000000/24.000000 CPU, 35.478617 GiB/35.478617 GiB memory, 1.000000/1.000000 GPU, 10.999989 GiB/10.999989 GiB object_store_memory, 1.000000/1.000000 accelerator_type:G, 1.000000/1.000000 node:172.18.0.5}
 In total there are 0 pending tasks and 1 pending actors on this node.


In [49]:
# Save the trainer's current state. The returned value is what `restore()` is
# given later (presumably the checkpoint path — compare the path in the restore log).
checkpoint = trainer.save()

In [50]:
# Run an evaluation pass with the trained trainer and display the returned metrics dict.
trainer.evaluate()


{'evaluation': {'episode_reward_max': 1.1701428891871886,
  'episode_reward_min': 0.75,
  'episode_reward_mean': 0.8909582040576767,
  'episode_len_mean': 3.4,
  'episode_media': {},
  'episodes_this_iter': 10,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [1.1701428891871886,
    1.0337219210797386,
    0.8812072934789418,
    0.9237330803724098,
    0.75,
    0.75,
    0.75,
    0.85311328125,
    0.9728964856470526,
    0.8247670895614376],
   'episode_lengths': [9, 5, 3, 4, 1, 1, 1, 3, 5, 2]},
  'sampler_perf': {'mean_raw_obs_processing_ms': 0.3457967337075766,
   'mean_inference_ms': 1.0051046098981584,
   'mean_action_processing_ms': 0.10796646019081016,
   'mean_env_wait_ms': 0.09344769762707995,
   'mean_env_render_ms': 0.0},
  'off_policy_estimator': {},
  'timesteps_this_iter': 0}}

In [51]:
# Build a second trainer from the same `config`, then load the saved state into it.
# NOTE(review): `trainer` is still alive at this point, so both trainers hold
# workers at once; consider stopping `trainer` first if it is no longer needed.
model = agents.ppo.PPOTrainer(config=config)
# `checkpoint` comes from `trainer.save()`; its path will differ from run to run.
model.restore(checkpoint)

## Alternatively, pass the checkpoint path by hand:
##model.restore("/home/cboettig/ray_results/PPOTrainer_fishing-v1_2022-02-22_19-55-51cej434z4/checkpoint_000004/checkpoint-4")


2022-02-22 20:04:14,510	INFO trainable.py:472 -- Restored on 172.18.0.5 from checkpoint: /home/cboettig/ray_results/PPOTrainer_fishing-v1_2022-02-22_20-03-03ql6e0onn/checkpoint_000004/checkpoint-4
2022-02-22 20:04:14,513	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 4, '_timesteps_total': 16000, '_time_total': 60.387168884277344, '_episodes_total': 8350}


In [52]:

# Evaluate the restored trainer and display the returned metrics dict.
# Nothing is rendered here: `render_env` is False in the `evaluation_config` above.
model.evaluate()




{'evaluation': {'episode_reward_max': 1.2160595664436524,
  'episode_reward_min': 0.75,
  'episode_reward_mean': 0.8503476138371683,
  'episode_len_mean': 2.7,
  'episode_media': {},
  'episodes_this_iter': 10,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_mean': {},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [0.75,
    0.80625,
    0.8630928908613323,
    0.75,
    1.2160595664436524,
    0.75,
    0.75,
    0.9234481767736002,
    0.8649533402662901,
    0.8296721640268085],
   'episode_lengths': [1, 2, 3, 1, 8, 1, 1, 4, 3, 3]},
  'sampler_perf': {'mean_raw_obs_processing_ms': 0.36495072501046316,
   'mean_inference_ms': 0.9745955467224121,
   'mean_action_processing_ms': 0.09769201278686523,
   'mean_env_wait_ms': 0.08575405393327985,
   'mean_env_render_ms': 0.0},
  'off_policy_estimator': {},
  'timesteps_this_iter': 0}}