In [1]:
import json

from flow.core.params import (
    NetParams,
    InitialConfig,
    SumoParams,
    EnvParams,
    VehicleParams,
)
from flow.controllers import IDMController, ContinuousRouter, RLController
from flow.networks.figure_eight import FigureEightNetwork, ADDITIONAL_NET_PARAMS
from flow.envs.ring.accel import AccelEnv, ADDITIONAL_ENV_PARAMS

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder

from ray.rllib.algorithms.ppo import DEFAULT_CONFIG
from ray.tune import run_experiments
from ray.tune.registry import register_env


In [2]:
# Shared experiment settings, defined once so that every later cell uses the
# same values.

# Experiment tag; also used as the key of the tune experiment.
EXP_NAME = "figure_eight"

# Algorithm or model to train. This may be the name of a built-in algorithm
# (e.g. RLlib's DQN or PPO), or a user-defined trainable function or class
# registered in the tune registry.
RL_ALG_NAME = "PPO"

# Length of one rollout; reused by the environment and the RLlib config.
HORIZON = 100
# Multiplied by HORIZON to size the training batch.
N_ROLLOUTS = 1
# Number of parallel workers. The original note reads "1 - number of CPUS";
# presumably this means (number of CPUs - 1) -- adjust for your machine.
N_WORKERS = 9


In [3]:
# Populate the network: each entry is (vehicle type id, acceleration
# controller class, number of vehicles). Every type is routed with
# ContinuousRouter; the types are added in the order listed.
vehicles = VehicleParams()
for type_id, accel_controller, count in (
    ("human", IDMController, 15),
    ("rl", RLController, 1),
):
    vehicles.add(
        veh_id=type_id,
        acceleration_controller=(accel_controller, {}),
        routing_controller=(ContinuousRouter, {}),
        num_vehicles=count,
    )


In [4]:
# Build flow_params. The dictionary keys must be spelled exactly as below:
# the dictionary is handed to make_create_env and later serialised with
# FlowParamsEncoder.
flow_params = dict(
    # name of the experiment
    exp_tag=EXP_NAME,
    # Flow environment class the experiment runs on
    env_name=AccelEnv,
    # network class the experiment uses
    network=FigureEightNetwork,
    # simulator that is used by the experiment
    simulator="traci",
    # simulation-related parameters
    sim=SumoParams(sim_step=0.1, render=False),
    # environment-related parameters (see flow.core.params.EnvParams)
    env=EnvParams(
        # length of one rollout
        horizon=HORIZON,
        # Copy the module-level defaults so that nothing mutating this
        # experiment's parameters can alter ADDITIONAL_ENV_PARAMS itself.
        additional_params=dict(ADDITIONAL_ENV_PARAMS),
    ),
    # network-related parameters (see flow.core.params.NetParams and the
    # network's ADDITIONAL_NET_PARAMS); copied for the same reason as above
    net=NetParams(additional_params=dict(ADDITIONAL_NET_PARAMS)),
    # vehicles to be placed in the network at the start of a rollout
    # (see flow.core.params.VehicleParams)
    veh=vehicles,
    # (optional) parameters affecting the positioning of vehicles upon
    # initialization/reset (see flow.core.params.InitialConfig)
    initial=InitialConfig(spacing="uniform", perturbation=1),
)

# make_create_env returns the environment name together with a creator
# function for the Flow environment described by flow_params.
gym_name, create_env = make_create_env(params=flow_params, version=0)

# Register the creator with the tune registry so that RLlib can build the
# environment from its name.
register_env(gym_name, create_env)

In [5]:
# Serialise flow_params to a JSON string so the experiment can be replayed.
flow_json = json.dumps(
    flow_params, sort_keys=True, indent=4, cls=FlowParamsEncoder
)

# Values used in more than one place below.
batch_size = HORIZON * N_ROLLOUTS
policy_model = {
    **DEFAULT_CONFIG["model"],
    "fcnet_hiddens": [16, 16],  # size of hidden layers in network
}

# Start from the PPO defaults and override the entries tuned for this run.
config = {
    **DEFAULT_CONFIG,
    "num_workers": N_WORKERS,  # number of parallel workers
    "train_batch_size": batch_size,  # batch size
    "model": policy_model,
    "use_gae": True,  # using generalized advantage estimation
    "gamma": 0.999,  # discount rate
    "lambda": 0.97,  # presumably the GAE lambda -- confirm in the RLlib PPO docs
    # SGD minibatch size: the whole batch, capped at 16 * 1024
    "sgd_minibatch_size": min(16 * 1024, batch_size),
    "kl_target": 0.02,  # target KL divergence
    "num_sgd_iter": 10,  # number of SGD iterations
    "horizon": HORIZON,  # rollout horizon
    "framework": "tf2",
    "eager_tracing": True,
    # Passed through to the environment; carries the serialised flow_params
    # and the algorithm name.
    "env_config": {
        "flow_params": flow_json,
        "run": RL_ALG_NAME,
    },
}




In [6]:
# Specification of the single experiment to run.
experiment_spec = {
    "run": RL_ALG_NAME,
    "env": gym_name,
    "config": config.copy(),
    "checkpoint_freq": 10,  # number of iterations between checkpoints
    "checkpoint_at_end": True,  # generate a checkpoint at the end
    "max_failures": 999,
    # stopping conditions
    "stop": {
        "training_iteration": 150,  # number of iterations to stop after
    },
}

# Launch training; `trials` is reused by the visualisation cell further down.
trials = run_experiments({EXP_NAME: experiment_spec})


2022-09-04 18:08:35,909	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


Trial name,status,loc,iter,total time (s),ts,reward,num_recreated_wor...,episode_reward_max,episode_reward_min
PPO_AccelEnv-v0_1fa80_00000,TERMINATED,127.0.0.1:84495,150,1376.71,750600,102.781,0,103.799,101.938


[2m[36m(PPO pid=84495)[0m 2022-09-04 18:08:40,832	INFO algorithm.py:1860 -- Executing eagerly (framework='tf2'), with eager_tracing=True. For production workloads, make sure to set eager_tracing=True  in order to match the speed of tf-static-graph (framework='tf'). For debugging purposes, `eager_tracing=False` is the best choice.
[2m[36m(PPO pid=84495)[0m 2022-09-04 18:08:40,833	INFO algorithm.py:351 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(RolloutWorker pid=84504)[0m Metal device set to: Apple M1 Max
[2m[36m(RolloutWorker pid=84504)[0m 
[2m[36m(RolloutWorker pid=84504)[0m systemMemory: 64.00 GB
[2m[36m(RolloutWorker pid=84504)[0m maxCacheSize: 24.00 GB
[2m[36m(RolloutWorker pid=84504)[0m 
[2m[36m(RolloutWorker pid=84503)[0m Metal device set to: Apple M1 Max
[2m[36m(RolloutWorker pid=84503)[0m 
[2m[36m(RolloutWorker pid=84503)[0m systemMemory: 64.00 GB
[2m[36m(RolloutWorker pid=84503)[0m maxCacheSize: 24.00 GB
[2m[36m(RolloutWorker pid=84503)[0m 
[2m[36m(RolloutWorker pid=84501)[0m Metal device set to: Apple M1 Max
[2m[36m(RolloutWorker pid=84501)[0m 
[2m[36m(RolloutWorker pid=84501)[0m systemMemory: 64.00 GB
[2m[36m(RolloutWorker pid=84501)[0m maxCacheSize: 24.00 GB
[2m[36m(RolloutWorker pid=84501)[0m 
[2m[36m(RolloutWorker pid=84498)[0m Metal device set to: Apple M1 Max
[2m[36m(RolloutWorker pid=84498)[0m 
[2m[36m(RolloutWorker pid=84498)[0m systemMemory: 64.



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 5004
  counters:
    num_agent_steps_sampled: 5004
    num_agent_steps_trained: 5004
    num_env_steps_sampled: 5004
    num_env_steps_trained: 5004
  custom_metrics: {}
  date: 2022-09-04_18-08-58
  done: false
  episode_len_mean: .nan
  episode_media: {}
  episode_reward_max: .nan
  episode_reward_mean: .nan
  episode_reward_min: .nan
  episodes_this_iter: 0
  episodes_total: 0
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 1.414780616760254
          entropy_coeff: 0.0
          kl: 3.5099797059956472e-06
          policy_loss: -0.10837201029062271
          total_loss: 6.024956226348877
          vf_explained_var: -4.117786738788709e-05
          vf_loss: 6.133327007293701
        num_age



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 50040
  counters:
    num_agent_steps_sampled: 50040
    num_agent_steps_trained: 50040
    num_env_steps_sampled: 50040
    num_env_steps_trained: 50040
  custom_metrics: {}
  date: 2022-09-04_18-10-22
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.79093230533132
  episode_reward_min: 101.94032551804062
  episodes_this_iter: 0
  episodes_total: 9
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0003906250058207661
          cur_lr: 4.999999873689376e-05
          entropy: 1.416961908340454
          entropy_coeff: 0.0
          kl: 1.934244409085295e-07
          policy_loss: -0.060349058359861374
          total_loss: 5.647315502166748
          vf_explained_var: -5.528628753381781e-05
  



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 95076
  counters:
    num_agent_steps_sampled: 95076
    num_agent_steps_trained: 95076
    num_env_steps_sampled: 95076
    num_env_steps_trained: 95076
  custom_metrics: {}
  date: 2022-09-04_18-11-46
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.7876795703186
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 18
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 7.629394644936838e-07
          cur_lr: 4.999999873689376e-05
          entropy: 1.4144999980926514
          entropy_coeff: 0.0
          kl: 3.67280927093816e-06
          policy_loss: 0.0028420835733413696
          total_loss: 5.319384574890137
          vf_explained_var: -0.015750834718346596
   



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 140112
  counters:
    num_agent_steps_sampled: 140112
    num_agent_steps_trained: 140112
    num_env_steps_sampled: 140112
    num_env_steps_trained: 140112
  custom_metrics: {}
  date: 2022-09-04_18-13-12
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.78710092135817
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 27
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.4901161415892261e-09
          cur_lr: 4.999999873689376e-05
          entropy: 1.412330985069275
          entropy_coeff: 0.0
          kl: 9.72497446127818e-07
          policy_loss: -0.0012716141063719988
          total_loss: 5.410078525543213
          vf_explained_var: -0.000338679557899



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 185148
  counters:
    num_agent_steps_sampled: 185148
    num_agent_steps_trained: 185148
    num_env_steps_sampled: 185148
    num_env_steps_trained: 185148
  custom_metrics: {}
  date: 2022-09-04_18-14-35
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.78666484678386
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 36
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.9103830890414573e-12
          cur_lr: 4.999999873689376e-05
          entropy: 1.406970500946045
          entropy_coeff: 0.0
          kl: 5.154122845851816e-06
          policy_loss: 0.11036480963230133
          total_loss: 5.334985733032227
          vf_explained_var: -0.00016441344632767



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 230184
  counters:
    num_agent_steps_sampled: 230184
    num_agent_steps_trained: 230184
    num_env_steps_sampled: 230184
    num_env_steps_trained: 230184
  custom_metrics: {}
  date: 2022-09-04_18-15-58
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.7863039182708
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 45
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 5.684341970784096e-15
          cur_lr: 4.999999873689376e-05
          entropy: 1.3997843265533447
          entropy_coeff: 0.0
          kl: 1.6863805285538547e-06
          policy_loss: -0.04143005609512329
          total_loss: 5.706910610198975
          vf_explained_var: -0.0006967782974243



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 275220
  counters:
    num_agent_steps_sampled: 275220
    num_agent_steps_trained: 275220
    num_env_steps_sampled: 275220
    num_env_steps_trained: 275220
  custom_metrics: {}
  date: 2022-09-04_18-17-22
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.7861282275023
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 54
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.1102230411687688e-17
          cur_lr: 4.999999873689376e-05
          entropy: 1.413654088973999
          entropy_coeff: 0.0
          kl: 8.044286232689046e-07
          policy_loss: 0.030261129140853882
          total_loss: 5.1338911056518555
          vf_explained_var: -0.0006632447475567



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 320256
  counters:
    num_agent_steps_sampled: 320256
    num_agent_steps_trained: 320256
    num_env_steps_sampled: 320256
    num_env_steps_trained: 320256
  custom_metrics: {}
  date: 2022-09-04_18-18-45
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.78592085034302
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 63
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 2.1684043772827515e-20
          cur_lr: 4.999999873689376e-05
          entropy: 1.408850908279419
          entropy_coeff: 0.0
          kl: 5.466590209834976e-06
          policy_loss: 0.08965369313955307
          total_loss: 4.758111000061035
          vf_explained_var: -1.4153122720017564e



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 365292
  counters:
    num_agent_steps_sampled: 365292
    num_agent_steps_trained: 365292
    num_env_steps_sampled: 365292
    num_env_steps_trained: 365292
  custom_metrics: {}
  date: 2022-09-04_18-20-08
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.78578059919995
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 72
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 4.235164799380374e-23
          cur_lr: 4.999999873689376e-05
          entropy: 1.3979681730270386
          entropy_coeff: 0.0
          kl: 2.803788220262504e-06
          policy_loss: -0.09125436842441559
          total_loss: 5.590971946716309
          vf_explained_var: -0.0016925155650824



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 410328
  counters:
    num_agent_steps_sampled: 410328
    num_agent_steps_trained: 410328
    num_env_steps_sampled: 410328
    num_env_steps_trained: 410328
  custom_metrics: {}
  date: 2022-09-04_18-21-32
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7974450028322
  episode_reward_mean: 102.78726960327619
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 81
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 8.271806248789793e-26
          cur_lr: 4.999999873689376e-05
          entropy: 1.3998770713806152
          entropy_coeff: 0.0
          kl: 3.865011876769131e-06
          policy_loss: 0.11150070279836655
          total_loss: 4.671496391296387
          vf_explained_var: -0.00028428435325622



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 455364
  counters:
    num_agent_steps_sampled: 455364
    num_agent_steps_trained: 455364
    num_env_steps_sampled: 455364
    num_env_steps_trained: 455364
  custom_metrics: {}
  date: 2022-09-04_18-22-55
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7975513424916
  episode_reward_mean: 102.78710450832294
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 90
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.6155871579667565e-28
          cur_lr: 4.999999873689376e-05
          entropy: 1.3957000970840454
          entropy_coeff: 0.0
          kl: 2.954198862425983e-06
          policy_loss: -0.09041589498519897
          total_loss: 5.5765533447265625
          vf_explained_var: -0.00142641062848



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 500400
  counters:
    num_agent_steps_sampled: 500400
    num_agent_steps_trained: 500400
    num_env_steps_sampled: 500400
    num_env_steps_trained: 500400
  custom_metrics: {}
  date: 2022-09-04_18-24-16
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.7975513424916
  episode_reward_mean: 102.78692805512081
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 99
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 3.1554436679038213e-31
          cur_lr: 4.999999873689376e-05
          entropy: 1.385683298110962
          entropy_coeff: 0.0
          kl: 1.1629770597210154e-05
          policy_loss: 0.08761271834373474
          total_loss: 4.692473411560059
          vf_explained_var: -0.0011332214344292



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 545436
  counters:
    num_agent_steps_sampled: 545436
    num_agent_steps_trained: 545436
    num_env_steps_sampled: 545436
    num_env_steps_trained: 545436
  custom_metrics: {}
  date: 2022-09-04_18-25-37
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.79902259922984
  episode_reward_mean: 102.78093730025607
  episode_reward_min: 101.93794751526434
  episodes_this_iter: 0
  episodes_total: 108
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 6.162975913874651e-34
          cur_lr: 4.999999873689376e-05
          entropy: 1.3788559436798096
          entropy_coeff: 0.0
          kl: 1.374132807541173e-05
          policy_loss: 0.07014212757349014
          total_loss: 4.598898887634277
          vf_explained_var: -0.001358297420665



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 590472
  counters:
    num_agent_steps_sampled: 590472
    num_agent_steps_trained: 590472
    num_env_steps_sampled: 590472
    num_env_steps_trained: 590472
  custom_metrics: {}
  date: 2022-09-04_18-26-58
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.79902259922984
  episode_reward_mean: 102.78085139889768
  episode_reward_min: 101.93821971981195
  episodes_this_iter: 0
  episodes_total: 117
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 1.2037062331786428e-36
          cur_lr: 4.999999873689376e-05
          entropy: 1.3665322065353394
          entropy_coeff: 0.0
          kl: 1.8738066501100548e-05
          policy_loss: 0.01774388924241066
          total_loss: 4.824573993682861
          vf_explained_var: -0.0013543546665



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 635508
  counters:
    num_agent_steps_sampled: 635508
    num_agent_steps_trained: 635508
    num_env_steps_sampled: 635508
    num_env_steps_trained: 635508
  custom_metrics: {}
  date: 2022-09-04_18-28-19
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.79902259922984
  episode_reward_mean: 102.78075484189979
  episode_reward_min: 101.93821971981195
  episodes_this_iter: 0
  episodes_total: 126
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 4.999999873689376e-05
          entropy: 1.3473856449127197
          entropy_coeff: 0.0
          kl: 3.473773176665418e-05
          policy_loss: 0.0671953409910202
          total_loss: 4.490694999694824
          vf_explained_var: -0.0014346539974212646
          vf_l



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 680544
  counters:
    num_agent_steps_sampled: 680544
    num_agent_steps_trained: 680544
    num_env_steps_sampled: 680544
    num_env_steps_trained: 680544
  custom_metrics: {}
  date: 2022-09-04_18-29-40
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.79902259922984
  episode_reward_mean: 102.78063993278272
  episode_reward_min: 101.93793493358653
  episodes_this_iter: 0
  episodes_total: 135
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 4.999999873689376e-05
          entropy: 1.3334676027297974
          entropy_coeff: 0.0
          kl: 1.8772328985505737e-05
          policy_loss: -0.04187304526567459
          total_loss: 5.258872985839844
          vf_explained_var: -0.0014101117849349976
          v



Result for PPO_AccelEnv-v0_1fa80_00000:
  agent_timesteps_total: 725580
  counters:
    num_agent_steps_sampled: 725580
    num_agent_steps_trained: 725580
    num_env_steps_sampled: 725580
    num_env_steps_trained: 725580
  custom_metrics: {}
  date: 2022-09-04_18-31-02
  done: false
  episode_len_mean: 5000.0
  episode_media: {}
  episode_reward_max: 103.79945888203886
  episode_reward_mean: 102.78080550188763
  episode_reward_min: 101.93793493358653
  episodes_this_iter: 0
  episodes_total: 144
  experiment_id: 0e814ab2ba9c41d8b7aa032a01a2cb23
  hostname: Maxs-MacBook-Pro.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.0
          cur_lr: 4.999999873689376e-05
          entropy: 1.303743600845337
          entropy_coeff: 0.0
          kl: 3.9257480239029974e-05
          policy_loss: -0.03199172765016556
          total_loss: 4.8065996170043945
          vf_explained_var: 0.0009902060264721513
          vf

[2m[36m(RolloutWorker pid=84504)[0m Error: tcpip::Socket::recvAndCheck @ recv: peer shutdown
[2m[36m(RolloutWorker pid=84504)[0m Quitting (on error).
[2m[36m(RolloutWorker pid=84503)[0m Error: tcpip::Socket::recvAndCheck @ recv: peer shutdown
[2m[36m(RolloutWorker pid=84503)[0m Quitting (on error).
[2m[36m(RolloutWorker pid=84506)[0m Error: tcpip::Socket::recvAndCheck @ recv: peer shutdown
[2m[36m(RolloutWorker pid=84506)[0m Quitting (on error).
[2m[36m(RolloutWorker pid=84499)[0m Error: tcpip::Socket::recvAndCheck @ recv: peer shutdown
[2m[36m(RolloutWorker pid=84499)[0m Quitting (on error).
[2m[36m(RolloutWorker pid=84505)[0m Error: tcpip::Socket::recvAndCheck @ recv: peer shutdown
[2m[36m(RolloutWorker pid=84505)[0m Quitting (on error).
[2m[36m(RolloutWorker pid=84498)[0m Error: tcpip::Socket::recvAndCheck @ recv: peer shutdown
[2m[36m(RolloutWorker pid=84498)[0m Quitting (on error).
[2m[36m(RolloutWorker pid=84501)[0m Error: tcpip::Socket::rec

[PPO_AccelEnv-v0_1fa80_00000]

In [9]:
print(trials[0].logdir)
!python ../flow/visualize/visualizer_rllib.py ${trials[0].logdir} 10

NameError: name 'trials' is not defined