# Playing the Differentiated Demand Environment

## Imports and initialize


In [6]:
#Imports

from marketsai.markets.diff_demand import DiffDemand

#import ray

from ray import tune, shutdown, init
from ray.tune.registry import register_env
from ray.rllib.agents.a3c.a2c import A2CTrainer
from ray.rllib.agents.dqn.dqn import DQNTrainer
from ray.tune.integration.mlflow import MLflowLoggerCallback
from ray.rllib.utils.exploration.epsilon_greedy import EpsilonGreedy
from ray.rllib.utils.schedules.exponential_schedule import ExponentialSchedule

import random
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging

In [7]:
# STEP 0: Initialize ray
NUM_CPUS = 11
# Shut down any Ray instance left over from an earlier run of this cell
# before starting a new one.
shutdown()
# Only ERROR-level (and above) Ray log messages are requested.
init(num_cpus=NUM_CPUS, logging_level=logging.ERROR)

{'node_ip_address': '192.168.1.202',
 'raylet_ip_address': '192.168.1.202',
 'redis_address': '192.168.1.202:51686',
 'object_store_address': '/tmp/ray/session_2021-04-13_13-13-54_817863_90750/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-04-13_13-13-54_817863_90750/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-04-13_13-13-54_817863_90750',
 'metrics_export_port': 63596,
 'node_id': '46b1b40f925ca2c5f1cd3bbcd7163c2e9c7277a757c14602886498c7'}

In [8]:
# STEP 1: register environment
# Make the environment available to RLlib under the name "diffdemand".
register_env("diffdemand", DiffDemand)

# A locally constructed instance (default arguments), used here to read the
# number of agents.
env = DiffDemand()

# One policy id per agent: "policy_0", "policy_1", ...
policy_ids = ["policy_{}".format(agent_idx) for agent_idx in range(env.n_agents)]

## Experiment Configuration

In [9]:
# STEP 2: Experiment configuration
#
# Defines, in order:
#   * experiment bookkeeping (labels, run length, stopping rule),
#   * the market/environment configuration passed to the env,
#   * the epsilon-greedy exploration schedule,
#   * the shared RLlib training configuration used by the experiment cells.

# --- Experiment bookkeeping ---
test = True  # True: short run; False: full-length run
date = "April13"
env_label = "Diff_Dd"
if test:
    MAX_STEPS = 20 * 1000
    exp_label = env_label + "_test_" + date
else:
    MAX_STEPS = 3000 * 1000
    # Prefix with env_label exactly as the test branch does, so that
    # full-length runs are also labelled with the environment.
    exp_label = env_label + "_run_" + date

verbosity = 2

# Episode length in steps; shared by the stopping rule and the "horizon"
# entry of training_config so the two cannot drift apart.
HORIZON = 100
stop = {"episodes_total": MAX_STEPS // HORIZON}

# --- Environment configuration ---
PRICE_BAND_WIDE = 0.1
LOWER_PRICE = 1.47 - PRICE_BAND_WIDE
HIGHER_PRICE = 1.93 + PRICE_BAND_WIDE
# Per-step multiplicative decay factors for the exploration schedule.
DEC_RATE = math.e ** (-4 * 10 ** (-6))
DEC_RATE_HIGH = math.e ** (-4 * 10 ** (-6) * 4)  # 4x faster decay; not used in this cell

env_config = {
    "mkt_config": {
        # One entry per agent for the per-agent settings.
        "lower_price": [LOWER_PRICE] * env.n_agents,
        "higher_price": [HIGHER_PRICE] * env.n_agents,
        # NOTE(review): the key is spelled "parameteres"; it is kept as-is
        # because the environment reads it at runtime -- confirm the
        # spelling DiffDemand expects before renaming.
        "parameteres": {
            "cost": [1] * env.n_agents,
            "values": [2] * env.n_agents,
            "ext_demand": 0,
            "substitution": 0.25,
        },
        "space_type": "MultiDiscrete",
        "gridpoints": 16,
    }
}

# --- Exploration ---
# Epsilon starts at 1.0 and is multiplied by DEC_RATE once per timestep
# (schedule_timesteps=1).
exploration_config = {
    "type": "EpsilonGreedy",
    "epsilon_schedule": ExponentialSchedule(
        schedule_timesteps=1,
        # Lowercase, matching the "framework" entry of training_config below.
        framework="torch",
        initial_p=1.0,
        decay_rate=DEC_RATE,
    ),
}

# --- Training configuration shared by all experiments ---
training_config = {
    "gamma": 0.95,
    "lr": 0.15,
    "env": "diffdemand",
    "exploration_config": exploration_config,
    "env_config": env_config,
    "horizon": HORIZON,
    "soft_horizon": True,
    "no_done_at_end": True,
    "multiagent": {
        # One independent policy per agent, each built from that agent's own
        # observation and action space.
        "policies": {
            policy_ids[i]: (
                None,
                env.observation_space["agent_{}".format(i)],
                env.action_space["agent_{}".format(i)],
                {},
            )
            for i in range(env.n_agents)
        },
        # Agent ids have the form "agent_<k>"; agent k is mapped to policy_ids[k].
        "policy_mapping_fn": (lambda agent_id: policy_ids[int(agent_id.split("_")[1])]),
    },
    "framework": "torch",
    # Leave one of the CPUs given to Ray outside the rollout-worker pool.
    "num_workers": NUM_CPUS - 1,
    "num_gpus": 0,
    "timesteps_per_iteration": 1000,
    "normalize_actions": False,
}


# Alternative stopping rules:
#stop = {"training_iteration": MAX_STEPS//1000}
#stop = {"info/num_steps_trained": MAX_STEPS}

## Experiments

In [10]:
# DQN methods: DQN, APEX, R2D2
#
# Runs each algorithm in `algo_list` with the shared `training_config`, then
# runs a second "RAINBOW" experiment: DQN with n-step returns, noisy
# exploration layers and a distributional (multi-atom) value head.

# algo_list = ["DQN", "APEX", "R2D2"]
algo_list = ["DQN"]
for algo in algo_list:
    exp_name = exp_label + algo
    results = tune.run(
        algo,
        name=exp_name,
        config=training_config,
        #checkpoint_freq=250,
        checkpoint_at_end=True,
        stop=stop,
        callbacks=[MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)],
        verbose=verbosity,
    )


# Shallow copy: only new top-level keys are added below, so the nested
# objects shared with training_config are not modified.
training_config_RBW = training_config.copy()
training_config_RBW["n_step"] = 5
training_config_RBW["noisy"] = True
training_config_RBW["num_atoms"] = 10
# Support of the distributional value head. The second assignment previously
# reused the "v_min" key, which overwrote the lower bound with 2 and left
# "v_max" at its default.
# NOTE(review): confirm that [0.5, 2] covers the returns expected in this
# environment.
training_config_RBW["v_min"] = 0.5
training_config_RBW["v_max"] = 2


exp_name = exp_label + "RAINBOW"
results = tune.run(
    "DQN",
    name=exp_name,
    config=training_config_RBW,
    #checkpoint_freq=250,
    checkpoint_at_end=True,
    stop=stop,
    callbacks=[MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)],
    verbose=verbosity,
)




 been deprecated. Use `num_framestacks (int)` instead. This will raise an error in the future!
Trial DQN_diffdemand_a2222_00000 reported episode_reward_max=55.375701228566214,episode_reward_min=52.70515878265463,episode_reward_mean=53.989631159210525,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.452694502379327, 'policy_1': 25.353549715030596},policy_reward_max={'policy_0': 28.18242749704712, 'policy_1': 28.886397523932644},policy_reward_mean={'policy_0': 26.953981491613355, 'policy_1': 27.035649667597163},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.41581758178106626, 'mean_raw_obs_processing_ms': 0.6912736609430596, 'mean_inference_ms': 6.024561778153523, 'mean_action_processing_ms': 0.2431886030895875},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 37.11, 'learn_throughput': 862.299, 'update_time_ms': 6.15},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'grad_gnorm': array(0.4364367, dtype=float32), 'cur_lr

Trial DQN_diffdemand_a2222_00000 reported episode_reward_max=56.312076668098676,episode_reward_min=52.235194358618685,episode_reward_mean=54.20764147875052,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.452694502379327, 'policy_1': 25.06152365188502},policy_reward_max={'policy_0': 29.336373631598217, 'policy_1': 29.97234051715639},policy_reward_mean={'policy_0': 26.92677401243328, 'policy_1': 27.280867466317225},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.39579802147856197, 'mean_raw_obs_processing_ms': 0.6509710821864046, 'mean_inference_ms': 5.697864595626517, 'mean_action_processing_ms': 0.22757158510876074},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 22.433, 'learn_throughput': 1426.476, 'update_time_ms': 4.705},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'grad_gnorm': array(0.39403528, dtype=float32), 'cur_lr': 0.15, 'mean_q': 22.227054595947266, 'min_q': 21.524044036865234, 'max_q': 22.532318115234

Trial DQN_diffdemand_a2222_00000 reported episode_reward_max=56.312076668098676,episode_reward_min=50.599593098790365,episode_reward_mean=53.93104525920684,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.452694502379327, 'policy_1': 24.77796203157801},policy_reward_max={'policy_0': 29.336373631598217, 'policy_1': 29.97234051715639},policy_reward_mean={'policy_0': 26.84542977605708, 'policy_1': 27.085615483149763},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3838712816610335, 'mean_raw_obs_processing_ms': 0.6308215516362815, 'mean_inference_ms': 5.530091847271066, 'mean_action_processing_ms': 0.21993142842173924},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 21.403, 'learn_throughput': 1495.136, 'update_time_ms': 4.597},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'grad_gnorm': array(1.01224, dtype=float32), 'cur_lr': 0.15, 'mean_q': 18.138444900512695, 'min_q': 17.89937400817871, 'max_q': 18.42430305480957, '

Trial DQN_diffdemand_a2222_00000 reported episode_reward_max=57.19325147727085,episode_reward_min=50.599593098790365,episode_reward_mean=54.024033594532995,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.452694502379327, 'policy_1': 24.63238917933922},policy_reward_max={'policy_0': 29.525345758437457, 'policy_1': 29.97234051715639},policy_reward_mean={'policy_0': 26.953089808334738, 'policy_1': 27.070943786198246},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3748115926601789, 'mean_raw_obs_processing_ms': 0.616077705528509, 'mean_inference_ms': 5.400995083105828, 'mean_action_processing_ms': 0.21422875192593427},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 20.888, 'learn_throughput': 1531.958, 'update_time_ms': 4.446},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'grad_gnorm': array(1.4356979, dtype=float32), 'cur_lr': 0.15, 'mean_q': 13.957423210144043, 'min_q': 13.796136856079102, 'max_q': 14.21234226226806

Trial DQN_diffdemand_a2222_00000 reported episode_reward_max=57.19325147727085,episode_reward_min=50.599593098790365,episode_reward_mean=53.83164146592073,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.196368803419148, 'policy_1': 24.512341083964646},policy_reward_max={'policy_0': 29.525345758437457, 'policy_1': 29.97234051715639},policy_reward_mean={'policy_0': 26.964003866134632, 'policy_1': 26.867637599786093},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.35642054137772, 'mean_raw_obs_processing_ms': 0.5858226407913426, 'mean_inference_ms': 5.140469248295505, 'mean_action_processing_ms': 0.20261725427450775},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 18.896, 'learn_throughput': 1693.524, 'update_time_ms': 4.067},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'grad_gnorm': array(0.7606433, dtype=float32), 'cur_lr': 0.15, 'mean_q': 11.841926574707031, 'min_q': 11.747696876525879, 'max_q': 11.98324966430664,

Trial DQN_diffdemand_a2222_00000 reported episode_reward_max=57.19325147727085,episode_reward_min=51.03177058567856,episode_reward_mean=53.982071293444996,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.196368803419148, 'policy_1': 24.512341083964646},policy_reward_max={'policy_0': 29.525345758437457, 'policy_1': 29.154379594304938},policy_reward_mean={'policy_0': 27.086332865239793, 'policy_1': 26.89573842820518},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3429839652005326, 'mean_raw_obs_processing_ms': 0.5646359865064884, 'mean_inference_ms': 4.947588073502048, 'mean_action_processing_ms': 0.19442964831518175},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 18.096, 'learn_throughput': 1768.347, 'update_time_ms': 3.912},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'grad_gnorm': array(0.34437594, dtype=float32), 'cur_lr': 0.15, 'mean_q': 10.10901927947998, 'min_q': 9.931096076965332, 'max_q': 10.24979114532470

Trial DQN_diffdemand_a2222_00000 reported episode_reward_max=56.711663721461704,episode_reward_min=51.03177058567856,episode_reward_mean=53.944297483205354,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.196368803419148, 'policy_1': 24.512341083964646},policy_reward_max={'policy_0': 29.525345758437457, 'policy_1': 29.154379594304938},policy_reward_mean={'policy_0': 27.086926658007073, 'policy_1': 26.857370825198288},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.331389141328007, 'mean_raw_obs_processing_ms': 0.5463711596894171, 'mean_inference_ms': 4.781659821579145, 'mean_action_processing_ms': 0.18739438980236323},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 17.392, 'learn_throughput': 1839.922, 'update_time_ms': 4.447},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'grad_gnorm': array(0.15352567, dtype=float32), 'cur_lr': 0.15, 'mean_q': 8.291338920593262, 'min_q': 8.216338157653809, 'max_q': 8.40869808197021

Trial DQN_diffdemand_a2222_00000 reported episode_reward_max=56.711663721461704,episode_reward_min=51.03177058567856,episode_reward_mean=53.898204860266375,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.196368803419148, 'policy_1': 24.512341083964646},policy_reward_max={'policy_0': 29.489289384710688, 'policy_1': 29.154379594304938},policy_reward_mean={'policy_0': 27.078909308402547, 'policy_1': 26.819295551863835},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.32789223033043285, 'mean_raw_obs_processing_ms': 0.5409344880347227, 'mean_inference_ms': 4.73192470514, 'mean_action_processing_ms': 0.1852783890233001},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 17.577, 'learn_throughput': 1820.607, 'update_time_ms': 3.741},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'grad_gnorm': array(0.03786084, dtype=float32), 'cur_lr': 0.15, 'mean_q': 8.09105396270752, 'min_q': 7.987760066986084, 'max_q': 8.21635913848877, 'm

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQN_diffdemand_a2222_00000,TERMINATED,,20,38.6169,20000,53.8982,56.7117,51.0318,100


e call to include dim=X as an argument.
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92381)[0m   z = torch.range(
[2m[36m(pid=92381)[0m   support_prob_per_action = nn.functional.softmax(


is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m

se its behavior is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = to

is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m

ase because its behavior is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m

ts behavior is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.

ause its behavior is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = 

cause its behavior is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z =

 because its behavior is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   

 because its behavior is inconsistent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   

ent with Python's range builtin. Instead, use torch.arange, which produces values in [start, end).
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92388)[0m   z = torch.range(
[2m[36m(pid=92388)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92378)[0m   z = torch.range(
[2m[36m(pid=92378)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92380)[0m   z = torch.range(
[2m[36m(pid=92380)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=92379)[0m   support_prob_per_action = nn.functional.softmax(
[2m[36m(pid=92379)[0m   z = torch.range(
[2m[36m(pid=923

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DQN_diffdemand_c272e_00000,TERMINATED,,20,54.3106,20000,52.3789,55.2743,49.7107,100


In [11]:
# Policy gradient methods: PG, A2C, A3C, PPO, APPO
#
# Each selected algorithm is trained with the shared `training_config`, the
# shared `stop` criterion, and its own MLflow experiment named
# `exp_label + <algorithm>`.

# algo_list=["PG", "A2C", "A3C", "PPO", "APPO"]
algo_list = ["PG", "PPO"]
for algo_name in algo_list:
    exp_name = f"{exp_label}{algo_name}"
    mlflow_callback = MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)
    results = tune.run(
        algo_name,
        name=exp_name,
        config=training_config,
        #checkpoint_freq=250,
        checkpoint_at_end=True,
        stop=stop,
        callbacks=[mlflow_callback],
        verbose=verbosity,
    )


tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
[2m[36m(pid=92492)[0m Instructions for updating:
[2m[36m(pid=92492)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92492)[0m Instructions for updating:
[2m[36m(pid=92492)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92493)[0m Instructions for updating:
[2m[36m(pid=92493)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92493)[0m Instructions for updating:
[2m[36m(pid=92493)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92495)[0m Instructions for updating:
[2m[36m(pid=92495)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92495)[0m Instructions for updating:
[2m[36m(pid=92495)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92494)[0m Instructions for updating:
[2m[36m(pid=92494)[0m non-resource variabl

Trial PG_diffdemand_eaa26_00000 reported episode_reward_max=56.87811984250468,episode_reward_min=51.67661881624113,episode_reward_mean=54.17148503051676,episode_len_mean=100.0,episodes_this_iter=20,policy_reward_min={'policy_0': 24.215886870482993, 'policy_1': 25.128520037088308},policy_reward_max={'policy_0': 29.44089808723671, 'policy_1': 30.8025306474089},policy_reward_mean={'policy_0': 26.98991681673382, 'policy_1': 27.181568213782935},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.4453855757161696, 'mean_raw_obs_processing_ms': 0.5048339956004466, 'mean_inference_ms': 4.588108935994122, 'mean_action_processing_ms': 0.24583846360437978},off_policy_estimator={},num_healthy_workers=10,timers={'sample_time_ms': 1278.729, 'sample_throughput': 1564.053, 'learn_time_ms': 99.82, 'learn_throughput': 20036.114, 'update_time_ms': 6.388},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'policy_loss': 71.7271728515625}, 'policy_1': {'allreduce_latency': 0.0, 'policy_loss': 70.18

Trial PG_diffdemand_eaa26_00000 reported episode_reward_max=56.274615806318764,episode_reward_min=51.78314819134301,episode_reward_mean=54.09769157270872,episode_len_mean=100.0,episodes_this_iter=20,policy_reward_min={'policy_0': 24.215886870482993, 'policy_1': 24.865572338239883},policy_reward_max={'policy_0': 29.314868967260576, 'policy_1': 30.8025306474089},policy_reward_mean={'policy_0': 27.085092779058723, 'policy_1': 27.01259879364999},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.47073790471799093, 'mean_raw_obs_processing_ms': 0.5384055599096483, 'mean_inference_ms': 4.875429438301872, 'mean_action_processing_ms': 0.25755846380460995},off_policy_estimator={},num_healthy_workers=10,timers={'sample_time_ms': 1337.472, 'sample_throughput': 1495.358, 'learn_time_ms': 106.131, 'learn_throughput': 18844.706, 'update_time_ms': 6.804},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'policy_loss': 139.2841033935547}, 'policy_1': {'allreduce_latency': 0.0, 'policy_loss':

Trial PG_diffdemand_eaa26_00000 reported episode_reward_max=56.274615806318764,episode_reward_min=51.78314819134301,episode_reward_mean=54.14683575455469,episode_len_mean=100.0,episodes_this_iter=20,policy_reward_min={'policy_0': 24.752288223944895, 'policy_1': 24.214876028182054},policy_reward_max={'policy_0': 30.7863379281433, 'policy_1': 29.139651515694126},policy_reward_mean={'policy_0': 27.44229066420881, 'policy_1': 26.70454509034588},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.48731098535735595, 'mean_raw_obs_processing_ms': 0.562412989274758, 'mean_inference_ms': 5.0664008746617375, 'mean_action_processing_ms': 0.2664874402929105},off_policy_estimator={},num_healthy_workers=10,timers={'sample_time_ms': 1378.148, 'sample_throughput': 1451.223, 'learn_time_ms': 115.715, 'learn_throughput': 17283.819, 'update_time_ms': 6.526},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'policy_loss': 238.1383056640625}, 'policy_1': {'allreduce_latency': 0.0, 'policy_loss': 6

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PG_diffdemand_eaa26_00000,TERMINATED,,10,15.1349,20000,54.1468,56.2746,51.7831,100


[36m(pid=92571)[0m Instructions for updating:
[2m[36m(pid=92571)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92572)[0m Instructions for updating:
[2m[36m(pid=92572)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92572)[0m Instructions for updating:
[2m[36m(pid=92572)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92573)[0m Instructions for updating:
[2m[36m(pid=92573)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92573)[0m Instructions for updating:
[2m[36m(pid=92573)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92574)[0m Instructions for updating:
[2m[36m(pid=92574)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92574)[0m Instructions for updating:
[2m[36m(pid=92574)[0m non-resource variables are not supported in the long term
[2m[36m(pid=92576)[0m Instructions for updating:
[2m[36

Trial PPO_diffdemand_fea7d_00000 reported episode_reward_max=56.18908721681132,episode_reward_min=51.60864418261823,episode_reward_mean=54.118034302986,episode_len_mean=100.0,episodes_this_iter=40,policy_reward_min={'policy_0': 24.743845635805606, 'policy_1': 24.568367919442},policy_reward_max={'policy_0': 29.063576027136055, 'policy_1': 28.910377676977422},policy_reward_mean={'policy_0': 27.046792012152356, 'policy_1': 27.071242290833645},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3558622236460053, 'mean_raw_obs_processing_ms': 0.44937971651098163, 'mean_inference_ms': 4.643043900569673, 'mean_action_processing_ms': 0.1885209045968464},off_policy_estimator={},num_healthy_workers=10,timers={'sample_time_ms': 2149.444, 'sample_throughput': 1860.946, 'learn_time_ms': 20909.811, 'learn_throughput': 191.298, 'update_time_ms': 2.795},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.30000000000000004, 'cur_lr': 0.15, 'total_loss': inf, 'policy_loss': 0.27

Trial PPO_diffdemand_fea7d_00000 reported episode_reward_max=57.167392650101775,episode_reward_min=51.12231804601898,episode_reward_mean=53.98373507607108,episode_len_mean=100.0,episodes_this_iter=40,policy_reward_min={'policy_0': 24.185918819608695, 'policy_1': 24.568367919442},policy_reward_max={'policy_0': 29.85594849525549, 'policy_1': 29.314852406282967},policy_reward_mean={'policy_0': 26.791111185732134, 'policy_1': 27.19262389033894},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3132450586248297, 'mean_raw_obs_processing_ms': 0.3957716956568019, 'mean_inference_ms': 4.084008037101896, 'mean_action_processing_ms': 0.16586263376742727},off_policy_estimator={},num_healthy_workers=10,timers={'sample_time_ms': 1834.716, 'sample_throughput': 2180.174, 'learn_time_ms': 20156.542, 'learn_throughput': 198.447, 'update_time_ms': 2.773},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.45000000000000007, 'cur_lr': 0.15, 'total_loss': inf, 'policy_loss': 0.2

Trial PPO_diffdemand_fea7d_00000 reported episode_reward_max=57.167392650101775,episode_reward_min=50.66014037770275,episode_reward_mean=53.77219008659806,episode_len_mean=100.0,episodes_this_iter=40,policy_reward_min={'policy_0': 24.185918819608695, 'policy_1': 24.568367919442},policy_reward_max={'policy_0': 29.85594849525549, 'policy_1': 29.314852406282967},policy_reward_mean={'policy_0': 26.445710848328396, 'policy_1': 27.32647923826966},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.26946607552876506, 'mean_raw_obs_processing_ms': 0.3422325056800423, 'mean_inference_ms': 3.5115468373732983, 'mean_action_processing_ms': 0.14234878426407493},off_policy_estimator={},num_healthy_workers=10,timers={'sample_time_ms': 1670.118, 'sample_throughput': 2395.04, 'learn_time_ms': 20644.92, 'learn_throughput': 193.752, 'update_time_ms': 2.763},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.675, 'cur_lr': 0.15, 'total_loss': 1.7478932263329625, 'policy_loss': 0.

Trial PPO_diffdemand_fea7d_00000 reported episode_reward_max=57.167392650101775,episode_reward_min=50.66014037770275,episode_reward_mean=53.4878002718237,episode_len_mean=100.0,episodes_this_iter=40,policy_reward_min={'policy_0': 21.776290429158294, 'policy_1': 24.573441270717304},policy_reward_max={'policy_0': 29.85594849525549, 'policy_1': 29.81121800498625},policy_reward_mean={'policy_0': 26.05488076095648, 'policy_1': 27.432919510867226},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2497503433265588, 'mean_raw_obs_processing_ms': 0.31911540247014486, 'mean_inference_ms': 3.2857669847915245, 'mean_action_processing_ms': 0.13176612192065573},off_policy_estimator={},num_healthy_workers=10,timers={'sample_time_ms': 1670.963, 'sample_throughput': 2393.83, 'learn_time_ms': 22117.438, 'learn_throughput': 180.853, 'update_time_ms': 3.046},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'cur_kl_coeff': 0.3375, 'cur_lr': 0.15, 'total_loss': 2.1918684523552656, 'policy_loss':

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_diffdemand_fea7d_00000,TERMINATED,,5,118.989,20000,53.4878,57.1674,50.6601,100


In [12]:
# "Mixed" algorithm group: SAC and IMPALA.
# (presumably "mixed" = usable with either action-space type — confirm)
#
# Every algorithm is launched as its own Tune experiment. The experiment name
# is the shared experiment label followed directly by the algorithm name, and
# the same name is used for the MLflow experiment that receives the logs.

algo_list = ["SAC", "IMPALA"]
for algo_name in algo_list:
    exp_name = f"{exp_label}{algo_name}"
    mlflow_logger = MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)
    results = tune.run(
        algo_name,
        name=exp_name,
        config=training_config,
        # Periodic checkpoints are switched off (checkpoint_freq=250 was used
        # before); only the final state of each trial is checkpointed.
        checkpoint_at_end=True,
        stop=stop,
        callbacks=[mlflow_logger],
        verbose=verbosity,
    )

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.223414145087396,episode_reward_min=52.59753158053962,episode_reward_mean=54.032624570937756,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 25.913256888262424},policy_reward_max={'policy_0': 27.828359061891703, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.673963307714615, 'policy_1': 27.35866126322312},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.33453758189220306, 'mean_raw_obs_processing_ms': 1.0815993050076314, 'mean_inference_ms': 4.800894876189578, 'mean_action_processing_ms': 0.1838613030136816},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 57.395, 'learn_throughput': 4460.32, 'update_time_ms': 5.378},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([2.6024063, 2.7778525, 2.756106 , 2.8482442, 2.814815 , 2.7552133,
       2.7299361, 2.9125924, 2.7489822, 2.9064794, 2.7639098, 2.9

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.575183122390555,episode_reward_min=52.391939874876066,episode_reward_mean=54.27257206398218,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 25.913256888262424},policy_reward_max={'policy_0': 28.671747207665536, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.864750615277046, 'policy_1': 27.40782144870513},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3304556222494282, 'mean_raw_obs_processing_ms': 1.0601142151607807, 'mean_inference_ms': 4.556040440079769, 'mean_action_processing_ms': 0.18193167038056468},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 34.315, 'learn_throughput': 7460.214, 'update_time_ms': 4.548},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.09901857, 0.0873909 , 0.1596272 , 0.19045389, 0.09468174,
       0.08579075, 0.10069013, 0.15731788, 0.05932772, 0.04298234,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.575183122390555,episode_reward_min=52.391939874876066,episode_reward_mean=54.13910045717052,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 25.913256888262424},policy_reward_max={'policy_0': 28.671747207665536, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.72876307471733, 'policy_1': 27.41033738245317},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.32533209082356496, 'mean_raw_obs_processing_ms': 1.0403998982103047, 'mean_inference_ms': 4.380468241436147, 'mean_action_processing_ms': 0.17952118989258106},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 31.333, 'learn_throughput': 8170.388, 'update_time_ms': 4.082},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.02505338, 0.03147662, 0.00974441, 0.10991204, 0.03075695,
       0.08248162, 0.07839036, 0.05248988, 0.2226845 , 0.07239628,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.665607886747594,episode_reward_min=52.391939874876066,episode_reward_mean=54.14437523427089,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.886472699970263, 'policy_1': 27.257902534300626},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.31992371573860856, 'mean_raw_obs_processing_ms': 1.0209530161329938, 'mean_inference_ms': 4.2403296101059444, 'mean_action_processing_ms': 0.17680026209323874},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 31.387, 'learn_throughput': 8156.263, 'update_time_ms': 4.359},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.13914514, 0.11717105, 0.05945349, 0.02869225, 0.03305221,
       0.06176639, 0.06476355, 0.08635569, 0.07463455, 0.06279731,


Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.665607886747594,episode_reward_min=52.391939874876066,episode_reward_mean=54.08506074336616,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.912889870164534, 'policy_1': 27.172170873201615},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3151777768213432, 'mean_raw_obs_processing_ms': 1.0045095755193376, 'mean_inference_ms': 4.127350477016877, 'mean_action_processing_ms': 0.1743458938670615},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 29.845, 'learn_throughput': 8577.555, 'update_time_ms': 4.045},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.02538395, 0.10494709, 0.05487061, 0.03742647, 0.03369951,
       0.0434525 , 0.04732823, 0.05226421, 0.03351331, 0.04197621,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.665607886747594,episode_reward_min=51.93865130896431,episode_reward_mean=53.980391141501244,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.83160401163763, 'policy_1': 27.1487871298636},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.31111320590197594, 'mean_raw_obs_processing_ms': 0.9906287882569835, 'mean_inference_ms': 4.034821449539909, 'mean_action_processing_ms': 0.17216177876022945},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 29.947, 'learn_throughput': 8548.403, 'update_time_ms': 3.971},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.15076089, 0.11514115, 0.04794264, 0.14049268, 0.12056994,
       0.08373928, 0.12531686, 0.12769318, 0.02015471, 0.15738821,
    

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.665607886747594,episode_reward_min=51.93865130896431,episode_reward_mean=54.04105062579884,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.93393024931546, 'policy_1': 27.10712037648337},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.30765799476673017, 'mean_raw_obs_processing_ms': 0.9788051943505514, 'mean_inference_ms': 3.957290481581363, 'mean_action_processing_ms': 0.17029642541208356},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 31.371, 'learn_throughput': 8160.304, 'update_time_ms': 4.891},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.06060123, 0.09696674, 0.0920198 , 0.00804353, 0.11097074,
       0.03173256, 0.06104517, 0.13414478, 0.0055778 , 0.12855577,
    

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.665607886747594,episode_reward_min=51.93865130896431,episode_reward_mean=54.0334928100429,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.923235604689012, 'policy_1': 27.110257205353868},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3046341168157266, 'mean_raw_obs_processing_ms': 0.9684073508279827, 'mean_inference_ms': 3.8916034706408382, 'mean_action_processing_ms': 0.16865932863282135},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 32.508, 'learn_throughput': 7874.877, 'update_time_ms': 4.479},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.08401155, 0.02666211, 0.05248189, 0.06168342, 0.03949142,
       0.07777286, 0.09406185, 0.09166431, 0.12928963, 0.02579284,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.74571268633845,episode_reward_min=51.93865130896431,episode_reward_mean=54.0426290784243,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.945656251659376, 'policy_1': 27.09697282676491},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.3019698116723966, 'mean_raw_obs_processing_ms': 0.9593957447444789, 'mean_inference_ms': 3.835292635468724, 'mean_action_processing_ms': 0.16722366436466404},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 32.153, 'learn_throughput': 7961.947, 'update_time_ms': 4.296},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.05633497, 0.14397097, 0.09490538, 0.03333187, 0.02543926,
       0.02774048, 0.11752796, 0.16156912, 0.15562963, 0.05446148,
      

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.74571268633845,episode_reward_min=51.93865130896431,episode_reward_mean=54.044903260634044,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 24.53375788544306, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.1309050083817},policy_reward_mean={'policy_0': 26.922609136863045, 'policy_1': 27.122294123770995},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2996499580282434, 'mean_raw_obs_processing_ms': 0.9515369471306974, 'mean_inference_ms': 3.787420160926597, 'mean_action_processing_ms': 0.16596245695285816},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 32.424, 'learn_throughput': 7895.335, 'update_time_ms': 4.117},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.1075387 , 0.01884604, 0.155869  , 0.19268417, 0.03319788,
       0.04680204, 0.0700531 , 0.07933569, 0.05153418, 0.03721952,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.74571268633845,episode_reward_min=51.93865130896431,episode_reward_mean=54.03533211252712,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 26.913086616906465, 'policy_1': 27.122245495620643},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2939769957442217, 'mean_raw_obs_processing_ms': 0.9312216999647224, 'mean_inference_ms': 3.6403686442043424, 'mean_action_processing_ms': 0.16296658771412534},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 32.687, 'learn_throughput': 7831.975, 'update_time_ms': 4.236},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.10980177, 0.01986074, 0.06083155, 0.07426357, 0.01595736,
       0.09258604, 0.16934347, 0.07710981, 0.02520132, 0.09772921,


Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.74571268633845,episode_reward_min=51.93865130896431,episode_reward_mean=54.03055168967935,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 26.96121062243856, 'policy_1': 27.06934106724079},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2891025182021225, 'mean_raw_obs_processing_ms': 0.9153822140594601, 'mean_inference_ms': 3.541032816289312, 'mean_action_processing_ms': 0.16032828379632605},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 33.464, 'learn_throughput': 7650.082, 'update_time_ms': 4.209},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.01131773, 0.05713224, 0.00835943, 0.14782763, 0.11958456,
       0.0559597 , 0.14670038, 0.10545492, 0.05909681, 0.04126263,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.74571268633845,episode_reward_min=51.93865130896431,episode_reward_mean=54.02503185464976,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.027579683337688},policy_reward_max={'policy_0': 29.385364331454493, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 27.00418139849303, 'policy_1': 27.02085045615671},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2853339553169343, 'mean_raw_obs_processing_ms': 0.903454824723655, 'mean_inference_ms': 3.4689994918675087, 'mean_action_processing_ms': 0.15820257005852859},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 34.453, 'learn_throughput': 7430.472, 'update_time_ms': 4.637},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.06101847, 0.09063005, 0.02331448, 0.12389135, 0.12953758,
       0.12727118, 0.02022171, 0.05498457, 0.06428194, 0.05122805,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.74571268633845,episode_reward_min=51.93865130896431,episode_reward_mean=53.99422949699396,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.323098720744074},policy_reward_max={'policy_0': 29.239676245636428, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 26.978832171280327, 'policy_1': 27.015397325713625},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2828300292931396, 'mean_raw_obs_processing_ms': 0.8959549367733958, 'mean_inference_ms': 3.4188122860073933, 'mean_action_processing_ms': 0.15674650537944584},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 35.995, 'learn_throughput': 7112.098, 'update_time_ms': 4.52},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.13705349, 0.07343674, 0.05089951, 0.1270895 , 0.02918243,
       0.09737539, 0.05571222, 0.19510841, 0.05270624, 0.03286791,
 

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=55.74571268633845,episode_reward_min=51.93865130896431,episode_reward_mean=53.99692115091253,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.14346105532576},policy_reward_max={'policy_0': 28.914254395779054, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 27.006713477228182, 'policy_1': 26.990207673684335},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.28170472330302904, 'mean_raw_obs_processing_ms': 0.89347729311784, 'mean_inference_ms': 3.389777799095562, 'mean_action_processing_ms': 0.15605653009475995},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 54.471, 'learn_throughput': 4699.757, 'update_time_ms': 8.374},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.12409401, 0.05384731, 0.09902477, 0.12409401, 0.01889277,
       0.13877726, 0.05130482, 0.12936401, 0.03623533, 0.05518961,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=56.16971294857838,episode_reward_min=52.211156840716136,episode_reward_mean=54.08205784182822,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.14346105532576},policy_reward_max={'policy_0': 28.97552966769836, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 27.09566418440724, 'policy_1': 26.98639365742097},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2824175467139101, 'mean_raw_obs_processing_ms': 0.8977016561552807, 'mean_inference_ms': 3.3857744190881136, 'mean_action_processing_ms': 0.1563998900532874},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 58.226, 'learn_throughput': 4396.671, 'update_time_ms': 7.959},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.1047368 , 0.01064777, 0.09462357, 0.0254097 , 0.00554609,
       0.06837463, 0.01323032, 0.00818491, 0.21554232, 0.10912561,
    

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=56.16971294857838,episode_reward_min=52.143354545156114,episode_reward_mean=54.04883141705057,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.14346105532576},policy_reward_max={'policy_0': 28.97552966769836, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 27.071633349150183, 'policy_1': 26.97719806790038},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2845165639670467, 'mean_raw_obs_processing_ms': 0.9069106294823022, 'mean_inference_ms': 3.4010478863221625, 'mean_action_processing_ms': 0.15751137780416444},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 54.409, 'learn_throughput': 4705.067, 'update_time_ms': 6.838},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.130445  , 0.06190348, 0.07477808, 0.06035376, 0.05180502,
       0.00734901, 0.14519882, 0.02522182, 0.09123707, 0.10977888,
  

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=56.16971294857838,episode_reward_min=52.143354545156114,episode_reward_mean=54.04235080643612,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.14346105532576},policy_reward_max={'policy_0': 28.97552966769836, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 27.08034945081097, 'policy_1': 26.962001355625144},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.28736586111444895, 'mean_raw_obs_processing_ms': 0.9187439359584375, 'mean_inference_ms': 3.426136534881609, 'mean_action_processing_ms': 0.15901685104598554},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 42.058, 'learn_throughput': 6086.82, 'update_time_ms': 5.347},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.10442543, 0.06721592, 0.09706545, 0.07327509, 0.00814915,
       0.07320738, 0.01282024, 0.0174737 , 0.01063442, 0.21013641,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=56.16971294857838,episode_reward_min=51.96061778905331,episode_reward_mean=54.00138297138012,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.14346105532576},policy_reward_max={'policy_0': 28.97552966769836, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 26.988908816296515, 'policy_1': 27.012474155083606},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.29038384141508167, 'mean_raw_obs_processing_ms': 0.9310432198470696, 'mean_inference_ms': 3.454361428716601, 'mean_action_processing_ms': 0.1606046798576553},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 32.801, 'learn_throughput': 7804.752, 'update_time_ms': 3.974},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.01508474, 0.06721973, 0.0146389 , 0.1015048 , 0.12068653,
       0.10523033, 0.12428904, 0.02371883, 0.06988239, 0.0306797 ,
   

Trial SAC_diffdemand_5155b_00000 reported episode_reward_max=56.16971294857838,episode_reward_min=51.96061778905331,episode_reward_mean=54.00863748182219,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 23.635681222981923, 'policy_1': 24.14346105532576},policy_reward_max={'policy_0': 28.97552966769836, 'policy_1': 30.785241210711774},policy_reward_mean={'policy_0': 26.997462009378136, 'policy_1': 27.01117547244405},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2932574772443507, 'mean_raw_obs_processing_ms': 0.9428099666445194, 'mean_inference_ms': 3.4814020150498544, 'mean_action_processing_ms': 0.1621193014614093},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 28.771, 'learn_throughput': 8897.857, 'update_time_ms': 3.54},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.20400143, 0.02300262, 0.02227783, 0.0663929 , 0.08792162,
       0.0708456 , 0.02066755, 0.16983843, 0.10351276, 0.12308359,
     

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_diffdemand_5155b_00000,TERMINATED,,20,176.474,20500,54.0086,56.1697,51.9606,100


ported in the long term
[2m[36m(pid=93050)[0m Instructions for updating:
[2m[36m(pid=93050)[0m non-resource variables are not supported in the long term
[2m[36m(pid=93050)[0m Instructions for updating:
[2m[36m(pid=93050)[0m non-resource variables are not supported in the long term
[2m[36m(pid=93051)[0m Instructions for updating:
[2m[36m(pid=93051)[0m non-resource variables are not supported in the long term
[2m[36m(pid=93051)[0m Instructions for updating:
[2m[36m(pid=93051)[0m non-resource variables are not supported in the long term
[2m[36m(pid=93049)[0m Instructions for updating:
[2m[36m(pid=93049)[0m non-resource variables are not supported in the long term
[2m[36m(pid=93049)[0m Instructions for updating:
[2m[36m(pid=93049)[0m non-resource variables are not supported in the long term
[2m[36m(pid=93054)[0m Instructions for updating:
[2m[36m(pid=93054)[0m non-resource variables are not supported in the long term
[2m[36m(pid=93054)[0m Instru

Trial IMPALA_diffdemand_c30a2_00000 reported episode_reward_max=56.95225260440203,episode_reward_min=51.341387656588275,episode_reward_mean=54.18095969886395,episode_len_mean=100.0,episodes_this_iter=269,policy_reward_min={'policy_0': 24.625006323530005, 'policy_1': 23.675805086061146},policy_reward_max={'policy_0': 30.02554749121364, 'policy_1': 29.00262820821685},policy_reward_mean={'policy_0': 27.168781767690827, 'policy_1': 27.012177931173117},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2770818503895667, 'mean_raw_obs_processing_ms': 0.3508450224630989, 'mean_inference_ms': 2.873557598560234, 'mean_action_processing_ms': 0.14571034711722666},off_policy_estimator={},num_healthy_workers=10,timers={'sample_time_ms': 196.99, 'sample_throughput': 2538.198},info={'num_steps_sampled': 28050, 'num_steps_trained': 27500, 'num_weight_broadcasts': 52, 'learner_queue': {'size_count': 56, 'size_mean': 0.06, 'size_std': 0.23748684174075835, 'size_quantiles': [0.0, 0.0, 0.0, 0.0, 1.0]},

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
IMPALA_diffdemand_c30a2_00000,TERMINATED,,2,10.8159,28050,54.181,56.9523,51.3414,100


### Continuous Space


In [13]:
# Build the training / environment configuration for the continuous action space.
#
# Produces (as notebook globals):
#   exploration_config_cont - exploration settings used for the continuous runs
#   env_config_cont         - env_config with mkt_config["space_type"] = "Continuous"
#   training_config_cont    - training_config pointing at the continuous env/policies
#   env                     - a DiffDemand instance built from env_config_cont
import copy

# DDPG uses its own exploration config.
# See the exploration implementation in
# https://github.com/ray-project/ray/blob/master/rllib/utils/exploration/ornstein_uhlenbeck_noise.py
exploration_config_cont = {
    # DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output
    # actions (after a possible pure random phase of n timesteps).
    "type": "OrnsteinUhlenbeckNoise",
    "final_scale": 0.02,
    "scale_timesteps": 100000,
}

# Deep copies are required here: dict.copy() is shallow, so the nested
# "mkt_config" and "multiagent" dicts would still be shared with the original
# env_config / training_config. The assignments below would then silently
# switch the discrete-space configuration to continuous as well, breaking any
# re-run of the earlier cells.
training_config_cont = copy.deepcopy(training_config)
env_config_cont = copy.deepcopy(env_config)
training_config_cont["exploration_config"] = exploration_config_cont
env_config_cont["mkt_config"]["space_type"] = "Continuous"

# Instantiate the environment once to read its per-agent observation and
# action spaces, which the policy specs below need.
env = DiffDemand(env_config_cont)
training_config_cont["env_config"] = env_config_cont
training_config_cont["multiagent"]["policies"] = {
    policy_ids[i]: (
        None,
        env.observation_space[f"agent_{i}"],
        env.action_space[f"agent_{i}"],
        {},
    )
    for i in range(env.n_agents)
}

# Show the resulting configuration and action spaces for a visual sanity check.
print(training_config_cont)
print(env.action_space)


{'gamma': 0.95, 'lr': 0.15, 'env': 'diffdemand', 'exploration_config': {'type': 'OrnsteinUhlenbeckNoise', 'final_scale': 0.02, 'scale_timesteps': 100000}, 'env_config': {'mkt_config': {'lower_price': [1.3699999999999999, 1.3699999999999999], 'higher_price': [2.03, 2.03], 'parameteres': {'cost': [1, 1], 'values': [2, 2], 'ext_demand': 0, 'substitution': 0.25}, 'space_type': 'Continuous', 'gridpoints': 16}}, 'horizon': 100, 'soft_horizon': True, 'no_done_at_end': True, 'multiagent': {'policies': {'policy_0': (None, Box(1.3699999999999999, 2.03, (2,), float64), Box(1.3699999999999999, 2.03, (1,), float64), {}), 'policy_1': (None, Box(1.3699999999999999, 2.03, (2,), float64), Box(1.3699999999999999, 2.03, (1,), float64), {})}, 'policy_mapping_fn': <function <lambda> at 0x1a092b550>}, 'framework': 'torch', 'num_workers': 10, 'num_gpus': 0, 'timesteps_per_iteration': 1000, 'normalize_actions': False}
{'agent_0': Box(1.3699999999999999, 2.03, (1,), float64), 'agent_1': Box(1.3699999999999999,

In [16]:
# Continuous action space algorithms (DDPG, TD3, SAC).
# Each algorithm is trained with the same `training_config_cont` and logged
# to its own MLflow experiment, named after the algorithm.

algo_list = ["DDPG", "TD3", "SAC"]
# Keep every run's result keyed by algorithm name; previously only the last
# run survived because `results` was overwritten on each iteration.
results_by_algo = {}
for algo in algo_list:
    exp_name = exp_label + "_cont_" + algo
    results = tune.run(
        algo,
        name=exp_name,
        config=training_config_cont,
        #checkpoint_freq=250,
        checkpoint_at_end=True,
        stop=stop,
        callbacks=[MLflowLoggerCallback(experiment_name=exp_name, save_artifact=True)],
        verbose=verbosity
    )
    # `results` stays bound to the most recent run, as before.
    results_by_algo[algo] = results

[2m[36m(pid=93503)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93503)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93501)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93501)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93506)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93506)[0m   torch.from_numpy(self.action_space.low).float())
Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=55.915074463545736,episode_reward_min=52.46684292109934,episode_reward_mean=54.56381894687225,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 26.339421651271653, 'policy_1': 25.35429049001401},policy_reward_max={'policy_0': 28.73453546005024, 'policy_1': 28.834247803932303},policy_reward_mean={'policy_0': 27.135140464568774, 'policy_1': 27.428678482303475},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16801278322737737, 'mean_raw_obs_processi

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=52.46684292109934,episode_reward_mean=57.326790616273605,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 26.339421651271653, 'policy_1': 25.35429049001401},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 31.680466056709182},policy_reward_mean={'policy_0': 28.25202278478015, 'policy_1': 29.074767831493496},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.1675674492021824, 'mean_raw_obs_processing_ms': 0.4473235244747957, 'mean_inference_ms': 1.8842535507571732, 'mean_action_processing_ms': 0.16167683134745528},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 23.923, 'learn_throughput': 10701.184, 'update_time_ms': 2.44},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.32419735193252563, 'critic_loss': 0.0006890440708957613, 'mean_q': 0.33072876930236816, 'max_q': 0.3812454342842102, 'min_q': 0.

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=48.0637188349328,episode_reward_mean=54.4413100569494,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 14.469945365393617, 'policy_1': 25.35429049001401},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 23.735443997803532, 'policy_1': 30.705866059145887},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16687435604927642, 'mean_raw_obs_processing_ms': 0.4449957377765957, 'mean_inference_ms': 1.8652855971551052, 'mean_action_processing_ms': 0.16043659172629268},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 23.668, 'learn_throughput': 10816.443, 'update_time_ms': 2.372},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.346090704202652, 'critic_loss': 0.0004901136853732169, 'mean_q': 0.35927969217300415, 'max_q': 0.4919116795063019, 'min_q': 0.21

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=37.76736021593465,episode_reward_mean=50.39035308466814,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 25.35429049001401},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 20.870171775190816, 'policy_1': 29.520181309477344},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16708133055317237, 'mean_raw_obs_processing_ms': 0.44508611233124834, 'mean_inference_ms': 1.8613869441144264, 'mean_action_processing_ms': 0.16030982981071937},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 23.716, 'learn_throughput': 10794.293, 'update_time_ms': 2.375},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.4291670024394989, 'critic_loss': 0.0005613394896499813, 'mean_q': 0.39633363485336304, 'max_q': 0.5289214253425598, 'min_q': 

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=36.47422436342299,episode_reward_mean=47.6842523234848,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.394771361195616},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 20.3548167634982, 'policy_1': 27.329435559986614},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.1670234583522484, 'mean_raw_obs_processing_ms': 0.44434684337116365, 'mean_inference_ms': 1.8562907568871176, 'mean_action_processing_ms': 0.1599823373245354},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 23.562, 'learn_throughput': 10865.061, 'update_time_ms': 2.343},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.4752320945262909, 'critic_loss': 0.0005116881220601499, 'mean_q': 0.4554104804992676, 'max_q': 0.5822457075119019, 'min_q': 0.281

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=36.47422436342299,episode_reward_mean=46.460266421762725,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.394771361195616},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 19.927982513337952, 'policy_1': 26.53228390842477},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16685771893578905, 'mean_raw_obs_processing_ms': 0.44339145639675154, 'mean_inference_ms': 1.8502612094268474, 'mean_action_processing_ms': 0.15952118251475103},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 23.986, 'learn_throughput': 10672.964, 'update_time_ms': 2.43},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.5105652809143066, 'critic_loss': 0.0004562211106531322, 'mean_q': 0.4932008981704712, 'max_q': 0.6365997791290283, 'min_q': 0

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=36.47422436342299,episode_reward_mean=45.6688143478525,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.394771361195616},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 19.582937858292315, 'policy_1': 26.085876489560196},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16668809795064124, 'mean_raw_obs_processing_ms': 0.44239331667940845, 'mean_inference_ms': 1.8447597063015344, 'mean_action_processing_ms': 0.159060630077325},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 24.261, 'learn_throughput': 10551.708, 'update_time_ms': 2.368},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.5639980435371399, 'critic_loss': 0.0004962100647389889, 'mean_q': 0.5445050597190857, 'max_q': 0.6879479885101318, 'min_q': 0.3

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=36.15271004135815,episode_reward_mean=44.516825460125894,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 19.40983575365738, 'policy_1': 25.106989706468518},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16654646816927338, 'mean_raw_obs_processing_ms': 0.44138976703408117, 'mean_inference_ms': 1.8394844134680182, 'mean_action_processing_ms': 0.1586316520845955},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 26.295, 'learn_throughput': 9735.843, 'update_time_ms': 2.362},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.5827307105064392, 'critic_loss': 0.0005447074654512107, 'mean_q': 0.567003071308136, 'max_q': 0.7304133176803589, 'min_q': 0.3

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=36.15271004135815,episode_reward_mean=44.0572848605343,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 19.12951918994646, 'policy_1': 24.927765670587856},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.166408524726503, 'mean_raw_obs_processing_ms': 0.4404860927895198, 'mean_inference_ms': 1.8346306260131746, 'mean_action_processing_ms': 0.15823589079254205},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 26.603, 'learn_throughput': 9622.817, 'update_time_ms': 2.378},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.6332108974456787, 'critic_loss': 0.0005167944473214447, 'mean_q': 0.6203120946884155, 'max_q': 0.7958001494407654, 'min_q': 0.4577

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=36.15271004135815,episode_reward_mean=43.59602896259908,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 18.772614091111226, 'policy_1': 24.82341487148785},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16628709218075102, 'mean_raw_obs_processing_ms': 0.4396598001762897, 'mean_inference_ms': 1.830258015651441, 'mean_action_processing_ms': 0.1578620411458723},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 26.756, 'learn_throughput': 9567.896, 'update_time_ms': 2.366},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.6406006217002869, 'critic_loss': 0.0005850872257724404, 'mean_q': 0.6255108714103699, 'max_q': 0.8353226184844971, 'min_q': 0.454

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=60.491620774571025,episode_reward_min=36.15271004135815,episode_reward_mean=42.18452375598365,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 30.260832257976894, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 17.734195137319833, 'policy_1': 24.450328618663825},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16601180491835152, 'mean_raw_obs_processing_ms': 0.43795608178717044, 'mean_inference_ms': 1.8182403299975038, 'mean_action_processing_ms': 0.15697650901884605},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 28.665, 'learn_throughput': 8930.605, 'update_time_ms': 2.476},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.665917158126831, 'critic_loss': 0.0004409344692248851, 'mean_q': 0.6600766777992249, 'max_q': 0.8910689353942871, 'min_q': 0.

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=49.15935751168885,episode_reward_min=36.15271004135815,episode_reward_mean=40.280840926005155,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 18.569562750236397, 'policy_1': 34.479549199199774},policy_reward_mean={'policy_0': 16.529669052967225, 'policy_1': 23.751171873037936},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.1658165026302271, 'mean_raw_obs_processing_ms': 0.4366338108410372, 'mean_inference_ms': 1.8119136092041128, 'mean_action_processing_ms': 0.15640105265291365},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 26.238, 'learn_throughput': 9756.828, 'update_time_ms': 2.46},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.6986881494522095, 'critic_loss': 0.0006665748660452664, 'mean_q': 0.6913944482803345, 'max_q': 0.9247638583183289, 'min_q': 0.53

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=41.66665647788877,episode_reward_min=36.15271004135815,episode_reward_mean=39.52108525566069,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 12.025090484603604, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 18.569562750236397, 'policy_1': 26.644080990404856},policy_reward_mean={'policy_0': 16.767793821781876, 'policy_1': 22.753291433878807},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16576535483349716, 'mean_raw_obs_processing_ms': 0.43570155161634744, 'mean_inference_ms': 1.808249035982925, 'mean_action_processing_ms': 0.1559999596404979},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 27.343, 'learn_throughput': 9362.449, 'update_time_ms': 3.016},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.7281663417816162, 'critic_loss': 0.0004832445702049881, 'mean_q': 0.7194622755050659, 'max_q': 0.9848381876945496, 'min_q': 0.59

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=41.66665647788877,episode_reward_min=36.15271004135815,episode_reward_mean=39.81687278935012,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 15.427423694530756, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 18.569562750236397, 'policy_1': 24.689466855795516},policy_reward_mean={'policy_0': 17.22033853480863, 'policy_1': 22.596534254541485},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16550168729910772, 'mean_raw_obs_processing_ms': 0.4342978324687758, 'mean_inference_ms': 1.8026311499674663, 'mean_action_processing_ms': 0.15540824077116355},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 26.211, 'learn_throughput': 9767.043, 'update_time_ms': 2.334},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.7800033092498779, 'critic_loss': 0.000642246741335839, 'mean_q': 0.7706338763237, 'max_q': 1.0474261045455933, 'min_q': 0.615565

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=45.88812641566851,episode_reward_min=36.15271004135815,episode_reward_mean=40.68532270448643,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 15.427423694530756, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 22.04745456807332, 'policy_1': 24.689466855795516},policy_reward_mean={'policy_0': 17.563963372257444, 'policy_1': 23.12135933222899},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16532732930126115, 'mean_raw_obs_processing_ms': 0.4332318851754073, 'mean_inference_ms': 1.7984899189679167, 'mean_action_processing_ms': 0.15494611231470592},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 26.397, 'learn_throughput': 9698.067, 'update_time_ms': 2.318},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.8376240730285645, 'critic_loss': 0.0005319328047335148, 'mean_q': 0.8304014205932617, 'max_q': 1.1099238395690918, 'min_q': 0.657

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=66.4699018500622,episode_reward_min=36.15271004135815,episode_reward_mean=43.29235936613177,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 15.427423694530756, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 35.429909188556344, 'policy_1': 32.31272620386608},policy_reward_mean={'policy_0': 19.261652407104766, 'policy_1': 24.030706959027},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.1652258954023627, 'mean_raw_obs_processing_ms': 0.4324220671985814, 'mean_inference_ms': 1.7960795863183519, 'mean_action_processing_ms': 0.15463194685926923},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 26.237, 'learn_throughput': 9757.351, 'update_time_ms': 2.375},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.8772198557853699, 'critic_loss': 0.0005529485642910004, 'mean_q': 0.8686621189117432, 'max_q': 1.157882809638977, 'min_q': 0.66748994

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=66.4699018500622,episode_reward_min=36.15271004135815,episode_reward_mean=45.825258046372994,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 15.427423694530756, 'policy_1': 18.057032812810903},policy_reward_max={'policy_0': 35.429909188556344, 'policy_1': 32.78787765079728},policy_reward_mean={'policy_0': 20.88861026266139, 'policy_1': 24.936647783711614},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.16527560242010783, 'mean_raw_obs_processing_ms': 0.4322074870833273, 'mean_inference_ms': 1.7958989171063338, 'mean_action_processing_ms': 0.15453139228636906},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 31.953, 'learn_throughput': 8011.809, 'update_time_ms': 3.213},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.9353542327880859, 'critic_loss': 0.000604449596721679, 'mean_q': 0.9295046329498291, 'max_q': 1.2192820310592651, 'min_q': 0.7104

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=66.4699018500622,episode_reward_min=39.162379820473966,episode_reward_mean=48.79378727478382,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 15.427423694530756, 'policy_1': 23.2859467173365},policy_reward_max={'policy_0': 35.429909188556344, 'policy_1': 33.445454749789896},policy_reward_mean={'policy_0': 22.37783902022837, 'policy_1': 26.415948254555442},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.165688478106185, 'mean_raw_obs_processing_ms': 0.43329044430073926, 'mean_inference_ms': 1.800306562273161, 'mean_action_processing_ms': 0.1548313228062064},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 36.9, 'learn_throughput': 6937.621, 'update_time_ms': 3.165},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -1.0233067274093628, 'critic_loss': 0.0006054675322957337, 'mean_q': 1.007497787475586, 'max_q': 1.3060365915298462, 'min_q': 0.7428999543

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=66.4699018500622,episode_reward_min=39.162379820473966,episode_reward_mean=51.36773965384444,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 15.427423694530756, 'policy_1': 23.38678028628145},policy_reward_max={'policy_0': 35.429909188556344, 'policy_1': 33.445454749789896},policy_reward_mean={'policy_0': 24.00647471389159, 'policy_1': 27.361264939952857},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.1664023892772791, 'mean_raw_obs_processing_ms': 0.435269275675743, 'mean_inference_ms': 1.8082296174679358, 'mean_action_processing_ms': 0.15544951023382148},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 36.969, 'learn_throughput': 6924.78, 'update_time_ms': 3.215},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -1.0341453552246094, 'critic_loss': 0.0005861598183400929, 'mean_q': 1.0194568634033203, 'max_q': 1.3358553647994995, 'min_q': 0.770114

Trial DDPG_diffdemand_b060d_00000 reported episode_reward_max=66.4699018500622,episode_reward_min=40.043475337221786,episode_reward_mean=54.030472616129124,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 16.538131572038687, 'policy_1': 23.38678028628145},policy_reward_max={'policy_0': 35.429909188556344, 'policy_1': 33.445454749789896},policy_reward_mean={'policy_0': 25.7602908518205, 'policy_1': 28.270181764308653},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.1673555301362332, 'mean_raw_obs_processing_ms': 0.4379920314630813, 'mean_inference_ms': 1.8190822184380933, 'mean_action_processing_ms': 0.1563476619778151},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 34.457, 'learn_throughput': 7429.531, 'update_time_ms': 3.108},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -1.072484016418457, 'critic_loss': 0.0006457503768615425, 'mean_q': 1.053411841392517, 'max_q': 1.3779282569885254, 'min_q': 0.7806329

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
DDPG_diffdemand_b060d_00000,TERMINATED,,20,174.324,20500,54.0305,66.4699,40.0435,100


arning: `framestack` has been deprecated. Use `num_framestacks (int)` instead. This will raise an error in the future!
[2m[36m(pid=93845)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93845)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93851)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93851)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93848)[0m   torch.from_numpy(self.action_space.low).float())
[2m[36m(pid=93848)[0m   torch.from_numpy(self.action_space.low).float())
Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=56.51570347168493,episode_reward_min=52.5463968004513,episode_reward_mean=54.791642613334204,episode_len_mean=100.0,episodes_this_iter=100,policy_reward_min={'policy_0': 25.81220553930142, 'policy_1': 24.7356492121221},policy_reward_max={'policy_0': 30.792807839550406, 'policy_1': 29.239372865984645},policy_reward_mean={'policy_0': 27.361735004278472, 'policy_1'

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=56.51570347168493,episode_reward_min=41.38124166168904,episode_reward_mean=53.43263812796784,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 24.7356492121221},policy_reward_max={'policy_0': 30.792807839550406, 'policy_1': 33.08544556960358},policy_reward_mean={'policy_0': 25.503095925701214, 'policy_1': 27.929542202266607},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23884676548177045, 'mean_raw_obs_processing_ms': 0.637381538473852, 'mean_inference_ms': 2.8282316127468605, 'mean_action_processing_ms': 0.2285505305745287},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 28.146, 'learn_throughput': 3552.893, 'update_time_ms': 4.068},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.28464269638061523, 'critic_loss': 0.011422939598560333, 'mean_q': 0.24594299495220184, 'max_q': 0.3405848741531372, 'min_q': 0.1254839

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=56.51570347168493,episode_reward_min=36.05331105930171,episode_reward_mean=51.583961929128364,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 17.99608636036254},policy_reward_max={'policy_0': 30.792807839550406, 'policy_1': 33.08544556960358},policy_reward_mean={'policy_0': 24.625075869571464, 'policy_1': 26.9588860595569},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.238581588157896, 'mean_raw_obs_processing_ms': 0.6361625691395337, 'mean_inference_ms': 2.8280496847079384, 'mean_action_processing_ms': 0.22861987841787226},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 27.024, 'learn_throughput': 3700.348, 'update_time_ms': 4.041},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.44413357973098755, 'critic_loss': 0.01016202662140131, 'mean_q': 0.4249178171157837, 'max_q': 0.5179391503334045, 'min_q': 0.283217936

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=56.51214535185002,episode_reward_min=35.902740210210396,episode_reward_mean=49.70773825121816,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 17.967899509758002},policy_reward_max={'policy_0': 30.792807839550406, 'policy_1': 33.08544556960358},policy_reward_mean={'policy_0': 23.676792773249268, 'policy_1': 26.030945477968885},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23874131748775757, 'mean_raw_obs_processing_ms': 0.6354494153250245, 'mean_inference_ms': 2.8287415898943613, 'mean_action_processing_ms': 0.22894611905848922},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 28.355, 'learn_throughput': 3526.673, 'update_time_ms': 4.014},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.5260923504829407, 'critic_loss': 0.013260522857308388, 'mean_q': 0.5100972652435303, 'max_q': 0.6077267527580261, 'min_q': 0.3580

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=56.51214535185002,episode_reward_min=35.8518094149955,episode_reward_mean=47.83456237518253,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 17.967899509758002},policy_reward_max={'policy_0': 30.792807839550406, 'policy_1': 33.08544556960358},policy_reward_mean={'policy_0': 22.73756047804026, 'policy_1': 25.097001897142274},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23838892870043785, 'mean_raw_obs_processing_ms': 0.634454277165651, 'mean_inference_ms': 2.8256179498274583, 'mean_action_processing_ms': 0.22872728044332802},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 30.074, 'learn_throughput': 3325.15, 'update_time_ms': 4.43},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.6574195623397827, 'critic_loss': 0.010838694870471954, 'mean_q': 0.6563921570777893, 'max_q': 0.7711706757545471, 'min_q': 0.5111712813

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=56.51214535185002,episode_reward_min=35.8518094149955,episode_reward_mean=46.17286571021903,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 12.576850814090205},policy_reward_max={'policy_0': 30.792807839550406, 'policy_1': 33.08544556960358},policy_reward_mean={'policy_0': 22.526578183661833, 'policy_1': 23.646287526557195},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2381942561915183, 'mean_raw_obs_processing_ms': 0.6355119252014054, 'mean_inference_ms': 2.826990293338926, 'mean_action_processing_ms': 0.22917819407232534},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 31.007, 'learn_throughput': 3225.125, 'update_time_ms': 4.526},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.7430176734924316, 'critic_loss': 0.008578493259847164, 'mean_q': 0.7394083142280579, 'max_q': 0.8700994253158569, 'min_q': 0.61792874

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=59.16246682504415,episode_reward_min=35.8518094149955,episode_reward_mean=46.57749471933613,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 12.576850814090205},policy_reward_max={'policy_0': 36.396463967881346, 'policy_1': 33.08544556960358},policy_reward_mean={'policy_0': 23.40709695946376, 'policy_1': 23.17039775987236},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23832873773171281, 'mean_raw_obs_processing_ms': 0.6373440442666743, 'mean_inference_ms': 2.8292012605604855, 'mean_action_processing_ms': 0.22943261964768896},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 31.243, 'learn_throughput': 3200.686, 'update_time_ms': 4.314},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -0.9093199372291565, 'critic_loss': 0.0074967313557863235, 'mean_q': 0.8794757723808289, 'max_q': 1.0279383659362793, 'min_q': 0.7120264

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=66.30446893711472,episode_reward_min=35.8518094149955,episode_reward_mean=47.705947467771786,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 12.576850814090205},policy_reward_max={'policy_0': 36.396463967881346, 'policy_1': 33.08544556960358},policy_reward_mean={'policy_0': 24.031080962932656, 'policy_1': 23.674866504839123},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23873369450580473, 'mean_raw_obs_processing_ms': 0.6405087220731732, 'mean_inference_ms': 2.831697885172086, 'mean_action_processing_ms': 0.22930442333141368},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 36.142, 'learn_throughput': 2766.848, 'update_time_ms': 4.207},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -1.052698016166687, 'critic_loss': 0.012576263397932053, 'mean_q': 1.0212485790252686, 'max_q': 1.1958256959915161, 'min_q': 0.8315830

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=66.30446893711472,episode_reward_min=35.8518094149955,episode_reward_mean=48.876229691363484,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 12.576850814090205},policy_reward_max={'policy_0': 36.396463967881346, 'policy_1': 33.80018996332848},policy_reward_mean={'policy_0': 24.621533055008427, 'policy_1': 24.25469663635507},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23964626622918028, 'mean_raw_obs_processing_ms': 0.6456290429579817, 'mean_inference_ms': 2.8366240591159118, 'mean_action_processing_ms': 0.22941605675446908},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 35.572, 'learn_throughput': 2811.211, 'update_time_ms': 4.244},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -1.1933315992355347, 'critic_loss': 0.011317477561533451, 'mean_q': 1.163373351097107, 'max_q': 1.3355191946029663, 'min_q': 1.0008132

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=66.55644537057891,episode_reward_min=35.8518094149955,episode_reward_mean=50.05678659538623,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 12.576850814090205},policy_reward_max={'policy_0': 36.396463967881346, 'policy_1': 35.52369827722434},policy_reward_mean={'policy_0': 25.004780307695828, 'policy_1': 25.052006287690396},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.24174581971680978, 'mean_raw_obs_processing_ms': 0.6508077982180491, 'mean_inference_ms': 2.840567190672319, 'mean_action_processing_ms': 0.23002701916261376},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 31.563, 'learn_throughput': 3168.267, 'update_time_ms': 3.906},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -1.2962307929992676, 'critic_loss': 0.011985835619270802, 'mean_q': 1.2604035139083862, 'max_q': 1.4431922435760498, 'min_q': 1.0294721

Trial TD3_diffdemand_1d552_00000 reported episode_reward_max=66.55644537057891,episode_reward_min=35.8518094149955,episode_reward_mean=51.22652960426688,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 8.904237445780861, 'policy_1': 12.576850814090205},policy_reward_max={'policy_0': 36.396463967881346, 'policy_1': 35.52369827722434},policy_reward_mean={'policy_0': 25.52892370970021, 'policy_1': 25.697605894566674},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.24372898539135754, 'mean_raw_obs_processing_ms': 0.6559068119759673, 'mean_inference_ms': 2.84492478680558, 'mean_action_processing_ms': 0.2306844980427144},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 29.792, 'learn_throughput': 3356.56, 'update_time_ms': 3.646},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'actor_loss': -1.4389007091522217, 'critic_loss': 0.011686209589242935, 'mean_q': 1.4060423374176025, 'max_q': 1.5764614343643188, 'min_q': 1.18234992027

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
TD3_diffdemand_1d552_00000,TERMINATED,,11,75.1829,20000,51.2265,66.5564,35.8518,100


fcnet_hiddens` and `Q_model.fcnet_hiddens`.
Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=53.65755955499828,episode_reward_mean=54.99736283269609,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 26.286429684028864, 'policy_1': 27.03838503611871},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 27.212126118683962, 'policy_1': 27.785236714012115},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2404255582796817, 'mean_raw_obs_processing_ms': 0.6333359029908843, 'mean_inference_ms': 3.5959013250489904, 'mean_action_processing_ms': 0.23400467752620874},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 50.091, 'learn_throughput': 5110.696, 'update_time_ms': 4.162},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([1.1128109 , 1.3155978 , 4.1515994 , 1.014239  , 1.1380279 ,
       1.2531071 , 1.109

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=43.89835277669481,episode_reward_mean=49.94479427419917,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 20.56798214557457, 'policy_1': 21.338348728442988},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 24.927457622480652, 'policy_1': 25.017336651718608},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2415566748734348, 'mean_raw_obs_processing_ms': 0.6348581575900093, 'mean_inference_ms': 3.439960788980341, 'mean_action_processing_ms': 0.23409658955230594},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 36.542, 'learn_throughput': 7005.599, 'update_time_ms': 3.844},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.13256171, 0.71899116, 0.14899549, 0.4310308 , 0.21828112,
       0.55601   , 0.7208034 , 0.2639184 , 1.3622742 , 0.44065246,
  

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=35.56906399865392,episode_reward_mean=45.15288418235074,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 22.54648241476274, 'policy_1': 22.60640176758805},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.24062835787756365, 'mean_raw_obs_processing_ms': 0.6310879002469465, 'mean_inference_ms': 3.3292956480247473, 'mean_action_processing_ms': 0.23289313861692226},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 34.893, 'learn_throughput': 7336.773, 'update_time_ms': 3.582},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([1.77272701e+00, 6.43896461e-02, 1.00858927e+00, 2.03806102e-01,
       1.67731833e+00, 1.80559099e-01, 4.57945049e-01, 7.45908022e

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=35.56906399865392,episode_reward_mean=42.756929136426535,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 21.35599481090379, 'policy_1': 21.40093432552277},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23967058838298322, 'mean_raw_obs_processing_ms': 0.6274821097987612, 'mean_inference_ms': 3.248937498260541, 'mean_action_processing_ms': 0.2318077915719054},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 34.818, 'learn_throughput': 7352.613, 'update_time_ms': 3.658},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.16824538, 0.51971346, 0.0624516 , 0.13297719, 1.6836692 ,
       0.27957934, 0.28283638, 0.8026272 , 0.65809333, 0.8356438 ,
    

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=35.56906399865392,episode_reward_mean=41.319356108872014,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 20.641702248588416, 'policy_1': 20.6776538602836},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2387583607824025, 'mean_raw_obs_processing_ms': 0.6241236748685397, 'mean_inference_ms': 3.187353496285664, 'mean_action_processing_ms': 0.23093142688740362},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 34.665, 'learn_throughput': 7385.039, 'update_time_ms': 3.549},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.33766454, 0.30945438, 0.10589379, 0.2987687 , 0.31180638,
       0.30552417, 0.25122374, 0.26957053, 0.04607642, 0.31808358,
    

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=35.56906399865392,episode_reward_mean=40.360974090502324,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 20.165507207044833, 'policy_1': 20.195466883457485},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23780493848843298, 'mean_raw_obs_processing_ms': 0.6208304073780291, 'mean_inference_ms': 3.1367645520017926, 'mean_action_processing_ms': 0.22992229910161258},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 33.616, 'learn_throughput': 7615.39, 'update_time_ms': 3.472},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.26436126, 0.01562059, 0.01192915, 0.7391393 , 0.3791864 ,
       0.10309756, 0.03844583, 0.32530046, 0.01646304, 1.8343096 ,
 

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=35.56906399865392,episode_reward_mean=39.67641550595254,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 19.825367891656562, 'policy_1': 19.851047614295975},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23683620349460527, 'mean_raw_obs_processing_ms': 0.6176958925491713, 'mean_inference_ms': 3.0938055411650516, 'mean_action_processing_ms': 0.22888518607740105},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 33.512, 'learn_throughput': 7638.99, 'update_time_ms': 3.484},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.30427378, 0.20489335, 0.02344519, 0.30116302, 0.11595124,
       0.11962765, 1.0086203 , 0.26913315, 0.4906454 , 0.9447869 ,
  

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=35.56906399865392,episode_reward_mean=39.162996567540226,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 19.570263405115362, 'policy_1': 19.59273316242485},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2359843563473163, 'mean_raw_obs_processing_ms': 0.6149457473956793, 'mean_inference_ms': 3.057283689008252, 'mean_action_processing_ms': 0.22792602554809066},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 36.178, 'learn_throughput': 7076.148, 'update_time_ms': 3.591},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.30627638, 0.5013218 , 0.29528838, 0.25803083, 0.28143346,
       0.10015231, 0.34753364, 0.68614614, 0.09801024, 0.34198838,
   

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=35.56906399865392,episode_reward_mean=38.76367072655286,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 19.3718488044722, 'policy_1': 19.391821922080634},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2352877497562147, 'mean_raw_obs_processing_ms': 0.612781232971033, 'mean_inference_ms': 3.027105654070326, 'mean_action_processing_ms': 0.22714341761603118},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 37.437, 'learn_throughput': 6838.157, 'update_time_ms': 3.623},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.28713477, 1.0881304 , 0.11141551, 0.25709617, 0.06649458,
       0.6346916 , 0.3694924 , 2.0534444 , 0.21917164, 0.370427  ,
      

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=56.90605990482583,episode_reward_min=35.56906399865392,episode_reward_mean=38.44421005376296,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 28.420475567026184, 'policy_1': 28.485584337799615},policy_reward_mean={'policy_0': 19.213117123957673, 'policy_1': 19.231092929805264},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2347718801010702, 'mean_raw_obs_processing_ms': 0.6113266050106929, 'mean_inference_ms': 3.0021580257100013, 'mean_action_processing_ms': 0.2265277650199154},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 38.259, 'learn_throughput': 6691.302, 'update_time_ms': 3.735},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([3.77784014e-01, 3.43290329e-01, 7.40079880e-02, 4.59684491e-01,
       1.32093191e-01, 4.03965592e-01, 3.65216851e-01, 4.54704881e

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=45.616572996416146,episode_reward_min=35.56906399865392,episode_reward_mean=36.50138017035874,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 23.952004644024903, 'policy_1': 23.887979821040524},policy_reward_mean={'policy_0': 18.27035771202197, 'policy_1': 18.231022458336746},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23375157326456367, 'mean_raw_obs_processing_ms': 0.6080452049238508, 'mean_inference_ms': 2.9199986951932675, 'mean_action_processing_ms': 0.22520050774408784},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 36.747, 'learn_throughput': 6966.482, 'update_time_ms': 3.582},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.64835036, 0.09494567, 0.13226104, 0.31719297, 0.16512102,
       0.24582309, 1.8538909 , 0.29488307, 0.24842638, 0.3215626 ,
 

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23238499314109698, 'mean_raw_obs_processing_ms': 0.6041938737503532, 'mean_inference_ms': 2.867082955449467, 'mean_action_processing_ms': 0.2237313715272529},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 34.056, 'learn_throughput': 7516.942, 'update_time_ms': 3.39},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.17555612, 0.33211684, 0.21663791, 0.12850517, 0.29278517,
       0.32959026, 0.13005775, 0.32994622, 1.4985113 , 0.30712497,
      

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.23121579272493167, 'mean_raw_obs_processing_ms': 0.6011280350627994, 'mean_inference_ms': 2.8290550444264597, 'mean_action_processing_ms': 0.2224406967894782},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 32.116, 'learn_throughput': 7971.103, 'update_time_ms': 3.038},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.47465682, 0.36760736, 0.29252172, 1.9858804 , 0.21753657,
       0.33266407, 0.33343947, 0.29342735, 0.385822  , 0.09540296,
    

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2300434424278735, 'mean_raw_obs_processing_ms': 0.5982772541032161, 'mean_inference_ms': 2.798373228307339, 'mean_action_processing_ms': 0.2211602588660918},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 32.926, 'learn_throughput': 7775.002, 'update_time_ms': 3.274},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([3.71265173e-01, 3.21606994e-02, 3.64375710e-02, 1.54806972e-02,
       2.74557233e-01, 3.15366566e-01, 1.31932974e+00, 2.75753140e-02

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2289268224912642, 'mean_raw_obs_processing_ms': 0.5957930533669135, 'mean_inference_ms': 2.77261474610403, 'mean_action_processing_ms': 0.2198688118619038},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 34.156, 'learn_throughput': 7495.093, 'update_time_ms': 3.345},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([1.37631655e-01, 1.61937356e-01, 2.05765963e-01, 3.17164183e-01,
       7.38498926e-01, 8.22963715e-02, 3.32127512e-01, 3.31724107e-01,

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.2279439763401444, 'mean_raw_obs_processing_ms': 0.5937865518342916, 'mean_inference_ms': 2.751269402917275, 'mean_action_processing_ms': 0.2187519475334672},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 35.382, 'learn_throughput': 7235.218, 'update_time_ms': 3.544},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.3174916 , 0.20058107, 0.6412006 , 0.32664758, 0.21401584,
       0.26501906, 0.22625828, 0.26612288, 0.3634497 , 2.6732175 ,
      

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.22712097581152502, 'mean_raw_obs_processing_ms': 0.5923059751214672, 'mean_inference_ms': 2.7338986330391135, 'mean_action_processing_ms': 0.21781893369601973},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 35.592, 'learn_throughput': 7192.534, 'update_time_ms': 3.452},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([1.24042392e-01, 2.36801624e-01, 1.40071201e+00, 2.84018755e-01,
       8.44697833e-01, 1.89004779e-01, 2.05270052e-01, 8.64665389e

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.22635807901247385, 'mean_raw_obs_processing_ms': 0.5911363832935236, 'mean_inference_ms': 2.7193182620235046, 'mean_action_processing_ms': 0.21698726162177556},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 35.815, 'learn_throughput': 7147.791, 'update_time_ms': 3.447},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.29183316, 0.17442852, 0.2975341 , 0.27172184, 0.22970313,
       0.07796544, 0.05606085, 0.29716033, 0.18060118, 0.13514036,
   

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.22561915724990264, 'mean_raw_obs_processing_ms': 0.5900174837469726, 'mean_inference_ms': 2.705746179532428, 'mean_action_processing_ms': 0.21616237390417267},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 35.333, 'learn_throughput': 7245.378, 'update_time_ms': 3.435},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.1264171 , 0.29616696, 0.27095938, 0.07411581, 0.0102213 ,
       0.98772615, 0.6187932 , 0.2176131 , 0.29097664, 2.0123353 ,
    

Trial SAC_diffdemand_5051f_00000 reported episode_reward_max=35.56906399865392,episode_reward_min=35.56906399865392,episode_reward_mean=35.56906399865391,episode_len_mean=100.0,episodes_this_iter=10,policy_reward_min={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_max={'policy_0': 17.78453199932693, 'policy_1': 17.78453199932693},policy_reward_mean={'policy_0': 17.784531999326926, 'policy_1': 17.784531999326926},custom_metrics={},sampler_perf={'mean_env_wait_ms': 0.22481390682942082, 'mean_raw_obs_processing_ms': 0.5885994272560333, 'mean_inference_ms': 2.692354194238218, 'mean_action_processing_ms': 0.21530319862692118},off_policy_estimator={},num_healthy_workers=10,timers={'learn_time_ms': 35.198, 'learn_throughput': 7273.225, 'update_time_ms': 3.401},info={'learner': {'policy_0': {'allreduce_latency': 0.0, 'td_error': array([0.32231015, 0.331572  , 0.04131418, 0.1415388 , 0.29455274,
       0.330593  , 0.05168229, 0.19234914, 0.05784518, 0.21074766,
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
SAC_diffdemand_5051f_00000,TERMINATED,,20,146.848,20500,35.5691,35.5691,35.5691,100


In [None]:
# Final step: shut down ray now that the experiments above have finished.
# This is the counterpart of the init(num_cpus=NUM_CPUS, ...) call in STEP 0.
shutdown()