In [2]:
import numpy as np
import matplotlib.pyplot as plt

import gym

from gym import spaces


In [7]:
import ray
from ray import tune
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.rllib.examples.models.shared_weights_model import \
    SharedWeightsModel1, SharedWeightsModel2, TF2SharedWeightsModel, \
    TorchSharedWeightsModel
from ray.rllib.models import ModelCatalog
# from ray.rllib.policy import PolicySpec
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.registry import register_env

from ray.rllib.env.multi_agent_env import MultiAgentEnv


import ray
from ray.rllib.agents.ppo import PPOTrainer, PPOTorchPolicy
from ray.rllib.agents.ppo import DEFAULT_CONFIG as DEFAULT_CONFIG_PPO

from ray.rllib.agents.dqn import DQNTrainer, DQNTorchPolicy
from ray.rllib.agents.dqn import  DEFAULT_CONFIG as DEFAULT_CONFIG_DQN


from ray.tune.registry import register_env
from ray.tune.logger import pretty_print

ray.init(ignore_reinit_error=True, log_to_driver=False)

2021-07-27 10:31:56,321	INFO worker.py:745 -- Calling ray.init() again after it has already been called.


In [4]:
class MatrixGame():
    
    def __init__(self, RPST=(3,1,0,5)):
        
        self.RPST = RPST
        
        self.payoff_mat = np.empty((2,2), dtype=np.object)
        
        self.payoff_mat[0, 0] = (RPST[0], RPST[0])
        self.payoff_mat[1, 1] = (RPST[1], RPST[1])
        self.payoff_mat[0, 1] = (RPST[2], RPST[3])
        self.payoff_mat[1, 0] = (RPST[3], RPST[2])
        
    def play(self, a_row, a_col):
        # for ease of things 0 is coooperate
#                            1 is defect
        
        
#         if a_row == 'c':
#             row = 0
#         else:
#             row = 1
            
#         if a_col == 'c':
#             col = 0
#         else:
#             col = 1
            
        return self.payoff_mat[a_row, a_col]
        

In [5]:
class TwoAgentMatrixGameEnv(MultiAgentEnv):
    
    def __init__(self, RPST=(3,1,0,5), history_n=100):
        
        self.num_agents = 2
        
        self.RPST = RPST
        self.history_n = history_n
        self.history = np.zeros((2,2,self.history_n))
        
        self._counter = 0
        self._setup_spaces()
        self.game = MatrixGame(RPST=self.RPST)
    
    
    def _setup_spaces(self):
        
        self.action_space = spaces.Discrete(2)
        
        self.observation_space = spaces.Box(0, 1,
                                           shape=(self.history_n * 4,))
        
        
    def history_to_states(self, history=None):
        
        if history is None:
            history = self.history
            
        state1 = history.flatten()
        state2 = history[::-1,:,:].flatten()
        
        states = {0: state1, 1:state2}
        
        return states
            
        
        
    def step(self, action_dict):
        
        print((action_dict[0], action_dict[1]))
        rewards = self.game.play(action_dict[0], action_dict[1])
        rew = {i: rewards[i] for i in [0, 1]}
        
        self.history[0, action_dict[0], self._counter] = 1
        self.history[1, action_dict[1], self._counter] = 1
        
        obs = self.history_to_states(self.history)
        
        self._counter += 1
        
        is_done = self._counter >= self.history_n
        done = {i: is_done for i in [0, 1, "__all__"]}
        
        info = {0:{}, 1:{}}
        
        return obs, rew, done, info
        
        
        
    def reset(self):
        
        self.history = np.zeros((2,2,self.history_n))
        obs = self.history_to_states(self.history)

        self._counter = 0
        
        return obs
        

In [6]:
env = TwoAgentMatrixGameEnv()

In [94]:
aa = (4,3)
{i:aa[i] for i in [0,1]}

{0: 4, 1: 3}

In [95]:
env.step({0:0,1:0})

(0, 0)


({0: array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [79]:
info ={}

In [80]:
info[0] = {}

In [81]:
info[1] = {}

In [82]:
info

{0: {}, 1: {}}

In [96]:
register_env('twoagent_PD', lambda c: TwoAgentMatrixGameEnv())

In [84]:
trainer_config_ppo = DEFAULT_CONFIG_PPO.copy()
trainer_config_ppo['num_workers'] = 1
trainer_config_ppo['num_sgd_iter'] = 20
trainer_config_ppo['sgd_minibatch_size'] = 32
# trainer_config_ppo['model']['fcnet_hiddens'] = [1024, 512,512, 256,256,32,8]
trainer_config_ppo['model']['fcnet_hiddens'] = [256,256,32,8]

trainer_config_ppo['num_cpus_per_worker'] = 0

In [85]:
trainer = PPOTrainer(trainer_config_ppo, env="twoagent_PD");


2021-07-26 15:52:54,124	INFO trainable.py:101 -- Trainable.setup took 13.018 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [86]:
for i in range(2):
    print("Training iteration {}...".format(i))
    result=trainer.train()
    print(pretty_print(result))

Training iteration 0...
agent_timesteps_total: 4000
custom_metrics: {}
date: 2021-07-26_15-53-24
done: false
episode_len_mean: 100.0
episode_media: {}
episode_reward_max: 478.0
episode_reward_mean: 455.8
episode_reward_min: 428.0
episodes_this_iter: 20
episodes_total: 20
experiment_id: 39b34ab8c778431db4c7d7363eca7f2b
hostname: coolo-computer
info:
  learner:
    default_policy:
      learner_stats:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 0.6630105972290039
        entropy_coeff: 0.0
        kl: 0.031176701188087463
        model: {}
        policy_loss: -0.036966897547245026
        total_loss: 8868.671875
        vf_explained_var: -7.157325967455108e-07
        vf_loss: 8868.7041015625
  num_agent_steps_sampled: 4000
  num_agent_steps_trained: 4000
  num_steps_sampled: 4000
  num_steps_trained: 4000
iterations_since_restore: 1
node_ip: 192.168.1.21
num_healthy_workers: 1
off_policy_estimator: {}
perf:
  cpu_util_percent: 44.472

In [87]:
env = TwoAgentMatrixGameEnv()

In [1]:

n_samples = 50


defects = []
rewards = []
for i in range(n_samples):
    state = env.reset()
    
    total_defect = 0
    cum_reward = 0
    done = False
    i = 0
    while not done:
        print('===========')
        print(i)
        print('===========')
        
        i+=1
        action = trainer.compute_actions(state)
        print(action)
#         total_defect += action
        state, reward, done, results = env.step(action)
#         cum_reward += reward
    defects.append(total_defect)
    rewards.append(cum_reward)

NameError: name 'env' is not defined

In [8]:

def env_creator(_):
    return TwoAgentMatrixGameEnv()
single_env = TwoAgentMatrixGameEnv()
env_name = "TwoAgent_PD"
register_env(env_name, env_creator)


obs_space = single_env.observation_space
act_space = single_env.action_space
num_agents = single_env.num_agents

def gen_policy():
    return (PPOTorchPolicy, obs_space, act_space, {})
policy_graphs = {}

for i in range(num_agents):
    policy_graphs['agent-' + str(i)] = gen_policy()
def policy_mapping_fn(agent_id):
        return 'agent-' + str(agent_id)

In [11]:
config={
    "log_level": "WARN",
    "num_workers": 3,
    "num_cpus_for_driver": 1,
    "num_cpus_per_worker": 1,
    "lr": 5e-3,
    "model":{"fcnet_hiddens": [1024, 512,256,32,8]},
    "multiagent": {
        "policies": policy_graphs,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "env": "TwoAgent_PD"
}

In [12]:
exp_name = 'TA_TEST1'
exp_dict = {
        'name': exp_name,
        'run_or_experiment': 'PPO',
        "stop": {
            "training_iteration": 20
        },
        'checkpoint_freq': 20,
        "config": config,
}
# ray.init()
tune.run(**exp_dict)

Trial name,status,loc
PPO_TwoAgent_PD_4db32_00000,PENDING,


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 15996
  custom_metrics: {}
  date: 2021-07-27_10-42-57
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 487.0
  episode_reward_mean: 448.8974358974359
  episode_reward_min: 410.0
  episodes_this_iter: 78
  episodes_total: 78
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.1999999999999999
          cur_lr: 0.005000000000000001
          entropy: 0.6898711702180287
          entropy_coeff: 0.0
          kl: 0.0032961564551774315
          policy_loss: 0.0025286045930688343
          total_loss: 6878.073854476686
          vf_explained_var: -1.5137688436084318e-08
          vf_loss: 6878.070707775298
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.1999999999999999
          cur_lr: 0.005000000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,1,217.268,7998,448.897,487,410,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 31992
  custom_metrics: {}
  date: 2021-07-27_10-46-45
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 487.0
  episode_reward_mean: 444.45
  episode_reward_min: 405.0
  episodes_this_iter: 81
  episodes_total: 159
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.09999999999999995
          cur_lr: 0.005000000000000001
          entropy: 0.6820998570275685
          entropy_coeff: 0.0
          kl: 0.010965851231640766
          policy_loss: 0.000675432997504397
          total_loss: 4987.425509982639
          vf_explained_var: -1.8922110545105397e-09
          vf_loss: 4987.423719618056
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.09999999999999995
          cur_lr: 0.005000000000000001
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,2,444.835,15996,444.45,487,405,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 47988
  custom_metrics: {}
  date: 2021-07-27_10-50-23
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 476.0
  episode_reward_mean: 430.9
  episode_reward_min: 382.0
  episodes_this_iter: 78
  episodes_total: 237
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.09999999999999995
          cur_lr: 0.005000000000000001
          entropy: 0.6442135998180935
          entropy_coeff: 0.0
          kl: 0.027008375419037684
          policy_loss: -0.006746554311128363
          total_loss: 3417.8270205543154
          vf_explained_var: 1.8922110545105397e-09
          vf_loss: 3417.831077938988
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.09999999999999995
          cur_lr: 0.005000000000000001
      

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,3,663.294,23994,430.9,476,382,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 63984
  custom_metrics: {}
  date: 2021-07-27_10-54-02
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 456.0
  episode_reward_mean: 386.56
  episode_reward_min: 342.0
  episodes_this_iter: 81
  episodes_total: 318
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.15000000000000008
          cur_lr: 0.005000000000000001
          entropy: 0.6472189994085402
          entropy_coeff: 0.0
          kl: 0.0050972449513094955
          policy_loss: 2.3004632177097456e-05
          total_loss: 1755.48923843626
          vf_explained_var: -7.568844218042159e-09
          vf_loss: 1755.4884052579366
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.09999999999999995
          cur_lr: 0.005000000000000001
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,4,881.605,31992,386.56,456,342,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 79980
  custom_metrics: {}
  date: 2021-07-27_10-57-52
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 403.0
  episode_reward_mean: 353.05
  episode_reward_min: 320.0
  episodes_this_iter: 81
  episodes_total: 399
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.15000000000000008
          cur_lr: 0.005000000000000001
          entropy: 0.5248797119609894
          entropy_coeff: 0.0
          kl: 0.004781771783850023
          policy_loss: 0.0005746063820662952
          total_loss: 1466.4229038783483
          vf_explained_var: -4.730527525254047e-09
          vf_loss: 1466.4216095455108
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.09999999999999995
          cur_lr: 0.005000000000000001
    

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,5,1112.09,39990,353.05,403,320,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 95976
  custom_metrics: {}
  date: 2021-07-27_11-02-10
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 383.0
  episode_reward_mean: 319.43
  episode_reward_min: 260.0
  episodes_this_iter: 78
  episodes_total: 477
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.07500000000000004
          cur_lr: 0.005000000000000001
          entropy: 0.4641875534776657
          entropy_coeff: 0.0
          kl: 0.0036923268072559805
          policy_loss: 0.0015884676120347446
          total_loss: 892.5746159629216
          vf_explained_var: 9.461055272552699e-10
          vf_loss: 892.57274421813
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.09999999999999995
          cur_lr: 0.005000000000000001
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,6,1370.17,47988,319.43,383,260,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 111972
  custom_metrics: {}
  date: 2021-07-27_11-06-54
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 354.0
  episode_reward_mean: 297.5
  episode_reward_min: 259.0
  episodes_this_iter: 81
  episodes_total: 558
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.03750000000000002
          cur_lr: 0.005000000000000001
          entropy: 0.4032939303488958
          entropy_coeff: 0.0
          kl: 0.0034499725436020643
          policy_loss: -0.0006354218644518701
          total_loss: 869.8221900576636
          vf_explained_var: 0.0
          vf_loss: 869.8226899646577
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.049999999999999975
          cur_lr: 0.005000000000000001
          entropy: 0.4

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,7,1653.53,55986,297.5,354,259,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 127968
  custom_metrics: {}
  date: 2021-07-27_11-11-38
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 326.0
  episode_reward_mean: 282.32
  episode_reward_min: 242.0
  episodes_this_iter: 81
  episodes_total: 639
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.01875000000000001
          cur_lr: 0.005000000000000001
          entropy: 0.34030963977177936
          entropy_coeff: 0.0
          kl: 0.0017513137333538444
          policy_loss: 0.0013848175959927695
          total_loss: 874.3887619745163
          vf_explained_var: 0.0
          vf_loss: 874.3873475089906
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.024999999999999988
          cur_lr: 0.005000000000000001
          entropy: 0.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,8,1937.92,63984,282.32,326,242,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 143964
  custom_metrics: {}
  date: 2021-07-27_11-16-21
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 318.0
  episode_reward_mean: 274.58
  episode_reward_min: 240.0
  episodes_this_iter: 78
  episodes_total: 717
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.009375000000000005
          cur_lr: 0.005000000000000001
          entropy: 0.34834953480296665
          entropy_coeff: 0.0
          kl: 0.0011223369196707766
          policy_loss: 0.0004992450750063337
          total_loss: 784.6564146980406
          vf_explained_var: 6.622738357719982e-09
          vf_loss: 784.6559002588666
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.012499999999999994
          cur_lr: 0.005000000000000001
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,9,2220.48,71982,274.58,318,240,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 159960
  custom_metrics: {}
  date: 2021-07-27_11-20-36
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 318.0
  episode_reward_mean: 271.52
  episode_reward_min: 243.0
  episodes_this_iter: 81
  episodes_total: 798
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.004687500000000002
          cur_lr: 0.005000000000000001
          entropy: 0.32558965777593946
          entropy_coeff: 0.0
          kl: 0.003628143756132987
          policy_loss: -0.000336638374608897
          total_loss: 635.5340023949033
          vf_explained_var: 9.461055272552699e-10
          vf_loss: 635.5343211340526
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.006249999999999997
          cur_lr: 0.005000000000000001
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,10,2475.65,79980,271.52,318,243,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 175956
  custom_metrics: {}
  date: 2021-07-27_11-25-06
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 296.0
  episode_reward_mean: 261.0
  episode_reward_min: 227.0
  episodes_this_iter: 81
  episodes_total: 879
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.002343750000000001
          cur_lr: 0.005000000000000001
          entropy: 0.33420158757103813
          entropy_coeff: 0.0
          kl: 0.0008314444835016889
          policy_loss: 0.0002732199306289355
          total_loss: 563.1373416961186
          vf_explained_var: 2.838316470743507e-09
          vf_loss: 563.1370670379155
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.006249999999999997
          cur_lr: 0.005000000000000001
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,11,2745.82,87978,261,296,227,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 191952
  custom_metrics: {}
  date: 2021-07-27_11-29-48
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 291.0
  episode_reward_mean: 250.6
  episode_reward_min: 221.0
  episodes_this_iter: 78
  episodes_total: 957
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0011718750000000006
          cur_lr: 0.005000000000000001
          entropy: 0.24454979669480098
          entropy_coeff: 0.0
          kl: 0.00340556198849328
          policy_loss: 0.0004153317284016382
          total_loss: 496.73692103794644
          vf_explained_var: 2.838316470743507e-09
          vf_loss: 496.7365010579427
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0031249999999999984
          cur_lr: 0.005000000000000001
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,12,3027.43,95976,250.6,291,221,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 207948
  custom_metrics: {}
  date: 2021-07-27_11-34-24
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 273.0
  episode_reward_mean: 236.03
  episode_reward_min: 215.0
  episodes_this_iter: 81
  episodes_total: 1038
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0005859375000000003
          cur_lr: 0.005000000000000001
          entropy: 0.19134178521141174
          entropy_coeff: 0.0
          kl: 0.002215550879814795
          policy_loss: -0.0005817777506770596
          total_loss: 438.0082760765439
          vf_explained_var: 3.7844221090210795e-09
          vf_loss: 438.0088563949343
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0015624999999999992
          cur_lr: 0.00500000000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,13,3304.03,103974,236.03,273,215,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 223944
  custom_metrics: {}
  date: 2021-07-27_11-38-49
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 258.0
  episode_reward_mean: 227.56
  episode_reward_min: 209.0
  episodes_this_iter: 81
  episodes_total: 1119
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.00029296875000000015
          cur_lr: 0.005000000000000001
          entropy: 0.14984841479195488
          entropy_coeff: 0.0
          kl: 0.0019087931658658716
          policy_loss: -0.0024501881192600917
          total_loss: 441.38036189003594
          vf_explained_var: 1.8922110545105397e-09
          vf_loss: 441.38280620272195
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0007812499999999996
          cur_lr: 0.0050000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,14,3569.15,111972,227.56,258,209,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 239940
  custom_metrics: {}
  date: 2021-07-27_11-43-05
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 242.0
  episode_reward_mean: 222.77
  episode_reward_min: 206.0
  episodes_this_iter: 78
  episodes_total: 1197
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.00014648437500000008
          cur_lr: 0.005000000000000001
          entropy: 0.14873459048214413
          entropy_coeff: 0.0
          kl: 0.0008780278390437911
          policy_loss: 0.0006045169670075651
          total_loss: 415.55897110227556
          vf_explained_var: 4.730527525254047e-09
          vf_loss: 415.55836898561506
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0003906249999999998
          cur_lr: 0.005000000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,15,3824.16,119970,222.77,242,206,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 255936
  custom_metrics: {}
  date: 2021-07-27_11-47-28
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 239.0
  episode_reward_mean: 220.57
  episode_reward_min: 206.0
  episodes_this_iter: 81
  episodes_total: 1278
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 7.324218750000004e-05
          cur_lr: 0.005000000000000001
          entropy: 0.12838779709168843
          entropy_coeff: 0.0
          kl: 0.001104504105711161
          policy_loss: -0.0017437147981827221
          total_loss: 398.268062531002
          vf_explained_var: 0.0
          vf_loss: 398.26980978345114
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.0001953124999999999
          cur_lr: 0.005000000000000001
          entropy

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,16,4087.24,127968,220.57,239,206,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 271932
  custom_metrics: {}
  date: 2021-07-27_11-51-49
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 233.0
  episode_reward_mean: 216.04
  episode_reward_min: 200.0
  episodes_this_iter: 81
  episodes_total: 1359
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 3.662109375000002e-05
          cur_lr: 0.005000000000000001
          entropy: 0.0950216409705934
          entropy_coeff: 0.0
          kl: 0.0010359105035026986
          policy_loss: -0.0008176186580270056
          total_loss: 391.35582333519346
          vf_explained_var: 4.730527525254047e-09
          vf_loss: 391.3566371372768
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.765624999999995e-05
          cur_lr: 0.00500000000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,17,4348.39,135966,216.04,233,200,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 287928
  custom_metrics: {}
  date: 2021-07-27_11-56-23
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 230.0
  episode_reward_mean: 211.77
  episode_reward_min: 200.0
  episodes_this_iter: 78
  episodes_total: 1437
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.831054687500001e-05
          cur_lr: 0.005000000000000001
          entropy: 0.08146685184467406
          entropy_coeff: 0.0
          kl: 0.0003159406197872678
          policy_loss: 0.0005013222643543803
          total_loss: 378.42431010897195
          vf_explained_var: 9.461055272552699e-10
          vf_loss: 378.42380584232393
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.8828124999999976e-05
          cur_lr: 0.005000000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,18,4622.85,143964,211.77,230,200,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 303924
  custom_metrics: {}
  date: 2021-07-27_12-00-48
  done: false
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 228.0
  episode_reward_mean: 210.07
  episode_reward_min: 200.0
  episodes_this_iter: 81
  episodes_total: 1518
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 9.155273437500005e-06
          cur_lr: 0.005000000000000001
          entropy: 0.0736085135075781
          entropy_coeff: 0.0
          kl: 0.0008138585941712754
          policy_loss: 0.0008497075990788521
          total_loss: 353.0207282172309
          vf_explained_var: 2.838316470743507e-09
          vf_loss: 353.0198814755394
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 2.4414062499999988e-05
          cur_lr: 0.005000000000000001

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,19,4887.69,151962,210.07,228,200,100


Result for PPO_TwoAgent_PD_4db32_00000:
  agent_timesteps_total: 319920
  custom_metrics: {}
  date: 2021-07-27_12-05-17
  done: true
  episode_len_mean: 100.0
  episode_media: {}
  episode_reward_max: 224.0
  episode_reward_mean: 206.51
  episode_reward_min: 200.0
  episodes_this_iter: 81
  episodes_total: 1599
  experiment_id: 3f8acaa62e554dc3a3d95c19c2898547
  hostname: coolo-computer
  info:
    learner:
      agent-0:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 4.577636718750002e-06
          cur_lr: 0.005000000000000001
          entropy: 0.07081073487088793
          entropy_coeff: 0.0
          kl: 0.00016902480977492698
          policy_loss: 0.0011319992176833608
          total_loss: 355.19950309632316
          vf_explained_var: 3.7844221090210795e-09
          vf_loss: 355.19837055509055
      agent-1:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 1.2207031249999994e-05
          cur_lr: 0.00500000000000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,RUNNING,192.168.1.21:105126,20,5156.18,159960,206.51,224,200,100


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_TwoAgent_PD_4db32_00000,TERMINATED,,20,5156.18,159960,206.51,224,200,100


2021-07-27 12:05:17,970	INFO tune.py:549 -- Total run time: 5163.70 seconds (5163.13 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f2cc2dee0a0>

In [None]:
tra