In [1]:
import gym
import matplotlib.pyplot as plt 
import scipy.io as spio
import numpy as np
import os
# import gym_strucSA
from ray.rllib.env.multi_agent_env import MultiAgentEnv

In [215]:
class StructMA(MultiAgentEnv):

    def __init__(self, config=None):
        empty_config = {"config": {"components": 2} }
        config_ = config or empty_config
        # Number of components #
        self.ncomp = config_['config']["components"]
        self.time = 0
        self.ep_length = 30
        self.nstcomp = 30
        self.nobs = 2
        self.actions_total = int(3)
        self.obs_total = int(30 + 31 + 31)

        # configure spaces
        self.action_space = gym.spaces.Discrete(self.actions_total)
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(self.obs_total,), dtype=np.float64)
        ### Loading the underlying POMDP model ###
        drmodel = np.load('Dr3031C10.npz')
        self.belief0 = drmodel['belief0'][0,0:self.ncomp,:,0] # (10 components, 30 crack states)
        self.P = drmodel['P'][:,0:self.ncomp,:,:,:] # (3 actions, 10 components, 31 det rates, 30 cracks, 30 cracks)
        self.O = drmodel['O'][:,0:self.ncomp,:,:] # (3 actions, 10 components, 30 cracks, 2 observations)
        
        self.agent_list = []
        for i in range(self.ncomp):
            item = "agent_"+ str(i)
            self.agent_list.append(item)
        self._agent_ids = self.agent_list
        # Reset env.
        self.reset()
            
    def reset(self):
        # We need the following line to seed self.np_random
        # super().reset(seed=seed)

        # Choose the agent's belief
        self.time_step = 0
        self.agent_belief = self.belief0
        self.drate = np.zeros((self.ncomp, 1), dtype=int)
        observations = {}
        for i in range(self.ncomp):
            observations[self.agent_list[i]] = np.concatenate( (self.agent_belief[i] , \
                self.one_hot_drate(self.drate, self.ep_length+1)[i] , self.one_hot_time(self.time_step ,self.ep_length+1) ) ) 
 
        return observations
    
    def step(self, action: dict):
        action_ = np.zeros(self.ncomp, dtype=int)
        for i in range(self.ncomp):
            action_[i] = action[self.agent_list[i]]

        observation_, belief_prime, drate_prime = self.belief_update(self.agent_belief, action_, self.drate)
        
        observations = {}
        for i in range(self.ncomp):
            observations[self.agent_list[i]] = np.concatenate( (belief_prime[i] , \
                self.one_hot_drate(drate_prime, self.ep_length+1)[i] , self.one_hot_time(self.time_step+1 ,self.ep_length+1) ) )
        reward_ = self.immediate_cost(self.agent_belief, action_, belief_prime, self.drate)
        reward = reward_.item() #Convert float64 to float
        
        rewards = {}
        for i in range(self.ncomp):
            rewards[self.agent_list[i]] = reward
            
        self.time_step += 1 
        self.agent_belief = belief_prime
        self.drate = drate_prime
        # An episode is done if the agent has reached the target
        done = np.array_equal(self.time_step, self.ep_length)
        dones = {"__all__": done}
        # info = {"belief": self.agent_belief}
        return observations, rewards, dones, {} 
    
    
    def pf_sys(self, pf, k): # compute pf_sys for k-out-of-n components 
        n = pf.size
        # k = ncomp-1
        PF_sys = np.zeros(1)
        nk = n-k
        m = k+1
        A = np.zeros(m+1)
        A [1] = 1
        L = 1
        for j in range(1,n+1):
            h = j + 1
            Rel = 1-pf[j-1]
            if nk < j:
                L = h - nk
            if k < j:
                A[m] = A[m] + A[k]*Rel
                h = k
            for i in range(h, L-1, -1):
                A[i] = A[i] + (A[i-1]-A[i])*Rel
        PF_sys = 1-A[m]
        return PF_sys  
    
    def immediate_cost(self, B, a, B_, drate): # immediate reward (-cost), based on current damage state and action#
        cost_system = 0
        PF = np.zeros((1,1))
        PF = B[:,-1]
        PF_ = np.zeros((1,1))
        PF_ = B_[:,-1].copy()
        for i in range(self.ncomp):
            if a[i]==1:
                cost_system += -1
                Bplus = self.P[a[i],i,drate[i,0]].T.dot(B[i,:])
                PF_[i] = Bplus[-1]         
            elif a[i]==2:
                cost_system +=  - 20
        if self.ncomp < 2: # single component setting
            PfSyS_ = PF_
            PfSyS = PF
        else:
            PfSyS_ = self.pf_sys(PF_, self.ncomp-1) 
            PfSyS = self.pf_sys(PF, self.ncomp-1) 
        if PfSyS_ < PfSyS:
            cost_system += PfSyS_*(-10000)
        else:
            cost_system += (PfSyS_-PfSyS)*(-10000) 
        return cost_system
    
    def belief_update(self, b, a, drate):  # Bayesian belief update based on previous belief, current observation, and action taken
        b_prime = np.zeros((self.ncomp, self.nstcomp))
        b_prime[:] = b
        ob = np.zeros(self.ncomp)
        drate_prime = np.zeros((self.ncomp, 1), dtype=int)
        for i in range(self.ncomp):
            p1 = self.P[a[i],i,drate[i,0]].T.dot(b_prime[i,:])  # environment transition
            b_prime[i,:] = p1
            drate_prime[i, 0] = drate[i, 0] + 1
            ob[i] = 2
            if a[i]==1:
                Obs0 = np.sum(p1* self.O[a[i],i,:,0])
                Obs1 = 1 - Obs0
                if Obs1 < 1e-5:
                    ob[i] = 0
                else:
                    ob_dist = np.array([Obs0, Obs1])
                    ob[i] = np.random.choice(range(0,self.nobs), size=None, replace=True, p=ob_dist)           
                b_prime[i,:] = p1* self.O[a[i],i,:,int(ob[i])]/(p1.dot(self.O[a[i],i,:,int(ob[i])])) # belief update
            if a[i] == 2:
                drate_prime[i, 0] = 0
        return ob, b_prime, drate_prime
    
    def one_hot_drate(self, drate, ep_length):
        ohDrate = np.zeros((self.ncomp, ep_length), dtype=int)
        for i in range(self.ncomp):
            ohDrate[i, drate[i][0]] = 1
        return ohDrate

    def one_hot_time(self, time, ep_length):
        ohTime = np.zeros((ep_length), dtype=int)
        ohTime[time] = 1
        return ohTime

In [210]:
strucMA_heur = StructMA()
strucMA_heur.reset()

{'agent_0': array([1.052000e-04, 5.500000e-05, 8.660000e-05, 1.261000e-04,
        2.006000e-04, 3.173000e-04, 4.853000e-04, 7.444000e-04,
        1.138400e-03, 1.783100e-03, 2.713600e-03, 4.235700e-03,
        6.473200e-03, 1.002420e-02, 1.530330e-02, 2.316180e-02,
        3.453640e-02, 5.087030e-02, 7.324320e-02, 1.008326e-01,
        1.309823e-01, 1.539425e-01, 1.567708e-01, 1.275575e-01,
        7.401660e-02, 2.583390e-02, 4.230100e-03, 2.268000e-04,
        3.200000e-06, 0.000000e+00, 1.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 1.00000

### Initialization of the environment

In [184]:
strucMA_heur = StructMA()

In [175]:
config = {"config": {"components": 5} }
strucMA_heur = StructMA(config)

In [185]:
strucMA_heur.reset()

{'agent_0': array([1.052000e-04, 5.500000e-05, 8.660000e-05, 1.261000e-04,
        2.006000e-04, 3.173000e-04, 4.853000e-04, 7.444000e-04,
        1.138400e-03, 1.783100e-03, 2.713600e-03, 4.235700e-03,
        6.473200e-03, 1.002420e-02, 1.530330e-02, 2.316180e-02,
        3.453640e-02, 5.087030e-02, 7.324320e-02, 1.008326e-01,
        1.309823e-01, 1.539425e-01, 1.567708e-01, 1.275575e-01,
        7.401660e-02, 2.583390e-02, 4.230100e-03, 2.268000e-04,
        3.200000e-06, 0.000000e+00, 1.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00,
        0.000000e+00, 1.00000

+ DN => -12.22

In [187]:
agent_list = []
for i in range(strucMA_heur.ncomp):
    item = "agent_"+ str(i)
    agent_list.append(item)
# print(agent_list)
act = {}
for i in range(strucMA_heur.ncomp):
    act[agent_list[i]] = 2 # Assign action to all components
print(act)

{'agent_0': 2, 'agent_1': 2}


In [199]:
observation, reward, done, info = strucMA_heur.step(act)
reward

time: 12


{'agent_0': -40.0, 'agent_1': -40.0}

### Evaluation of the environment

In [131]:
total_rew = 0
for episodes in range(1,2):
    cum_reward = 0
    strucMA_heur.reset()
    for t in range(30):
#         if t%4 == 0:
#             action_ = 0
#         else:
#             action_ = 0
        observation, reward, done, info = strucMA_heur.step(act)
        cum_reward += reward["agent_0"]*0.95**t
        print(t, reward, cum_reward, done)
    total_rew += cum_reward
    #print(episodes, total_rew)
exp_reward = total_rew/episodes
print(exp_reward)

0 {'agent_0': -4.000133557724439e-09, 'agent_1': -4.000133557724439e-09, 'agent_2': -4.000133557724439e-09, 'agent_3': -4.000133557724439e-09, 'agent_4': -4.000133557724439e-09} -4.000133557724439e-09 {'__all__': False}
1 {'agent_0': -3.476958720938228e-06, 'agent_1': -3.476958720938228e-06, 'agent_2': -3.476958720938228e-06, 'agent_3': -3.476958720938228e-06, 'agent_4': -3.476958720938228e-06} -3.307110918449041e-06 {'__all__': False}
2 {'agent_0': -0.0001687297479513461, 'agent_1': -0.0001687297479513461, 'agent_2': -0.0001687297479513461, 'agent_3': -0.0001687297479513461, 'agent_4': -0.0001687297479513461} -0.0001555857084445389 {'__all__': False}
3 {'agent_0': -0.001843616697083661, 'agent_1': -0.001843616697083661, 'agent_2': -0.001843616697083661, 'agent_3': -0.001843616697083661, 'agent_4': -0.001843616697083661} -0.0017362565741066425 {'__all__': False}
4 {'agent_0': -0.01013486400913699, 'agent_1': -0.01013486400913699, 'agent_2': -0.01013486400913699, 'agent_3': -0.010134864

### Evaluation of the environment with the trained policy

In [7]:
total_rew = 0
for episodes in range(1,4):
    cum_reward = 0
    obs = struc_heur.reset()
    action_ = trainer.compute_single_action(obs)
    for t in range(30):
        obs, reward, done, info = struc_heur.step(action_)
        action_ = trainer.compute_single_action(obs)
        cum_reward += reward*0.95**t
        # print(t, reward, cum_reward, done)
        print(t, action_)
    total_rew += cum_reward
    #print(episodes, total_rew)
exp_reward = total_rew/episodes
print(exp_reward)

NameError: name 'trainer' is not defined

### Configuration of the trainer

In [217]:
import numpy as np
import pprint
import ray
from datetime import datetime
import tempfile
from ray.tune.logger import Logger, UnifiedLogger

# Start a new instance of Ray (when running this tutorial locally) or
# connect to an already running one (when running this tutorial through Anyscale).

ray.init()  # Hear the engine humming? ;)

# In case you encounter the following error during our tutorial: `RuntimeError: Maybe you called ray.init twice by accident?`
# Try: `ray.shutdown() + ray.init()` or `ray.init(ignore_reinit_error=True)`

{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': None,
 'object_store_address': 'tcp://127.0.0.1:56200',
 'raylet_socket_name': 'tcp://127.0.0.1:60375',
 'webui_url': None,
 'session_dir': 'C:\\Users\\user\\AppData\\Local\\Temp\\ray\\session_2022-07-06_10-39-16_480458_2200',
 'metrics_export_port': 60243,
 'gcs_address': '127.0.0.1:60291',
 'address': '127.0.0.1:60291',
 'node_id': 'e8732f9087481cf90d5cbf20fa6f5cbac25cd55ffd1d2379b56f3746'}

In [216]:
### Shutdown Ray's session
ray.shutdown() 

In [218]:
n_components = 3
PATH_logger = "D:/14_DecomposedQ_DRL/multiagent_environment/01_kOutOfN/log_files"
config_env = {"config": {"components": n_components} }

env = StructMA(config_env)
agent_list = []
for i in range(env.ncomp):
    item = "agent_"+ str(i)
    agent_list.append(item)

agent_list = []
policy_list = []
for i in range(env.ncomp):
    item = "agent_"+ str(i)
    item_ = "policy"+ str(i)
    agent_list.append(item)
    policy_list.append(item_)

policies = {}
mapping_agent2policy = {}
# print(env.ncomp)
for i in range(env.ncomp):
    mapping_agent2policy[agent_list[i]] = policy_list[i]
    policies[policy_list[i]] = (None, env.observation_space, env.action_space, {})

# Define an agent->policy mapping function.
# Which agents (defined by the environment) use which policies (defined by us)?
# The mapping here is M (agents) -> N (policies), where M >= N.
def policy_mapping_fn(agent_id: str):
    return mapping_agent2policy[agent_id]

from ray.rllib.agents.dqn import DQNTrainer
# from ray.rllib.agents.ppo import PPOTrainer
# Create an RLlib Trainer instance.

config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": StructMA,
        
        "env_config": {
            "config": {"components": n_components},
        },
        # Number of steps after which the episode is forced to terminate. Defaults
        # to `env.spec.max_episode_steps` (if present) for Gym envs.
        "horizon": 30,
        # Parallelize environment rollouts.
        "num_workers": 4,
        # Discount factor of the MDP.
        "gamma": 0.95,
        
        # https://github.com/ray-project/ray/blob/releases/1.11.1/rllib/models/catalog.py
        # FullyConnectedNetwork (tf and torch): rllib.models.tf|torch.fcnet.py
        # These are used if no custom model is specified and the input space is 1D.
        # Number of hidden layers to be used.
        # Activation function descriptor.
        # Supported values are: "tanh", "relu", "swish" (or "silu"),
        # "linear" (or None).
        "model": {
            "fcnet_hiddens": [100],
            "fcnet_activation": "relu"
        },
        
        "create_env_on_driver": True,
        
        #"evaluation_interval": 2,
        "evaluation_num_workers": 1,
        "evaluation_duration": 50,
    
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            # We'll leave this empty: Means, we train both policy1 and policy2.
            # "policies_to_train": policies_to_train,
        },
        # === Deep Learning Framework Settings ===
        # tf: TensorFlow (static-graph)
        # tf2: TensorFlow 2.x (eager or traced, if eager_tracing=True)
        # tfe: TensorFlow eager (or traced, if eager_tracing=True)
        # torch: PyTorch
#         "framework": "torch",
    }

### Defaut logger creator ###
def custom_log_creator(custom_path, custom_str):

    timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    logdir_prefix = "{}_{}".format(custom_str, timestr)
    def logger_creator(config):

        if not os.path.exists(custom_path):
            os.makedirs(custom_path)
        logdir = tempfile.mkdtemp(prefix=logdir_prefix, dir=custom_path)
        print(logdir)
        return UnifiedLogger(config, logdir, loggers=None)

    return logger_creator

trainer = DQNTrainer(config=config, logger_creator=custom_log_creator(PATH_logger, 'dqn') )
# trainer = PPOTrainer(config=config)



D:/14_DecomposedQ_DRL/multiagent_environment/01_kOutOfN/log_files\dqn_2022-07-06_10-39-22500qcail


 pid=14536)[0m Instructions for updating:
 pid=14536)[0m If using Keras pass *_constraint arguments to layers.
 pid=11752)[0m Instructions for updating:
 pid=11752)[0m If using Keras pass *_constraint arguments to layers.
 pid=972)[0m Instructions for updating:
 pid=972)[0m If using Keras pass *_constraint arguments to layers.
 pid=32864)[0m Instructions for updating:
 pid=32864)[0m If using Keras pass *_constraint arguments to layers.


Train policy and conduct evaluations periodically

## Training and evaluating specific policies

In [219]:
for i in range(100):
    results = trainer.train()
    #if i%100==0:
    #trainer.export_policy_model("D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel")
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")
    print(f"Iter (policy_1): {i}; avg. reward={results['policy_reward_mean']['policy1']}")
    print(f"Iter (policy_2): {i}; avg. reward={results['policy_reward_mean']['policy2']}")
    #print(f"Iter: {i}; evaluation={results['evaluation']['episode_reward_mean']}")
    
    if i%5==0:
        evaluat = trainer.evaluate()
        print(evaluat['evaluation']['episode_reward_mean'])
        print(evaluat['evaluation']['policy_reward_mean']['policy1'])
        print(evaluat['evaluation']['policy_reward_mean']['policy2'])
#         print(f"Iter: {i}; evaluation={results['evaluation']['episode_reward_mean']}")
        
''' export policy checkpoint
def export_policy_checkpoint(
            self,
            export_dir: str,
            filename_prefix: str = "model",
            policy_id: PolicyID = DEFAULT_POLICY_ID,
    )   
'''
PATH_model = "D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel"

# trainer.export_policy_checkpoint(PATH_model, filename_prefix='modelx')

# trainer.save_checkpoint(PATH_model)

[2m[36m(RolloutWorker pid=7840)[0m Instructions for updating:
[2m[36m(RolloutWorker pid=7840)[0m If using Keras pass *_constraint arguments to layers.


Iter: 0; avg. reward=-1850.5717400658423
Iter (policy_1): 0; avg. reward=-616.857246688614
Iter (policy_2): 0; avg. reward=-616.857246688614




-657.6189208802923
-219.20630696009738
-219.20630696009738
Iter: 1; avg. reward=-1758.9365479876046
Iter (policy_1): 1; avg. reward=-586.312182662535
Iter (policy_2): 1; avg. reward=-586.312182662535
Iter: 2; avg. reward=-1666.9854081368499
Iter (policy_1): 2; avg. reward=-555.6618027122834
Iter (policy_2): 2; avg. reward=-555.6618027122834
Iter: 3; avg. reward=-1519.7199700226877
Iter (policy_1): 3; avg. reward=-506.57332334089597
Iter (policy_2): 3; avg. reward=-506.57332334089597
Iter: 4; avg. reward=-1301.1130152443181
Iter (policy_1): 4; avg. reward=-433.70433841477274
Iter (policy_2): 4; avg. reward=-433.70433841477274
Iter: 5; avg. reward=-1107.836623509402
Iter (policy_1): 5; avg. reward=-369.278874503134
Iter (policy_2): 5; avg. reward=-369.278874503134
-379.9407269595361
-126.64690898651205
-126.64690898651205
Iter: 6; avg. reward=-894.6071676977386
Iter (policy_1): 6; avg. reward=-298.2023892325796
Iter (policy_2): 6; avg. reward=-298.2023892325796
Iter: 7; avg. reward=-706.

Iter: 53; avg. reward=-150.49223309584778
Iter (policy_1): 53; avg. reward=-50.16407769861591
Iter (policy_2): 53; avg. reward=-50.16407769861591
Iter: 54; avg. reward=-160.45241814363254
Iter (policy_1): 54; avg. reward=-53.484139381210845
Iter (policy_2): 54; avg. reward=-53.484139381210845
Iter: 55; avg. reward=-173.8483921511862
Iter (policy_1): 55; avg. reward=-57.94946405039539
Iter (policy_2): 55; avg. reward=-57.94946405039539
-155.7830156390995
-51.92767187969984
-51.92767187969984
Iter: 56; avg. reward=-173.2117945779884
Iter (policy_1): 56; avg. reward=-57.737264859329464
Iter (policy_2): 56; avg. reward=-57.737264859329464
Iter: 57; avg. reward=-177.8944160366257
Iter (policy_1): 57; avg. reward=-59.29813867887522
Iter (policy_2): 57; avg. reward=-59.29813867887522
Iter: 58; avg. reward=-169.8781397675141
Iter (policy_1): 58; avg. reward=-56.62604658917137
Iter (policy_2): 58; avg. reward=-56.62604658917137
Iter: 59; avg. reward=-170.24893401596557
Iter (policy_1): 59; avg.

In [226]:
results

{'episode_reward_max': -513.226132756993,
 'episode_reward_min': -1638.0075702959427,
 'episode_reward_mean': -1086.9377084412256,
 'episode_len_mean': 30.0,
 'episode_media': {},
 'episodes_this_iter': 34,
 'policy_reward_min': {'policy0': -546.0025234319812,
  'policy1': -546.0025234319812,
  'policy2': -546.0025234319812},
 'policy_reward_max': {'policy0': -171.07537758566437,
  'policy1': -171.07537758566437,
  'policy2': -171.07537758566437},
 'policy_reward_mean': {'policy0': -362.3125694804086,
  'policy1': -362.3125694804086,
  'policy2': -362.3125694804086},
 'custom_metrics': {},
 'hist_stats': {'episode_reward': [-882.4683286363015,
   -1566.0026840314174,
   -1359.0015357448517,
   -1635.0052196033155,
   -1017.056338913421,
   -1467.0024147505567,
   -1638.0075702959427,
   -1242.0067186706358,
   -1617.0004678286161,
   -1236.1355738702478,
   -979.55606251021,
   -1083.0196116272584,
   -1548.006079969102,
   -1263.0096808305107,
   -1212.0094545178354,
   -1146.19272644

In [None]:
# del results["config"]
pprint.pprint(results)

## Relevant methods => Check policy

In [None]:
policy = trainer.get_policy()

In [None]:
model = trainer.get_policy().model
model

In [None]:
# Print out the policy's action and observation spaces.
print(f"Our Policy's observation space is: {policy.observation_space}")
print(f"Our Policy's action space is: {policy.action_space}")

In [None]:
# Produce a random obervation (B=1; batch of size 1).
obs = np.array([policy.observation_space.sample()])
# Alternatively for PyTorch:
#import torch
#obs = torch.from_numpy(obs)
obs

In [None]:
logits, _ = model({"obs": obs})
logits

In [None]:
logits_np = policy.get_session().run(logits)
logits_np.shape

In [None]:
from ray.rllib.utils.numpy import softmax
action_probs = np.squeeze(softmax(logits_np))
action_probs

## Action scores

In [None]:
(action_scores, logits, dist) = model.get_q_value_distributions(model_out)

In [None]:
action_scores.graph

### Mapping policies and agents ids

In [218]:
# Define the policies definition dict:
# Each policy in there is defined by its ID (key) mapping to a 4-tuple (value):
# - Policy class (None for using the "default" class, e.g. PPOTFPolicy for PPO+tf or PPOTorchPolicy for PPO+torch).
# - obs-space (we get this directly from our already created env object).
# - act-space (we get this directly from our already created env object).
# - config-overrides dict (leave empty for using the Trainer's config as-is)

config_env = {"config": {"components": 4} }

env = StructMA(config_env)
agent_list = []
for i in range(env.ncomp):
    item = "agent_"+ str(i)
    agent_list.append(item)

agent_list = []
policy_list = []
for i in range(env.ncomp):
    item = "agent_"+ str(i)
    item_ = "policy"+ str(i)
    agent_list.append(item)
    policy_list.append(item_)

policies = {}
mapping_agent2policy = {}
print(env.ncomp)
for i in range(env.ncomp):
    mapping_agent2policy[agent_list[i]] = policy_list[i]
    policies[policy_list[i]] = (None, env.observation_space, env.action_space, {})

# Define an agent->policy mapping function.
# Which agents (defined by the environment) use which policies (defined by us)?
# The mapping here is M (agents) -> N (policies), where M >= N.
def policy_mapping_fn(agent_id: str):
    return mapping_agent2policy[agent_id]

4


In [212]:
policies

{'policy0': (None,
  Box([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0.], [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
   1. 1. 1. 1. 1. 1. 1.], (31,), float64),
  Discrete(3),
  {}),
 'policy1': (None,
  Box([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0.], [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
   1. 1. 1. 1. 1. 1. 1.], (31,), float64),
  Discrete(3),
  {}),
 'policy2': (None,
  Box([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0.], [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
   1. 1. 1. 1. 1. 1. 1.], (31,), float64),
  Discrete(3),
  {}),
 'policy3': (None,
  Box([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0.], [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
   1. 1. 1. 1. 1.

In [180]:
policy_mapping_fn('agent_0')

'policy0'

## If Ray does not start...

In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"