In [1]:
import gym
import matplotlib.pyplot as plt 
import scipy.io as spio
import numpy as np
# import gym_strucSA

In [137]:
class StructSA(gym.Env):

    def __init__(self, config=None):
        empty_config = {"config": {"components": 2} }
        config = config or empty_config
        # Number of components #
        self.ncomp = config['config'].get("components", 2)
        self.time = 0
        self.ep_length = 30
        self.nstcomp = 30
        self.nobs = 2
        self.actions_total = int(3**self.ncomp)
        self.obs_total = int(self.ncomp*30 + 1)

        # configure spaces
        self.action_space = gym.spaces.Discrete(self.actions_total)
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(self.obs_total,), dtype=np.float64)
        ### Loading the underlying POMDP model ###
        drmodel = np.load('Dr3031C10.npz')
        self.belief0 = drmodel['belief0'][0,0:self.ncomp,:,0] # (10 components, 30 crack states)
        self.P = drmodel['P'][:,0:self.ncomp,:,:,:] # (3 actions, 10 components, 31 det rates, 30 cracks, 30 cracks)
        self.O = drmodel['O'][:,0:self.ncomp,:,:] # (3 actions, 10 components, 30 cracks, 2 observations)
            
    def reset(self, seed=None, return_info=False, options=None):
        # We need the following line to seed self.np_random
        # super().reset(seed=seed)

        # Choose the agent's belief
        self.time_step = 0
        self.agent_belief = self.belief0
        self.drate = np.zeros((self.ncomp, 1), dtype=int)
        
        observation = np.concatenate( ((self.agent_belief).reshape(self.obs_total - 1), [self.time_step/30]) )
        info = {"belief": self.agent_belief}
        return (observation, info) if return_info else observation
    
    def step(self, action, return_info=False):
        action_ = np.zeros(1, dtype=int)
        action_ = action
        action_ = self.convert_base_action(action_, 3, self.ncomp)
        observation_, belief_prime, drate_prime = self.belief_update(self.agent_belief, action_, self.drate)
        observation = np.concatenate( (belief_prime.reshape(self.obs_total - 1), [self.time_step/30]) )
        reward_ = self.immediate_cost(self.agent_belief, action_, belief_prime, self.drate)
        reward = reward_.item() #Convert float64 to float
        self.time_step += 1 
        self.agent_belief = belief_prime
        self.drate = drate_prime
        # An episode is done if the agent has reached the target
        done = np.array_equal(self.time_step, self.ep_length)
        info = {"belief": self.agent_belief}
        return (observation, reward, done, info) 
    
    
    def pf_sys(self, pf, k): # compute pf_sys for k-out-of-n components 
        n = pf.size
        # k = ncomp-1
        PF_sys = np.zeros(1)
        nk = n-k
        m = k+1
        A = np.zeros(m+1)
        A [1] = 1
        L = 1
        for j in range(1,n+1):
            h = j + 1
            Rel = 1-pf[j-1]
            if nk < j:
                L = h - nk
            if k < j:
                A[m] = A[m] + A[k]*Rel
                h = k
            for i in range(h, L-1, -1):
                A[i] = A[i] + (A[i-1]-A[i])*Rel
        PF_sys = 1-A[m]
        return PF_sys  
    
    def immediate_cost(self, B, a, B_, drate): # immediate reward (-cost), based on current damage state and action#
        cost_system = 0
        PF = np.zeros((1,1))
        PF = B[:,-1]
        PF_ = np.zeros((1,1))
        PF_ = B_[:,-1].copy()
        for i in range(self.ncomp):
            if a[i]==1:
                cost_system += -1
                Bplus = self.P[a[i],i,drate[i,0]].T.dot(B[i,:])
                PF_[i] = Bplus[-1]         
            elif a[i]==2:
                cost_system +=  - 20
        if self.ncomp < 2: # single component setting
            PfSyS_ = PF_
            PfSyS = PF
        else:
            PfSyS_ = self.pf_sys(PF_, self.ncomp-1) 
            PfSyS = self.pf_sys(PF, self.ncomp-1) 
        if PfSyS_ < PfSyS:
            cost_system += PfSyS_*(-10000)
        else:
            cost_system += (PfSyS_-PfSyS)*(-10000) 
        return cost_system
    
    def belief_update(self, b, a, drate):  # Bayesian belief update based on previous belief, current observation, and action taken
        b_prime = np.zeros((self.ncomp, self.nstcomp))
        b_prime[:] = b
        ob = np.zeros(self.ncomp)
        drate_prime = np.zeros((self.ncomp, 1), dtype=int)
        for i in range(self.ncomp):
            p1 = self.P[a[i],i,drate[i,0]].T.dot(b_prime[i,:])  # environment transition
            b_prime[i,:] = p1
            drate_prime[i, 0] = drate[i, 0] + 1
            ob[i] = 2
            if a[i]==1:
                Obs0 = np.sum(p1* self.O[a[i],i,:,0])
                Obs1 = 1 - Obs0
                if Obs1 < 1e-5:
                    ob[i] = 0
                else:
                    ob_dist = np.array([Obs0, Obs1])
                    ob[i] = np.random.choice(range(0,self.nobs), size=None, replace=True, p=ob_dist)           
                b_prime[i,:] = p1* self.O[a[i],i,:,int(ob[i])]/(p1.dot(self.O[a[i],i,:,int(ob[i])])) # belief update
            if a[i] == 2:
                drate_prime[i, 0] = 0
        return ob, b_prime, drate_prime
    
    def convert_base_action(self, action_, base, comp):
        action_multi = np.zeros((comp,), dtype=int)
        if action_ == 0:
                return action_multi
        digits = []
        index_comp = int(comp) - 1 
        while action_:
            digits = (int(action_ % base))
            action_multi[index_comp] = digits
            action_ //= base
            index_comp -= 1
        return action_multi

### Initialization of the environment

In [138]:
struc_heur = StructSA()

In [99]:
config = {"config": {"components": 5} }
struc_heur = StructSA(config)

In [139]:
struc_heur.reset()

array([1.052000e-04, 5.500000e-05, 8.660000e-05, 1.261000e-04,
       2.006000e-04, 3.173000e-04, 4.853000e-04, 7.444000e-04,
       1.138400e-03, 1.783100e-03, 2.713600e-03, 4.235700e-03,
       6.473200e-03, 1.002420e-02, 1.530330e-02, 2.316180e-02,
       3.453640e-02, 5.087030e-02, 7.324320e-02, 1.008326e-01,
       1.309823e-01, 1.539425e-01, 1.567708e-01, 1.275575e-01,
       7.401660e-02, 2.583390e-02, 4.230100e-03, 2.268000e-04,
       3.200000e-06, 0.000000e+00, 1.052000e-04, 5.500000e-05,
       8.660000e-05, 1.261000e-04, 2.006000e-04, 3.173000e-04,
       4.853000e-04, 7.444000e-04, 1.138400e-03, 1.783100e-03,
       2.713600e-03, 4.235700e-03, 6.473200e-03, 1.002420e-02,
       1.530330e-02, 2.316180e-02, 3.453640e-02, 5.087030e-02,
       7.324320e-02, 1.008326e-01, 1.309823e-01, 1.539425e-01,
       1.567708e-01, 1.275575e-01, 7.401660e-02, 2.583390e-02,
       4.230100e-03, 2.268000e-04, 3.200000e-06, 0.000000e+00,
       0.000000e+00])

+ DN => -12.22

In [None]:
act = 0
observation, reward, done, info = struc_heur.step(act)
reward

### Evaluation of the environment

In [8]:
total_rew = 0
for episodes in range(1,2):
    cum_reward = 0
    struc_heur.reset()
    for t in range(30):
        if t%4 == 0:
            action_ = 0
        else:
            action_ = 0
        observation, reward, done, info = struc_heur.step(action_)
        cum_reward += reward*0.95**t
        print(t, reward, cum_reward, done)
    total_rew += cum_reward
    #print(episodes, total_rew)
exp_reward = total_rew/episodes
print(exp_reward)

0 -4.000133557724439e-09 -4.000133557724439e-09 False
1 -3.476958720938228e-06 -3.307110918449041e-06 False
2 -0.0001687297479513461 -0.0001555857084445389 False
3 -0.001843616697083661 -0.0017362565741066425 False
4 -0.01013486400913699 -0.009991166652448777 False
5 -0.03780590645985349 -0.039244656395991506 False
6 -0.09920204552260614 -0.11216727559307135 False
7 -0.2238554200961751 -0.2684938643789647 False
8 -0.4413550902615526 -0.5612978487119069 False
9 -0.7593812539308242 -1.0398974357577424 False
10 -1.2462156899584187 -1.7860528035942904 False
11 -1.8981349784419344 -2.865712154485238 False
12 -2.744313284887001 -4.348629521680516 False
13 -3.7775442486709387 -6.287801955973768 False
14 -5.008365820033944 -8.730256652661753 False
15 -6.437940208906534 -11.712897891741 False
16 -8.10441202881651 -15.279865759365304 False
17 -9.941630547547486 -19.43666365653107 False
18 -11.958771886060315 -24.18685908084982 False
19 -14.14826851551143 -29.525759174814922 False
20 -16.55267711

### Evaluation of the environment with the trained policy

In [None]:
total_rew = 0
for episodes in range(1,4):
    cum_reward = 0
    obs = struc_heur.reset()
    action_ = trainer.compute_single_action(obs)
    for t in range(30):
        obs, reward, done, info = struc_heur.step(action_)
        action_ = trainer.compute_single_action(obs)
        cum_reward += reward*0.95**t
        # print(t, reward, cum_reward, done)
        print(t, action_)
    total_rew += cum_reward
    #print(episodes, total_rew)
exp_reward = total_rew/episodes
print(exp_reward)

### Configuration of the trainer

In [141]:
import numpy as np
import pprint
import ray

# Start a new instance of Ray (when running this tutorial locally) or
# connect to an already running one (when running this tutorial through Anyscale).

ray.init()  # Hear the engine humming? ;)

# In case you encounter the following error during our tutorial: `RuntimeError: Maybe you called ray.init twice by accident?`
# Try: `ray.shutdown() + ray.init()` or `ray.init(ignore_reinit_error=True)`

{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': None,
 'object_store_address': 'tcp://127.0.0.1:65094',
 'raylet_socket_name': 'tcp://127.0.0.1:62789',
 'webui_url': None,
 'session_dir': 'C:\\Users\\user\\AppData\\Local\\Temp\\ray\\session_2022-07-03_18-29-37_429622_11440',
 'metrics_export_port': 63003,
 'gcs_address': '127.0.0.1:63263',
 'address': '127.0.0.1:63263',
 'node_id': '9f515986cdb5851653b526151abfd8fc3e7036af6568838d06aef1b3'}

In [140]:
### Shutdown Ray's session
ray.shutdown() 

In [142]:
from ray.rllib.agents.dqn import DQNTrainer
# Create an RLlib Trainer instance.

config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": StructSA,
        
        "env_config": {
            "config": {"components": 3},
        },
        # Number of steps after which the episode is forced to terminate. Defaults
        # to `env.spec.max_episode_steps` (if present) for Gym envs.
        "horizon": 30,
        # Parallelize environment rollouts.
        "num_workers": 1,
        # Discount factor of the MDP.
        "gamma": 0.95,
        
        # https://github.com/ray-project/ray/blob/releases/1.11.1/rllib/models/catalog.py
        # FullyConnectedNetwork (tf and torch): rllib.models.tf|torch.fcnet.py
        # These are used if no custom model is specified and the input space is 1D.
        # Number of hidden layers to be used.
        # Activation function descriptor.
        # Supported values are: "tanh", "relu", "swish" (or "silu"),
        # "linear" (or None).
        "model": {
            "fcnet_hiddens": [50],
            "fcnet_activation": "relu"
        },
        
        "create_env_on_driver": True,
        
        #"evaluation_interval": 2,
        "evaluation_num_workers": 1,
        "evaluation_duration": 50,
        # === Deep Learning Framework Settings ===
        # tf: TensorFlow (static-graph)
        # tf2: TensorFlow 2.x (eager or traced, if eager_tracing=True)
        # tfe: TensorFlow eager (or traced, if eager_tracing=True)
        # torch: PyTorch
#         "framework": "torch",
    }

trainer = DQNTrainer(config=config)

 pid=25916)[0m Instructions for updating:
 pid=25916)[0m If using Keras pass *_constraint arguments to layers.
 pid=7992)[0m Instructions for updating:
 pid=7992)[0m If using Keras pass *_constraint arguments to layers.


Train policy and conduct evaluations periodically

In [143]:
for i in range(6):
    results = trainer.train()
    #if i%100==0:
    #trainer.export_policy_model("D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel")
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")
    #print(f"Iter: {i}; evaluation={results['evaluation']['episode_reward_mean']}")
    
    if i%5==0:
        evaluat = trainer.evaluate()
        print(evaluat['evaluation']['episode_reward_mean'])
#         print(f"Iter: {i}; evaluation={results['evaluation']['episode_reward_mean']}")
        
''' export policy checkpoint
def export_policy_checkpoint(
            self,
            export_dir: str,
            filename_prefix: str = "model",
            policy_id: PolicyID = DEFAULT_POLICY_ID,
    )   
'''
PATH_model = "D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel"

# trainer.export_policy_checkpoint(PATH_model, filename_prefix='modelx')

# trainer.save_checkpoint(PATH_model)

Iter: 0; avg. reward=-610.0629631788396
-213.54618459333696
Iter: 1; avg. reward=-574.8093000168545
Iter: 2; avg. reward=-542.9833661008147
Iter: 3; avg. reward=-470.7264597790566
Iter: 4; avg. reward=-401.3604766520824
Iter: 5; avg. reward=-336.88161575437755
-115.61829361948274


In [145]:
del results["config"]
pprint.pprint(results)

{'agent_timesteps_total': 6000,
 'custom_metrics': {},
 'date': '2022-07-03_18-30-31',
 'done': False,
 'episode_len_mean': 30.0,
 'episode_media': {},
 'episode_reward_max': -178.17161322120927,
 'episode_reward_mean': -336.88161575437755,
 'episode_reward_min': -607.0112646335048,
 'episodes_this_iter': 34,
 'episodes_total': 200,
 'experiment_id': '47066dc8bb4c46d8ab8e03c9d9023d69',
 'hist_stats': {'episode_lengths': [30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
                                    30,
           

## Relevant methods => Check policy

In [122]:
policy = trainer.get_policy()

In [120]:
model = trainer.get_policy().model
model

<ray.rllib.models.catalog.FullyConnectedNetwork_as_DistributionalQTFModel at 0x1e51c93fac8>

In [123]:
# Print out the policy's action and observation spaces.
print(f"Our Policy's observation space is: {policy.observation_space}")
print(f"Our Policy's action space is: {policy.action_space}")

Our Policy's observation space is: Box([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.], [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1.

In [125]:
# Produce a random obervation (B=1; batch of size 1).
obs = np.array([policy.observation_space.sample()])
# Alternatively for PyTorch:
#import torch
#obs = torch.from_numpy(obs)
obs

array([[4.49647475e-01, 1.98659094e-01, 9.97855666e-01, 9.67534702e-01,
        5.73481555e-01, 7.04307302e-01, 5.62468851e-01, 3.08130601e-01,
        8.91871071e-01, 1.51253779e-01, 1.50967371e-02, 2.00555386e-01,
        3.89876109e-01, 7.89557344e-01, 9.75530714e-01, 2.18778612e-03,
        2.34528466e-01, 3.85462266e-01, 1.82480210e-01, 2.87438213e-01,
        3.66434182e-01, 1.46003751e-01, 3.78857642e-01, 9.95665012e-01,
        7.89338130e-01, 9.12188021e-01, 3.20778832e-02, 4.80002429e-01,
        9.09792072e-01, 4.44335792e-01, 6.79965002e-01, 8.73256638e-01,
        1.41052361e-01, 4.41963088e-02, 3.08225769e-02, 9.84142132e-01,
        2.55747697e-01, 9.19058891e-01, 8.91983720e-01, 8.95172290e-01,
        1.43027340e-02, 2.80865647e-02, 9.87832092e-01, 2.34981230e-01,
        6.23564863e-01, 6.41698085e-01, 9.37441152e-01, 2.82349040e-01,
        6.71407780e-01, 5.00220609e-04, 8.60605001e-01, 7.10283511e-01,
        6.04492286e-01, 6.44309111e-01, 6.30302662e-01, 5.554974

In [127]:
logits, _ = model({"obs": obs})
logits

<tf.Tensor 'model_1/fc_out/Relu:0' shape=(1, 50) dtype=float32>

In [130]:
logits_np = policy.get_session().run(logits)
logits_np.shape

(1, 50)

In [133]:
from ray.rllib.utils.numpy import softmax
action_probs = np.squeeze(softmax(logits_np))
action_probs

array([9.99999997e-07, 2.52637593e-03, 9.99999997e-07, 3.36667709e-02,
       9.99999997e-07, 7.53862550e-03, 1.16389645e-02, 1.10806234e-01,
       9.99999997e-07, 9.99999997e-07, 5.46629541e-03, 9.99999997e-07,
       5.45822131e-03, 9.99999997e-07, 9.99999997e-07, 5.21160720e-04,
       9.67060104e-02, 9.99999997e-07, 6.54321313e-02, 9.99999997e-07,
       9.99999997e-07, 9.99999997e-07, 2.38046553e-02, 4.99389060e-02,
       4.55672555e-02, 3.84213141e-04, 1.47604469e-05, 5.44570908e-02,
       9.99999997e-07, 2.57992037e-02, 5.26364446e-02, 9.99999997e-07,
       1.95286050e-02, 9.09144001e-06, 9.13316663e-03, 6.31047934e-02,
       1.87054388e-02, 7.38888932e-03, 9.99999997e-07, 1.95658319e-02,
       9.99999997e-07, 1.11107498e-01, 9.99999997e-07, 9.99999997e-07,
       1.08681709e-01, 9.99999997e-07, 5.03034182e-02, 9.99999997e-07,
       9.99999997e-07, 1.07398111e-04], dtype=float32)

## Action scores

In [None]:
(action_scores, logits, dist) = model.get_q_value_distributions(model_out)

In [None]:
action_scores.graph

## Definition of action transformation (change of base)

In [None]:
def convert_base_action(action_, base, comp):
    action_multi = np.zeros((comp,), dtype=int)
    if action_ == 0:
            return action_multi
    digits = []
    index_comp = int(comp) - 1 
    while action_:
        digits = (int(action_ % base))
        #print(digits)
        action_multi[index_comp] = digits
        action_ //= base
        index_comp -= 1
        #print(index_comp, action_multi)
    return action_multi

# Testing
# action_test = convert_base_action(5, 3, 2)
# action_test

## If Ray does not start...

In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

### To be investigated

https://docs.ray.io/en/latest/rllib/rllib-training.html#accessing-policy-state

help(trainer.get_policy)

In [None]:
trainer.get_policy().export_model(PATH_model+'test')

In [None]:
import tensorflow as tf
rr = trainer.get_policy().q_values

In [None]:
dir(trainer.get_policy())

In [None]:
model.save('my_model.h5')

In [None]:
ee = model.value_function()

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(action_scores)

In [None]:
action_scores.numpy()

In [None]:
help(model.base_model)

In [None]:
model.base_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),loss='mse')

In [None]:
model.base_model.summary()

In [None]:
model_out = model.__call__({"obs": np.array([struc_heur.reset()])})
model_out

In [None]:
with tf.Session():
    model_out[0].eval()

In [None]:
dir(model_out[0])

In [None]:
sess = tf.Session()
sess.run(model_out[0])

Evaluate the trained policy

In [None]:
evaluation = trainer.evaluate()
evaluation

Storing and restoring checkpoint

In [None]:
PATH_model = "D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel"
trainer.save(PATH_model)

In [None]:
trainer.restore(PATH_model+'/checkpoint_000000/checkpoint-0')

Get action from the trained policy

In [9]:
trainer.compute_single_action(struc_heur.reset())

2022-07-03 17:39:01,663	ERROR tf_run_builder.py:47 -- Error fetching: [<tf.Tensor 'default_policy/cond/Merge:0' shape=(?,) dtype=int64>, {'action_prob': <tf.Tensor 'default_policy/Exp:0' shape=(?,) dtype=float32>, 'action_logp': <tf.Tensor 'default_policy/zeros_like_1:0' shape=(?,) dtype=float32>, 'action_dist_inputs': <tf.Tensor 'default_policy/add_2:0' shape=(?, 6) dtype=float32>, 'q_values': <tf.Tensor 'default_policy/add_2:0' shape=(?, 6) dtype=float32>}], feed_dict={<tf.Tensor 'default_policy/obs:0' shape=(?, 61) dtype=float32>: array([[1.052000e-04, 5.500000e-05, 8.660000e-05, 1.261000e-04,
        2.006000e-04, 3.173000e-04, 4.853000e-04, 7.444000e-04,
        1.138400e-03, 1.783100e-03, 2.713600e-03, 4.235700e-03,
        6.473200e-03, 1.002420e-02, 1.530330e-02, 2.316180e-02,
        3.453640e-02, 5.087030e-02, 7.324320e-02, 1.008326e-01,
        1.309823e-01, 1.539425e-01, 1.567708e-01, 1.275575e-01,
        7.401660e-02, 2.583390e-02, 4.230100e-03, 2.268000e-04,
        3.20

ValueError: Cannot feed value of shape (1, 151) for Tensor 'default_policy/obs:0', which has shape '(?, 61)'

Load checkpoint (it required setting up the same configuration as during training)

In [None]:
trainer.load_checkpoint("D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel/checkpoint-52")

Running from the console

In [None]:
!rllib train --run DQN --env CartPole-v0

Additional test code

In [None]:
for i in range(1):
    results = trainer.train()
    #if i%100==0:
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

In [None]:
from ray.rllib.algorithms.ppo import PPO

In [None]:
dir(agents.qmix.QMixTrainer)