In [1]:
import gym
import matplotlib.pyplot as plt 
import scipy.io as spio
import numpy as np
import os

In [7]:
class StructSA(gym.Env):
    """Single-agent belief-space environment built on a pre-computed POMDP model.

    The model arrays are read from ``Dr3031C10.npz`` (resolved relative to the
    current working directory). Each of the ``ncomp`` components carries a
    belief over ``nstcomp`` crack states and an integer deterioration rate.

    A joint action is one integer in ``[0, 3**ncomp)`` whose base-3 digits are
    the per-component actions:

    * 0 -- no direct cost, no observation update;
    * 1 -- costs 1 and conditions the belief on a sampled observation
      (presumably an inspection -- confirm against the model documentation);
    * 2 -- costs 20 and resets the component's deterioration rate to 0
      (presumably a repair/replacement -- confirm).

    The flat observation is the concatenation of all component beliefs, the
    one-hot encoded deterioration rates and the one-hot encoded time step.
    """

    # Number of actions available per component (size of the first axis of P and O).
    N_COMPONENT_ACTIONS = 3

    def __init__(self, config=None):
        """Set up the spaces and load the POMDP model.

        Args:
            config: optional dict of the form ``{"config": {"components": n}}``.
                When omitted (or falsy) a 2-component system is created.
        """
        empty_config = {"config": {"components": 2}}
        config = config or empty_config
        # Number of components #
        self.ncomp = config['config'].get("components", 2)
        self.time = 0  # not read anywhere in this class; kept for backward compatibility
        self.ep_length = 30
        self.nstcomp = 30
        self.nobs = 2
        self.actions_total = int(self.N_COMPONENT_ACTIONS**self.ncomp)
        # Both one-hot encodings (deterioration rate and time step) have ep_length+1 slots.
        onehot_len = self.ep_length + 1
        self.obs_total = int(self.ncomp*self.nstcomp + self.ncomp*onehot_len + onehot_len)

        # configure spaces
        self.action_space = gym.spaces.Discrete(self.actions_total)
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(self.obs_total,), dtype=np.float64)
        ### Loading the underlying POMDP model ###
        # The context manager closes the archive once the needed arrays have been
        # read into memory (indexing an NpzFile loads the full array).
        with np.load('Dr3031C10.npz') as drmodel:
            self.belief0 = drmodel['belief0'][0, 0:self.ncomp, :, 0]  # (10 components, 30 crack states)
            self.P = drmodel['P'][:, 0:self.ncomp, :, :, :]  # (3 actions, 10 components, 31 det rates, 30 cracks, 30 cracks)
            self.O = drmodel['O'][:, 0:self.ncomp, :, :]  # (3 actions, 10 components, 30 cracks, 2 observations)

    def _build_observation(self, belief, drate, time_step):
        """Concatenate beliefs, one-hot deterioration rates and one-hot time into one flat vector."""
        onehot_len = self.ep_length + 1
        return np.concatenate((
            belief.reshape(self.nstcomp*self.ncomp),
            self.one_hot_drate(drate, onehot_len).reshape(onehot_len*self.ncomp),
            self.one_hot_time(time_step, onehot_len),
        ))

    def reset(self, seed=None, return_info=False, options=None):
        """Start a new episode from the initial belief with all deterioration rates at 0.

        Args:
            seed: accepted for API compatibility but currently unused; observations
                are sampled from NumPy's global random state.
            return_info: when True, return ``(observation, info)`` instead of the
                observation alone.
            options: accepted for API compatibility; unused.

        Returns:
            The initial flat observation, optionally with ``{"belief": ...}``.
        """
        # We need the following line to seed self.np_random
        # super().reset(seed=seed)

        self.time_step = 0
        # Copy so that callers mutating info["belief"] cannot corrupt the stored initial belief.
        self.agent_belief = self.belief0.copy()
        self.drate = np.zeros((self.ncomp, 1), dtype=int)

        observation = self._build_observation(self.agent_belief, self.drate, self.time_step)
        info = {"belief": self.agent_belief}
        return (observation, info) if return_info else observation

    def step(self, action, return_info=False):
        """Advance the environment by one time step.

        Args:
            action: joint action index in ``[0, 3**ncomp)``.
            return_info: unused; kept so existing callers keep working.

        Returns:
            ``(observation, reward, done, info)`` where ``reward`` is a Python
            float (negative cost) and ``info`` holds the updated belief.
        """
        action_ = self.convert_base_action(action, self.N_COMPONENT_ACTIONS, self.ncomp)
        _, belief_prime, drate_prime = self.belief_update(self.agent_belief, action_, self.drate)
        observation = self._build_observation(belief_prime, drate_prime, self.time_step + 1)
        # The cost is evaluated with the belief and deterioration rate from *before* the transition.
        reward_ = self.immediate_cost(self.agent_belief, action_, belief_prime, self.drate)
        reward = np.asarray(reward_).item()  # Convert float64 (or 1-element array) to float
        self.time_step += 1
        self.agent_belief = belief_prime
        self.drate = drate_prime
        # The episode ends once the time horizon has been reached.
        done = bool(self.time_step >= self.ep_length)
        info = {"belief": self.agent_belief}
        return (observation, reward, done, info)

    def pf_sys(self, pf, k):  # compute pf_sys for k-out-of-n components
        """Return the system failure probability for a k-out-of-n arrangement.

        Args:
            pf: 1-D array of component failure probabilities (``n = pf.size``).
            k: the ``k`` of the k-out-of-n rule.

        Returns:
            ``1 - A[k+1]`` after the recursion below has processed all components.
        """
        n = pf.size
        # k = ncomp-1
        nk = n-k
        m = k+1
        A = np.zeros(m+1)
        A[1] = 1
        L = 1
        for j in range(1, n+1):
            h = j + 1
            Rel = 1-pf[j-1]  # reliability of component j
            if nk < j:
                L = h - nk
            if k < j:
                A[m] = A[m] + A[k]*Rel
                h = k
            # Descending order so that A[i-1] still holds the value from the previous component.
            for i in range(h, L-1, -1):
                A[i] = A[i] + (A[i-1]-A[i])*Rel
        PF_sys = 1-A[m]
        return PF_sys

    def immediate_cost(self, B, a, B_, drate):  # immediate reward (-cost), based on current damage state and action#
        """Compute the (non-positive) reward of one transition.

        Args:
            B: beliefs before the transition, shape ``(ncomp, nstcomp)``.
            a: per-component actions.
            B_: beliefs after the transition and observation update.
            drate: deterioration rates before the transition, shape ``(ncomp, 1)``.

        Returns:
            Action costs (1 per action-1 component, 20 per action-2 component)
            plus a risk term scaled by 10000.
        """
        cost_system = 0
        # The probability of the last state is used as the component failure probability.
        PF = B[:, -1]
        PF_ = B_[:, -1].copy()
        for i in range(self.ncomp):
            if a[i] == 1:
                cost_system += -1
                # Use the propagated belief *without* the observation update, so the
                # risk term does not depend on the sampled observation.
                Bplus = self.P[a[i], i, drate[i, 0]].T.dot(B[i, :])
                PF_[i] = Bplus[-1]
            elif a[i] == 2:
                cost_system += - 20
        if self.ncomp < 2:  # single component setting
            PfSyS_ = PF_
            PfSyS = PF
        else:
            PfSyS_ = self.pf_sys(PF_, self.ncomp-1)
            PfSyS = self.pf_sys(PF, self.ncomp-1)
        if PfSyS_ < PfSyS:
            # System failure probability decreased: charge the full new risk.
            cost_system += PfSyS_*(-10000)
        else:
            # Otherwise charge only the increment in failure probability.
            cost_system += (PfSyS_-PfSyS)*(-10000)
        return cost_system

    def belief_update(self, b, a, drate):  # Bayesian belief update based on previous belief, current observation, and action taken
        """Propagate the beliefs through the transition model and condition on observations.

        Args:
            b: current beliefs, shape ``(ncomp, nstcomp)``; not modified.
            a: per-component actions.
            drate: current deterioration rates, shape ``(ncomp, 1)``.

        Returns:
            ``(ob, b_prime, drate_prime)``: per-component observations (2 means
            "no observation collected"), updated beliefs and updated rates.
        """
        b_prime = np.zeros((self.ncomp, self.nstcomp))
        b_prime[:] = b
        ob = np.zeros(self.ncomp)
        drate_prime = np.zeros((self.ncomp, 1), dtype=int)
        for i in range(self.ncomp):
            p1 = self.P[a[i], i, drate[i, 0]].T.dot(b_prime[i, :])  # environment transition
            b_prime[i, :] = p1
            drate_prime[i, 0] = drate[i, 0] + 1
            ob[i] = 2
            if a[i] == 1:
                Obs0 = np.sum(p1 * self.O[a[i], i, :, 0])
                Obs1 = 1 - Obs0
                if Obs1 < 1e-5:
                    # Observation 1 is (numerically) impossible; skip sampling.
                    ob[i] = 0
                else:
                    ob_dist = np.array([Obs0, Obs1])
                    ob[i] = np.random.choice(range(0, self.nobs), size=None, replace=True, p=ob_dist)
                likelihood = self.O[a[i], i, :, int(ob[i])]
                b_prime[i, :] = p1 * likelihood / (p1.dot(likelihood))  # belief update
            if a[i] == 2:
                drate_prime[i, 0] = 0
        return ob, b_prime, drate_prime

    def convert_base_action(self, action_, base, comp):
        """Decode a joint action index into per-component actions.

        The index is written in the given ``base`` with ``comp`` digits, most
        significant digit first.

        Args:
            action_: joint action index (Python or NumPy integer).
            base: number of actions per component.
            comp: number of components.

        Returns:
            Integer array of length ``comp``.

        Raises:
            ValueError: if ``action_`` is outside ``[0, base**comp)``. Such values
                previously wrapped around to negative indices and produced a
                wrong decoding.
        """
        comp = int(comp)
        remaining = int(np.asarray(action_).item())
        if not 0 <= remaining < base**comp:
            raise ValueError(
                "action {} is outside the valid range [0, {})".format(remaining, base**comp))
        action_multi = np.zeros((comp,), dtype=int)
        index_comp = comp - 1
        while remaining:
            remaining, digit = divmod(remaining, base)
            action_multi[index_comp] = digit
            index_comp -= 1
        return action_multi

    def one_hot_drate(self, drate, ep_length):
        """One-hot encode each component's deterioration rate into a row of width ``ep_length``."""
        ohDrate = np.zeros((self.ncomp, ep_length), dtype=int)
        for i in range(self.ncomp):
            ohDrate[i, drate[i][0]] = 1
        return ohDrate

    def one_hot_time(self, time, ep_length):
        """One-hot encode the time step into a vector of length ``ep_length``."""
        ohTime = np.zeros((ep_length), dtype=int)
        ohTime[time] = 1
        return ohTime

### Initialization of the environment

In [8]:
# Build the environment with the default configuration (2 components).
struc_heur = StructSA()

In [5]:
# Alternative setup: build the environment with 5 components instead of the default 2.
# NOTE(review): the execution counts show this cell ran *before* the default
# instantiation, and the stored outputs below contain two 30-value belief blocks,
# i.e. they come from the 2-component instance. On a top-to-bottom run this cell
# wins instead. The name `config` is also reused later for the trainer configuration.
config = {"config": {"components": 5} }
struc_heur = StructSA(config)

In [9]:
# Start a new episode and display the initial flat observation
# (component beliefs, then one-hot deterioration rates, then one-hot time step).
struc_heur.reset()

array([1.052000e-04, 5.500000e-05, 8.660000e-05, 1.261000e-04,
       2.006000e-04, 3.173000e-04, 4.853000e-04, 7.444000e-04,
       1.138400e-03, 1.783100e-03, 2.713600e-03, 4.235700e-03,
       6.473200e-03, 1.002420e-02, 1.530330e-02, 2.316180e-02,
       3.453640e-02, 5.087030e-02, 7.324320e-02, 1.008326e-01,
       1.309823e-01, 1.539425e-01, 1.567708e-01, 1.275575e-01,
       7.401660e-02, 2.583390e-02, 4.230100e-03, 2.268000e-04,
       3.200000e-06, 0.000000e+00, 1.052000e-04, 5.500000e-05,
       8.660000e-05, 1.261000e-04, 2.006000e-04, 3.173000e-04,
       4.853000e-04, 7.444000e-04, 1.138400e-03, 1.783100e-03,
       2.713600e-03, 4.235700e-03, 6.473200e-03, 1.002420e-02,
       1.530330e-02, 2.316180e-02, 3.453640e-02, 5.087030e-02,
       7.324320e-02, 1.008326e-01, 1.309823e-01, 1.539425e-01,
       1.567708e-01, 1.275575e-01, 7.401660e-02, 2.583390e-02,
       4.230100e-03, 2.268000e-04, 3.200000e-06, 0.000000e+00,
       1.000000e+00, 0.000000e+00, 0.000000e+00, 0.0000

+ DN => -12.22

In [18]:
# Apply joint action 0 (which decodes to action 0 for every component)
# and display the resulting observation.
act = 0
observation, reward, done, info = struc_heur.step(act)
observation

array([1.052000e-04, 5.500000e-05, 8.660000e-05, 1.258000e-04,
       2.009000e-04, 3.164000e-04, 4.854000e-04, 7.417000e-04,
       1.136900e-03, 1.778300e-03, 2.701200e-03, 4.202100e-03,
       6.416000e-03, 9.892800e-03, 1.504950e-02, 2.263210e-02,
       3.342290e-02, 4.873800e-02, 6.927450e-02, 9.389430e-02,
       1.200616e-01, 1.401097e-01, 1.444372e-01, 1.247854e-01,
       8.591560e-02, 4.474280e-02, 1.769880e-02, 5.987300e-03,
       2.144100e-03, 2.861900e-03, 1.052000e-04, 5.500000e-05,
       8.660000e-05, 1.259000e-04, 2.008000e-04, 3.170000e-04,
       4.854000e-04, 7.430000e-04, 1.136800e-03, 1.781100e-03,
       2.706700e-03, 4.218400e-03, 6.436100e-03, 9.955300e-03,
       1.516110e-02, 2.286210e-02, 3.391110e-02, 4.966510e-02,
       7.095350e-02, 9.689090e-02, 1.246830e-01, 1.459779e-01,
       1.498704e-01, 1.269122e-01, 8.244760e-02, 3.756170e-02,
       1.136830e-02, 2.490700e-03, 5.424000e-04, 3.487000e-04,
       0.000000e+00, 0.000000e+00, 0.000000e+00, 0.0000

### Evaluation of the environment

In [6]:
# Roll out a fixed periodic heuristic and report the discounted return averaged over episodes.
N_HEURISTIC_EPISODES = 1
GAMMA = 0.95                 # discount factor (same value as the trainer's "gamma")
HEURISTIC_PERIOD = 4         # every HEURISTIC_PERIOD-th step uses ACTION_ON_PERIOD
ACTION_ON_PERIOD = 0         # joint action index taken when t % HEURISTIC_PERIOD == 0
ACTION_OFF_PERIOD = 0        # joint action index taken on all other steps

total_rew = 0
for episodes in range(1, N_HEURISTIC_EPISODES + 1):
    cum_reward = 0
    struc_heur.reset()
    for t in range(struc_heur.ep_length):
        action_ = ACTION_ON_PERIOD if t % HEURISTIC_PERIOD == 0 else ACTION_OFF_PERIOD
        observation, reward, done, info = struc_heur.step(action_)
        cum_reward += reward*GAMMA**t
        print(t, reward, cum_reward, done)
    total_rew += cum_reward
    #print(episodes, total_rew)
# Divide by the episode count itself rather than by the leaked loop variable.
exp_reward = total_rew/N_HEURISTIC_EPISODES
print(exp_reward)

0 -3.9968028886505635e-10 -3.9968028886505635e-10 False
1 -3.4770075707513115e-07 -3.3071539951023965e-07 False
2 -1.6874399655364414e-05 -1.5559861088476623e-05 False
3 -0.00018441750015973923 -0.00017367481528793302 False
4 -0.001014276900246358 -0.000999809689769218 False
5 -0.003786815999662352 -0.003929975724127952 False
6 -0.009949665500386118 -0.011243894147893119 False
7 -0.02249372169971231 -0.02695209893875552 False
8 -0.04445859600021862 -0.05644683987172673 False
9 -0.07673020799958685 -0.10480600817151282 False
10 -0.12639293029970133 -0.18048212440052575 False
11 -0.19336570119987684 -0.290468553086118 False
12 -0.28099668930026134 -0.4423079487493179 False
13 -0.3890220447000381 -0.6420093356172881 False
14 -0.5190692265000241 -0.8951464098102014 False
15 -0.6718887338996726 -1.2064265678690598 False
16 -0.8522119999998523 -1.5815077964140531 False
17 -1.0539131616005282 -2.022170320834369 False
18 -1.2787437253003464 -2.530105638162269 False
19 -1.526750699999413 -3.106

### Evaluation of the environment with the trained policy

In [7]:
# Roll out the trained policy and report the discounted return averaged over episodes.
# Prerequisites: `trainer` must already exist (run the trainer configuration cell first;
# the stored NameError below comes from running this cell too early), and `struc_heur`
# must have the same number of components as the trainer's "env_config", otherwise the
# observation size will not match the policy input.
N_POLICY_EPISODES = 3
GAMMA = 0.95  # discount factor (same value as the trainer's "gamma")

total_rew = 0
for episodes in range(1, N_POLICY_EPISODES + 1):
    cum_reward = 0
    obs = struc_heur.reset()
    for t in range(struc_heur.ep_length):
        # Query the policy greedily (no exploration noise) for the action taken at step t.
        action_ = trainer.compute_single_action(obs, explore=False)
        print(t, action_)
        obs, reward, done, info = struc_heur.step(action_)
        cum_reward += reward*GAMMA**t
        # print(t, reward, cum_reward, done)
    total_rew += cum_reward
    #print(episodes, total_rew)
# Divide by the episode count itself rather than by the leaked loop variable.
exp_reward = total_rew/N_POLICY_EPISODES
print(exp_reward)

NameError: name 'trainer' is not defined

### Configuration of the trainer

In [19]:
import numpy as np
import pprint
import ray
from datetime import datetime
import tempfile
from ray.tune.logger import Logger, UnifiedLogger

# Start a new instance of Ray (when running this tutorial locally) or
# connect to an already running one (when running this tutorial through Anyscale).
# `ignore_reinit_error=True` keeps this cell re-runnable: a second call in the same
# kernel no longer raises `RuntimeError: Maybe you called ray.init twice by accident?`.

ray.init(ignore_reinit_error=True)  # Hear the engine humming? ;)

# To start from a clean Ray session instead, run `ray.shutdown()` and then this cell again.

{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': None,
 'object_store_address': 'tcp://127.0.0.1:56811',
 'raylet_socket_name': 'tcp://127.0.0.1:58687',
 'webui_url': None,
 'session_dir': 'C:\\Users\\user\\AppData\\Local\\Temp\\ray\\session_2022-07-07_15-25-45_242275_28996',
 'metrics_export_port': 61219,
 'gcs_address': '127.0.0.1:55030',
 'address': '127.0.0.1:55030',
 'node_id': '394336ab1d3b78a7f7dfad335b8485f5d848fb0b06d2cf479b4687ea'}

In [None]:
### Shutdown Ray's session
# Run this before calling ray.init() again when a fresh Ray session is wanted.
ray.shutdown() 

In [20]:
from ray.rllib.agents.dqn import DQNTrainer
# Configuration dictionary for the RLlib Trainer instance created below.
# NOTE(review): this rebinds the name `config`, which an earlier cell uses for the
# environment configuration.

config={
        # Env class to use (here: our gym.Env sub-class from above).
        "env": StructSA,
        
        # Handed to StructSA.__init__, which reads config['config']['components'].
        "env_config": {
            "config": {"components": 3},
        },
        # Number of steps after which the episode is forced to terminate. Defaults
        # to `env.spec.max_episode_steps` (if present) for Gym envs.
        # 30 matches StructSA.ep_length.
        "horizon": 30,
        # Parallelize environment rollouts.
        "num_workers": 3,
        # Discount factor of the MDP.
        "gamma": 0.95,
        
        # https://github.com/ray-project/ray/blob/releases/1.11.1/rllib/models/catalog.py
        # FullyConnectedNetwork (tf and torch): rllib.models.tf|torch.fcnet.py
        # These are used if no custom model is specified and the input space is 1D.
        # Number of hidden layers to be used.
        # Activation function descriptor.
        # Supported values are: "tanh", "relu", "swish" (or "silu"),
        # "linear" (or None).
        "model": {
            "fcnet_hiddens": [100],
            "fcnet_activation": "relu"
        },
        
        "create_env_on_driver": True,
        
        # Periodic evaluation is disabled here; the training loop calls
        # trainer.evaluate() explicitly instead.
        #"evaluation_interval": 2,
        "evaluation_num_workers": 1,
        "evaluation_duration": 50,
        # === Deep Learning Framework Settings ===
        # tf: TensorFlow (static-graph)
        # tf2: TensorFlow 2.x (eager or traced, if eager_tracing=True)
        # tfe: TensorFlow eager (or traced, if eager_tracing=True)
        # torch: PyTorch
#         "framework": "torch",
    }

# Directory that receives the training logs.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PATH_logger = "D:/14_DecomposedQ_DRL/single_agent_environment/01_single_agent_multiComponent/log_files"
### Default logger creator ###
def custom_log_creator(custom_path, custom_str):
    """Return a ``logger_creator`` callback that logs into a fresh directory.

    Args:
        custom_path: parent directory for the log directories; created on demand.
        custom_str: prefix of the log directory name; a timestamp taken when this
            function is called is appended to it.

    Returns:
        A function ``logger_creator(config)`` that creates a unique directory
        ``<custom_str>_<timestamp><random suffix>`` under ``custom_path``, prints
        its path and returns a ``UnifiedLogger`` writing into it.
    """
    timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    logdir_prefix = "{}_{}".format(custom_str, timestr)

    def logger_creator(config):
        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(custom_path, exist_ok=True)
        logdir = tempfile.mkdtemp(prefix=logdir_prefix, dir=custom_path)
        print(logdir)
        return UnifiedLogger(config, logdir, loggers=None)

    return logger_creator

# Build the DQN trainer; its logs go to a timestamped 'dqn_*' directory under PATH_logger.
trainer = DQNTrainer(config=config, logger_creator=custom_log_creator(PATH_logger, 'dqn'))

2022-07-07 15:26:34,143	INFO trainer.py:2141 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
2022-07-07 15:26:34,147	INFO simple_q.py:155 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting `simple_optimizer=True` if this doesn't work for you.
2022-07-07 15:26:34,148	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


D:/14_DecomposedQ_DRL/single_agent_environment/01_single_agent_multiComponent/log_files\dqn_2022-07-07_15-26-34oik05esn
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


 pid=24256)[0m Instructions for updating:
 pid=24256)[0m If using Keras pass *_constraint arguments to layers.
 pid=10488)[0m Instructions for updating:
 pid=10488)[0m If using Keras pass *_constraint arguments to layers.
 pid=33260)[0m Instructions for updating:
 pid=33260)[0m If using Keras pass *_constraint arguments to layers.
 pid=28744)[0m Instructions for updating:
 pid=28744)[0m If using Keras pass *_constraint arguments to layers.


Train policy and conduct evaluations periodically

In [21]:
# Train the policy and evaluate it at regular intervals.
N_TRAIN_ITERATIONS = 100
EVAL_PERIOD = 5  # call trainer.evaluate() on iterations 0, 5, 10, ...

for i in range(N_TRAIN_ITERATIONS):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

    if i % EVAL_PERIOD != 0:
        continue
    evaluat = trainer.evaluate()
    print(evaluat['evaluation']['episode_reward_mean'])

# Options for persisting the trained policy (not executed here):
#   trainer.export_policy_model(<export_dir>)
#   trainer.export_policy_checkpoint(PATH_model, filename_prefix='modelx')
#       signature: export_policy_checkpoint(export_dir, filename_prefix="model",
#                                           policy_id=DEFAULT_POLICY_ID)
#   trainer.save_checkpoint(PATH_model)
# with PATH_model = "D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel"

Iter: 0; avg. reward=-622.3679274583674
-797.5626443162677
Iter: 1; avg. reward=-596.1110848079109
Iter: 2; avg. reward=-562.8474449614669
Iter: 3; avg. reward=-493.97519639753494
Iter: 4; avg. reward=-416.8451801202591
Iter: 5; avg. reward=-352.4606007736644
-115.61829361948274
Iter: 6; avg. reward=-302.0749573562642
Iter: 7; avg. reward=-244.20190277362275
Iter: 8; avg. reward=-184.5709092635305
Iter: 9; avg. reward=-132.64731963854294
Iter: 10; avg. reward=-117.42668895301873
-123.07695051959621
Iter: 11; avg. reward=-132.5139914022651
Iter: 12; avg. reward=-122.05821773723605
Iter: 13; avg. reward=-138.6183830932686
Iter: 14; avg. reward=-108.9546088362364
Iter: 15; avg. reward=-140.31326713164546
-115.61829361948274
Iter: 16; avg. reward=-117.7515263244716
Iter: 17; avg. reward=-128.40167224745815
Iter: 18; avg. reward=-101.18263108011341
Iter: 19; avg. reward=-91.23610083361514
Iter: 20; avg. reward=-77.95565266126897
-115.72290558069476
Iter: 21; avg. reward=-108.0189625936939
I

' export policy checkpoint\ndef export_policy_checkpoint(\n            self,\n            export_dir: str,\n            filename_prefix: str = "model",\n            policy_id: PolicyID = DEFAULT_POLICY_ID,\n    )   \n'

In [None]:
# Pretty-print the result dictionary returned by the most recent trainer.train() call.
# del results["config"]
pprint.pprint(results)

## Relevant methods => Check policy

In [None]:
# Fetch the trainer's policy object for inspection in the following cells.
policy = trainer.get_policy()

In [None]:
# Grab the model object attached to the policy and display it.
model = trainer.get_policy().model
model

In [None]:
# Print out the policy's observation and action spaces
# (they should correspond to the spaces declared in StructSA.__init__).
print(f"Our Policy's observation space is: {policy.observation_space}")
print(f"Our Policy's action space is: {policy.action_space}")

In [None]:
# Produce a random observation (B=1; batch of size 1).
# NOTE: this rebinds `obs`, which the trained-policy rollout cell also uses.
obs = np.array([policy.observation_space.sample()])
# Alternatively for PyTorch:
#import torch
#obs = torch.from_numpy(obs)
obs

In [None]:
# Forward the observation batch through the model; the first returned value is kept
# as `logits`, the second one is discarded.
logits, _ = model({"obs": obs})
logits

In [None]:
# Evaluate the `logits` tensor in the policy's TensorFlow session and show the array shape.
logits_np = policy.get_session().run(logits)
logits_np.shape

In [None]:
# Turn the evaluated logits into a normalized score vector with RLlib's NumPy softmax helper.
from ray.rllib.utils.numpy import softmax
action_probs = np.squeeze(softmax(logits_np))
action_probs

## Action scores

In [None]:
# NOTE(review): `model_out` is only assigned in a later cell (under "To be investigated"),
# so this cell fails on a fresh top-to-bottom run. It also rebinds `logits`.
(action_scores, logits, dist) = model.get_q_value_distributions(model_out)

In [None]:
# Show the graph attribute of the `action_scores` tensor (exploratory).
action_scores.graph

## Definition of action transformation (change of base)

In [None]:
def convert_base_action(action_, base, comp):
    """Decode a joint action index into per-component actions.

    The index is written in the given ``base`` using ``comp`` digits, most
    significant digit first; e.g. ``convert_base_action(5, 3, 2)`` gives ``[1, 2]``.
    This mirrors ``StructSA.convert_base_action``.

    Args:
        action_: joint action index (Python or NumPy integer).
        base: number of actions per component.
        comp: number of components.

    Returns:
        Integer array of length ``comp``.

    Raises:
        ValueError: if ``action_`` is outside ``[0, base**comp)``. Such values
            previously wrapped around to negative indices and produced a wrong
            decoding.
    """
    comp = int(comp)
    remaining = int(np.asarray(action_).item())
    if not 0 <= remaining < base**comp:
        raise ValueError(
            "action {} is outside the valid range [0, {})".format(remaining, base**comp))
    action_multi = np.zeros((comp,), dtype=int)
    index_comp = comp - 1
    while remaining:
        # Peel off the least significant digit and store it from the right.
        remaining, digit = divmod(remaining, base)
        action_multi[index_comp] = digit
        index_comp -= 1
    return action_multi

# Testing
# action_test = convert_base_action(5, 3, 2)
# action_test

In [136]:
# NOTE(review): `q_val` is not defined in any visible cell. The stored error below shows
# the run failed because the 'default_policy/obs' placeholder was not fed; a feed_dict
# supplying observations is presumably required -- confirm.
policy.get_session().run(q_val)

InvalidArgumentError: You must feed a value for placeholder tensor 'default_policy/obs' with dtype float and shape [?,241]
	 [[node default_policy/obs (defined at C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]]

Original stack trace for 'default_policy/obs':
  File "C:\Users\user\Anaconda3\envs\gym\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\user\Anaconda3\envs\gym\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\traitlets\config\application.py", line 664, in launch_instance
    app.start()
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tornado\platform\asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\user\Anaconda3\envs\gym\lib\asyncio\base_events.py", line 438, in run_forever
    self._run_once()
  File "C:\Users\user\Anaconda3\envs\gym\lib\asyncio\base_events.py", line 1451, in _run_once
    handle._run()
  File "C:\Users\user\Anaconda3\envs\gym\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tornado\gen.py", line 787, in inner
    self.run()
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tornado\gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ipykernel\kernelbase.py", line 545, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\IPython\core\interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\IPython\core\interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\IPython\core\interactiveshell.py", line 3051, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\IPython\core\interactiveshell.py", line 3242, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-106-7cdb4d8bac2b>", line 44, in <module>
    trainer = DQNTrainer(config=config)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\agents\trainer.py", line 747, in __init__
    sync_function_tpl)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\tune\trainable.py", line 124, in __init__
    self.setup(copy.deepcopy(self.config))
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\agents\trainer.py", line 827, in setup
    num_workers=self.config["num_workers"])
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\agents\trainer.py", line 2002, in _make_workers
    logdir=self.logdir,
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 132, in __init__
    spaces=spaces,
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\evaluation\worker_set.py", line 540, in _make_worker
    spaces=spaces,
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 590, in __init__
    seed=seed)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\evaluation\rollout_worker.py", line 1578, in _build_policy_map
    conf, merged_conf)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\policy\policy_map.py", line 134, in create_policy
    observation_space, action_space, merged_config)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\policy\tf_policy_template.py", line 252, in __init__
    get_batch_divisibility_req=get_batch_divisibility_req,
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 247, in __init__
    **prev_action_ph))
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\policy\dynamic_tf_policy.py", line 608, in _get_input_dict_and_dummy_batch
    flatten=flatten,
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\ray\rllib\utils\tf_utils.py", line 216, in get_placeholder
    name=name,
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 2630, in placeholder
    return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tensorflow_core\python\ops\gen_array_ops.py", line 6670, in placeholder
    "Placeholder", dtype=dtype, shape=shape, name=name)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tensorflow_core\python\framework\op_def_library.py", line 793, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tensorflow_core\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3360, in create_op
    attrs, op_def, compute_device)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3429, in _create_op_internal
    op_def=op_def)
  File "C:\Users\user\Anaconda3\envs\gym\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1751, in __init__
    self._traceback = tf_stack.extract_stack()


## If Ray does not start...

In [None]:
# Workaround to try when Ray fails to start: set KMP_DUPLICATE_LIB_OK in the process
# environment (presumably to tolerate a duplicated OpenMP runtime -- confirm).
# It has to run before the libraries that read the variable are loaded.
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

### To be investigated

https://docs.ray.io/en/latest/rllib/rllib-training.html#accessing-policy-state

help(trainer.get_policy)

In [None]:
# Export the policy's model to a directory next to PATH_model (suffix 'test').
# NOTE(review): PATH_model is only assigned in a later cell ("Storing and restoring checkpoint").
trainer.get_policy().export_model(PATH_model+'test')

In [None]:
# Import TensorFlow for the session experiments below and keep a handle on the
# policy's `q_values` attribute.
import tensorflow as tf
rr = trainer.get_policy().q_values

In [None]:
# List the attributes exposed by the trainer's policy object (exploration aid).
dir(trainer.get_policy())

In [None]:
# Try to save the policy model to 'my_model.h5' in the working directory.
# NOTE(review): `model` is the RLlib policy model; confirm it actually offers `save`.
model.save('my_model.h5')

In [None]:
# Keep the model's value-function output for inspection (exploratory).
ee = model.value_function()

In [None]:
# Evaluate `action_scores` in a newly created TF session after initializing all variables.
# NOTE(review): this is a fresh session, not the one from policy.get_session(), so the
# variables are presumably re-initialized rather than the trained ones -- confirm.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(action_scores)

In [None]:
# NOTE(review): `.numpy()` presumably needs eager tensors, while the trainer log above
# reports static-graph 'tf' mode -- confirm that this cell can run.
action_scores.numpy()

In [None]:
# Show the built-in help for the model's `base_model` attribute.
help(model.base_model)

In [None]:
# Compile the underlying base model with Adam (learning rate 1e-3) and an MSE loss (exploratory).
model.base_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),loss='mse')

In [None]:
# Print the layer summary of the underlying base model.
model.base_model.summary()

In [None]:
# Feed the initial observation (as a batch of one) through the model.
# `model_out` is also used by the "Action scores" cell further up, which therefore
# depends on this cell having been executed first.
model_out = model.__call__({"obs": np.array([struc_heur.reset()])})
model_out

In [None]:
# Try to evaluate the first element of `model_out` inside a newly created TF session.
with tf.Session():
    model_out[0].eval()

In [None]:
# List the attributes of the first element of `model_out` (exploration aid).
dir(model_out[0])

In [None]:
# Same experiment with an explicit session object.
# NOTE(review): the session is never closed.
sess = tf.Session()
sess.run(model_out[0])

Evaluate the trained policy

In [None]:
# Run one evaluation round with the current policy and display the returned metrics.
evaluation = trainer.evaluate()
evaluation

Storing and restoring checkpoint

In [None]:
# Save a trainer checkpoint under PATH_model.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
# PATH_model is also referenced by an earlier exploratory cell (export_model).
PATH_model = "D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel"
trainer.save(PATH_model)

In [None]:
# Restore the trainer state from the checkpoint file 'checkpoint_000000/checkpoint-0' under PATH_model.
trainer.restore(PATH_model+'/checkpoint_000000/checkpoint-0')

Get action from the trained policy

In [None]:
# Ask the trained policy for the action at the initial observation of a new episode.
# `struc_heur` must have the same number of components as the trainer's "env_config".
trainer.compute_single_action(struc_heur.reset())

Load checkpoint (this requires setting up the trainer with the same configuration as during training)

In [None]:
# Load a specific checkpoint file ('checkpoint-52') into the existing trainer.
# NOTE(review): absolute, machine-specific path.
trainer.load_checkpoint("D:/14_DecomposedQ_DRL/single_agent_environment/struc_SA_jupyter/savedModel/checkpoint-52")

Running from the console

In [None]:
# Shell command: run RLlib's command-line trainer with DQN on the CartPole-v0 environment.
!rllib train --run DQN --env CartPole-v0

Additional test code

In [None]:
# Smoke test: run a single training iteration and print the mean episode reward.
for i in range(1):
    results = trainer.train()
    #if i%100==0:
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

In [None]:
from ray.rllib.algorithms.ppo import PPO

In [None]:
# List the attributes of the QMIX trainer class (exploration aid).
# NOTE(review): `agents` is not imported in any visible cell, so this raises a
# NameError unless it is bound elsewhere.
dir(agents.qmix.QMixTrainer)