In [88]:
import os
import numpy as np
import random
import tqdm as notebook_tqdm

import gym
from gym import Env
from gym.spaces import Discrete, Box, MultiDiscrete

import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
#%load_ext tensorboard

In [89]:
# ---------------------------------------------------------------------------
# Configuration shared by the TriaEnv environment and the training cells.
# ---------------------------------------------------------------------------

# Prefix used to build the saved-model names at the bottom of this cell.
env_name = 'tria-3d-rl-model-'

# Centre of the initial state, in the order [t, h, a] used by TriaEnv.state
# (presumably temperature, humidity and air quality -- TODO confirm units).
t_ini = 70.0
h_ini = 40.0
a_ini = 10.0

# Lower/upper bounds of the observation space, one pair per state component.
t_min = -40.0
t_max = 110.0
h_min = 0.0
h_max = 100.0
a_min = 0.0
a_max = 5000.0

# Number of discrete choices per actuator (2 => off/on).
act_state = 2

# Uniform noise range added to each *_ini value when the state is initialised.
stat_rand_min = -1.0
stat_rand_max = 1.0

# Number of steps in one episode.
equilibrium_cycles = 60

# Per-component rewards: r3 for the first band of d3, r2 for the second,
# r1 for the third, nr3 when the value is outside all three bands.
# NOTE(review): the second band (r2 = -0.5) is penalised more than the third
# band (r1 = -0.25) -- confirm this ordering is intended.
r1 = -0.25
r2 = -0.5
r3 = 2.0
nr3 = -2.0

# One weight per actuator. The action space has five actuators, and
# TriaEnv.step zips the action with this vector; with only four weights the
# zip truncated the action, so the fifth actuator was ignored and the third
# one was applied to two state components at once.
const_weight_vec = [1, 1, 1, 1, 1]

# Reward bands per state component: [band1_lo, band1_hi, band2_lo, band2_hi,
# band3_lo, band3_hi], checked in that order.
# NOTE(review): rows 0 and 1 are nested ranges, while row 2 is three disjoint
# ranges with a gap between 19.0 and 200.0 -- confirm that is intended.
d3 = {
    0: [65.0, 80.0, 50.0, 85.0, 40.0, 90.0],
    1: [30.0, 50.0, 20.0, 60.0, 10.0, 70.0],
    2: [0.0, 19.0, 200.0, 599.0, 600.0, 2000.0],
}

# Single-band variant of the table above (only used by a commented-out reward).
d1 = {0: [65.0, 80.0], 1: [30.0, 50.0], 2: [0.0, 20.0]}

# Training budgets, one per algorithm.
ppo_model_timesteps = 20000
neural_model_timesteps = 20000
a2c_model_timesteps = 20000

# Names under which the trained models are identified.
ppo_model = env_name + 'ppo'
neural_model = env_name + 'ppo-neural'
a2c_model = env_name + 'a2c'

In [90]:
class TriaEnv(Env):
    """Gym environment whose three-component state is nudged by on/off actuators.

    Each step converts the binary action vector into signed, weighted
    contributions, folds them into one delta per state component, adds the
    deltas to the state and scores every component against the bands in
    ``d3``. An episode lasts ``equilibrium_cycles`` steps.

    The environment uses the classic gym API: ``reset()`` returns an
    observation and ``step()`` returns ``(obs, reward, done, info)``.

    NOTE: the state is not clipped, so it can drift outside the bounds
    declared in ``observation_space``.
    """

    # Number of binary actuators exposed through the action space.
    N_ACTUATORS = 5

    def __init__(self):
        super().__init__()

        # One off/on choice per actuator.
        self.action_space = MultiDiscrete(np.array([act_state] * self.N_ACTUATORS))

        # Build the bounds directly in the Box dtype so no implicit
        # float64 -> float32 down-cast happens inside Box.
        low = np.array([t_min, h_min, a_min], dtype=np.float32)
        high = np.array([t_max, h_max, a_max], dtype=np.float32)
        self.observation_space = Box(low, high, dtype=np.float32)

        self.state = self._initial_state()
        self.equilibrium_cycles_len = equilibrium_cycles

    @staticmethod
    def _initial_state():
        """Return a fresh [t, h, a] state: each *_ini value plus uniform noise."""
        return [
            centre + random.uniform(stat_rand_min, stat_rand_max)
            for centre in (t_ini, h_ini, a_ini)
        ]

    @staticmethod
    def _band_reward(index, value):
        """Return the reward of state component ``index`` for ``value``.

        The three [lo, hi] bands of ``d3[index]`` are tested in order; the
        first one containing ``value`` (bounds inclusive) decides the reward.
        """
        lo1, hi1, lo2, hi2, lo3, hi3 = d3[index]
        if lo1 <= value <= hi1:
            return r3
        if lo2 <= value <= hi2:
            return r2
        if lo3 <= value <= hi3:
            return r1
        return nr3

    def _observation(self):
        """Return the current state as an array matching ``observation_space``."""
        return np.array(self.state, dtype=np.float32)

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Args:
            action: sequence of 0/1 flags, one per actuator.

        Returns:
            Tuple ``(observation, reward, terminated, info)`` where
            ``observation`` is a float32 array, ``reward`` is the sum of the
            per-component band rewards, ``terminated`` becomes True once the
            step budget is used up and ``info`` is an empty dict.
        """
        # 0 (off) => -1 and 1 (on) => +1
        signed = [1 if flag == 1 else -1 for flag in action]

        # zip stops at the shorter sequence, so const_weight_vec must hold one
        # weight per actuator or the trailing actuators are silently dropped.
        weighted = [s * w for s, w in zip(signed, const_weight_vec)]

        # Pair the actuators from the outside in (first minus last, second
        # minus second-to-last, ...); the middle one acts alone.
        middle = len(weighted) // 2
        deltas = [weighted[i] - weighted[-1 - i] for i in range(middle)]
        deltas.append(weighted[middle])

        self.state = [delta + value for delta, value in zip(deltas, self.state)]

        # One step of the episode budget has been consumed.
        self.equilibrium_cycles_len -= 1

        reward = sum(
            self._band_reward(index, value)
            for index, value in enumerate(self.state)
        )

        terminated = self.equilibrium_cycles_len <= 0
        info = {}
        return self._observation(), reward, terminated, info

    def render(self, mode="human"):
        """No-op; ``mode`` is accepted so wrappers that pass it do not fail."""
        pass

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.state = self._initial_state()
        self.equilibrium_cycles_len = equilibrium_cycles
        return self._observation()

In [91]:
# Instantiate the environment and show one random draw from each space
# together with the freshly initialised state.
env = TriaEnv()

observation_sample = env.observation_space.sample()
print("1. Sample observation space: {}".format(observation_sample))

action_sample = env.action_space.sample()
print("2. Sample action space     : {}".format(action_sample))

print("3. Sample state            : {}".format(env.state))

1. Sample observation space: [  73.16288    18.669289 3638.421   ]
2. Sample action space     : [0 0 0 1 0]
3. Sample state            : [70.73190467409455, 40.524241878246094, 9.37801770566722]


In [92]:
# Baseline: play a few episodes with uniformly random actions and report the
# total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    while True:
        action = env.action_space.sample()
        next_state, reward, terminated, info = env.step(action)
        score += reward
        if terminated:
            break
    print('Episode: {} Score: {}'.format(episode, score))
env.close()

Episode: 1 Score: 308.5
Episode: 2 Score: 319.0
Episode: 3 Score: 150.75
Episode: 4 Score: 340.0
Episode: 5 Score: 348.5


In [93]:
from stable_baselines3 import A2C

# TensorBoard logs are written below this directory.
log_path = os.path.join('train', 'log')
print(log_path)

# Build the vectorised environment from the TriaEnv class itself. The previous
# `DummyVecEnv([lambda: env])` read the very name it was rebinding, so
# re-running this cell wrapped a DummyVecEnv inside another DummyVecEnv, and
# it reused the instance on which env.close() had already been called.
env = DummyVecEnv([TriaEnv])
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log=log_path)



train/log
Using cpu device


In [94]:
# Train with the budget declared in the configuration cell instead of a
# duplicated literal.
# NOTE: the run recorded in this notebook failed here with a protobuf
# TypeError; the message itself suggests downgrading protobuf to 3.20.x or
# lower as a workaround.
model.learn(total_timesteps=a2c_model_timesteps, log_interval=100)

TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates