In [1]:
!pip install tensorflow
!pip install git+https://github.com/carlosluis/stable-baselines3@fix_tests 

Collecting git+https://github.com/carlosluis/stable-baselines3@fix_tests
  Cloning https://github.com/carlosluis/stable-baselines3 (to revision fix_tests) to /tmp/pip-req-build-sl_bl82e
  Running command git clone --filter=blob:none --quiet https://github.com/carlosluis/stable-baselines3 /tmp/pip-req-build-sl_bl82e
  Running command git checkout -b fix_tests --track origin/fix_tests
  Switched to a new branch 'fix_tests'
  Branch 'fix_tests' set up to track remote branch 'fix_tests' from 'origin'.
  Resolved https://github.com/carlosluis/stable-baselines3 to commit d5c79b0c3bd6f0f69d2673d67407bbc819d73f67
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
import os
import numpy as np
import random

import gym
from gym import Env
from gym.spaces import Discrete, Box, MultiDiscrete

from stable_baselines3 import PPO
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
%load_ext tensorboard

In [3]:
env_name = 'tria-3d-rl-model-'

t_ini= 70.0; h_ini= 40.0; a_ini= 10.0 

t_min =-40.0; t_max=110; h_min=0.0; h_max=100.0; a_min=0.0; a_max=5000.0

act_state = 2

stat_rand_min = -1.0; stat_rand_max = 1.0

equilibrium_cycles= 60

r1 = -0.25; r2 = -0.5; r3 = 2; nr3 = -2

const_weight_vec  = [1, 1, 1, 1]

d3 = {
     0 : [65.0, 80.0, 50.0, 85.0, 40.0, 90.0], 
     1 : [30.0, 50.0, 20.0, 60.0, 10.0, 70.0], 
     2 : [0.0, 19.0, 200.0, 599.0, 600.0, 2000.0]
    }

d1 = {0: [65.0, 80.0], 1: [30.0, 50.0], 2: [0.0, 20.0]}

ppo_model_timesteps= 20000; neural_model_timesteps=20000; a2c_model_timesteps=20000

ppo_model = env_name + 'ppo'; neural_model = env_name + 'ppo-neural'; a2c_model = env_name + 'a2c'

In [4]:
class TriaEnv(Env):
    
    def __init__(self):
        self.action_space = MultiDiscrete(np.array([act_state,act_state,act_state,act_state,act_state]))
        
        self.observation_space = Box(low=np.array([t_min, h_min, a_min]), high=np.array([t_max, h_max, a_max]), dtype=np.float32)
        
        self.state = [t_ini + random.uniform(stat_rand_min, stat_rand_max), h_ini + random.uniform(stat_rand_min, stat_rand_max), a_ini + random.uniform(stat_rand_min, stat_rand_max)]
        
        #print('^^^', self.state, self.action_space)
        
        self.equilibrium_cycles_len = equilibrium_cycles
        
    def step(self, action):
        
        ap_scaled = [1 if e == 1 else -1 for e in action]  # 0 (off) => -1 and 1 (on) => 1
        
        actionPrime = [a * b for a, b in zip(ap_scaled, const_weight_vec)] 
        
        actionAlgo = [actionPrime[a] - actionPrime[len(actionPrime) -a -1] for a in range(len(actionPrime) // 2)]
        
        actionAlgo.append(actionPrime[len(actionPrime) // 2])                                                              
        
        #print('***',actionAlgo, self.state)
        
        self.state = [a + b for a, b in zip(actionAlgo, self.state)]
        
        #print('&&&', actionAlgo, self.state)
        
        #reduce tria simulation length by 1 second
        self.equilibrium_cycles_len -= 1
        
        reward = [r3 if e >= d3[i][0] and e<= d3[i][1] else r2 if e >= d3[i][2] and e<= d3[i][3] else r1 if e >= d3[i][4] and e <= d3[i][5] else nr3 for i, e in enumerate(self.state)]
        #reward = [r3 if e >= d1[i][0] and e <= d1[i][1] else nr3  for i, e in enumerate(self.state)]

        reward = sum(reward)
        #print('$$$', reward)
            
        if self.equilibrium_cycles_len <= 0:
            terminated = True
        else:
            terminated = False
            
        info = {}
        #print('reward:{} state:{}'.format(reward, self.state))
        return self.state, reward, terminated, False, info
    
    def render(self):
        pass
    
    def reset(self):
        
        self.state =[t_ini + random.uniform(stat_rand_min, stat_rand_max),h_ini + random.uniform(stat_rand_min, stat_rand_max),a_ini + random.uniform(stat_rand_min, stat_rand_max)]
        #print('@@@', self.state)
        self.equilibrium_cycles_len = equilibrium_cycles
        
        return self.state, {}
        

In [5]:
env= TriaEnv()

print("1. Sample observation space: {}".format(env.observation_space.sample()))
print("2. Sample action space     : {}".format(env.action_space.sample()))
print("3. Sample state            : {}".format(env.state))

1. Sample observation space: [ 106.23854   66.10783 3743.8933 ]
2. Sample action space     : [1 0 0 1 0]
3. Sample state            : [69.12956374793603, 40.447094689361286, 10.656903523012252]


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [6]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    #print(state)
    terminated = False
    score = 0 #[0,0,0] 
    
    while not terminated:
        #env.render()
        action = env.action_space.sample()
        #print(action, terminated , reward)
        #print(env.step(action))
        next_state, reward, terminated, truncated, info = env.step(action) 
        score += reward #[a + b for a, b in zip(reward, score)]
    print('Episode: {} Score: {}'.format(episode, score))
env.close()

Episode: 1 Score: 299.0
Episode: 2 Score: 196.25
Episode: 3 Score: 256.0
Episode: 4 Score: 119.75
Episode: 5 Score: 360


In [7]:
log_path = os.path.join('training','logs')
log_path

'training/logs'

In [8]:
env = DummyVecEnv([lambda: env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [9]:
model.learn(total_timesteps=ppo_model_timesteps)

2023-03-14 19:47:52.843357: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-14 19:48:04.079395: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-14 19:48:04.080398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


Logging to training/logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 49   |
|    iterations      | 1    |
|    time_elapsed    | 41   |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 2          |
|    time_elapsed         | 70         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00990727 |
|    clip_fraction        | 0.0976     |
|    clip_range           | 0.2        |
|    entropy_loss         | -3.46      |
|    explained_variance   | -0.00135   |
|    learning_rate        | 0.0003     |
|    loss                 | 1.37e+03   |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0111    |
|    value_loss           | 3.12e+03   |
----------------------------------------
---------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f8630e04ee0>

In [210]:
tria_model_path = os.path.join('training','save', ppo_model)

In [211]:
model.save(tria_model_path)

In [212]:
del model

In [213]:
model = PPO.load(tria_model_path, env=env)

In [214]:
evaluate_policy(model, env, n_eval_episodes=20, render=True)

(360.0, 0.0)

In [183]:
env.close()

In [184]:
episodes=10
for episode in range(1, episodes+1):
    observation = env.reset()
    terminated = False
    score = 0
    while not terminated:
        env.render()
        action, _ = model.predict(observation)
        observation, reward, terminated , info = env.step(action)
        score += reward
    print('Episone:{} Score:{}'.format( episode, score))
env.close()  

Episone:1 Score:[282.75]
Episone:2 Score:[285.]
Episone:3 Score:[360.]
Episone:4 Score:[342.5]
Episone:5 Score:[262.5]
Episone:6 Score:[357.5]
Episone:7 Score:[332.5]
Episone:8 Score:[259.5]
Episone:9 Score:[342.5]
Episone:10 Score:[322.5]


# custom neural network injected in PPO model based on tria 3D environment

In [185]:
net_arch = dict(pi=[128,128,128,128], vf=[128,128,128,128])

In [186]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


In [187]:
model.learn(total_timesteps=neural_model_timesteps)

Logging to training/logs/PPO_5
-----------------------------
| time/              |      |
|    fps             | 92   |
|    iterations      | 1    |
|    time_elapsed    | 22   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 2           |
|    time_elapsed         | 56          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010003677 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.46       |
|    explained_variance   | -0.000324   |
|    learning_rate        | 0.0003      |
|    loss                 | 885         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0101     |
|    value_loss           | 2.43e+03    |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x7efbf0530d00>

In [188]:
tria_model_path_neural = os.path.join('training','save',neural_model)
model.save(tria_model_path_neural)

In [189]:
del model

In [190]:
model = PPO.load(tria_model_path_neural, env=env)

In [191]:
evaluate_policy(model, env, n_eval_episodes=20, render=True)

(79.5875, 8.751883725804404)

In [192]:
env.close()

# create A2C network based learing with Tria 3D environment

In [193]:
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import SubprocVecEnv

In [194]:
model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [195]:
model.learn(total_timesteps=a2c_model_timesteps)

Logging to training/logs/A2C_2
------------------------------------
| time/                 |          |
|    fps                | 72       |
|    iterations         | 100      |
|    time_elapsed       | 6        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -3.39    |
|    explained_variance | 3.45e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 60.2     |
|    value_loss         | 371      |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 76        |
|    iterations         | 200       |
|    time_elapsed       | 13        |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -3.21     |
|    explained_variance | -0.000909 |
|    learning_rate      | 0.0007    |
|    n_updates          | 199       |
|    policy_loss        | 15.4   

<stable_baselines3.a2c.a2c.A2C at 0x7efbf0458970>

In [196]:
tria_model_path_a2c = os.path.join('training','save',a2c_model)
model.save(tria_model_path_a2c)

In [197]:
del model

In [198]:
model = A2C.load(tria_model_path_a2c, env=env)

In [199]:
evaluate_policy(model, env, n_eval_episodes=20, render=True)

(360.0, 0.0)

In [200]:
env.close()

In [201]:
!tensorboard --logdir './training/logs/' --bind_all  # training_log_path

2023-03-07 20:51:36.652546: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-07 20:51:37.101809: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-07 20:51:37.101989: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-07 20:51:40.960059: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-