In [2]:
import numpy as np
import matplotlib.pyplot as plt
import or_gym
import os

from common import make_env

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor, SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.env_util import make_vec_env

from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy as SACPolicy

from stable_baselines3 import A2C
from stable_baselines3.a2c.policies import MlpPolicy as A2CPolicy

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy as PPOPolicy

from sb3_contrib import ARS
from sb3_contrib.ars.policies import ARSPolicy

from sb3_contrib import RecurrentPPO
from sb3_contrib.ppo_recurrent.policies import RecurrentActorCriticPolicy

from sb3_contrib import TQC
from sb3_contrib.tqc.policies import MlpPolicy as TQCPolicy

from sb3_contrib import TRPO
from sb3_contrib.trpo.policies import MlpPolicy as TRPOPolicy

In [14]:
def train_model_on_env(env_name, algo_name, name, n_envs=1, timesteps=int(1e5), eval_freq=int(5e3), env_seed=0, load_existing=True):
    """Train (or resume training) an RL agent and checkpoint its best model.

    Monitor logs, evaluation logs and ``best_model.zip`` are all written under
    ``./data/{env_name}/{algo_name}/{name}/``.

    Parameters
    ----------
    env_name : str
        Environment identifier forwarded to ``make_env``.
    algo_name : str
        One of 'PPO', 'RecurrentPPO', 'A2C', 'ARS', 'SAC', 'TQC', 'TRPO'.
    name : str
        Run label; the last component of the save directory.
    n_envs : int
        Number of training environments. With 1, a single monitored
        environment is used; with more, a ``SubprocVecEnv`` whose workers
        each log to ``<save dir>/<rank>.monitor.csv``.
    timesteps : int or float
        Total number of training timesteps (cast to ``int``).
    eval_freq : int or float
        Approximate number of training timesteps between evaluations.
    env_seed : int
        Accepted for interface compatibility; not used by this function.
    load_existing : bool
        If truthy, monitor logs are appended to and training resumes from
        ``best_model.zip`` when that file exists. Otherwise logs are
        overwritten and a fresh model is created.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``algo_name`` is not supported or ``n_envs`` is smaller than 1.
    """
    # Constructor keyword arguments passed to each supported algorithm.
    ctor_kwargs_by_algo = {
        'PPO': ('n_steps', 'batch_size'),
        'RecurrentPPO': ('n_steps', 'batch_size'),
        'A2C': ('n_steps',),
        'ARS': (),
        'SAC': ('batch_size',),
        'TQC': ('batch_size',),
        'TRPO': ('n_steps', 'batch_size'),
    }

    # Validate before touching the filesystem or building environments, so a
    # typo fails immediately instead of surfacing later as a None model.
    if algo_name not in ctor_kwargs_by_algo:
        raise ValueError(
            f'Unknown algo_name {algo_name!r}; expected one of {sorted(ctor_kwargs_by_algo)}'
        )
    if n_envs < 1:
        raise ValueError(f'n_envs must be >= 1, got {n_envs!r}')

    save_path = f'./data/{env_name}/{algo_name}/{name}/'

    if os.path.exists(save_path):
        print(f'Using existing directory {save_path}')
    else:
        print(f'Creating new directory {save_path}')
        # Actually create it, so the message above is true.
        os.makedirs(save_path, exist_ok=True)

    # Appending to the logs and resuming the model are driven by the same flag.
    override_logs = not load_existing

    def make_monitored_env(rank):
        """Return a factory that builds one monitored env for a subprocess worker."""
        def _init():
            # The factory must *return* the environment it builds.
            return Monitor(make_env(env_name), os.path.join(save_path, str(rank)),
                           override_existing=override_logs)
        return _init

    if n_envs == 1:
        base_env = make_env(env_name)
        num_periods = base_env.num_periods
        env = Monitor(base_env, save_path, override_existing=override_logs)
    else:
        # Each worker wraps its own env in Monitor, which accepts
        # ``override_existing``; VecMonitor's documented signature is
        # (venv, filename, info_keywords) only.
        env = SubprocVecEnv([make_monitored_env(rank) for rank in range(n_envs)])
        # The envs live in worker processes, so query the attribute remotely.
        num_periods = env.get_attr('num_periods')[0]

    def make_model(algo_name, env, n_steps, batch_size):
        """Load ``best_model.zip`` if resuming, otherwise build a new model."""
        algo_cls, policy_cls = {
            'PPO': (PPO, PPOPolicy),
            'RecurrentPPO': (RecurrentPPO, RecurrentActorCriticPolicy),
            'A2C': (A2C, A2CPolicy),
            'ARS': (ARS, ARSPolicy),
            'SAC': (SAC, SACPolicy),
            'TQC': (TQC, TQCPolicy),
            'TRPO': (TRPO, TRPOPolicy),
        }[algo_name]

        model_path = save_path + 'best_model.zip'
        if load_existing and os.path.isfile(model_path):
            print('Loading existing model...')
            return algo_cls.load(model_path, env)

        available = {'n_steps': n_steps, 'batch_size': batch_size}
        ctor_kwargs = {key: available[key] for key in ctor_kwargs_by_algo[algo_name]}
        return algo_cls(policy_cls, env, **ctor_kwargs)

    eval_env = None
    try:
        model = make_model(algo_name, env, n_steps=num_periods, batch_size=num_periods * n_envs)

        # Evaluate on a separate environment: evaluating on the training env
        # resets it in the middle of data collection and records evaluation
        # episodes in the training monitor log.
        eval_env = Monitor(make_env(env_name))

        # The callback counts calls, and each call advances n_envs timesteps.
        calls_between_evals = max(int(eval_freq) // n_envs, 1)
        eval_callback = EvalCallback(eval_env, best_model_save_path=save_path, verbose=1, log_path=save_path,
                                     eval_freq=calls_between_evals, deterministic=True, render=False)

        model.learn(total_timesteps=int(timesteps), callback=eval_callback)
    finally:
        # Release log files and worker processes, even on error or interrupt.
        env.close()
        if eval_env is not None:
            eval_env.close()

# PPO on NetworkManagement-v1-100: one env, 1e6 timesteps, evaluation every 5e3 steps.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='PPO',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)
    

Using existing directory ./data/NetworkManagement-v1-100/PPO/default/
Loading existing model...
Eval num_timesteps=5000, episode_reward=951.73 +/- 7.86
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=1175.71 +/- 4.38
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=15000, episode_reward=937.57 +/- 6.35
Episode length: 100.00 +/- 0.00
Eval num_timesteps=20000, episode_reward=1004.18 +/- 4.18
Episode length: 100.00 +/- 0.00
Eval num_timesteps=25000, episode_reward=905.00 +/- 4.48
Episode length: 100.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=981.10 +/- 5.31
Episode length: 100.00 +/- 0.00
Eval num_timesteps=35000, episode_reward=844.88 +/- 3.49
Episode length: 100.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=737.42 +/- 1.82
Episode length: 100.00 +/- 0.00


In [5]:
# TRPO on NetworkManagement-v1-100: one env, 1e6 timesteps, evaluation every 5e3 steps.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='TRPO',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)

Using existing directory ./data/NetworkManagement-v1-100/TRPO/default/
Eval num_timesteps=5000, episode_reward=-1466.54 +/- 2.43
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-1407.51 +/- 3.49
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=15000, episode_reward=-1380.62 +/- 2.02
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1365.15 +/- 4.43
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=25000, episode_reward=-1367.34 +/- 1.92
Episode length: 100.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-793.46 +/- 3.20
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=35000, episode_reward=-188.12 +/- 5.60
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=40000, episode_reward=-454.58 +/- 3.13
Episode length: 100.00 +/- 0.00
Eval num_timesteps=45000, episode_reward=-173.95 +/- 2.23
Episode length:

In [6]:
# PPO on NetworkManagement-v1-100: one env, 1e6 timesteps, evaluation every 5e3 steps.
# NOTE(review): same arguments as the PPO call in the cell that defines train_model_on_env.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='PPO',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)

Using existing directory ./data/NetworkManagement-v1-100/PPO/default/
Eval num_timesteps=5000, episode_reward=-1523.62 +/- 4.87
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-1521.26 +/- 3.73
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=15000, episode_reward=-1292.16 +/- 4.78
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1415.39 +/- 4.45
Episode length: 100.00 +/- 0.00
Eval num_timesteps=25000, episode_reward=-1408.21 +/- 4.04
Episode length: 100.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-1484.25 +/- 5.05
Episode length: 100.00 +/- 0.00
Eval num_timesteps=35000, episode_reward=-1401.90 +/- 3.63
Episode length: 100.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=-1096.19 +/- 2.45
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=45000, episode_reward=-1134.17 +/- 6.34
Episode length: 100.00 +/- 0.00
Eval num_timesteps=50000

In [7]:
# RecurrentPPO on NetworkManagement-v1-100: one env, 1e6 timesteps, evaluation every 5e3 steps.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='RecurrentPPO',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)

Using existing directory ./data/NetworkManagement-v1-100/RecurrentPPO/default/
Eval num_timesteps=5000, episode_reward=-1634.30 +/- 6.11
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-1652.31 +/- 5.33
Episode length: 100.00 +/- 0.00
Eval num_timesteps=15000, episode_reward=-1736.78 +/- 2.57
Episode length: 100.00 +/- 0.00
Eval num_timesteps=20000, episode_reward=-1803.64 +/- 4.66
Episode length: 100.00 +/- 0.00
Eval num_timesteps=25000, episode_reward=-2574.77 +/- 4.52
Episode length: 100.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-3241.53 +/- 3.57
Episode length: 100.00 +/- 0.00
Eval num_timesteps=35000, episode_reward=-2373.76 +/- 3.23
Episode length: 100.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=-1267.57 +/- 6.99
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=45000, episode_reward=-1096.65 +/- 1.86
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=50000, episode_rew

In [10]:
# ARS on NetworkManagement-v1-100: one env, 1e6 timesteps, evaluation every 5e3 steps.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='ARS',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)

Using existing directory ./data/NetworkManagement-v1-100/ARS/default/
Eval num_timesteps=5000, episode_reward=-655725.86 +/- 58.08
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-6479.00 +/- 134.78
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=15000, episode_reward=-2277.81 +/- 158.38
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-242.43 +/- 135.90
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=25000, episode_reward=-2855.89 +/- 150.43
Episode length: 100.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=1738.05 +/- 118.48
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=35000, episode_reward=1512.83 +/- 106.96
Episode length: 100.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=2294.30 +/- 80.13
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=45000, episode_reward=2203.68 +/- 83.41


In [5]:
# A2C on NetworkManagement-v1-100: one env, 2.5e6 timesteps, evaluation every 5e3 steps.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='A2C',
    name='default',
    n_envs=1,
    timesteps=2.5e6,
    eval_freq=5e3,
)

Using existing directory ./data/NetworkManagement-v1-100/A2C/default/
Eval num_timesteps=5000, episode_reward=-1521.55 +/- 3.97
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-1673.25 +/- 2.63
Episode length: 100.00 +/- 0.00
Eval num_timesteps=15000, episode_reward=-1445.37 +/- 5.34
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1506.61 +/- 4.80
Episode length: 100.00 +/- 0.00
Eval num_timesteps=25000, episode_reward=-929.78 +/- 1.92
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=30000, episode_reward=-1449.77 +/- 5.06
Episode length: 100.00 +/- 0.00
Eval num_timesteps=35000, episode_reward=-1481.48 +/- 1.94
Episode length: 100.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=-1423.92 +/- 3.72
Episode length: 100.00 +/- 0.00
Eval num_timesteps=45000, episode_reward=-1193.68 +/- 1.36
Episode length: 100.00 +/- 0.00
Eval num_timesteps=50000, episode_reward=-1247.

In [4]:
# SAC on NetworkManagement-v1-100: one env, 1e6 timesteps, evaluation every 5e3 steps.
# NOTE(review): the output recorded for this run ends in a ValueError reporting NaN
# values in the `loc` parameter of a Normal distribution.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='SAC',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)

Creating new directory ./data/NetworkManagement-v1-100/SAC/default/
Eval num_timesteps=5000, episode_reward=-658057.47 +/- 69.87
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-658020.77 +/- 118.59
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=15000, episode_reward=-657981.66 +/- 90.86
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-658044.68 +/- 148.09
Episode length: 100.00 +/- 0.00
Eval num_timesteps=25000, episode_reward=-49331.48 +/- 220.61
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=30000, episode_reward=-12364.71 +/- 4478.47
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=35000, episode_reward=-1371533.47 +/- 3.39
Episode length: 100.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=-1371538.01 +/- 2.86
Episode length: 100.00 +/- 0.00
Eval num_timesteps=45000, episode_reward=-1371535.99 +/- 2.24
Episod

ValueError: Expected parameter loc (Tensor of shape (1, 11)) of distribution Normal(loc: torch.Size([1, 11]), scale: torch.Size([1, 11])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]])