In [3]:
import numpy as np
import matplotlib.pyplot as plt
import or_gym
import os

from common import make_env

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor, SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.env_util import make_vec_env

from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy as SACPolicy

from stable_baselines3 import A2C
from stable_baselines3.a2c.policies import MlpPolicy as A2CPolicy

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy as PPOPolicy

from sb3_contrib import ARS
from sb3_contrib.ars.policies import ARSPolicy

from sb3_contrib import RecurrentPPO
from sb3_contrib.ppo_recurrent.policies import RecurrentActorCriticPolicy

from sb3_contrib import TQC
from sb3_contrib.tqc.policies import MlpPolicy as TQCPolicy

from sb3_contrib import TRPO
from sb3_contrib.trpo.policies import MlpPolicy as TRPOPolicy

In [4]:
def train_model_on_env(env_name, algo_name, name, n_envs=1, timesteps=int(1e5), eval_freq=int(5e3), env_seed=0):
    """Train one RL algorithm on one environment, with periodic evaluation.

    Monitor logs, evaluation logs and the best model found by the evaluation
    callback are all written under ``./data/{env_name}/{algo_name}/{name}/``.

    Parameters
    ----------
    env_name : str
        Environment identifier, passed verbatim to ``make_env``.
    algo_name : str
        One of 'PPO', 'RecurrentPPO', 'A2C', 'ARS', 'SAC', 'TQC', 'TRPO'.
    name : str
        Run name; last component of the output directory.
    n_envs : int
        Number of training environments. ``1`` trains on a single
        ``Monitor``-wrapped env; larger values use ``SubprocVecEnv`` +
        ``VecMonitor``.
    timesteps : int or float
        Total training timesteps (cast to ``int``).
    eval_freq : int or float
        Approximate number of training timesteps between evaluations. The
        callback counts ``env.step()`` calls, so the value is divided by
        ``n_envs`` before being handed to ``EvalCallback``.
    env_seed : int
        Seed passed to the algorithm constructor (``seed=``), which the
        algorithm uses to seed itself and the training environment.

    Raises
    ------
    ValueError
        If ``algo_name`` is not supported or ``n_envs`` is smaller than 1.
    """
    # Lazy builders: nothing is constructed until the chosen entry is called,
    # and the dict doubles as the list of supported algorithm names.
    builders = {
        'PPO': lambda env, n_steps, batch_size: PPO(
            PPOPolicy, env, n_steps=n_steps, batch_size=batch_size, seed=env_seed),
        'RecurrentPPO': lambda env, n_steps, batch_size: RecurrentPPO(
            RecurrentActorCriticPolicy, env, n_steps=n_steps, batch_size=batch_size, seed=env_seed),
        'A2C': lambda env, n_steps, batch_size: A2C(
            A2CPolicy, env, n_steps=n_steps, seed=env_seed),
        # ARS has no `batch_size` argument; passing one raises TypeError.
        'ARS': lambda env, n_steps, batch_size: ARS(
            ARSPolicy, env, n_eval_episodes=1, seed=env_seed),
        'SAC': lambda env, n_steps, batch_size: SAC(
            SACPolicy, env, batch_size=batch_size, seed=env_seed),
        'TQC': lambda env, n_steps, batch_size: TQC(
            TQCPolicy, env, batch_size=batch_size, seed=env_seed),
        'TRPO': lambda env, n_steps, batch_size: TRPO(
            TRPOPolicy, env, n_steps=n_steps, batch_size=batch_size, seed=env_seed),
    }

    # Validate before touching the filesystem or spawning any environment.
    if algo_name not in builders:
        raise ValueError(
            f'Unknown algo_name {algo_name!r}; expected one of {sorted(builders)}')
    if n_envs < 1:
        raise ValueError(f'n_envs must be >= 1, got {n_envs!r}')

    save_path = f'./data/{env_name}/{algo_name}/{name}/'

    if os.path.exists(save_path):
        print(f'Using existing directory {save_path}')
    else:
        print(f'Creating new directory {save_path}')
        # Actually create it, so the monitor/eval files always land inside it.
        os.makedirs(save_path, exist_ok=True)

    def make_subproc_env():
        def _init():
            # SubprocVecEnv needs the factory to *return* the environment.
            return make_env(env_name)
        return _init

    env = None
    eval_env = None
    try:
        if n_envs == 1:
            env = make_env(env_name)
            num_periods = env.num_periods
            env = Monitor(env, save_path)
        else:
            env = SubprocVecEnv([make_subproc_env() for _ in range(n_envs)])
            # The attribute lives in the worker processes, so fetch it remotely.
            num_periods = env.get_attr('num_periods')[0]
            env = VecMonitor(env, save_path)

        # Evaluate on a separate env so evaluation episodes neither reset the
        # training env mid-rollout nor end up in the training monitor log.
        eval_env = Monitor(make_env(env_name))

        model = builders[algo_name](env, num_periods, num_periods * n_envs)

        eval_callback = EvalCallback(
            eval_env,
            best_model_save_path=save_path,
            log_path=save_path,
            # One callback step equals n_envs timesteps.
            eval_freq=max(int(eval_freq) // n_envs, 1),
            deterministic=True,
            render=False,
            verbose=1,
        )

        model.learn(total_timesteps=int(timesteps), callback=eval_callback)
    finally:
        # Release monitor file handles and any subprocess workers.
        for created_env in (eval_env, env):
            if created_env is not None:
                created_env.close()
    

In [5]:
# TRPO run named 'default': single environment, 1e6 timesteps, eval_freq of 5e3.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='TRPO',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)

Using existing directory ./data/NetworkManagement-v1-100/TRPO/default/
Eval num_timesteps=5000, episode_reward=-1466.54 +/- 2.43
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-1407.51 +/- 3.49
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=15000, episode_reward=-1380.62 +/- 2.02
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1365.15 +/- 4.43
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=25000, episode_reward=-1367.34 +/- 1.92
Episode length: 100.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-793.46 +/- 3.20
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=35000, episode_reward=-188.12 +/- 5.60
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=40000, episode_reward=-454.58 +/- 3.13
Episode length: 100.00 +/- 0.00
Eval num_timesteps=45000, episode_reward=-173.95 +/- 2.23
Episode length:

In [6]:
# PPO run named 'default': single environment, 1e6 timesteps, eval_freq of 5e3.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='PPO',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)

Using existing directory ./data/NetworkManagement-v1-100/PPO/default/
Eval num_timesteps=5000, episode_reward=-1523.62 +/- 4.87
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=10000, episode_reward=-1521.26 +/- 3.73
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=15000, episode_reward=-1292.16 +/- 4.78
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=20000, episode_reward=-1415.39 +/- 4.45
Episode length: 100.00 +/- 0.00
Eval num_timesteps=25000, episode_reward=-1408.21 +/- 4.04
Episode length: 100.00 +/- 0.00
Eval num_timesteps=30000, episode_reward=-1484.25 +/- 5.05
Episode length: 100.00 +/- 0.00
Eval num_timesteps=35000, episode_reward=-1401.90 +/- 3.63
Episode length: 100.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=-1096.19 +/- 2.45
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=45000, episode_reward=-1134.17 +/- 6.34
Episode length: 100.00 +/- 0.00
Eval num_timesteps=50000

In [None]:
# RecurrentPPO run named 'default': single environment, 1e6 timesteps, eval_freq of 5e3.
train_model_on_env(
    env_name='NetworkManagement-v1-100',
    algo_name='RecurrentPPO',
    name='default',
    n_envs=1,
    timesteps=1e6,
    eval_freq=5e3,
)