In [14]:
import numpy as np
import random
import matplotlib.pyplot as plt
import or_gym
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from collections import namedtuple, deque
from itertools import count
from or_gym.utils import create_env
from gym.spaces import Box

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.env_util import make_vec_env

from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy as SACPolicy

from stable_baselines3 import A2C
from stable_baselines3.a2c.policies import MlpPolicy as A2CPolicy

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy as PPOPolicy

from sb3_contrib import ARS
from sb3_contrib.ars.policies import ARSPolicy

from sb3_contrib import RecurrentPPO
from sb3_contrib.ppo_recurrent.policies import RecurrentActorCriticPolicy

from sb3_contrib import TQC
from sb3_contrib.tqc.policies import MlpPolicy as TQCPolicy

from sb3_contrib import TRPO
from sb3_contrib.trpo.policies import MlpPolicy as TRPOPolicy

In [15]:
def create_env(env_name, n_envs, monitor_path, env_seed):
    if n_envs == 1:
        env = or_gym.make(env_name, seed_int=env_seed)
        monitored_env = Monitor(env, monitor_path)
        return monitored_env
    else:
        def make_env():    
            def _init():
                return or_gym.make(env_name)
            return _init
    
        vec_env = make_vec_env(make_env(), n_envs=n_envs, seed=env_seed, vec_env_cls=DummyVecEnv)
        #vec_env = make_vec_env(env, n_envs, env_seed, vec_env_cls=DummyVecEnv)
        monitored_vec_env = VecMonitor(vec_env, monitor_path)
        return monitored_vec_env


def train_model_on_env(env_name, algorithm, policy, name='default', n_envs=4, env_seed=42, timesteps=100000, eval_freq=1000):
    env_model_path = f'./data/{env_name}/{algorithm.__name__}/{name}/'

    if not os.path.exists(env_model_path):
        os.makedirs(env_model_path)

    env = create_env(env_name, n_envs, env_model_path, env_seed)
    model = algorithm(policy, env)

    eval_callback = EvalCallback(env, best_model_save_path=env_model_path, verbose=1,
                                log_path=env_model_path, eval_freq=eval_freq, deterministic=True, render=False)
    
    model.learn(total_timesteps=timesteps, callback=eval_callback)


In [17]:
train_model_on_env('InvManagement-v1', PPO, PPOPolicy, timesteps=3000000, eval_freq=10000)

Eval num_timesteps=40000, episode_reward=-268.81 +/- 5.45
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=-272.40 +/- 5.00
Episode length: 30.00 +/- 0.00
Eval num_timesteps=120000, episode_reward=-269.01 +/- 3.43
Episode length: 30.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=-270.07 +/- 2.89
Episode length: 30.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=-258.28 +/- 2.37
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=240000, episode_reward=-255.51 +/- 6.79
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=280000, episode_reward=-241.10 +/- 2.81
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=320000, episode_reward=-237.15 +/- 3.63
Episode length: 30.00 +/- 0.00
New best mean reward!


In [None]:
train_model_on_env('InvManagement-v1', A2C, A2CPolicy, timesteps=2500000, eval_freq=10000)



Eval num_timesteps=40000, episode_reward=20.66 +/- 2.67
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=183.58 +/- 9.81
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=120000, episode_reward=178.69 +/- 3.31
Episode length: 30.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=151.16 +/- 7.18
Episode length: 30.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=199.47 +/- 5.65
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=240000, episode_reward=192.21 +/- 7.16
Episode length: 30.00 +/- 0.00
Eval num_timesteps=280000, episode_reward=266.38 +/- 7.81
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=320000, episode_reward=275.29 +/- 16.92
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=360000, episode_reward=423.81 +/- 13.63
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=400000, episode_reward=408.19 +/- 9.62
Episode le

In [None]:
train_model_on_env('InvManagement-v1', SAC, SACPolicy, timesteps=2500000, eval_freq=10000)

Eval num_timesteps=40000, episode_reward=196.95 +/- 28.86
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=210.65 +/- 22.71
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=120000, episode_reward=181.60 +/- 36.90
Episode length: 30.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=103.82 +/- 24.91
Episode length: 30.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=66.20 +/- 33.62
Episode length: 30.00 +/- 0.00
Eval num_timesteps=240000, episode_reward=54.12 +/- 12.53
Episode length: 30.00 +/- 0.00
Eval num_timesteps=280000, episode_reward=151.05 +/- 32.77
Episode length: 30.00 +/- 0.00
Eval num_timesteps=320000, episode_reward=217.17 +/- 42.04
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=360000, episode_reward=286.09 +/- 16.69
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=400000, episode_reward=249.35 +/- 49.46
Episode length: 30.00 +/- 0.00
Eval num_timeste

In [None]:
train_model_on_env('InvManagement-v1', ARS, ARSPolicy, timesteps=2500000, eval_freq=10000)

Eval num_timesteps=40000, episode_reward=287.97 +/- 31.77
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=301.76 +/- 37.41
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=120000, episode_reward=364.25 +/- 32.09
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=160000, episode_reward=361.79 +/- 19.53
Episode length: 30.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=393.80 +/- 33.84
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=240000, episode_reward=378.23 +/- 24.59
Episode length: 30.00 +/- 0.00
Eval num_timesteps=280000, episode_reward=408.09 +/- 23.39
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=320000, episode_reward=396.41 +/- 50.65
Episode length: 30.00 +/- 0.00
Eval num_timesteps=360000, episode_reward=398.58 +/- 23.42
Episode length: 30.00 +/- 0.00
Eval num_timesteps=400000, episode_reward=374.94 +/- 18.02
Episode length: 30.00 +

In [None]:
train_model_on_env('InvManagement-v1', TQC, TQCPolicy, timesteps=80000, eval_freq=1000)

Eval num_timesteps=4000, episode_reward=184.73 +/- 28.94
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=8000, episode_reward=282.95 +/- 27.99
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=12000, episode_reward=322.64 +/- 24.61
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=16000, episode_reward=284.00 +/- 13.63
Episode length: 30.00 +/- 0.00
Eval num_timesteps=20000, episode_reward=249.72 +/- 9.08
Episode length: 30.00 +/- 0.00
Eval num_timesteps=24000, episode_reward=75.37 +/- 14.77
Episode length: 30.00 +/- 0.00
Eval num_timesteps=28000, episode_reward=-96.91 +/- 7.72
Episode length: 30.00 +/- 0.00


KeyboardInterrupt: 

In [None]:
train_model_on_env('InvManagement-v1', TRPO, TRPOPolicy, timesteps=2500000, eval_freq=10000)

Eval num_timesteps=40000, episode_reward=-270.87 +/- 2.39
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=-251.62 +/- 4.14
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=120000, episode_reward=-163.15 +/- 6.09
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=160000, episode_reward=-66.92 +/- 5.45
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=200000, episode_reward=5.14 +/- 6.13
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=240000, episode_reward=40.12 +/- 4.45
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=280000, episode_reward=45.02 +/- 3.13
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=320000, episode_reward=111.62 +/- 3.62
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=360000, episode_reward=173.64 +/- 10.02
Episode length: 30.00 +/- 0.00
New best mean reward!
Eva

In [None]:
train_model_on_env('InvManagement-v1', RecurrentPPO, RecurrentActorCriticPolicy, timesteps=2500000, eval_freq=10000)

Eval num_timesteps=40000, episode_reward=-213.86 +/- 4.68
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=12.62 +/- 3.00
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=120000, episode_reward=208.88 +/- 8.19
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=160000, episode_reward=295.88 +/- 11.27
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=200000, episode_reward=340.69 +/- 11.06
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=240000, episode_reward=355.44 +/- 8.18
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=280000, episode_reward=406.45 +/- 9.82
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=320000, episode_reward=414.00 +/- 8.25
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=360000, episode_reward=419.55 +/- 17.84
Episode length: 30.00 +/- 0.00
New best mean reward!


KeyboardInterrupt: 

In [None]:
train_model_on_env('NetworkManagement-v1', PPO, PPOPolicy, timesteps=2500000, eval_freq=10000)

In [None]:
train_model_on_env('NetworkManagement-v1', TRPO, TRPOPolicy, timesteps=2500000, eval_freq=10000)

In [None]:
train_model_on_env('NetworkManagement-v1', RecurrentPPO, RecurrentActorCriticPolicy, timesteps=2500000, eval_freq=10000)

In [None]:
train_model_on_env('NetworkManagement-v1', ARS, ARSPolicy, timesteps=2500000, eval_freq=10000)

In [None]:
train_model_on_env('NetworkManagement-v1', A2C, A2CPolicy, timesteps=2500000, eval_freq=10000)