In [2]:
import numpy as np
import random
import matplotlib.pyplot as plt
import or_gym
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from collections import namedtuple, deque
from itertools import count
from or_gym.utils import create_env
from gym.spaces import Box

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.env_util import make_vec_env

from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy as SACPolicy

from stable_baselines3 import A2C
from stable_baselines3.a2c.policies import MlpPolicy as A2CPolicy

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy as PPOPolicy

from sb3_contrib import ARS
from sb3_contrib.ars.policies import ARSPolicy

from sb3_contrib import RecurrentPPO
from sb3_contrib.ppo_recurrent.policies import RecurrentActorCriticPolicy

from sb3_contrib import TQC
from sb3_contrib.tqc.policies import MlpPolicy as TQCPolicy

from sb3_contrib import TRPO
from sb3_contrib.trpo.policies import MlpPolicy as TRPOPolicy

In [3]:
def create_env(env_name, n_envs, monitor_path, env_seed):
    if n_envs == 1:
        env = or_gym.make(env_name, seed_int=env_seed)
        monitored_env = Monitor(env, monitor_path)
        return monitored_env
    else:
        def make_env():    
            def _init():
                return or_gym.make(env_name, seed_int=env_seed)
            return _init
    
        vec_env = make_vec_env(make_env(), n_envs=n_envs, seed=env_seed, vec_env_cls=DummyVecEnv)
        #vec_env = make_vec_env(env, n_envs, env_seed, vec_env_cls=DummyVecEnv)
        monitored_vec_env = VecMonitor(vec_env, monitor_path)
        return monitored_vec_env


def train_model_on_env(env_name, algorithm, policy, name='default', n_envs=4, env_seed=42, timesteps=100000, eval_freq=1000):
    env_model_path = f'./data/{env_name}/{algorithm.__name__}/{name}/'

    if not os.path.exists(env_model_path):
        os.makedirs(env_model_path)

    env = create_env(env_name, n_envs, env_model_path, env_seed)
    model = algorithm(policy, env)

    eval_callback = EvalCallback(env, best_model_save_path=env_model_path, verbose=1,
                                log_path=env_model_path, eval_freq=eval_freq, deterministic=True, render=False)
    
    model.learn(total_timesteps=timesteps, callback=eval_callback)


In [24]:
train_model_on_env('InvManagement-v1', PPO, PPOPolicy, timesteps=2500000, eval_freq=10000)



Eval num_timesteps=40000, episode_reward=-268.81 +/- 5.45
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=-272.40 +/- 5.00
Episode length: 30.00 +/- 0.00
Eval num_timesteps=120000, episode_reward=-269.01 +/- 3.43
Episode length: 30.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=-257.26 +/- 1.77
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=200000, episode_reward=-250.25 +/- 1.64
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=240000, episode_reward=-217.45 +/- 6.85
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=280000, episode_reward=-185.44 +/- 4.26
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=320000, episode_reward=-152.40 +/- 5.72
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=360000, episode_reward=-120.23 +/- 5.79
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=400000, episode_rew

In [4]:
train_model_on_env('InvManagement-v1', A2C, A2CPolicy, timesteps=2500000, eval_freq=10000)



Eval num_timesteps=40000, episode_reward=20.66 +/- 2.67
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=80000, episode_reward=183.58 +/- 9.81
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=120000, episode_reward=178.69 +/- 3.31
Episode length: 30.00 +/- 0.00
Eval num_timesteps=160000, episode_reward=151.16 +/- 7.18
Episode length: 30.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=199.47 +/- 5.65
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=240000, episode_reward=192.21 +/- 7.16
Episode length: 30.00 +/- 0.00
Eval num_timesteps=280000, episode_reward=266.38 +/- 7.81
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=320000, episode_reward=275.29 +/- 16.92
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=360000, episode_reward=423.81 +/- 13.63
Episode length: 30.00 +/- 0.00
New best mean reward!
Eval num_timesteps=400000, episode_reward=408.19 +/- 9.62
Episode le

In [None]:
train_model_on_env('InvManagement-v1', SAC, SACPolicy, timesteps=2500000, eval_freq=10000)