In [1]:
# Library imports
from env import WarehouseEnv
from stable_baselines3 import PPO, DQN
import json

In [2]:
# Launch TensorBoard inline, pointed at the 'Data_Training' directory that the
# training functions in this notebook use as their tensorboard_log root.
%load_ext tensorboard
%tensorboard --logdir='Data_Training' --port 3001    # Change the port if 3001 is already in use (e.g. 6006, 9009, 9999)

# If the extension fails to load, install/upgrade the packages first:
#   pip install --upgrade tensorboard tensorflow

In [3]:
def train_dqn(total_timesteps=500_000, seed=None):
    """Train a DQN agent on ``WarehouseEnv`` and persist the results.

    Args:
        total_timesteps: Number of environment steps to train for. Defaults
            to 500,000, the value that used to be hard-coded.
        seed: Optional integer seed forwarded to the ``DQN`` constructor so a
            run can be reproduced. ``None`` (the default) leaves it unseeded,
            matching the previous behaviour.

    Side effects:
        * Writes TensorBoard logs under ``Data_Training/DQN``.
        * Saves the model to ``rl_models/dqn_models/dqn_model.zip``.
        * Saves the replay buffer to
          ``rl_models/dqn_replay_buffer/replay_buffer_dqn.zip``.
        * Prints ``Saved DQN model`` when done.
    """
    env = WarehouseEnv()

    # Always release the environment, even if training or saving fails.
    try:
        # NOTE(review): SB3 resets the env itself when learning starts, so this
        # call looks redundant - kept to preserve the original call sequence.
        env.reset()

        model_DQN = DQN(
            "MlpPolicy",
            env,
            tensorboard_log="Data_Training/DQN",
            verbose=0,
            device="cuda",
            seed=seed,
            learning_rate=0.005,
            gamma=0.99,
            # Epsilon goes from 1.0 to 0.05 over the first 80% of training.
            exploration_fraction=0.8,
            exploration_initial_eps=1,
            exploration_final_eps=0.05,
        )

        model_DQN.learn(total_timesteps, reset_num_timesteps=True)
        model_DQN.save('rl_models/dqn_models/dqn_model.zip')
        model_DQN.save_replay_buffer('rl_models/dqn_replay_buffer/replay_buffer_dqn.zip')
        print('Saved DQN model')
    finally:
        env.close()

In [4]:
# Run DQN training; prints 'Saved DQN model' once the model and replay buffer are written.
train_dqn()

Saved DQN model


In [5]:
def train_ppo(total_timesteps=500_000, seed=None):
    """Train a PPO agent on ``WarehouseEnv`` and save it to disk.

    Args:
        total_timesteps: Number of environment steps to train for. Defaults
            to 500,000, the value that used to be hard-coded. (It was
            previously held in a local called ``n_steps``, which was easy to
            confuse with PPO's own ``n_steps`` constructor argument.)
        seed: Optional integer seed forwarded to the ``PPO`` constructor so a
            run can be reproduced. ``None`` (the default) leaves it unseeded,
            matching the previous behaviour.

    Side effects:
        * Writes TensorBoard logs under ``Data_Training/PPO``.
        * Saves the model to ``rl_models/ppo_models/ppo_model.zip``.
        * Prints ``Saved PPO model`` when done.
    """
    env = WarehouseEnv()

    # Always release the environment, even if training or saving fails.
    try:
        # NOTE(review): SB3 resets the env itself when learning starts, so this
        # call looks redundant - kept to preserve the original call sequence.
        env.reset()

        model_PPO = PPO(
            "MlpPolicy",
            env,
            tensorboard_log="Data_Training/PPO",
            verbose=0,
            device="cuda",
            seed=seed,
            learning_rate=0.0001,
            n_steps=2048,   # PPO constructor argument, unrelated to total_timesteps
            batch_size=64,
            n_epochs=10,
            gamma=0.99,
        )

        model_PPO.learn(total_timesteps, reset_num_timesteps=True)
        model_PPO.save('rl_models/ppo_models/ppo_model.zip')
        print('Saved PPO model')
    finally:
        env.close()

In [6]:
# Run PPO training; prints 'Saved PPO model' once the model file is written.
train_ppo()

Saved PPO model


In [7]:
def train_maskable_ppo(total_timesteps=500_000, seed=None):
    """Train a MaskablePPO agent on ``WarehouseEnv`` and save it to disk.

    Args:
        total_timesteps: Number of environment steps to train for. Defaults
            to 500,000, the value that used to be hard-coded.
        seed: Optional integer seed forwarded to the ``MaskablePPO``
            constructor so a run can be reproduced. ``None`` (the default)
            leaves it unseeded, matching the previous behaviour.

    Side effects:
        * Writes TensorBoard logs under ``Data_Training/MaskablePPO``.
        * Saves the model to
          ``rl_models/maskable_ppo_models/maskable_ppo_model.zip``.
        * Prints ``Saved Maskable PPO model`` when done.
    """
    # Imported locally so the rest of the notebook does not require sb3_contrib.
    from sb3_contrib import MaskablePPO

    env = WarehouseEnv()

    # Always release the environment, even if training or saving fails.
    try:
        # NOTE(review): SB3 resets the env itself when learning starts, so this
        # call looks redundant - kept to preserve the original call sequence.
        env.reset()

        # NOTE(review): MaskablePPO presumably relies on WarehouseEnv exposing
        # its valid-action mask (e.g. an `action_masks()` method) - confirm in env.py.
        model_maskable_PPO = MaskablePPO(
            "MlpPolicy",
            env,
            tensorboard_log="Data_Training/MaskablePPO",
            verbose=0,
            device="cuda",
            seed=seed,
            learning_rate=0.0001,
            n_steps=2048,   # constructor argument, unrelated to total_timesteps
            batch_size=64,
            n_epochs=10,
            gamma=0.99,
        )

        model_maskable_PPO.learn(total_timesteps, reset_num_timesteps=True)
        model_maskable_PPO.save('rl_models/maskable_ppo_models/maskable_ppo_model.zip')
        print('Saved Maskable PPO model')
    finally:
        env.close()



In [8]:
# Run MaskablePPO training; prints 'Saved Maskable PPO model' once the model file is written.
train_maskable_ppo()

Saved Maskable PPO model


In [None]:
# Run the env with a random policy: play one episode with uniformly sampled
# actions, render every step and report the accumulated reward.

done = False
final_reward = 0
env = WarehouseEnv()
state, _ = env.reset()

while not done:
    action = env.action_space.sample()
    # step() returns five values; the 3rd and 4th are the separate
    # `terminated` and `truncated` flags. Unpacking both into `done` kept only
    # the last one, so an episode that terminated was never detected.
    state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    final_reward += reward
    env.render()
print(f'Final reward: {final_reward}')
#Run the env with a trained policy