## Using the StableBaselines3 library for reinforcement learning

In this notebook we test an implementation of the proximal policy optimization (PPO) method for the 3D bin packing environment.
The PPO method is described in detail in [this paper](https://arxiv.org/abs/1707.06347). It is a variant of Trust Region Policy Optimization (TRPO) described in [this article](https://arxiv.org/abs/1502.05477). The PPO algorithm works in two phases. In one phase, a large number of rollouts are performed (in parallel). The rollouts are then aggregated on the driver and a surrogate optimization objective is defined based on those rollouts. We then use SGD to find the policy that maximizes that objective with a penalty term for diverging too much from the current policy.

![ppo](https://raw.githubusercontent.com/ucbrise/risecamp/risecamp2018/ray/tutorial/rllib_exercises/ppo.png)

We now test the PPO algorithm with the 3D bin packing environment.

In [1]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

import gym
from numpy.typing import NDArray
from sb3_contrib.common.wrappers import ActionMasker

from src.utils import boxes_generator

In [2]:
def mask_fn(env: gym.Env) -> NDArray:
    return env.get_action_mask

def make_env(container_size, num_boxes, num_visible_boxes=1, seed=0, render_mode="rgb_array",
             random_boxes=False, only_terminal_reward=False):
    """Utility function for initializing bin packing env with action masking
    :param seed: (int) the inital seed for RNG
    :param rank: (int) index of the subprocess
    """

    env = gym.make(
        "PackingEnv-v0",
        container_size=container_size,
        box_sizes=boxes_generator(container_size, num_boxes, seed),
        num_visible_boxes=num_visible_boxes,
        render_mode=render_mode,
        random_boxes=random_boxes,
        only_terminal_reward=only_terminal_reward
    )
    env = ActionMasker(env, mask_fn)
    return env


In [3]:
from stable_baselines3.common.env_util import make_vec_env
from sb3_contrib.ppo_mask import MaskablePPO
from sb3_contrib.common.maskable.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback


warnings.filterwarnings("ignore", category=DeprecationWarning)
# Environment initialization
container_size = [5, 5, 5]
num_boxes = 10
num_visible_boxes = 10
num_env = 2
env_kwargs = dict(
    container_size=container_size,
    num_boxes=num_boxes,
    num_visible_boxes=num_visible_boxes,
    render_mode="rgb_array",
    seed=42,
    random_boxes=True,
    only_terminal_reward=False)
env = make_vec_env(make_env, n_envs=num_env, env_kwargs=env_kwargs)
print("finished initialization of vectorized environment")
print("beginning training")

# MaskablePPO initialization
model = MaskablePPO("MultiInputPolicy", env, gamma=0.4, verbose=1, tensorboard_log="../logs")

#checkpoint_callback = CheckpointCallback(
 #   save_freq=10, save_path="../logs/", name_prefix="rl_model"
#)

evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90, warn=False)
model.learn(50, callback=checkpoint_callback)
print("done training")
model.save("../models/ppo_mask_cont555_boxes10_vis10_steps_50_numenv_2")

finished initialization of vectorized environment
beginning training
Using cpu device


ValueError: not enough values to unpack (expected 5, got 4)

In [2]:
from sb3_contrib.common.maskable.evaluation import evaluate_policy
from stable_baselines3.common.vec_env.vec_monitor import VecMonitor
import os

model = MaskablePPO.load("../models/ppo_mask_cont555_boxes10_vis10_steps_50_numenv_2")

num_env = 2
env_kwargs = dict(
    container_size=container_size,
    num_boxes=num_boxes,
    num_visible_boxes=num_visible_boxes,
    render_mode="rgb_array",
    seed=42,
    random_boxes=True,
    only_terminal_reward=False)

eval_env = make_vec_env(make_env, n_envs=num_env, env_kwargs=env_kwargs)
log_dir = "../eval/"
os.makedirs(log_dir, exist_ok=True)
eval_env = VecMonitor(eval_env, log_dir)

print("beginning evaluation")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

# obs = env.reset()
# while True:
#     # Retrieve current action mask
#     action_masks = get_action_masks(env)
#     action, _states = model.predict(obs, action_masks=action_masks)
#     obs, rewards, dones, info = env.step(action)
#     env.render()


NameError: name 'MaskablePPO' is not defined

In [4]:
from plotly_gif import GIF

gif = GIF(gif_name="555_10box_10vis_200steps.gif", gif_path="../gifs")

container_size = [5, 5, 5]
num_boxes = 10
num_visible_boxes = 10
seed = 33
env_kwargs = dict(
    container_size=container_size,
    num_boxes=num_boxes,
    num_visible_boxes=num_visible_boxes,
    render_mode="human",
    seed=seed,

)

eval_env = make_vec_env(make_env, n_envs=2, env_kwargs=env_kwargs)

done = False
obs = eval_env.reset()


In [5]:
while not done:
    # Retrieve current action mask
    action, _states = model.predict(obs)
    obs, rewards, dones, info = eval_env.step(action)
    fig = eval_env.render()
    gif.create_image(fig)

gif.create_gif(length=5000)
fig = eval_env.container.plot()






AttributeError: 'NoneType' object has no attribute 'write_image'