# Quick Start

This notebook demonstrates how to use MARO's reinforcement learning (RL) toolkit to solve the container inventory management ([CIM](https://maro.readthedocs.io/en/latest/scenarios/container_inventory_management.html)) problem. It is formalized as a multi-agent reinforcement learning problem, where each port acts as a decision agent. When a vessel arrives at a port, that port's agent must take an action by transferring a certain number of containers to / from the vessel. The objective is for the agents to learn policies that minimize the overall container shortage.

In [1]:
import numpy as np

# Common info shared by the shaping functions and the agent configuration.
# Port attributes queried from the port snapshots when building state vectors.
PORT_ATTRIBUTES = [
    "empty",
    "full",
    "on_shipper",
    "on_consignee",
    "booking",
    "shortage",
    "fulfillment",
]
# Vessel attributes queried from the vessel snapshots.
VESSEL_ATTRIBUTES = ["empty", "full", "remaining_space"]
# 21 evenly spaced values in [-1.0, 1.0]; a model action is an index into this list.
ACTION_SPACE = list(np.linspace(-1.0, 1.0, num=21))

## Shaping

In [2]:
from collections import defaultdict
import numpy as np
from maro.simulator.scenarios.cim.common import Action, ActionType

def get_state(decision_event, snapshots, look_back=7):
    """
    Build the state vector for a decision event from the environment snapshots.

    The vector is the concatenation of:
      * port features (PORT_ATTRIBUTES) for the deciding port and for every port in the
        vessel's `future_stop_list`, queried at the `look_back - 1` most recent ticks
        (the current tick included), and
      * vessel features (VESSEL_ATTRIBUTES) at the current tick.

    Args:
        decision_event: Event exposing `tick`, `port_idx` and `vessel_idx`.
        snapshots: Mapping with "vessels" and "ports" entries that support
            `[ticks:node_indices:attributes]` queries.
        look_back: Controls the number of ticks queried (`look_back - 1` of them).

    Returns:
        A 1-D numpy array (assumes the snapshot queries return flat arrays -- TODO confirm).
    """
    current_tick = decision_event.tick
    port_idx = decision_event.port_idx
    vessel_idx = decision_event.vessel_idx

    # Current tick first, then progressively older ticks.
    # NOTE(review): early in an episode these can be negative; confirm how the snapshot
    # list treats ticks before 0.
    history_ticks = [current_tick - offset for offset in range(look_back - 1)]

    downstream_ports = snapshots["vessels"][current_tick:vessel_idx:"future_stop_list"].astype("int")
    ports_of_interest = [port_idx, *downstream_ports]

    port_features = snapshots["ports"][history_ticks:ports_of_interest:PORT_ATTRIBUTES]
    vessel_features = snapshots["vessels"][current_tick:vessel_idx:VESSEL_ATTRIBUTES]
    return np.concatenate([port_features, vessel_features])

def get_env_action(model_action, decision_event, vessel_snapshots):
    """
    Convert an agent's output into an action object that can be executed by the environment.

    Args:
        model_action: Integer index into ACTION_SPACE. Indices below the middle map to negative
            values (load onto the vessel), indices above it to positive values (discharge from
            the vessel), and the middle index maps to 0.0 (no operation).
        decision_event: Event exposing `tick`, `port_idx`, `vessel_idx` and an `action_scope`
            with `load` and `discharge` limits.
        vessel_snapshots: Vessel snapshots supporting `[tick:vessel_idx:attributes]` queries.

    Returns:
        An `Action(vessel_idx, port_idx, quantity, action_type)`. For the zero action the
        quantity is 0 and the action type is None.
    """
    scope = decision_event.action_scope
    tick = decision_event.tick
    port = decision_event.port_idx
    vessel = decision_event.vessel_idx
    # Index of the value 0.0 in ACTION_SPACE. This must be an integer division: true division
    # gives a non-integer for an odd-sized space (10.5 for 21 entries), which no action index
    # can equal, so the no-op branch below would never run.
    zero_action_idx = len(ACTION_SPACE) // 2

    # Index 2 of VESSEL_ATTRIBUTES is "remaining_space".
    vessel_space = vessel_snapshots[tick:vessel:VESSEL_ATTRIBUTES][2]
    early_discharge = vessel_snapshots[tick:vessel:"early_discharge"][0]
    percent = abs(ACTION_SPACE[model_action])
    if model_action < zero_action_idx:
        action_type = ActionType.LOAD
        # Never load more than the vessel can still hold.
        actual_action = min(round(percent * scope.load), vessel_space)
    elif model_action > zero_action_idx:
        action_type = ActionType.DISCHARGE
        # Apply the percentage to the scope enlarged by the early-discharge amount, then
        # subtract that amount again; if nothing is left, fall back to the plain percentage
        # of the discharge scope.
        plan_action = percent * (scope.discharge + early_discharge) - early_discharge
        actual_action = round(plan_action) if plan_action > 0 else round(percent * scope.discharge)
    else:
        actual_action, action_type = 0, None

    return Action(vessel, port, actual_action, action_type)

def get_reward(
    decision_event, port_snapshots, reward_time_window=100, time_decay=0.97,
    fulfillment_factor=1.0, shortage_factor=1.0
):
    """
    Compute the reward for the action taken at a decision event.

    The reward is `fulfillment_factor * F - shortage_factor * S`, where F and S are the
    time-decayed sums of the "fulfillment" and "shortage" values returned by the port
    snapshots for ticks `tick + 1` up to (but excluding) `tick + reward_time_window`.
    The k-th tick in that window is weighted by `time_decay ** k` (k starts at 0).

    Args:
        decision_event: Event exposing `tick`.
        port_snapshots: Port snapshots supporting `[ticks::attribute]` queries.
        reward_time_window: Upper bound (exclusive) of the look-ahead window, relative to `tick`.
        time_decay: Per-tick decay applied to future values.
        fulfillment_factor: Weight of the fulfillment term.
        shortage_factor: Weight of the shortage term.

    Returns:
        The reward as a numpy float32 scalar.
    """
    first_tick = decision_event.tick + 1
    last_tick = decision_event.tick + reward_time_window  # exclusive
    horizon = last_tick - first_tick
    future_ticks = list(range(first_tick, last_tick))

    fulfillment = port_snapshots[future_ticks::"fulfillment"]
    shortage = port_snapshots[future_ticks::"shortage"]

    # The query result holds several values per tick (presumably one per port, tick-major --
    # verify against the snapshot list), so each tick's weight is repeated accordingly.
    weights = []
    for offset in range(horizon):
        values_per_tick = fulfillment.shape[0] // horizon
        weights.extend([time_decay ** offset] * values_per_tick)

    weighted_fulfillment = np.dot(fulfillment, weights)
    weighted_shortage = np.dot(shortage, weights)

    return np.float32(fulfillment_factor * weighted_fulfillment - shortage_factor * weighted_shortage)

def get_training_data(trajectory, port_snapshots):
    """
    Turn a recorded trajectory into per-agent training data.

    Args:
        trajectory: Dict with parallel lists under the keys "agent_id", "event", "state",
            "action" and "log_p" (one entry per transition).
        port_snapshots: Port snapshots, passed to `get_reward` to compute each transition's reward.

    Returns:
        A mapping from agent ID to a dict with the keys "state", "action", "log_p" and
        "reward", each holding a numpy array of that agent's transitions in recorded order.
        Rewards are stored as float32; the other arrays use numpy's inferred dtype.
    """
    agent_ids, events = trajectory["agent_id"], trajectory["event"]
    states, actions, log_probs = trajectory["state"], trajectory["action"], trajectory["log_p"]

    # Bucket the transitions by the agent that produced them.
    buckets = defaultdict(lambda: defaultdict(list))
    for step, state in enumerate(states):
        bucket = buckets[agent_ids[step]]
        bucket["state"].append(state)
        bucket["action"].append(actions[step])
        bucket["log_p"].append(log_probs[step])
        bucket["reward"].append(get_reward(events[step], port_snapshots))

    # Replace the accumulated lists with arrays (keys are only overwritten, never added).
    for bucket in buckets.values():
        for key in bucket:
            dtype = np.float32 if key == "reward" else None
            bucket[key] = np.asarray(bucket[key], dtype=dtype)

    return buckets

## [Agent](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#agent)

The out-of-the-box ActorCritic is used as our agent.

In [3]:
import torch.nn as nn
from torch.optim import Adam, RMSprop

from maro.rl import ActorCritic, ActorCriticConfig, FullyConnectedBlock, OptimOption, SimpleMultiHeadModel

# Size of the state vector fed to both networks.
# Factor 3: the port in question plus two downstream ports.
# Factor 8: the states of these ports over the past 7 days plus the current day.
# NOTE(review): get_state() builds its tick list from range(look_back - 1); confirm that the
# flattened state it returns really has this length.
input_dim = 3 * 8 * len(PORT_ATTRIBUTES) + len(VESSEL_ATTRIBUTES)
agent_config = dict(
    # Keyword arguments for the FullyConnectedBlock of each network.
    model=dict(
        actor=dict(
            input_dim=input_dim,
            output_dim=len(ACTION_SPACE),  # one output per entry of ACTION_SPACE
            hidden_dims=[256, 128, 64],
            activation=nn.Tanh,
            softmax=True,
            batch_norm=False,
            head=True,
        ),
        critic=dict(
            input_dim=input_dim,
            output_dim=1,
            hidden_dims=[256, 128, 64],
            activation=nn.LeakyReLU,
            softmax=False,
            batch_norm=True,
            head=True,
        ),
    ),
    # One optimizer option per network.
    optimization=dict(
        actor=OptimOption(optim_cls=Adam, optim_params=dict(lr=0.001)),
        critic=OptimOption(optim_cls=RMSprop, optim_params=dict(lr=0.001)),
    ),
    # Keyword arguments for ActorCriticConfig.
    hyper_params=dict(
        reward_discount=0.0,
        critic_loss_func=nn.SmoothL1Loss(),
        train_iters=10,
        actor_loss_coefficient=0.1,  # loss = actor_loss_coefficient * actor_loss + critic_loss
        k=1,  # for k-step return
        lam=0.0,  # lambda return coefficient
    ),
)

def get_ac_agent():
    """
    Create a fresh ActorCritic agent from `agent_config`.

    The actor and critic networks are built (in that order) as FullyConnectedBlocks from
    `agent_config["model"]`, wrapped in a SimpleMultiHeadModel with the optimizer options
    from `agent_config["optimization"]`, and combined with the hyper-parameters from
    `agent_config["hyper_params"]`.
    """
    model_config = agent_config["model"]
    networks = {name: FullyConnectedBlock(**model_config[name]) for name in ("actor", "critic")}
    multi_head_model = SimpleMultiHeadModel(networks, optim_option=agent_config["optimization"])
    hyper_params = ActorCriticConfig(**agent_config["hyper_params"])
    return ActorCritic(multi_head_model, hyper_params)

## Actor (Roll-out Loop)

Below is an implementation of the roll-out loop. Note how the shaping functions are used during the agents' interaction with the environment. For each transition, we record the agent ID, event, state, action and its log probability. At the end of the roll-out, the recorded sequence of transitions (the trajectory) gets processed into training data. 

In [4]:
from maro.rl import AbsActor


class BasicActor(AbsActor):
    """Actor that plays one full episode per roll-out and records every transition."""

    def roll_out(self, index, training=True):
        """
        Run one episode from a freshly reset environment.

        At each decision event the agent keyed by the event's `port_idx` chooses an action;
        the state, agent ID, event, action and log probability are recorded before the
        action is converted and sent to the environment.

        Args:
            index: Roll-out index (not used by this implementation).
            training: If True, return the processed training data; otherwise return None.

        Returns:
            The output of `get_training_data` for the recorded trajectory, or None when
            `training` is False.
        """
        self.env.reset()
        trajectory = {"state": [], "action": [], "agent_id": [], "event": [], "log_p": []}

        # Stepping with no action advances the environment to the first decision event.
        _, event, is_done = self.env.step(None)
        while not is_done:
            state = get_state(event, self.env.snapshot_list)
            agent_id = event.port_idx
            action, log_p = self.agent[agent_id].choose_action(state)

            transition = {
                "state": state, "agent_id": agent_id, "event": event, "action": action, "log_p": log_p
            }
            for key, value in transition.items():
                trajectory[key].append(value)

            env_action = get_env_action(action, event, self.env.snapshot_list["vessels"])
            _, event, is_done = self.env.step(env_action)

        if not training:
            return None
        return get_training_data(trajectory, self.env.snapshot_list["ports"])

## Training

This code cell demonstrates a typical single-threaded training workflow.

In [5]:
from maro.simulator import Env
from maro.rl import MultiAgentWrapper
from maro.utils import set_seeds

set_seeds(1024)  # fix the random seeds so runs are reproducible

# Step 1: create a CIM environment for a toy dataset
env = Env("cim", "toy.4p_ssdd_l0.0", durations=1120)

# Step 2: create one actor-critic agent per entry in env.agent_idx_list
agent = MultiAgentWrapper(dict((name, get_ac_agent()) for name in env.agent_idx_list))

# Step 3: training loop -- roll out an episode, report the environment metrics, then let
# every agent learn from its own share of the collected experience
actor = BasicActor(env, agent)
for ep in range(50):
    exp_by_agent = actor.roll_out(ep)
    print(f"ep-{ep}: {env.metrics}")
    for agent_id in exp_by_agent:
        exp = exp_by_agent[agent_id]
        agent[agent_id].learn(exp["state"], exp["action"], exp["log_p"], exp["reward"])

ep-0: {'order_requirements': 2240000, 'container_shortage': 1422736, 'operation_number': 4220466}
ep-1: {'order_requirements': 2240000, 'container_shortage': 1330641, 'operation_number': 3919970}
ep-2: {'order_requirements': 2240000, 'container_shortage': 996878, 'operation_number': 3226186}
ep-3: {'order_requirements': 2240000, 'container_shortage': 703662, 'operation_number': 3608511}
ep-4: {'order_requirements': 2240000, 'container_shortage': 601934, 'operation_number': 3579281}
ep-5: {'order_requirements': 2240000, 'container_shortage': 629344, 'operation_number': 3456707}
ep-6: {'order_requirements': 2240000, 'container_shortage': 560709, 'operation_number': 3511869}
ep-7: {'order_requirements': 2240000, 'container_shortage': 483549, 'operation_number': 3613713}
ep-8: {'order_requirements': 2240000, 'container_shortage': 390332, 'operation_number': 3817820}
ep-9: {'order_requirements': 2240000, 'container_shortage': 361151, 'operation_number': 3823994}
ep-10: {'order_requirements'