# Quick Start

This notebook demonstrates how to use MARO's reinforcement learning (RL) toolkit to solve the container inventory management ([CIM](https://maro.readthedocs.io/en/latest/scenarios/container_inventory_management.html)) problem. It is formalized as a multi-agent reinforcement learning problem, where each port acts as a decision agent. The agents take actions independently, e.g., loading containers to vessels or discharging containers from vessels.   

## [State Shaper](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#shapers)

The state shaper converts an environment observation into the model input state, which includes both temporal and spatial information. For this scenario, the model input state includes: 

- Temporal information, including the past week's information of ports and vessels, such as shortage on port and remaining space on vessel. 

- Spatial information, including the features of the related downstream ports.    

In [1]:
import numpy as np
from maro.rl import Shaper


# Attribute names queried from the "ports" snapshot list when building a state.
PORT_ATTRIBUTES = [
    "empty",
    "full",
    "on_shipper",
    "on_consignee",
    "booking",
    "shortage",
    "fulfillment",
]
# Attribute names queried from the "vessels" snapshot list when building a state.
VESSEL_ATTRIBUTES = [
    "empty",
    "full",
    "remaining_space",
]


class CIMStateShaper(Shaper):
    """Build the model input state for a CIM decision event.

    The state is the concatenation of port features (queried over a window of
    recent ticks for the deciding port and the vessel's future stops) and the
    current vessel features.

    Args:
        look_back: Window size used to derive the queried ticks and ``dim``.
        max_ports_downstream: Number of downstream ports assumed by ``dim``.
    """

    def __init__(self, *, look_back, max_ports_downstream):
        super().__init__()
        self._look_back = look_back
        self._max_ports_downstream = max_ports_downstream
        port_feature_count = (look_back + 1) * (max_ports_downstream + 1) * len(PORT_ATTRIBUTES)
        self._dim = port_feature_count + len(VESSEL_ATTRIBUTES)

    def __call__(self, decision_event, snapshot_list):
        """Return the state array for ``decision_event``.

        Args:
            decision_event: Event providing ``tick``, ``port_idx`` and ``vessel_idx``.
            snapshot_list: Snapshot container indexed by ``"ports"`` / ``"vessels"``
                and sliced as ``[ticks: node indices: attributes]``.

        Returns:
            ``np.concatenate`` of the port features and the vessel features.
        """
        tick = decision_event.tick
        port_idx = decision_event.port_idx
        vessel_idx = decision_event.vessel_idx

        # NOTE(review): this window holds ``look_back - 1`` ticks and the port
        # list holds one entry per future stop, while ``dim`` is computed from
        # ``look_back + 1`` and ``max_ports_downstream + 1`` -- confirm that the
        # two sizes agree for the topology in use.
        ticks = [tick - offset for offset in range(self._look_back - 1)]
        downstream_ports = snapshot_list["vessels"][tick: vessel_idx: 'future_stop_list'].astype('int')
        queried_ports = [port_idx] + list(downstream_ports)
        port_features = snapshot_list["ports"][ticks: queried_ports: PORT_ATTRIBUTES]
        vessel_features = snapshot_list["vessels"][tick: vessel_idx: VESSEL_ATTRIBUTES]
        return np.concatenate((port_features, vessel_features))

    @property
    def dim(self):
        """State dimension computed at construction time."""
        return self._dim
    
# Create the state shaper instance used by the rest of the notebook.
state_shaper = CIMStateShaper(
    look_back=7,
    max_ports_downstream=2,
)

## [Action Shaper](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#shapers)

Action shaper is used to convert an agent's model output to an environment executable action. For this specific scenario, the output is a discrete index that corresponds to a percentage indicating the fraction of containers to be loaded to or discharged from the arriving vessel.

In [2]:
from maro.simulator.scenarios.cim.common import Action, ActionType


class CIMActionShaper(Shaper):
    """Convert a discrete model output into an environment ``Action``.

    Args:
        action_space: Sequence of fractions indexed by the model output. It must
            contain ``0``; indices below that entry produce LOAD actions and
            indices above it produce DISCHARGE actions.
    """

    def __init__(self, action_space):
        super().__init__()
        self._action_space = action_space
        # Position of the "do nothing" entry; it splits load from discharge indices.
        self._zero_action_index = action_space.index(0)

    def __call__(self, model_action, decision_event, snapshot_list):
        """Build the ``Action`` for ``model_action`` at the given decision event.

        Args:
            model_action: Index into the action space chosen by the model.
            decision_event: Event providing ``action_scope``, ``tick``,
                ``port_idx`` and ``vessel_idx``.
            snapshot_list: Snapshot container queried for port and vessel values.

        Returns:
            ``Action(vessel_idx, port_idx, quantity, action_type)`` where the
            quantity is non-negative and ``action_type`` is ``None`` for the
            zero action.
        """
        scope = decision_event.action_scope
        tick = decision_event.tick
        port_idx = decision_event.port_idx
        vessel_idx = decision_event.vessel_idx

        port_values = snapshot_list["ports"][tick: port_idx: ["empty", "full", "on_shipper", "on_consignee"]]
        port_empty = port_values[0]
        vessel_values = snapshot_list["vessels"][tick: vessel_idx: ["empty", "full", "remaining_space"]]
        vessel_remaining_space = vessel_values[2]
        early_discharge = snapshot_list["vessels"][tick:vessel_idx: "early_discharge"][0]
        assert 0 <= model_action < len(self._action_space)
        fraction = self._action_space[model_action]

        if model_action < self._zero_action_index:
            # ``fraction`` is negative here, so the amount is bounded below by
            # the negated remaining vessel space; the sign is dropped on return.
            quantity = max(round(fraction * port_empty), -vessel_remaining_space)
            action_type = ActionType.LOAD
        elif model_action > self._zero_action_index:
            planned = fraction * (scope.discharge + early_discharge) - early_discharge
            if planned > 0:
                quantity = round(planned)
            else:
                quantity = round(fraction * scope.discharge)
            action_type = ActionType.DISCHARGE
        else:
            quantity, action_type = 0, None

        return Action(vessel_idx, port_idx, abs(quantity), action_type)
    
# Create an action shaper over NUM_ACTIONS evenly spaced fractions in [-1.0, 1.0].
NUM_ACTIONS = 21
action_shaper = CIMActionShaper(
    action_space=list(np.linspace(-1.0, 1.0, NUM_ACTIONS)),
)

## [Experience Shaper](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#shapers)

Experience shaper is used to convert an episode trajectory to trainable experiences for RL agents. For this specific scenario, the reward is a linear combination of fulfillment and shortage in a limited time window.

In [3]:
from collections import defaultdict


class CIMExperienceShaper(Shaper):
    """Convert a recorded episode trajectory into per-agent training experiences.

    Args:
        time_window: Offset added to a decision tick to obtain the exclusive
            end of the reward window.
        time_decay_factor: Base of the per-tick decay weights.
        fulfillment_factor: Weight of the decayed fulfillment total in the reward.
        shortage_factor: Weight of the decayed shortage total in the reward.
    """

    def __init__(
        self,
        *,
        time_window: int,
        time_decay_factor: float,
        fulfillment_factor: float,
        shortage_factor: float
    ):
        super().__init__(reward_func=None)
        self._time_window = time_window
        self._time_decay_factor = time_decay_factor
        self._fulfillment_factor = fulfillment_factor
        self._shortage_factor = shortage_factor
        self._trajectory = {"state": [], "action": [], "agent_id": [], "event": []}

    def __call__(self, snapshot_list):
        """Group the recorded transitions by agent and attach rewards.

        Every recorded step except the last becomes one experience whose
        ``next_state`` is the state recorded at the following step.

        Returns:
            A dict mapping agent id to a dict of lists keyed by ``"state"``,
            ``"action"``, ``"reward"`` and ``"next_state"``.
        """
        states = self._trajectory["state"]
        actions = self._trajectory["action"]
        agent_ids = self._trajectory["agent_id"]
        events = self._trajectory["event"]

        experiences_by_agent = defaultdict(lambda: defaultdict(list))
        for step, next_state in enumerate(states[1:]):
            bucket = experiences_by_agent[agent_ids[step]]
            bucket["state"].append(states[step])
            bucket["action"].append(actions[step])
            bucket["reward"].append(self._compute_reward(events[step], snapshot_list))
            bucket["next_state"].append(next_state)

        return dict(experiences_by_agent)

    def record(self, transition: dict):
        """Append each value of ``transition`` to the trajectory list of its key."""
        for name, value in transition.items():
            self._trajectory[name].append(value)

    def reset(self):
        """Discard the recorded trajectory and start with empty lists."""
        self._trajectory = {"state": [], "action": [], "agent_id": [], "event": []}

    def _compute_reward(self, decision_event, snapshot_list):
        """Return the decayed fulfillment-minus-shortage reward for one event.

        The window spans the ticks after the decision tick up to, but not
        including, ``decision_event.tick + time_window``.
        """
        start_tick = decision_event.tick + 1
        end_tick = decision_event.tick + self._time_window
        num_ticks = end_tick - start_tick
        ticks = list(range(start_tick, end_tick))

        # calculate tc reward
        future_fulfillment = snapshot_list["ports"][ticks::"fulfillment"]
        future_shortage = snapshot_list["ports"][ticks::"shortage"]

        # One weight per queried value: the weight of tick offset ``i`` is
        # repeated for every value belonging to that tick. The division stays
        # inside the loop so an empty window never evaluates it.
        decay_list = []
        for offset in range(num_ticks):
            values_per_tick = future_fulfillment.shape[0] // num_ticks
            decay_list.extend(self._time_decay_factor ** offset for _ in range(values_per_tick))

        tot_fulfillment = np.dot(future_fulfillment, decay_list)
        tot_shortage = np.dot(future_shortage, decay_list)

        reward = self._fulfillment_factor * tot_fulfillment - self._shortage_factor * tot_shortage
        return np.float32(reward)
    
# Create the experience shaper instance used by the rest of the notebook.
experience_shaper = CIMExperienceShaper(
    time_window=100,
    fulfillment_factor=1.0,
    shortage_factor=1.0,
    time_decay_factor=0.97,
)

## [Agent](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#agent)

For this scenario, the agent is the algorithmic abstraction of a port. We choose DQN as our underlying learning algorithm with a TD-error-based sampling mechanism.  

In [4]:
import torch.nn as nn
from torch.nn.functional import smooth_l1_loss
from torch.optim import RMSprop

from maro.rl import DQN, DQNConfig, FullyConnectedBlock, OptimOption, SimpleMultiHeadModel, SimpleStore
from maro.utils import set_seeds


def create_dqn_agents(agent_id_list):
    """Create one DQN agent per id in ``agent_id_list``.

    Each agent gets its own Q-network (input size ``state_shaper.dim``, output
    size ``NUM_ACTIONS``), its own RMSprop-optimized model and its own config.

    Args:
        agent_id_list: Iterable of agent ids.

    Returns:
        A dict mapping each agent id to its ``DQN`` instance.
    """
    set_seeds(64)  # for reproducibility

    agents = {}
    for agent_id in agent_id_list:
        # Objects are built in the same order for every agent so that seeded
        # initialization stays reproducible.
        q_net = FullyConnectedBlock(
            input_dim=state_shaper.dim,
            hidden_dims=[256, 128, 64],
            output_dim=NUM_ACTIONS,
            activation=nn.LeakyReLU,
            is_head=True,
            batch_norm=True,
            softmax=False,
            skip_connection=False,
            dropout_p=.0
        )
        optim_option = OptimOption(optim_cls=RMSprop, optim_params={"lr": 0.05})
        learning_model = SimpleMultiHeadModel(q_net, optim_option=optim_option)
        dqn_config = DQNConfig(
            reward_discount=.0,
            min_exp_to_train=1024,
            num_batches=10,
            batch_size=128,
            target_update_freq=5,
            tau=0.1,
            is_double=True,
            per_sample_td_error=True,
            loss_cls=nn.SmoothL1Loss
        )
        agents[agent_id] = DQN(agent_id, learning_model, config=dqn_config)

    return agents

## [Actor](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#actor)

The sole purpose of an actor is to perform roll-outs and collect experiences. An actor consists of an environment instance, an agent (a single agent or multiple agents wrapped by MultiAgentWrapper) and optionally a state shaper, an action shaper and an experience shaper if certain conversions are necessary.  

In [5]:
from maro.rl import AbsActor


class Actor(AbsActor):
    """Actor that runs one full episode and collects experiences from it."""

    def __init__(self, env, agent, state_shaper, action_shaper, experience_shaper):
        super().__init__(
            env,
            agent,
            state_shaper=state_shaper,
            action_shaper=action_shaper,
            experience_shaper=experience_shaper,
        )

    def roll_out(self, index, training=True):
        """Play one episode from a freshly reset environment.

        Args:
            index: Roll-out index; not used by this implementation.
            training: When true, the recorded trajectory is converted to
                experiences; otherwise ``None`` is returned.

        Returns:
            The experience shaper's output, or ``None`` when ``training`` is false.
        """
        self.env.reset()
        _, event, is_done = self.env.step(None)
        while not is_done:
            state = self.state_shaper(event, self.env.snapshot_list)
            agent_id = str(event.port_idx)
            action = self.agent[agent_id].choose_action(state)
            transition = {"state": state, "agent_id": agent_id, "event": event, "action": action}
            self.experience_shaper.record(transition)
            env_action = self.action_shaper(action, event, self.env.snapshot_list)
            _, event, is_done = self.env.step(env_action)

        if training:
            exp = self.experience_shaper(self.env.snapshot_list)
        else:
            exp = None
        # The trajectory is cleared in both modes so the next episode starts empty.
        self.experience_shaper.reset()

        return exp

## [Learner](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#learner)

A learner implements the main training loop and policy update logic. It contains an actor for performing roll-outs and collecting experiences and a scheduler for controlling the training loop and generating exploration parameters. 

In [6]:
from maro.rl import AbsLearner
from maro.utils import LogFormat, Logger


class Learner(AbsLearner):
    """Learner that alternates roll-outs with agent updates."""

    def __init__(self, actor, scheduler):
        super().__init__(actor, scheduler)
        self._logger = Logger("learner", format_=LogFormat.simple, auto_timestamp=False)

    def learn(self):
        """Run one roll-out and one update per item yielded by the scheduler."""
        for exploration_params in self.scheduler:
            # load exploration parameters
            self.actor.agent.set_exploration_params(exploration_params)
            exp = self.actor.roll_out(self.scheduler.iter)
            message = (
                f"ep {self.scheduler.iter} - performance: {self.actor.env.metrics}, "
                f"exploration_params: {exploration_params}"
            )
            self._logger.info(message)
            self.update(exp)

    def update(self, experiences_by_agent):
        """Store the new experiences for each agent, then train every agent.

        Args:
            experiences_by_agent: Dict mapping agent id to a dict of equally
                long lists; a ``"loss"`` list is added to each before storing.
        """
        # Store experiences for each agent
        for agent_id, exp in experiences_by_agent.items():
            num_samples = len(list(exp.values())[0])
            # Every new experience is stored with the same placeholder loss value.
            exp["loss"] = [1e8] * num_samples
            self.actor.agent[agent_id].store_experiences(exp)

        for dqn_agent in self.actor.agent.agent_dict.values():
            dqn_agent.train()

## Main Loop

This code cell demonstrates the typical workflow of a learning policy's interaction with a MARO environment. 

- Initialize an environment with specific scenario and topology parameters. 

- Define scenario-specific components, e.g. shapers. 

- Create agents. 

- Create an actor and a learner to start the training process. 

In [7]:
from maro.simulator import Env
from maro.rl import MultiAgentWrapper, TwoPhaseLinearParameterScheduler

# Step 1: initialize a CIM environment for a toy dataset.
env = Env("cim", "toy.4p_ssdd_l0.0", durations=1120)
# Agent ids are strings because the actor looks agents up by ``str(event.port_idx)``.
agent_id_list = [str(agent_id) for agent_id in env.agent_idx_list]

# Step 2: create DQN agents.
agent = MultiAgentWrapper(create_dqn_agents(agent_id_list))

# Step 3: Create an actor and a learner to start the training process.
# The scheduler yields the exploration parameters ("epsilon") for each episode.
scheduler = TwoPhaseLinearParameterScheduler(
    max_iter=100,
    parameter_names=["epsilon"],
    split_ep=50,
    start_values=0.4,
    mid_values=0.32,
    end_values=.0
)

actor = Actor(env, agent, state_shaper, action_shaper, experience_shaper)
learner = Learner(actor, scheduler)
# The training loop is implemented in ``Learner.learn()``; ``Learner`` itself
# defines no ``run()`` method, so the loop is started through ``learn()``.
learner.learn()

10:15:32 | learner | INFO | ep 0 - performance: {'order_requirements': 2240000, 'container_shortage': 1352136, 'operation_number': 3254760}, exploration_params: {'epsilon': 0.4}
10:15:37 | learner | INFO | ep 1 - performance: {'order_requirements': 2240000, 'container_shortage': 1249849, 'operation_number': 3426101}, exploration_params: {'epsilon': 0.39840000000000003}
10:15:41 | learner | INFO | ep 2 - performance: {'order_requirements': 2240000, 'container_shortage': 1174857, 'operation_number': 3816050}, exploration_params: {'epsilon': 0.39680000000000004}
10:15:46 | learner | INFO | ep 3 - performance: {'order_requirements': 2240000, 'container_shortage': 1168029, 'operation_number': 3783409}, exploration_params: {'epsilon': 0.39520000000000005}
10:15:51 | learner | INFO | ep 4 - performance: {'order_requirements': 2240000, 'container_shortage': 1478014, 'operation_number': 3503012}, exploration_params: {'epsilon': 0.39360000000000006}
10:15:56 | learner | INFO | ep 5 - performance

10:19:25 | learner | INFO | ep 43 - performance: {'order_requirements': 2240000, 'container_shortage': 676856, 'operation_number': 4361711}, exploration_params: {'epsilon': 0.33120000000000044}
10:19:31 | learner | INFO | ep 44 - performance: {'order_requirements': 2240000, 'container_shortage': 669537, 'operation_number': 5291246}, exploration_params: {'epsilon': 0.32960000000000045}
10:19:37 | learner | INFO | ep 45 - performance: {'order_requirements': 2240000, 'container_shortage': 569000, 'operation_number': 4652232}, exploration_params: {'epsilon': 0.32800000000000046}
10:19:43 | learner | INFO | ep 46 - performance: {'order_requirements': 2240000, 'container_shortage': 604969, 'operation_number': 5123438}, exploration_params: {'epsilon': 0.32640000000000047}
10:19:48 | learner | INFO | ep 47 - performance: {'order_requirements': 2240000, 'container_shortage': 557511, 'operation_number': 4832546}, exploration_params: {'epsilon': 0.3248000000000005}
10:19:54 | learner | INFO | ep 

10:23:35 | learner | INFO | ep 86 - performance: {'order_requirements': 2240000, 'container_shortage': 129900, 'operation_number': 4370451}, exploration_params: {'epsilon': 0.08489795918367388}
10:23:41 | learner | INFO | ep 87 - performance: {'order_requirements': 2240000, 'container_shortage': 150391, 'operation_number': 4372565}, exploration_params: {'epsilon': 0.07836734693877592}
10:23:47 | learner | INFO | ep 88 - performance: {'order_requirements': 2240000, 'container_shortage': 240991, 'operation_number': 4263189}, exploration_params: {'epsilon': 0.07183673469387797}
10:23:53 | learner | INFO | ep 89 - performance: {'order_requirements': 2240000, 'container_shortage': 108313, 'operation_number': 4400742}, exploration_params: {'epsilon': 0.06530612244898001}
10:23:59 | learner | INFO | ep 90 - performance: {'order_requirements': 2240000, 'container_shortage': 153466, 'operation_number': 4211634}, exploration_params: {'epsilon': 0.05877551020408205}
10:24:05 | learner | INFO | ep