# Quick Start

This notebook demonstrates how to use MARO's reinforcement learning (RL) toolkit to solve the container inventory management ([CIM](https://maro.readthedocs.io/en/latest/scenarios/container_inventory_management.html)) problem. It is formalized as a multi-agent reinforcement learning problem, where each port acts as a decision agent. The agents take actions independently, e.g., loading containers to vessels or discharging containers from vessels.   

## [State Shaper](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#shapers)

A state shaper converts an environment observation into the model input state, which combines temporal and spatial information. For this scenario, the model input state includes: 

- Temporal information, including the past week's information of ports and vessels, such as shortage on port and remaining space on vessel. 

- Spatial information, including the features of the related downstream ports.    

In [1]:
import numpy as np
from maro.rl import StateShaper


class CIMStateShaper(StateShaper):
    """Turn a decision event and the environment snapshots into a model input state.

    The state is the concatenation of port features (the deciding port followed
    by the vessel's future stops, queried over a window of recent ticks) and the
    arriving vessel's features at the decision tick.

    Args:
        look_back: Controls how many past ticks are included in the port query.
        max_ports_downstream: Number of downstream ports assumed by ``dim``.
        port_attributes: Port attribute names to query from the snapshots.
        vessel_attributes: Vessel attribute names to query from the snapshots.
    """

    def __init__(self, *, look_back, max_ports_downstream, port_attributes, vessel_attributes):
        super().__init__()
        self._look_back = look_back
        self._max_ports_downstream = max_ports_downstream
        self._port_attributes = port_attributes
        self._vessel_attributes = vessel_attributes
        port_feature_count = (look_back + 1) * (max_ports_downstream + 1) * len(port_attributes)
        self._dim = port_feature_count + len(vessel_attributes)

    @property
    def dim(self):
        """Declared length of the state vector, as computed in ``__init__``."""
        return self._dim

    def __call__(self, decision_event, snapshot_list):
        """Return ``(agent_id, state)`` for the port that has to decide.

        The agent id is the string form of the deciding port's index.
        """
        tick = decision_event.tick
        port_idx = decision_event.port_idx
        vessel_idx = decision_event.vessel_idx

        # NOTE(review): this window holds ``look_back - 1`` ticks, whereas ``dim``
        # is computed with ``look_back + 1``; confirm that the length of the
        # queried features really equals ``dim`` for the topology in use.
        ticks = [tick - offset for offset in range(self._look_back - 1)]

        future_port_idx_list = snapshot_list["vessels"][tick: vessel_idx: 'future_stop_list'].astype('int')
        queried_ports = [port_idx] + list(future_port_idx_list)
        port_features = snapshot_list["ports"][ticks: queried_ports: self._port_attributes]
        vessel_features = snapshot_list["vessels"][tick: vessel_idx: self._vessel_attributes]

        return str(port_idx), np.concatenate((port_features, vessel_features))

## [Action Shaper](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#shapers)

Action shaper is used to convert an agent's model output to an environment executable action. For this specific scenario, the output is a discrete index that corresponds to a percentage indicating the fraction of containers to be loaded to or discharged from the arriving vessel.

In [2]:
from maro.rl import ActionShaper
from maro.simulator.scenarios.cim.common import Action


class CIMActionShaper(ActionShaper):
    """Translate a discrete model output into an executable CIM ``Action``.

    Args:
        action_space: Sequence of fractions; it must contain ``0``, whose
            position marks the "do nothing" action.
    """

    def __init__(self, action_space):
        super().__init__()
        self._action_space = action_space
        self._zero_action_index = action_space.index(0)

    def __call__(self, model_action, decision_event, snapshot_list):
        """Build the ``Action`` for the vessel/port pair of ``decision_event``.

        ``model_action`` is an index into the action space. Indices below the
        zero action take the loading branch, indices above it take the
        discharging branch, and the zero action itself moves nothing.
        """
        assert 0 <= model_action < len(self._action_space)

        scope = decision_event.action_scope
        tick = decision_event.tick
        port_idx = decision_event.port_idx
        vessel_idx = decision_event.vessel_idx

        port_snapshot = snapshot_list["ports"][tick: port_idx: ["empty", "full", "on_shipper", "on_consignee"]]
        port_empty = port_snapshot[0]
        vessel_snapshot = snapshot_list["vessels"][tick: vessel_idx: ["empty", "full", "remaining_space"]]
        vessel_remaining_space = vessel_snapshot[2]
        early_discharge = snapshot_list["vessels"][tick:vessel_idx: "early_discharge"][0]

        if model_action == self._zero_action_index:
            return Action(vessel_idx, port_idx, 0)

        fraction = self._action_space[model_action]
        if model_action < self._zero_action_index:
            # The number of loaded containers must not exceed the vessel's
            # remaining space, hence the bound of -vessel_remaining_space.
            actual_action = max(round(fraction * port_empty), -vessel_remaining_space)
        else:
            # In the case of an early discharge event, the early discharge
            # amount is subtracted from the expected discharge quota.
            plan_action = fraction * (scope.discharge + early_discharge) - early_discharge
            if plan_action > 0:
                actual_action = round(plan_action)
            else:
                actual_action = round(fraction * scope.discharge)

        return Action(vessel_idx, port_idx, actual_action)

## [Experience Shaper](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#shapers)

Experience shaper is used to convert an episode trajectory to trainable experiences for RL agents. For this specific scenario, the reward is a linear combination of fulfillment and shortage in a limited time window.

In [3]:
from collections import defaultdict

from maro.rl import ExperienceShaper


class TruncatedExperienceShaper(ExperienceShaper):
    """Convert an episode trajectory into per-agent training experiences.

    The reward of a transition is a time-decayed linear combination of the
    fulfillment and shortage recorded in the snapshots over a limited window of
    ticks following the decision.

    Args:
        time_window: The reward looks at ticks ``tick + 1`` up to (but not
            including) ``tick + time_window``.
        time_decay_factor: Base of the per-tick decay; the i-th tick of the
            window is weighted by ``time_decay_factor ** i``.
        fulfillment_factor: Weight of the decayed fulfillment total.
        shortage_factor: Weight subtracted for the decayed shortage total.
    """

    def __init__(self, *, time_window: int, time_decay_factor: float, fulfillment_factor: float,
                 shortage_factor: float):
        super().__init__(reward_func=None)
        self._time_window = time_window
        self._time_decay_factor = time_decay_factor
        self._fulfillment_factor = fulfillment_factor
        self._shortage_factor = shortage_factor

    def __call__(self, trajectory, snapshot_list):
        """Group the transitions of ``trajectory`` into experiences keyed by agent id.

        Each transition is read through the keys ``"agent_id"``, ``"state"``,
        ``"action"`` and ``"event"``. The last transition is skipped because it
        has no successor to supply a ``next_state``.

        Returns:
            A dict mapping agent id to a ``defaultdict(list)`` with the columns
            ``"state"``, ``"action"``, ``"reward"`` and ``"next_state"``.
        """
        experiences_by_agent = {}
        for idx in range(len(trajectory) - 1):
            transition = trajectory[idx]
            successor = trajectory[idx + 1]
            experiences = experiences_by_agent.setdefault(transition["agent_id"], defaultdict(list))
            experiences["state"].append(transition["state"])
            experiences["action"].append(transition["action"])
            experiences["reward"].append(self._compute_reward(transition["event"], snapshot_list))
            experiences["next_state"].append(successor["state"])

        return experiences_by_agent

    def _compute_reward(self, decision_event, snapshot_list):
        """Return the truncated, time-decayed reward for ``decision_event`` as a float."""
        start_tick = decision_event.tick + 1
        end_tick = decision_event.tick + self._time_window
        num_ticks = end_tick - start_tick
        ticks = list(range(start_tick, end_tick))

        # Calculate truncate reward.
        future_fulfillment = snapshot_list["ports"][ticks::"fulfillment"]
        future_shortage = snapshot_list["ports"][ticks::"shortage"]

        # One decay weight per returned value: the weight of tick i is repeated
        # for every value belonging to that tick.
        # NOTE(review): this presumes the query result is flat and tick-major
        # across ports -- confirm against the snapshot list API.
        values_per_tick = future_fulfillment.shape[0] // num_ticks
        decay_list = [self._time_decay_factor ** i for i in range(num_ticks)
                      for _ in range(values_per_tick)]

        tot_fulfillment = np.dot(future_fulfillment, decay_list)
        tot_shortage = np.dot(future_shortage, decay_list)

        # ``np.float`` was only an alias of the builtin ``float``; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
        return float(self._fulfillment_factor * tot_fulfillment - self._shortage_factor * tot_shortage)

## [Agent](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#agent)

For this scenario, the agent is the abstraction of a port. We choose DQN as our underlying learning algorithm with a TD-error-based sampling mechanism.  

In [4]:
from maro.rl import AbsAgent, ColumnBasedStore


class CIMAgent(AbsAgent):
    """Port agent that trains its algorithm on batches drawn from an experience pool.

    Args:
        name: Agent name, forwarded to ``AbsAgent``.
        algorithm: Learning algorithm, forwarded to ``AbsAgent``.
        experience_pool: Store holding the agent's experiences.
        min_experiences_to_train: Training is skipped while the pool holds
            fewer experiences than this.
        num_batches: Number of batches sampled per ``train`` call.
        batch_size: Number of experiences requested per batch.
    """

    def __init__(self, name, algorithm, experience_pool: ColumnBasedStore, min_experiences_to_train,
                 num_batches, batch_size):
        super().__init__(name, algorithm, experience_pool)
        self._min_experiences_to_train = min_experiences_to_train
        self._num_batches = num_batches
        self._batch_size = batch_size

    def train(self):
        """Run ``num_batches`` training steps, writing each batch's loss back to the pool."""
        pool_is_too_small = len(self._experience_pool) < self._min_experiences_to_train
        if pool_is_too_small:
            return

        columns = ("state", "action", "reward", "next_state")
        for _ in range(self._num_batches):
            # Batches are sampled using the stored "loss" column as the key.
            indexes, sample = self._experience_pool.sample_by_key("loss", self._batch_size)
            state, action, reward, next_state = [np.asarray(sample[column]) for column in columns]
            loss = self._algorithm.train(state, action, reward, next_state)
            # Refresh the sampling key of the experiences that were just used.
            self._experience_pool.update(indexes, {"loss": loss})

## [Agent Manager](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#agent-manager)

The complexities of the environment can be isolated from the learning algorithm by using an AgentManager to manage individual agents. We define a function to create the agents and an agent manager class that implements the ``train`` method where the newly obtained experiences are stored in the agents' experience pools before training, in accordance with the DQN algorithm.

In [5]:
import io
import yaml

import torch.nn as nn
from torch.nn.functional import smooth_l1_loss
from torch.optim import RMSprop

from maro.rl import SimpleAgentManager, LearningModel, FullyConnectedNet, DQN, DQNHyperParams, ColumnBasedStore


# Width of the network input. It equals CIMStateShaper.dim for the shaper
# settings used in the main loop: (look_back + 1) * (max_ports_downstream + 1)
# * 7 port attributes + 3 vessel attributes.
input_dim = (7 + 1) * (2 + 1) * 7 + 3
# Number of discrete actions, i.e. the network output width; the main loop also
# uses it as the number of points in the action space.
num_actions = 21



def create_dqn_agents(agent_id_list):
    """Create one DQN-backed ``CIMAgent`` per agent id.

    Every agent gets its own network, DQN algorithm instance and experience
    pool; nothing is shared between agents.

    Args:
        agent_id_list: Iterable of agent ids; each id is used both as the
            dictionary key and as the agent's name.

    Returns:
        A dict mapping agent id to its ``CIMAgent``.
    """

    def build_agent(agent_id):
        # Fully connected network mapping a state vector to one value per action.
        q_network = FullyConnectedNet(
            name=f'{agent_id}.policy',
            input_dim=input_dim,
            output_dim=num_actions,
            activation=nn.LeakyReLU,
            hidden_dims=[256, 128, 64],
            softmax_enabled=False,
            batch_norm_enabled=True,
            dropout_p=.0
        )
        dqn_settings = DQNHyperParams(
            num_actions=num_actions,
            reward_decay=.0,
            target_update_frequency=5,
            tau=0.1
        )
        dqn = DQN(
            eval_model=LearningModel(decision_layers=q_network),
            optimizer_cls=RMSprop,
            optimizer_params={"lr": 0.05},
            loss_func=nn.functional.smooth_l1_loss,
            hyper_params=dqn_settings
        )
        return CIMAgent(
            name=agent_id,
            algorithm=dqn,
            experience_pool=ColumnBasedStore(),
            min_experiences_to_train=1024,
            num_batches=10,
            batch_size=128
        )

    return {agent_id: build_agent(agent_id) for agent_id in agent_id_list}


class DQNAgentManager(SimpleAgentManager):
    """Agent manager that stores new experiences before training every agent."""

    def train(self, experiences_by_agent, performance=None):
        """Store the new experiences per agent, then train all managed agents.

        Args:
            experiences_by_agent: Mapping from agent id to a column dict of
                experiences; a ``"loss"`` column is added to each in place.
            performance: Accepted for interface compatibility; not used here.
        """
        self._assert_train_mode()

        # store experiences for each agent
        for agent_id, exp in experiences_by_agent.items():
            first_column = list(exp.values())[0]
            # Every new experience starts with a very large loss value.
            # NOTE(review): presumably so that loss-keyed sampling favours
            # fresh experiences -- confirm against the store's sampling rule.
            initial_losses = [1e8] * len(first_column)
            exp.update(loss=initial_losses)
            self.agent_dict[agent_id].store_experiences(exp)

        for agent in self.agent_dict.values():
            agent.train()

## Main Loop with [Actor and Learner](https://maro.readthedocs.io/en/latest/key_components/rl_toolkit.html#learner-and-actor)

This code cell demonstrates the typical workflow of a learning policy's interaction with a MARO environment. 

- Initialize an environment with specific scenario and topology parameters. 

- Define scenario-specific components, e.g. shapers. 

- Create agents and an agent manager. 

- Create an actor and a learner to start the training process in which the agent manager interacts with the environment for collecting experiences and updating policies. 

In [6]:
from maro.simulator import Env
from maro.rl import SimpleLearner, SimpleActor, AgentManagerMode, TwoPhaseLinearExplorer
from maro.utils import Logger, LogFormat

# Step 1: initialize a CIM environment that uses a toy dataset.
env = Env("cim", "toy.4p_ssdd_l0.0", durations=1120)
agent_id_list = list(map(str, env.agent_idx_list))

# Step 2: create the state, action and experience shapers. An explorer is also
# needed (it is created in step 4) because of the greedy nature of the DQN
# algorithm.
state_shaper = CIMStateShaper(
    look_back=7,
    max_ports_downstream=2,
    port_attributes=[
        "empty", "full", "on_shipper", "on_consignee", "booking", "shortage", "fulfillment",
    ],
    vessel_attributes=["empty", "full", "remaining_space"],
)

# The action space is an evenly spaced grid of fractions from -1.0 to 1.0.
action_shaper = CIMActionShaper(action_space=list(np.linspace(-1.0, 1.0, num_actions)))

experience_shaper = TruncatedExperienceShaper(
    time_window=100,
    time_decay_factor=0.97,
    fulfillment_factor=1.0,
    shortage_factor=1.0,
)

# Step 3: create an agent manager.
agent_manager = DQNAgentManager(
    name="cim_learner",
    mode=AgentManagerMode.TRAIN_INFERENCE,
    agent_dict=create_dqn_agents(agent_id_list),
    state_shaper=state_shaper,
    action_shaper=action_shaper,
    experience_shaper=experience_shaper,
)

# Step 4: create an actor and a learner, then start the training process.
actor = SimpleActor(env, agent_manager)
learner = SimpleLearner(
    trainable_agents=agent_manager,
    actor=actor,
    explorer=TwoPhaseLinearExplorer(start_eps=0.4, mid_eps=0.32, end_eps=0.0, split_point=0.5),
    logger=Logger("single_host_cim_learner", format_=LogFormat.simple, auto_timestamp=False),
)

learner.train(max_episode=100)

08:19:40 | single_host_cim_learner | INFO | ep 0 - performance: {'order_requirements': 2240000, 'container_shortage': 1449174, 'operation_number': 2877166}, epsilons: {'0': 0.4, '1': 0.4, '2': 0.4, '3': 0.4}
08:19:44 | single_host_cim_learner | INFO | ep 1 - performance: {'order_requirements': 2240000, 'container_shortage': 1433716, 'operation_number': 2572894}, epsilons: {'0': 0.3983838383838384, '1': 0.3983838383838384, '2': 0.3983838383838384, '3': 0.3983838383838384}
08:19:48 | single_host_cim_learner | INFO | ep 2 - performance: {'order_requirements': 2240000, 'container_shortage': 1508563, 'operation_number': 2738400}, epsilons: {'0': 0.39676767676767677, '1': 0.39676767676767677, '2': 0.39676767676767677, '3': 0.39676767676767677}
08:19:53 | single_host_cim_learner | INFO | ep 3 - performance: {'order_requirements': 2240000, 'container_shortage': 1489816, 'operation_number': 2809858}, epsilons: {'0': 0.3951515151515152, '1': 0.3951515151515152, '2': 0.3951515151515152, '3': 0.39

08:22:02 | single_host_cim_learner | INFO | ep 31 - performance: {'order_requirements': 2240000, 'container_shortage': 589462, 'operation_number': 4166597}, epsilons: {'0': 0.34989898989898993, '1': 0.34989898989898993, '2': 0.34989898989898993, '3': 0.34989898989898993}
08:22:06 | single_host_cim_learner | INFO | ep 32 - performance: {'order_requirements': 2240000, 'container_shortage': 651137, 'operation_number': 3812302}, epsilons: {'0': 0.3482828282828283, '1': 0.3482828282828283, '2': 0.3482828282828283, '3': 0.3482828282828283}
08:22:11 | single_host_cim_learner | INFO | ep 33 - performance: {'order_requirements': 2240000, 'container_shortage': 506324, 'operation_number': 4250385}, epsilons: {'0': 0.3466666666666667, '1': 0.3466666666666667, '2': 0.3466666666666667, '3': 0.3466666666666667}
08:22:16 | single_host_cim_learner | INFO | ep 34 - performance: {'order_requirements': 2240000, 'container_shortage': 656772, 'operation_number': 4013839}, epsilons: {'0': 0.34505050505050505

08:24:30 | single_host_cim_learner | INFO | ep 62 - performance: {'order_requirements': 2240000, 'container_shortage': 427942, 'operation_number': 3986817}, epsilons: {'0': 0.23919191919191918, '1': 0.23919191919191918, '2': 0.23919191919191918, '3': 0.23919191919191918}
08:24:34 | single_host_cim_learner | INFO | ep 63 - performance: {'order_requirements': 2240000, 'container_shortage': 349003, 'operation_number': 4169958}, epsilons: {'0': 0.23272727272727273, '1': 0.23272727272727273, '2': 0.23272727272727273, '3': 0.23272727272727273}
08:24:39 | single_host_cim_learner | INFO | ep 64 - performance: {'order_requirements': 2240000, 'container_shortage': 443412, 'operation_number': 4040920}, epsilons: {'0': 0.22626262626262622, '1': 0.22626262626262622, '2': 0.22626262626262622, '3': 0.22626262626262622}
08:24:44 | single_host_cim_learner | INFO | ep 65 - performance: {'order_requirements': 2240000, 'container_shortage': 491950, 'operation_number': 3867277}, epsilons: {'0': 0.219797979

08:27:03 | single_host_cim_learner | INFO | ep 93 - performance: {'order_requirements': 2240000, 'container_shortage': 54462, 'operation_number': 4504207}, epsilons: {'0': 0.03878787878787875, '1': 0.03878787878787875, '2': 0.03878787878787875, '3': 0.03878787878787875}
08:27:08 | single_host_cim_learner | INFO | ep 94 - performance: {'order_requirements': 2240000, 'container_shortage': 17770, 'operation_number': 4507162}, epsilons: {'0': 0.032323232323232316, '1': 0.032323232323232316, '2': 0.032323232323232316, '3': 0.032323232323232316}
08:27:13 | single_host_cim_learner | INFO | ep 95 - performance: {'order_requirements': 2240000, 'container_shortage': 31159, 'operation_number': 4457436}, epsilons: {'0': 0.025858585858585883, '1': 0.025858585858585883, '2': 0.025858585858585883, '3': 0.025858585858585883}
08:27:18 | single_host_cim_learner | INFO | ep 96 - performance: {'order_requirements': 2240000, 'container_shortage': 29988, 'operation_number': 4428244}, epsilons: {'0': 0.01939