# state shaper

A state shaper is used to convert an environment observation to a state vector as input to value or policy models by extracting relevant temporal and spatial information. The scenario-specific __call__ method returns the ID of the agent involved in the current decision event and the shaped state.

In [None]:
import numpy as np
from maro.rl import AbstractStateShaper


class ECRStateShaper(AbstractStateShaper):
    """Turn an ECR decision event into an ``(agent_id, state_vector)`` pair.

    Args:
        look_back: Determines how many ticks of port data are queried
            (``look_back - 1`` ticks, starting at the event tick and going back).
        max_ports_downstream: Only used when computing ``dim``; it does not
            limit the ports queried in ``__call__``.
        port_attributes: Port attribute names read from the snapshot list.
        vessel_attributes: Vessel attribute names read from the snapshot list.
    """

    def __init__(self, *, look_back, max_ports_downstream, port_attributes, vessel_attributes):
        super().__init__()
        self._look_back = look_back
        self._max_ports_downstream = max_ports_downstream
        self._port_attributes = port_attributes
        self._vessel_attributes = vessel_attributes

        # Declared state size: a port part plus one value per vessel attribute.
        port_part = (look_back + 1) * (max_ports_downstream + 1) * len(port_attributes)
        self._dim = port_part + len(vessel_attributes)

    def __call__(self, decision_event, snapshot_list):
        """Return ``(str(port_idx), state)`` for the given decision event.

        The state is the concatenation of the queried port features followed by
        the queried vessel features.
        """
        tick = decision_event.tick
        port_idx = decision_event.port_idx
        vessel_idx = decision_event.vessel_idx

        # NOTE(review): this yields ``look_back - 1`` ticks while ``dim`` is computed
        # with ``look_back + 1``; whether the two agree depends on the length of
        # ``future_stop_list`` -- confirm against the topology in use.
        history_ticks = [tick - offset for offset in range(self._look_back - 1)]

        # Ports the vessel is scheduled to visit, as reported by the vessel snapshot.
        downstream_ports = snapshot_list["vessels"][tick: vessel_idx: 'future_stop_list'].astype('int')
        queried_ports = [port_idx] + list(downstream_ports)

        port_features = snapshot_list["ports"][history_ticks: queried_ports: self._port_attributes]
        vessel_features = snapshot_list["vessels"][tick: vessel_idx: self._vessel_attributes]

        return str(port_idx), np.concatenate((port_features, vessel_features))

    @property
    def dim(self):
        """Declared length of the state vector, as computed in ``__init__``."""
        return self._dim

# action shaper

An action shaper is used to convert the output of an underlying algorithm's choose_action() method to an Action object which can be executed by the env's step() method.

In [None]:
from maro.rl import AbstractActionShaper
from maro.simulator.scenarios.ecr.common import Action


class ECRActionShaper(AbstractActionShaper):
    """Map a discrete action index chosen by the model to an ``Action`` for the env.

    Args:
        action_space: Sequence of action values indexed by the model's output.
            It must contain ``0``; the position of that entry is the no-op action.
    """

    def __init__(self, action_space):
        super().__init__()
        self._action_space = action_space
        # Raises ValueError if the action space has no entry equal to 0.
        self._zero_action_index = action_space.index(0)

    def __call__(self, model_action, decision_event, snapshot_list):
        """Return ``Action(vessel_idx, port_idx, quantity)`` for ``model_action``.

        Indices below the zero index scale the port's ``empty`` value and are
        bounded below by minus the vessel's ``remaining_space``; indices above it
        scale the event's discharge scope (adjusted by ``early_discharge``);
        the zero index yields a quantity of 0.
        """
        scope = decision_event.action_scope
        tick = decision_event.tick
        port_idx = decision_event.port_idx
        vessel_idx = decision_event.vessel_idx

        port_empty = snapshot_list["ports"][tick: port_idx: ["empty", "full", "on_shipper", "on_consignee"]][0]
        vessel_remaining_space = snapshot_list["vessels"][tick: vessel_idx: ["empty", "full", "remaining_space"]][2]
        early_discharge = snapshot_list["vessels"][tick:vessel_idx: "early_discharge"][0]
        assert 0 <= model_action < len(self._action_space)

        if model_action == self._zero_action_index:
            quantity = 0
        elif model_action < self._zero_action_index:
            fraction = self._action_space[model_action]
            quantity = max(round(fraction * port_empty), -vessel_remaining_space)
        else:
            fraction = self._action_space[model_action]
            plan_action = fraction * (scope.discharge + early_discharge) - early_discharge
            if plan_action > 0:
                quantity = round(plan_action)
            else:
                # Fall back to scaling the discharge scope alone.
                quantity = round(fraction * scope.discharge)

        return Action(vessel_idx, port_idx, quantity)

# reward shaper

A reward shaper is used to record transitions during a roll-out episode and perform necessary post-processing at the end of the episode. The post-processing logic is encapsulated in the abstract shape() method and needs to be implemented for each scenario. It is necessary to compute rewards and next-states (and also next-actions for SARSA-like on-policy algorithms) during post-processing as they are set to None during the episode. In particular, it is necessary to specify how to determine the reward for an action given the business metrics associated with the corresponding transition. MARO provides the KStepRewardShaper class which may be combined with a user-defined reward function to form a default reward shaper. Here we showcase a custom reward shaper for the ECR scenario.   

In [None]:
import pickle
from maro.rl import AbstractRewardShaper, ExperienceKey, ExperienceInfoKey


class ECRRewardShaper(AbstractRewardShaper):
    """Fill in rewards and successor information for a recorded ECR trajectory.

    Args:
        agent_id_list: Agent ids; its length sets how many times each decay
            weight is repeated when weighting port metrics.
        time_window: Offset added to the event tick to obtain the (exclusive)
            end of the tick range used for the reward.
        time_decay_factor: Base of the per-tick exponential decay weights.
        fulfillment_factor: Weight applied to the decayed fulfillment total.
        shortage_factor: Weight applied to the decayed shortage total.
    """

    def __init__(self, *, agent_id_list, time_window: int, time_decay_factor: float,
                 fulfillment_factor: float, shortage_factor: float):
        super().__init__()
        self._agent_id_list = agent_id_list
        self._time_window = time_window
        self._time_decay_factor = time_decay_factor
        self._fulfillment_factor = fulfillment_factor
        self._shortage_factor = shortage_factor

    def _shape(self, snapshot_list):
        """Set reward, next state, next action and discount on each transition.

        The last recorded transition is skipped because it has no successor
        entry to take the next state and next action from.
        """
        for i in range(len(self._trajectory[ExperienceKey.STATE])-1):
            metrics = self._trajectory[ExperienceKey.INFO][i][ExperienceInfoKey.METRICS]
            # NOTE(review): events are stored pickled in the trajectory; pickle.loads is
            # only safe if the trajectory comes from a trusted source -- confirm.
            event = pickle.loads(self._trajectory[ExperienceKey.INFO][i][ExperienceInfoKey.EVENT])
            self._trajectory[ExperienceKey.REWARD][i] = self._compute_reward(metrics, event, snapshot_list)
            self._trajectory[ExperienceKey.NEXT_STATE][i] = self._trajectory[ExperienceKey.STATE][i+1]
            self._trajectory[ExperienceKey.NEXT_ACTION][i] = self._trajectory[ExperienceKey.ACTION][i+1]
            self._trajectory[ExperienceKey.INFO][i][ExperienceInfoKey.DISCOUNT] = .0

    def _compute_reward(self, metrics, decision_event, snapshot_list):
        """Return the decayed fulfillment-minus-shortage reward as a Python float.

        Args:
            metrics: Business metrics of the transition (not used by this reward).
            decision_event: Event whose ``tick`` anchors the reward window.
            snapshot_list: Snapshot list queried for port "fulfillment" and "shortage".
        """
        start_tick = decision_event.tick + 1
        end_tick = decision_event.tick + self._time_window
        ticks = list(range(start_tick, end_tick))

        # One decay weight per tick offset, repeated once per agent id.
        # NOTE(review): presumably the snapshot query returns values tick by tick with
        # one entry per agent/port, so the weights line up -- confirm.
        decay_list = [self._time_decay_factor ** i for i in range(end_tick - start_tick)
                      for _ in range(len(self._agent_id_list))]

        tot_fulfillment = np.dot(snapshot_list["ports"][ticks::"fulfillment"], decay_list)
        tot_shortage = np.dot(snapshot_list["ports"][ticks::"shortage"], decay_list)

        # Builtin float replaces the removed ``np.float`` alias (same type, works on all NumPy versions).
        return float(self._fulfillment_factor * tot_fulfillment - self._shortage_factor * tot_shortage)


# agent manager

An agent manager manages all agents and provides a unified interface with the environment. It is composed of a state shaper and an action shaper which perform necessary conversions so that the underlying agents do not need to concern themselves with the business logic.

In [None]:
from torch.nn.functional import smooth_l1_loss
from torch.optim import RMSprop

from maro.rl import AgentManager, Agent, AgentParameters, LearningModel, MLPDecisionLayers, DQN, DQNHyperParams, \
    ExperienceInfoKey

# Size of the discrete action space; reused for the model's output_dim and the DQN config.
num_actions = 21

# Keyword arguments forwarded to MLPDecisionLayers.
model_config = dict(hidden_dims=[256, 128, 64], output_dim=num_actions, dropout_p=0.0)

# Keyword arguments paired with the optimizer class handed to DQN.
optimizer_config = dict(lr=0.05)

# Keyword arguments forwarded to DQNHyperParams.
dqn_config = dict(num_actions=num_actions, replace_target_frequency=5, tau=0.1)

# Keyword arguments forwarded to AgentParameters. Each sampler is a
# (key function over an experience-info dict, size) pair.
training_config = dict(
    min_experiences_to_train=1024,
    samplers=[(lambda info: info[ExperienceInfoKey.TD_ERROR], 128)],
    num_steps=10,
)


class DQNAgentManager(AgentManager):
    """Agent manager that gives every agent id its own DQN-backed agent."""

    def _assemble_agents(self):
        """Fill ``self._agent_dict`` with one ``Agent`` per id in ``self._agent_id_list``.

        All agents share a single ``AgentParameters`` instance built from
        ``training_config``; each gets its own model and DQN algorithm.
        """
        shared_params = AgentParameters(**training_config)
        for agent_id in self._agent_id_list:
            # The network input size follows the state shaper's declared dimension.
            decision_layers = MLPDecisionLayers(
                name=f'{agent_id}.policy',
                input_dim=self._state_shaper.dim,
                **model_config
            )
            q_network = LearningModel(decision_layers=decision_layers)

            dqn = DQN(
                model_dict={"eval": q_network},
                optimizer_opt=(RMSprop, optimizer_config),
                loss_func_dict={"eval": smooth_l1_loss},
                hyper_params=DQNHyperParams(**dqn_config),
            )

            self._agent_dict[agent_id] = Agent(name=agent_id, algorithm=dqn, params=shared_params)

# main loop

The code below demonstrates the typical structure of a program using MARO. One starts by creating an environment. Next, shapers and an explorer are created and an agent manager is created by loading these components. The creation of the agent manager also assembles all agents under the hood. Because the code is for the single-host mode, the agent manager mode is set to TRAIN_INFERENCE. Next, an actor is created to wrap the env and agent manager, and a learner is created to wrap the same agent manager and the actor. Finally, the task is started by calling the learner's train_test() method.    

In [None]:
from maro.simulator import Env
from maro.rl import SimpleLearner, SimpleActor, AgentMode, KStepRewardShaper, TwoPhaseLinearExplorer
from maro.utils import Logger, convert_dottable


# Number of episodes; passed to both the explorer and the learner.
total_episodes = 100

# Environment and the agent ids it exposes (as strings).
env = Env(scenario="ecr", topology="toy.5p_ssddd_l0.0", durations=1120)
agent_id_list = list(map(str, env.agent_idx_list))

# Shapers that translate between the environment and the agents.
state_shaper = ECRStateShaper(
    look_back=7,
    max_ports_downstream=2,
    port_attributes=["empty", "full", "on_shipper", "on_consignee", "booking", "shortage", "fulfillment"],
    vessel_attributes=["empty", "full", "remaining_space"],
)
action_shaper = ECRActionShaper(action_space=list(np.linspace(-1.0, 1.0, num_actions)))
reward_shaper = ECRRewardShaper(
    agent_id_list=agent_id_list,
    time_window=100,
    fulfillment_factor=1.0,
    shortage_factor=1.0,
    time_decay_factor=0.97,
)

# Exploration schedule shared by all agents (the "_all_" key).
explorer = TwoPhaseLinearExplorer(
    agent_id_list,
    total_episodes,
    epsilon_range_dict={"_all_": (.0, .4)},
    split_point_dict={"_all_": (.5, .8)},
)

# The same agent manager serves as trainable agents and as inference agents.
agent_manager = DQNAgentManager(
    name="ecr_learner",
    mode=AgentMode.TRAIN_INFERENCE,
    agent_id_list=agent_id_list,
    state_shaper=state_shaper,
    action_shaper=action_shaper,
    reward_shaper=reward_shaper,
    explorer=explorer,
)
learner = SimpleLearner(
    trainable_agents=agent_manager,
    actor=SimpleActor(env=env, inference_agents=agent_manager),
    logger=Logger("single_host_ecr_learner", auto_timestamp=False),
)

learner.train_test(total_episodes)