# State Shaper

A state shaper converts an environment observation into a state vector that serves as input to value or policy models, extracting the relevant temporal and spatial information. The scenario-specific call method returns the ID of the agent involved in the current decision event together with the shaped state.

In [None]:
import numpy as np
from maro.rl import StateShaper


class CIMStateShaper(StateShaper):
    """Shape a CIM decision event and the snapshot list into a flat state vector.

    The state is the concatenation of the queried port attributes (for the
    deciding port followed by the vessel's ``future_stop_list`` ports, over a
    short window of ticks ending at the decision tick) and the attributes of
    the deciding vessel at the decision tick.
    """

    def __init__(self, *, look_back, max_ports_downstream, port_attributes, vessel_attributes):
        """Store the shaping configuration and precompute the advertised state size.

        Args:
            look_back: History-length setting. ``__call__`` queries
                ``look_back - 1`` ticks: the decision tick and the ticks
                immediately before it.
            max_ports_downstream: Number of downstream ports assumed when
                computing ``dim``.
            port_attributes: Names of the port attributes to query.
            vessel_attributes: Names of the vessel attributes to query.
        """
        super().__init__()
        self._look_back = look_back
        self._max_ports_downstream = max_ports_downstream
        self._port_attributes = port_attributes
        self._vessel_attributes = vessel_attributes
        # NOTE(review): ``dim`` is derived from ``look_back + 1`` and
        # ``max_ports_downstream + 1`` whereas ``__call__`` queries
        # ``look_back - 1`` ticks and ``1 + len(future_stop_list)`` ports;
        # whether the two sizes agree depends on the scenario's
        # ``future_stop_list`` length -- confirm against the simulator.
        port_feature_count = (look_back + 1) * (max_ports_downstream + 1) * len(port_attributes)
        self._dim = port_feature_count + len(vessel_attributes)

    def __call__(self, decision_event, snapshot_list):
        """Build the state for one decision event.

        Args:
            decision_event: Event exposing ``tick``, ``port_idx`` and ``vessel_idx``.
            snapshot_list: Snapshot container indexed first by node type
                (``"ports"`` / ``"vessels"``) and then by a
                ``[ticks: node indices: attributes]`` slice.

        Returns:
            A ``(agent_id, state)`` pair where ``agent_id`` is ``str(port_idx)``
            and ``state`` is the concatenated feature array.
        """
        tick = decision_event.tick
        port_idx = decision_event.port_idx
        vessel_idx = decision_event.vessel_idx

        # Decision tick first, then progressively older ticks.
        history_ticks = [tick - offset for offset in range(self._look_back - 1)]

        # Ports the vessel will visit next, as integer node indices.
        upcoming_ports = snapshot_list["vessels"][tick: vessel_idx: 'future_stop_list'].astype('int')
        ports_of_interest = [port_idx, *upcoming_ports]

        port_features = snapshot_list["ports"][history_ticks: ports_of_interest: self._port_attributes]
        vessel_features = snapshot_list["vessels"][tick: vessel_idx: self._vessel_attributes]

        shaped_state = np.concatenate([port_features, vessel_features])
        return str(port_idx), shaped_state

    @property
    def dim(self):
        """Advertised length of the state vector, as computed in ``__init__``."""
        return self._dim

# Action Shaper

An action shaper is used to convert an agent's output to an Action object which can be executed by the env's step() method.

In [None]:
from maro.rl import ActionShaper
from maro.simulator.scenarios.cim.common import Action


class CIMActionShaper(ActionShaper):
    """Translate a discrete model output into an executable CIM ``Action``.

    The model output is an index into ``action_space``. The entry at that
    index is a signed fraction which is scaled into a container quantity
    using the current port and vessel snapshots.
    """

    def __init__(self, action_space):
        """Remember the action space and locate its zero (no-op) entry.

        Args:
            action_space: Sequence of signed fractions; it must contain ``0``
                (``action_space.index(0)`` raises ``ValueError`` otherwise).
        """
        super().__init__()
        self._action_space = action_space
        self._zero_action_index = action_space.index(0)

    def __call__(self, model_action, decision_event, snapshot_list):
        """Convert ``model_action`` into an ``Action`` for the deciding vessel and port.

        Args:
            model_action: Index into the action space chosen by the model.
            decision_event: Event exposing ``action_scope``, ``tick``,
                ``port_idx`` and ``vessel_idx``.
            snapshot_list: Snapshot container queried for port and vessel attributes.

        Returns:
            ``Action(vessel_idx, port_idx, quantity)``. The quantity is
            non-positive for indices below the zero-action index, ``0`` at the
            zero-action index, and derived from the discharge scope above it.
        """
        assert 0 <= model_action < len(self._action_space)

        scope = decision_event.action_scope
        tick = decision_event.tick
        port_idx = decision_event.port_idx
        vessel_idx = decision_event.vessel_idx

        # Only one entry of each query result is used below.
        port_values = snapshot_list["ports"][tick: port_idx: ["empty", "full", "on_shipper", "on_consignee"]]
        port_empty = port_values[0]
        vessel_values = snapshot_list["vessels"][tick: vessel_idx: ["empty", "full", "remaining_space"]]
        vessel_remaining_space = vessel_values[2]
        early_discharge = snapshot_list["vessels"][tick:vessel_idx: "early_discharge"][0]

        if model_action < self._zero_action_index:
            # Negative fraction of the port's empties, bounded below by the
            # negated remaining space on the vessel.
            requested = round(self._action_space[model_action] * port_empty)
            actual_action = max(requested, -vessel_remaining_space)
        elif model_action > self._zero_action_index:
            fraction = self._action_space[model_action]
            plan_action = fraction * (scope.discharge + early_discharge) - early_discharge
            if plan_action > 0:
                actual_action = round(plan_action)
            else:
                # The early discharge already covers the plan; fall back to a
                # plain fraction of the discharge scope.
                actual_action = round(fraction * scope.discharge)
        else:
            actual_action = 0

        return Action(vessel_idx, port_idx, actual_action)

In [None]:
from collections import defaultdict

from maro.rl import ExperienceShaper


class TruncatedExperienceShaper(ExperienceShaper):
    """Convert a trajectory into per-agent experiences with a truncated, decayed reward.

    The reward of a transition is computed from the port ``fulfillment`` and
    ``shortage`` recorded over a fixed window of ticks after the decision
    tick, with an exponential time decay applied.
    """

    def __init__(self, *, time_window: int, time_decay_factor: float, fulfillment_factor: float,
                 shortage_factor: float):
        """Store the reward configuration.

        Args:
            time_window: Look-ahead setting; the reward covers ticks
                ``tick + 1`` through ``tick + time_window - 1`` inclusive.
            time_decay_factor: Base of the per-tick exponential decay.
            fulfillment_factor: Weight of the decayed fulfillment total.
            shortage_factor: Weight of the decayed shortage total (subtracted).
        """
        super().__init__(reward_func=None)
        self._time_window = time_window
        self._time_decay_factor = time_decay_factor
        self._fulfillment_factor = fulfillment_factor
        self._shortage_factor = shortage_factor

    def __call__(self, trajectory, snapshot_list):
        """Group the trajectory's transitions into experience columns per agent.

        Args:
            trajectory: Sequence of transitions, each a mapping with the keys
                ``"agent_id"``, ``"state"``, ``"action"`` and ``"event"``.
            snapshot_list: Snapshot container used to compute rewards.

        Returns:
            A dict mapping agent ID to a ``defaultdict(list)`` with the columns
            ``"state"``, ``"action"``, ``"reward"`` and ``"next_state"``.
        """
        experiences_by_agent = {}
        # The final transition has no successor state, so it yields no experience.
        for i in range(len(trajectory) - 1):
            transition = trajectory[i]
            agent_id = transition["agent_id"]
            if agent_id not in experiences_by_agent:
                experiences_by_agent[agent_id] = defaultdict(list)

            experiences = experiences_by_agent[agent_id]
            experiences["state"].append(transition["state"])
            experiences["action"].append(transition["action"])
            experiences["reward"].append(self._compute_reward(transition["event"], snapshot_list))
            # The successor is the next transition in the trajectory, whichever agent it belongs to.
            experiences["next_state"].append(trajectory[i+1]["state"])

        return experiences_by_agent

    def _compute_reward(self, decision_event, snapshot_list):
        """Return the decayed fulfillment-minus-shortage reward for one decision event.

        Args:
            decision_event: Event exposing ``tick``.
            snapshot_list: Snapshot container queried for port ``fulfillment``
                and ``shortage`` over the look-ahead ticks.

        Returns:
            The reward as a built-in ``float``.
        """
        start_tick = decision_event.tick + 1
        end_tick = decision_event.tick + self._time_window
        ticks = list(range(start_tick, end_tick))

        # calculate tc reward
        future_fulfillment = snapshot_list["ports"][ticks::"fulfillment"]
        future_shortage = snapshot_list["ports"][ticks::"shortage"]
        # One decay weight per returned value: the weight for tick offset i is
        # repeated once for every value belonging to that tick.
        # NOTE(review): assumes the flattened query result is ordered tick by
        # tick -- confirm against the snapshot list implementation.
        decay_list = [self._time_decay_factor ** i for i in range(end_tick - start_tick)
                      for _ in range(future_fulfillment.shape[0]//(end_tick-start_tick))]

        tot_fulfillment = np.dot(future_fulfillment, decay_list)
        tot_shortage = np.dot(future_shortage, decay_list)

        # ``np.float`` was only an alias of the built-in ``float``; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so use the built-in.
        return float(self._fulfillment_factor * tot_fulfillment - self._shortage_factor * tot_shortage)

# Agent

An agent is a combination of an (RL) algorithm, an experience pool, and a set of non-algorithm-specific parameters (algorithm-specific parameters are managed by the algorithm module). The non-algorithm-specific parameters control experience storage, sampling strategies, and training strategies. Since all scenario-specific logic is handled by the agent manager, the agent itself is scenario-agnostic.

In [None]:
from maro.rl import AbsAgent, ColumnBasedStore


class CIMAgent(AbsAgent):
    """Agent that trains its algorithm on batches sampled from its experience pool."""

    def __init__(self, name, algorithm, experience_pool: ColumnBasedStore, min_experiences_to_train,
                 num_batches, batch_size):
        """Set up the agent and its training schedule.

        Args:
            name: Agent name, forwarded to the base class.
            algorithm: Learning algorithm, forwarded to the base class.
            experience_pool: Experience store, forwarded to the base class.
            min_experiences_to_train: Pool size below which ``train`` does nothing.
            num_batches: Number of batches sampled per ``train`` call.
            batch_size: Number of experiences per sampled batch.
        """
        super().__init__(name, algorithm, experience_pool)
        self._min_experiences_to_train = min_experiences_to_train
        self._num_batches = num_batches
        self._batch_size = batch_size

    def train(self):
        """Run ``num_batches`` training steps once the pool holds enough experiences.

        Each batch is sampled by the ``"loss"`` column; after the step, the
        sampled entries' ``"loss"`` values are overwritten with the loss
        returned by the algorithm.
        """
        pool = self._experience_pool
        if len(pool) < self._min_experiences_to_train:
            return

        columns = ("state", "action", "reward", "next_state")
        for _ in range(self._num_batches):
            indexes, sample = pool.sample_by_key("loss", self._batch_size)
            state, action, reward, next_state = (np.asarray(sample[key]) for key in columns)
            loss = self._algorithm.train(state, action, reward, next_state)
            pool.update(indexes, {"loss": loss})

# Agent Manager

An agent manager manages all agents and provides a unified interface with the environment. The agent manager is responsible for both inference and training. It is composed of a state shaper, an action shaper and an experience shaper which perform necessary conversions so that the underlying agents do not need to concern themselves with the business logic. 

In [None]:
import io
import yaml

from torch.nn.functional import smooth_l1_loss
from torch.optim import RMSprop

from maro.rl import AbsAgentManager, LearningModel, MLPDecisionLayers, DQN, DQNHyperParams, ColumnBasedStore


# Size of the discrete action space: used as the model's output dimension, as
# the DQN ``num_actions`` hyper-parameter, and as the number of points in the
# action shaper's ``np.linspace(-1.0, 1.0, num_actions)`` action space.
num_actions = 21


class DQNAgentManager(AbsAgentManager):
    """Agent manager that gives every agent its own DQN algorithm and experience pool."""

    def _assemble(self, agent_dict):
        """Populate ``agent_dict`` with one ``CIMAgent`` per ID in ``self._agent_id_list``.

        Args:
            agent_dict: Mapping filled in place, keyed by agent ID.
        """
        for agent_id in self._agent_id_list:
            decision_layers = MLPDecisionLayers(
                name=f'{agent_id}.policy',
                input_dim=self._state_shaper.dim,
                output_dim=num_actions,
                hidden_dims=[256, 128, 64],
                dropout_p=.0,
            )
            eval_model = LearningModel(decision_layers=decision_layers)

            hyper_params = DQNHyperParams(
                num_actions=num_actions,
                reward_decay=.0,
                num_training_rounds_per_target_replacement=5,
                tau=0.1,
            )
            algorithm = DQN(
                model_dict={"eval": eval_model},
                optimizer_opt=(RMSprop, {"lr": 0.05}),
                loss_func_dict={"eval": smooth_l1_loss},
                hyper_params=hyper_params,
            )

            experience_pool = ColumnBasedStore()

            agent_dict[agent_id] = CIMAgent(
                name=agent_id,
                algorithm=algorithm,
                experience_pool=experience_pool,
                min_experiences_to_train=1024,
                num_batches=10,
                batch_size=128,
            )

    def store_experiences(self, experiences):
        """Add an initial ``"loss"`` column to each agent's experiences and store them.

        Args:
            experiences: Mapping from agent ID to a mapping of equally long
                experience columns; each inner mapping is modified in place.
        """
        for agent_id, exp in experiences.items():
            # Every new experience starts with the same large loss value; the
            # column length is taken from the first existing column.
            first_column = exp[next(iter(exp))]
            exp.update({"loss": [1e8] * len(first_column)})
            self._agent_dict[agent_id].store_experiences(exp)

# Main Loop

The code below demonstrates the typical structure of a program using MARO. One starts by creating an environment. Next, the shapers and an explorer are created, followed by an agent manager that loads these components. Creating the agent manager also assembles all agents under the hood. Because the code is for the single-host mode, the agent manager mode is set to TRAIN_INFERENCE. Next, an actor is created to wrap the env and the agent manager, and a learner is created to wrap the same agent manager and the actor. Finally, the task is started by calling the learner's train() method.

In [None]:
from maro.simulator import Env
from maro.rl import SimpleLearner, SimpleActor, AgentMode, TwoPhaseLinearExplorer
from maro.utils import Logger


# Environment for the "cim" scenario on the "toy.4p_ssdd_l0.0" topology.
# NOTE(review): durations=1120 is presumably the number of ticks per episode -- confirm.
env = Env("cim", "toy.4p_ssdd_l0.0", durations=1120)
total_episodes = 100
# Agent IDs are the environment's agent indices as strings; the state shaper
# returns str(port_idx) as the agent ID of each decision event.
agent_id_list = [str(agent_id) for agent_id in env.agent_idx_list]

state_shaper = CIMStateShaper(look_back=7, max_ports_downstream=2, 
                              port_attributes=["empty", "full", "on_shipper", "on_consignee", 
                                               "booking", "shortage", "fulfillment"],
                              vessel_attributes=["empty", "full", "remaining_space"]
                             )

# Action space: num_actions evenly spaced fractions in [-1.0, 1.0].
action_shaper = CIMActionShaper(action_space=list(np.linspace(-1.0, 1.0, num_actions)))

experience_shaper = TruncatedExperienceShaper(time_window=100, fulfillment_factor=1.0, shortage_factor=1.0,
                                              time_decay_factor=0.97)

# The "_all_" key presumably applies the same exploration settings to every agent -- confirm.
explorer = TwoPhaseLinearExplorer(agent_id_list, total_episodes, 
                                  epsilon_range_dict={"_all_": (.0, .4)},
                                  split_point_dict={"_all_": (.5, .8)},
                                  with_cache=True)

# Single-host mode: the same agent manager does both training and inference.
agent_manager = DQNAgentManager(name="cim_learner",
                                mode=AgentMode.TRAIN_INFERENCE,
                                agent_id_list=agent_id_list,
                                state_shaper=state_shaper,
                                action_shaper=action_shaper,
                                experience_shaper=experience_shaper,
                                explorer=explorer)

# The actor wraps the env and the agent manager; the learner wraps the same agent manager and the actor.
learner = SimpleLearner(trainable_agents=agent_manager, actor=SimpleActor(env, agent_manager),
                        logger=Logger("single_host_cim_learner", auto_timestamp=False))

# Start training for the configured number of episodes.
learner.train(total_episodes)