microsoft · ysqyang · Jan 21, 2021 · Nov 10, 2020 · Nov 10, 2020 · Nov 11, 2020
diff --git a/examples/cim/dqn/components/config.py b/examples/cim/dqn/components/config.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 """
-This file is used to load config and convert it into a dotted dictionary.
+This file is used to load the configuration and convert it into a dotted dictionary.
 """
 
 import io

diff --git a/examples/cim/dqn/single_process_launcher.py b/examples/cim/dqn/single_process_launcher.py
@@ -38,7 +38,6 @@ def launch(config):
 
     # Step 4: Create an actor and a learner to start the training process.
     scheduler = TwoPhaseLinearParameterScheduler(config.main_loop.max_episode, **config.main_loop.exploration)
-
     actor = SimpleActor(env, agent_manager)
     learner = SimpleLearner(
         agent_manager, actor, scheduler,

diff --git a/examples/cim/policy_optimization/README.md b/examples/cim/policy_optimization/README.md
@@ -0,0 +1,22 @@
+# Overview
+
+The CIM problem is one of the quintessential use cases of MARO. The example can
+be run with a set of scenario configurations that can be found under
+maro/simulator/scenarios/cim. General experimental parameters (e.g., type of
+topology, type of algorithm to use, number of training episodes) can be configured
+through config.yml. Each RL formulation has a dedicated folder, e.g., dqn, and
+all algorithm-specific parameters can be configured through
+the config.py file in that folder.
+
+## Single-host Single-process Mode
+
+To run the CIM example using the DQN algorithm under single-host single-process
+mode, go to examples/cim/dqn and run single_process_launcher.py. You may play
+around with the configuration if you want to try out different settings.
+
+## Distributed Mode
+
+The examples/cim/dqn/components folder contains dist_learner.py and dist_actor.py
+for distributed training. For debugging purposes, we provide a script that
+simulates distributed mode using multi-processing. Simply go to examples/cim/dqn
+and run multi_process_launcher.py to start the learner and actor processes.
diff --git a/examples/cim/policy_optimization/components/__init__.py b/examples/cim/policy_optimization/components/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from .action_shaper import CIMActionShaper
+from .agent_manager import POAgentManager, create_po_agents
+from .experience_shaper import TruncatedExperienceShaper
+from .state_shaper import CIMStateShaper
+
+# Names exported by this package; they match the imports above.
+__all__ = [
+    "CIMActionShaper",
+    "POAgentManager", "create_po_agents",
+    "TruncatedExperienceShaper",
+    "CIMStateShaper"
+]
diff --git a/examples/cim/policy_optimization/components/action_shaper.py b/examples/cim/policy_optimization/components/action_shaper.py
@@ -0,0 +1,33 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from maro.rl import ActionShaper
+from maro.simulator.scenarios.cim.common import Action
+
+
+class CIMActionShaper(ActionShaper):
+    def __init__(self, action_space):
+        super().__init__()
+        self._action_space = action_space
+        self._zero_action_index = action_space.index(0)
+
+    def __call__(self, model_action, decision_event, snapshot_list):
+        scope = decision_event.action_scope
+        tick = decision_event.tick
+        port_idx = decision_event.port_idx
+        vessel_idx = decision_event.vessel_idx
+
+        port_empty = snapshot_list["ports"][tick: port_idx: ["empty", "full", "on_shipper", "on_consignee"]][0]
+        vessel_remaining_space = snapshot_list["vessels"][tick: vessel_idx: ["empty", "full", "remaining_space"]][2]
+        early_discharge = snapshot_list["vessels"][tick:vessel_idx: "early_discharge"][0]
+        assert 0 <= model_action < len(self._action_space)
+
+        if model_action < self._zero_action_index:
+            actual_action = max(round(self._action_space[model_action] * port_empty), -vessel_remaining_space)
+        elif model_action > self._zero_action_index:
+            plan_action = self._action_space[model_action] * (scope.discharge + early_discharge) - early_discharge
+            actual_action = round(plan_action) if plan_action > 0 else round(self._action_space[model_action] * scope.discharge)
+        else:
+            actual_action = 0
+
+        return Action(vessel_idx, port_idx, actual_action)
diff --git a/examples/cim/policy_optimization/components/agent_manager.py b/examples/cim/policy_optimization/components/agent_manager.py
@@ -0,0 +1,83 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import numpy as np
+import torch.nn as nn
+from torch.optim import Adam, RMSprop
+
+from maro.rl import (
+    AbsAgent, ActorCritic, ActorCriticConfig, FullyConnectedBlock, LearningModel, NNStack,
+    OptimizerOptions, PolicyGradient, PolicyOptimizationConfig, SimpleAgentManager
+)
+from maro.utils import set_seeds
+
+
+class POAgent(AbsAgent):
+    def train(self, states: np.ndarray, actions: np.ndarray, log_action_prob: np.ndarray, rewards: np.ndarray):
+        self._algorithm.train(states, actions, log_action_prob, rewards)
+
+
+def create_po_agents(agent_id_list, config):
+    input_dim, num_actions = config.input_dim, config.num_actions
+    set_seeds(config.seed)
+    agent_dict = {}
+    for agent_id in agent_id_list:
+        actor_net = NNStack(
+            "actor",
+            FullyConnectedBlock(
+                input_dim=input_dim,
+                output_dim=num_actions,
+                activation=nn.Tanh,
+                is_head=True,
+                **config.actor_model
+            )
+        )
+
+        if config.type == "actor_critic":
+            critic_net = NNStack(
+                "critic",
+                FullyConnectedBlock(
+                    input_dim=config.input_dim,
+                    output_dim=1,
+                    activation=nn.LeakyReLU,
+                    is_head=True,
+                    **config.critic_model
+                )
+            )
+
+            hyper_params = config.actor_critic_hyper_parameters
+            hyper_params.update({"reward_discount": config.reward_discount})
+            learning_model = LearningModel(
+                actor_net, critic_net, 
+                optimizer_options={
+                    "actor": OptimizerOptions(cls=Adam, params=config.actor_optimizer),
+                    "critic": OptimizerOptions(cls=RMSprop, params=config.critic_optimizer)
+                } 
+            )
+            algorithm = ActorCritic(
+                learning_model, ActorCriticConfig(critic_loss_func=nn.SmoothL1Loss(), **hyper_params)
+            )
+        else:
+            learning_model = LearningModel(
+                actor_net, 
+                optimizer_options=OptimizerOptions(cls=Adam, params=config.actor_optimizer)  
+            )
+            algorithm = PolicyGradient(learning_model, PolicyOptimizationConfig(config.reward_discount))
+
+        agent_dict[agent_id] = POAgent(name=agent_id, algorithm=algorithm)
+
+    return agent_dict
+
+
+class POAgentManager(SimpleAgentManager):
+    def train(self, experiences_by_agent: dict):
+        for agent_id, exp in experiences_by_agent.items():
+            if not isinstance(exp, list):
+                exp = [exp]
+            for trajectory in exp:
+                self.agent_dict[agent_id].train(
+                    trajectory["state"],
+                    trajectory["action"],
+                    trajectory["log_action_probability"],
+                    trajectory["reward"]
+                )
diff --git a/examples/cim/policy_optimization/components/config.py b/examples/cim/policy_optimization/components/config.py
@@ -0,0 +1,19 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""
+This file is used to load the configuration from config.yml and distributed_config.yml.
+Converting it into a dotted dictionary is left to the caller.
+"""
+
+import io
+import os
+import yaml
+
+
+CONFIG_PATH = os.path.join(os.path.split(os.path.realpath(__file__))[0], "../config.yml")
+with io.open(CONFIG_PATH, "r") as in_file:
+    config = yaml.safe_load(in_file)
+
+DISTRIBUTED_CONFIG_PATH = os.path.join(os.path.split(os.path.realpath(__file__))[0], "../distributed_config.yml")
+with io.open(DISTRIBUTED_CONFIG_PATH, "r") as in_file:
+    distributed_config = yaml.safe_load(in_file)
diff --git a/examples/cim/policy_optimization/components/experience_shaper.py b/examples/cim/policy_optimization/components/experience_shaper.py
@@ -0,0 +1,51 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from collections import defaultdict
+
+import numpy as np
+
+from maro.rl import ExperienceShaper
+
+
+class TruncatedExperienceShaper(ExperienceShaper):
+    def __init__(self, *, time_window: int, time_decay_factor: float, fulfillment_factor: float,
+                 shortage_factor: float):
+        super().__init__(reward_func=None)
+        self._time_window = time_window
+        self._time_decay_factor = time_decay_factor
+        self._fulfillment_factor = fulfillment_factor
+        self._shortage_factor = shortage_factor
+
+    def __call__(self, trajectory, snapshot_list):
+        agent_ids = np.asarray(trajectory.get_by_key("agent_id"))
+        states = np.asarray(trajectory.get_by_key("state"))
+        actions = np.asarray(trajectory.get_by_key("action"))
+        log_action_probabilities = np.asarray(trajectory.get_by_key("log_action_probability"))
+        rewards = np.fromiter(
+            map(self._compute_reward, trajectory.get_by_key("event"), [snapshot_list] * len(trajectory)),
+            dtype=np.float32
+        )
+        return {agent_id: {
+                    "state": states[agent_ids == agent_id],
+                    "action": actions[agent_ids == agent_id],
+                    "log_action_probability": log_action_probabilities[agent_ids == agent_id],
+                    "reward": rewards[agent_ids == agent_id],
+                }
+                for agent_id in set(agent_ids)}
+
+    def _compute_reward(self, decision_event, snapshot_list):
+        start_tick = decision_event.tick + 1
+        end_tick = decision_event.tick + self._time_window
+        ticks = list(range(start_tick, end_tick))
+
+        # calculate tc reward
+        future_fulfillment = snapshot_list["ports"][ticks::"fulfillment"]
+        future_shortage = snapshot_list["ports"][ticks::"shortage"]
+        decay_list = [self._time_decay_factor ** i for i in range(end_tick - start_tick)
+                      for _ in range(future_fulfillment.shape[0]//(end_tick-start_tick))]
+
+        tot_fulfillment = np.dot(future_fulfillment, decay_list)
+        tot_shortage = np.dot(future_shortage, decay_list)
+
+        return np.float(self._fulfillment_factor * tot_fulfillment - self._shortage_factor * tot_shortage)
diff --git a/examples/cim/policy_optimization/components/state_shaper.py b/examples/cim/policy_optimization/components/state_shaper.py
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import numpy as np
+
+from maro.rl import StateShaper
+
+# Port and vessel attributes that CIMStateShaper queries from the snapshot list to build a state.
+PORT_ATTRIBUTES = ["empty", "full", "on_shipper", "on_consignee", "booking", "shortage", "fulfillment"]
+VESSEL_ATTRIBUTES = ["empty", "full", "remaining_space"]
+
+
+class CIMStateShaper(StateShaper):
+    def __init__(self, *, look_back, max_ports_downstream):
+        super().__init__()
+        self._look_back = look_back
+        self._max_ports_downstream = max_ports_downstream
+        self._dim = (look_back + 1) * (max_ports_downstream + 1) * len(PORT_ATTRIBUTES) + len(VESSEL_ATTRIBUTES)
+
+    def __call__(self, decision_event, snapshot_list):
+        tick, port_idx, vessel_idx = decision_event.tick, decision_event.port_idx, decision_event.vessel_idx
+        ticks = [tick - rt for rt in range(self._look_back - 1)]
+        future_port_idx_list = snapshot_list["vessels"][tick: vessel_idx: 'future_stop_list'].astype('int')
+        port_features = snapshot_list["ports"][ticks: [port_idx] + list(future_port_idx_list): PORT_ATTRIBUTES]
+        vessel_features = snapshot_list["vessels"][tick: vessel_idx: VESSEL_ATTRIBUTES]
+        state = np.concatenate((port_features, vessel_features))
+        return str(port_idx), state
+
+    @property
+    def dim(self):
+        return self._dim
diff --git a/examples/cim/policy_optimization/config.yml b/examples/cim/policy_optimization/config.yml
@@ -0,0 +1,50 @@
+env:
+  scenario: "cim"
+  topology: "toy.4p_ssdd_l0.0"
+  durations: 1120
+  state_shaping:
+    look_back: 7
+    max_ports_downstream: 2
+  experience_shaping:
+    time_window: 100
+    fulfillment_factor: 1.0
+    shortage_factor: 1.0
+    time_decay_factor: 0.97
+main_loop:
+  max_episode: 100
+  early_stopping:
+    warmup_ep: 20
+    last_k: 5
+    perf_threshold: 0.95   # minimum performance (fulfillment ratio) required to trigger early stopping
+    perf_stability_threshold: 0.1  # stability is measured by the maximum of abs(perf_(i+1) - perf_i) / perf_i
+                                   # over the last k episodes (where perf is short for performance). The measured
+                                   # stability must be below this threshold to trigger early stopping
+agents:
+  seed: 1024   # for reproducibility
+  type: "actor_critic"  # "actor_critic" or "policy_gradient"
+  num_actions: 21
+  actor_model:
+    hidden_dims:
+      - 256
+      - 128
+      - 64
+    softmax_enabled: true
+    batch_norm_enabled: false
+  actor_optimizer:
+    lr: 0.001
+  critic_model:
+    hidden_dims:
+      - 256
+      - 128
+      - 64
+    softmax_enabled: false
+    batch_norm_enabled: true
+  critic_optimizer:
+    lr: 0.001
+  reward_discount: .0
+  actor_critic_hyper_parameters:
+    train_iters: 10
+    actor_loss_coefficient: 0.1
+    k: 1
+    lam: 0.0
+    # clip_ratio: 0.8
diff --git a/examples/cim/policy_optimization/dist_actor.py b/examples/cim/policy_optimization/dist_actor.py
@@ -0,0 +1,46 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import os
+
+import numpy as np
+
+from maro.simulator import Env
+from maro.rl import AgentManagerMode, SimpleActor, ActorWorker
+from maro.utils import convert_dottable
+
+from components import CIMActionShaper, CIMStateShaper, POAgentManager, TruncatedExperienceShaper, create_po_agents
+
+
+def launch(config):
+    config = convert_dottable(config)
+    env = Env(config.env.scenario, config.env.topology, durations=config.env.durations)
+    agent_id_list = [str(agent_id) for agent_id in env.agent_idx_list]
+    state_shaper = CIMStateShaper(**config.env.state_shaping)
+    action_shaper = CIMActionShaper(action_space=list(np.linspace(-1.0, 1.0, config.agents.num_actions)))
+    experience_shaper = TruncatedExperienceShaper(**config.env.experience_shaping)
+
+    config["agents"]["input_dim"] = state_shaper.dim
+    agent_manager = POAgentManager(
+        name="cim_actor",
+        mode=AgentManagerMode.INFERENCE,
+        agent_dict=create_po_agents(agent_id_list, config.agents),
+        state_shaper=state_shaper,
+        action_shaper=action_shaper,
+        experience_shaper=experience_shaper,
+    )
+    proxy_params = {
+        "group_name": os.environ["GROUP"],
+        "expected_peers": {"learner": 1},
+        "redis_address": ("localhost", 6379)
+    }
+    actor_worker = ActorWorker(
+        local_actor=SimpleActor(env=env, agent_manager=agent_manager),
+        proxy_params=proxy_params
+    )
+    actor_worker.launch()
+
+
+if __name__ == "__main__":
+    # Imported here so that the configuration is only loaded when this file is run as a script.
+    from components.config import config
+    launch(config)