microsoft · Jinyu-W · Oct 26, 2020 · Sep 22, 2020 · Sep 23, 2020 · Sep 23, 2020
diff --git a/examples/cim/dqn/components/experience_shaper.py b/examples/cim/dqn/components/experience_shaper.py
@@ -41,7 +41,7 @@ def _compute_reward(self, decision_event, snapshot_list):
         future_fulfillment = snapshot_list["ports"][ticks::"fulfillment"]
         future_shortage = snapshot_list["ports"][ticks::"shortage"]
         decay_list = [self._time_decay_factor ** i for i in range(end_tick - start_tick)
-                      for _ in range(future_fulfillment.shape[0]//(end_tick-start_tick))]
+                      for _ in range(future_fulfillment.shape[0] // (end_tick - start_tick))]
 
         tot_fulfillment = np.dot(future_fulfillment, decay_list)
         tot_shortage = np.dot(future_shortage, decay_list)

diff --git a/examples/cim/dqn/single_process_launcher.py b/examples/cim/dqn/single_process_launcher.py
@@ -29,7 +29,8 @@
     if config.experience_shaping.type == "truncated":
         experience_shaper = TruncatedExperienceShaper(**config.experience_shaping.truncated)
     else:
-        experience_shaper = KStepExperienceShaper(reward_func=lambda mt: 1-mt["container_shortage"]/mt["order_requirements"],
+        experience_shaper = KStepExperienceShaper(reward_func=lambda mt: 1 - mt["container_shortage"] /
+                                                    mt["order_requirements"],
                                                   **config.experience_shaping.k_step)
 
     exploration_config = {"epsilon_range_dict": {"_all_": config.exploration.epsilon_range},

diff --git a/examples/cim/gnn/action_shaper.py b/examples/cim/gnn/action_shaper.py
@@ -0,0 +1,37 @@
+from maro.rl import ActionShaper
+
+
+class DiscreteActionShaper(ActionShaper):
+    """The shaping class to transform the action in [-1, 1] to actual repositioning function."""
+    def __init__(self, action_dim):
+        super().__init__()
+        self._action_dim = action_dim
+        self._zero_action = self._action_dim // 2
+
+    def __call__(self, decision_event, model_action):
+        """Shaping the action in [-1,1] range to the actual repositioning function.
+
+        This function maps integer model action within the range of [-A, A] to actual action. We define negative actual
+        action as discharge resource from vessel to port and positive action as upload from port to vessel, so the
+        upper bound and lower bound of actual action are the resource in dynamic and static node respectively.
+
+        Args:
+            decision_event (Event): The decision event from the environment.
+            model_action (int): Output action, range A means the half of the agent output dim.
+        """
+        env_action = 0
+        model_action -= self._zero_action
+
+        action_scope = decision_event.action_scope
+
+        if model_action < 0:
+            # Discharge resource from dynamic node.
+            env_action = round(int(model_action) * 1.0 / self._zero_action * action_scope.load)
+        elif model_action == 0:
+            env_action = 0
+        else:
+            # Load resource to dynamic node.
+            env_action = round(int(model_action) * 1.0 / self._zero_action * action_scope.discharge)
+        env_action = int(env_action)
+
+        return env_action
diff --git a/examples/cim/gnn/actor.py b/examples/cim/gnn/actor.py
diff --git a/examples/cim/gnn/actor_critic.py b/examples/cim/gnn/actor_critic.py
@@ -0,0 +1,179 @@
+import os
+
+import torch
+from torch import nn
+from torch.distributions import Categorical
+from torch.nn.utils import clip_grad
+
+from examples.cim.gnn.utils import gnn_union
+from maro.rl import AbsAlgorithm
+
+
+class ActorCritic(AbsAlgorithm):
+    """Actor-Critic algorithm in CIM problem.
+
+    The vanilla ac algorithm.
+
+    Args:
+        model (nn.Module): A actor-critic module outputing both the policy network and the value network.
+        device (torch.device): A PyTorch device instance where the module is computed on.
+        p2p_adj (numpy.array): The static port-to-port adjencency matrix.
+        td_steps (int): The value "n" in the n-step TD algorithm.
+        gamma (float): The time decay.
+        learning_rate (float): The learning rate for the module.
+        entropy_factor (float): The weight of the policy"s entropy to boost exploration.
+    """
+
+    def __init__(
+            self, model: nn.Module, device: torch.device, p2p_adj=None, td_steps=100, gamma=0.97, learning_rate=0.0003,
+            entropy_factor=0.1):
+        self._gamma = gamma
+        self._td_steps = td_steps
+        self._value_discount = gamma**100
+        self._entropy_factor = entropy_factor
+        self._device = device
+        self._tot_batchs = 0
+        self._p2p_adj = p2p_adj
+        super().__init__(
+            model_dict={"a&c": model}, optimizer_opt={"a&c": (torch.optim.Adam, {"lr": learning_rate})},
+            loss_func_dict={}, hyper_params=None)
+
+    def choose_action(self, state: dict, p_idx: int, v_idx: int):
+        """Get action from the AC model.
+
+        Args:
+            state (dict): A dictionary containing the input to the module. For example:
+                {
+                    "v": v,
+                    "p": p,
+                    "pe": {
+                        "edge": pedge,
+                        "adj": padj,
+                        "mask": pmask,
+                    },
+                    "ve": {
+                        "edge": vedge,
+                        "adj": vadj,
+                        "mask": vmask,
+                    },
+                    "ppe": {
+                        "edge": ppedge,
+                        "adj": p2p_adj,
+                        "mask": p2p_mask,
+                    },
+                    "mask": seq_mask,
+                }
+            p_idx (int): The identity of the port doing the action.
+            v_idx (int): The identity of the vessel doing the action.
+
+        Returns:
+            model_action (numpy.int64): The action returned from the module.
+        """
+        with torch.no_grad():
+            prob, _ = self._model_dict["a&c"](state, a=True, p_idx=p_idx, v_idx=v_idx)
+            distribution = Categorical(prob)
+            model_action = distribution.sample().cpu().numpy()
+            return model_action
+
+    def train(self, batch, p_idx, v_idx):
+        """Model training.
+
+        Args:
+            batch (dict): The dictionary of a batch of experience. For example:
+                {
+                    "s": the dictionary of state,
+                    "a": model actions in numpy array,
+                    "R": the n-step accumulated reward,
+                    "s"": the dictionary of the next state,
+                }
+            p_idx (int): The identity of the port doing the action.
+            v_idx (int): The identity of the vessel doing the action.
+
+        Returns:
+            a_loss (float): action loss.
+            c_loss (float): critic loss.
+            e_loss (float): entropy loss.
+            tot_norm (float): the L2 norm of the gradient.
+
+        """
+        self._tot_batchs += 1
+        item_a_loss, item_c_loss, item_e_loss = 0, 0, 0
+        obs_batch = batch["s"]
+        action_batch = batch["a"]
+        return_batch = batch["R"]
+        next_obs_batch = batch["s_"]
+
+        obs_batch = gnn_union(
+            obs_batch["p"], obs_batch["po"], obs_batch["pedge"], obs_batch["v"], obs_batch["vo"], obs_batch["vedge"],
+            self._p2p_adj, obs_batch["ppedge"], obs_batch["mask"], self._device)
+        action_batch = torch.from_numpy(action_batch).long().to(self._device)
+        return_batch = torch.from_numpy(return_batch).float().to(self._device)
+        next_obs_batch = gnn_union(
+            next_obs_batch["p"], next_obs_batch["po"], next_obs_batch["pedge"], next_obs_batch["v"],
+            next_obs_batch["vo"], next_obs_batch["vedge"], self._p2p_adj, next_obs_batch["ppedge"],
+            next_obs_batch["mask"], self._device)
+
+        # Train actor network.
+        self._optimizer["a&c"].zero_grad()
+
+        # Every port has a value.
+        # values.shape: (batch, p_cnt)
+        probs, values = self._model_dict["a&c"](obs_batch, a=True, p_idx=p_idx, v_idx=v_idx, c=True)
+        distribution = Categorical(probs)
+        log_prob = distribution.log_prob(action_batch)
+        entropy_loss = distribution.entropy()
+
+        _, values_ = self._model_dict["a&c"](next_obs_batch, c=True)
+        advantage = return_batch + self._value_discount * values_.detach() - values
+
+        if self._entropy_factor != 0:
+            # actor_loss = actor_loss* torch.log(entropy_loss + np.e)
+            advantage[:, p_idx] += self._entropy_factor * entropy_loss.detach()
+
+        actor_loss = - (log_prob * torch.sum(advantage, axis=-1).detach()).mean()
+
+        item_a_loss = actor_loss.item()
+        item_e_loss = entropy_loss.mean().item()
+
+        # Train critic network.
+        critic_loss = torch.sum(advantage.pow(2), axis=1).mean()
+        item_c_loss = critic_loss.item()
+        # torch.nn.utils.clip_grad_norm_(self._critic_model.parameters(),0.5)
+        tot_loss = 0.1 * actor_loss + critic_loss
+        tot_loss.backward()
+        tot_norm = clip_grad.clip_grad_norm_(self._model_dict["a&c"].parameters(), 1)
+        self._optimizer["a&c"].step()
+        return item_a_loss, item_c_loss, item_e_loss, float(tot_norm)
+
+    def set_weights(self, weights):
+        self._model_dict["a&c"].load_state_dict(weights)
+
+    def get_weights(self):
+        return self._model_dict["a&c"].state_dict()
+
+    def _get_save_idx(self, fp_str):
+        return int(fp_str.split(".")[0].split("_")[0])
+
+    def save_model(self, pth, id):
+        if not os.path.exists(pth):
+            os.makedirs(pth)
+        pth = os.path.join(pth, f"{id}_ac.pkl")
+        torch.save(self._model_dict["a&c"].state_dict(), pth)
+
+    def _set_gnn_weights(self, weights):
+        for key in weights:
+            if key in self._model_dict["a&c"].state_dict().keys():
+                self._model_dict["a&c"].state_dict()[key].copy_(weights[key])
+
+    def load_model(self, folder_pth, idx=-1):
+        if idx == -1:
+            fps = os.listdir(folder_pth)
+            fps = [f for f in fps if "ac" in f]
+            fps.sort(key=self._get_save_idx)
+            ac_pth = fps[-1]
+        else:
+            ac_pth = f"{idx}_ac.pkl"
+        pth = os.path.join(folder_pth, ac_pth)
+        with open(pth, "rb") as fp:
+            weights = torch.load(fp, map_location=self._device)
+        self._set_gnn_weights(weights)
diff --git a/examples/cim/gnn/agent.py b/examples/cim/gnn/agent.py
@@ -0,0 +1,40 @@
+from collections import defaultdict
+
+import numpy as np
+
+from examples.cim.gnn.numpy_store import Shuffler
+from maro.rl import AbsAgent
+from maro.utils import DummyLogger
+
+
+class TrainableAgent(AbsAgent):
+    def __init__(self, name, algorithm, experience_pool, logger=DummyLogger()):
+        self._logger = logger
+        super().__init__(name, algorithm, experience_pool)
+
+    def train(self, training_config):
+        loss_dict = defaultdict(list)
+        for j in range(training_config.shuffle_time):
+            shuffler = Shuffler(self._experience_pool, batch_size=training_config.batch_size)
+            while shuffler.has_next():
+                batch = shuffler.next()
+                actor_loss, critic_loss, entropy_loss, tot_loss = self._algorithm.train(
+                    batch, self._name[0], self._name[1])
+                loss_dict["actor"].append(actor_loss)
+                loss_dict["critic"].append(critic_loss)
+                loss_dict["entropy"].append(entropy_loss)
+                loss_dict["tot"].append(tot_loss)
+
+        a_loss = np.mean(loss_dict["actor"])
+        c_loss = np.mean(loss_dict["critic"])
+        e_loss = np.mean(loss_dict["entropy"])
+        tot_loss = np.mean(loss_dict["tot"])
+        self._logger.debug(
+            f"code: {str(self._name)} \t actor: {float(a_loss)} \t critic: {float(c_loss)} \t entropy: {float(e_loss)} \
+            \t tot: {float(tot_loss)}")
+
+        self._experience_pool.clear()
+        return loss_dict
+
+    def choose_action(self, model_state):
+        return self._algorithm.choose_action(model_state, self._name[0], self._name[1])
diff --git a/examples/cim/gnn/agent_manager.py b/examples/cim/gnn/agent_manager.py
@@ -0,0 +1,118 @@
+from copy import copy
+
+import numpy as np
+import torch
+
+from examples.cim.gnn.agent import TrainableAgent
+from examples.cim.gnn.actor_critic import ActorCritic
+from examples.cim.gnn.simple_gnn import SharedAC
+from examples.cim.gnn.numpy_store import NumpyStore
+from examples.cim.gnn.state_shaper import GNNStateShaper
+from maro.rl import AbsAgentManager, AgentMode
+from maro.utils import DummyLogger
+
+
+class SimpleAgentManger(AbsAgentManager):
+    def __init__(
+            self, name, agent_id_list, port_code_list, vessel_code_list, demo_env, state_shaper: GNNStateShaper,
+            logger=DummyLogger()):
+        super().__init__(
+            name, AgentMode.TRAIN, agent_id_list, state_shaper=state_shaper, action_shaper=None,
+            experience_shaper=None, explorer=None)
+        self.port_code_list = copy(port_code_list)
+        self.vessel_code_list = copy(vessel_code_list)
+        self.demo_env = demo_env
+        self._logger = logger
+
+    def assemble(self, config):
+        v_dim, vedge_dim = self._state_shaper.get_input_dim("v"), self._state_shaper.get_input_dim("vedge")
+        p_dim, pedge_dim = self._state_shaper.get_input_dim("p"), self._state_shaper.get_input_dim("pedge")
+
+        self.device = torch.device(config.training.device)
+        self._logger.info(config.training.device)
+        ac_model = SharedAC(
+            p_dim, pedge_dim, v_dim, vedge_dim, config.model.tick_buffer, config.model.action_dim).to(self.device)
+
+        value_dict = {
+            ("s", "v"):
+                (
+                    (config.model.tick_buffer, len(self.vessel_code_list), self._state_shaper.get_input_dim("v")),
+                    np.float32, False),
+            ("s", "p"):
+                (
+                    (config.model.tick_buffer, len(self.port_code_list), self._state_shaper.get_input_dim("p")),
+                    np.float32, False),
+            ("s", "vo"): ((len(self.vessel_code_list), len(self.port_code_list)), np.int64, True),
+            ("s", "po"): ((len(self.port_code_list), len(self.vessel_code_list)), np.int64, True),
+            ("s", "vedge"):
+                (
+                    (len(self.vessel_code_list), len(self.port_code_list), self._state_shaper.get_input_dim("vedge")),
+                    np.float32, True),
+            ("s", "pedge"):
+                (
+                    (len(self.port_code_list), len(self.vessel_code_list), self._state_shaper.get_input_dim("vedge")),
+                    np.float32, True),
+            ("s", "ppedge"):
+                (
+                    (len(self.port_code_list), len(self.port_code_list), self._state_shaper.get_input_dim("pedge")),
+                    np.float32, True),
+            ("s", "mask"): ((config.model.tick_buffer, ), np.bool, True),
+
+            ("s_", "v"):
+                (
+                    (config.model.tick_buffer, len(self.vessel_code_list), self._state_shaper.get_input_dim("v")),
+                    np.float32, False),
+            ("s_", "p"):
+                (
+                    (config.model.tick_buffer, len(self.port_code_list), self._state_shaper.get_input_dim("p")),
+                    np.float32, False),
+            ("s_", "vo"): ((len(self.vessel_code_list), len(self.port_code_list)), np.int64, True),
+            ("s_", "po"):
+                (
+                    (len(self.port_code_list), len(self.vessel_code_list)), np.int64, True),
+            ("s_", "vedge"):
+                (
+                    (len(self.vessel_code_list), len(self.port_code_list), self._state_shaper.get_input_dim("vedge")),
+                    np.float32, True),
+            ("s_", "pedge"):
+                (
+                    (len(self.port_code_list), len(self.vessel_code_list), self._state_shaper.get_input_dim("vedge")),
+                    np.float32, True),
+            ("s_", "ppedge"):
+                (
+                    (len(self.port_code_list), len(self.port_code_list), self._state_shaper.get_input_dim("pedge")),
+                    np.float32, True),
+            ("s_", "mask"): ((config.model.tick_buffer, ), np.bool, True),
+
+            # To identify one dimension variable.
+            ("R",): ((len(self.port_code_list), ), np.float32, True),
+            ("a",): (tuple(), np.int64, True),
+        }
+
+        self._algorithm = ActorCritic(
+            ac_model, self.device, td_steps=config.training.td_steps, p2p_adj=self._state_shaper.p2p_static_graph,
+            gamma=config.training.gamma, learning_rate=config.training.learning_rate)
+
+        for agent_id, cnt in config.env.exp_per_ep.items():
+            experience_pool = NumpyStore(value_dict, config.training.parallel_cnt * config.training.train_freq * cnt)
+            self._agent_dict[agent_id] = TrainableAgent(agent_id, self._algorithm, experience_pool, self._logger)
+
+    def choose_action(self, agent_id, state):
+        return self._agent_dict[agent_id].choose_action(state)
+
+    def load_models_from_files(self, model_pth):
+        self._algorithm.load_model(model_pth)
+
+    def train(self, training_config):
+        for agent in self._agent_dict.values():
+            agent.train(training_config)
+
+    def store_experiences(self, experiences):
+        for code, exp_list in experiences.items():
+            self._agent_dict[code].store_experiences(exp_list)
+
+    def save_model(self, pth, id):
+        self._algorithm.save_model(pth, id)
+
+    def load_model(self, pth):
+        self._algorithm.load_model(pth)