-
Notifications
You must be signed in to change notification settings - Fork 152
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* PPO, SAC, DDPG passed * Explore in SAC * Test GYM on server * Sync server changes * pre-commit * Ready to try on server * . * . * . * . * . * Performance OK * Move to tests * Remove old versions * PPO done * Start to test AC * Start to test SAC * SAC test passed * update for some PR comments; Add a MARKDOWN file (#576) Co-authored-by: Jinyu Wang <wang.jinyu@microsoft.com> * Use FullyConnected to replace mlp * Update action bound * Pre-commit --------- Co-authored-by: Jinyu-W <53509467+Jinyu-W@users.noreply.github.com> Co-authored-by: Jinyu Wang <wang.jinyu@microsoft.com>
- Loading branch information
1 parent
eb6324c
commit 214383f
Showing
24 changed files
with
604 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT license. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT license. | ||
|
||
from typing import Tuple | ||
|
||
import numpy as np | ||
import torch | ||
from torch.distributions import Normal | ||
from torch.optim import Adam | ||
|
||
from maro.rl.model import ContinuousACBasedNet, VNet | ||
from maro.rl.model.fc_block import FullyConnected | ||
from maro.rl.policy import ContinuousRLPolicy | ||
from maro.rl.training.algorithms import ActorCriticParams, ActorCriticTrainer | ||
|
||
# Architecture settings shared by the actor and critic networks below.
actor_net_conf = {
    "hidden_dims": [64, 64],
    "activation": torch.nn.Tanh,
}
critic_net_conf = {
    "hidden_dims": [64, 64],
    "activation": torch.nn.Tanh,
}
# Separate learning rates for the two optimizers created in the classes below.
actor_learning_rate = 3e-4
critic_learning_rate = 1e-3
|
||
|
||
class MyContinuousACBasedNet(ContinuousACBasedNet):
    """Diagonal-Gaussian policy network with a state-independent, learnable log-std.

    The per-dimension action mean comes from a fully connected trunk; the
    standard deviation is a free parameter shared across all states and is
    optimized jointly with the trunk weights.
    """

    def __init__(self, state_dim: int, action_dim: int) -> None:
        super(MyContinuousACBasedNet, self).__init__(state_dim=state_dim, action_dim=action_dim)

        # Start log-std at -0.5 (std ~= 0.61) for moderate initial exploration.
        initial_log_std = -0.5 * np.ones(action_dim, dtype=np.float32)
        self._log_std = torch.nn.Parameter(torch.as_tensor(initial_log_std))
        self._mu_net = FullyConnected(
            input_dim=state_dim,
            output_dim=action_dim,
            hidden_dims=actor_net_conf["hidden_dims"],
            activation=actor_net_conf["activation"],
        )
        self._optim = Adam(self.parameters(), lr=actor_learning_rate)

    def _get_actions_with_logps_impl(self, states: torch.Tensor, exploring: bool) -> Tuple[torch.Tensor, torch.Tensor]:
        # NOTE(review): `exploring` is ignored — actions are always sampled,
        # which matches on-policy AC usage; confirm against the trainer.
        dist = self._distribution(states)
        sampled_actions = dist.sample()
        return sampled_actions, dist.log_prob(sampled_actions).sum(axis=-1)

    def _get_states_actions_logps_impl(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
        # Log-probability of externally supplied actions under the current policy.
        return self._distribution(states).log_prob(actions).sum(axis=-1)

    def _distribution(self, states: torch.Tensor) -> Normal:
        # Diagonal Gaussian: mean from the trunk, shared std from the parameter.
        return Normal(self._mu_net(states.float()), torch.exp(self._log_std))
|
||
|
||
class MyVCriticNet(VNet):
    """State-value critic: a fully connected network mapping states to scalar V(s)."""

    def __init__(self, state_dim: int) -> None:
        super(MyVCriticNet, self).__init__(state_dim=state_dim)
        self._critic = FullyConnected(
            input_dim=state_dim,
            hidden_dims=critic_net_conf["hidden_dims"],
            output_dim=1,
            activation=critic_net_conf["activation"],
        )
        self._optim = Adam(self._critic.parameters(), lr=critic_learning_rate)

    def _get_v_values(self, states: torch.Tensor) -> torch.Tensor:
        # Drop the trailing singleton dim so the result is shaped (batch,).
        raw_values = self._critic(states.float())
        return raw_values.squeeze(-1)
|
||
|
||
def get_ac_policy(
    name: str,
    action_lower_bound: list,
    action_upper_bound: list,
    gym_state_dim: int,
    gym_action_dim: int,
) -> ContinuousRLPolicy:
    """Build a continuous actor-critic policy bounded by the given action range."""
    policy_net = MyContinuousACBasedNet(gym_state_dim, gym_action_dim)
    return ContinuousRLPolicy(
        name=name,
        action_range=(action_lower_bound, action_upper_bound),
        policy_net=policy_net,
    )
|
||
|
||
def get_ac_trainer(name: str, state_dim: int) -> ActorCriticTrainer:
    """Build an actor-critic trainer (gamma=0.99, GAE lambda=0.97, 80 grad iters)."""
    trainer_params = ActorCriticParams(
        get_v_critic_net_func=lambda: MyVCriticNet(state_dim),
        grad_iters=80,
        lam=0.97,
    )
    return ActorCriticTrainer(name=name, reward_discount=0.99, params=trainer_params)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT license. | ||
|
||
from maro.rl.training.algorithms import PPOParams, PPOTrainer | ||
|
||
from .ac import MyVCriticNet, get_ac_policy | ||
|
||
# PPO reuses the same Gaussian actor network/policy factory as vanilla AC.
get_ppo_policy = get_ac_policy
|
||
|
||
def get_ppo_trainer(name: str, state_dim: int) -> PPOTrainer:
    """Build a PPO trainer sharing the AC critic, with surrogate clip ratio 0.2."""
    ppo_params = PPOParams(
        get_v_critic_net_func=lambda: MyVCriticNet(state_dim),
        grad_iters=80,
        lam=0.97,
        clip_ratio=0.2,
    )
    return PPOTrainer(name=name, reward_discount=0.99, params=ppo_params)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT license. | ||
|
||
from typing import Tuple | ||
|
||
import numpy as np | ||
import torch | ||
import torch.nn.functional as F | ||
from torch.distributions import Normal | ||
from torch.optim import Adam | ||
|
||
from maro.rl.model import ContinuousSACNet, QNet | ||
from maro.rl.model.fc_block import FullyConnected | ||
from maro.rl.policy import ContinuousRLPolicy | ||
from maro.rl.training.algorithms import SoftActorCriticParams, SoftActorCriticTrainer | ||
|
||
# Architecture settings shared by the SAC actor and Q-critic networks below.
actor_net_conf = {
    "hidden_dims": [64, 64],
    "activation": torch.nn.Tanh,
}
critic_net_conf = {
    "hidden_dims": [64, 64],
    "activation": torch.nn.Tanh,
}
actor_learning_rate = 3e-4
critic_learning_rate = 1e-3

# Clamp range for the predicted log-std, keeping exp(log_std) numerically safe.
LOG_STD_MAX = 2
LOG_STD_MIN = -20
|
||
|
||
class MyContinuousSACNet(ContinuousSACNet):
    """Squashed-Gaussian SAC policy network.

    A shared fully connected trunk feeds two linear heads producing the
    per-dimension action mean and log-std. Sampled actions are squashed with
    tanh and scaled by ``action_limit``; the log-probability is corrected for
    the squash using the numerically stable softplus identity.
    """

    def __init__(self, state_dim: int, action_dim: int, action_limit: float) -> None:
        """
        Args:
            state_dim: Dimension of the flat state vector.
            action_dim: Dimension of the action vector.
            action_limit: Scale applied after the tanh squash
                (assumes a symmetric action range — TODO confirm at call site).
        """
        super(MyContinuousSACNet, self).__init__(state_dim=state_dim, action_dim=action_dim)

        # Trunk uses all but the last configured hidden size internally; the
        # last hidden size is the trunk's output feeding both heads.
        self._net = FullyConnected(
            input_dim=state_dim,
            output_dim=actor_net_conf["hidden_dims"][-1],
            hidden_dims=actor_net_conf["hidden_dims"][:-1],
            activation=actor_net_conf["activation"],
            output_activation=actor_net_conf["activation"],
        )
        self._mu = torch.nn.Linear(actor_net_conf["hidden_dims"][-1], action_dim)
        self._log_std = torch.nn.Linear(actor_net_conf["hidden_dims"][-1], action_dim)
        self._action_limit = action_limit
        self._optim = Adam(self.parameters(), lr=actor_learning_rate)

    def _get_actions_with_logps_impl(self, states: torch.Tensor, exploring: bool) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return squashed actions and their log-probabilities for `states`."""
        net_out = self._net(states.float())
        mu = self._mu(net_out)
        # State-dependent std, clamped to a numerically safe range.
        log_std = torch.clamp(self._log_std(net_out), LOG_STD_MIN, LOG_STD_MAX)
        std = torch.exp(log_std)

        pi_distribution = Normal(mu, std)
        # Reparameterized sample while exploring; deterministic mean otherwise.
        pi_action = pi_distribution.rsample() if exploring else mu

        logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
        # Tanh-squash log-prob correction in its numerically stable form.
        # FIX: reduce over axis=-1 (was axis=1) so a 1-D unbatched action tensor
        # also works, consistent with the log_prob reduction on the line above
        # and with the AC policy net in this example set. Identical for the
        # standard (batch, action_dim) case.
        logp_pi -= (2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(axis=-1)

        pi_action = torch.tanh(pi_action) * self._action_limit

        return pi_action, logp_pi
|
||
|
||
class MyQCriticNet(QNet):
    """Action-value critic: maps a (state, action) pair to a scalar Q(s, a)."""

    def __init__(self, state_dim: int, action_dim: int) -> None:
        super(MyQCriticNet, self).__init__(state_dim=state_dim, action_dim=action_dim)
        self._critic = FullyConnected(
            input_dim=state_dim + action_dim,
            hidden_dims=critic_net_conf["hidden_dims"],
            output_dim=1,
            activation=critic_net_conf["activation"],
        )
        self._optim = Adam(self._critic.parameters(), lr=critic_learning_rate)

    def _get_q_values(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
        # Concatenate state and action along the feature axis, then squeeze the
        # trailing singleton output dim to get shape (batch,).
        joint_input = torch.cat([states, actions], dim=1)
        return self._critic(joint_input.float()).squeeze(-1)
|
||
|
||
def get_sac_policy(
    name: str,
    action_lower_bound: list,
    action_upper_bound: list,
    gym_state_dim: int,
    gym_action_dim: int,
    action_limit: float,
) -> ContinuousRLPolicy:
    """Build a SAC policy with a squashed-Gaussian actor over the given bounds."""
    sac_net = MyContinuousSACNet(gym_state_dim, gym_action_dim, action_limit)
    return ContinuousRLPolicy(
        name=name,
        action_range=(action_lower_bound, action_upper_bound),
        policy_net=sac_net,
    )
|
||
|
||
def get_sac_trainer(name: str, state_dim: int, action_dim: int) -> SoftActorCriticTrainer:
    """Build a SAC trainer (gamma=0.99, 10 epochs, 10k warm-up steps before training)."""
    sac_params = SoftActorCriticParams(
        get_q_critic_net_func=lambda: MyQCriticNet(state_dim, action_dim),
        num_epochs=10,
        n_start_train=10000,
    )
    return SoftActorCriticTrainer(name=name, reward_discount=0.99, params=sac_params)
Oops, something went wrong.