RL benchmark on GYM (#575)
* PPO, SAC, DDPG passed

* Explore in SAC

* Test GYM on server

* Sync server changes

* pre-commit

* Ready to try on server

* .

* .

* .

* .

* .

* Performance OK

* Move to tests

* Remove old versions

* PPO done

* Start to test AC

* Start to test SAC

* SAC test passed

* Update for some PR comments; add a Markdown file (#576)

Co-authored-by: Jinyu Wang <wang.jinyu@microsoft.com>

* Use FullyConnected to replace mlp

* Update action bound

* Pre-commit

---------

Co-authored-by: Jinyu-W <53509467+Jinyu-W@users.noreply.github.com>
Co-authored-by: Jinyu Wang <wang.jinyu@microsoft.com>
3 people committed Feb 6, 2023
1 parent eb6324c commit 214383f
Showing 24 changed files with 604 additions and 14 deletions.
2 changes: 2 additions & 0 deletions examples/cim/rl/algorithms/ac.py
@@ -11,6 +11,7 @@
actor_net_conf = {
"hidden_dims": [256, 128, 64],
"activation": torch.nn.Tanh,
"output_activation": torch.nn.Tanh,
"softmax": True,
"batch_norm": False,
"head": True,
@@ -19,6 +20,7 @@
"hidden_dims": [256, 128, 64],
"output_dim": 1,
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": True,
"head": True,
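These conf dicts map onto `FullyConnected` keyword arguments (see the `maro/rl/model/fc_block.py` change below). A minimal sketch of how such a conf is typically consumed; the dimensions are illustrative and not taken from the CIM scenario:

```python
import torch

from maro.rl.model.fc_block import FullyConnected

actor_net_conf = {
    "hidden_dims": [256, 128, 64],
    "activation": torch.nn.Tanh,
    "output_activation": torch.nn.Tanh,  # key added by this commit
    "softmax": True,
    "batch_norm": False,
    "head": True,
}

# Illustrative dimensions only.
state_dim, action_num = 50, 21
actor_net = FullyConnected(input_dim=state_dim, output_dim=action_num, **actor_net_conf)
action_probs = actor_net(torch.rand(4, state_dim))  # shape (4, action_num)
```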
1 change: 1 addition & 0 deletions examples/cim/rl/algorithms/dqn.py
@@ -12,6 +12,7 @@
q_net_conf = {
"hidden_dims": [256, 128, 64, 32],
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": True,
"skip_connection": False,
2 changes: 2 additions & 0 deletions examples/cim/rl/algorithms/maddpg.py
@@ -14,6 +14,7 @@
actor_net_conf = {
"hidden_dims": [256, 128, 64],
"activation": torch.nn.Tanh,
"output_activation": torch.nn.Tanh,
"softmax": True,
"batch_norm": False,
"head": True,
@@ -22,6 +23,7 @@
"hidden_dims": [256, 128, 64],
"output_dim": 1,
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": True,
"head": True,
4 changes: 2 additions & 2 deletions examples/rl/cim.yml
@@ -5,8 +5,8 @@
# Please refer to `maro/rl/workflows/config/template.yml` for the complete template and detailed explanations.

# Run this workflow by executing one of the following commands:
-# - python .\examples\rl\run_rl_example.py .\examples\rl\cim.yml
-# - (Requires installing MARO from source) maro local run .\examples\rl\cim.yml
+# - python ./examples/rl/run.py ./examples/rl/cim.yml
+# - (Requires installing MARO from source) maro local run ./examples/rl/cim.yml

job: cim_rl_workflow
scenario_path: "examples/cim/rl"
6 changes: 3 additions & 3 deletions examples/rl/cim_distributed.yml
@@ -1,12 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

-# Example RL config file for CIM scenario.
+# Example RL config file for CIM scenario (distributed version).
# Please refer to `maro/rl/workflows/config/template.yml` for the complete template and detailed explanations.

# Run this workflow by executing one of the following commands:
-# - python .\examples\rl\run_rl_example.py .\examples\rl\cim.yml
-# - (Requires installing MARO from source) maro local run .\examples\rl\cim.yml
+# - python ./examples/rl/run.py ./examples/rl/cim_distributed.yml
+# - (Requires installing MARO from source) maro local run ./examples/rl/cim_distributed.yml

job: cim_rl_workflow
scenario_path: "examples/cim/rl"
File renamed without changes.
4 changes: 2 additions & 2 deletions examples/rl/vm_scheduling.yml
@@ -5,8 +5,8 @@
# Please refer to `maro/rl/workflows/config/template.yml` for the complete template and detailed explanations.

# Run this workflow by executing one of the following commands:
-# - python .\examples\rl\run_rl_example.py .\examples\rl\vm_scheduling.yml
-# - (Requires installing MARO from source) maro local run .\examples\rl\vm_scheduling.yml
+# - python ./examples/rl/run.py ./examples/rl/vm_scheduling.yml
+# - (Requires installing MARO from source) maro local run ./examples/rl/vm_scheduling.yml

job: vm_scheduling_rl_workflow
scenario_path: "examples/vm_scheduling/rl"
2 changes: 2 additions & 0 deletions examples/vm_scheduling/rl/algorithms/ac.py
@@ -11,6 +11,7 @@
actor_net_conf = {
"hidden_dims": [64, 32, 32],
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": True,
"batch_norm": False,
"head": True,
@@ -19,6 +20,7 @@
critic_net_conf = {
"hidden_dims": [256, 128, 64],
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": False,
"head": True,
1 change: 1 addition & 0 deletions examples/vm_scheduling/rl/algorithms/dqn.py
@@ -14,6 +14,7 @@
q_net_conf = {
"hidden_dims": [64, 128, 256],
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": False,
"skip_connection": False,
26 changes: 19 additions & 7 deletions maro/rl/model/fc_block.py
@@ -39,7 +39,8 @@ def __init__(
input_dim: int,
output_dim: int,
hidden_dims: List[int],
activation: Optional[Type[torch.nn.Module]] = nn.ReLU,
activation: Optional[Type[torch.nn.Module]] = None,
output_activation: Optional[Type[torch.nn.Module]] = None,
head: bool = False,
softmax: bool = False,
batch_norm: bool = False,
@@ -54,7 +55,8 @@ def __init__(
self._output_dim = output_dim

# network features
self._activation = activation() if activation else None
self._activation = activation if activation else None
self._output_activation = output_activation if output_activation else None
self._head = head
self._softmax = nn.Softmax(dim=1) if softmax else None
self._batch_norm = batch_norm
@@ -70,9 +72,13 @@ def __init__(

# build the net
dims = [self._input_dim] + self._hidden_dims
layers = [self._build_layer(in_dim, out_dim) for in_dim, out_dim in zip(dims, dims[1:])]
layers = [
self._build_layer(in_dim, out_dim, activation=self._activation) for in_dim, out_dim in zip(dims, dims[1:])
]
# top layer
layers.append(self._build_layer(dims[-1], self._output_dim, head=self._head))
layers.append(
self._build_layer(dims[-1], self._output_dim, head=self._head, activation=self._output_activation),
)

self._net = nn.Sequential(*layers)

@@ -101,7 +107,13 @@ def input_dim(self) -> int:
def output_dim(self) -> int:
return self._output_dim

def _build_layer(self, input_dim: int, output_dim: int, head: bool = False) -> nn.Module:
def _build_layer(
self,
input_dim: int,
output_dim: int,
head: bool = False,
activation: Type[torch.nn.Module] = None,
) -> nn.Module:
"""Build a basic layer.
BN -> Linear -> Activation -> Dropout
@@ -110,8 +122,8 @@ def _build_layer(self, input_dim: int, output_dim: int, head: bool = False) -> nn.Module:
if self._batch_norm:
components.append(("batch_norm", nn.BatchNorm1d(input_dim)))
components.append(("linear", nn.Linear(input_dim, output_dim)))
if not head and self._activation is not None:
components.append(("activation", self._activation))
if not head and activation is not None:
components.append(("activation", activation()))
if not head and self._dropout_p:
components.append(("dropout", nn.Dropout(p=self._dropout_p)))
return nn.Sequential(OrderedDict(components))
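The net effect of the `fc_block.py` change: hidden layers and the output layer can now use different activations, whereas previously a single `activation` (defaulting to `nn.ReLU`) was applied to every non-head layer. A small sketch of the new behavior, with made-up dimensions:

```python
import torch

from maro.rl.model.fc_block import FullyConnected

# Hidden layers use ReLU; the output layer gets a Tanh via the new
# `output_activation` argument (applied because `head` defaults to False).
net = FullyConnected(
    input_dim=8,
    output_dim=2,
    hidden_dims=[64, 64],
    activation=torch.nn.ReLU,
    output_activation=torch.nn.Tanh,
)
out = net(torch.rand(16, 8))  # shape (16, 2), values in (-1, 1)
```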
2 changes: 2 additions & 0 deletions tests/rl/algorithms/__init__.py
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
97 changes: 97 additions & 0 deletions tests/rl/algorithms/ac.py
@@ -0,0 +1,97 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Tuple

import numpy as np
import torch
from torch.distributions import Normal
from torch.optim import Adam

from maro.rl.model import ContinuousACBasedNet, VNet
from maro.rl.model.fc_block import FullyConnected
from maro.rl.policy import ContinuousRLPolicy
from maro.rl.training.algorithms import ActorCriticParams, ActorCriticTrainer

actor_net_conf = {
"hidden_dims": [64, 64],
"activation": torch.nn.Tanh,
}
critic_net_conf = {
"hidden_dims": [64, 64],
"activation": torch.nn.Tanh,
}
actor_learning_rate = 3e-4
critic_learning_rate = 1e-3


class MyContinuousACBasedNet(ContinuousACBasedNet):
def __init__(self, state_dim: int, action_dim: int) -> None:
super(MyContinuousACBasedNet, self).__init__(state_dim=state_dim, action_dim=action_dim)

log_std = -0.5 * np.ones(action_dim, dtype=np.float32)
self._log_std = torch.nn.Parameter(torch.as_tensor(log_std))
self._mu_net = FullyConnected(
input_dim=state_dim,
hidden_dims=actor_net_conf["hidden_dims"],
output_dim=action_dim,
activation=actor_net_conf["activation"],
)
self._optim = Adam(self.parameters(), lr=actor_learning_rate)

def _get_actions_with_logps_impl(self, states: torch.Tensor, exploring: bool) -> Tuple[torch.Tensor, torch.Tensor]:
distribution = self._distribution(states)
actions = distribution.sample()
logps = distribution.log_prob(actions).sum(axis=-1)
return actions, logps

def _get_states_actions_logps_impl(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
distribution = self._distribution(states)
logps = distribution.log_prob(actions).sum(axis=-1)
return logps

def _distribution(self, states: torch.Tensor) -> Normal:
mu = self._mu_net(states.float())
std = torch.exp(self._log_std)
return Normal(mu, std)


class MyVCriticNet(VNet):
def __init__(self, state_dim: int) -> None:
super(MyVCriticNet, self).__init__(state_dim=state_dim)
self._critic = FullyConnected(
input_dim=state_dim,
output_dim=1,
hidden_dims=critic_net_conf["hidden_dims"],
activation=critic_net_conf["activation"],
)
self._optim = Adam(self._critic.parameters(), lr=critic_learning_rate)

def _get_v_values(self, states: torch.Tensor) -> torch.Tensor:
return self._critic(states.float()).squeeze(-1)


def get_ac_policy(
name: str,
action_lower_bound: list,
action_upper_bound: list,
gym_state_dim: int,
gym_action_dim: int,
) -> ContinuousRLPolicy:
return ContinuousRLPolicy(
name=name,
action_range=(action_lower_bound, action_upper_bound),
policy_net=MyContinuousACBasedNet(gym_state_dim, gym_action_dim),
)


def get_ac_trainer(name: str, state_dim: int) -> ActorCriticTrainer:
return ActorCriticTrainer(
name=name,
reward_discount=0.99,
params=ActorCriticParams(
get_v_critic_net_func=lambda: MyVCriticNet(state_dim),
grad_iters=80,
lam=0.97,
),
)
21 changes: 21 additions & 0 deletions tests/rl/algorithms/ppo.py
@@ -0,0 +1,21 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from maro.rl.training.algorithms import PPOParams, PPOTrainer

from .ac import MyVCriticNet, get_ac_policy

get_ppo_policy = get_ac_policy


def get_ppo_trainer(name: str, state_dim: int) -> PPOTrainer:
return PPOTrainer(
name=name,
reward_discount=0.99,
params=PPOParams(
get_v_critic_net_func=lambda: MyVCriticNet(state_dim),
grad_iters=80,
lam=0.97,
clip_ratio=0.2,
),
)
104 changes: 104 additions & 0 deletions tests/rl/algorithms/sac.py
@@ -0,0 +1,104 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Tuple

import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Normal
from torch.optim import Adam

from maro.rl.model import ContinuousSACNet, QNet
from maro.rl.model.fc_block import FullyConnected
from maro.rl.policy import ContinuousRLPolicy
from maro.rl.training.algorithms import SoftActorCriticParams, SoftActorCriticTrainer

actor_net_conf = {
"hidden_dims": [64, 64],
"activation": torch.nn.Tanh,
}
critic_net_conf = {
"hidden_dims": [64, 64],
"activation": torch.nn.Tanh,
}
actor_learning_rate = 3e-4
critic_learning_rate = 1e-3

LOG_STD_MAX = 2
LOG_STD_MIN = -20


class MyContinuousSACNet(ContinuousSACNet):
def __init__(self, state_dim: int, action_dim: int, action_limit: float) -> None:
super(MyContinuousSACNet, self).__init__(state_dim=state_dim, action_dim=action_dim)

self._net = FullyConnected(
input_dim=state_dim,
output_dim=actor_net_conf["hidden_dims"][-1],
hidden_dims=actor_net_conf["hidden_dims"][:-1],
activation=actor_net_conf["activation"],
output_activation=actor_net_conf["activation"],
)
self._mu = torch.nn.Linear(actor_net_conf["hidden_dims"][-1], action_dim)
self._log_std = torch.nn.Linear(actor_net_conf["hidden_dims"][-1], action_dim)
self._action_limit = action_limit
self._optim = Adam(self.parameters(), lr=actor_learning_rate)

def _get_actions_with_logps_impl(self, states: torch.Tensor, exploring: bool) -> Tuple[torch.Tensor, torch.Tensor]:
net_out = self._net(states.float())
mu = self._mu(net_out)
log_std = torch.clamp(self._log_std(net_out), LOG_STD_MIN, LOG_STD_MAX)
std = torch.exp(log_std)

pi_distribution = Normal(mu, std)
pi_action = pi_distribution.rsample() if exploring else mu

logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
logp_pi -= (2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(axis=1)

pi_action = torch.tanh(pi_action) * self._action_limit

return pi_action, logp_pi


class MyQCriticNet(QNet):
def __init__(self, state_dim: int, action_dim: int) -> None:
super(MyQCriticNet, self).__init__(state_dim=state_dim, action_dim=action_dim)
self._critic = FullyConnected(
input_dim=state_dim + action_dim,
output_dim=1,
hidden_dims=critic_net_conf["hidden_dims"],
activation=critic_net_conf["activation"],
)
self._optim = Adam(self._critic.parameters(), lr=critic_learning_rate)

def _get_q_values(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
return self._critic(torch.cat([states, actions], dim=1).float()).squeeze(-1)


def get_sac_policy(
name: str,
action_lower_bound: list,
action_upper_bound: list,
gym_state_dim: int,
gym_action_dim: int,
action_limit: float,
) -> ContinuousRLPolicy:
return ContinuousRLPolicy(
name=name,
action_range=(action_lower_bound, action_upper_bound),
policy_net=MyContinuousSACNet(gym_state_dim, gym_action_dim, action_limit),
)


def get_sac_trainer(name: str, state_dim: int, action_dim: int) -> SoftActorCriticTrainer:
return SoftActorCriticTrainer(
name=name,
reward_discount=0.99,
params=SoftActorCriticParams(
get_q_critic_net_func=lambda: MyQCriticNet(state_dim, action_dim),
num_epochs=10,
n_start_train=10000,
),
)
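For illustration only, a hypothetical way these factory functions could be wired up for a Pendulum-like continuous-control task; the bounds and dimensions below are made up and not taken from the test suite:

```python
from tests.rl.algorithms.sac import get_sac_policy, get_sac_trainer

# Hypothetical setting: 3-dim observations, 1-dim actions bounded in [-2, 2].
policy = get_sac_policy(
    name="sac_policy",
    action_lower_bound=[-2.0],
    action_upper_bound=[2.0],
    gym_state_dim=3,
    gym_action_dim=1,
    action_limit=2.0,
)
trainer = get_sac_trainer(name="sac", state_dim=3, action_dim=1)
# The policy/trainer pair would then be handed to the RL training workflow;
# that wiring lives outside this file.
```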