
Commit

Update RL Benchmarks (#584)
* update plot func for rl tests

* Refine seed setting logic

* Refine metrics logic; add warmup to ddpg.

* Complete ddpg config

* Minor refinement of GymEnvSampler and plot.py

* update rl benchmark performance results

* Lint

---------

Co-authored-by: Huoran Li <huoranli@microsoft.com>
Co-authored-by: Huoran Li <huo53926@126.com>
3 people committed Mar 20, 2023
1 parent f42d5b7 commit d859a4b
Showing 18 changed files with 112 additions and 63 deletions.
5 changes: 3 additions & 2 deletions maro/cli/local/commands.py
@@ -61,7 +61,7 @@ def get_redis_conn(port=None):


# Functions executed on CLI commands
def run(conf_path: str, containerize: bool = False, evaluate_only: bool = False, **kwargs):
def run(conf_path: str, containerize: bool = False, seed: int = None, evaluate_only: bool = False, **kwargs):
# Load job configuration file
parser = ConfigParser(conf_path)
if containerize:
@@ -71,13 +71,14 @@ def run(conf_path: str, containerize: bool = False, evaluate_only: bool = False,
LOCAL_MARO_ROOT,
DOCKERFILE_PATH,
DOCKER_IMAGE_NAME,
seed=seed,
evaluate_only=evaluate_only,
)
except KeyboardInterrupt:
stop_rl_job_with_docker_compose(parser.config["job"], LOCAL_MARO_ROOT)
else:
try:
start_rl_job(parser, LOCAL_MARO_ROOT, evaluate_only=evaluate_only)
start_rl_job(parser, LOCAL_MARO_ROOT, seed=seed, evaluate_only=evaluate_only)
except KeyboardInterrupt:
sys.exit(1)

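Note on the change above: `seed` is now threaded through the local `run` entry point and forwarded to whichever launcher is selected (plain processes or docker-compose). A minimal usage sketch, assuming MARO is installed; the config path below is only a placeholder:

```python
# Sketch only: the config path is illustrative, not a guaranteed file.
from maro.cli.local.commands import run

run(
    conf_path="tests/rl/tasks/ppo/config.yml",  # placeholder job config
    containerize=False,                         # run local processes instead of docker-compose
    seed=42,                                    # forwarded as "--seed 42" to each workflow process
    evaluate_only=False,
)
```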
12 changes: 9 additions & 3 deletions maro/cli/local/utils.py
@@ -4,7 +4,7 @@
import os
import subprocess
from copy import deepcopy
from typing import List
from typing import List, Optional

import docker
import yaml
@@ -110,12 +110,15 @@ def exec(cmd: str, env: dict, debug: bool = False) -> subprocess.Popen:
def start_rl_job(
parser: ConfigParser,
maro_root: str,
seed: Optional[int],
evaluate_only: bool,
background: bool = False,
) -> List[subprocess.Popen]:
procs = [
exec(
f"python {script}" + ("" if not evaluate_only else " --evaluate_only"),
f"python {script}"
+ ("" if not evaluate_only else " --evaluate_only")
+ ("" if seed is None else f" --seed {seed}"),
format_env_vars({**env, "PYTHONPATH": maro_root}, mode="proc"),
debug=not background,
)
Expand Down Expand Up @@ -169,6 +172,7 @@ def start_rl_job_with_docker_compose(
context: str,
dockerfile_path: str,
image_name: str,
seed: Optional[int],
evaluate_only: bool,
) -> None:
common_spec = {
@@ -185,7 +189,9 @@
**deepcopy(common_spec),
**{
"container_name": component,
"command": f"python3 {script}" + ("" if not evaluate_only else " --evaluate_only"),
"command": f"python3 {script}"
+ ("" if not evaluate_only else " --evaluate_only")
+ ("" if seed is None else f" --seed {seed}"),
"environment": format_env_vars(env, mode="docker-compose"),
},
}
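Both launchers assemble the worker command the same way: start from the base interpreter call and append `--evaluate_only` and `--seed` only when they are set. A standalone sketch of that flag-assembly pattern (plain Python, not MARO code):

```python
from typing import Optional


def build_worker_command(script: str, evaluate_only: bool, seed: Optional[int]) -> str:
    # Mirrors the pattern used in start_rl_job / start_rl_job_with_docker_compose above.
    return (
        f"python {script}"
        + ("" if not evaluate_only else " --evaluate_only")
        + ("" if seed is None else f" --seed {seed}")
    )


print(build_worker_command("main.py", evaluate_only=False, seed=None))  # python main.py
print(build_worker_command("main.py", evaluate_only=True, seed=42))     # python main.py --evaluate_only --seed 42
```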
9 changes: 7 additions & 2 deletions maro/rl/workflows/main.py
@@ -15,7 +15,7 @@
from maro.rl.utils.common import float_or_none, get_env, int_or_none, list_or_none
from maro.rl.utils.training import get_latest_ep
from maro.rl.workflows.callback import CallbackManager, Checkpoint, EarlyStopping, MetricsRecorder
from maro.utils import LoggerV2
from maro.utils import LoggerV2, set_seeds


class WorkflowEnvAttributes:
@@ -85,6 +85,7 @@ def __init__(self) -> None:
def _get_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="MARO RL workflow parser")
parser.add_argument("--evaluate_only", action="store_true", help="Only run evaluation part of the workflow")
parser.add_argument("--seed", type=int, help="The random seed set before running this job")
return parser.parse_args()


Expand Down Expand Up @@ -243,9 +244,13 @@ def evaluate_only_workflow(rl_component_bundle: RLComponentBundle, env_attr: Wor


if __name__ == "__main__":
args = _get_args()
if args.seed is not None:
set_seeds(seed=args.seed)

scenario_path = get_env("SCENARIO_PATH")
scenario_path = os.path.normpath(scenario_path)
sys.path.insert(0, os.path.dirname(scenario_path))
module = importlib.import_module(os.path.basename(scenario_path))

main(getattr(module, "rl_component_bundle"), WorkflowEnvAttributes(), args=_get_args())
main(getattr(module, "rl_component_bundle"), WorkflowEnvAttributes(), args=args)
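The hunk only imports `set_seeds` from `maro.utils`; its body is not part of this diff. A rough sketch of what such a helper typically does in a PyTorch-based stack, given as an assumption rather than the actual MARO implementation:

```python
import random

import numpy as np
import torch


def set_seeds_sketch(seed: int) -> None:
    # Hypothetical stand-in for maro.utils.set_seeds: seed every RNG the workflow may touch.
    random.seed(seed)                 # Python built-in RNG
    np.random.seed(seed)              # NumPy (used by the Gym env sampler)
    torch.manual_seed(seed)           # PyTorch CPU RNG
    torch.cuda.manual_seed_all(seed)  # all CUDA devices; a no-op when CUDA is unavailable
```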
48 changes: 27 additions & 21 deletions tests/rl/gym_wrapper/env_sampler.py
@@ -68,26 +68,32 @@ def _post_eval_step(self, cache_element: CacheElement) -> None:
self._eval_rewards.append((len(rewards), np.sum(rewards)))

def post_collect(self, info_list: list, ep: int) -> None:
cur = {
"n_steps": sum([n for n, _ in self._sample_rewards]),
"n_segment": len(self._sample_rewards),
"avg_reward": np.mean([r for _, r in self._sample_rewards]),
"avg_n_steps": np.mean([n for n, _ in self._sample_rewards]),
"max_n_steps": np.max([n for n, _ in self._sample_rewards]),
"n_interactions": self._total_number_interactions,
}
self.metrics.update(cur)
# clear validation metrics
self.metrics = {k: v for k, v in self.metrics.items() if not k.startswith("val/")}
self._sample_rewards.clear()
if len(self._sample_rewards) > 0:
cur = {
"n_steps": sum([n for n, _ in self._sample_rewards]),
"n_segment": len(self._sample_rewards),
"avg_reward": np.mean([r for _, r in self._sample_rewards]),
"avg_n_steps": np.mean([n for n, _ in self._sample_rewards]),
"max_n_steps": np.max([n for n, _ in self._sample_rewards]),
"n_interactions": self._total_number_interactions,
}
self.metrics.update(cur)
# clear validation metrics
self.metrics = {k: v for k, v in self.metrics.items() if not k.startswith("val/")}
self._sample_rewards.clear()
else:
self.metrics = {"n_interactions": self._total_number_interactions}

def post_evaluate(self, info_list: list, ep: int) -> None:
cur = {
"val/n_steps": sum([n for n, _ in self._eval_rewards]),
"val/n_segment": len(self._eval_rewards),
"val/avg_reward": np.mean([r for _, r in self._eval_rewards]),
"val/avg_n_steps": np.mean([n for n, _ in self._eval_rewards]),
"val/max_n_steps": np.max([n for n, _ in self._eval_rewards]),
}
self.metrics.update(cur)
self._eval_rewards.clear()
if len(self._eval_rewards) > 0:
cur = {
"val/n_steps": sum([n for n, _ in self._eval_rewards]),
"val/n_segment": len(self._eval_rewards),
"val/avg_reward": np.mean([r for _, r in self._eval_rewards]),
"val/avg_n_steps": np.mean([n for n, _ in self._eval_rewards]),
"val/max_n_steps": np.max([n for n, _ in self._eval_rewards]),
}
self.metrics.update(cur)
self._eval_rewards.clear()
else:
self.metrics = {k: v for k, v in self.metrics.items() if not k.startswith("val/")}
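The new `if len(...) > 0` guards matter because a collection or evaluation pass can end without a single finished trajectory, leaving the reward list empty; NumPy reductions over an empty list then return NaN or raise. A quick illustration of the failure mode the guard avoids:

```python
import numpy as np

sample_rewards = []  # no finished trajectories in this segment

# np.mean over an empty sequence returns nan (and emits a RuntimeWarning) ...
print(np.mean([r for _, r in sample_rewards]))  # nan

# ... while np.max raises outright.
try:
    np.max([n for n, _ in sample_rewards])
except ValueError as err:
    print(err)  # zero-size array to reduction operation maximum which has no identity
```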
Binary file modified tests/rl/log/Ant_1.png
Binary file modified tests/rl/log/Ant_11.png
Binary file modified tests/rl/log/HalfCheetah_1.png
Binary file modified tests/rl/log/HalfCheetah_11.png
Binary file modified tests/rl/log/Hopper_1.png
Binary file modified tests/rl/log/Hopper_11.png
Binary file modified tests/rl/log/Swimmer_1.png
Binary file modified tests/rl/log/Swimmer_11.png
Binary file modified tests/rl/log/Walker2d_1.png
Binary file modified tests/rl/log/Walker2d_11.png
31 changes: 22 additions & 9 deletions tests/rl/performance.md
@@ -1,13 +1,14 @@
# Performance for Gym Task Suite

We benchmarked the MARO RL Toolkit implementation in Gym task suite.
Some are compared to the benchmarks in [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/spinningup/bench.html#).
Limited by the environment version difference<!-- and some others?-->,
there may be some gaps between the performance here and that in Spinning Up benchmarks.
We benchmarked the MARO RL Toolkit implementation in the Gym task suite. Some of the results are compared to the
benchmarks in [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/spinningup/bench.html#). We tried to align
the hyper-parameters with those benchmarks, but because of environment version differences there may be some gaps
between the performance here and that in the Spinning Up benchmarks. Generally speaking, the performance is comparable.

## Experimental Setting

The hyper-parameters are set to align with those used in [Spinning Up](https://spinningup.openai.com/en/latest/spinningup/bench.html#experiment-details):
The hyper-parameters are set to align with those used in
[Spinning Up](https://spinningup.openai.com/en/latest/spinningup/bench.html#experiment-details):

**Batch Size**:

@@ -24,13 +25,25 @@ The hyper-parameters are set to align with those used in [Spinning Up](https://s
- For on-policy algorithms: measured as the average trajectory return across the batch collected at each epoch;
- For off-policy algorithms: measured once every 10,000 steps by running the deterministic policy (or, in the case of SAC, the mean policy) without action noise for ten trajectories, and reporting the average return over those test trajectories;

**Total timesteps**: set to 3M for all task suites and algorithms.
**Total timesteps**: set to 4M for all task suites and algorithms.

Other parameters are set to the values in *tests/rl/tasks/*.
More details about the parameters can be found in *tests/rl/tasks/*.

## Performance Comparison
## Performance

Five environments from the MuJoCo Gym task suite are reported in Spinning Up, they are: HalfCheetah, Hopper, Walker2d, Swimmer, and Ant.
Five environments from the MuJoCo Gym task suite are reported in Spinning Up: HalfCheetah, Hopper, Walker2d,
Swimmer, and Ant. The MARO RL benchmark experiments were conducted at commit id ee25ce1e97.
The commands used are:

```sh
# Step 1: Set up the MuJoCo Environment in file tests/rl/gym_wrapper/common.py

# Step 2: Use the command below to run experiment with ALGORITHM (ddpg, ppo, sac) and random seed SEED.
python tests/rl/run.py tests/rl/tasks/ALGORITHM/config.yml --seed SEED

# Step 3: Plot performance curves by environment with specific smooth window size WINDOWSIZE.
python tests/rl/plot.py --smooth WINDOWSIZE
```

| **Env** | **Spinning Up** | **MARO RL w/o Smooth** | **MARO RL w/ Smooth** |
|:---------------:|:---------------:|:----------------------:|:---------------------:|
44 changes: 24 additions & 20 deletions tests/rl/plot.py
@@ -54,32 +54,32 @@ def get_on_policy_data(log_dir: str) -> Tuple[np.ndarray, np.ndarray]:


def plot_performance_curves(title: str, dir_names: List[str], smooth_window_size: int) -> None:
for name in dir_names:
log_dir = os.path.join(LOG_DIR, name)
if not os.path.exists(log_dir):
for algorithm in color_map.keys():
if algorithm in ["ddpg", "sac", "td3"]:
func = get_off_policy_data
elif algorithm in ["ppo", "vpg"]:
func = get_on_policy_data

log_dirs = [os.path.join(LOG_DIR, name) for name in dir_names if algorithm in name]
series = [func(log_dir) for log_dir in log_dirs if os.path.exists(log_dir)]
if len(series) == 0:
continue

if "ppo" in name:
algorithm = "ppo"
func = get_on_policy_data
elif "sac" in name:
algorithm = "sac"
func = get_off_policy_data
elif "ddpg" in name:
algorithm = "ddpg"
func = get_off_policy_data
else:
raise "unknown algorithm name"
x = series[0][0]
assert all(len(_x) == len(x) for _x, _ in series), f"Input data should share the same length!"
ys = np.array([smooth(y, smooth_window_size) for _, y in series])
y_mean = np.mean(ys, axis=0)
y_std = np.std(ys, axis=0)

x, y = func(log_dir)
y = smooth(y, smooth_window_size)
plt.plot(x, y, label=algorithm, color=color_map[algorithm])
plt.plot(x, y_mean, label=algorithm, color=color_map[algorithm])
plt.fill_between(x, y_mean - y_std, y_mean + y_std, color=color_map[algorithm], alpha=0.2)

plt.legend()
plt.grid()
plt.title(title)
plt.xlabel("Total Env Interactions")
plt.ylabel(f"Average Trajectory Return (moving average with window size = {smooth_window_size})")
plt.savefig(os.path.join(LOG_DIR, f"{title}_{smooth_window_size}.png"))
plt.ylabel(f"Average Trajectory Return \n(moving average with window size = {smooth_window_size})")
plt.savefig(os.path.join(LOG_DIR, f"{title}_{smooth_window_size}.png"), bbox_inches="tight")
plt.close()


@@ -91,6 +91,10 @@ def plot_performance_curves(title: str, dir_names: List[str], smooth_window_size
for env_name in ["HalfCheetah", "Hopper", "Walker2d", "Swimmer", "Ant"]:
plot_performance_curves(
title=env_name,
dir_names=[f"{algorithm}_{env_name.lower()}" for algorithm in ["ppo", "sac", "ddpg"]],
dir_names=[
f"{algorithm}_{env_name.lower()}_{seed}"
for algorithm in ["ppo", "sac", "ddpg"]
for seed in [42, 729, 1024, 2023, 3500]
],
smooth_window_size=args.smooth,
)
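The refactored `plot_performance_curves` now groups log directories by algorithm, averages the smoothed return curves of all seeds, and shades one standard deviation around the mean. The `smooth` helper itself is outside this hunk; the sketch below assumes a simple moving average (the real helper may differ) and shows the aggregation step in isolation:

```python
import numpy as np


def smooth(y: np.ndarray, window_size: int) -> np.ndarray:
    # Assumed behaviour: moving average that preserves the curve length.
    kernel = np.ones(window_size) / window_size
    return np.convolve(y, kernel, mode="same")


# One return curve per seed, all on the same x grid (the assert in plot.py enforces this).
x = np.arange(0, 100_000, 10_000)
per_seed_returns = [np.linspace(0.0, 100.0, len(x)) + np.random.randn(len(x)) for _ in range(5)]

ys = np.array([smooth(y, window_size=3) for y in per_seed_returns])
y_mean = ys.mean(axis=0)  # solid line in the plot
y_std = ys.std(axis=0)    # half-width of the shaded band (mean ± std)
print(y_mean.shape, y_std.shape)  # (10,) (10,)
```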
5 changes: 1 addition & 4 deletions tests/rl/run.py
@@ -4,7 +4,6 @@
import argparse

from maro.cli.local.commands import run
from maro.utils.utils import set_seeds


def get_args() -> argparse.Namespace:
@@ -17,6 +16,4 @@ def get_args() -> argparse.Namespace:

if __name__ == "__main__":
args = get_args()
if args.seed is not None:
set_seeds(seed=args.seed)
run(conf_path=args.conf_path, containerize=False, evaluate_only=args.evaluate_only)
run(conf_path=args.conf_path, containerize=False, seed=args.seed, evaluate_only=args.evaluate_only)
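Seeding now happens inside the workflow process (`main.py`) rather than in this launcher, so `run.py` simply forwards `args.seed`. The full `get_args()` body is outside this hunk; it presumably already defines the flags used here, roughly like the sketch below (an assumption based on the commands in performance.md, not the exact test code):

```python
import argparse


def get_args_sketch() -> argparse.Namespace:
    # Hypothetical reconstruction of tests/rl/run.py's get_args().
    parser = argparse.ArgumentParser(description="Run an RL benchmark task")
    parser.add_argument("conf_path", help="Path of the job config file, e.g. tests/rl/tasks/ppo/config.yml")
    parser.add_argument("--seed", type=int, default=None, help="Random seed forwarded to the workflow processes")
    parser.add_argument("--evaluate_only", action="store_true", help="Only run the evaluation part of the workflow")
    return parser.parse_args()
```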
21 changes: 19 additions & 2 deletions tests/rl/tasks/ddpg/__init__.py
@@ -2,6 +2,7 @@
# Licensed under the MIT license.

import torch
from gym import spaces
from torch.optim import Adam

from maro.rl.model import QNet
@@ -10,12 +11,14 @@
from maro.rl.policy import ContinuousRLPolicy
from maro.rl.rl_component.rl_component_bundle import RLComponentBundle
from maro.rl.training.algorithms import DDPGParams, DDPGTrainer
from maro.rl.utils import ndarray_to_tensor

from tests.rl.gym_wrapper.common import (
action_limit,
action_lower_bound,
action_upper_bound,
gym_action_dim,
gym_action_space,
gym_state_dim,
learn_env,
num_agents,
@@ -37,7 +40,14 @@


class MyContinuousDDPGNet(ContinuousDDPGNet):
def __init__(self, state_dim: int, action_dim: int, action_limit: float, noise_scale: float = 0.1) -> None:
def __init__(
self,
state_dim: int,
action_dim: int,
action_limit: float,
action_space: spaces.Space,
noise_scale: float = 0.1,
) -> None:
super(MyContinuousDDPGNet, self).__init__(state_dim=state_dim, action_dim=action_dim)

self._net = FullyConnected(
@@ -50,6 +60,7 @@ def __init__(self, state_dim: int, action_dim: int, action_limit: float, noise_s
self._optim = Adam(self._net.parameters(), lr=critic_learning_rate)
self._action_limit = action_limit
self._noise_scale = noise_scale
self._action_space = action_space

def _get_actions_impl(self, states: torch.Tensor, exploring: bool) -> torch.Tensor:
action = self._net(states) * self._action_limit
@@ -59,6 +70,11 @@ def _get_actions_impl(self, states: torch.Tensor, exploring: bool) -> torch.Tens
action = torch.clamp(action, -self._action_limit, self._action_limit)
return action

def _get_random_actions_impl(self, states: torch.Tensor) -> torch.Tensor:
return torch.stack(
[ndarray_to_tensor(self._action_space.sample(), device=self._device) for _ in range(states.shape[0])],
)


class MyQCriticNet(QNet):
def __init__(self, state_dim: int, action_dim: int) -> None:
@@ -86,7 +102,8 @@ def get_ddpg_policy(
return ContinuousRLPolicy(
name=name,
action_range=(action_lower_bound, action_upper_bound),
policy_net=MyContinuousDDPGNet(gym_state_dim, gym_action_dim, action_limit),
policy_net=MyContinuousDDPGNet(gym_state_dim, gym_action_dim, action_limit, gym_action_space),
warmup=10000,
)


Expand Down
