change rms 2 tfrms switch in vec_normalize to be more explicit (#886)

* change rms 2 tfrms switch in vec_normalize to be more explicit * modify the vec_normalize / use_tf logic a little bit * typo * use_tf = False by default
openai · Apr 26, 2019 · a07fad9 · a07fad9
1 parent 5d8041d
commit a07fad9
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -109,7 +109,7 @@ This should get to the mean reward per episode about 20. To load and visualize t
 python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
 ```
 
-*NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, mean and std of environment normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way - hence not including it by default
+*NOTE:* Mujoco environments require normalization to work properly, so we wrap them with VecNormalize wrapper. Currently, to ensure the models are saved with normalization (so that trained models can be restored and run without further training) the normalization coefficients are saved as tensorflow variables. This can decrease the performance somewhat, so if you require high-throughput steps with Mujoco and do not need saving/restoring the models, it may make sense to use numpy normalization instead. To do that, set 'use_tf=False` in [baselines/run.py](baselines/run.py#L116). 
 
 ## Loading and vizualizing learning curves and other training metrics
 See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data. 

diff --git a/baselines/common/vec_env/vec_normalize.py b/baselines/common/vec_env/vec_normalize.py
@@ -1,18 +1,21 @@
 from . import VecEnvWrapper
-from baselines.common.running_mean_std import RunningMeanStd
+from baselines.common.running_mean_std import TfRunningMeanStd, RunningMeanStd
 import numpy as np
 
-
 class VecNormalize(VecEnvWrapper):
     """
     A vectorized wrapper that normalizes the observations
     and returns from an environment.
     """
 
-    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
+    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, use_tf=False):
         VecEnvWrapper.__init__(self, venv)
-        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
-        self.ret_rms = RunningMeanStd(shape=()) if ret else None
+        if use_tf:
+            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None
+            self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
+        else:
+            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
+            self.ret_rms = RunningMeanStd(shape=()) if ret else None
         self.clipob = clipob
         self.cliprew = cliprew
         self.ret = np.zeros(self.num_envs)

diff --git a/baselines/run.py b/baselines/run.py
@@ -113,7 +113,7 @@ def build_env(args):
         env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale, flatten_dict_observations=flatten_dict_observations)
 
         if env_type == 'mujoco':
-            env = VecNormalize(env)
+            env = VecNormalize(env, use_tf=True)
 
     return env