## 1. 模型的保存与读取

In [3]:
import warnings
warnings.filterwarnings('ignore') # 忽略warning错误信息

import gym
import os
from stable_baselines import A2C, SAC, PPO2, TD3

In [6]:
# Create the directory used to store saved models (no error if it already exists)
save_dir = "./save/"
os.makedirs(save_dir, exist_ok=True)

# Build the checkpoint path once with os.path.join: save_dir already ends with a
# separator, so string concatenation with "/PPO2_tutorial" would produce
# "./save//PPO2_tutorial".
ppo2_path = os.path.join(save_dir, "PPO2_tutorial")

# Create the model and train it for 8000 timesteps
model = PPO2('MlpPolicy', 'Pendulum-v0', verbose=0).learn(8000)
# Save the model (PPO2_tutorial.zip)
model.save(ppo2_path)


# Verify that the loaded model behaves like the saved one
# sample an observation from the environment
obs = model.env.observation_space.sample()

# Check prediction before saving
print("pre saved", model.predict(obs, deterministic=True))

del model # delete trained model to demonstrate loading

loaded_model = PPO2.load(ppo2_path)
# Check that the prediction is the same after loading (for the same observation)
print("loaded", loaded_model.predict(obs, deterministic=True))

pre saved (array([-0.5466813], dtype=float32), None)
loaded (array([-0.5466813], dtype=float32), None)


保存和读取模型最大的好处是：可以边训练、边保存，之后再读取模型继续训练，防止内存数据溢出。

In [7]:
import os
from stable_baselines.common.vec_env import DummyVecEnv

# Create the directory used to store saved models (no error if it already exists)
save_dir = "./save/"
os.makedirs(save_dir, exist_ok=True)

# Build the checkpoint path once with os.path.join: save_dir already ends with a
# separator, so string concatenation with "/A2C_tutorial" would produce
# "./save//A2C_tutorial".
a2c_path = os.path.join(save_dir, "A2C_tutorial")

# Define the model (non-default gamma and n_steps) and train it for 8000 timesteps
model = A2C('MlpPolicy', 'Pendulum-v0', verbose=0, gamma=0.9, n_steps=20).learn(8000)
# Save the model (A2C_tutorial.zip)
model.save(a2c_path)

# Delete the original model
del model 

# Load the saved model
loaded_model = A2C.load(a2c_path, verbose=1)

# show the save hyperparameters
print("loaded:", "gamma =", loaded_model.gamma, "n_steps =", loaded_model.n_steps)

# Attach an environment to the loaded model
# as the environment is not serializable, we need to set a new instance of the environment
loaded_model.set_env(DummyVecEnv([lambda: gym.make('Pendulum-v0')]))
# and continue training
loaded_model.learn(8000)




Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Loading a model without an environment, this model cannot be trained until it has a valid environment.
loaded: gamma = 0.9 n_steps = 20
---------------------------------
| explained_variance | 0.00385  |
| fps                | 95       |
| nupdates           | 1        |
| policy_entropy     | 1.42     |
| total_timesteps    | 20       |
| value_loss         | 3.26e+03 |
---------------------------------
---------------------------------
| explained_variance | -0.0332  |
| fps                | 984      |
| nupdates           | 100      |
| policy_entropy     | 1.42     |
| total_timesteps    | 2000     |
| value_loss         | 1.59e+03 |
---------------------------------
---------------------------------
| explained_variance | -0.012   |
| fps                | 999      |
| nupdates           | 200      |
| policy_entropy     | 1.42     |
| total_timesteps    | 4000  

<stable_baselines.a2c.a2c.A2C at 0x7f108c6435f8>

## 2 Gym and VecEnv wrappers

A gym wrapper follows the gym interface: it has a reset() and a step() method. Because a wrapper wraps an environment, we can access that environment through self.env, which allows us to interact with it easily without modifying the original env.

In [8]:
class CustomWrapper(gym.Wrapper):
    """
    Minimal wrapper template: reset() and step() are forwarded to the wrapped
    environment and their results are returned unchanged.

    :param env: (gym.Env) Gym environment that will be wrapped
    """

    def __init__(self, env):
        # Initialise the parent class so that self.env is available afterwards
        super().__init__(env)

    def reset(self):
        """
        Reset the wrapped environment.

        :return: the observation returned by the wrapped environment's reset()
        """
        initial_obs = self.env.reset()
        return initial_obs

    def step(self, action):
        """
        Forward one action to the wrapped environment.

        :param action: ([float] or int) Action taken by the agent
        :return: (np.ndarray, float, bool, dict) observation, reward, is the episode over?, additional informations
        """
        next_obs, step_reward, episode_over, extra = self.env.step(action)
        return next_obs, step_reward, episode_over, extra

### 2.1 First example: limit the episode length

In [9]:
class TimeLimitWrapper(gym.Wrapper):
    """
    End an episode once a fixed number of steps has been taken.

    :param env: (gym.Env) Gym environment that will be wrapped
    :param max_steps: (int) Max number of steps per episode
    """

    def __init__(self, env, max_steps=100):
        # Initialise the parent class so that self.env is available afterwards
        super().__init__(env)
        # Maximum number of steps allowed in one episode
        self.max_steps = max_steps
        # Number of steps taken so far in the current episode
        self.current_step = 0

    def reset(self):
        """
        Reset the step counter, then reset the wrapped environment.

        :return: the observation returned by the wrapped environment's reset()
        """
        self.current_step = 0
        return self.env.reset()

    def step(self, action):
        """
        Step the wrapped environment and force `done` once the step limit is reached.

        :param action: ([float] or int) Action taken by the agent
        :return: (np.ndarray, float, bool, dict) observation, reward, is the episode over?, additional informations
        """
        self.current_step += 1
        obs, reward, done, info = self.env.step(action)

        # Below the limit: pass the wrapped environment's result through untouched
        if self.current_step < self.max_steps:
            return obs, reward, done, info

        # Limit reached: overwrite the done signal and flag it in the info dict
        info['time_limit_reached'] = True
        return obs, reward, True, info

In [11]:
# 测试环境
from gym.envs.classic_control.pendulum import PendulumEnv

# The environment is instantiated directly because gym.make() would already
# wrap it in gym's own TimeLimit wrapper; here only our wrapper limits the episode.
env = TimeLimitWrapper(PendulumEnv(), max_steps=100)

obs = env.reset()
done = False
n_steps = 0
while True:
    if done:
        break
    random_action = env.action_space.sample()
    # The loop ends once env.step() reports done, which the wrapper forces
    # when the maximum number of steps is reached
    obs, reward, done, info = env.step(random_action)
    env.render()
    n_steps += 1

env.close()
print(n_steps, info)

100 {'time_limit_reached': True}


### 2.2 Second example: normalize actions

It is usually a good idea to normalize observations and actions before giving them to the agent; this prevents hard-to-debug issues.

In this example, we are going to normalize the action space of Pendulum-v0 so it lies in [-1, 1] instead of [-2, 2].

Note: here we are dealing with continuous actions, hence the gym.Box space

In [14]:
import numpy as np

class NormalizeActionWrapper(gym.Wrapper):
    """
    Expose a [-1, 1] action space to the agent and rescale each action to the
    wrapped environment's original [low, high] bounds before stepping it.

    :param env: (gym.Env) Gym environment that will be wrapped
    """
    def __init__(self, env):
        # Retrieve the action space
        action_space = env.action_space
        # Make sure the action space is continuous
        assert isinstance(action_space, gym.spaces.Box), "This wrapper only works with continuous action space (spaces.Box)"

        # Call the parent constructor first, so we can access self.env later
        super(NormalizeActionWrapper, self).__init__(env)

        # Retrieve the max/min values of the original action space
        self.low, self.high = action_space.low, action_space.high

        # Override the action space on the wrapper only, so all actions seen by
        # the agent lie in [-1, 1]. The wrapped env keeps its own action_space
        # untouched instead of being mutated as a side effect of wrapping it.
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=action_space.shape, dtype=np.float32)

    def rescale_action(self, scaled_action):
        """
        Rescale the action from [-1, 1] to [low, high]
        (no need for symmetric action space)

        :param scaled_action: (np.ndarray or sequence of float) action in [-1, 1]
        :return: (np.ndarray) action in the original [low, high] interval
        """
        # Accept plain lists/tuples as well as arrays: `[0.5] + 1.0` would raise
        scaled_action = np.asarray(scaled_action)
        return self.low + (0.5 * (scaled_action + 1.0) * (self.high -  self.low))

    def reset(self):
        """
        Reset the environment

        :return: the observation returned by the wrapped environment's reset()
        """
        return self.env.reset()

    def step(self, action):
        """
        :param action: ([float] or int) Action taken by the agent
        :return: (np.ndarray, float, bool, dict) observation, reward, is the episode over?, additional informations
        """
        # Rescale action from [-1, 1] to original [low, high] interval
        rescaled_action = self.rescale_action(action)
        obs, reward, done, info = self.env.step(rescaled_action)
        return obs, reward, done, info

In [15]:
# Test the wrapper

# Original environment: print the lower action bound, then ten sampled actions
original_env = gym.make("Pendulum-v0")

print(original_env.action_space.low)
for _ in range(10):
    sampled_action = original_env.action_space.sample()
    print(sampled_action)

# Wrapped environment: same printout, now through NormalizeActionWrapper
env = NormalizeActionWrapper(gym.make("Pendulum-v0"))

print(env.action_space.low)
for _ in range(10):
    sampled_action = env.action_space.sample()
    print(sampled_action)

[-2.]
[0.20659406]
[1.5130978]
[-1.0943062]
[1.9629356]
[1.7582728]
[-0.05007163]
[0.99274355]
[1.3490527]
[-1.7977108]
[0.24460626]
[-1.]
[0.2625217]
[-0.30736285]
[0.805443]
[0.38870916]
[-0.5412025]
[0.7188048]
[-0.3701165]
[0.09230722]
[-0.94432396]
[0.28304732]


### 2.3 Monitor wrapper 

We are going to use the Monitor wrapper of stable baselines, which allows us to monitor training stats (mean episode reward, mean episode length).

In [23]:
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv
org_env = gym.make('Pendulum-v0')

# Training output on the original (unmonitored) environment
model = A2C(policy="MlpPolicy", env=org_env, verbose=1).learn(total_timesteps=1000)

Wrapping the env in a DummyVecEnv.
---------------------------------
| explained_variance | -0.182   |
| fps                | 30       |
| nupdates           | 1        |
| policy_entropy     | 1.42     |
| total_timesteps    | 5        |
| value_loss         | 113      |
---------------------------------
---------------------------------
| explained_variance | 0.0308   |
| fps                | 614      |
| nupdates           | 100      |
| policy_entropy     | 1.42     |
| total_timesteps    | 500      |
| value_loss         | 653      |
---------------------------------
---------------------------------
| explained_variance | -0.0416  |
| fps                | 703      |
| nupdates           | 200      |
| policy_entropy     | 1.42     |
| total_timesteps    | 1000     |
| value_loss         | 447      |
---------------------------------


In [24]:
# Training output when the environment is wrapped in a Monitor
# The monitored env gets its own name: the lambda looks the name up when it is
# called, so it must not be the same name that is rebound to the DummyVecEnv.
monitored_env = Monitor(gym.make('Pendulum-v0'), filename=None, allow_early_resets=True)
mon_env = DummyVecEnv([lambda: monitored_env])
model = A2C("MlpPolicy", mon_env, verbose=1).learn(int(1000))

---------------------------------
| explained_variance | -0.00195 |
| fps                | 29       |
| nupdates           | 1        |
| policy_entropy     | 1.42     |
| total_timesteps    | 5        |
| value_loss         | 971      |
---------------------------------
----------------------------------
| ep_len_mean        | 200       |
| ep_reward_mean     | -1.75e+03 |
| explained_variance | -0.206    |
| fps                | 639       |
| nupdates           | 100       |
| policy_entropy     | 1.42      |
| total_timesteps    | 500       |
| value_loss         | 107       |
----------------------------------
----------------------------------
| ep_len_mean        | 200       |
| ep_reward_mean     | -1.42e+03 |
| explained_variance | -0.00183  |
| fps                | 703       |
| nupdates           | 200       |
| policy_entropy     | 1.42      |
| total_timesteps    | 1000      |
| value_loss         | 407       |
----------------------------------


In [25]:
# Several wrappers can be nested
base_env = Monitor(gym.make('Pendulum-v0'), filename=None, allow_early_resets=True)
# Note that we can use multiple wrappers
normalized_single_env = NormalizeActionWrapper(base_env)
# The wrapped env keeps its own name: the lambda looks the name up when it is
# called, so it must not be the same name that is rebound to the DummyVecEnv.
normalized_env = DummyVecEnv([lambda: normalized_single_env])

model_2 = A2C("MlpPolicy", normalized_env, verbose=1).learn(int(1000))

---------------------------------
| explained_variance | -0.0417  |
| fps                | 29       |
| nupdates           | 1        |
| policy_entropy     | 1.42     |
| total_timesteps    | 5        |
| value_loss         | 287      |
---------------------------------
----------------------------------
| ep_len_mean        | 200       |
| ep_reward_mean     | -1.52e+03 |
| explained_variance | -0.0161   |
| fps                | 626       |
| nupdates           | 100       |
| policy_entropy     | 1.42      |
| total_timesteps    | 500       |
| value_loss         | 96.9      |
----------------------------------
----------------------------------
| ep_len_mean        | 200       |
| ep_reward_mean     | -1.34e+03 |
| explained_variance | -0.00161  |
| fps                | 688       |
| nupdates           | 200       |
| policy_entropy     | 1.42      |
| total_timesteps    | 1000      |
| value_loss         | 496       |
----------------------------------


### 2.4 Additional wrappers: VecEnvWrappers

#### Vectorized Environments
Vectorized Environments are a method for stacking multiple independent environments into a single environment. Instead of training an RL agent on 1 environment per step, it allows us to train it on n environments per step. Because of this, actions passed to the environment are now a vector (of dimension n). It is the same for observations, rewards and end of episode signals (dones). In the case of non-array observation spaces such as Dict or Tuple, where different sub-spaces may have different shapes, the sub-observations are vectors (of dimension n).

* DummyVecEnv: Creates a simple vectorized wrapper for multiple environments, calling each environment in sequence on the current Python process. 
* VecNormalize: it computes a running mean and standard deviation to normalize observations and returns
* VecFrameStack: it stacks several consecutive observations (useful to integrate time in the observation, e.g. successive frames of an Atari game)
    

In [26]:
from stable_baselines.common.vec_env import VecNormalize, VecFrameStack

# Vectorize a single Pendulum-v0 instance, then wrap it with VecNormalize
env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
normalized_vec_env = VecNormalize(env)

# Take ten random steps and print the observation and reward returned by each
obs = normalized_vec_env.reset()
for _ in range(10):
    # The vectorized env is stepped with a list of actions (here a single one)
    action = [
        normalized_vec_env.action_space.sample(),
    ]
    obs, reward, _, _ = normalized_vec_env.step(action)
    print(obs, reward)

[[-0.00257274  0.00693956  0.00756877]] [-1.9999806]
[[-0.98675806 -0.82780082  0.99958868]] [-1.287048]
[[-1.26605007 -1.28255063  1.18649636]] [-1.0246431]
[[-1.39662169 -1.50355055  1.19014693]] [-0.8883706]
[[-1.47944287 -1.66787636  1.35452686]] [-0.7907782]
[[-1.50824391 -1.78664143  1.41903748]] [-0.7410711]
[[-1.46904282 -1.850043    1.30965066]] [-0.7049484]
[[-1.37155844 -1.88683722  1.27379552]] [-0.65967]
[[-1.2010548  -1.89022432  1.15042482]] [-0.62446564]
[[-0.94812426 -1.86178195  0.94591632]] [-0.54811007]
