## Stable-Baselines3 A2C CartPole
- SB3 Documentation: https://stable-baselines3.readthedocs.io/en/master/guide/install.html
- SB3 Blog: https://araffin.github.io/#projects
- SB - Tensorflow: https://stable-baselines.readthedocs.io/en/master/guide/install.html
- SB Blog Medium: https://towardsdatascience.com/stable-baselines-a-fork-of-openai-baselines-reinforcement-learning-made-easy-df87c4b2fc82
- SB3 Tutorials: https://github.com/araffin/rl-tutorial-jnrr19/tree/sb3

In [None]:
!pip --version

In [6]:
pip install stable-baselines3[extra]

Note: you may need to restart the kernel to use updated packages.


In [None]:
import gym

from stable_baselines3 import A2C

env = gym.make("CartPole-v1")

# Train an A2C agent with the default MLP policy.
model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Roll out the trained policy for 100 steps, rendering every step.
# try/finally ensures env.close() runs (releasing whatever render()
# opened) even if the rollout is interrupted part-way through.
try:
    obs = env.reset()
    for i in range(100):
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            # Episode finished: start a fresh one so stepping can continue.
            obs = env.reset()
finally:
    env.close()

## DQN on Lunar Lander
- https://stable-baselines3.readthedocs.io/en/master/guide/examples.html
- Issue with swig :(

In [None]:
!apt install swig cmake
!pip install stable-baselines3[extra] box2d box2d-kengz

In [None]:
# import gym
# from stable_baselines3 import DQN
# from stable_baselines3.common.evaluation import evaluate_policy

# # Create environment
# env = gym.make("LunarLander-v2")

# # Instantiate the agent
# model = DQN("MlpPolicy", env ,verbose=1)

# # Train the agent
# model.learn(total_timesteps=int(2e5))

# # Save the agent
# model.save("trained_dqn_lunar")

# # Delete trained model to demonstrate loading
# del model 

# # Load the trained agent
# model = DQN.load("trained_dqn_lunar", env=env)

# # Evaluate the agent:
# # Note: if you use wrappers with your environment that modify rewards, this will be reflected here.
# # To evaluate with original rewards, wrap environment in a "Monitor" wrapper before other wrappers
# mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

# # Enjoy trained agent
# obs = env.reset()
# for i in range(1000):
#     action, _state = model.predict(obs, deterministic=True)
#     obs, rewards, dones, info = env.step(action)
#     env.render()

## Multiprocessing: Unleashing the Power of Vectorized Environments
- CartPole
- This is not stable inside a Jupyter notebook; run it as a script on a local machine instead

In [None]:
import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed

def make_env(env_id, rank, seed=0):
    """Build a factory that creates one seeded environment for a subprocess.

    Parameters
    ----------
    env_id: str
        ID of the environment handed to ``gym.make``
    rank: int
        index of the subprocess; added to ``seed`` so each environment
        is seeded with a different value
    seed: int
        base seed for the RNG

    Returns
    -------
    callable
        zero-argument function that creates, seeds and returns the environment
    """
    # Called once, when the factory is built (not each time the factory runs).
    set_random_seed(seed)

    def _init():
        worker_env = gym.make(env_id)
        worker_env.seed(seed + rank)
        return worker_env

    return _init

if __name__ == "__main__":
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use

    # Create the vectorized environment: one env factory per subprocess,
    # each with its own rank so the environment seeds differ.
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # SB3 provides the make_vec_env() helper, which performs the same steps;
    # pass vec_env_cls to get subprocess-based environments:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)

    # Everything after env creation runs under try/finally so the vectorized
    # environment is always closed, even if training or the rollout fails
    # or is interrupted.
    try:
        model = PPO("MlpPolicy", env, verbose=1)
        model.learn(total_timesteps=25000)

        # Run the trained policy for 1000 vectorized steps, rendering each one.
        obs = env.reset()
        for _ in range(1000):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
    finally:
        env.close()

# HER = Hindsight Experience Replay

In [2]:
!pip install highway-env

Collecting highway-env
  Downloading highway_env-1.2-py3-none-any.whl (92 kB)
Collecting pygame
  Downloading pygame-2.0.1-cp38-cp38-win_amd64.whl (5.2 MB)
Installing collected packages: pygame, highway-env
Successfully installed highway-env-1.2 pygame-2.0.1


In [4]:
!pip install --upgrade stable-baselines3
import stable_baselines3 as sb3
print(sb3.__version__)




In [10]:
import gym
import highway_env
import numpy as np

from stable_baselines3 import SAC, DDPG, TD3, HerReplayBuffer
from stable_baselines3.common.noise import NormalActionNoise

env = gym.make("parking-v0")

# Create 4 artificial transitions per real transition
n_sampled_goal = 4

# SAC hyperparams:
model = SAC(
    "MultiInputPolicy",
    env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=n_sampled_goal,
        goal_selection_strategy="future",
        # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper
        # we have to manually specify the max number of steps per episode
        # NOTE(review): max_episode_length and online_sampling are specific to
        # older SB3 releases (believed removed from 1.8 onwards) -- check the
        # changelog of the installed version.
        max_episode_length=100,
        online_sampling=True,
    ),
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    batch_size=256,
    policy_kwargs=dict(net_arch=[256, 256, 256]),
)

model.learn(int(2e5))
model.save("her_sac_highway")

# Load saved model
# Because it needs access to `env.compute_reward()`
# HER must be loaded with the env
model = SAC.load("her_sac_highway", env=env)

# Evaluate the agent for 100 steps, rendering each one.
# try/finally ensures env.close() runs even if the rollout is interrupted.
try:
    obs = env.reset()
    # Float from the start, consistent with the per-episode reset below.
    episode_reward = 0.0
    for _ in range(100):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        episode_reward += reward
        # An episode also counts as finished as soon as the goal is reached.
        if done or info.get("is_success", False):
            print("Reward:", episode_reward, "Success?", info.get("is_success", False))
            episode_reward = 0.0
            obs = env.reset()
finally:
    env.close()

ImportError: cannot import name 'HerReplayBuffer' from 'stable_baselines3' (C:\Users\nguye\anaconda3\lib\site-packages\stable_baselines3\__init__.py)