Based on the tutorial at https://pythonprogramming.net/introduction-reinforcement-learning-stable-baselines-3-tutorial/

# The lunar lander

In [1]:
import gym


# Sanity check: drive the lander with random actions and render each frame.
env = gym.make('LunarLander-v2')  # continuous: LunarLanderContinuous-v2
env.reset()

try:
    for step in range(200):
        env.render()
        # take random action
        _obs, _reward, done, _info = env.step(env.action_space.sample())
        if done:
            # Stepping an environment whose episode has ended is undefined
            # behaviour in gym, so start a fresh episode before continuing.
            env.reset()
finally:
    # Always release the render window, even if a step raises.
    env.close()

# Use a Stable-Baselines3 algorithm to learn a landing policy

In [1]:
import gym
import os
from stable_baselines3 import A2C, PPO



In [2]:
# Create the LunarLander environment that the model cells below are built on
# and that the evaluation loop steps through.
env = gym.make('LunarLander-v2') 

## Save models at different iterations

In [3]:
# Run configuration: which algorithm to use, and where TensorBoard logs and
# saved checkpoints are written.
model_name = "A2C"
logdir = "logs"
models_dir = f"models/{model_name}"

In [4]:
# Make sure the checkpoint and log directories exist. exist_ok=True makes this
# safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
for directory in (models_dir, logdir):
    os.makedirs(directory, exist_ok=True)

In [5]:
env.reset()

# Map the configured name to its algorithm class. An unknown name fails loudly
# here instead of leaving `model` undefined (or stale from an earlier run).
ALGORITHMS = {"PPO": PPO, "A2C": A2C}
if model_name not in ALGORITHMS:
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected one of {sorted(ALGORITHMS)}"
    )

# MLP policy, verbose console output, TensorBoard logs written under `logdir`.
model = ALGORITHMS[model_name]('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
# Train in 30 chunks of TIMESTEPS steps each, saving a checkpoint after every
# chunk. Each checkpoint file is named after the cumulative number of
# timesteps requested so far (TIMESTEPS * chunk index).
TIMESTEPS = 10000
for iters in range(1, 31):
    # reset_num_timesteps=False: presumably keeps the step counter (and the
    # TensorBoard run) continuous across successive learn() calls; see SB3 docs.
    model.learn(
        total_timesteps=TIMESTEPS,
        reset_num_timesteps=False,
        tb_log_name=model_name,
    )
    model.save(f"{models_dir}/{TIMESTEPS*iters}")

Logging to logs\A2C_0
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 108      |
|    ep_rew_mean        | -418     |
| time/                 |          |
|    fps                | 253      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.965   |
|    explained_variance | -0.0465  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -2.53    |
|    value_loss         | 3.31     |
------------------------------------


KeyboardInterrupt: 

In [21]:
env.reset()

# Which saved checkpoint to evaluate, identified by its cumulative timestep
# count (the file name used when the checkpoint was saved).
CHECKPOINT_STEPS = 180000
model_path = f"{models_dir}/{CHECKPOINT_STEPS}.zip"

# Load the checkpoint with the class matching the configured algorithm; an
# unknown name fails loudly instead of silently reusing a stale `model`.
if model_name == "PPO":
    model = PPO.load(model_path, env=env)
elif model_name == "A2C":
    model = A2C.load(model_path, env=env)
else:
    raise ValueError(f"Unsupported model_name {model_name!r}; expected 'PPO' or 'A2C'")

# Roll out the loaded policy for a fixed number of episodes, rendering each step.
episodes = 10
try:
    for ep in range(episodes):
        obs = env.reset()
        done = False
        while not done:
            # pass observation to model to get predicted action
            action, _states = model.predict(obs)
            # pass action to env and get info back
            obs, rewards, done, info = env.step(action)

            # show the environment on the screen
            env.render()
finally:
    # Release the render window even if the rollout is interrupted.
    env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
