CREDIT: https://pythonprogramming.net/introduction-reinforcement-learning-stable-baselines-3-tutorial/

In [1]:
import gym
import os
from stable_baselines3 import A2C, PPO



# The lunar lander

In [2]:
# Preview the environment by driving the lander with random actions.
env = gym.make('LunarLander-v2')
env.reset()

try:
    for step in range(200):
        env.render()
        # take random action; step() returns the 4-tuple
        # (obs, reward, done, info) used elsewhere in this notebook
        _obs, _reward, done, _info = env.step(env.action_space.sample())
        if done:
            # the episode ended (crash or landing): start a new one instead
            # of stepping an already-finished environment
            env.reset()
finally:
    # release the render window even if the cell is interrupted
    env.close()

# Use a Stable-Baselines3 algorithm to learn a (near-)optimal policy

In [3]:
# Fresh environment instance for training; the preview environment was closed.
env = gym.make("LunarLander-v2")

## Save models at different iterations

In [4]:
# Algorithm selector: "PPO" or "A2C" (the two classes imported at the top).
model_name = "PPO"
# Checkpoints are written under models/<algorithm>; TensorBoard logs under logs.
models_dir = f"models/{model_name}"
logdir = "logs"

In [5]:
# Create the checkpoint and log directories. exist_ok=True replaces the
# check-then-create pattern, so re-running the cell is a no-op and there is
# no window between the existence check and the creation.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

In [6]:
# Map the configured algorithm name to its Stable-Baselines3 class.
algorithms = {"PPO": PPO, "A2C": A2C}
if model_name not in algorithms:
    # Fail here with a clear message instead of leaving `model` undefined
    # and hitting a NameError in a later cell.
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected one of {sorted(algorithms)}"
    )

env.reset()
# MLP policy; training metrics are written to `logdir` for TensorBoard.
model = algorithms[model_name]('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [7]:
TIMESTEPS = 10000  # nominal training budget between two checkpoints
N_ITERS = 10       # number of checkpoints to write

# `iters` counts checkpoints from 1 so the file name is TIMESTEPS * iters.
for iters in range(1, N_ITERS + 1):
    # reset_num_timesteps=False keeps the timestep counter running across
    # calls, so every call logs to the same TensorBoard run.
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name=model_name)
    # The file name is the nominal step count (10000, 20000, ...), which the
    # evaluation cell relies on. NOTE(review): the logged total_timesteps
    # advance in whole rollouts, so the true count may be higher - confirm
    # if exact step counts matter.
    model.save(f"{models_dir}/{TIMESTEPS*iters}")

Logging to logs\PPO_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 94.1     |
|    ep_rew_mean     | -183     |
| time/              |          |
|    fps             | 780      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 91.7         |
|    ep_rew_mean          | -169         |
| time/                   |              |
|    fps                  | 867          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0050810706 |
|    clip_fraction        | 0.0215       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.00583 

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 150         |
|    ep_rew_mean          | -118        |
| time/                   |             |
|    fps                  | 1071        |
|    iterations           | 3           |
|    time_elapsed         | 5           |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009380927 |
|    clip_fraction        | 0.0602      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.15       |
|    explained_variance   | 0.0487      |
|    learning_rate        | 0.0003      |
|    loss                 | 306         |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.00889    |
|    value_loss           | 326         |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 161   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 300         |
|    ep_rew_mean          | -59.2       |
| time/                   |             |
|    fps                  | 596         |
|    iterations           | 4           |
|    time_elapsed         | 13          |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.010136433 |
|    clip_fraction        | 0.0876      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.994      |
|    explained_variance   | 0.293       |
|    learning_rate        | 0.0003      |
|    loss                 | 40.5        |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.00426    |
|    value_loss           | 131         |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 311   

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 439          |
|    ep_rew_mean          | -27.6        |
| time/                   |              |
|    fps                  | 644          |
|    iterations           | 5            |
|    time_elapsed         | 15           |
|    total_timesteps      | 71680        |
| train/                  |              |
|    approx_kl            | 0.0098894145 |
|    clip_fraction        | 0.0872       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.836       |
|    explained_variance   | 0.28         |
|    learning_rate        | 0.0003       |
|    loss                 | 61.1         |
|    n_updates            | 340          |
|    policy_gradient_loss | -0.0099      |
|    value_loss           | 139          |
------------------------------------------
Logging to logs\PPO_0
---------------------------------
| rollout/           |          |
|    ep

Logging to logs\PPO_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 524      |
|    ep_rew_mean     | 54.1     |
| time/              |          |
|    fps             | 1080     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 94208    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 525         |
|    ep_rew_mean          | 62.1        |
| time/                   |             |
|    fps                  | 747         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 96256       |
| train/                  |             |
|    approx_kl            | 0.004958462 |
|    clip_fraction        | 0.0337      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.792      |
|    explained_variance   | 0.528       |
|    lea

In [9]:
env.reset()

# Checkpoint written by the last training iteration (10 x 10000 nominal steps).
model_path = f"{models_dir}/100000.zip"

# Map the configured algorithm name to its Stable-Baselines3 class.
algorithms = {"PPO": PPO, "A2C": A2C}
if model_name not in algorithms:
    # Fail with a clear message instead of a NameError on `model` below.
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected one of {sorted(algorithms)}"
    )
model = algorithms[model_name].load(model_path, env=env)

episodes = 10
try:
    for ep in range(episodes):
        obs = env.reset()
        done = False
        while not done:
            # pass observation to model to get predicted action
            # NOTE(review): predict() is called with its defaults; consider
            # deterministic=True if a repeatable evaluation is wanted.
            action, _states = model.predict(obs)
            # pass action to env and get info back
            obs, rewards, done, info = env.step(action)

            # show the environment on the screen
            env.render()
finally:
    # release the render window even if the cell is interrupted
    env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


KeyboardInterrupt: 