In [1]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

# Train a PPO agent on LunarLander with periodic evaluation.
#
# The cell builds a vectorized training environment, a separate vectorized
# evaluation environment, an EvalCallback that evaluates the agent every
# ~100k timesteps and stores the best-scoring model under ./logs/, and then
# runs PPO for 5M timesteps while logging to TensorBoard under ./tb_logs/.

# Base seed for everything stochastic in this cell (environment resets,
# network initialisation, action sampling) so that a fresh
# "Restart Kernel -> Run All" repeats the same experiment.
# NOTE: when training on a CUDA device some kernels may still be
# nondeterministic, so curves can differ slightly between runs.
SEED = 0

# Create the (vectorized) training environment
env_id = "LunarLander-v2"
n_envs = 16
env = make_vec_env(env_id, n_envs=n_envs, seed=SEED)

# Create the evaluation envs.
# The base seed is offset by n_envs so the evaluation sub-environments are
# not seeded with the same values as the training sub-environments
# (make_vec_env derives one seed per sub-environment from the base seed).
eval_envs = make_vec_env(env_id, n_envs=5, seed=SEED + n_envs)

# Evaluate roughly every 100k *timesteps*. The callback counts vectorized
# steps, and each of those advances n_envs timesteps, so the target interval
# is divided by n_envs (and kept >= 1 so the callback is never disabled).
eval_freq = max(int(1e5) // n_envs, 1)

# Create evaluation callback to save best model
# and monitor agent performance
eval_callback = EvalCallback(
    eval_envs,
    best_model_save_path="./logs/",
    eval_freq=eval_freq,
    n_eval_episodes=10,
)


# Directory that receives the TensorBoard event files (one sub-folder per run)
tensorboard_log = "./tb_logs/"

# Instantiate the agent
# Hyperparameters from https://github.com/DLR-RM/rl-baselines3-zoo
model = PPO(
    "MlpPolicy",
    env,
    n_steps=1024,      # rollout length per environment before each update
    batch_size=64,     # minibatch size used during the update epochs
    gae_lambda=0.98,
    gamma=0.999,
    n_epochs=4,        # optimisation passes over each rollout buffer
    ent_coef=0.01,     # entropy bonus coefficient
    verbose=1,
    tensorboard_log=tensorboard_log,
    seed=SEED,
)


# Run training. With log_interval=250 the rollout/ statistics are printed
# only every 250 iterations; the eval/ tables come from the callback.
model.learn(
    total_timesteps=int(5e6),
    callback=eval_callback,
    progress_bar=True,
    log_interval=250,
)

Using cuda device
Logging to ./tb_logs/PPO_1


Output()

------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 577          |
|    mean_reward          | -180         |
| time/                   |              |
|    total_timesteps      | 100000       |
| train/                  |              |
|    approx_kl            | 0.0072945035 |
|    clip_fraction        | 0.0789       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.3         |
|    explained_variance   | -4.17e-06    |
|    learning_rate        | 0.0003       |
|    loss                 | 113          |
|    n_updates            | 24           |
|    policy_gradient_loss | -0.00624     |
|    value_loss           | 361          |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 514          |
|    mean_reward          | -159         |
| time/                   |              |
|    total_timesteps      | 200000       |
| train/                  |              |
|    approx_kl            | 0.0061549647 |
|    clip_fraction        | 0.034        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.18        |
|    explained_variance   | -4.77e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 458          |
|    n_updates            | 48           |
|    policy_gradient_loss | -0.000939    |
|    value_loss           | 567          |
------------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 458         |
|    mean_reward          | -163        |
| time/                   |             |
|    total_timesteps      | 300000      |
| train/                  |             |
|    approx_kl            | 0.005920329 |
|    clip_fraction        | 0.0288      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.2        |
|    explained_variance   | 0.642       |
|    learning_rate        | 0.0003      |
|    loss                 | 160         |
|    n_updates            | 72          |
|    policy_gradient_loss | -0.00316    |
|    value_loss           | 323         |
-----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 975          |
|    mean_reward          | -78.6        |
| time/                   |              |
|    total_timesteps      | 400000       |
| train/                  |              |
|    approx_kl            | 0.0034153752 |
|    clip_fraction        | 0.0224       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.13        |
|    explained_variance   | 0.867        |
|    learning_rate        | 0.0003       |
|    loss                 | 25.8         |
|    n_updates            | 96           |
|    policy_gradient_loss | -0.000697    |
|    value_loss           | 87.8         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 658          |
|    mean_reward          | 185          |
| time/                   |              |
|    total_timesteps      | 500000       |
| train/                  |              |
|    approx_kl            | 0.0050838133 |
|    clip_fraction        | 0.0206       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.06        |
|    explained_variance   | 0.913        |
|    learning_rate        | 0.0003       |
|    loss                 | 39.3         |
|    n_updates            | 120          |
|    policy_gradient_loss | -0.000919    |
|    value_loss           | 87.4         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 515          |
|    mean_reward          | 221          |
| time/                   |              |
|    total_timesteps      | 600000       |
| train/                  |              |
|    approx_kl            | 0.0044032857 |
|    clip_fraction        | 0.0308       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.06        |
|    explained_variance   | 0.989        |
|    learning_rate        | 0.0003       |
|    loss                 | 4.6          |
|    n_updates            | 144          |
|    policy_gradient_loss | -0.000442    |
|    value_loss           | 8.15         |
------------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 406         |
|    mean_reward          | 248         |
| time/                   |             |
|    total_timesteps      | 700000      |
| train/                  |             |
|    approx_kl            | 0.004065208 |
|    clip_fraction        | 0.023       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.956      |
|    explained_variance   | 0.974       |
|    learning_rate        | 0.0003      |
|    loss                 | 3.43        |
|    n_updates            | 168         |
|    policy_gradient_loss | -0.000714   |
|    value_loss           | 35.1        |
-----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 372          |
|    mean_reward          | 252          |
| time/                   |              |
|    total_timesteps      | 800000       |
| train/                  |              |
|    approx_kl            | 0.0033726564 |
|    clip_fraction        | 0.0288       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.76        |
|    explained_variance   | 0.912        |
|    learning_rate        | 0.0003       |
|    loss                 | 13.2         |
|    n_updates            | 192          |
|    policy_gradient_loss | -0.00147     |
|    value_loss           | 150          |
------------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 330         |
|    mean_reward          | 259         |
| time/                   |             |
|    total_timesteps      | 900000      |
| train/                  |             |
|    approx_kl            | 0.008117203 |
|    clip_fraction        | 0.0807      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.767      |
|    explained_variance   | 0.718       |
|    learning_rate        | 0.0003      |
|    loss                 | 85.6        |
|    n_updates            | 216         |
|    policy_gradient_loss | -0.00186    |
|    value_loss           | 227         |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 320         |
|    mean_reward          | 267         |
| time/                   |             |
|    total_timesteps      | 1000000     |
| train/                  |             |
|    approx_kl            | 0.004254428 |
|    clip_fraction        | 0.0412      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.767      |
|    explained_variance   | 0.887       |
|    learning_rate        | 0.0003      |
|    loss                 | 11.5        |
|    n_updates            | 244         |
|    policy_gradient_loss | -0.000152   |
|    value_loss           | 160         |
-----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 300          |
|    mean_reward          | 273          |
| time/                   |              |
|    total_timesteps      | 1100000      |
| train/                  |              |
|    approx_kl            | 0.0035112593 |
|    clip_fraction        | 0.0393       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.72        |
|    explained_variance   | 0.968        |
|    learning_rate        | 0.0003       |
|    loss                 | 24.5         |
|    n_updates            | 268          |
|    policy_gradient_loss | -9.97e-05    |
|    value_loss           | 76.8         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 285          |
|    mean_reward          | 253          |
| time/                   |              |
|    total_timesteps      | 1200000      |
| train/                  |              |
|    approx_kl            | 0.0045701116 |
|    clip_fraction        | 0.038        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.716       |
|    explained_variance   | 0.94         |
|    learning_rate        | 0.0003       |
|    loss                 | 71.1         |
|    n_updates            | 292          |
|    policy_gradient_loss | 0.000423     |
|    value_loss           | 97.9         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 279          |
|    mean_reward          | 275          |
| time/                   |              |
|    total_timesteps      | 1300000      |
| train/                  |              |
|    approx_kl            | 0.0030478798 |
|    clip_fraction        | 0.0432       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.681       |
|    explained_variance   | 0.972        |
|    learning_rate        | 0.0003       |
|    loss                 | 3.15         |
|    n_updates            | 316          |
|    policy_gradient_loss | 0.000972     |
|    value_loss           | 71.3         |
------------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 258         |
|    mean_reward          | 278         |
| time/                   |             |
|    total_timesteps      | 1400000     |
| train/                  |             |
|    approx_kl            | 0.003092675 |
|    clip_fraction        | 0.0259      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.671      |
|    explained_variance   | 0.923       |
|    learning_rate        | 0.0003      |
|    loss                 | 8.78        |
|    n_updates            | 340         |
|    policy_gradient_loss | 0.000843    |
|    value_loss           | 169         |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 249         |
|    mean_reward          | 279         |
| time/                   |             |
|    total_timesteps      | 1500000     |
| train/                  |             |
|    approx_kl            | 0.006424964 |
|    clip_fraction        | 0.0556      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.652      |
|    explained_variance   | 0.936       |
|    learning_rate        | 0.0003      |
|    loss                 | 78.9        |
|    n_updates            | 364         |
|    policy_gradient_loss | 0.000357    |
|    value_loss           | 158         |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 255         |
|    mean_reward          | 277         |
| time/                   |             |
|    total_timesteps      | 1600000     |
| train/                  |             |
|    approx_kl            | 0.002413792 |
|    clip_fraction        | 0.023       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.639      |
|    explained_variance   | 0.918       |
|    learning_rate        | 0.0003      |
|    loss                 | 96.7        |
|    n_updates            | 388         |
|    policy_gradient_loss | -0.000854   |
|    value_loss           | 238         |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 260         |
|    mean_reward          | 281         |
| time/                   |             |
|    total_timesteps      | 1700000     |
| train/                  |             |
|    approx_kl            | 0.004526453 |
|    clip_fraction        | 0.0499      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.639      |
|    explained_variance   | 0.968       |
|    learning_rate        | 0.0003      |
|    loss                 | 10.2        |
|    n_updates            | 412         |
|    policy_gradient_loss | 0.00123     |
|    value_loss           | 75.6        |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 238         |
|    mean_reward          | 282         |
| time/                   |             |
|    total_timesteps      | 1800000     |
| train/                  |             |
|    approx_kl            | 0.003467917 |
|    clip_fraction        | 0.0405      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.615      |
|    explained_variance   | 0.959       |
|    learning_rate        | 0.0003      |
|    loss                 | 20.5        |
|    n_updates            | 436         |
|    policy_gradient_loss | -0.00082    |
|    value_loss           | 117         |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 244         |
|    mean_reward          | 280         |
| time/                   |             |
|    total_timesteps      | 1900000     |
| train/                  |             |
|    approx_kl            | 0.004326036 |
|    clip_fraction        | 0.0483      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.622      |
|    explained_variance   | 0.988       |
|    learning_rate        | 0.0003      |
|    loss                 | 3.36        |
|    n_updates            | 460         |
|    policy_gradient_loss | 0.0008      |
|    value_loss           | 14.8        |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 237         |
|    mean_reward          | 273         |
| time/                   |             |
|    total_timesteps      | 2000000     |
| train/                  |             |
|    approx_kl            | 0.004693101 |
|    clip_fraction        | 0.0528      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.646      |
|    explained_variance   | 0.977       |
|    learning_rate        | 0.0003      |
|    loss                 | 20.9        |
|    n_updates            | 488         |
|    policy_gradient_loss | 0.000411    |
|    value_loss           | 25.4        |
-----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 231          |
|    mean_reward          | 262          |
| time/                   |              |
|    total_timesteps      | 2100000      |
| train/                  |              |
|    approx_kl            | 0.0041478015 |
|    clip_fraction        | 0.0423       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.616       |
|    explained_variance   | 0.991        |
|    learning_rate        | 0.0003       |
|    loss                 | 3.03         |
|    n_updates            | 512          |
|    policy_gradient_loss | 0.00104      |
|    value_loss           | 9.02         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 236          |
|    mean_reward          | 273          |
| time/                   |              |
|    total_timesteps      | 2200000      |
| train/                  |              |
|    approx_kl            | 0.0042353426 |
|    clip_fraction        | 0.051        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.63        |
|    explained_variance   | 0.993        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.75         |
|    n_updates            | 536          |
|    policy_gradient_loss | 0.000681     |
|    value_loss           | 9.91         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 225          |
|    mean_reward          | 273          |
| time/                   |              |
|    total_timesteps      | 2300000      |
| train/                  |              |
|    approx_kl            | 0.0044205533 |
|    clip_fraction        | 0.0484       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.621       |
|    explained_variance   | 0.997        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.932        |
|    n_updates            | 560          |
|    policy_gradient_loss | 0.000243     |
|    value_loss           | 5.4          |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 220          |
|    mean_reward          | 288          |
| time/                   |              |
|    total_timesteps      | 2400000      |
| train/                  |              |
|    approx_kl            | 0.0033114753 |
|    clip_fraction        | 0.0381       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.589       |
|    explained_variance   | 0.957        |
|    learning_rate        | 0.0003       |
|    loss                 | 3.69         |
|    n_updates            | 584          |
|    policy_gradient_loss | -0.000323    |
|    value_loss           | 146          |
------------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 224         |
|    mean_reward          | 286         |
| time/                   |             |
|    total_timesteps      | 2500000     |
| train/                  |             |
|    approx_kl            | 0.007296943 |
|    clip_fraction        | 0.0697      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.592      |
|    explained_variance   | 0.995       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.82        |
|    n_updates            | 608         |
|    policy_gradient_loss | 0.00139     |
|    value_loss           | 7.29        |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 210         |
|    mean_reward          | 258         |
| time/                   |             |
|    total_timesteps      | 2600000     |
| train/                  |             |
|    approx_kl            | 0.005531233 |
|    clip_fraction        | 0.055       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.583      |
|    explained_variance   | 0.998       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.09        |
|    n_updates            | 632         |
|    policy_gradient_loss | 0.000191    |
|    value_loss           | 3.92        |
-----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 216          |
|    mean_reward          | 276          |
| time/                   |              |
|    total_timesteps      | 2700000      |
| train/                  |              |
|    approx_kl            | 0.0045724404 |
|    clip_fraction        | 0.0551       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.581       |
|    explained_variance   | 0.998        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.735        |
|    n_updates            | 656          |
|    policy_gradient_loss | 0.000878     |
|    value_loss           | 3.13         |
------------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 223         |
|    mean_reward          | 278         |
| time/                   |             |
|    total_timesteps      | 2800000     |
| train/                  |             |
|    approx_kl            | 0.004592713 |
|    clip_fraction        | 0.0504      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.573      |
|    explained_variance   | 0.975       |
|    learning_rate        | 0.0003      |
|    loss                 | 35.3        |
|    n_updates            | 680         |
|    policy_gradient_loss | 0.000101    |
|    value_loss           | 65.5        |
-----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 216          |
|    mean_reward          | 295          |
| time/                   |              |
|    total_timesteps      | 2900000      |
| train/                  |              |
|    approx_kl            | 0.0046921065 |
|    clip_fraction        | 0.0366       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.524       |
|    explained_variance   | 0.956        |
|    learning_rate        | 0.0003       |
|    loss                 | 36.8         |
|    n_updates            | 708          |
|    policy_gradient_loss | -0.000395    |
|    value_loss           | 136          |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 214          |
|    mean_reward          | 278          |
| time/                   |              |
|    total_timesteps      | 3000000      |
| train/                  |              |
|    approx_kl            | 0.0063594906 |
|    clip_fraction        | 0.0732       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.525       |
|    explained_variance   | 0.982        |
|    learning_rate        | 0.0003       |
|    loss                 | 3.3          |
|    n_updates            | 732          |
|    policy_gradient_loss | 0.00134      |
|    value_loss           | 38.2         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 209          |
|    mean_reward          | 290          |
| time/                   |              |
|    total_timesteps      | 3100000      |
| train/                  |              |
|    approx_kl            | 0.0033467128 |
|    clip_fraction        | 0.0338       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.541       |
|    explained_variance   | 0.973        |
|    learning_rate        | 0.0003       |
|    loss                 | 55.5         |
|    n_updates            | 756          |
|    policy_gradient_loss | 0.00052      |
|    value_loss           | 98.9         |
------------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 213         |
|    mean_reward          | 284         |
| time/                   |             |
|    total_timesteps      | 3200000     |
| train/                  |             |
|    approx_kl            | 0.004649163 |
|    clip_fraction        | 0.0567      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.538      |
|    explained_variance   | 0.946       |
|    learning_rate        | 0.0003      |
|    loss                 | 8.22        |
|    n_updates            | 780         |
|    policy_gradient_loss | 0.00124     |
|    value_loss           | 155         |
-----------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 202         |
|    mean_reward          | 258         |
| time/                   |             |
|    total_timesteps      | 3300000     |
| train/                  |             |
|    approx_kl            | 0.004177572 |
|    clip_fraction        | 0.039       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.534      |
|    explained_variance   | 0.959       |
|    learning_rate        | 0.0003      |
|    loss                 | 29.9        |
|    n_updates            | 804         |
|    policy_gradient_loss | 0.000826    |
|    value_loss           | 138         |
-----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 208          |
|    mean_reward          | 257          |
| time/                   |              |
|    total_timesteps      | 3400000      |
| train/                  |              |
|    approx_kl            | 0.0038056178 |
|    clip_fraction        | 0.0494       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.532       |
|    explained_variance   | 0.978        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.13         |
|    n_updates            | 828          |
|    policy_gradient_loss | 0.000959     |
|    value_loss           | 88.4         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 212          |
|    mean_reward          | 280          |
| time/                   |              |
|    total_timesteps      | 3500000      |
| train/                  |              |
|    approx_kl            | 0.0040168273 |
|    clip_fraction        | 0.0431       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.543       |
|    explained_variance   | 0.982        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.03         |
|    n_updates            | 852          |
|    policy_gradient_loss | 0.000622     |
|    value_loss           | 61.1         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 218          |
|    mean_reward          | 279          |
| time/                   |              |
|    total_timesteps      | 3600000      |
| train/                  |              |
|    approx_kl            | 0.0038615228 |
|    clip_fraction        | 0.0524       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.537       |
|    explained_variance   | 0.977        |
|    learning_rate        | 0.0003       |
|    loss                 | 6.88         |
|    n_updates            | 876          |
|    policy_gradient_loss | 0.00131      |
|    value_loss           | 45.9         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 214          |
|    mean_reward          | 283          |
| time/                   |              |
|    total_timesteps      | 3700000      |
| train/                  |              |
|    approx_kl            | 0.0034605092 |
|    clip_fraction        | 0.0345       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.502       |
|    explained_variance   | 0.973        |
|    learning_rate        | 0.0003       |
|    loss                 | 138          |
|    n_updates            | 900          |
|    policy_gradient_loss | 0.000827     |
|    value_loss           | 99.4         |
------------------------------------------


----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 198        |
|    mean_reward          | 265        |
| time/                   |            |
|    total_timesteps      | 3800000    |
| train/                  |            |
|    approx_kl            | 0.00462112 |
|    clip_fraction        | 0.0613     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.521     |
|    explained_variance   | 0.98       |
|    learning_rate        | 0.0003     |
|    loss                 | 351        |
|    n_updates            | 924        |
|    policy_gradient_loss | 0.000236   |
|    value_loss           | 72.6       |
----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 215          |
|    mean_reward          | 280          |
| time/                   |              |
|    total_timesteps      | 3900000      |
| train/                  |              |
|    approx_kl            | 0.0056997277 |
|    clip_fraction        | 0.0449       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.503       |
|    explained_variance   | 0.996        |
|    learning_rate        | 0.0003       |
|    loss                 | 3.56         |
|    n_updates            | 952          |
|    policy_gradient_loss | 0.00125      |
|    value_loss           | 7.33         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 207          |
|    mean_reward          | 286          |
| time/                   |              |
|    total_timesteps      | 4000000      |
| train/                  |              |
|    approx_kl            | 0.0030909195 |
|    clip_fraction        | 0.0435       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.498       |
|    explained_variance   | 0.986        |
|    learning_rate        | 0.0003       |
|    loss                 | 2.8          |
|    n_updates            | 976          |
|    policy_gradient_loss | 0.000642     |
|    value_loss           | 30.3         |
------------------------------------------


------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 225          |
|    ep_rew_mean          | 278          |
| time/                   |              |
|    fps                  | 2253         |
|    iterations           | 250          |
|    time_elapsed         | 1817         |
|    total_timesteps      | 4096000      |
| train/                  |              |
|    approx_kl            | 0.0045940606 |
|    clip_fraction        | 0.0522       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.517       |
|    explained_variance   | 0.995        |
|    learning_rate        | 0.0003       |
|    loss                 | 1            |
|    n_updates            | 996          |
|    policy_gradient_loss | 0.00089      |
|    value_loss           | 4.72         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 209          |
|    mean_reward          | 287          |
| time/                   |              |
|    total_timesteps      | 4100000      |
| train/                  |              |
|    approx_kl            | 0.0034657593 |
|    clip_fraction        | 0.0442       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.511       |
|    explained_variance   | 0.997        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.758        |
|    n_updates            | 1000         |
|    policy_gradient_loss | 0.000456     |
|    value_loss           | 5.02         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 212          |
|    mean_reward          | 287          |
| time/                   |              |
|    total_timesteps      | 4200000      |
| train/                  |              |
|    approx_kl            | 0.0041257786 |
|    clip_fraction        | 0.0424       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.472       |
|    explained_variance   | 0.98         |
|    learning_rate        | 0.0003       |
|    loss                 | 5.69         |
|    n_updates            | 1024         |
|    policy_gradient_loss | 0.00117      |
|    value_loss           | 40.8         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 213          |
|    mean_reward          | 295          |
| time/                   |              |
|    total_timesteps      | 4300000      |
| train/                  |              |
|    approx_kl            | 0.0051349746 |
|    clip_fraction        | 0.05         |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.478       |
|    explained_variance   | 0.995        |
|    learning_rate        | 0.0003       |
|    loss                 | 4.98         |
|    n_updates            | 1048         |
|    policy_gradient_loss | 0.00187      |
|    value_loss           | 9.1          |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 211          |
|    mean_reward          | 306          |
| time/                   |              |
|    total_timesteps      | 4400000      |
| train/                  |              |
|    approx_kl            | 0.0039806697 |
|    clip_fraction        | 0.0552       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.462       |
|    explained_variance   | 0.977        |
|    learning_rate        | 0.0003       |
|    loss                 | 38           |
|    n_updates            | 1072         |
|    policy_gradient_loss | 0.00267      |
|    value_loss           | 77.5         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 205          |
|    mean_reward          | 284          |
| time/                   |              |
|    total_timesteps      | 4500000      |
| train/                  |              |
|    approx_kl            | 0.0045606135 |
|    clip_fraction        | 0.0452       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.475       |
|    explained_variance   | 0.99         |
|    learning_rate        | 0.0003       |
|    loss                 | 2.48         |
|    n_updates            | 1096         |
|    policy_gradient_loss | 0.001        |
|    value_loss           | 22.2         |
------------------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 209         |
|    mean_reward          | 288         |
| time/                   |             |
|    total_timesteps      | 4600000     |
| train/                  |             |
|    approx_kl            | 0.004160145 |
|    clip_fraction        | 0.0596      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.463      |
|    explained_variance   | 0.952       |
|    learning_rate        | 0.0003      |
|    loss                 | 3.11        |
|    n_updates            | 1120        |
|    policy_gradient_loss | -0.00072    |
|    value_loss           | 146         |
-----------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 201          |
|    mean_reward          | 283          |
| time/                   |              |
|    total_timesteps      | 4700000      |
| train/                  |              |
|    approx_kl            | 0.0042763553 |
|    clip_fraction        | 0.0415       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.46        |
|    explained_variance   | 0.997        |
|    learning_rate        | 0.0003       |
|    loss                 | 5.14         |
|    n_updates            | 1144         |
|    policy_gradient_loss | 0.0007       |
|    value_loss           | 9.19         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 197          |
|    mean_reward          | 276          |
| time/                   |              |
|    total_timesteps      | 4800000      |
| train/                  |              |
|    approx_kl            | 0.0044049295 |
|    clip_fraction        | 0.0459       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.463       |
|    explained_variance   | 0.992        |
|    learning_rate        | 0.0003       |
|    loss                 | 9.37         |
|    n_updates            | 1168         |
|    policy_gradient_loss | 0.00121      |
|    value_loss           | 16.5         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 291          |
|    mean_reward          | 282          |
| time/                   |              |
|    total_timesteps      | 4900000      |
| train/                  |              |
|    approx_kl            | 0.0050800573 |
|    clip_fraction        | 0.0533       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.456       |
|    explained_variance   | 0.984        |
|    learning_rate        | 0.0003       |
|    loss                 | 324          |
|    n_updates            | 1196         |
|    policy_gradient_loss | 0.0027       |
|    value_loss           | 66.6         |
------------------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 207          |
|    mean_reward          | 280          |
| time/                   |              |
|    total_timesteps      | 5000000      |
| train/                  |              |
|    approx_kl            | 0.0052814046 |
|    clip_fraction        | 0.0535       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.456       |
|    explained_variance   | 0.983        |
|    learning_rate        | 0.0003       |
|    loss                 | 156          |
|    n_updates            | 1220         |
|    policy_gradient_loss | 0.00149      |
|    value_loss           | 69.5         |
------------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x700c21673c70>

In [2]:
# Rebind `model` to the checkpoint stored under logs/ (the EvalCallback above
# was configured with best_model_save_path="./logs/"), so the following cells
# use the saved best model rather than the weights left at the end of training.
model = PPO.load("logs/best_model.zip")

In [3]:
from helper_videos import record_video, show_videos

# Roll out the reloaded agent for 2000 steps and save the recording
# under the "ppo-lunarlander-v2" file-name prefix.
record_video(
    env_id,
    model,
    video_length=2000,
    prefix="ppo-lunarlander-v2",
)

Saving video to /home/ubuntu/dev/lunarlander/videos/ppo-lunarlander-v2-step-0-to-step-2000.mp4
Moviepy - Building video /home/ubuntu/dev/lunarlander/videos/ppo-lunarlander-v2-step-0-to-step-2000.mp4.
Moviepy - Writing video /home/ubuntu/dev/lunarlander/videos/ppo-lunarlander-v2-step-0-to-step-2000.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready /home/ubuntu/dev/lunarlander/videos/ppo-lunarlander-v2-step-0-to-step-2000.mp4




In [4]:
# Play back the recording of the trained agent from the videos/ folder.
# NOTE(review): presumably show_videos selects files by file-name prefix; if so,
# "ppo-lunarlander-v2" would also match the "ppo-lunarlander-v2_base" recording
# created further down once that file exists on disk — confirm in helper_videos.
show_videos("videos/", prefix="ppo-lunarlander-v2")

In [5]:
# Untrained baseline: a freshly initialised PPO agent configured exactly like
# the trained one. learn() is never called on it; it only serves as a
# reference point for the videos and the evaluation below.
model_2 = PPO(
    policy="MlpPolicy",
    env=env,
    # Rollout / optimisation settings
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    # Return / advantage estimation
    gamma=0.999,
    gae_lambda=0.98,
    # Exploration bonus
    ent_coef=0.01,
    # Logging
    verbose=1,
    tensorboard_log=tensorboard_log,
)

Using cuda device


In [6]:
# Record the untrained baseline (model_2) with the same 2000-step length,
# using a distinct "_base" prefix so its file can be told apart from the
# trained agent's recording.
record_video(env_id, model_2, video_length=2000, prefix="ppo-lunarlander-v2_base")

Saving video to /home/ubuntu/dev/lunarlander/videos/ppo-lunarlander-v2_base-step-0-to-step-2000.mp4
Moviepy - Building video /home/ubuntu/dev/lunarlander/videos/ppo-lunarlander-v2_base-step-0-to-step-2000.mp4.
Moviepy - Writing video /home/ubuntu/dev/lunarlander/videos/ppo-lunarlander-v2_base-step-0-to-step-2000.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready /home/ubuntu/dev/lunarlander/videos/ppo-lunarlander-v2_base-step-0-to-step-2000.mp4


In [11]:
# Side-by-side playback: untrained baseline first, then the trained agent.
show_videos("videos/", prefix="ppo-lunarlander-v2_base")

# The trained run's file is named "ppo-lunarlander-v2-step-0-to-step-2000.mp4",
# while the baseline's is "ppo-lunarlander-v2_base-step-0-to-step-2000.mp4".
# The bare prefix "ppo-lunarlander-v2" is a leading substring of BOTH names, so
# the "-step" part is included here to select only the trained recording.
# NOTE(review): assumes show_videos picks files whose names start with `prefix`
# — confirm against helper_videos.
show_videos("videos/", prefix="ppo-lunarlander-v2-step")

In [13]:
from stable_baselines3.common.evaluation import evaluate_policy

# TRAINED MODEL
# Score the reloaded agent on the evaluation envs over 100 episodes and
# report the mean episode reward together with its standard deviation.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_envs,
    n_eval_episodes=100,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:284.18 +/- 21.81


In [14]:
# UNTRAINED MODEL
# Same 100-episode evaluation for the never-trained baseline (model_2).
# Note that this rebinds mean_reward / std_reward from the previous cell.
mean_reward, std_reward = evaluate_policy(
    model_2,
    eval_envs,
    n_eval_episodes=100,
)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:-402.44 +/- 188.88
