In [4]:
import gymnasium as gym
import tensorflow as tf
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

## Check GPU availability

In [3]:
assert tf.config.list_physical_devices('GPU')
assert tf.test.is_built_with_cuda()

## Create the learning environment

In [11]:
#env = Monitor(gym.make('LunarLander-v2', render_mode='human'))
env = make_vec_env('LunarLander-v2', n_envs=16)

## Create a virtual display for visualizing the environment

In [5]:
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f0a0352e550>

## Train the agent using PPO (Proximal Policy Optimization) algorithm

In [7]:
model = PPO(
    policy = 'MlpPolicy',
    env = env,
    n_steps = 1024,
    batch_size = 64,
    n_epochs = 4,
    gamma = 0.999,
    gae_lambda = 0.98,
    ent_coef = 0.01,
    verbose=1)

Using cuda device


## Train the model

In [8]:
model.learn(total_timesteps=1000000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.8     |
|    ep_rew_mean     | -184     |
| time/              |          |
|    fps             | 3735     |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 16384    |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 98.7          |
|    ep_rew_mean          | -151          |
| time/                   |               |
|    fps                  | 2758          |
|    iterations           | 2             |
|    time_elapsed         | 11            |
|    total_timesteps      | 32768         |
| train/                  |               |
|    approx_kl            | 0.009995481   |
|    clip_fraction        | 0.0498        |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.38         |
|    explained_variance   | -0.0032986403 |


<stable_baselines3.ppo.ppo.PPO at 0x7f338fed0dd0>

## Save the model

In [8]:
model.save("./models/ppo-LunarLander-v2")

## Delete the in memory model and load it from a file

In [8]:
del model

NameError: name 'model' is not defined

In [9]:
model = PPO.load("./models/ppo-LunarLander-v2", print_system_info=True)

== CURRENT SYSTEM INFO ==
- OS: Linux-6.11.3-2-default-x86_64-with-glibc2.35 # 1 SMP PREEMPT_DYNAMIC Fri Oct 11 06:48:36 UTC 2024 (7881e90)
- Python: 3.11.0rc1
- Stable-Baselines3: 2.3.2
- PyTorch: 2.5.0+cu124
- GPU Enabled: True
- Numpy: 2.0.2
- Cloudpickle: 3.1.0
- Gymnasium: 0.29.1

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-6.11.3-2-default-x86_64-with-glibc2.35 # 1 SMP PREEMPT_DYNAMIC Fri Oct 11 06:48:36 UTC 2024 (7881e90)
- Python: 3.11.0rc1
- Stable-Baselines3: 2.3.2
- PyTorch: 2.5.0+cu124
- GPU Enabled: True
- Numpy: 2.0.2
- Cloudpickle: 3.1.0
- Gymnasium: 0.29.1



## Evaluate the model using a new training environment

In [10]:
eval_env = Monitor(gym.make("LunarLander-v2"))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=277.24 +/- 20.09512655789409


## Generate a video of the agent

In [5]:
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

env_id = 'LunarLander-v2'
video_folder = './videos/'
video_length = 1000

env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])

obs = env.reset()

env = VecVideoRecorder(env, video_folder,
                       record_video_trigger=lambda x: x == 0, video_length=video_length,
                       name_prefix="PPO-agent-{}".format(env_id))

env.reset()
for _ in range(video_length + 1):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
# Save the video
env.close()

Saving video to /tf/demo/videos/PPO-agent-LunarLander-v2-step-0-to-step-1000.mp4
Moviepy - Building video /tf/demo/videos/PPO-agent-LunarLander-v2-step-0-to-step-1000.mp4.
Moviepy - Writing video /tf/demo/videos/PPO-agent-LunarLander-v2-step-0-to-step-1000.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /tf/demo/videos/PPO-agent-LunarLander-v2-step-0-to-step-1000.mp4




In [6]:
%%HTML
<video width="500" height="500" controls>
  <source src="./videos/PPO-agent-LunarLander-v2-step-0-to-step-1000.mp4" type="video/mp4">
</video>