In [1]:
import warnings
import time
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

warnings.filterwarnings("ignore", category=UserWarning, module="gymnasium.wrappers.rendering")

In [4]:
# Train a PPO agent on LunarLander-v3 in several chunks, checkpointing after each one.
# `device` and `name_to_save` are read again by the testing cell below.
device = "cpu"
seed = 42  # fixed seed for the model's pseudo-random generators, so reruns are comparable
env = gym.make("LunarLander-v3")
model = PPO("MlpPolicy", env, device=device, seed=seed)

n_iters = 10
timesteps = 12000
name_to_save = f"results_p1/lunar_lander_PPO_{n_iters}_{timesteps}"

# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() can.
start = time.perf_counter()
for iteration in range(n_iters):
    # reset_num_timesteps=False keeps the running timestep counter across calls
    # instead of restarting it on every chunk.
    model.learn(total_timesteps=timesteps, reset_num_timesteps=False)
    # Same path every time: each checkpoint overwrites the previous one.
    model.save(name_to_save)
    print(f"Iteration {iteration} trained; model saved in {name_to_save}")

minutos, segundos = divmod(time.perf_counter() - start, 60)
print(f"*******Tiempo entreno: {int(minutos)} minutos y {segundos:.2f} segundos*******")

Iteration 0 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 1 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 2 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 3 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 4 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 5 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 6 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 7 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 8 trained; model saved in results_p1/lunar_lander_PPO_10_12000
Iteration 9 trained; model saved in results_p1/lunar_lander_PPO_10_12000
*******Tiempo entreno: 1 minutos y 58.49 segundos*******


## **Testing**

In [5]:
# Evaluate the saved PPO model for a fixed number of episodes while recording videos.
# Relies on `name_to_save` and `device` defined by the training cell.
num_episodes_test = 10
destiny_folder = "results_p1/videos"

# rgb_array rendering is needed so RecordVideo has frames to write.
test_env = gym.make("LunarLander-v3", render_mode="rgb_array")
test_env = Monitor(test_env)
test_env = RecordVideo(
    test_env,
    video_folder=destiny_folder,
    name_prefix="LL-PPO",
    # Only record the first `num_episodes_test` episodes.
    episode_trigger=lambda episode_id: episode_id < num_episodes_test,
)

try:
    test_model = PPO.load(name_to_save, device=device)
    episode_rewards, episode_lengths = evaluate_policy(
        test_model,
        test_env,
        n_eval_episodes=num_episodes_test,
        return_episode_rewards=True,
    )
finally:
    # Always close the env so the video recorder is finalized even if
    # loading or evaluation raises.
    test_env.close()

for i, reward in enumerate(episode_rewards):
    print(f"Episode {i}: Reward: {reward}")
# Average over the episodes actually returned rather than the configured count.
print(f"Mean reward: {sum(episode_rewards) / len(episode_rewards)}")

Episode 0: Reward: -47.214445
Episode 1: Reward: -100.306478
Episode 2: Reward: 7.35256
Episode 3: Reward: -21.649472
Episode 4: Reward: 43.633936
Episode 5: Reward: -25.923537
Episode 6: Reward: -37.2401
Episode 7: Reward: 112.232886
Episode 8: Reward: -13.915473
Episode 9: Reward: 6.683834
Mean reward: -7.6346289
