**Goal**: Use Stable Baselines 3 to solve the Gymnasium CartPole-v1 environment.

### Import

In [2]:
import os

import gymnasium as gym
from stable_baselines3 import PPO
from tqdm import tqdm
import numpy as np

### Stable Baselines 3

In [3]:
# Gymnasium environment ID; also used below to name the log and saved-model paths
env_name = "CartPole-v1"

In [4]:
# Output locations, both keyed by the environment name:
#   log_path  -> directory handed to PPO as its TensorBoard log dir
#   save_path -> file path the trained PPO model is saved to / loaded from
log_path = os.path.join("Training", "Logs", env_name)
save_path = os.path.join(
    "Training",
    "Saved Models",
    f"PPO_Model_{env_name}",
)

In [5]:
# Create the CartPole environment and train an agent on it with PPO.
# A fixed seed is passed to PPO so that re-running the notebook from a fresh
# kernel repeats the same training run instead of a different random one.
SEED = 42
env = gym.make(env_name)
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path, seed=SEED)
model.learn(total_timesteps=60000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training/Logs/CartPole-v1/PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.9     |
|    ep_rew_mean     | 21.9     |
| time/              |          |
|    fps             | 7832     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 27          |
|    ep_rew_mean          | 27          |
| time/                   |             |
|    fps                  | 4639        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008577067 |
|    clip_fraction        | 0.0976      |
|    clip_range    

<stable_baselines3.ppo.ppo.PPO at 0x17404ac50>

In [6]:
# Persist the trained PPO model to disk so the test cell can reload it
model.save(path=save_path)

### Test - trained policy

In [7]:
# Evaluate the saved policy: reload it on a fresh environment and record the
# total reward of each episode.
env = gym.make(env_name)
model = PPO.load(save_path, env=env)

episodes = 1000
total_rewards = []
for episode in tqdm(range(episodes)):
    state, _ = env.reset()
    score = 0
    terminated = truncated = False

    # Step until the episode ends (terminated or truncated), always taking the
    # policy's deterministic action for the current observation.
    while not (terminated or truncated):
        action, _ = model.predict(state, deterministic=True)
        state, reward, terminated, truncated, info = env.step(action)
        score += reward

    total_rewards.append(score)
total_rewards = np.array(total_rewards)
env.close()

print(
    f"The mean reward is {np.mean(total_rewards)} and the standard deviation is {np.std(total_rewards)}."
)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


100%|██████████| 1000/1000 [00:38<00:00, 25.64it/s]

The mean reward is 500.0 and the standard deviation is 0.0.



