In [4]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy



In [5]:
# --- Configuration ----------------------------------------------------------
# Everything a reader might want to tune lives here instead of being buried
# in the calls below.
env_name = "CartPole-v1"
SEED = 42                 # passed to PPO so repeated runs are comparable
TOTAL_TIMESTEPS = 20_000  # training budget, counted in environment steps
N_EVAL_EPISODES = 10      # number of episodes averaged during evaluation

# --- Environment ------------------------------------------------------------
# DummyVecEnv takes a list of zero-argument factories. The factory builds the
# gym environment itself rather than closing over a variable that is rebound
# to the wrapper in the same statement, so the callable stays valid no matter
# when it is invoked.
env = DummyVecEnv([lambda: gym.make(env_name)])

# --- Agent ------------------------------------------------------------------
# Proximal Policy Optimization with the default MLP policy; verbose=1 prints
# the per-iteration training table.
model = PPO("MlpPolicy", env, verbose=1, seed=SEED)

# --- Training ---------------------------------------------------------------
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# --- Evaluation -------------------------------------------------------------
# NOTE: the policy is evaluated on the same vectorized environment it was
# trained on (model.get_env()).
mean_reward, std_reward = evaluate_policy(
    model, model.get_env(), n_eval_episodes=N_EVAL_EPISODES
)
print(f"Mean reward: {mean_reward} +/- {std_reward}")




Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1715 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1181        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009218229 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.000165    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.16        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0141     |
|    value_loss           | 53.2        |
-----------------------------------------
-----------------

In [6]:
# Test the trained agent: roll the policy forward for a fixed number of steps,
# always picking its most likely action (deterministic=True), and draw each
# frame.
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    # `env` is a vectorized environment: when an episode ends it resets the
    # sub-environment automatically and `obs` already holds the first
    # observation of the next episode, so no manual reset is needed here.
    obs, rewards, dones, info = env.step(action)
    # Use the default render mode; "ipython" is not a mode the environment
    # defines.
    env.render()



In [None]:
# Close the vectorized environment now that training and testing are finished.
env.close()