In [1]:
import os
import random
import shutil
import numpy as np
import matplotlib.pyplot as plt

import gymnasium as gym

In [2]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

In [3]:
# Directory handed to the agent as its TensorBoard log location.
log_dir = "./ppo_cartpole_tensorboard/"

# Start from a clean slate: drop whatever a previous execution left behind...
stale_logs_present = os.path.exists(log_dir)
if stale_logs_present:
    shutil.rmtree(log_dir)

# ...then make sure an empty directory is in place for this run.
os.makedirs(log_dir, exist_ok=True)

In [4]:
# Run configuration, kept together so a reader can tune it in one place.
# A fixed seed makes repeated "Restart & Run All" executions start from the
# same random state. NOTE: bit-exact reproducibility across different
# machines / GPU backends is still not guaranteed.
SEED = 0
TOTAL_TIMESTEPS = 1_000_000

# Create the environment (a vectorized wrapper around a single CartPole instance)
env = make_vec_env('CartPole-v1', n_envs=1, seed=SEED)

# Initialize the agent
# Training curves can be inspected with:
#   tensorboard --logdir ./ppo_cartpole_tensorboard/
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_dir, seed=SEED)

# Train the agent; events are written under `log_dir` with the 'first_run' name
model.learn(total_timesteps=TOTAL_TIMESTEPS, tb_log_name='first_run')

# Evaluate the agent over 10 episodes on the environment attached to the model
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f'Mean reward: {mean_reward} +/- {std_reward}')

In [5]:
# Test the trained agent
#
# With the Gymnasium-based API the render mode has to be chosen when the
# environment is constructed. The training `env` was created without one, so
# calling `env.render('human')` on it only emits a warning and shows nothing.
# A dedicated environment is therefore built for visualisation; the training
# `env` is left untouched.
N_RENDER_STEPS = 10000

render_env = make_vec_env('CartPole-v1', n_envs=1, env_kwargs={'render_mode': 'human'})
try:
    obs = render_env.reset()
    for _ in range(N_RENDER_STEPS):
        action, _states = model.predict(obs, deterministic=True)
        # In 'human' mode the environment draws itself on every step, so no
        # explicit render() call is needed (an extra call would draw each
        # frame twice). The vectorized env resets itself when an episode ends.
        obs, rewards, dones, info = render_env.step(action)
finally:
    # Always release the render window, even if the loop is interrupted.
    render_env.close()

---