In [9]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback
import gym

# Train a PPO agent on CartPole-v1 and periodically evaluate it, keeping the
# best-performing checkpoint on disk.

# 1. Environment: create the CartPole training environment as a vectorized env.
env = make_vec_env("CartPole-v1", n_envs=1)

# 2. Callback: evaluate at a fixed interval and save the best model so far.
# The evaluation env is built with the same helper as the training env so that
# it is wrapped in a Monitor (needed for unmodified episode reward/length
# statistics) and has the same vectorized type as the training env. It is a
# separate instance so evaluation episodes never disturb training rollouts.
eval_env = make_vec_env("CartPole-v1", n_envs=1)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/best_model",  # where the best checkpoint is written
    log_path="./logs/",                        # where evaluation results are logged
    eval_freq=5000,                            # evaluate every 5000 callback steps
    deterministic=True,                        # use deterministic actions when evaluating
    render=False,
)

# 3. Model and 4. Policy: PPO algorithm with an MLP-based policy.
model = PPO(
    policy="MlpPolicy",  # policy network type
    env=env,             # training environment
    verbose=1,
    tensorboard_log="./ppo_log/",
)

# 5. Start training; the callback runs the periodic evaluations.
model.learn(total_timesteps=20000, callback=eval_callback)

Using cpu device
Logging to ./ppo_log/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20.1     |
|    ep_rew_mean     | 20.1     |
| time/              |          |
|    fps             | 2184     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 25.1        |
|    ep_rew_mean          | 25.1        |
| time/                   |             |
|    fps                  | 1520        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008907101 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 

<stable_baselines3.ppo.ppo.PPO at 0x202b8876ad0>