In [None]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

In [None]:
# Vectorized training environment: 16 LunarLander-v2 instances behind one
# batched interface.
env = make_vec_env(env_id="LunarLander-v2", n_envs=16)

In [None]:
# PPO agent with an MLP policy, bound to the `env` defined earlier in the
# notebook. Hyperparameters are grouped by the role they play (see the SB3 PPO
# documentation for the exact semantics of each one).
model = PPO(
    "MlpPolicy",
    env,
    # Rollout collection and optimisation schedule.
    n_steps=1024,      # steps gathered per environment before each update
    batch_size=64,     # minibatch size
    n_epochs=4,        # optimisation epochs per update
    # Return / advantage estimation.
    gamma=0.999,       # discount factor
    gae_lambda=0.98,   # GAE lambda
    # Exploration.
    ent_coef=0.01,     # entropy coefficient in the loss
    # Logging.
    verbose=1,
)

In [None]:
# Where the trained agent is written; the same name is reused later in the
# notebook when the model is loaded back.
model_name = "./model/ppo-LunarLander-v2"

# Train for one million timesteps, then persist the result.
model.learn(total_timesteps=1_000_000)
model.save(model_name)

In [None]:
# Evaluate the trained agent on a separate, single (non-vectorized) environment
# so the evaluation does not share state with the training environments.
eval_env = gym.make('LunarLander-v2')
try:
    # n_eval_episodes / deterministic are the library defaults, spelled out so
    # the protocol behind the reported number is visible in the notebook.
    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=10,
        deterministic=True,
    )
finally:
    # Release the environment even if evaluation fails or is interrupted.
    eval_env.close()

# Report both statistics with the same precision.
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Restore the agent from the checkpoint written by the training cell
# (`model_name` is assigned there).
model = PPO.load(path=model_name)

# Use a plain single environment for playback rather than a vectorized one
# (make_vec_env): the rollout loop that follows treats `reward` and `done`
# as scalars.
env = gym.make('LunarLander-v2')

In [None]:
# Play one rendered episode with the loaded agent and report its total reward.
#
# This cell uses the classic Gym API: `reset()` returns only the observation and
# `step()` returns a 4-tuple (obs, reward, done, info).
episode_reward = 0

try:
    obs = env.reset()
    done = False
    while not done:
        env.render()
        # NOTE(review): predict() is called without `deterministic=True`, so
        # actions are presumably sampled from the policy; confirm this is the
        # intended playback mode.
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
finally:
    # Always release the environment (and its render window), including when
    # the loop raises or the kernel is interrupted mid-episode.
    env.close()

print(f'episode_reward: {episode_reward:.3f}')