### Import libraries

In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.monitor import Monitor

### Create the training environments with reward monitoring

In [2]:
# Select the OpenGL backend for MuJoCo through the MUJOCO_GL environment variable.
# NOTE(review): presumably this must run before MuJoCo sets up rendering — confirm.
os.environ.update({"MUJOCO_GL": "glfw"})

def make_env(rank):
    """Return a zero-argument factory that builds one monitored Walker2d-v5 env.

    The environment itself is not created here; it is only constructed when
    the returned callable is invoked.

    Args:
        rank: Index of this environment. It is interpolated into the Monitor
            log file name so that every environment gets its own log file.

    Returns:
        A callable taking no arguments that returns the Monitor-wrapped env.
    """
    monitor_path = f"./logs/monitor_{rank}.csv"

    def _build():
        # render_mode=None: no GUI during training.
        walker = gym.make("Walker2d-v5", render_mode=None)
        # One log file per environment.
        # NOTE(review): SB3's Monitor appears to append ".monitor.csv" when the
        # name does not already end with "monitor.csv" — confirm the actual
        # file names written under ./logs.
        return Monitor(walker, filename=monitor_path)

    return _build

# Number of parallel training environments; ranks 0 .. num_envs - 1 are handed
# to make_env so each environment gets its own monitor log file.
num_envs = 4
env = SubprocVecEnv(list(map(make_env, range(num_envs))))

### Define the PPO model with Stable-Baselines3, train it, and save the result

In [4]:
# Build the PPO agent on top of the vectorized training environment `env`.
# With n_steps=2048 and 4 environments, each rollout collects 8192 transitions
# (this matches the `total_timesteps` increments in the training log).
ppo_model = PPO(
    policy="MlpPolicy",
    env=env,
    learning_rate=3e-4,
    n_steps=2048,          # steps collected per environment before each update
    batch_size=64,         # minibatch size used during the update epochs
    n_epochs=10,           # optimisation passes over each rollout
    gamma=0.99,            # discount factor
    gae_lambda=0.95,       # GAE bias/variance trade-off
    clip_range=0.2,        # clipping for the policy ratio
    clip_range_vf=0.2,     # clipping for the value function update
    ent_coef=0.01,         # entropy bonus weight
    vf_coef=0.5,           # value loss weight
    target_kl=0.05,        # KL threshold used to cut an update short
    verbose=1,
    device="cpu",
    tensorboard_log="./ppo_logs/Tests_10M",
)

time_steps = 10_000_000  # Adjust based on your training time

try:
    ppo_model.learn(total_timesteps=time_steps, tb_log_name="ModifiedTorquesRewards2_10M")
    ppo_model.save("ppo_SB3_ModifiedTorquesRewards2_10Msteps")
finally:
    # Always shut the vectorized environment down, even if training raises or
    # is interrupted from the notebook; otherwise the environment worker
    # processes are left running.
    env.close()


Using cpu device
Logging to ./ppo_logs/Tests_10M\ModifiedTorquesRewards2_10M_1


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.1     |
|    ep_rew_mean     | 7.48     |
| time/              |          |
|    fps             | 2012     |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 27.4        |
|    ep_rew_mean          | 15.7        |
| time/                   |             |
|    fps                  | 1069        |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.019793052 |
|    clip_fraction        | 0.275       |
|    clip_range           | 0.2         |
|    clip_range_vf        | 0.2         |
|    entropy_loss         | -8.47       |
|    explained_variance   | 0.

### Load trained model and evaluate

In [8]:
# Reload the saved policy and open a rendering environment for evaluation.
ppo_model = PPO.load("ppo_SB3_ModifiedTorquesRewards2_10Msteps")
eval_env = gym.make("Walker2d-v5", render_mode="human")

try:
    # Quantitative check: average return over a handful of episodes.
    mean_reward, std_reward = evaluate_policy(ppo_model, eval_env, n_eval_episodes=5)
    print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")

    # Qualitative check: watch a single episode.
    obs, _ = eval_env.reset()
    done = False
    while not done:
        # deterministic=True matches the default used by evaluate_policy, so
        # the rendered episode reflects the same policy that was just scored.
        action, _states = ppo_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = eval_env.step(action)
        # Gymnasium reports the end of an episode through two flags; the
        # episode is over when the walker falls (terminated) OR when the time
        # limit is reached (truncated). Checking only the first one keeps
        # stepping a finished episode and never stops for a walker that
        # does not fall.
        done = terminated or truncated
        eval_env.render()
finally:
    # Release the viewer window even if evaluation fails or is interrupted.
    eval_env.close()

Mean reward: 3452.244850087166, Std reward: 646.5295345721495
