In [2]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from tqdm import tqdm  # Import tqdm for progress bar (I havent been able to implement it using this stable baseline libraries)

import torch # Might be useful in the future

In [3]:
# Select GLFW as the OpenGL backend by setting the MUJOCO_GL environment variable.
os.environ.update(MUJOCO_GL="glfw")

In [8]:
# Step 1: Define the environment factory (no rendering during training)
def modify_mass_parameters(model):
    """Overwrite the masses of selected bodies on ``model`` in place.

    Fixed values are written into ``model.body_mass`` at indices 1 through 7;
    nothing is returned. The body-part labels below are carried over from the
    original comments and are not checked here against the model's actual
    body ordering.
    """
    # (body index, new mass) pairs, applied in this exact order.
    mass_overrides = (
        (1, 4.15),  # torso
        (2, 0.6),   # thigh
        (5, 0.6),   # thigh
        (3, 0.3),   # leg
        (6, 0.3),   # leg
        (4, 0.1),   # foot
        (7, 0.1),   # foot
    )
    for body_index, mass in mass_overrides:
        model.body_mass[body_index] = mass

def make_env(render_mode=None, custom_masses=False):
    """Create one Walker2d-v5 environment.

    Called with no arguments it behaves exactly as before: an environment
    created with ``render_mode=None`` and no mass modification, which is the
    form the vectorized-environment factory list relies on.

    Args:
        render_mode: Passed straight through to ``gym.make``. ``None`` (the
            default) means no GUI during training.
        custom_masses: When true, apply ``modify_mass_parameters`` to the
            environment's model before returning it. Defaults to ``False``,
            matching the previously commented-out call.

    Returns:
        The environment object returned by ``gym.make``.
    """
    env = gym.make("Walker2d-v5", render_mode=render_mode)
    if custom_masses:
        # The model is only looked up when it is actually going to be edited.
        modify_mass_parameters(env.unwrapped.model)
    return env

In [9]:
# Step 2: Use multiple environments for faster training
# Do not run more environments than the machine has CPU cores: the requested
# count (4) is clamped to os.cpu_count(), which may return None, hence the
# fallback to 1. Raise the 4 on machines with more cores.
num_envs = min(4, os.cpu_count() or 1)
env = SubprocVecEnv([make_env for _ in range(num_envs)])

In [10]:
# Step 3: Configure the PPO agent.
# The hyperparameters are gathered in one mapping so they can be inspected or
# tweaked in a single place before the model is built.
ppo_hyperparams = {
    "learning_rate": 3e-4,
    "n_steps": 2048,
    "batch_size": 64,
    "n_epochs": 10,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "ent_coef": 0.01,
}
ppo_model = PPO(
    "MlpPolicy",  # policy
    env,          # the vectorized training environment
    verbose=1,
    device="auto",
    **ppo_hyperparams,
)

Using cpu device


In [None]:
# Step 4: Train the model
# With verbose=1 the learner prints a table like the one below as training
# proceeds; compare "total_timesteps" against `time_steps` to see how much
# training is left.
#
#     ----------------------------------------
#     | time/                   |            |
#     |    fps                  | 943        |
#     |    iterations           | 8          |
#     |    time_elapsed         | 69         |
#     |    total_timesteps      | 65536      |
#     | train/                  |            |
#     |    approx_kl            | 0.00866781 |
#     |    clip_fraction        | 0.0932     |
#     |    clip_range           | 0.2        |
#     |    entropy_loss         | -8.32      |
#     |    explained_variance   | 0.669      |
#     |    learning_rate        | 0.0003     |
#     |    loss                 | 54         |
#     |    n_updates            | 70         |
#     |    policy_gradient_loss | -0.015     |
#     |    std                  | 0.966      |
#     |    value_loss           | 84.9       |
#     ----------------------------------------
#
# NOTE(review): Stable-Baselines3's learn() offers a `progress_bar` flag (it
# needs both tqdm and rich installed) — confirm it is available in the
# installed version before enabling it instead of a hand-rolled tqdm bar.
time_steps = 500_000  # Adjust based on your training time
# The trailing ";" stops the notebook from echoing the returned model's repr.
ppo_model.learn(total_timesteps=time_steps);

-----------------------------
| time/              |      |
|    fps             | 2143 |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 8192 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1286        |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.018404648 |
|    clip_fraction        | 0.263       |
|    clip_range           | 0.2         |
|    entropy_loss         | -8.48       |
|    explained_variance   | -0.122      |
|    learning_rate        | 0.0003      |
|    loss                 | 3.47        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0333     |
|    std                  | 0.992       |
|    value_loss           | 7.24        |
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1da3a4251f0>

In [12]:
# Step 5: Save the trained model.
# To continue training from where it left off, load the saved model and call
# learn() on it again.
ppo_model.save(path="ppo_walker2d")

In [13]:
# Step 6: Close the environment
# Training is finished, so shut down the vectorized training environment
# created in Step 2 before a separate evaluation environment is opened.
env.close()

In [None]:
# Step 7: Load the trained model and evaluate
ppo_model = PPO.load("ppo_walker2d")
eval_env = gym.make("Walker2d-v5", render_mode="human")

# try/finally so the environment (and its viewer) is closed even if the
# evaluation raises or is interrupted from the notebook.
try:
    mean_reward, std_reward = evaluate_policy(ppo_model, eval_env, n_eval_episodes=5)
    print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")

    # Roll out one more episode to watch the policy.
    obs, _ = eval_env.reset()
    done = False
    while not done:
        action, _states = ppo_model.predict(obs)
        obs, reward, terminated, truncated, info = eval_env.step(action)
        # step() reports the end of an episode through two flags; the episode
        # is over when EITHER is set. Checking only the first one keeps
        # stepping an episode that was cut short (e.g. by a time limit).
        done = terminated or truncated
        # No explicit render() call: with render_mode="human" Gymnasium draws
        # the frame as part of step().
finally:
    eval_env.close()



Mean reward: 280.9908569574356, Std reward: 5.634012182536946
