## VecAdapter Class

Define the VecAdapter class to convert an EnvPool object to a Stable-Baselines3-compatible VecEnv.

## Helper Functions

Define utility functions for argument parsing, user interaction, and model management.

In [23]:
def create_or_load_model(args, env, policy_kwargs, use_vecnormalize=True):
    """Create a new PPO model, or resume a saved one with its VecNormalize stats.

    A saved model is resumed only when ``<model_save_path>.zip`` exists and the
    user opts in: ``args.force_new`` always starts fresh,
    ``args.continue_training`` always resumes, and otherwise
    ``ask_continue_or_restart`` decides.

    Args:
        args: Configuration object; reads ``model_save_path``, ``force_new``
            and ``continue_training``.
        env: Vectorised environment the model trains on.
        policy_kwargs: Keyword arguments forwarded to the policy of a NEW
            model. Not used when an existing model is loaded.
        use_vecnormalize: When resuming, also restore the saved VecNormalize
            statistics (``<model_save_path>_vecnormalize.pkl``) if present.

    Returns:
        Tuple ``(model, env)``. ``env`` is the input environment, or the
        wrapper returned by ``VecNormalize.load`` when statistics were restored.
    """
    model_path = f"{args.model_save_path}.zip"
    vecnorm_path = f"{args.model_save_path}_vecnormalize.pkl"

    # Precedence: --force-new > --continue-training > interactive prompt.
    # Without a checkpoint on disk there is nothing to resume.
    should_continue = False
    if os.path.exists(model_path):
        if args.force_new:
            print("Found existing model but --force-new specified. Starting fresh training.")
        elif args.continue_training:
            print("Found existing model and --continue-training specified. Continuing training.")
            should_continue = True
        else:
            # Interactive mode - ask user
            should_continue = ask_continue_or_restart(args.model_save_path)

    if should_continue:
        print(f"Loading existing model from {model_path}")

        if use_vecnormalize:
            if os.path.exists(vecnorm_path):
                print(f"Loading VecNormalize statistics from {vecnorm_path}")
                # NOTE(review): VecNormalize.load wraps whatever `env` it is handed.
                # If the caller already wrapped `env` in VecNormalize, observations
                # and rewards would be normalised twice — confirm the caller passes
                # an un-normalised env when resuming.
                env = VecNormalize.load(vecnorm_path, env)
                # Important: set training=True to continue updating statistics
                env.training = True
            else:
                # Previously this case was silent: the policy was resumed without
                # the statistics it was trained with.
                print(
                    f"Warning: VecNormalize statistics not found at {vecnorm_path}; "
                    "the loaded model will train with whatever normalization `env` already has."
                )

        model = PPO.load(model_path, env=env)
        print("Model loaded successfully. Continuing training...")
        return model, env

    print("Creating new model...")
    model = PPO(
        policy="MlpPolicy",
        env=env,

        # ───── PPO hyper-parameters ─────
        # NOTE(review): earlier comments attributed these to a paper's PPO
        # hyper-parameter table, but several values did not match what those
        # comments claimed. Values are unchanged; confirm which are intended.
        learning_rate=1e-4,      # Adam step size (old comment cited ≈ 1e-3 — TODO confirm)
        clip_range=0.2,          # PPO clipping range
        target_kl=0.01,          # limit on the KL divergence between updates
        n_steps=4096,            # rollout steps PER ENVIRONMENT per update (not 5 000 total samples)
        batch_size=1024,         # minibatch size
        n_epochs=8,              # optimisation epochs per update
        gamma=0.99,              # discount factor (γ)
        gae_lambda=0.95,         # GAE λ
        max_grad_norm=0.5,       # gradient-norm clipping threshold
        ent_coef=0.1,            # entropy bonus weight — non-zero, so a bonus IS applied (TODO confirm)
        vf_coef=1.0,             # value-loss weight (SB3's documented default is 0.5, not 1.0)

        # ───── bookkeeping ─────
        # NOTE(review): hard-coded; args.tb_log_dir is not consulted here — confirm.
        tensorboard_log="runs/ppo_taskspace",
        policy_kwargs=policy_kwargs,
        verbose=1,
    )
    print("New model created.")

    return model, env

In [24]:
# Configuration parameters - modify these as needed
class Args:
    """Training configuration for this notebook.

    Every setting has a default. Pass keyword arguments to override
    individual values without editing the class, e.g.
    ``Args(num_envs=8, seed=1)``.

    Raises:
        TypeError: If a keyword does not name a known setting (catches typos
            such as ``num_env=8``).
    """

    def __init__(self, **overrides):
        self.env_name = "Humanoid-v4"            # environment id
        self.num_envs = 128                      # number of parallel environments
        self.seed = 0                            # RNG seed
        self.total_timesteps = 100_000_000       # presumably the training budget; consumer not visible here
        self.tb_log_dir = "./logs"               # NOTE(review): not read by any code visible in this section
        self.model_save_path = "./quadruped_ppo_model"  # checkpoint path WITHOUT the ".zip" suffix
        self.render_mode = False                 # NOTE(review): not read by any code visible in this section
        self.continue_training = False           # resume an existing checkpoint without prompting
        self.force_new = False                   # ignore an existing checkpoint; wins over continue_training
        self.use_vecnormalize = True  # Enable VecNormalize by default

        # Apply overrides last, accepting only names that already have a default.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError(f"Unknown configuration option(s): {', '.join(unknown)}")
        vars(self).update(overrides)

args = Args()

# Echo the active configuration so it is captured in the notebook output.
print(
    "Training Configuration:",
    f"Environment: {args.env_name}",
    f"Number of environments: {args.num_envs}",
    f"Seed: {args.seed}",
    f"Total timesteps: {args.total_timesteps:,}",
    f"Use VecNormalize: {args.use_vecnormalize}",
    f"Model save path: {args.model_save_path}",
    sep="\n",
)

Training Configuration:
Environment: Humanoid-v4
Number of environments: 128
Seed: 0
Total timesteps: 100,000,000
Use VecNormalize: True
Model save path: ./quadruped_ppo_model


In [25]:
# Set up logging and directories: each run gets its own timestamped folder.
run_dir = os.path.join("runs_csv", datetime.now().strftime("%Y%m%d_%H%M%S"))
os.makedirs(run_dir, exist_ok=True)

# Build a logger that keeps every format (stdout, log, tensorboard, csv)
# NOTE(review): nothing visible in this section attaches `logger` to the
# model — confirm it is handed over later (e.g. model.set_logger(logger)).
logger = configure(
    run_dir,
    format_strings=("stdout", "log", "tensorboard", "csv")
)

logging.basicConfig(level=logging.INFO)
logging.info("Experiment: quadruped_ppo_experiment")
logging.info(
    "Using EnvPool for environment %s with %s envs. Seed: %s",
    args.env_name,
    args.num_envs,
    args.seed,
)

# Set random seeds. Seeding NumPy alone is not enough: the policy is a torch
# network, so its weight initialisation and action sampling draw from torch's RNG.
np.random.seed(args.seed)
th.manual_seed(args.seed)

print(f"Log directory: {run_dir}")

Logging to runs_csv/20250807_105817


INFO:root:Experiment: quadruped_ppo_experiment
INFO:root:Using EnvPool for environment Humanoid-v4 with 128 envs. Seed: 0


Log directory: runs_csv/20250807_105817


In [None]:
# Wrap with VecMonitor FIRST so episode statistics are computed from the raw
# rewards. With the previous order (VecMonitor outside VecNormalize with
# norm_reward=True) the monitor only ever saw normalised, clipped rewards.
# NOTE: this cell rebinds `env`; re-running it without re-creating the base
# environment stacks a second layer of wrappers.
env = VecMonitor(env)  # Monitor for tracking episode stats

# Apply VecNormalize if requested (outermost, AFTER VecMonitor)
vecnormalize_wrapper = None
if args.use_vecnormalize:
    print("Using VecNormalize wrapper...")
    vecnormalize_wrapper = VecNormalize(env, norm_obs=True, norm_reward=True, clip_reward=10.0)
    env = vecnormalize_wrapper

print("Environment setup complete.")

In [None]:
# Policy network settings handed to the PPO policy.
policy_kwargs = {
    # Tanh non-linearity between layers
    "activation_fn": th.nn.Tanh,
    # Separate actor (pi) and critic (vf) heads: 2 hidden layers of 256 units each.
    # NOTE(review): newer SB3 releases may prefer a plain dict over this
    # list-wrapped form — confirm against the installed version.
    "net_arch": [{"pi": [256, 256], "vf": [256, 256]}],
    # initial log std of the action noise: exp(-2.0) ≈ 0.135
    "log_std_init": -2.0,
}

print(
    "Policy architecture:",
    f"  Activation function: {policy_kwargs['activation_fn'].__name__}",
    f"  Network architecture: {policy_kwargs['net_arch']}",
    f"  Log std init: {policy_kwargs['log_std_init']}",
    sep="\n",
)

## Training

Start the training process. You can interrupt this cell to stop training early.