# Quadruped PPO Training and Evaluation with EnvPool

This notebook provides a comprehensive pipeline for training and evaluating PPO (Proximal Policy Optimization) models for quadruped control using EnvPool. The code has been refactored to use common modules for better organization and reusability.

## Features:
- **Modular Design**: Common components moved to separate modules
- **Interactive Training**: Train models with progress monitoring
- **Comprehensive Evaluation**: Detailed performance metrics and visualizations
- **Model Management**: Save/load models with normalization statistics
- **Environment Support**: Multiple MuJoCo environments via EnvPool

## Notebook Structure:
1. Import libraries and setup
2. Configure training parameters
3. Create and configure the training environment
4. Define the PPO model architecture
5. Train the PPO model
6. Save model and normalization statistics
7. Load the trained model for evaluation
8. Evaluate model performance
9. Visualize results

## 1. Import Required Libraries

Import all necessary libraries including the newly refactored common modules for training and evaluation.

In [None]:
#!/usr/bin/env python3
import os
import sys
import logging
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import time
from packaging import version
from IPython.display import display, clear_output
import warnings
warnings.filterwarnings('ignore')

# Deep learning and RL libraries
import torch as th
import envpool
import gym
import gymnasium
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecMonitor, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy

# Import our refactored common modules
sys.path.append('./common')
from common import (
    VecAdapter,
    setup_environment,
    create_policy_kwargs,
    create_ppo_model,
    setup_logging,
    create_or_load_model,
    save_model_and_stats,
    setup_vecnormalize,
    find_vecnormalize_wrapper,
    load_model_and_normalization,
    detailed_evaluation,
    print_evaluation_results,
    save_evaluation_results
)

# Set up visualization.
# The 'seaborn-v0_8' style sheet only exists in Matplotlib >= 3.6; older
# releases ship it as 'seaborn'. Pick whichever is installed so this cell does
# not fail with OSError on an older Matplotlib.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
elif 'seaborn' in plt.style.available:
    plt.style.use('seaborn')
else:
    plt.style.use('default')
sns.set_palette("husl")

# Force PyTorch to use one thread for speed
th.set_num_threads(1)

# Report the runtime so a saved notebook records what it was executed with.
cuda_available = th.cuda.is_available()
print("✅ All libraries imported successfully!")
print(f"🚀 PyTorch version: {th.__version__}")
print(f"🎯 EnvPool available: {envpool.__version__}")
print(f"🤖 GPU available: {cuda_available}")
if cuda_available:
    # Only device 0 is reported, matching the single-GPU setup used below.
    print(f"   GPU: {th.cuda.get_device_name(0)}")
    print(f"   Memory: {th.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

## 2. Setup Environment Configuration

Configure training parameters, environment settings, and logging for reproducible experiments.

In [None]:
# ============================================================================
# TRAINING CONFIGURATION
# ============================================================================
# All tunables for the notebook live in this cell; later cells only read them.

# Environment settings
ENV_NAME = "Humanoid-v4"  # Try: "Ant-v4", "HalfCheetah-v4", "Hopper-v4", "Walker2d-v4"
NUM_ENVS = 256            # Number of parallel environments (higher = faster training)
SEED = 42                 # Random seed for reproducibility

# Training parameters
TOTAL_TIMESTEPS = 1_000_000  # Total training steps (adjust for shorter/longer training)
USE_VECNORMALIZE = True      # Use observation/reward normalization (recommended)
RENDER_MODE = False          # Set to True for visualization during training (slower)

# Model save/load settings
MODEL_SAVE_PATH = "./models/quadruped_ppo_model"
FORCE_NEW = False           # Set True to always start fresh training
CONTINUE_TRAINING = False   # Set True to automatically continue from existing model

# Evaluation settings
N_EVAL_EPISODES = 20       # Number of episodes for evaluation
DETERMINISTIC_EVAL = True  # Use deterministic actions during evaluation

# ============================================================================
# SETUP LOGGING AND REPRODUCIBILITY
# ============================================================================

# Set random seeds for reproducibility. Python's own RNG is seeded as well so
# that any library code relying on `random` is deterministic too.
import random

random.seed(SEED)
np.random.seed(SEED)
th.manual_seed(SEED)
if th.cuda.is_available():
    # Seed every visible GPU, not just the current device.
    th.cuda.manual_seed_all(SEED)

# Setup logging
logger = setup_logging()
logging.info("🚀 Starting PPO training experiment")
logging.info("Environment: %s, Envs: %s, Seed: %s", ENV_NAME, NUM_ENVS, SEED)

# Create model directory. A bare file name has an empty dirname, and
# os.makedirs("") raises FileNotFoundError, so only create it when present.
model_dir = os.path.dirname(MODEL_SAVE_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

print("📋 Configuration Summary:")
print(f"   Environment: {ENV_NAME}")
print(f"   Parallel Envs: {NUM_ENVS}")
print(f"   Training Steps: {TOTAL_TIMESTEPS:,}")
print(f"   VecNormalize: {USE_VECNORMALIZE}")
print(f"   Model Path: {MODEL_SAVE_PATH}")
print(f"   Seed: {SEED}")
print("✅ Configuration complete!")

## 3. Create and Configure Training Environment

Set up the EnvPool environment with proper wrappers and normalization.

In [None]:
# ============================================================================
# CREATE TRAINING ENVIRONMENT
# ============================================================================
# Builds the vectorized training environment and wraps it with optional
# normalization and episode monitoring. Produces `env` and
# `vecnormalize_wrapper` for the following cells.

print("🌍 Creating training environment...")

# Create base environment using our utility function
env = setup_environment(
    env_name=ENV_NAME,
    num_envs=NUM_ENVS,
    seed=SEED,
    render_mode=RENDER_MODE
)

print(f"✅ Base environment created: {ENV_NAME}")
print(f"   Action space: {env.action_space}")
print(f"   Observation space: {env.observation_space}")
print(f"   Number of environments: {env.num_envs}")

# Apply VecNormalize wrapper if requested
env, vecnormalize_wrapper = setup_vecnormalize(env, USE_VECNORMALIZE)
if USE_VECNORMALIZE:
    print("✅ VecNormalize wrapper applied")

# Apply monitoring wrapper
# NOTE(review): VecMonitor sits outside VecNormalize here, so the logged
# episode returns are presumably the normalized rewards when reward
# normalization is enabled — confirm whether the order should be swapped.
env = VecMonitor(env)
print("✅ VecMonitor wrapper applied")

# Resolve the environment id defensively: the attribute is looked up through
# the wrapper stack, and the underlying spec may not expose `id`. Fall back to
# the configured name rather than failing on an informational print.
env_id = getattr(getattr(env, "spec", None), "id", ENV_NAME)

# Display environment information
print("\n📊 Environment Information:")
print(f"   Environment ID: {env_id}")
print(f"   Action dimensions: {env.action_space.shape}")
print(f"   Observation dimensions: {env.observation_space.shape}")
print(f"   Action bounds: [{env.action_space.low[0]:.2f}, {env.action_space.high[0]:.2f}]")
print(f"   Wrapper stack: VecAdapter → {'VecNormalize → ' if USE_VECNORMALIZE else ''}VecMonitor")
print("✅ Environment setup complete!")

## 4. Define PPO Model Architecture

Configure the policy network architecture and hyperparameters for PPO training.

In [None]:
# ============================================================================
# CREATE OR LOAD PPO MODEL
# ============================================================================
# Builds the policy configuration, creates (or loads) the PPO model, and prints
# the hyperparameters actually in effect on the resulting model object.

print("🤖 Setting up PPO model...")

# Create policy kwargs using our utility function
policy_kwargs = create_policy_kwargs()

print("🏗️ Policy Architecture:")
print(f"   Activation: {policy_kwargs['activation_fn'].__name__}")
print(f"   Network architecture: {policy_kwargs['net_arch']}")
print(f"   Log std initialization: {policy_kwargs['log_std_init']}")

# Create or load model using our utility function
model, env = create_or_load_model(
    model_save_path=MODEL_SAVE_PATH,
    env=env,
    policy_kwargs=policy_kwargs,
    use_vecnormalize=USE_VECNORMALIZE,
    force_new=FORCE_NEW,
    continue_training=CONTINUE_TRAINING
)

# `create_or_load_model` hands back an `env` that may differ from the one we
# passed in, so always re-resolve the VecNormalize wrapper from the returned
# env. Refreshing only when the reference was None would leave a stale wrapper
# behind and later save statistics that the model was not trained with.
# (Assumes find_vecnormalize_wrapper returns None when no wrapper is present —
# TODO confirm.)
if USE_VECNORMALIZE:
    current_wrapper = find_vecnormalize_wrapper(env)
    if current_wrapper is not None:
        vecnormalize_wrapper = current_wrapper

# Set the logger
model.set_logger(logger)

print("\n🎯 PPO Hyperparameters:")
print(f"   Learning rate: {model.learning_rate}")
print(f"   Clip range: {model.clip_range}")
print(f"   Target KL: {model.target_kl}")
print(f"   Steps per rollout: {model.n_steps}")
print(f"   Batch size: {model.batch_size}")
print(f"   Epochs per update: {model.n_epochs}")
print(f"   Gamma (discount): {model.gamma}")
print(f"   GAE lambda: {model.gae_lambda}")
print(f"   Max gradient norm: {model.max_grad_norm}")
print(f"   Entropy coefficient: {model.ent_coef}")
print(f"   Value function coefficient: {model.vf_coef}")

print("✅ Model setup complete!")

## 5. Train the PPO Model

Execute the training loop with progress monitoring and interrupt handling.

In [None]:
# ============================================================================
# TRAIN THE MODEL
# ============================================================================
# Runs `model.learn` and makes sure a checkpoint is written if training is
# interrupted or fails. Unexpected errors are re-raised after the save attempt.

print("🚀 Starting PPO training...")
print(f"🎯 Target timesteps: {TOTAL_TIMESTEPS:,}")
print("💡 Tip: Press 'Interrupt' button to stop training and save the model")
print("=" * 60)

# Record training start time
training_start_time = time.time()

try:
    # Keep the try body minimal: only the call that is expected to raise.
    model.learn(total_timesteps=TOTAL_TIMESTEPS)

except KeyboardInterrupt:
    # Handle graceful interruption
    training_end_time = time.time()
    training_duration = training_end_time - training_start_time

    print("\n⚠️ Training interrupted by user!")
    print(f"⏱️ Training time before interruption: {training_duration:.2f} seconds")
    print("💾 Saving model before exit...")

    # Save the model immediately
    save_model_and_stats(model, MODEL_SAVE_PATH, vecnormalize_wrapper)
    print("✅ Model saved successfully!")

except Exception as e:
    print(f"❌ Training failed with error: {e}")
    print("💾 Attempting to save model...")
    # The save is best-effort: if it fails too, report that and still surface
    # the original training error instead of replacing it with the save error.
    try:
        save_model_and_stats(model, MODEL_SAVE_PATH, vecnormalize_wrapper)
    except Exception as save_error:
        print(f"⚠️ Could not save model after failure: {save_error}")
    raise

else:
    # Training completed successfully
    training_end_time = time.time()
    training_duration = training_end_time - training_start_time

    print("🎉 Training completed successfully!")
    print(f"⏱️ Total training time: {training_duration:.2f} seconds ({training_duration/3600:.2f} hours)")

print("✅ Training phase complete!")

## 6. Save Model and Normalization Statistics

Save the trained model and VecNormalize statistics for future use.

In [None]:
# ============================================================================
# SAVE TRAINED MODEL
# ============================================================================
# Persist the model (plus normalization statistics) and then report which of
# the expected artifacts actually exist on disk, with their sizes.

print("💾 Saving trained model and statistics...")

# Write the model archive and, when available, the VecNormalize statistics
save_model_and_stats(model, MODEL_SAVE_PATH, vecnormalize_wrapper)

# Paths of the artifacts we expect the save step to have produced
model_file = "{}.zip".format(MODEL_SAVE_PATH)
vecnorm_file = "{}_vecnormalize.pkl".format(MODEL_SAVE_PATH)

print("\n📁 Saved files:")

# Model archive: size reported in megabytes
if not os.path.exists(model_file):
    print(f"   ❌ Model file not found: {model_file}")
else:
    model_size = os.path.getsize(model_file) / (1024 * 1024)
    print(f"   ✅ Model: {model_file} ({model_size:.2f} MB)")

# Normalization statistics: only relevant when VecNormalize is enabled,
# size reported in kilobytes
if USE_VECNORMALIZE:
    if os.path.exists(vecnorm_file):
        vecnorm_size = os.path.getsize(vecnorm_file) / 1024
        print(f"   ✅ VecNormalize: {vecnorm_file} ({vecnorm_size:.2f} KB)")
    else:
        print(f"   ❌ VecNormalize file not found: {vecnorm_file}")

print("✅ Model saving complete!")

## 7. Load Trained Model for Evaluation

Load the saved model and configure the evaluation environment.

In [None]:
# ============================================================================
# SETUP EVALUATION ENVIRONMENT
# ============================================================================
# Builds a separate single-instance environment and loads the saved model
# (and its normalization file, if one is detected) on top of it.

print("🔍 Setting up evaluation environment...")

# Evaluation uses one environment, a seed offset from the training seed, and
# no rendering.
eval_env_settings = {
    "env_name": ENV_NAME,
    "num_envs": 1,
    "seed": SEED + 1000,
    "render_mode": None,
}
eval_env = setup_environment(**eval_env_settings)

# The loader returns the model together with the (possibly re-wrapped) env;
# auto_detect_vecnorm asks it to look for the VecNormalize file itself.
eval_model, eval_env = load_model_and_normalization(
    model_path=MODEL_SAVE_PATH,
    env=eval_env,
    auto_detect_vecnorm=True,
)

print("✅ Evaluation environment ready!")
for summary_line in (
    f"   Model loaded from: {MODEL_SAVE_PATH}.zip",
    f"   Evaluation environment: {ENV_NAME}",
    f"   Number of evaluation episodes: {N_EVAL_EPISODES}",
    f"   Deterministic policy: {DETERMINISTIC_EVAL}",
):
    print(summary_line)

## 8. Evaluate Model Performance

Run comprehensive evaluation and analyze performance metrics.

In [None]:
# ============================================================================
# RUN DETAILED EVALUATION
# ============================================================================
# Evaluates the loaded model, prints and saves the results, and exposes the
# per-episode series and summary statistics for the visualization cell.

print("📊 Running detailed evaluation...")

# Run the evaluation episodes; verbose=True is passed through to the helper.
eval_results = detailed_evaluation(
    model=eval_model,
    env=eval_env,
    n_eval_episodes=N_EVAL_EPISODES,
    deterministic=DETERMINISTIC_EVAL,
    verbose=True,
)

# The print and save helpers take the same keyword arguments, so build them once.
report_kwargs = {
    "results": eval_results,
    "env_name": ENV_NAME,
    "model_path": MODEL_SAVE_PATH,
    "n_eval_episodes": N_EVAL_EPISODES,
}

# Human-readable report on stdout
print_evaluation_results(**report_kwargs)

# Persist the same results; the helper returns the path it wrote to
results_file = save_evaluation_results(**report_kwargs)

# Pull out the values the visualization cell reads
episode_rewards, episode_lengths, mean_reward, std_reward = (
    eval_results[key]
    for key in ('episode_rewards', 'episode_lengths', 'mean_reward', 'std_reward')
)

print("✅ Evaluation complete!")
print(f"📄 Results saved to: {results_file}")

## 9. Visualize Training and Evaluation Results

Create visualizations of the evaluation results (reward and episode-length distributions, per-episode rewards, and summary metrics), then print a final summary report.

In [None]:
# ============================================================================
# CREATE COMPREHENSIVE VISUALIZATIONS
# ============================================================================
# Draws a 2x3 grid summarizing the evaluation run. Reads `episode_rewards`,
# `episode_lengths`, `mean_reward` and `std_reward` from the evaluation cell.

print("📈 Creating visualizations...")

# Set up the plotting style. 'seaborn-v0_8' only exists in Matplotlib >= 3.6,
# so fall back to the older name (or the default style) when it is missing.
plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)
fig = plt.figure(figsize=(20, 12))

# Convert once so every subplot works on the same arrays and the summary
# statistics are not recomputed for each label.
rewards_arr = np.asarray(episode_rewards, dtype=float)
lengths_arr = np.asarray(episode_lengths, dtype=float)
median_reward = np.median(rewards_arr)
min_reward = np.min(rewards_arr)
max_reward = np.max(rewards_arr)
mean_length = np.mean(lengths_arr)
median_length = np.median(lengths_arr)
success_rate = np.sum(rewards_arr > 0) / len(rewards_arr) * 100

# ---- Subplot 1: Episode Rewards Distribution ----
plt.subplot(2, 3, 1)
plt.hist(rewards_arr, bins=15, alpha=0.7, color='skyblue', edgecolor='black')
plt.axvline(mean_reward, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_reward:.2f}')
plt.axvline(median_reward, color='green', linestyle='--', linewidth=2, label=f'Median: {median_reward:.2f}')
plt.xlabel('Episode Reward')
plt.ylabel('Frequency')
plt.title('Distribution of Episode Rewards')
plt.legend()
plt.grid(True, alpha=0.3)

# ---- Subplot 2: Episode Rewards Over Time ----
plt.subplot(2, 3, 2)
episodes = np.arange(1, len(rewards_arr) + 1)
plt.plot(episodes, rewards_arr, 'o-', alpha=0.7, color='blue', markersize=4)
plt.axhline(mean_reward, color='red', linestyle='--', alpha=0.8, label=f'Mean: {mean_reward:.2f}')
plt.fill_between(episodes, mean_reward - std_reward, mean_reward + std_reward, alpha=0.2, color='red', label='±1 STD')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Episode Rewards Over Time')
plt.legend()
plt.grid(True, alpha=0.3)

# ---- Subplot 3: Episode Lengths Distribution ----
plt.subplot(2, 3, 3)
plt.hist(lengths_arr, bins=15, alpha=0.7, color='lightgreen', edgecolor='black')
plt.axvline(mean_length, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_length:.1f}')
plt.axvline(median_length, color='blue', linestyle='--', linewidth=2, label=f'Median: {median_length:.1f}')
plt.xlabel('Episode Length')
plt.ylabel('Frequency')
plt.title('Distribution of Episode Lengths')
plt.legend()
plt.grid(True, alpha=0.3)

# ---- Subplot 4: Performance Summary Box Plot ----
plt.subplot(2, 3, 4)
# The `labels` keyword of boxplot is deprecated in Matplotlib 3.9; setting the
# tick label separately works on both old and new versions.
bp = plt.boxplot([rewards_arr], patch_artist=True)
plt.xticks([1], ['Rewards'])
bp['boxes'][0].set_facecolor('lightblue')
bp['medians'][0].set_color('red')
bp['medians'][0].set_linewidth(2)
plt.ylabel('Reward')
plt.title('Performance Summary')
plt.grid(True, alpha=0.3)

# Add statistics text. The box is centered on x=1, so widen the x-range and
# place the text to the right of it instead of drawing over the box.
stats_text = f"""Statistics:
Mean: {mean_reward:.2f} ± {std_reward:.2f}
Min: {min_reward:.2f}
Max: {max_reward:.2f}
Median: {median_reward:.2f}
Success Rate: {success_rate:.1f}%"""
plt.xlim(0.5, 2.5)
plt.text(1.4, median_reward, stats_text, fontsize=10, verticalalignment='center')

# ---- Subplot 5: Reward vs Episode Length Scatter ----
plt.subplot(2, 3, 5)
plt.scatter(lengths_arr, rewards_arr, alpha=0.6, color='purple', s=50)
plt.xlabel('Episode Length')
plt.ylabel('Episode Reward')
plt.title('Reward vs Episode Length')
plt.grid(True, alpha=0.3)
# Add correlation coefficient. Pearson correlation is undefined for fewer than
# two episodes or when either series is constant (e.g. every episode has the
# same length); show "n/a" instead of a NaN in that case.
if len(rewards_arr) > 1 and np.std(lengths_arr) > 0 and np.std(rewards_arr) > 0:
    correlation = np.corrcoef(lengths_arr, rewards_arr)[0, 1]
    correlation_label = f'Correlation: {correlation:.3f}'
else:
    correlation = float('nan')
    correlation_label = 'Correlation: n/a'
plt.text(0.05, 0.95, correlation_label, transform=plt.gca().transAxes,
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# ---- Subplot 6: Performance Metrics Summary ----
plt.subplot(2, 3, 6)
metrics_names = ['Mean\nReward', 'Std\nReward', 'Min\nReward', 'Max\nReward', 'Mean\nLength']
metrics_values = [mean_reward, std_reward, min_reward, max_reward, mean_length]
colors = ['blue', 'orange', 'red', 'green', 'purple']

bars = plt.bar(metrics_names, metrics_values, color=colors, alpha=0.7)
plt.title('Performance Metrics Summary')
plt.ylabel('Value')
plt.xticks(rotation=45)

# Add value labels on bars. Negative bars extend downwards, so anchor their
# label below the bar end rather than inside the bar.
for bar, value in zip(bars, metrics_values):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
             f'{value:.1f}', ha='center',
             va='bottom' if height >= 0 else 'top', fontweight='bold')

# Draw the figure title first and reserve the top strip for it; calling
# tight_layout() before suptitle lets the title overlap the subplot titles.
plt.suptitle(f'PPO Training Results - {ENV_NAME} Environment', fontsize=16, y=0.98)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

print("✅ Visualizations complete!")

# ============================================================================
# SUMMARY REPORT
# ============================================================================
# Prints the run configuration next to the headline evaluation numbers, then
# closes both environments.

separator = "=" * 80

print("\n" + separator)
print("🎯 FINAL TRAINING AND EVALUATION SUMMARY")
print(separator)

# Run configuration
print(f"Environment: {ENV_NAME}")
print(f"Training Steps: {TOTAL_TIMESTEPS:,}")
print(f"Parallel Environments: {NUM_ENVS}")
print(f"VecNormalize Used: {USE_VECNORMALIZE}")
print(f"Model Path: {MODEL_SAVE_PATH}")

# Evaluation results
print(f"\n📊 Evaluation Results ({N_EVAL_EPISODES} episodes):")
print(f"   Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")

best_episode = np.max(episode_rewards)
print(f"   Best Episode: {best_episode:.2f}")

worst_episode = np.min(episode_rewards)
print(f"   Worst Episode: {worst_episode:.2f}")

# "Success" here means an episode return strictly greater than zero.
positive_episodes = np.sum(np.array(episode_rewards) > 0)
success_percentage = positive_episodes / len(episode_rewards) * 100
print(f"   Success Rate (>0): {success_percentage:.1f}%")

average_length = np.mean(episode_lengths)
print(f"   Mean Episode Length: {average_length:.1f}")

evaluation_seconds = eval_results['eval_time']
print(f"   Evaluation Time: {evaluation_seconds:.2f} seconds")
print(separator)

# Release the training environment first, then the evaluation environment.
for open_env in (env, eval_env):
    open_env.close()
print("🧹 Environments closed. Notebook complete!")