# RL Training Examples

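Simple examples of training RL algorithms with Ray RLlib. Each example builds an algorithm on the `CartPole-v1` environment, runs a single training iteration, and prints the mean episode return.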

**Note**: For comprehensive hyperparameter documentation, see `00-rl_guide.ipynb`.


## PPO Example


In [None]:
from ray.rllib.algorithms.ppo import PPOConfig

# Configure PPO on CartPole-v1 with 4 parallel env runners.
# See `00-rl_guide.ipynb` for what each hyperparameter controls.
config = (
    PPOConfig()
    .environment("CartPole-v1")
    .training(
        gamma=0.99,
        lr=5e-5,
        train_batch_size=4000,
    )
    .env_runners(num_env_runners=4)
)

# Build the algorithm and run a single training iteration.
algo = config.build_algo()
try:
    result = algo.train()
    print(f"PPO - Reward: {result['env_runners']['episode_return_mean']:.2f}")
finally:
    # Release this algorithm's env runners even if training fails. The IMPALA
    # example rebinds `algo`, so without this the PPO workers would stay alive
    # with no handle left to stop them and keep holding their resources.
    algo.stop()

{'timers': {'training_iteration': 15.783703999999943,
  'restore_env_runners': 2.6000000161729986e-05,
  'training_step': 15.78335369999968,
  'env_runner_sampling_timer': 2.3437175000003663,
  'learner_update_timer': 13.43641370000023,
  'synch_weights': 0.00224469999966459},
 'env_runners': {'num_episodes_lifetime': 194.0,
  'episode_return_mean': 20.08,
  'env_to_module_sum_episodes_length_in': np.float64(14.151481870612088),
  'env_to_module_connector': {'timers': {'connectors': {'add_time_dim_to_batch_and_zero_pad': np.float64(2.967564331265048e-06),
     'numpy_to_tensor': np.float64(4.383598640125949e-05),
     'add_observations_from_episodes_to_batch': np.float64(1.0201550578910416e-05),
     'add_states_from_episodes_to_batch': np.float64(2.3353606115337337e-06),
     'batch_individual_items': np.float64(1.9242406709198627e-05)}},
   'connector_pipeline_timer': np.float64(0.00013211424259001092)},
  'sample': np.float64(2.3064586499999677),
  'num_agent_steps_sampled': {'defau

## IMPALA Example (Windows-compatible)


In [9]:
from ray.rllib.algorithms.impala import IMPALAConfig

# Hyperparameters passed to IMPALA's `.training()` step, kept together so they
# are easy to tweak.
impala_training = dict(
    gamma=0.99,
    lr=0.0005,
    train_batch_size=512,
)

# Assemble the IMPALA config one step at a time (local mode for Windows
# compatibility).
config = IMPALAConfig()
config = config.environment("CartPole-v1")
config = config.learners(num_learners=0)  # Local mode (no distributed learners)
config = config.env_runners(num_env_runners=4)
config = config.training(**impala_training)

# Build the algorithm, run one training iteration and report the result.
algo = config.build_algo()
result = algo.train()
print(f"IMPALA - Reward: {result['env_runners']['episode_return_mean']:.2f}")

IMPALA - Reward: 27.44


In [8]:
# Train for multiple iterations (optional)
# Uncomment the loop below to run 10 more training iterations on whichever
# `algo` is currently bound (the IMPALA one when the notebook is run top to
# bottom). Each pass overwrites `result` and prints that iteration's mean
# episode return.
# for i in range(10):
#     result = algo.train()
#     print(f"Iter {i+1}: Reward = {result['env_runners']['episode_return_mean']:.2f}")
