# RL Training Examples

Simple examples of training RL algorithms with Ray RLlib.

**Note**: For comprehensive hyperparameter documentation, see `00-rl_guide.ipynb`


## PPO Example


In [None]:
from ray.rllib.algorithms.ppo import PPOConfig

# Configure PPO
config = (
    PPOConfig()
    .environment("CartPole-v1")
    .training(
        gamma=0.99,
        lr=5e-5,
        train_batch_size=4000,
    )
    .env_runners(num_env_runners=4)
)

# Build and train
algo = config.build_algo()
result = algo.train()
print(f"PPO - Reward: {result['env_runners']['episode_return_mean']:.2f}")

{'timers': {'training_iteration': 15.783703999999943,
  'restore_env_runners': 2.6000000161729986e-05,
  'training_step': 15.78335369999968,
  'env_runner_sampling_timer': 2.3437175000003663,
  'learner_update_timer': 13.43641370000023,
  'synch_weights': 0.00224469999966459},
 'env_runners': {'num_episodes_lifetime': 194.0,
  'episode_return_mean': 20.08,
  'env_to_module_sum_episodes_length_in': np.float64(14.151481870612088),
  'env_to_module_connector': {'timers': {'connectors': {'add_time_dim_to_batch_and_zero_pad': np.float64(2.967564331265048e-06),
     'numpy_to_tensor': np.float64(4.383598640125949e-05),
     'add_observations_from_episodes_to_batch': np.float64(1.0201550578910416e-05),
     'add_states_from_episodes_to_batch': np.float64(2.3353606115337337e-06),
     'batch_individual_items': np.float64(1.9242406709198627e-05)}},
   'connector_pipeline_timer': np.float64(0.00013211424259001092)},
  'sample': np.float64(2.3064586499999677),
  'num_agent_steps_sampled': {'defau

## IMPALA Example (Windows-compatible)

In [1]:
from ray.rllib.algorithms.impala import IMPALAConfig
from tqdm import tqdm

# Configure IMPALA (local mode for Windows compatibility)
config = (
    IMPALAConfig()
    .environment("CartPole-v1")
    .learners(num_learners=0)  # Local mode (no distributed learners)
    .env_runners(num_env_runners=4)
    .training(
        gamma=0.99,
        lr=0.0005,
        train_batch_size=512,
    )
)

# Build and train for multiple iterations
algo = config.build_algo()
print("Training IMPALA on CartPole-v1...")

for i in tqdm(range(10), desc="Training", unit="iter"):
    result = algo.train()
    episodes = result['env_runners']['num_episodes']
    reward = result['env_runners']['episode_return_mean']
    length = result['env_runners']['episode_len_mean']
    tqdm.write(f"Iter {i+1:2d} | Episodes: {episodes:6.0f} | Reward: {reward:6.2f} | Length: {length:6.2f}")

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2025-10-19 22:47:59,846	INFO worker.py:2013 -- Started a local Ray instance.
2025-10-19 22:49:28,358	INFO trainable.py:161 -- Trainable.setup took 96.322 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Training IMPALA on CartPole-v1...


Training:  10%|█         | 1/10 [00:10<01:30, 10.08s/iter]

Iter  1 | Episodes:    504 | Reward:  29.76 | Length:  29.76


Training:  20%|██        | 2/10 [00:20<01:20, 10.09s/iter]

Iter  2 | Episodes:    370 | Reward:  36.36 | Length:  36.36


Training:  30%|███       | 3/10 [00:30<01:10, 10.09s/iter]

Iter  3 | Episodes:    350 | Reward:  43.68 | Length:  43.68


Training:  40%|████      | 4/10 [00:40<01:00, 10.09s/iter]

Iter  4 | Episodes:    350 | Reward:  40.00 | Length:  40.00


Training:  50%|█████     | 5/10 [00:50<00:50, 10.11s/iter]

Iter  5 | Episodes:    372 | Reward:  41.84 | Length:  41.84


Training:  60%|██████    | 6/10 [01:00<00:40, 10.12s/iter]

Iter  6 | Episodes:    310 | Reward:  41.36 | Length:  41.36


Training:  70%|███████   | 7/10 [01:10<00:30, 10.12s/iter]

Iter  7 | Episodes:    180 | Reward:  31.04 | Length:  31.04


Training:  80%|████████  | 8/10 [01:20<00:20, 10.11s/iter]

Iter  8 | Episodes:    284 | Reward:  51.64 | Length:  51.64


Training:  90%|█████████ | 9/10 [01:30<00:10, 10.11s/iter]

Iter  9 | Episodes:    318 | Reward:  67.52 | Length:  67.52


Training: 100%|██████████| 10/10 [01:41<00:00, 10.11s/iter]

Iter 10 | Episodes:    281 | Reward:  48.44 | Length:  48.44





In [18]:
# Save the trained model (Best Practices)
from pathlib import Path
from datetime import datetime
import json

# Define checkpoint directory with timestamp
repo_root = Path(__file__).parent.parent if '__file__' in locals() else Path.cwd().parent
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
checkpoint_base = repo_root / "checkpoints" / "impala_cartpole"
checkpoint_dir = checkpoint_base / timestamp
checkpoint_dir.mkdir(parents=True, exist_ok=True)

# Save training metadata
metadata = {
    "algorithm": "IMPALA",
    "environment": "CartPole-v1",
    "timestamp": timestamp,
    "training_iterations": 10,
    "final_reward_mean": float(result['env_runners']['episode_return_mean']),
    "final_episode_length": float(result['env_runners']['episode_len_mean']),
    "final_episodes": float(result['env_runners']['num_episodes']),
    "hyperparameters": {
        "gamma": 0.99,
        "lr": 0.0005,
        "train_batch_size": 512,
        "num_env_runners": 4,
        "num_learners": 0
    }
}

metadata_path = checkpoint_dir / "training_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, indent=2, fp=f)

# Save the model checkpoint
try:
    saved_path = algo.save(checkpoint_dir=str(checkpoint_dir))
    print(f"✓ Model saved successfully to: {saved_path}")
    print(f"✓ Metadata saved to: {metadata_path}")
    print(f"\nFinal Performance:")
    print(f"  - Mean Reward: {metadata['final_reward_mean']:.2f}")
    print(f"  - Mean Episode Length: {metadata['final_episode_length']:.2f}")
except Exception as e:
    print(f"✗ Error saving model: {e}")


✓ Model saved successfully to: TrainingResult(checkpoint=Checkpoint(filesystem=local, path=c:\Users\Maverick\Documents\GitHub\reinforcement\checkpoints\impala_cartpole\20251018_221334), metrics={'timers': {'training_iteration': 10.007739852955527, 'restore_env_runners': 2.6509628412545033e-05, 'training_step': 0.005670142667001269, 'sample': 0.0039115120421240055, 'learner_update_timer': 0.0013640342305409612, 'synch_weights': 0.004529510631270928}, 'env_runners': {'mean_num_episode_lists_received': 0.1516576123899905, 'num_episodes_lifetime': 4031.999999999998, 'episode_return_mean': 43.64, 'env_to_module_sum_episodes_length_in': 16.908273096856036, 'env_to_module_connector': {'timers': {'connectors': {'add_time_dim_to_batch_and_zero_pad': 1.0475611641219527e-05, 'numpy_to_tensor': 0.00014623494969214157, 'add_observations_from_episodes_to_batch': 2.7573970838038283e-05, 'add_states_from_episodes_to_batch': 5.1542618762668485e-06, 'batch_individual_items': 6.217203271734908e-05}}, 'co