# PPO Multi-Seed Training

---

In [None]:
import os, sys, random

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor

import imageio
from IPython.display import Image, display

In [None]:
# Global Configuration
# Every run-wide setting is defined in this cell; later cells only read these constants.

# One independent training run is performed for each seed in this list.
SEED_LIST = [42, 123, 3407]

# The class builds/loads models; the short name is used in paths, file names and plot titles.
ALGORITHM_CLASS = PPO
SELECTED_ALGORITHM = "ppo"

# Environment id, policy id and the wind switch passed through to gym.make.
GYMNASIUM_MODEL = "LunarLander-v3"
MLP_POLICY = "MlpPolicy"
WIND_ENABLED = False

# Training budget per seed and number of evaluation episodes per seed.
TOTAL_TIMESTEPS = 1_000_000
EVALUATION_EPISODES = 20

DEVICE = "cpu"

# NOTE: "__file__" is a quoted string literal here, not the module attribute, so
# abspath() resolves it against the current working directory and dirname()
# then yields that directory.
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
OUTPUT_DIR = os.path.join(NOTEBOOK_DIR, "outputs_" + SELECTED_ALGORITHM)
TENSORBOARD_LOGS_DIR = os.path.join(OUTPUT_DIR, "tensorboard")
MODELS_DIR = os.path.join(NOTEBOOK_DIR, "../../../models", SELECTED_ALGORITHM)

# Create the output and model directories up front (no error if they already exist).
for _required_dir in (OUTPUT_DIR, MODELS_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Echo the effective configuration so it is captured in the notebook output.
for _config_line in (
    f"Algorithm: {SELECTED_ALGORITHM.upper()}",
    f"Seeds: {SEED_LIST}",
    f"Wind enabled: {WIND_ENABLED}",
    f"Total timesteps per seed: {TOTAL_TIMESTEPS:,}",
    f"Evaluation episodes per seed: {EVALUATION_EPISODES}",
    f"Device: {DEVICE}",
    f"Models directory: {MODELS_DIR}",
):
    print(_config_line)

In [None]:
# Report interpreter / library versions and the configured device alongside the results.
python_version = sys.version.split()[0]
# The literal string "None" is printed when CUDA is not available.
cuda_version = torch.version.cuda if torch.cuda.is_available() else "None"

print("Python:", python_version)
print("PyTorch:", torch.__version__)
print("Device:", DEVICE)
print("CUDA:", cuda_version)

In [None]:
# Environment inspection (run once, not per seed)
# Builds a throwaway environment only to print its spaces and one initial observation.
env_tmp = gym.make(GYMNASIUM_MODEL)

observation_space, action_space = env_tmp.observation_space, env_tmp.action_space
print("Observation space:", observation_space)
print("Action space:", action_space)

# reset() returns (observation, info); only the observation is shown.
obs, info = env_tmp.reset()
print("Initial observation:", obs)

# The environment is not needed after this cell.
env_tmp.close()

In [None]:
class PPOLoggingCallback(BaseCallback):
    """Record per-episode statistics and per-rollout losses while a model trains.

    Episode returns and lengths are accumulated by hand from the ``rewards``
    and ``dones`` entries of ``self.locals``, with one running total per
    sub-environment. Loss values are copied out of the model's logger at the
    end of every rollout. Every 10 finished episodes a one-line text summary
    is refreshed in the notebook output, and every 50 finished episodes a
    reward / value-loss figure is refreshed.

    Public attributes (all plain lists, appended to during training):
        episode_rewards: total reward of each finished episode.
        episode_lengths: number of steps of each finished episode.
        policy_loss: policy loss read from the logger, one entry per rollout
            for which the logger held a value.
        value_loss: value loss read from the logger, same cadence.
        entropy: negated ``train/entropy_loss`` from the logger, same cadence.
    """

    # Logger keys tried, in order, for the policy loss. PPO records its policy
    # loss as "train/policy_gradient_loss"; the original "train/policy_loss"
    # key is still tried first so algorithms that use it keep working.
    _POLICY_LOSS_KEYS = ("train/policy_loss", "train/policy_gradient_loss")

    def __init__(self, verbose: int = 0):
        super().__init__(verbose)
        # Finished-episode history, in completion order.
        self.episode_rewards = []
        self.episode_lengths = []
        # Per-rollout loss history read from the model logger.
        self.policy_loss = []
        self.value_loss = []
        self.entropy = []

        # Running totals for the episode in progress in each sub-environment;
        # resized in _on_training_start once the number of envs is known.
        self._current_rewards: np.ndarray = np.array([])
        self._current_lengths: np.ndarray = np.array([])
        # IPython display handles: created on first use, updated in place afterwards.
        self._plot_handle = None
        self._stats_handle = None

    def _on_training_start(self) -> None:
        """Allocate one reward/length accumulator per sub-environment."""
        n_envs = self.training_env.num_envs
        self._current_rewards = np.zeros(n_envs, dtype=np.float32)
        self._current_lengths = np.zeros(n_envs, dtype=np.int32)

    def _on_step(self) -> bool:
        """Accumulate the step's rewards and close out any episode that ended.

        Returns:
            Always ``True``; this callback never asks for training to stop.
        """
        rewards = self.locals.get("rewards")
        dones = self.locals.get("dones")
        if rewards is None or dones is None:
            return True

        self._current_rewards += rewards
        self._current_lengths += 1

        for i, done in enumerate(dones):
            if not done:
                continue

            self.episode_rewards.append(float(self._current_rewards[i]))
            self.episode_lengths.append(int(self._current_lengths[i]))
            ep = len(self.episode_rewards)

            if ep % 10 == 0:
                self._refresh_stats(ep)
            if ep % 50 == 0:
                self._refresh_plot(ep)

            # Start a fresh accumulator for the next episode in this sub-environment.
            self._current_rewards[i] = 0
            self._current_lengths[i] = 0
        return True

    def _refresh_stats(self, ep: int) -> None:
        """Show or update the text summary over the most recent (up to 50) episodes."""
        recent = np.array(self.episode_rewards[-50:])
        stats_text = (
            f'Episode {ep} | Last {len(recent)} Ep \u2014 '
            f'Mean: {np.mean(recent):.1f} | Std: {np.std(recent):.1f} | '
            f'Min: {np.min(recent):.1f} | Max: {np.max(recent):.1f} | '
            f'Success: {(recent >= 200).sum() / len(recent) * 100:.0f}%'
        )
        if self._stats_handle is None:
            self._stats_handle = display(stats_text, display_id=True)
        else:
            self._stats_handle.update(stats_text)

    def _refresh_plot(self, ep: int) -> None:
        """Show or update the reward curve (left) and value-loss curve (right)."""
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))

        # Raw episode rewards in the background, rolling mean on top.
        ax1.plot(self.episode_rewards, alpha=0.3, color='gray')
        window = min(50, len(self.episode_rewards))
        rolling = pd.Series(self.episode_rewards).rolling(window).mean()
        ax1.plot(rolling, color='blue', linewidth=2)
        ax1.axhline(y=200, color='red', linestyle='--')
        ax1.set_title(f'Episode Reward \u2014 Ep {ep}')
        ax1.set_xlabel('Episode')
        ax1.set_ylabel('Reward')
        ax1.grid(True, alpha=0.3)

        # The right panel stays empty until at least one value loss was recorded.
        if self.value_loss:
            ax2.plot(self.value_loss, color='green', alpha=0.7)
            ax2.set_title('Value Loss')
            ax2.set_xlabel('Rollout')
            ax2.set_ylabel('Loss')
            ax2.grid(True, alpha=0.3)

        plt.tight_layout()

        if self._plot_handle is None:
            self._plot_handle = display(fig, display_id=True)
        else:
            self._plot_handle.update(fig)
        # Close the figure so refreshed plots do not pile up in pyplot's figure registry.
        plt.close(fig)

    def _on_rollout_end(self) -> None:
        """Copy the loss values currently held by the model logger into the history lists.

        NOTE(review): the logger holds whatever was last recorded, which is
        presumably the optimisation phase that preceded this rollout; confirm
        against the library's learn loop if exact alignment matters.
        """
        logger_data = self.model.logger.name_to_value
        # Record at most one policy-loss value per rollout, from the first key present.
        for key in self._POLICY_LOSS_KEYS:
            if key in logger_data:
                self.policy_loss.append(logger_data[key])
                break
        if "train/value_loss" in logger_data:
            self.value_loss.append(logger_data["train/value_loss"])
        if "train/entropy_loss" in logger_data:
            # Stored negated so the recorded series is the entropy itself.
            self.entropy.append(-logger_data["train/entropy_loss"])

In [None]:
def set_all_seeds(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, the CUDA generators are seeded as well and cuDNN
    is switched to deterministic mode with benchmarking turned off.

    Args:
        seed: Value handed to every seeding function.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Training loop over all seeds
# One independent model is trained and saved per seed; the callback holding that
# run's logged curves is kept in ``training_results`` for the plotting cells.

training_results = {}

for seed in SEED_LIST:
    print(f"\n{'='*60}")
    print(f"Training with seed {seed}")
    print(f"{'='*60}\n")

    set_all_seeds(seed)

    def make_env(s=seed):
        # ``s=seed`` binds the current seed at definition time (avoids late binding).
        # Monitor records per-episode return/length so the library's own
        # rollout/ep_rew_mean and rollout/ep_len_mean statistics get logged.
        env = Monitor(
            gym.make(GYMNASIUM_MODEL, render_mode="rgb_array", enable_wind=WIND_ENABLED)
        )
        env.reset(seed=s)
        return env

    env = DummyVecEnv([make_env])
    try:
        env.seed(seed)

        params = {
            "policy": MLP_POLICY,
            "env": env,
            "learning_rate": 2.5e-4,
            "n_steps": 2048,
            "batch_size": 64,
            "n_epochs": 10,
            "gamma": 0.999,
            "gae_lambda": 0.95,
            "ent_coef": 0.01,
            "clip_range": 0.2,
            "device": DEVICE,
            "seed": seed,
            "tensorboard_log": TENSORBOARD_LOGS_DIR,
        }

        callback = PPOLoggingCallback()
        model = ALGORITHM_CLASS(**params)
        model.learn(
            total_timesteps=TOTAL_TIMESTEPS,
            callback=callback,
            progress_bar=True,
            # Name the TensorBoard run after the seed so runs can be told apart.
            tb_log_name=f"{SELECTED_ALGORITHM}_seed{seed}",
        )

        save_path = os.path.join(MODELS_DIR, f"lab005_{SELECTED_ALGORITHM}_{seed}")
        model.save(save_path)
        print(f"Model saved to: {save_path}")

        training_results[seed] = callback
    finally:
        # Release the environment even if training or saving fails.
        env.close()

print(f"\nAll {len(SEED_LIST)} models trained and saved.")

In [None]:
# Per-Seed: Episode Reward over Training
# One panel per seed on a shared y-axis so reward scales are directly comparable.

n_panels = len(SEED_LIST)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), sharey=True, squeeze=False)
# squeeze=False always returns a 2-D grid of Axes; keep its single row.
axes = axes[0]

for ax, seed in zip(axes, SEED_LIST):
    reward_history = training_results[seed].episode_rewards
    ax.plot(reward_history, alpha=0.7)
    ax.set(title=f"Seed {seed}", xlabel="Episode")
    ax.grid(True, alpha=0.3)

# The y-label is only needed once because the axis is shared.
axes[0].set_ylabel("Total Reward")
fig.suptitle("Episode Reward over Training", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Per-Seed: Episode Length over Training
# One panel per seed on a shared y-axis showing how many steps each episode lasted.

n_panels = len(SEED_LIST)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), sharey=True, squeeze=False)
# squeeze=False always returns a 2-D grid of Axes; keep its single row.
axes = axes[0]

for ax, seed in zip(axes, SEED_LIST):
    length_history = training_results[seed].episode_lengths
    ax.plot(length_history, alpha=0.7, color="orange")
    ax.set(title=f"Seed {seed}", xlabel="Episode")
    ax.grid(True, alpha=0.3)

# The y-label is only needed once because the axis is shared.
axes[0].set_ylabel("Number of Steps")
fig.suptitle("Episode Length over Training", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Per-Seed: Value Loss over Rollouts
# One panel per seed on a shared y-axis; the x-axis counts recorded rollouts.

n_panels = len(SEED_LIST)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), sharey=True, squeeze=False)
# squeeze=False always returns a 2-D grid of Axes; keep its single row.
axes = axes[0]

for ax, seed in zip(axes, SEED_LIST):
    value_loss_history = training_results[seed].value_loss
    ax.plot(value_loss_history, alpha=0.7, color="green")
    ax.set(title=f"Seed {seed}", xlabel="Rollout")
    ax.grid(True, alpha=0.3)

# The y-label is only needed once because the axis is shared.
axes[0].set_ylabel("Loss Value")
fig.suptitle("Value Loss over Rollouts", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Per-Seed: Entropy over Rollouts
# One panel per seed on a shared y-axis; values are the callback's recorded entropy series.

n_panels = len(SEED_LIST)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), sharey=True, squeeze=False)
# squeeze=False always returns a 2-D grid of Axes; keep its single row.
axes = axes[0]

for ax, seed in zip(axes, SEED_LIST):
    entropy_history = training_results[seed].entropy
    ax.plot(entropy_history, alpha=0.7, color="purple")
    ax.set(title=f"Seed {seed}", xlabel="Rollout")
    ax.grid(True, alpha=0.3)

# The y-label is only needed once because the axis is shared.
axes[0].set_ylabel("Entropy (Positive)")
fig.suptitle("Entropy over Rollouts", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Aggregated: Rolling Reward Overlay (all seeds)
# Overlays each seed's 50-episode rolling mean reward on a single axis.

colors = plt.cm.tab10.colors

plt.figure(figsize=(14, 6))
for i, seed in enumerate(SEED_LIST):
    rewards = training_results[seed].episode_rewards
    # The first 49 points of each curve are NaN (window not yet full) and are not drawn.
    rolling = pd.Series(rewards).rolling(50).mean()
    # Wrap around the palette so having more seeds than colours cannot raise IndexError.
    plt.plot(rolling, color=colors[i % len(colors)], linewidth=2, label=f"Seed {seed}")

plt.axhline(y=200, color='red', linestyle='--', label='Solved Threshold (200)')
plt.title(f"{SELECTED_ALGORITHM.upper()} Training: Rolling Mean Reward (window=50)", fontsize=14)
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Evaluation: EVALUATION_EPISODES deterministic episodes per seed
# Each saved model is reloaded from disk and evaluated on a freshly seeded environment;
# the per-episode rewards are stored in ``evaluation_results`` keyed by seed.

evaluation_results = {}

for seed in SEED_LIST:
    print(f"Evaluating model for seed {seed}...")

    set_all_seeds(seed)

    load_path = os.path.join(MODELS_DIR, f"lab005_{SELECTED_ALGORITHM}_{seed}")

    # No environment is attached at load time: evaluate_policy runs on the env passed
    # to it, so building a second vectorised env here would only leak an unclosed env.
    eval_model = ALGORITHM_CLASS.load(load_path, device=DEVICE)

    eval_env = Monitor(gym.make(GYMNASIUM_MODEL, enable_wind=WIND_ENABLED))
    try:
        eval_env.reset(seed=seed)

        # return_episode_rewards=True yields (per-episode rewards, per-episode lengths).
        rewards, _ = evaluate_policy(
            eval_model,
            eval_env,
            n_eval_episodes=EVALUATION_EPISODES,
            deterministic=True,
            return_episode_rewards=True
        )
    finally:
        # Release the environment even if evaluation fails.
        eval_env.close()

    evaluation_results[seed] = np.array(rewards)

print(f"\nEvaluation complete for all {len(SEED_LIST)} seeds.")

In [None]:
# Evaluation Summary Table
# One row per seed plus a pooled "Overall" row; all statistics are pre-formatted as text.


def _summary_row(label, reward_array):
    """Return one table row of reward statistics for ``reward_array``."""
    success_pct = (reward_array >= 200).sum() / len(reward_array) * 100
    return {
        "Seed": label,
        "Mean Reward": f"{np.mean(reward_array):.2f}",
        "Std Dev": f"{np.std(reward_array):.2f}",
        "Min Reward": f"{np.min(reward_array):.2f}",
        "Max Reward": f"{np.max(reward_array):.2f}",
        "Success Rate": f"{success_pct:.1f}%",
    }


rows = [_summary_row(seed, evaluation_results[seed]) for seed in SEED_LIST]

# Pool every evaluated episode across seeds for the final row.
all_rewards = np.concatenate(list(evaluation_results.values()))
rows.append(_summary_row("Overall", all_rewards))

df_summary = pd.DataFrame(rows)
print(f"*** {SELECTED_ALGORITHM.upper()} MULTI-SEED EVALUATION SUMMARY ***")
print(f"Episodes per seed: {EVALUATION_EPISODES}")
print(f"Total episodes: {len(all_rewards)}")
print()
print(df_summary.to_string(index=False))

In [None]:
# Per-Seed: Evaluation Convergence Plots
# Per seed: individual episode rewards as points plus the running mean over episodes so far.

n_panels = len(SEED_LIST)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), sharey=True, squeeze=False)
# squeeze=False always returns a 2-D grid of Axes; keep its single row.
axes = axes[0]

for ax, seed in zip(axes, SEED_LIST):
    rewards = evaluation_results[seed]
    episodes = np.arange(1, len(rewards) + 1)
    # Cumulative sum divided by the episode count gives the mean of episodes 1..k.
    running_mean = np.cumsum(rewards) / episodes

    ax.scatter(episodes, rewards, color='gray', alpha=0.4, s=20)
    ax.plot(episodes, running_mean, color='blue', linewidth=2)
    ax.axhline(y=200, color='red', linestyle='--')
    ax.set(title=f"Seed {seed}", xlabel="Episode")
    ax.grid(True, alpha=0.3)

# The y-label is only needed once because the axis is shared.
axes[0].set_ylabel("Reward")
fig.suptitle(f"{SELECTED_ALGORITHM.upper()} Evaluation: {EVALUATION_EPISODES} Episodes per Seed", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Aggregated: Evaluation Bar Chart (mean reward per seed with error bars)
# This cell derives everything it needs from ``evaluation_results`` so it also works
# when only the evaluation cells were run (e.g. against previously saved models).

means = [np.mean(evaluation_results[s]) for s in SEED_LIST]
stds = [np.std(evaluation_results[s]) for s in SEED_LIST]
labels = [str(s) for s in SEED_LIST]

# Same palette as the training overlay plot, taken directly from matplotlib.
bar_colors = plt.cm.tab10.colors[:len(SEED_LIST)]
# Mean over every evaluated episode pooled across seeds.
overall_mean = np.mean(np.concatenate(list(evaluation_results.values())))

plt.figure(figsize=(max(8, 3 * len(SEED_LIST)), 6))
plt.bar(labels, means, yerr=stds, capsize=5, color=bar_colors, alpha=0.8)
plt.axhline(y=200, color='red', linestyle='--', label='Solved Threshold (200)')
plt.axhline(y=overall_mean, color='blue', linestyle='-', linewidth=2,
            label=f'Overall Mean ({overall_mean:.1f})')

plt.title(f"{SELECTED_ALGORITHM.upper()} Mean Reward per Seed ({EVALUATION_EPISODES} episodes each)", fontsize=14)
plt.xlabel("Seed")
plt.ylabel("Mean Reward")
plt.legend()
plt.grid(True, alpha=0.3, axis='y')
plt.show()

In [None]:
# GIF Visualization (one per seed)
# Each saved model plays one deterministic episode; the rendered frames are written
# to a GIF in OUTPUT_DIR and shown inline.

for seed in SEED_LIST:
    print(f"Generating GIF for seed {seed}...")

    load_path = os.path.join(MODELS_DIR, f"lab005_{SELECTED_ALGORITHM}_{seed}")

    # No environment is attached at load time: predict() only needs observations, so
    # building a vectorised env for load() would only leak an env that is never closed.
    vis_model = ALGORITHM_CLASS.load(load_path, device=DEVICE)

    vis_env = gym.make(GYMNASIUM_MODEL, render_mode="rgb_array", enable_wind=WIND_ENABLED)
    frames = []
    try:
        obs, info = vis_env.reset(seed=seed)
        done = False

        while not done:
            action, _ = vis_model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = vis_env.step(action)
            # The episode ends on either a terminal state or a truncation.
            done = terminated or truncated
            frames.append(vis_env.render())
    finally:
        # Release the environment even if the rollout fails.
        vis_env.close()

    gif_path = os.path.join(OUTPUT_DIR, f"{SELECTED_ALGORITHM}_seed{seed}.gif")
    imageio.mimsave(gif_path, frames, fps=30)
    print(f"  Saved: {gif_path}")
    display(Image(filename=gif_path))