# LunarLander-v3 Project using DQN and PPO

# 1. Libraries

In [1]:
# ====== Libraries  ======
import os
import random
import numpy as np
import random

import gymnasium as gym

# Plots
import matplotlib.pyplot as plt
%matplotlib inline

# PPO 
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor

import json
from stable_baselines3.common.utils import set_random_seed


from collections import deque, namedtuple
import random
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

from IPython.display import Image, display
from itertools import product


# 2. Create LunarLander-v3 environment

In [2]:
# Environment id and the folder that collects every artifact written by this notebook.
ENV_ID = "LunarLander-v3"
OUT_DIR = "runs_lunarlander_ppo"
os.makedirs(OUT_DIR, exist_ok=True)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print("Device:", DEVICE)
print("Gymnasium:", gym.__version__)
print("Gymnasium:", gym.__version__)

Device: cpu
Gymnasium: 1.2.3


In [3]:
# Sanity check: build one environment, reset it with a fixed seed and print
# its spaces together with the first observation.
env = gym.make(ENV_ID)
obs, _ = env.reset(seed=42)
for label, value in (
    ("Env:", ENV_ID),
    ("Obs space:", env.observation_space),
    ("Action space:", env.action_space),
    ("Obs shape:", obs.shape),
    ("Example obs:", obs),
):
    print(label, value)
env.close()

Env: LunarLander-v3
Obs space: Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Action space: Discrete(4)
Obs shape: (8,)
Example obs: [ 0.00229702  1.4181306   0.2326471   0.3204666  -0.00265488 -0.05269805
  0.          0.        ]


In [None]:
# IMPORTANT: set this equal to DQN's total env steps for a fair comparison
# Environment-step budget of one training run; used as the default
# `total_steps` of run_ppo and mc_tune_ppo.
TOTAL_STEPS = 1_050_000   
# Number of evaluation episodes per trained model (default `eval_episodes` of run_ppo).
EVAL_EPISODES = 20
# Seeds iterated by the final-training and wind stress-test cells.
FINAL_SEEDS = [42, 123, 999]

# 3. ENV FACTORY (train env, eval env, wind)

In [5]:
def make_env(seed: int, enable_wind: bool = False, wind_power: float = 15.0, turbulence_power: float = 1.5):
    """Return a zero-argument factory that builds one seeded, monitored environment.

    Args:
        seed: Seed passed to the first ``reset`` and to the action space.
        enable_wind: Forwarded to ``gym.make`` as ``enable_wind``.
        wind_power: Forwarded to ``gym.make`` as ``wind_power``.
        turbulence_power: Forwarded to ``gym.make`` as ``turbulence_power``.

    Returns:
        A callable without parameters. Each call creates a fresh ``ENV_ID``
        environment with discrete actions, wraps it in ``Monitor``, resets
        it with ``seed`` and seeds its action space.
    """
    lander_kwargs = {
        "continuous": False,
        "enable_wind": enable_wind,
        "wind_power": wind_power,
        "turbulence_power": turbulence_power,
    }

    def _build():
        # Monitor logs episode return/length.
        monitored = Monitor(gym.make(ENV_ID, **lander_kwargs))
        monitored.reset(seed=seed)
        monitored.action_space.seed(seed)
        return monitored

    return _build

def make_vec_env(seed: int, enable_wind: bool = False, wind_power: float = 15.0, turbulence_power: float = 1.5):
    """Build a single-environment ``DummyVecEnv`` around ``make_env``.

    The environment produced by ``make_env`` is already wrapped in
    ``Monitor``, which records the episode statistics. Adding ``VecMonitor``
    on top would duplicate that bookkeeping (Stable-Baselines3 warns that the
    ``Monitor`` statistics get overwritten), so no second monitor is applied.

    Args:
        seed: Seed forwarded to ``make_env``.
        enable_wind: Whether the environment is created with wind enabled.
        wind_power: Wind strength forwarded to ``make_env``.
        turbulence_power: Turbulence strength forwarded to ``make_env``.

    Returns:
        A ``DummyVecEnv`` holding exactly one environment.
    """
    env_factory = make_env(
        seed,
        enable_wind=enable_wind,
        wind_power=wind_power,
        turbulence_power=turbulence_power,
    )
    return DummyVecEnv([env_factory])


# 4. PPO CFG SAMPLER (Monte Carlo configs)

In [6]:

#  includes exploration/entropy via ent_coef
#  includes net architecture via policy_kwargs

def sample_ppo_cfg(rng: random.Random):
    """Draw one random PPO hyper-parameter configuration.

    Every value is picked with ``rng.choice`` from a fixed list of
    candidates, so the result depends only on the state of ``rng``.

    Args:
        rng: Random generator used for every draw.

    Returns:
        A dict of PPO keyword arguments, including ``policy_kwargs`` with
        the sampled ``net_arch``.
    """
    pick = rng.choice

    # The learning rate and the network layout are drawn first; the order of
    # the draws determines which configuration a given rng state produces.
    step_size = pick([2e-4, 1e-4, 8e-5, 5e-5, 3e-5])  # discrete LR choices like DQN
    hidden_layers = pick([
        [256, 256],
        [128, 128],
        [256, 128],
        [64, 64],
    ])

    search_space = (
        ("gamma", [0.97, 0.98, 0.99, 0.995]),
        ("gae_lambda", [0.90, 0.95, 0.97]),
        ("n_steps", [512, 1024, 2048]),
        ("batch_size", [64, 128, 256]),
        ("n_epochs", [5, 10, 15]),
        ("clip_range", [0.1, 0.2, 0.3]),
        ("ent_coef", [0.0, 0.001, 0.005, 0.01]),  # exploration/entropy
        ("vf_coef", [0.3, 0.5, 0.7]),
        ("max_grad_norm", [0.5, 1.0, 2.0]),
    )

    cfg = {"learning_rate": step_size}
    for key, candidates in search_space:
        cfg[key] = pick(candidates)
    cfg["policy_kwargs"] = dict(net_arch=hidden_layers)
    return cfg


# 5. HELPERS (load monitor csv, moving avg, grad updates)

In [7]:
def moving_avg(x, w=100):
    """Return the moving average of ``x`` over a window of ``w`` samples.

    Only fully covered windows are produced (``mode="valid"``), so the
    result has ``len(x) - w + 1`` entries. An empty array is returned when
    ``x`` holds fewer than ``w`` values.
    """
    series = np.asarray(x, dtype=float)
    if len(series) >= w:
        kernel = np.ones(w) / w
        return np.convolve(series, kernel, mode="valid")
    return np.array([])

def load_monitor_csv(monitor_csv_path: str):
    """Load the per-episode columns of a Stable-Baselines3 Monitor CSV file.

    Comment lines (starting with ``#``) are dropped before parsing. With
    ``names=True``, ``np.genfromtxt`` would otherwise strip the ``#`` from
    the first line and use that line as the column header instead of the
    real ``r,l,t`` header.

    Args:
        monitor_csv_path: Path of the CSV file to read.

    Returns:
        Tuple ``(r, l, t)`` of 1-D arrays holding the ``r`` (reward),
        ``l`` (length) and ``t`` (time) columns.

    Raises:
        ValueError: If the file contains no header row.
    """
    with open(monitor_csv_path, "r", encoding="utf-8") as handle:
        rows = [
            line for line in handle
            if line.strip() and not line.lstrip().startswith("#")
        ]
    if not rows:
        raise ValueError(f"no CSV header found in {monitor_csv_path!r}")

    data = np.genfromtxt(rows, delimiter=",", names=True, dtype=None, encoding="utf-8")
    # A file with a single data row is parsed as a 0-d record; keep the
    # columns 1-D so callers can always treat them as sequences.
    data = np.atleast_1d(data)
    # columns: r (reward), l (length), t (time)
    return data["r"], data["l"], data["t"]

def ppo_grad_updates(total_steps: int, n_steps: int, batch_size: int, n_epochs: int, n_envs: int = 1):
    """Estimate the number of gradient updates PPO performs during training.

    Each rollout collects ``n_steps * n_envs`` transitions and is followed by
    ``n_epochs`` passes over the rollout, each split into
    ``ceil(rollout_size / batch_size)`` minibatches (the last one may be
    smaller). Rollouts are collected until at least ``total_steps``
    transitions have been gathered, so the rollout count is rounded up too.

    Args:
        total_steps: Requested number of environment steps.
        n_steps: Rollout length per environment.
        batch_size: Minibatch size.
        n_epochs: Optimisation passes over each rollout.
        n_envs: Number of parallel environments (1 in this notebook).

    Returns:
        Estimated number of minibatch gradient updates as an ``int``.
    """
    rollout_size = n_steps * n_envs
    # -(-a // b) is ceiling division without going through floats.
    rollouts = max(0, -(-total_steps // rollout_size))
    minibatches = -(-rollout_size // batch_size)
    return int(rollouts * n_epochs * minibatches)


# 6. RUN PPO (single function used by tuning + final)

In [8]:
# Train-and-evaluate entry point shared by the tuning loop and the final runs:
#    - trains for total_steps environment steps (same budget for every run, for fairness)
#    - evaluates deterministically for eval_episodes episodes
#    - saves the model (.zip) and its cfg (.json); training curves go to TensorBoard

def run_ppo(cfg: dict, seed: int, run_name: str, total_steps: int = TOTAL_STEPS,
            eval_episodes: int = EVAL_EPISODES, wind_eval: bool = False):
    """Train one PPO model, save it and evaluate it.

    Args:
        cfg: PPO keyword arguments (as produced by ``sample_ppo_cfg``); the
            dict is copied and not modified.
        seed: Seed for the global RNGs, the training environment and PPO.
            The evaluation environment uses ``seed + 10_000``.
        run_name: Prefix of the saved files and TensorBoard run name.
        total_steps: Number of environment steps to train for.
        eval_episodes: Number of deterministic evaluation episodes.
        wind_eval: When true, the evaluation environment has wind enabled.
            Training never uses wind.

    Returns:
        A dict with the keys ``algo``, ``run_name``, ``seed``, ``cfg``,
        ``total_steps``, ``grad_updates``, ``eval_episodes``, ``eval_mean``,
        ``eval_std``, ``model_path`` and ``cfg_path``.
    """
    cfg = dict(cfg)

    # Reproducibility: set_random_seed seeds Python's random, NumPy and torch.
    set_random_seed(seed)

    # Train env (NO WIND for fair training by default)
    venv = make_vec_env(seed, enable_wind=False)
    eval_env = None
    try:
        # TensorBoard logs
        tb_log = os.path.join(OUT_DIR, "tb")
        os.makedirs(tb_log, exist_ok=True)

        model = PPO(
            policy="MlpPolicy",
            env=venv,
            seed=seed,
            verbose=0,
            tensorboard_log=tb_log,
            **cfg
        )

        model.learn(total_timesteps=int(total_steps), tb_log_name=run_name)

        # Save model
        model_path = os.path.join(OUT_DIR, f"{run_name}_seed{seed}.zip")
        model.save(model_path)

        # Save cfg
        cfg_path = os.path.join(OUT_DIR, f"{run_name}_seed{seed}_cfg.json")
        with open(cfg_path, "w", encoding="utf-8") as f:
            json.dump(cfg, f, indent=2)

        # Eval env (optionally wind stress-test); a shifted seed keeps the
        # evaluation episodes distinct from the training seed.
        eval_env = make_vec_env(seed + 10_000, enable_wind=wind_eval)
        mean_eval, std_eval = evaluate_policy(
            model, eval_env, n_eval_episodes=int(eval_episodes), deterministic=True, return_episode_rewards=False
        )
    finally:
        # Release the environments even when training or evaluation fails.
        venv.close()
        if eval_env is not None:
            eval_env.close()

    # Grad update estimate
    grad_updates = ppo_grad_updates(
        total_steps=int(total_steps),
        n_steps=int(cfg["n_steps"]),
        batch_size=int(cfg["batch_size"]),
        n_epochs=int(cfg["n_epochs"]),
    )

    return {
        "algo": "PPO",
        "run_name": run_name,
        "seed": seed,
        "cfg": cfg,
        "total_steps": int(total_steps),
        "grad_updates": int(grad_updates),
        "eval_episodes": int(eval_episodes),
        "eval_mean": float(mean_eval),
        "eval_std": float(std_eval),
        "model_path": model_path,
        "cfg_path": cfg_path,
    }


# 7. MONTE CARLO TUNING LOOP (select best cfg)

In [9]:
def mc_tune_ppo(n_trials=30, tune_seed=42, train_seed=42, total_steps: int = TOTAL_STEPS):
    """Random-search PPO hyper-parameters and keep the best trial.

    Args:
        n_trials: Number of configurations to sample and train.
        tune_seed: Seed of the generator that samples configurations.
        train_seed: Seed handed to ``run_ppo`` for every trial.
        total_steps: Training budget of each trial in environment steps.

    Returns:
        ``(trials, best)``: the list of all ``run_ppo`` results in trial
        order, and the result with the highest ``eval_mean`` (the earliest
        one on ties; ``None`` when no trial was run).
    """
    sampler = random.Random(tune_seed)
    history = []
    leader = None

    for index in range(n_trials):
        trial_no = index + 1
        candidate = sample_ppo_cfg(sampler)

        print(f"\n=== PPO Trial {trial_no}/{n_trials} | cfg: {candidate} ===")
        outcome = run_ppo(
            candidate,
            seed=train_seed,
            run_name=f"ppo_trial{trial_no:02d}",
            total_steps=total_steps,
            eval_episodes=EVAL_EPISODES,
        )
        history.append(outcome)

        # A strict comparison keeps the earlier trial when scores tie.
        if leader is None or outcome["eval_mean"] > leader["eval_mean"]:
            leader = outcome

        print(f"Trial {trial_no} -> eval_mean: {outcome['eval_mean']:.2f} ± {outcome['eval_std']:.2f} | BEST: {leader['eval_mean']:.2f}")

    return history, leader


# 8. RUN TUNING (pick best cfg)

In [10]:
# Run the 30-trial random search, then report the winning configuration.
ppo_trials, best_ppo = mc_tune_ppo(
    n_trials=30,
    tune_seed=42,
    train_seed=42,
    total_steps=TOTAL_STEPS,
)
print("\nBEST PPO CFG:\n", best_ppo["cfg"])
print(
    "BEST PPO eval:",
    best_ppo["eval_mean"],
    "±",
    best_ppo["eval_std"],
)


=== PPO Trial 1/30 | cfg: {'learning_rate': 0.0002, 'gamma': 0.99, 'gae_lambda': 0.9, 'n_steps': 512, 'batch_size': 64, 'n_epochs': 15, 'clip_range': 0.1, 'ent_coef': 0.0, 'vf_coef': 0.7, 'max_grad_norm': 1.0, 'policy_kwargs': {'net_arch': [256, 256]}} ===




Trial 1 -> eval_mean: -69.30 ± 112.35 | BEST: -69.30

=== PPO Trial 2/30 | cfg: {'learning_rate': 0.0002, 'gamma': 0.97, 'gae_lambda': 0.9, 'n_steps': 512, 'batch_size': 256, 'n_epochs': 15, 'clip_range': 0.1, 'ent_coef': 0.001, 'vf_coef': 0.7, 'max_grad_norm': 2.0, 'policy_kwargs': {'net_arch': [256, 256]}} ===
Trial 2 -> eval_mean: -110.40 ± 36.40 | BEST: -69.30

=== PPO Trial 3/30 | cfg: {'learning_rate': 3e-05, 'gamma': 0.98, 'gae_lambda': 0.95, 'n_steps': 2048, 'batch_size': 128, 'n_epochs': 5, 'clip_range': 0.1, 'ent_coef': 0.01, 'vf_coef': 0.5, 'max_grad_norm': 1.0, 'policy_kwargs': {'net_arch': [64, 64]}} ===
Trial 3 -> eval_mean: -1496.99 ± 316.12 | BEST: -69.30

=== PPO Trial 4/30 | cfg: {'learning_rate': 0.0001, 'gamma': 0.99, 'gae_lambda': 0.9, 'n_steps': 512, 'batch_size': 128, 'n_epochs': 5, 'clip_range': 0.2, 'ent_coef': 0.005, 'vf_coef': 0.7, 'max_grad_norm': 1.0, 'policy_kwargs': {'net_arch': [128, 128]}} ===
Trial 4 -> eval_mean: 126.64 ± 90.94 | BEST: 126.64

=== PPO

# 9. FINAL TRAINING (3 seeds) using best cfg + fair comparison

In [11]:
# Retrain the best configuration once per final seed with the full step budget.
final_cfg = dict(best_ppo["cfg"])

ppo_final = []
for s in FINAL_SEEDS:
    print("\n=== PPO FINAL seed", s, "===")
    final_result = run_ppo(
        final_cfg,
        seed=s,
        run_name="ppo_final",
        total_steps=TOTAL_STEPS,
        eval_episodes=EVAL_EPISODES,
    )
    ppo_final.append(final_result)

# Aggregate the per-seed evaluation means (sample std, ddof=1).
means = [res["eval_mean"] for res in ppo_final]
print(
    "\nFINAL PPO (20 eps) mean across seeds:",
    float(np.mean(means)),
    "| std across seeds:",
    float(np.std(means, ddof=1)),
)
print("Gradient updates (per seed):", [res["grad_updates"] for res in ppo_final])



=== PPO FINAL seed 42 ===

=== PPO FINAL seed 123 ===

=== PPO FINAL seed 999 ===

FINAL PPO (20 eps) mean across seeds: 188.22471618652344 | std across seeds: 18.200725422976788
Gradient updates (per seed): [105120, 105120, 105120]


# 10. DIFFERENTIATOR: stress-test wind (evaluation only)

In [12]:
# Stress test: the evaluation environment has wind enabled, training stays wind-free.
# NOTE(review): run_ppo trains a new model on every call, so each seed is
# trained again here instead of reusing the model saved by the final runs.
ppo_wind = []
for s in FINAL_SEEDS:
    print("\n=== PPO WIND-EVAL seed", s, "===")
    wind_result = run_ppo(
        final_cfg,
        seed=s,
        run_name="ppo_final_wind",
        total_steps=TOTAL_STEPS,
        eval_episodes=EVAL_EPISODES,
        wind_eval=True,
    )
    ppo_wind.append(wind_result)

# Aggregate the per-seed evaluation means (sample std, ddof=1).
means_w = [res["eval_mean"] for res in ppo_wind]
print(
    "\nWIND-EVAL PPO mean across seeds:",
    float(np.mean(means_w)),
    "| std:",
    float(np.std(means_w, ddof=1)),
)



=== PPO WIND-EVAL seed 42 ===





=== PPO WIND-EVAL seed 123 ===

=== PPO WIND-EVAL seed 999 ===

WIND-EVAL PPO mean across seeds: 112.46104939778645 | std: 21.85254123618533


# 11. SUMMARY TABLE (clean reporting)

In [13]:
def summarize(results, title):
    """Print a per-seed table followed by the across-seed mean and spread.

    Args:
        results: Iterable of result dicts with the keys ``seed``,
            ``eval_mean``, ``eval_std`` and ``grad_updates``.
        title: Heading printed above the table.

    The across-seed spread is the sample standard deviation (``ddof=1``) of
    the per-seed means. It is reported as ``nan`` when fewer than two
    results are given, and the mean is ``nan`` when there are none.
    """
    means = [r["eval_mean"] for r in results]

    print("\n" + title)
    for r in results:
        print(f"seed {r['seed']:>4d} | eval_mean±std: {r['eval_mean']:.2f} ± {r['eval_std']:.2f} | grad_updates: {r['grad_updates']}")

    # Guard the degenerate cases explicitly instead of letting NumPy emit
    # RuntimeWarnings; the printed value is "nan" either way.
    undefined = float("nan")
    overall_mean = np.mean(means) if means else undefined
    overall_std = np.std(means, ddof=1) if len(means) > 1 else undefined
    print(f"Across seeds: mean={overall_mean:.2f} | std={overall_std:.2f}")

# Report both result sets in the same tabular layout.
for _results, _title in (
    (ppo_final, "PPO FINAL (no wind, deterministic eval)"),
    (ppo_wind, "PPO STRESS-TEST (wind in eval only)"),
):
    summarize(_results, _title)



PPO FINAL (no wind, deterministic eval)
seed   42 | eval_mean±std: 171.66 ± 90.28 | grad_updates: 105120
seed  123 | eval_mean±std: 207.71 ± 60.09 | grad_updates: 105120
seed  999 | eval_mean±std: 185.31 ± 85.90 | grad_updates: 105120
Across seeds: mean=188.22 | std=18.20

PPO STRESS-TEST (wind in eval only)
seed   42 | eval_mean±std: 87.49 ± 130.91 | grad_updates: 105120
seed  123 | eval_mean±std: 128.08 ± 117.93 | grad_updates: 105120
seed  999 | eval_mean±std: 121.81 ± 139.12 | grad_updates: 105120
Across seeds: mean=112.46 | std=21.85
