In [None]:
import os
import random
import sys
import time
import warnings

import numpy as np
import torch
import matplotlib.pyplot as plt

import gymnasium as gym
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

import imageio
from IPython.display import Image

In [None]:
SEED = 42

# Seed every random number generator this notebook relies on, so that a fresh
# "Restart Kernel -> Run All" starts from the same random state.
random.seed(SEED)        # Python's built-in `random` module
np.random.seed(SEED)     # NumPy global RNG (affects np.random.rand, np.random.shuffle, etc.)
torch.manual_seed(SEED)  # PyTorch (affects weight initialization, dropout, data shuffling, etc.)
if torch.cuda.is_available():
    # Seeds all GPUs; useful when using more than one GPU, otherwise
    # torch.manual_seed is enough. Called once, and only when CUDA is present.
    torch.cuda.manual_seed_all(SEED)

# Ensure deterministic behavior in cuDNN (NVIDIA backend for deep learning ops).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Output locations: everything produced by this notebook lives under OUT_DIR.
OUT_DIR = './outputs_DQN/'
os.makedirs(OUT_DIR, exist_ok=True)

TENSORBOARD_LOGS_DIR = OUT_DIR + "tensorboard/"
SAVE_MODEL_PATH = OUT_DIR + "model_dqn.zip"

# NOTE: despite its name, MODEL_NAME holds the Gymnasium *environment id*.
MODEL_NAME = "LunarLander-v3"
MLP_POLICY = "MlpPolicy"

# Smoke test: build the environment once and do a seeded reset so that a missing
# or broken environment dependency surfaces here rather than in the middle of
# training. The probe is closed immediately instead of being left bound to a
# global name that later cells overwrite (which would leak the instance).
probe_env = gym.make(MODEL_NAME)
try:
    probe_env.reset(seed=SEED)
finally:
    probe_env.close()

# Select device: use GPU if available, otherwise fall back to CPU.
# This is what controls on which device the processing will happen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Python:", sys.version.split()[0])
print("PyTorch:", torch.__version__)
print("Device:", device)
print("CUDA:", torch.version.cuda if torch.cuda.is_available() else "None")

In [None]:
def make_env():
    """Factory function that creates a fresh, Monitor-wrapped LunarLander environment.

    The environment id is read from the module-level ``MODEL_NAME``.
    Wrapping in ``Monitor`` records per-episode return and length in the step
    ``info`` dict, which Stable-Baselines3 uses to report episode statistics
    (e.g. ``rollout/ep_rew_mean``) in its console and TensorBoard logs.

    Returns:
        The newly created environment wrapped in ``Monitor``.
    """
    return Monitor(gym.make(MODEL_NAME))

# SB3 algorithms operate on vectorized environments; DummyVecEnv takes a list of
# factories (callables), not environment instances. A single env is used here.
env = DummyVecEnv([make_env])

In [None]:
# Earlier baseline configuration, kept as a note only (it was never executed):
# it used the same DQN hyperparameters as the model built in the next cell,
# but did not set `exploration_final_eps`, i.e. it relied on the
# Stable-Baselines3 default for the final exploration rate.

In [None]:
# Build the DQN agent on the vectorized training environment.
model_dqn = DQN(
    policy=MLP_POLICY,
    env=env,
    # Fraction of the total training timesteps over which epsilon is annealed,
    # and the value epsilon settles at once the annealing is over.
    exploration_fraction=0.12,
    exploration_final_eps=0.02,
    learning_rate=6.3e-4,
    buffer_size=50_000,
    batch_size=128,
    gamma=0.99,
    # Number of timesteps to collect before the first training update.
    # With learning_starts=0 there is NO warm-up phase: gradient updates begin
    # right away, while the replay buffer is still being filled.
    learning_starts=0,
    # Frequency (in timesteps) at which the target network is updated copying the weights from the main Q-network
    target_update_interval=250,
    # How often to perform a gradient update.
    # With train_freq=4, the network is updated once every 4 environment steps.
    train_freq=4,

    verbose=1,  # verbose 0 for disable logs
    seed=SEED,
    tensorboard_log=TENSORBOARD_LOGS_DIR,
    # Use the device selected in the configuration cell, so the device printed
    # there is the one the networks are actually placed on.
    device=device,
)

In [None]:
class DQNLoggingCallback(BaseCallback):
    """
    Callback that records a few training diagnostics at every environment step.

    Collected series (all plain Python lists):
    - ``epsilon_history``: the model's ``exploration_rate`` at each step
    - ``reward_history``: the summed reward of each finished episode
    - ``update_steps``: the model's gradient-update counter at each step

    Only index 0 of the ``rewards`` / ``dones`` arrays is read, so the episode
    bookkeeping follows the first environment of the vectorized env.

    These series are meant for plotting training curves inside the notebook,
    complementing (but not replacing) TensorBoard.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Per-step and per-episode series filled in by _on_step().
        self.epsilon_history = []
        self.reward_history = []
        self.update_steps = []
        # Running return of the episode currently in progress.
        self.episode_reward = 0

    def _on_step(self) -> bool:
        agent = self.model

        # Exploration rate (only recorded when the algorithm exposes one).
        if hasattr(agent, "exploration_rate"):
            self.epsilon_history.append(agent.exploration_rate)

        # Add this step's reward to the running episode return.
        step_rewards = self.locals.get("rewards")
        if step_rewards is not None:
            self.episode_reward += step_rewards[0]

        # When the episode is over, store its return and start a new one.
        episode_flags = self.locals.get("dones")
        episode_finished = episode_flags is not None and episode_flags[0]
        if episode_finished:
            self.reward_history.append(self.episode_reward)
            self.episode_reward = 0

        # Number of gradient updates performed so far.
        self.update_steps.append(agent._n_updates)

        # Returning True tells the training loop to keep going.
        return True

In [None]:
# Instantiate the callback that logs training metrics for the plots below.
callback = DQNLoggingCallback()

# Train the agent (an earlier run used total_timesteps=500_000).
model_dqn.learn(total_timesteps=750_000, callback=callback, progress_bar=True)

In [None]:
# Three stacked panels, one per series recorded by the logging callback.
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

# (series, title, x-label, y-label) for each panel, top to bottom.
panels = [
    # 1. EPSILON EVOLUTION
    (callback.epsilon_history, "Exploration Rate (Epsilon) over Time", "Timesteps", "Epsilon"),
    # 2. EPISODE REWARD EVOLUTION
    (callback.reward_history, "Episode Reward over Training", "Episode", "Cumulative Reward"),
    # 3. NUMBER OF GRADIENT UPDATE STEPS
    (callback.update_steps, "Number of Gradient Updates over Time", "Timesteps", "Update Steps"),
]

for ax, (series, title, x_label, y_label) in zip(axes, panels):
    ax.plot(series)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Create a new, clean, evaluation environment (non-vectorized).
# The raw env is seeded BEFORE being wrapped: Monitor rejects a second reset()
# while an episode is in progress, and evaluate_policy performs its own reset().
base_eval_env = gym.make(MODEL_NAME)
base_eval_env.reset(seed=SEED)
# Monitor makes evaluate_policy report the true (unwrapped) episode returns and
# lengths, and avoids SB3's "not wrapped with a Monitor" warning.
eval_env = Monitor(base_eval_env)

# Run some episodes with the trained model, in the new environment.
n_eval_episodes = 50
try:
    episode_rewards, episode_lengths = evaluate_policy(
        model_dqn,
        eval_env,
        n_eval_episodes=n_eval_episodes,  # single source of truth, also used in the report below
        deterministic=True,
        return_episode_rewards=True,  # return per-episode returns
    )
finally:
    # Close the evaluation environment even if evaluation fails.
    eval_env.close()

# Compute summary statistics manually
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)

print(f"Mean reward over {n_eval_episodes} episodes: {mean_reward:.2f} ± {std_reward:.2f}")
print(f"Episode length range: min = {int(np.min(episode_lengths))}, max = {int(np.max(episode_lengths))}")

# Visualize the distribution of episode returns
plt.figure(figsize=(8, 4))
plt.hist(episode_rewards, bins=10, edgecolor="black")
plt.title("Distribution of episode returns (DQN on " + MODEL_NAME + ")")
plt.xlabel("Total reward per episode")
plt.ylabel("Count")
plt.grid(True)
plt.show()


In [None]:
# Create a new environment for visualization only. render_mode="rgb_array" makes
# render() return actual image frames. A dedicated name is used so the training
# environment bound to `env` is not overwritten by a raw env that gets closed.
render_env = gym.make(MODEL_NAME, render_mode="rgb_array")

frames = []  # list to store each rendered frame of the episode

try:
    # Reset the environment to get the initial observation (state)
    obs, info = render_env.reset()
    done = False

    # Run one full episode using the trained DQN agent
    while not done:
        action, _ = model_dqn.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = render_env.step(action)
        # The episode ends either naturally (terminated) or by time limit (truncated)
        done = terminated or truncated
        frames.append(render_env.render())
finally:
    # Release the renderer even if the rollout raises.
    render_env.close()

# Save all collected frames as an animated GIF.
# NOTE(review): the unit of `duration` depends on the installed imageio version
# (milliseconds in recent releases, seconds in older ones) - confirm playback speed.
imageio.mimsave("dqn_lunarlander.gif", frames, duration=25)

# Presenting the image
Image(filename="dqn_lunarlander.gif")


In [None]:
# Persist the trained agent to disk at the configured path.
print(f"Saving model to: {SAVE_MODEL_PATH}")
model_dqn.save(path=SAVE_MODEL_PATH)

In [None]:
# Restore the agent from disk and attach it to a freshly built vectorized env.
print("Loading the saved model...")
env2 = DummyVecEnv(env_fns=[make_env])
loaded_model = DQN.load(path=SAVE_MODEL_PATH, env=env2)

In [None]:
# Roll out one episode with the reloaded agent as a sanity check.
# `env2` is a vectorized env: reset() returns only the batched observation, and
# step() returns batched arrays (one entry per sub-environment) in the
# 4-tuple form (obs, rewards, dones, infos).
obs = env2.reset()
done = False
total_reward = 0.0

while not done:
    action, _ = loaded_model.predict(obs, deterministic=True)
    obs, rewards, dones, infos = env2.step(action)
    # Only one sub-environment exists, so read index 0 and keep plain Python
    # scalars; accumulating the raw array would print as "[value]".
    total_reward += float(rewards[0])
    done = bool(dones[0])

print(f"Total reward by the loaded model: {total_reward:.2f}")