# All code generated by Chatgpt

In [2]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
from tqdm import trange

In [3]:
# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x1e0731600b0>

In [4]:
# Define Q-Network
class QNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(QNetwork, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.net(x)

In [None]:
# Initialise env
env = gym.make("LunarLander-v3", render_mode="human")
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

# Initialise models
model = QNetwork(input_dim, output_dim).to(device)
target_model = QNetwork(input_dim, output_dim).to(device)
target_model.load_state_dict(model.state_dict())
target_model.eval()

QNetwork(
  (net): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=4, bias=True)
  )
)

In [6]:
# Hyperparameters
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.995
batch_size = 64
learning_rate = 1e-3
episodes = 500
target_update_freq = 10
replay_buffer = deque(maxlen=100_000)

# Optimiser & Loss
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

## Hyperparameters for DQN Training

These hyperparameters guide how the DQN agent learns and explores:

- **Gamma:** Determines how much future rewards matter (close to 1 means long-term rewards are valued highly).
- **Epsilon:** Starting exploration rate (100% exploration at start).
- **Epsilon Min:** The minimum exploration rate (ensures some exploration continues).
- **Epsilon Decay:** Reduces epsilon over time, gradually shifting from exploration to exploitation.
- **Batch Size:** Number of experiences sampled at once during training.
- **Learning Rate:** How quickly the network updates its parameters.
- **Episodes:** Number of episodes the agent trains for.
- **Target Network Update Frequency:** How often the target Q-network gets updated.

### Replay Buffer
A replay buffer to stabilise training by reusing past experiences.

### Optimiser & Loss Function
- **Optimiser:** Adam optimiser, chosen for its stability and adaptive learning rate.
- **Loss Function:** Mean Squared Error (MSE) loss, used to measure how accurately predicted Q-values match target Q-values.


In [9]:
# Training loop
for episode in trange(episodes):
    state = env.reset()[0]
    state = torch.tensor(state, dtype=torch.float32).to(device)
    done = False
    total_reward = 0

    while not done:
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                q_values = model(state.unsqueeze(0))
                action = q_values.argmax().item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        next_state = torch.tensor(next_state, dtype=torch.float32).to(device)
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state
        total_reward += reward

        # Training
        if len(replay_buffer) >= batch_size:
            minibatch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*minibatch)

            states = torch.stack(states)
            next_states = torch.stack(next_states)
            actions = torch.tensor(actions, dtype=torch.int64, device=device)
            rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
            dones = torch.tensor(dones, dtype=torch.bool, device=device)

            q_vals = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            with torch.no_grad():
                next_q_vals = target_model(next_states).max(1)[0]
                target_q = rewards + gamma * next_q_vals * (~dones)

            loss = loss_fn(q_vals, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Update target network
    if episode % target_update_freq == 0:
        target_model.load_state_dict(model.state_dict())

    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

print("Training finished.")
env.close()

100%|██████████| 500/500 [1:33:58<00:00, 11.28s/it]

Training finished.





## Training Loop for DQN (Simple Explanation)

Here's how the agent learns over 500 episodes:

- **Start an episode:**  
  Reset the environment and get the initial state.

- **Choose actions (Epsilon-greedy):**  
  The agent either:
  - Picks a random action (to explore), or
  - Uses the trained model to pick the best-known action (to exploit).

- **Interact with the environment:**  
  Perform the chosen action, receive a new state and reward, and check if the episode is done.

- **Store experience:**  
  Save each experience (state, action, reward, next state, done) in memory.

- **Train from experience:**  
  Once there's enough experience:
  - Randomly sample experiences.
  - Calculate predicted and target Q-values.
  - Update the network based on the difference (loss) between these values.

- **Update Target Network:**  
  Every 10 episodes, update the target model with the main model’s parameters.

- **Reduce exploration gradually:**  
  Decrease exploration over time, letting the agent focus more on learned actions.

After training, the environment is closed, and the model is ready to use.


In [10]:
# Save model
torch.save(model.state_dict(), "lunarlander_dqn.pth")
print("model saved")

model saved
