In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import gymnasium as gym

In [17]:
class DQN(nn.Module):
    """Fully connected network with a shared trunk and two output heads.

    The input passes through three hidden layers (512 -> 1024 -> 512 units)
    and is then projected by two independent linear heads, one per output
    group.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim_1: Number of outputs produced by the first head.
        output_dim_2: Number of outputs produced by the second head.
    """

    def __init__(self, input_dim, output_dim_1, output_dim_2):
        super().__init__()
        # Shared trunk.
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 1024)
        self.fc3 = nn.Linear(1024, 512)
        # Two heads reading the same 512-dimensional feature vector.
        self.out_1 = nn.Linear(512, output_dim_1)
        self.out_2 = nn.Linear(512, output_dim_2)

    def forward(self, x):
        """Return ``(head_1_output, head_2_output)`` for input ``x``."""
        features = x
        # The first two hidden layers use leaky ReLU, the third a plain ReLU.
        for layer in (self.fc1, self.fc2):
            features = nn.functional.leaky_relu(layer(features))
        features = nn.functional.relu(self.fc3(features))

        return self.out_1(features), self.out_2(features)

In [18]:
class DQNAgent:
    """Epsilon-greedy agent that learns a two-component action with one network.

    The agent owns a ``DQN`` model with two output heads (one per action
    component), an Adam optimizer over the model parameters, and a bounded
    replay buffer of ``(state, action, reward, next_state, done)`` tuples.

    Args:
        state_dim: Length of the flat state vector fed to the network.
        action_dim_1: Number of choices for the first action component.
        action_dim_2: Number of choices for the second action component.
        lr: Learning rate passed to Adam.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of choosing a random action in ``act``.
        epsilon_decay: Factor ``epsilon`` is multiplied by after every
            ``replay`` call that performs updates.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        epsilon_min: Floor for ``epsilon``; decay never pushes it below this.
    """

    def __init__(self, state_dim, action_dim_1, action_dim_2, lr, gamma, epsilon, epsilon_decay, buffer_size,
                 epsilon_min=0.01):
        self.state_dim = state_dim
        self.action_dim_1 = action_dim_1
        self.action_dim_2 = action_dim_2
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.memory = deque(maxlen=buffer_size)
        self.model = DQN(state_dim, action_dim_1, action_dim_2)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def act(self, state):
        """Choose an action pair for ``state`` using an epsilon-greedy policy.

        Args:
            state: Flat array-like state accepted by the model.

        Returns:
            A tuple ``(action_1, action_2)`` of Python ints.
        """
        # Strict comparison so that epsilon == 0 is always fully greedy.
        if np.random.rand() < self.epsilon:
            return int(np.random.choice(self.action_dim_1)), int(np.random.choice(self.action_dim_2))
        # Inference only: no autograd graph is needed to pick an action.
        with torch.no_grad():
            q_values_1, q_values_2 = self.model(torch.as_tensor(state, dtype=torch.float32))
        return torch.argmax(q_values_1).item(), torch.argmax(q_values_2).item()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        ``done`` decides whether ``replay`` bootstraps from ``next_state``;
        when it is true the target is the bare reward.
        """
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        """Sample ``batch_size`` transitions and take one optimizer step per sample.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions. After the updates, ``epsilon`` is decayed once and clamped
        at ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return

        loss_fn = nn.MSELoss()
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state_t = torch.as_tensor(state, dtype=torch.float32)
            q_values_1, q_values_2 = self.model(state_t)

            # The bootstrap values are constants for the update, so evaluate the
            # next state without building a graph.
            if done:
                target_1 = target_2 = reward
            else:
                next_state_t = torch.as_tensor(next_state, dtype=torch.float32)
                with torch.no_grad():
                    next_q_values_1, next_q_values_2 = self.model(next_state_t)
                target_1 = reward + self.gamma * next_q_values_1.max().item()
                target_2 = reward + self.gamma * next_q_values_2.max().item()

            # The target equals the current prediction everywhere except at the
            # taken action, so only that entry contributes to the loss. A
            # detached copy avoids a second forward pass over ``state``.
            target_f_1 = q_values_1.detach().clone()
            target_f_2 = q_values_2.detach().clone()
            target_f_1[action[0]] = target_1
            target_f_2[action[1]] = target_2

            # Both heads share the trunk, so their losses are summed and
            # back-propagated together.
            loss = loss_fn(q_values_1, target_f_1) + loss_fn(q_values_2, target_f_2)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # Decay exploration, never dropping below the configured floor. An
        # epsilon that already sits below the floor is left alone.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [19]:
from maze_world import MazeWorldEnv
from gymnasium.envs.registration import register

# Register the maze environment under a namespaced id so that gym.make can
# build it by name; max_episode_steps sets the per-episode step limit.
register(
    id="gym_examples/MazeWorldEnv-v0",
    max_episode_steps=250,
    entry_point="maze_world:MazeWorldEnv",
)

In [24]:
# Initialize environment and agent with Experience Replay Buffer
size = 7
env = gym.make("gym_examples/MazeWorldEnv-v0", render_mode="human", size=size)


def flatten_observation(observation, size):
    """Concatenate a dict observation into one flat vector.

    The ``"maze"`` entry is reshaped to ``size * size`` values and followed by
    the ``"start"`` and ``"end"`` entries, in that order.
    """
    return np.concatenate((observation["maze"].reshape(size * size,), observation["start"], observation["end"]))


# Each observation component contributes the product of its shape
# (an empty shape, i.e. a scalar, counts as 1).
dimensions = [space.shape for space in env.observation_space.values()]
state_dim = sum(int(np.prod(shape)) for shape in dimensions)
action_dim = np.sum([space.n for space in env.action_space])
agent = DQNAgent(state_dim, env.action_space[0].n, env.action_space[1].n, lr=0.005, gamma=0.99, epsilon=1.0, epsilon_decay=0.999, buffer_size=10000)

# Train the DQN agent with Experience Replay Buffer
batch_size = 64
num_episodes = 100
for episode in range(num_episodes):
    state, _ = env.reset()
    observation = flatten_observation(state, size)
    done = False
    total_reward = 0
    step = 0
    while not done:
        action = agent.act(observation)
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_observation = flatten_observation(next_state, size)
        # The episode loop stops on either signal, but only a true termination
        # is stored as terminal: a time-limit truncation is not an end state,
        # so the learning target must still bootstrap from next_observation.
        done = terminated or truncated
        agent.remember(observation, action, reward, next_observation, terminated)
        state = next_state
        observation = next_observation
        total_reward += reward
        step += 1
        # Learn from the replay buffer every fifth environment step.
        if step % 5 == 0:
            agent.replay(batch_size)
    print(f"Episode: {episode + 1}, Total Reward: {total_reward} Total Steps: {step}")

Episode: 1, Total Reward: -42951.5 Total Steps: 250
Episode: 2, Total Reward: -49074.0 Total Steps: 250
Episode: 3, Total Reward: 1471.5 Total Steps: 18
Episode: 4, Total Reward: -51108.5 Total Steps: 250
Episode: 5, Total Reward: -43560.5 Total Steps: 250
Episode: 6, Total Reward: -40332.0 Total Steps: 250
Episode: 7, Total Reward: -52277.0 Total Steps: 250
Episode: 8, Total Reward: -42754.0 Total Steps: 250
Episode: 9, Total Reward: -20489.5 Total Steps: 132
Episode: 10, Total Reward: -49551.0 Total Steps: 250
Episode: 11, Total Reward: -35311.0 Total Steps: 202
Episode: 12, Total Reward: -39436.0 Total Steps: 250
Episode: 13, Total Reward: -50423.5 Total Steps: 250
Episode: 14, Total Reward: 1768.0 Total Steps: 25
Episode: 15, Total Reward: -55098.5 Total Steps: 250
Episode: 16, Total Reward: -40698.5 Total Steps: 250
Episode: 17, Total Reward: -53282.5 Total Steps: 250
Episode: 18, Total Reward: -34074.0 Total Steps: 250
Episode: 19, Total Reward: -41646.0 Total Steps: 250
Episode:

In [26]:
# Roll out a single episode with the trained agent and print its return.
state, _ = env.reset()
done = False
total_reward = 0
# Evaluate the greedy policy: exploration is switched off for the rollout and
# the previous epsilon is restored afterwards, even if the rollout fails.
training_epsilon = agent.epsilon
agent.epsilon = 0.0
try:
    while not done:
        observation = np.concatenate((state["maze"].reshape(size*size,), state["start"], state["end"]))
        action = agent.act(observation)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = next_state
        total_reward += reward
finally:
    agent.epsilon = training_epsilon
print(total_reward)

892.5
