In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import gymnasium as gym

In [12]:
class DQN(nn.Module):
    """Small fully connected network mapping a state vector to one output per action.

    Architecture: ``input_dim -> 128 -> 128 -> output_dim`` with ReLU after the
    first two linear layers and no activation on the final layer.
    """

    def __init__(self, input_dim, output_dim):
        """Create the three linear layers.

        Args:
            input_dim: Size of the last dimension of the input tensor.
            output_dim: Number of values produced per input.
        """
        super().__init__()
        hidden_units = 128
        # Layers are created in fc1, fc2, fc3 order so seeded initialisation
        # and state_dict keys stay the same.
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return the raw (un-activated) outputs of the last layer for ``x``."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [13]:
class DQNAgent:
    """Epsilon-greedy DQN agent with a bounded experience-replay buffer.

    The agent owns one ``DQN`` network (used both for action values and for
    bootstrapped targets; there is no separate target network), an Adam
    optimizer, and a FIFO memory of
    ``(state, action, reward, next_state, done)`` tuples.
    """

    def __init__(self, state_dim, action_dim, lr, gamma, epsilon, epsilon_decay,
                 buffer_size, epsilon_min=0.01):
        """Build the network, optimizer and replay memory.

        Args:
            state_dim: Length of the state vector fed to the network.
            action_dim: Number of discrete actions (network outputs).
            lr: Adam learning rate.
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Initial probability of taking a random action.
            epsilon_decay: Multiplicative decay applied to ``epsilon`` after
                every learning call to ``replay``.
            buffer_size: Maximum number of transitions kept in memory; the
                oldest are dropped first.
            epsilon_min: Floor below which ``epsilon`` is never decayed.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.memory = deque(maxlen=buffer_size)
        self.model = DQN(state_dim, action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def act(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Array-like state vector accepted by the network.

        Returns:
            int: Index of the chosen action (always a plain Python ``int``).
        """
        # Strict '<' so that epsilon == 0.0 is guaranteed to be fully greedy
        # (np.random.rand() can return exactly 0.0).
        if np.random.rand() < self.epsilon:
            return int(np.random.choice(self.action_dim))
        # Action selection is inference only; skip building an autograd graph.
        with torch.no_grad():
            q_values = self.model(torch.as_tensor(state, dtype=torch.float32))
        return int(torch.argmax(q_values).item())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        ``done`` controls bootstrapping in ``replay``: when truthy, the target
        for this transition is the reward alone.
        """
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        """Sample a minibatch and take ONE gradient step on the TD error.

        Does nothing (and does not decay epsilon) until the memory holds at
        least ``batch_size`` transitions. Otherwise the whole minibatch is
        evaluated in a single forward pass, the loss is the mean squared error
        between Q(s, a) of the taken actions and
        ``reward + gamma * max_a' Q(s', a')`` (just ``reward`` for transitions
        stored with a truthy ``done``), and the optimizer is stepped once.
        Finally epsilon is decayed, clamped at ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample without replacement.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        if minibatch:
            states, actions, rewards, next_states, dones = zip(*minibatch)
            # Go through NumPy first: building a tensor straight from a tuple
            # of ndarrays is very slow.
            states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
            next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
            actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64)
            rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32)
            not_done = 1.0 - torch.as_tensor(np.asarray(dones), dtype=torch.float32)

            # Q(s, a) for the action that was actually taken in each transition.
            q_taken = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # Targets are constants w.r.t. the parameters: no gradient needed.
            with torch.no_grad():
                next_best = self.model(next_states).max(dim=1).values
                targets = rewards + self.gamma * next_best * not_done

            loss = nn.functional.mse_loss(q_taken, targets)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # Only decay while above the floor, so an epsilon that was manually set
        # below it (e.g. 0.0 for evaluation) is left untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [14]:
from monster_gen_world import MonsterWorldGenEnv
from gymnasium.envs.registration import register

# Make the custom environment available to gym.make() under a namespaced id.
# The entry point is given as "module:Class", and max_episode_steps caps each
# episode at 200 steps.
register(
    "gym_examples/MonsterWorldGenEnv-v0",
    max_episode_steps=200,
    entry_point="monster_gen_world:MonsterWorldGenEnv",
)

In [None]:
# Initialize environment and agent with Experience Replay Buffer
# Seed the Python, NumPy and PyTorch RNGs so network initialisation and
# epsilon-greedy exploration are repeatable across kernel restarts.
# NOTE(review): the environment itself is not seeded here because its
# reset(seed=...) support is not visible from this notebook.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make("gym_examples/MonsterWorldGenEnv-v0",)
state_dim = 4  # assumes the "stats" observation has 4 entries - TODO confirm against the env
action_dim = env.action_space.n
agent = DQNAgent(state_dim, action_dim, lr=0.0001, gamma=0.99, epsilon=1.0, epsilon_decay=0.9995, buffer_size=10000)

# Train the DQN agent with Experience Replay Buffer
batch_size = 128
num_episodes = 100
for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    total_reward = 0
    step = 0
    while not done:
        action = agent.act(state["stats"])
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode loop stops on either signal...
        done = terminated or truncated
        # ...but only a true termination may cut off bootstrapping. A
        # time-limit truncation is not a terminal state, so the stored flag
        # is `terminated`, not `done`.
        agent.remember(state["stats"], action, reward, next_state["stats"], terminated)
        state = next_state
        total_reward += reward
        step += 1
        agent.replay(batch_size)
    print(f"Episode: {episode + 1}, Total Reward: {total_reward} Total Steps: {step} Agent Epsilon: {agent.epsilon}")

Episode: 1, Total Reward: -153.80000000000007 Total Steps: 200 Agent Epsilon: 0.9641492930601405
Episode: 2, Total Reward: -200.0 Total Steps: 200 Agent Epsilon: 0.8723765399749112
Episode: 3, Total Reward: -186.17999999999998 Total Steps: 200 Agent Epsilon: 0.7893391956790312
Episode: 4, Total Reward: -164.25999999999996 Total Steps: 200 Agent Epsilon: 0.7142057784510548
Episode: 5, Total Reward: -199.8 Total Steps: 200 Agent Epsilon: 0.6462239513319374
Episode: 6, Total Reward: -196.58 Total Steps: 200 Agent Epsilon: 0.5847129887141925
Episode: 7, Total Reward: -181.81999999999996 Total Steps: 200 Agent Epsilon: 0.5290569600003411
Episode: 8, Total Reward: -200.0 Total Steps: 200 Agent Epsilon: 0.47869856207627054
Episode: 9, Total Reward: -200.0 Total Steps: 200 Agent Epsilon: 0.4331335388419069
Episode: 10, Total Reward: -33.45999999999999 Total Steps: 73 Agent Epsilon: 0.41760539527506135
Episode: 11, Total Reward: -200.0 Total Steps: 200 Agent Epsilon: 0.37785553796199073
Episode

In [None]:
# Greedy evaluation: play one episode with exploration switched off.
# NOTE: agent.epsilon stays at 0.0 after this cell; reset it before any
# further training.
agent.epsilon = 0.0
state, _ = env.reset()
done = False
total_reward = 0
reward = 0.0
# Inference only: no autograd graph is needed while rolling out the policy.
with torch.no_grad():
    while not done:
        action = agent.act(state["stats"])
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = next_state
        total_reward += reward
# Final observation and the reward of the last step.
print(state)
print(reward)
# Episode return, which was accumulated above but previously never shown.
print(f"Total Reward: {total_reward}")