In [1]:
import time
import gymnasium as gym
import numpy as np
import tensorflow as tf
import keras
from collections import deque

# Create the LunarLander-v3 environment. Later cells read its action space,
# reset/step it during training, and close it once training finishes.
env = gym.make('LunarLander-v3')

In [2]:
class DQN(tf.keras.Model):
    """Multilayer perceptron that outputs one value per action.

    Two hidden ReLU layers of 32 units each feed a linear output layer with
    ``num_actions`` units.
    """

    def __init__(self, num_actions: int):
        super().__init__()
        hidden_units = 32
        self.dense1 = keras.layers.Dense(hidden_units, activation="relu")
        self.dense2 = keras.layers.Dense(hidden_units, activation="relu")
        # Output head is linear (no activation).
        self.dense3 = keras.layers.Dense(num_actions, dtype=tf.float32)

    def call(self, x):
        """Forward pass: apply the two hidden layers, then the output layer."""
        hidden = self.dense2(self.dense1(x))
        return self.dense3(hidden)

In [3]:
class ReplayBuffer(object):
    """Fixed-capacity experience replay buffer that samples uniformly.

    Transitions are stored as ``(state, action, reward, next_state, done)``
    tuples in a ``deque`` created with ``maxlen=size``, so once the buffer is
    full each new transition discards the oldest one.
    """

    def __init__(self, size):
        """Create an empty buffer holding at most ``size`` transitions."""
        self.buffer = deque(maxlen=size)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.buffer.append((state, action, reward, next_state, done))

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def sample(self, num_samples):
        """Draw ``num_samples`` transitions uniformly at random.

        Indices come from ``np.random.choice`` with its default settings, so
        the same transition may appear more than once in a batch.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays, one row per sampled transition. ``rewards`` and ``dones``
            are cast to ``float32``; the other arrays keep the dtype NumPy
            infers from the stored values.
        """
        idx = np.random.choice(len(self.buffer), num_samples)
        batch = [self.buffer[i] for i in idx]
        # np.asarray rather than np.array(..., copy=False): under NumPy >= 2.0
        # copy=False raises ValueError whenever a copy cannot be avoided
        # (e.g. a state stored as a list); asarray copies only when needed.
        states = np.array([np.asarray(transition[0]) for transition in batch])
        actions = np.array([np.array(transition[1]) for transition in batch])
        rewards = np.array([transition[2] for transition in batch], dtype=np.float32)
        next_states = np.array([np.asarray(transition[3]) for transition in batch])
        dones = np.array([transition[4] for transition in batch], dtype=np.float32)
        return states, actions, rewards, next_states, dones

## **Entrenamiento**

In [4]:
# --- Hyperparameters ---
learning_rate = 1e-4  # step size passed to the Adam optimizer
discount = 0.99       # multiplies the bootstrapped next-state value in the target
batch_size = 128      # transitions drawn from the buffer per training step
epsilon = 1.0         # initial exploration probability (lowered by the training loop)

# --- Run state ---
cur_frame = 0         # environment steps taken so far
num_actions = int(env.action_space.n)

# --- Networks, optimizer, loss and replay memory ---
main_nn = DQN(num_actions)    # main network: picks actions and receives gradient updates
target_nn = DQN(num_actions)  # target network: provides the next-state values
optimizer = keras.optimizers.Adam(learning_rate)
mse = keras.losses.MeanSquaredError()
buffer = ReplayBuffer(100_000)

In [5]:
def select_epsilon_greedy_action(state, epsilon):
    """Choose an action epsilon-greedily.

    A uniform random sample is drawn; if it is below ``epsilon`` a random
    action is sampled from the environment's action space, otherwise the
    action with the highest output of ``main_nn`` for ``state`` is returned.
    """
    explore = tf.random.uniform((1,)) < epsilon
    if not explore:
        # Greedy choice: argmax over the first row of the network output.
        q_values = main_nn(state)[0]
        return tf.argmax(q_values).numpy()
    # Exploratory choice: uniformly sampled action.
    return env.action_space.sample()
    
@tf.function  # One training iteration, wrapped by tf.function
def train_step(states, actions, rewards, next_states, dones):
    """Apply one gradient update to ``main_nn`` from a batch of transitions.

    The target for each transition is
    ``reward + (1 - done) * discount * max_a target_nn(next_state)[a]``;
    the loss is the mean squared error between that target and the output
    of ``main_nn`` for the action actually taken. Returns the loss.
    """
    # Targets are computed with the target network, outside the tape, so no
    # gradient flows through them.
    best_next_q = tf.reduce_max(target_nn(next_states), axis=-1)
    not_done = 1. - dones
    target = rewards + not_done * discount * best_next_q

    with tf.GradientTape() as tape:
        predicted = main_nn(states)
        # One-hot mask keeps only the output of the action taken in each row.
        chosen_q = tf.reduce_sum(tf.one_hot(actions, num_actions) * predicted, axis=-1)
        loss = mse(target, chosen_q)

    variables = main_nn.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss

In [None]:
num_episodes = 1000
trace = 50                    # print a progress line every `trace` episodes
target_sync_every = 2000      # frames between copies of main_nn weights into target_nn
epsilon_step = 0.001          # amount subtracted from epsilon after each decaying episode
epsilon_decay_episodes = 950  # epsilon stops decreasing once this many episodes have run

for episode in range(num_episodes+1):
    state, _ = env.reset()
    ep_reward, done, truncated = 0, False, False
    # An episode ends on termination OR truncation. Writing this as
    # `not done or truncated` would parse as `(not done) or truncated` and
    # keep stepping the environment after it has been truncated.
    while not (done or truncated):
        state_in = tf.expand_dims(state, axis=0)  # add a leading batch axis
        action = select_epsilon_greedy_action(state_in, epsilon)
        next_state, reward, done, truncated, _ = env.step(action)
        ep_reward += reward
        # Only `done` (termination) is stored: it is the flag that disables
        # bootstrapping from the next state in train_step.
        buffer.add(state, action, reward, next_state, done)
        state = next_state
        cur_frame += 1
        # Periodically copy the weights of main_nn into target_nn.
        if cur_frame % target_sync_every == 0:
            target_nn.set_weights(main_nn.get_weights())
        # Train once the buffer holds at least one full batch.
        if len(buffer) >= batch_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)
            loss = train_step(states, actions, rewards, next_states, dones)
    # Linear epsilon decay, applied once per episode.
    if episode < epsilon_decay_episodes:
        epsilon -= epsilon_step
    if episode % trace == 0:
        print(f'Episode {episode}/{num_episodes}. Epsilon: {epsilon:.3f}. Reward: {ep_reward:.3f}')

env.close()