In [6]:
import gymnasium as gym
import numpy as np
from collections import namedtuple, deque

import matplotlib.pylab as plt
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

import random


In [7]:
class DQNAgent:
    """Deep Q-Network agent with an online network and a soft-updated target network.

    States are fed to the networks as vectors of length ``state_size`` and the
    networks output one Q-value per action (``action_size`` outputs).

    Class-level hyperparameters:
        BATCH_SIZE: number of experiences sampled per learning step.
        TAU: interpolation factor for the soft target-network update.
        E_MIN: lower bound for the exploration rate.
        E_DECAY: multiplicative per-call decay applied to the exploration rate.
    """

    BATCH_SIZE = 64
    TAU = 1e-3
    E_MIN = 0.01
    # Multiplicative factor: epsilon <- max(E_MIN, E_DECAY * epsilon).
    # The previous value (0.05) dropped epsilon from 1.0 to the floor after
    # only two calls, which effectively disabled exploration.
    E_DECAY = 0.995

    def __init__(self, state_size, action_size):
        """Store the hyperparameters and build the online and target Q-networks.

        Args:
            state_size: length of the state vector fed to the networks.
            action_size: number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory_size = 100_000
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.alpha = 0.001   # learning rate handed to Adam
        self.NUM_STEPS_FOR_UPDATE = 4
        self.optimizer = Adam(learning_rate=self.alpha)
        self.q_network = self._build_network()
        self.target_q_network = self._build_network()

    def _build_network(self):
        """Return a fresh MLP: two 64-unit ReLU layers and a linear Q-value head."""
        model = Sequential()
        model.add(Input(shape=(self.state_size,)))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        return model

    def compute_loss(self, experiences):
        """Compute the MSE between the TD targets and the online Q-values.

        Args:
            experiences: tuple ``(states, actions, rewards, next_states, done_vals)``
                as produced by ``get_experiences``.

        Returns:
            The mean squared error between ``y_targets`` and ``Q(s, a)``.
        """
        states, actions, rewards, next_states, done_vals = experiences
        # Bootstrap from the target network; (1 - done) removes the bootstrap
        # term for transitions flagged as done.
        max_qsa = tf.reduce_max(self.target_q_network(next_states), axis=-1)
        y_targets = rewards + (self.gamma * max_qsa * (1 - done_vals))
        q_values = self.q_network(states)
        # Use the dynamic batch size: the static shape may be unknown while
        # tf.function traces this code.
        batch_indices = tf.range(tf.shape(q_values)[0])
        action_indices = tf.cast(actions, tf.int32)
        # Select Q(s, a) for the action actually taken in each row.
        q_values = tf.gather_nd(q_values, tf.stack([batch_indices, action_indices], axis=1))
        loss = MSE(y_targets, q_values)

        return loss

    def update_target_network(self):
        """Soft-update the target weights: target <- TAU * online + (1 - TAU) * target."""
        for target_weights, q_network_weights in zip(self.target_q_network.weights, self.q_network.weights):
            target_weights.assign(self.TAU * q_network_weights + (1.0 - self.TAU) * target_weights)

    @tf.function
    def agent_learn(self, experiences):
        """Run one gradient step on the online network, then soft-update the target.

        Args:
            experiences: mini-batch tuple accepted by ``compute_loss``.
        """
        with tf.GradientTape() as tape:
            loss = self.compute_loss(experiences)
        gradients = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.q_network.trainable_variables))
        self.update_target_network()

    def get_action(self, q_values, epsilon=0):
        """Pick an action epsilon-greedily.

        Args:
            q_values: tensor whose first row holds the Q-values of the current state.
            epsilon: probability of choosing a uniformly random action.

        Returns:
            The index of the chosen action.
        """
        if random.random() > epsilon:
            return np.argmax(q_values.numpy()[0])
        # Explore over the agent's real action count rather than a fixed 6.
        return random.randrange(self.action_size)

    def check_update_conditions(self, j, memory_buffer):
        """Return True when a learning step is due.

        A step is due every ``NUM_STEPS_FOR_UPDATE`` time steps, provided the
        buffer holds more than ``BATCH_SIZE`` experiences.

        Args:
            j: zero-based time-step index within the episode.
            memory_buffer: replay buffer of stored experiences.
        """
        due_this_step = (j + 1) % self.NUM_STEPS_FOR_UPDATE == 0
        return due_this_step and len(memory_buffer) > self.BATCH_SIZE

    def get_experiences(self, memory_buffer):
        """Sample ``BATCH_SIZE`` experiences and stack them into float32 tensors.

        Args:
            memory_buffer: replay buffer whose items expose ``state``, ``action``,
                ``reward``, ``next_state`` and ``done`` attributes.

        Returns:
            Tuple ``(states, actions, rewards, next_states, done_vals)``.
        """
        batch = [e for e in random.sample(memory_buffer, k=self.BATCH_SIZE) if e is not None]
        states = tf.convert_to_tensor(np.array([e.state for e in batch]), dtype=tf.float32)
        actions = tf.convert_to_tensor(np.array([e.action for e in batch]), dtype=tf.float32)
        rewards = tf.convert_to_tensor(np.array([e.reward for e in batch]), dtype=tf.float32)
        next_states = tf.convert_to_tensor(np.array([e.next_state for e in batch]), dtype=tf.float32)
        # Booleans become 0/1 so they can act as a multiplicative mask in the loss.
        done_vals = tf.convert_to_tensor(np.array([e.done for e in batch]).astype(np.uint8),
                                         dtype=tf.float32)
        return (states, actions, rewards, next_states, done_vals)

    def update_epsilon(self, epsilon):
        """Return the decayed exploration rate, never below ``E_MIN``.

        Args:
            epsilon: current exploration rate.
        """
        return max(self.E_MIN, self.E_DECAY * epsilon)

    def get_one_hot_encoding(self, state, next_state, num_states):
        """One-hot encode two discrete state indices.

        Args:
            state: index of the current state.
            next_state: index of the following state.
            num_states: length of each encoded vector.

        Returns:
            Tuple ``(state_vector, next_state_vector)`` of NumPy arrays.
        """
        state_vector = np.zeros(num_states)
        next_state_vector = np.zeros(num_states)
        state_vector[state] = 1
        next_state_vector[next_state] = 1

        return state_vector, next_state_vector

In [8]:
def Train_Agent(Agent, episodes, rewards, state_size):
    """Train ``Agent`` on the module-level ``env`` with experience replay.

    Args:
        Agent: agent exposing ``memory_size``, ``q_network``, ``target_q_network``,
            ``get_one_hot_encoding``, ``get_action``, ``check_update_conditions``,
            ``get_experiences``, ``agent_learn`` and ``update_epsilon``.
        episodes: maximum number of episodes to run.
        rewards: list that receives the total reward of every finished episode
            (mutated in place).
        state_size: number of discrete states, used for one-hot encoding.

    Notes:
        Reads the global ``env`` and closes it when training ends. Training
        stops early once the mean reward over the last (up to) 100 episodes
        reaches 8.
    """
    memory_buffer = deque(maxlen=Agent.memory_size)
    # Start the target network from the same weights as the online network.
    Agent.target_q_network.set_weights(Agent.q_network.get_weights())
    epsilon = 1.0
    points_history = []
    time_step = 1000
    experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
    episodes_run = 0

    for i in range(episodes):
        observation = env.reset()
        state = observation[0]
        state, _ = Agent.get_one_hot_encoding(state, 0, state_size)
        total_points = 0

        for j in range(time_step):
            state_qn = np.expand_dims(state, axis=0)
            q_values = Agent.q_network(state_qn)
            action = Agent.get_action(q_values, epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)

            _, next_state = Agent.get_one_hot_encoding(0, next_state, state_size)
            # Only `terminated` is stored as the done flag; a truncated
            # transition is kept with done=False.
            memory_buffer.append(experience(state, action, reward, next_state, terminated))
            if Agent.check_update_conditions(j, memory_buffer):
                experiences = Agent.get_experiences(memory_buffer)
                Agent.agent_learn(experiences)

            state = next_state.copy()
            total_points += reward

            # Stop on truncation as well: the environment must be reset before
            # it is stepped again once either flag is set.
            if terminated or truncated:
                break

        episodes_run = i + 1
        points_history.append(total_points)
        # Record the episode before the early-exit check so the final
        # (solving) episode is not missing from `rewards`.
        rewards.append(total_points)
        avg_points = np.mean(points_history[-100:])

        epsilon = Agent.update_epsilon(epsilon)

        print(f"\rEpisode {i+1} | Total point average of the last {100} episodes: {avg_points:.2f}", end="")

        if avg_points >= 8:
            print(f"Environment solved in {i+1} episodes!")
            break

        if (i + 1) % 100 == 0:
            print(f"\rEpisode {i+1} | Total point average of the last {100} episodes: {avg_points:.2f}")

    env.close()
    # Report the number of episodes actually run, which may be fewer than
    # requested after an early stop.
    print(f"\rTraining completed over {episodes_run} episodes")
    


In [9]:
# Seed every random source, not just TensorFlow: the agent draws its
# exploration decisions and replay samples from Python's `random` module, and
# the environment has its own RNG for initial states.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)  # seeded as a precaution for any NumPy-based randomness
tf.random.set_seed(SEED)

env = gym.make('Taxi-v3')
env.reset(seed=SEED)  # seeding once here also fixes the sequence of later resets

rewards = []
episodes = 2500
state_size = env.observation_space.n
action_size = env.action_space.n

Agent = DQNAgent(state_size, action_size)
Train_Agent(Agent, episodes, rewards, state_size)

# Learning curve: total reward collected in each training episode.
plt.title("Cumulative reward per episode")
plt.xlabel("Episode")
plt.ylabel("Cumulative reward")
plt.plot(rewards)
plt.show()
env.close()

AttributeError: module 'keras.src.backend' has no attribute 'floatx'