In [1]:
import numpy as np
import random
import tensorflow as tf
from collections import deque
import gym
import matplotlib.pyplot as plt
import os
import time

!export TF_CPP_MIN_LOG_LEVEL=3

2024-12-20 09:25:26.772378: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-20 09:25:26.776392: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-20 09:25:26.789154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734686726.810320  145438 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734686726.816627  145438 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-20 09:25:26.839159: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [2]:
LEARNING_RATE = 0.001
GAMMA = 0.99
EPISODES = 1000
BATCH_SIZE = 64
MEMORY_SIZE = 10000
UPDATE_FREQUENCY = 100
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995

In [3]:
env = gym.make('CartPole-v1')

In [4]:
class QNetwork(tf.keras.Model):
    def __init__(self, action_space):
        super(QNetwork, self).__init__()
        self.dense1 = tf.keras.layers.Dense(24, activation='relu')
        self.dense2 = tf.keras.layers.Dense(24, activation='relu')
        self.out = tf.keras.layers.Dense(action_space, activation='linear')

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        return self.out(x)

In [5]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)

def epsilon_greedy_policy(state, epsilon, model, action_space):
    if random.random() < epsilon:
        return random.choice(range(action_space))
    else:
        state = np.expand_dims(state, axis=0)
        q_values = model(state)
        return np.argmax(q_values.numpy())

In [6]:
def train_dqn(model, target_model, memory, batch_size, gamma, optimizer):
    if memory.size() < batch_size:
        return

    batch = memory.sample(batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = np.array(states)
    next_states = np.array(next_states)

    with tf.GradientTape() as tape:
        q_values = model(states)
        next_q_values = target_model(next_states)
        
        target_q_values = q_values.numpy()
        for i in range(batch_size):
            if dones[i]:
                target_q_values[i][actions[i]] = rewards[i]
            else:
                target_q_values[i][actions[i]] = rewards[i] + gamma * np.max(next_q_values[i])  # 非终止状态

        target_q_values = tf.convert_to_tensor(target_q_values, dtype=tf.float32)
        loss = tf.reduce_mean(tf.square(q_values - target_q_values))

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))


In [7]:
action_space = env.action_space.n
model = QNetwork(action_space)
target_model = QNetwork(action_space)
target_model.set_weights(model.get_weights())

In [8]:
replay_buffer = ReplayBuffer(MEMORY_SIZE)

In [9]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

2024-12-20 09:25:41.012092: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [None]:
reward_list = []

start_time = time.time()
for episode in range(EPISODES):
    state = env.reset()[0]
    total_reward = 0
    done = False

    while not done:
        action = epsilon_greedy_policy(state, epsilon, model, action_space)
        next_state, reward, done, _, _ = env.step(action)

        replay_buffer.add((state, action, reward, next_state, done))
        
        state = next_state
        total_reward += reward

        train_dqn(model, target_model, replay_buffer, BATCH_SIZE, GAMMA, optimizer)

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if episode % UPDATE_FREQUENCY == 0:
        target_model.set_weights(model.get_weights())

    reward_list.append(total_reward)

    if episode % 10 == 0:
        current_time = time.time()
        elapsed_time = current_time - start_time
        print(f"Episode {episode}, Total Reward: {total_reward}, ε: {epsilon:.2f}, Traning time: {elapsed_time:.2f} s")
        
end_time = time.time()
training_time = end_time - start_time
print(f"Total training time: {training_time:.2f} seconds")

  if not isinstance(terminated, (bool, np.bool8)):


Episode 0, Total Reward: 14.0, ε: 0.99, Traning time: 0.00 s
Episode 10, Total Reward: 21.0, ε: 0.95, Traning time: 6.59 s
Episode 20, Total Reward: 37.0, ε: 0.90, Traning time: 15.91 s
Episode 30, Total Reward: 36.0, ε: 0.86, Traning time: 27.61 s
Episode 40, Total Reward: 20.0, ε: 0.81, Traning time: 35.09 s
Episode 50, Total Reward: 12.0, ε: 0.77, Traning time: 42.94 s
Episode 60, Total Reward: 14.0, ε: 0.74, Traning time: 50.29 s
Episode 70, Total Reward: 21.0, ε: 0.70, Traning time: 56.12 s
Episode 80, Total Reward: 19.0, ε: 0.67, Traning time: 63.74 s
Episode 90, Total Reward: 19.0, ε: 0.63, Traning time: 69.95 s
Episode 100, Total Reward: 13.0, ε: 0.60, Traning time: 74.97 s
Episode 110, Total Reward: 15.0, ε: 0.57, Traning time: 86.52 s
Episode 120, Total Reward: 12.0, ε: 0.55, Traning time: 92.71 s
Episode 130, Total Reward: 18.0, ε: 0.52, Traning time: 100.11 s
Episode 140, Total Reward: 14.0, ε: 0.49, Traning time: 105.90 s
Episode 150, Total Reward: 11.0, ε: 0.47, Traning t

In [None]:
plt.plot(reward_list)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward during DQN Training (Maximum Reward: 500)')
plt.show()