In [22]:
# milestone2_dqn.py

# --- 0. 导入必要的库和设置随机种子 ---
import os
import random
import numpy as np
import gym # 根据您提供的 requirements.txt，直接导入 gym
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers.legacy import Adam

# Keras-RL2 imports
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory
from rl.callbacks import FileLogger, ModelIntervalCheckpoint, Callback
from rl.processors import Processor # <--- 新增导入

# 设置随机种子以保证结果可复现性
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [23]:
# --- 1. 环境初始化 ---
ENV_NAME = 'Acrobot-v1'
env = gym.make(ENV_NAME)
_ = env.reset(seed=SEED)

# 添加环境包装以兼容 keras-rl2 的 step() 方法
class KerasRL2Wrapper(gym.Wrapper):
    def step(self, action):
        # 新版 gym 的 step 返回 (observation, reward, terminated, truncated, info)
        observation, reward, terminated, truncated, info = self.env.step(action)
        # keras-rl2 需要 (observation, reward, done, info)
        # done 为 True 如果 terminated 或 truncated 为 True
        done = terminated or truncated
        return observation, reward, done, info

    # reset 方法在新的 Gym 版本中返回 (observation, info)
    # 您的 CustomProcessor 已经处理了 reset 返回元组的情况，所以这里不需要修改 reset 方法

# 应用包装
env = KerasRL2Wrapper(env)

nb_actions = env.action_space.n
input_shape = env.observation_space.shape

print(f"环境名称: {ENV_NAME}")
print(f"观测空间形状: {input_shape}")
print(f"动作空间大小: {nb_actions}")

环境名称: Acrobot-v1
观测空间形状: (6,)
动作空间大小: 3


In [24]:
# --- 2. 设计神经网络架构 (Q-网络) ---
def build_model(input_shape, nb_actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + input_shape))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(nb_actions, activation='linear'))
    return model

model = build_model(input_shape, nb_actions)
print("\n--- 构建的 Q-网络模型摘要 ---")
model.summary()

# --- 定義一個簡單的 Processor 類別 ---
class CustomProcessor(Processor):
    def process_observation(self, observation):
        # 確保觀測值是 float32 類型，這是 TensorFlow 常見的輸入類型
        # 檢查是否為元組，如果是，取第一個元素
        if isinstance(observation, tuple):
            observation = observation[0]
        return observation.astype('float32')

    def process_state_batch(self, batch):
        # 確保批處理的形狀適合模型輸入
        # 對於 window_length=1，batch 已經會是 (N, 1, *obs_shape)
        # Flatten 層會將其處理為 (N, flat_obs_size)
        return batch.astype('float32')


--- 构建的 Q-网络模型摘要 ---
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_5 (Flatten)         (None, 6)                 0         
                                                                 
 dense_15 (Dense)            (None, 64)                448       
                                                                 
 dense_16 (Dense)            (None, 64)                4160      
                                                                 
 dense_17 (Dense)            (None, 3)                 195       
                                                                 
Total params: 4803 (18.76 KB)
Trainable params: 4803 (18.76 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# --- 3. 构建 DQN 代理 (里程碑2的第1步) ---
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, nb_steps=50000, value_test=.05)

memory = SequentialMemory(limit=100000, window_length=1)

dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    nb_steps_warmup=1000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    processor=CustomProcessor() # <--- 在这里添加处理器
)

In [26]:
# --- 4. 编译代理 (里程碑2的第2步) ---
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

print("\n--- DQN 代理编译完成 ---")

2025-06-10 14:39:49.766958: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_17/bias/Assign' id:1339 op device:{requested: '', assigned: ''} def:{{{node dense_17/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_17/bias, dense_17/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.



--- DQN 代理编译完成 ---


In [27]:
# --- 5. 训练代理 (里程碑2的第3步) ---
NB_TRAINING_STEPS = 50000

log_filename = 'dqn_acrobot_log.json'
weights_filename = 'dqn_acrobot_weights.h5'
checkpoint_weights_filename = 'dqn_acrobot_weights_{step}.h5'

callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000),
    FileLogger(log_filename, interval=100)
]

print(f"\n--- 开始训练 DQN 代理，共 {NB_TRAINING_STEPS} 步 ---")
dqn.fit(env, nb_steps=NB_TRAINING_STEPS, visualize=False, verbose=2, callbacks=callbacks)
print("\n--- 训练完成 ---")


--- 开始训练 DQN 代理，共 50000 步 ---
Training for 50000 steps ...


2025-06-10 14:39:50.886456: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_17/BiasAdd' id:1344 op device:{requested: '', assigned: ''} def:{{{node dense_17/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_17/MatMul, dense_17/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2025-06-10 14:39:51.106977: W tensorflow/c/c_api.cc:305] Operation '{name:'total_12/Assign' id:1500 op device:{requested: '', assigned: ''} def:{{{node total_12/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](total_12, total_12/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after runni

   500/50000: episode: 1, duration: 4.470s, episode steps: 500, steps per second: 112, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.948 [0.000, 2.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  1000/50000: episode: 2, duration: 3.947s, episode steps: 500, steps per second: 127, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.890 [0.000, 2.000],  loss: --, mae: --, mean_q: --, mean_eps: --


  updates=self.state_updates,
2025-06-10 14:39:59.308357: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_17_1/BiasAdd' id:1418 op device:{requested: '', assigned: ''} def:{{{node dense_17_1/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_17_1/MatMul, dense_17_1/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2025-06-10 14:40:00.104449: W tensorflow/c/c_api.cc:305] Operation '{name:'loss_15/AddN' id:1631 op device:{requested: '', assigned: ''} def:{{{node loss_15/AddN}} = AddN[N=2, T=DT_FLOAT, _has_manual_control_dependencies=true](loss_15/mul, loss_15/mul_1)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or 

  1500/50000: episode: 3, duration: 11.180s, episode steps: 500, steps per second:  45, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.946 [0.000, 2.000],  loss: 0.052684, mae: 0.392730, mean_q: -0.360380, mean_eps: 0.977500
  2000/50000: episode: 4, duration: 9.051s, episode steps: 500, steps per second:  55, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.040 [0.000, 2.000],  loss: 0.003845, mae: 0.398548, mean_q: -0.487247, mean_eps: 0.968536
  2500/50000: episode: 5, duration: 7.396s, episode steps: 500, steps per second:  68, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.006 [0.000, 2.000],  loss: 0.002944, mae: 0.388470, mean_q: -0.469175, mean_eps: 0.959536
  3000/50000: episode: 6, duration: 3.452s, episode steps: 500, steps per second: 145, episode reward: -500.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.046 [0.000, 2.000],  loss: 0.002559, mae: 0.376311, mean_q: -0.44

In [28]:
# --- 6. 评估代理 (里程碑2的第4步和第5步) ---
NB_EVALUATION_EPISODES = 10 # 评估的回合数
print(f"\n--- 开始评估训练后的代理，共 {NB_EVALUATION_EPISODES} 回合 ---")
# visualize=True 会渲染环境，这需要一个图形界面
# 如果在没有图形界面的Docker中运行，请将 visualize 设为 False 或使用 'rgb_array' 模式并保存帧
history = dqn.test(env, nb_episodes=NB_EVALUATION_EPISODES, visualize=False, verbose=1) # 默认为False以避免图形界面问题

# 计算并记录平均奖励
episode_rewards = history.history['episode_reward']
average_reward = np.mean(episode_rewards)
print(f"评估回合奖励: {episode_rewards}")
print(f"平均奖励: {average_reward:.2f}")
print("--- 评估完成 ---")


--- 开始评估训练后的代理，共 10 回合 ---
Testing for 10 episodes ...
Episode 1: reward: -500.000, steps: 500
Episode 2: reward: -500.000, steps: 500
Episode 3: reward: -500.000, steps: 500
Episode 4: reward: -500.000, steps: 500
Episode 5: reward: -500.000, steps: 500
Episode 6: reward: -500.000, steps: 500
Episode 7: reward: -500.000, steps: 500
Episode 8: reward: -500.000, steps: 500
Episode 9: reward: -500.000, steps: 500
Episode 10: reward: -500.000, steps: 500
评估回合奖励: [-500.0, -500.0, -500.0, -500.0, -500.0, -500.0, -500.0, -500.0, -500.0, -500.0]
平均奖励: -500.00
--- 评估完成 ---


In [29]:
# --- 7. 保存模型权重 (里程碑2的第6步) ---
# 只保存模型的权重，而不是整个代理对象
dqn.save_weights(weights_filename, overwrite=True)
print(f"\n--- 模型权重已保存到: {weights_filename} ---")


--- 模型权重已保存到: dqn_acrobot_weights.h5 ---


In [32]:
# --- 8. 加载和测试保存的模型 (里程碑2的第7步) ---
print("\\n--- 正在加载保存的模型权重并测试 ---")

# 首先，创建一个新的代理实例 (或重置现有代理的状态)
# 注意：这里我们创建一个新的代理，但其模型结构必须与保存权重时的模型结构相同
loaded_model = build_model(input_shape, nb_actions)
loaded_dqn = DQNAgent(
    model=loaded_model,
    nb_actions=nb_actions,
    policy=policy, # 可以重复使用相同的策略和记忆，但对于加载测试，记忆通常不重要
    memory=memory,
    nb_steps_warmup=1000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    processor=CustomProcessor() # <--- 在这里添加处理器
)
loaded_dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# 加载权重
loaded_dqn.load_weights(weights_filename)
print(f"--- 模型权重已从 {weights_filename} 加载 ---")

# ... existing code ...
# 测试加载后的代理
NB_LOADED_TEST_EPISODES = 5 # 评估加载后的代理的回合数
print(f"\n--- 开始评估加载后的代理，共 {NB_LOADED_TEST_EPISODES} 回合 ---")
# 同样，这里也将 visualize 设为 False
loaded_history = loaded_dqn.test(env, nb_episodes=NB_LOADED_TEST_EPISODES, visualize=False, verbose=1)

loaded_episode_rewards = loaded_history.history['episode_reward']
loaded_average_reward = np.mean(loaded_episode_rewards)
print(f"加载后代理评估回合奖励: {loaded_episode_rewards}")
print(f"加载后代理平均奖励: {loaded_average_reward:.2f}")
print("--- 加载后的模型测试完成 ---")

# 关闭环境
env.close()

\n--- 正在加载保存的模型权重并测试 ---


2025-06-10 14:48:47.196387: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_25_1/bias/Assign' id:2815 op device:{requested: '', assigned: ''} def:{{{node dense_25_1/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_25_1/bias, dense_25_1/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2025-06-10 14:48:47.958446: W tensorflow/c/c_api.cc:305] Operation '{name:'total_24/Assign' id:2926 op device:{requested: '', assigned: ''} def:{{{node total_24/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](total_24, total_24/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either d

--- 模型权重已从 dqn_acrobot_weights.h5 加载 ---

--- 开始评估加载后的代理，共 5 回合 ---
Testing for 5 episodes ...
Episode 1: reward: -500.000, steps: 500
Episode 2: reward: -500.000, steps: 500
Episode 3: reward: -500.000, steps: 500
Episode 4: reward: -500.000, steps: 500
Episode 5: reward: -500.000, steps: 500
加载后代理评估回合奖励: [-500.0, -500.0, -500.0, -500.0, -500.0]
加载后代理平均奖励: -500.00
--- 加载后的模型测试完成 ---
