# Compare the Q-value estimation of Q-learning and Deep Q-Network (DQN)

Use the CliffWalking environment, because it is simple and easy to understand.

In [None]:
from reinforce.alg.reinforce import REINFORCE
from actor_critic.alg.actor_critic import ActorCritic
import gymnasium as gym
from tqdm import tqdm
import torch
import numpy as np
import matplotlib.pyplot as plt

def set_seed(seed):
    """
    Set random seed for reproducibility.

    Seeds Python's built-in ``random`` module, NumPy's global random state
    and PyTorch's random number generators with the same value.

    Args:
        seed (int): seed value applied to every generator.
    """
    # imported here so the function does not depend on a file-level import
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

def train_on_policy_agent(env, agent, num_episodes, num_pbar, state_selected, action_selected):
    """
    Train an on-policy agent, updating it once after every finished episode.

    Args:
        env: environment exposing the API used below: ``reset()`` returning
            ``(state, info)`` and ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``.
        agent: object providing ``take_action(state)`` and
            ``update(transition_dict)``.
        num_episodes: requested number of episodes. Each progress bar runs
            ``int(num_episodes / num_pbar)`` episodes, so any remainder is
            dropped when ``num_episodes`` is not a multiple of ``num_pbar``.
        num_pbar: number of progress bars the run is split into.
        state_selected: not used by this function; kept in the signature so
            existing callers keep working.
        action_selected: not used by this function; kept in the signature so
            existing callers keep working.

    Returns:
        list: the sum of rewards of each episode, in the order they were run.
    """
    # to record episode returns
    return_list = []
    # number of episodes covered by each progress bar (loop-invariant)
    episodes_per_bar = int(num_episodes / num_pbar)
    for i in range(num_pbar):
        with tqdm(total=episodes_per_bar, desc="Iteration %d" % i) as pbar:
            for i_episode in range(episodes_per_bar):
                # each episode starts with an empty trajectory
                episode_return = 0
                transition_dict = {
                    "states": [],
                    "actions": [],
                    "next_states": [],
                    "rewards": [],
                    "dones": [],
                }
                # reset environment
                state, _ = env.reset()
                terminated, truncated = False, False
                while not (terminated or truncated):  # interaction loop
                    # select action
                    action = agent.take_action(state)
                    # step the environment
                    next_state, reward, terminated, truncated, _ = env.step(action)
                    # store transition; only `terminated` is recorded as the
                    # done flag, truncation just ends the loop
                    transition_dict["states"].append(state)
                    transition_dict["actions"].append(action)
                    transition_dict["next_states"].append(next_state)
                    transition_dict["rewards"].append(reward)
                    transition_dict["dones"].append(terminated)
                    # update state
                    state = next_state
                    episode_return += reward
                # episode finished
                return_list.append(episode_return)
                # update agent with the whole trajectory of this episode
                agent.update(transition_dict)
                # show mean return of last 10 episodes, every 10 episodes
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({
                        # global episode index: offset by the episodes already
                        # run in previous bars (was hard-coded to num_episodes/10)
                        "episode":
                        "%d" % (episodes_per_bar * i + i_episode + 1),
                        "return":
                        "%.3f" % (np.mean(return_list[-10:])),
                    })
                # update pbar
                pbar.update(1)
    return return_list

In [None]:
# REINFORCE params
lr = 1e-3  # learning rate passed to the REINFORCE agent
gamma = 0.99  # passed as gamma to both agents
# ActorCritic params
actor_lr = 1e-4  # learning rate passed to ActorCritic for the actor
critic_lr = 2e-3  # learning rate passed to ActorCritic for the critic
hidden_dim = 128  # number of neurons in the hidden layer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")  # use GPU if available
# env params
env_name = 'CartPole-v1'  # name of the environment to train on
# training params
num_pbar = 1 # number of progress bar
num_episodes = 100 # number of episodes to run
seed = 0  # random seed for reproducibility

# seed first, so that everything created below (environment reset, agent
# construction) happens after the global generators have been seeded
set_seed(seed)

# define the environment and the agent
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
# seed the environment's own generator as well so the initial state is reproducible
state_selected, _ = env.reset(seed=seed)
action_selected = 1
vpg_agent = REINFORCE(state_dim, hidden_dim, action_dim, state_selected, action_selected, lr, gamma, device, "discrete")
a2c_agent = ActorCritic(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, device, "discrete")
# train the agent



# plot the results: episode returns on the left, Q-value estimates on the right
# NOTE(review): dqn_return_list, ql_return_list, dqn_q_list and ql_q_list are
# not defined in the visible cells (the "train the agent" step above is empty);
# confirm they exist before running this cell.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
returns_ax, q_ax = ax

# left panel: return per episode
returns_ax.plot(dqn_return_list, label="DQN")
returns_ax.plot(ql_return_list, label="Q-learning")
returns_ax.set(
    xlabel="Episode",
    ylabel="Return",
    title="Comparison of DQN and Q-learning",
)
returns_ax.legend()

# right panel: Q-value per episode
q_ax.plot(dqn_q_list, label="DQN")
q_ax.plot(ql_q_list, label="Q-learning")
q_ax.set(
    xlabel="Episode",
    ylabel="Q-value",
    title="Comparison of DQN and Q-learning",
)
q_ax.legend()

在CliffWalking环境下训练，DQN训练很慢，每个episode训练时间很长，可能的原因想到了3个：
1. 神经网络本身就是用于浮点数(连续空间)的运算，强行将整型转为浮点来算Q值可能并不准确.
2. 浮点数本身有误差，在同样的网络参数下同一个整型的state输入进去可能得到的Q值估计会有差别，即使差别很小，但如果不同动作的Q值大小关系发生了变化，策略就会发生很大的变化，导致训练不稳定。
3. 一般DQN会用DL的方法从数据库中采样数据batch，然后更新网络。这么做需要先有一定的数据，在数据量还不够时，策略就会很盲目，类似MonteCarlo方法在更新前的状态，一直往上走，导致奖励很小，估计误差很大，可能直接就毁了训练。而将batch_size设为1，实际上效果也不好。