# DQN cartpole
> import 相關package

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym

In [2]:
# Create the CartPole environment.
env = gym.make('CartPole-v0')
# Work with the underlying environment object instead of the wrapped one
# returned by gym.make (presumably to bypass the wrapper's episode step
# limit -- TODO confirm against the installed gym version).
env = env.unwrapped

In [3]:
# Length of the observation vector; the notebook cell displays this value.
env.observation_space.shape[0]

4

# 建立NET

In [4]:
# Hyperparameters
BATCH_SIZE = 32             # number of transitions sampled per learning step
LR = 0.01                   # learning rate for the Adam optimizer
EPSILON = 0.9               # probability of taking the greedy (highest-Q) action
GAMMA = 0.9                 # reward discount factor
TARGET_REPLACE_ITER = 100   # learning steps between target-network syncs
MEMORY_CAPACITY = 2000      # number of transitions the replay memory holds
N_ACTIONS = env.action_space.n                  # number of discrete actions
N_STATES = env.observation_space.shape[0]       # length of the state vector

In [5]:
class Net(nn.Module):
    """Small fully connected network: state vector in, one value per action out.

    Architecture: Linear(N_STATES, 50) -> ReLU -> Linear(50, N_ACTIONS).
    Both weight matrices are re-initialised from a normal distribution with
    mean 0 and standard deviation 0.1; biases keep PyTorch's default init.
    """

    def __init__(self):
        super().__init__()

        # Hidden layer. Initialise right after creation so the order in which
        # random numbers are drawn matches layer order.
        self.fc1 = nn.Linear(N_STATES, 50)
        nn.init.normal_(self.fc1.weight, mean=0, std=0.1)

        # Output layer: one unit per action.
        self.out = nn.Linear(50, N_ACTIONS)
        nn.init.normal_(self.out.weight, mean=0, std=0.1)

    def forward(self, x):
        """Return the per-action values for input ``x``."""
        hidden = F.relu(self.fc1(x))
        return self.out(hidden)
        
# Instantiate the network once and print its layer summary as a sanity check.
N = Net()
print(N)

Net(
  (fc1): Linear(in_features=4, out_features=50, bias=True)
  (out): Linear(in_features=50, out_features=2, bias=True)
)


# 建立DQN 
> 兩個網路架構相同：eval network 每次 learn 都會更新；target network 只在每隔 TARGET_REPLACE_ITER 次 learn 時，從 eval network 同步參數

In [10]:
class DQN(object):
    """DQN agent with an online (eval) network and a periodically synced target network.

    Transitions live in a fixed-size ring buffer ``self.memory``; each row is
    laid out as ``[s (N_STATES values), a, r, s_ (N_STATES values)]``.
    """

    def __init__(self):
        # Two networks with the same architecture: eval_net is trained,
        # target_net is only overwritten with eval_net's weights in learn().
        self.eval_net, self.target_net = Net(), Net()

        self.learn_step_counter = 0     # learn() calls so far; drives target sync
        self.memory_counter = 0         # total transitions ever stored
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def choose_action(self, x):
        """Pick an action for state ``x``.

        With probability EPSILON the action with the highest eval_net value
        is returned; otherwise a uniformly random action index is returned.
        """
        x = torch.unsqueeze(torch.FloatTensor(x), 0)    # add a batch dimension

        if np.random.uniform() < EPSILON:
            # Greedy branch. Pure inference, so skip building an autograd graph.
            with torch.no_grad():
                actions_value = self.eval_net(x)
            action = torch.argmax(actions_value, 1).item()
        else:
            # Exploration branch: any action, uniformly at random.
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_):
        """Write one transition into the ring buffer, overwriting the oldest row when full."""
        transition = np.hstack((s, a, r, s_))

        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one gradient step on eval_net using a random batch from memory.

        Does nothing if no transition has been stored yet.
        """
        # Only rows that have actually been written are valid samples; the
        # rest of the buffer still holds the zeros it was allocated with.
        n_stored = min(self.memory_counter, MEMORY_CAPACITY)
        if n_stored == 0:
            return

        # Every TARGET_REPLACE_ITER learning steps, copy eval_net's weights
        # into target_net.
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Sample BATCH_SIZE rows (with replacement) and split them into columns.
        sample_index = np.random.choice(n_stored, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :N_STATES])
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2])
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:])

        # Value eval_net assigns to the action that was actually taken; shape (batch, 1).
        q_eval = self.eval_net(b_s).gather(1, b_a)

        # Target-network values for the next state, shape (batch, n_actions).
        # Detached so the loss does not backpropagate into target_net.
        q_next = self.target_net(b_s_).detach()

        # keepdim=True keeps shape (batch, 1) without hard-coding the batch size.
        q_target = b_r + GAMMA * q_next.max(1, keepdim=True)[0]
        loss = self.loss_func(q_eval, q_target)

        # Gradients accumulate across backward() calls, so clear them each step.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


        

In [11]:
# Create the agent (builds both networks, the replay memory and the optimizer).
dqn = DQN()

# 跑環境

In [12]:
# Number of episodes to run in the training loop.
n_epochs = 200

In [13]:
# Training loop: run n_epochs episodes, storing every transition and training
# the agent once the replay memory has been filled.
env = gym.make('CartPole-v0')
env = env.unwrapped
try:
    for i_episode in range(n_epochs):
        s = env.reset()
        ep_r = 0    # accumulated shaped reward for this episode
        while True:
            env.render()
            a = dqn.choose_action(s)

            # take action
            s_, r, done, info = env.step(a)

            # Replace the environment's reward with a shaped one computed
            # from the cart position and pole angle of the next state.
            x, x_dot, theta, theta_dot = s_

            # r1 grows as the cart gets closer to the center (x == 0).
            # The -0.8 offset keeps the reward small: at the exact center
            # r1 is 1 - 0.8 = 0.2. Per the original author, training also
            # works without this offset.
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8

            # r2 grows as the pole gets closer to upright (theta == 0);
            # -0.5 is the same kind of offset as above.
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            dqn.store_transition(s, a, r, s_)

            ep_r += r
            # Start learning only after more than MEMORY_CAPACITY transitions
            # have been stored; episode results are printed only from then on.
            if dqn.memory_counter > MEMORY_CAPACITY:
                dqn.learn()
                if done:
                    print('Ep: ', i_episode,
                          '| Ep_r: ', round(ep_r, 2))

            if done:
                break
            s = s_
finally:
    # Always release the environment (and its render window), even when the
    # loop is interrupted or raises.
    env.close()


Ep:  168 | Ep_r:  4.18
Ep:  169 | Ep_r:  2.51
Ep:  170 | Ep_r:  1.78
Ep:  171 | Ep_r:  2.26
Ep:  172 | Ep_r:  2.93
Ep:  173 | Ep_r:  3.29
Ep:  174 | Ep_r:  1.69
Ep:  175 | Ep_r:  1.19
Ep:  176 | Ep_r:  3.08
Ep:  177 | Ep_r:  3.46
Ep:  178 | Ep_r:  0.78
Ep:  179 | Ep_r:  2.92
Ep:  180 | Ep_r:  2.05
Ep:  181 | Ep_r:  2.88
Ep:  182 | Ep_r:  3.16
Ep:  183 | Ep_r:  2.14
Ep:  184 | Ep_r:  2.68
Ep:  185 | Ep_r:  1.82
Ep:  186 | Ep_r:  2.39
Ep:  187 | Ep_r:  2.38
Ep:  188 | Ep_r:  2.9
Ep:  189 | Ep_r:  1.83
Ep:  190 | Ep_r:  3.08
Ep:  191 | Ep_r:  2.98
Ep:  192 | Ep_r:  3.05
Ep:  193 | Ep_r:  4.59
Ep:  194 | Ep_r:  0.91
Ep:  195 | Ep_r:  7.2
Ep:  196 | Ep_r:  2.08
Ep:  197 | Ep_r:  6.31
Ep:  198 | Ep_r:  2.52
Ep:  199 | Ep_r:  3.93
