In [1]:
import tensorflow as tf
import numpy as np
import gym

  from ._conv import register_converters as _register_converters


In [2]:
env = gym.make('CartPole-v0')
env = env.unwrapped

# 超參數

In [3]:
BATCH_SIZE = 32
EPISILON = 0.9
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]
MEMORY_CAPACITY = 150
GAMMA = 0.9
LR = 0.01

In [4]:
tf.reset_default_graph()

In [5]:
class double_DQN_Agent():
    def __init__(self,
                 n_actions = 2,
                 n_states = 4,
                 memory_capacity = 150,
                 episilon = 0.9,
                 gamma = 0.9,
                 batch_size = 32,
                 lr = 0.01,
                 memory_counter = 0,
                 synchronize_iter = 100,
                 
                 ):
        
        self.memory = np.zeros((memory_capacity,n_states*2+2))
        self.memory_capacity = memory_capacity
        self.memory_counter = 0
        self.synchronize_iter = synchronize_iter
        self.synchronize_counter = 0
        self.batch_size = batch_size
        self.n_states = n_states
        self.n_actions = n_actions
        self.gamma = gamma
        self.lr = lr
        self.episilon = episilon
        
        self._build_net()
        
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        
    def _build_net(self):
        self.s = tf.placeholder(tf.float32,[None,self.n_states],name = "current_state")
        self.a = tf.placeholder(tf.int32,[None,],name = "action")
        self.r = tf.placeholder(tf.float32,[None,] ,name = "reward")
        self.s_ = tf.placeholder(tf.float32,[None,self.n_states],name = "next_state")
        
        init_weight,init_bias = tf.truncated_normal_initializer(stddev=0.01),tf.constant_initializer(0.1)
        
        # eval_net
        with tf.variable_scope("eval_net"):
            
            self.e_fc1 = tf.layers.dense(self.s , 50 , activation = tf.nn.relu , 
                            kernel_initializer = init_weight , 
                            bias_initializer = init_bias, name="e_fc1")
            
            self.e_out = tf.layers.dense(self.e_fc1 , self.n_actions , activation=tf.nn.relu,
                                   kernel_initializer = init_weight,
                                   bias_initializer = init_bias , 
                                    name="e_fc2")
            
            
            
        # target_net
        with tf.variable_scope("target_net"):
            self.t_fc1 = tf.layers.dense(self.s_ , 50 , activation =tf.nn.relu,
                                        kernel_initializer = init_weight , 
                                        bias_initializer = init_bias , name="t_fc1")
            
            self.t_out = tf.layers.dense(self.t_fc1 , self.n_actions , activation=tf.nn.relu,
                                        kernel_initializer = init_weight,
                                        bias_initializer = init_bias , name= "t_fc2")
        
        
        self.eval_params = tf.get_collection(key = tf.GraphKeys.TRAINABLE_VARIABLES , scope = "eval_net")
        self.target_params = tf.get_collection(key = tf.GraphKeys.TRAINABLE_VARIABLES , scope = "target_net")
        
        
        # eval 參數同步到 target的op
        self.synchronize_op = [ tf.assign(t,e) for e,t in zip(self.eval_params,self.target_params)]
        
        # loss
        
        
#         next_q = tf.reduce_max(self.t_out,axis=1)
        ######### double QDN只要改這邊 #########
        # 用eval net得到的batch action,再利用target net得到batch action所對應的Q值
        # 會加速不少
        
        e_actions = tf.argmax(self.e_out,axis=1,output_type=tf.int32)
        e_actions_idx = tf.stack([tf.range(tf.shape(e_actions)[0]),e_actions],axis=1)
        next_q=tf.gather_nd(params=self.t_out,indices = e_actions_idx)
        
        ######################################
        
        self.target_q = self.r + self.gamma*next_q
#         self.target_q = tf.stop_gradient(self.target_q)  # target不要更新參數
        
        index = tf.stack([tf.range(tf.shape(self.a)[0]),self.a],axis=1)
        self.eval_q = tf.gather_nd(params = self.e_out , indices = index)
        self.loss = tf.reduce_mean(tf.squared_difference(self.eval_q,self.target_q))
        
        # 只更新eval net的參數
        q_vars = [ var for var in tf.trainable_variables() if "eval_net" in var.name]
        self.step = tf.train.RMSPropOptimizer(learning_rate = self.lr).minimize(self.loss,var_list=q_vars)
        
        
        
        
    def choose_action(self,states):
        states = states[np.newaxis,:]
        
        q_val = self.sess.run(self.e_out,feed_dict={self.s:states})
        if np.random.normal() < self.episilon:
            act = np.argmax(q_val,axis=1)[0]
        else:
            act = np.random.choice(self.n_actions)
        return act
        
    def store_entry(self,s,a,r,s_):
        idx = self.memory_counter % self.memory_capacity
        
        row_entry = np.hstack([s,a,r,s_])
        self.memory[idx,:] = row_entry
        
        self.memory_counter+=1
        
        
    def train(self):
        idx = np.random.choice(self.memory_capacity, self.batch_size)
        memory_batch = self.memory[idx,:]
        
        s = memory_batch[:,:self.n_states]
        a = memory_batch[:,self.n_states]
        r = memory_batch[:,self.n_states+1]
        s_ = memory_batch[:,-self.n_states:]
        
        loss,_ = self.sess.run([self.loss,self.step],feed_dict={self.s:s,
                                                               self.a:a,
                                                               self.r:r,
                                                               self.s_:s_})

        
        
        # 固定步數執行同步動作
        if self.synchronize_counter%self.synchronize_iter==0:
            print("eval參數 同步到 target參數")
            self.sess.run(self.synchronize_op)
            
        self.synchronize_counter+=1   
        

In [6]:
DQN = double_DQN_Agent()

In [7]:
env = gym.make('CartPole-v0')
env = env.unwrapped

In [None]:
for i_episode in range(100):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = DQN.choose_action(s)
#         print(a)
        # take action
        s_, r, done, info = env.step(a)

        # modify the reward
        x, x_dot, theta, theta_dot = s_
        
    
        # reward要重新定義 預設是 : Reward is 1 for every step taken, including the termination step
        ## env.x_threshold代表x方向的最大距離,r1越大代表越靠近中間
        ## -0.8是要讓reward 不要那麼通膨, 最中間也只能拿 1-0.8 = 0.2 reward
        ## -0.8不加也是可以train的起來的
        r1 = (env.x_threshold - abs(x)) / env.x_threshold -0.8
        
        ## 角度越接近正垂直，r2越大
        ## -0.5同上是修正值
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians -0.5
        r = r1 + r2

        DQN.store_entry(s, a, r, s_)

        ep_r += r
        if DQN.memory_counter > DQN.memory_capacity:
            DQN.train()
            if done:
                print('Ep: ', i_episode,
                      '| Ep_r: ', round(ep_r, 2))

        if done:
            break
        s = s_
env.close()

eval參數 同步到 target參數
Ep:  15 | Ep_r:  4.03
Ep:  16 | Ep_r:  2.73
Ep:  17 | Ep_r:  1.46
Ep:  18 | Ep_r:  2.42
Ep:  19 | Ep_r:  2.92
Ep:  20 | Ep_r:  5.68
Ep:  21 | Ep_r:  1.77
Ep:  22 | Ep_r:  0.71
Ep:  23 | Ep_r:  1.29
eval參數 同步到 target參數
Ep:  24 | Ep_r:  3.72
Ep:  25 | Ep_r:  3.38
Ep:  26 | Ep_r:  9.73
Ep:  27 | Ep_r:  2.87
eval參數 同步到 target參數
Ep:  28 | Ep_r:  1.77
Ep:  29 | Ep_r:  2.11
Ep:  30 | Ep_r:  4.75
Ep:  31 | Ep_r:  2.36
Ep:  32 | Ep_r:  1.51
Ep:  33 | Ep_r:  1.7
Ep:  34 | Ep_r:  1.34
Ep:  35 | Ep_r:  4.36
eval參數 同步到 target參數
Ep:  36 | Ep_r:  5.99
Ep:  37 | Ep_r:  26.24
eval參數 同步到 target參數
eval參數 同步到 target參數
Ep:  38 | Ep_r:  69.31
eval參數 同步到 target參數
Ep:  39 | Ep_r:  14.74
Ep:  40 | Ep_r:  13.59
Ep:  41 | Ep_r:  2.69
Ep:  42 | Ep_r:  3.35
eval參數 同步到 target參數
Ep:  43 | Ep_r:  2.23
Ep:  44 | Ep_r:  10.91
Ep:  45 | Ep_r:  3.35
eval參數 同步到 target參數
Ep:  46 | Ep_r:  3.63
eval參數 同步到 target參數
Ep:  47 | Ep_r:  12.36
eval參數 同步到 target參數
Ep:  48 | Ep_r:  27.39
eval參數 同步到 target參數
eval參數