# DQN on CartPole (TensorFlow version)

In [2]:
import tensorflow as tf
import numpy as np
import gym

  from ._conv import register_converters as _register_converters


In [59]:
# Create the CartPole environment and keep only the underlying, unwrapped env object.
env = gym.make('CartPole-v0').unwrapped

In [67]:
# Hyperparameters
BATCH_SIZE = 32             # transitions sampled per learning step
LR = 0.01                   # learning rate
EPSILON = 0.9               # probability of taking the greedy action
GAMMA = 0.9                 # reward discount
TARGET_REPLACE_ITER = 100   # learning steps between target-network syncs
MEMORY_CAPACITY = 150       # number of transitions kept in replay memory
# Second name for the same setting; derived from TARGET_REPLACE_ITER so the
# two constants cannot drift apart.
REPLACE_TARGET_ITER = TARGET_REPLACE_ITER
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]

# Positional arguments for DeepQNetwork, in the order of its constructor parameters.
arg = [N_ACTIONS, N_STATES, LR, GAMMA, EPSILON, REPLACE_TARGET_ITER, MEMORY_CAPACITY, BATCH_SIZE]

In [68]:
# Reset TensorFlow's global default graph before the agent's graph is built;
# presumably so re-running the cells below starts from an empty graph.
tf.reset_default_graph()

In [69]:
# Number of episodes the training loop iterates over.
n_epochs = 100

In [70]:
class DeepQNetwork:
    """DQN agent with a separate target network (TensorFlow 1.x graph mode).

    Two networks with the same layout (one 20-unit ReLU hidden layer followed
    by a linear layer with ``n_actions`` outputs) are built: ``eval_net`` is
    trained, ``target_net`` supplies the bootstrap value and is overwritten
    with the eval weights every ``replace_target_iter`` learning steps.

    Args:
        n_actions: Number of discrete actions (width of the Q output).
        n_features: Length of one observation vector.
        learning_rate: Learning rate passed to the RMSProp optimizer.
        reward_decay: Discount factor (gamma) used in the TD target.
        e_greedy: Upper bound for epsilon. Note that here epsilon is the
            probability of taking the *greedy* action.
        replace_target_iter: Learning steps between target-network syncs.
        memory_size: Number of transitions kept in the replay memory.
        batch_size: Number of transitions sampled per learning step.
        e_greedy_increment: If given, epsilon starts at 0 and grows by this
            amount per learning step up to ``e_greedy``; if ``None`` epsilon
            starts at ``e_greedy``.
        output_graph: If true, write the graph to ``logs/`` for TensorBoard.
    """

    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=100,
            memory_size=150,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # Number of completed learn() calls.
        self.learn_step_counter = 0
        # Total number of transitions ever stored. Initialised here (rather
        # than lazily in store_transition) so learn() can always read it.
        self.memory_counter = 0

        # Replay memory, one row per transition laid out as [s, a, r, s_].
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # Builds both target_net and eval_net plus the loss / train ops.
        self._build_net()

        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

        # Important: every replace_target_iter learning steps the eval-net
        # weights are copied into the target net through these assign ops.
        with tf.variable_scope('hard_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.InteractiveSession()

        # Keep a handle on the writer so close() can flush and release it.
        self._summary_writer = None
        if output_graph:
            # $ tensorboard --logdir=logs
            self._summary_writer = tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        # Loss value of every learning step, in order.
        self.cost_his = []

    def _build_net(self):
        """Create placeholders, both networks, the TD loss and the train op."""
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input state
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input next state
        self.r = tf.placeholder(tf.float32, [None, ], name='r')  # input reward
        self.a = tf.placeholder(tf.int32, [None, ], name='a')  # input action

        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)

        # Evaluation network: Q(s, .) for the current state.
        with tf.variable_scope('eval_net'):
            e1 = tf.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer, name='e1')
            self.q_eval = tf.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer,
                                          bias_initializer=b_initializer, name='q')

        # Target network: Q(s_, .) for the next state.
        with tf.variable_scope('target_net'):
            t1 = tf.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer, name='t1')
            self.q_next = tf.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer,
                                          bias_initializer=b_initializer, name='t2')

        # TD target from the target net's Q-values for the next state.
        # NOTE(review): the bootstrap term is always added; terminal
        # transitions are not treated specially.
        with tf.variable_scope('q_target'):
            q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_')    # shape=(None, )
            # No gradient flows into the target through this path.
            self.q_target = tf.stop_gradient(q_target)

        with tf.variable_scope('q_eval'):
            # Pair each row index with its action so gather_nd selects
            # Q(s, a) of the action that was actually taken.
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)    # shape=(None, )

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error'))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

    def store_transition(self, s, a, r, s_):
        """Store one (s, a, r, s_) transition, overwriting the oldest slot when full."""
        transition = np.hstack((s, [a, r], s_))
        # Ring buffer: wrap around once memory_size rows have been written.
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        """Return an action index for ``observation``.

        With probability ``self.epsilon`` the action with the highest
        eval-net Q-value is chosen, otherwise a uniformly random action.
        """
        # Add the batch dimension expected by the placeholder.
        observation = np.asarray(observation)[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # Forward pass to get the Q-value of every action.
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = int(np.argmax(actions_value))
        else:
            action = int(np.random.randint(0, self.n_actions))
        return action

    def _sample_batch(self):
        """Return ``batch_size`` rows drawn (with replacement) from the filled part of memory."""
        n_stored = min(self.memory_counter, self.memory_size)
        sample_index = np.random.choice(n_stored, size=self.batch_size)
        return self.memory[sample_index, :]

    def _anneal_epsilon(self):
        """Move epsilon one step towards ``epsilon_max`` without exceeding it."""
        if self.epsilon_increment is None:
            # No schedule configured: only enforce the upper bound.
            self.epsilon = min(self.epsilon, self.epsilon_max)
        else:
            self.epsilon = min(self.epsilon + self.epsilon_increment, self.epsilon_max)

    def learn(self):
        """Run one training step on a random batch from the replay memory.

        Raises:
            RuntimeError: If no transition has been stored yet.
        """
        if self.memory_counter == 0:
            raise RuntimeError('learn() called before any transition was stored')

        # Periodically copy the eval-net weights into the target net.
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.target_replace_op)
            print('\ntarget_params_replaced\n')

        batch_memory = self._sample_batch()
        n = self.n_features

        _, cost = self.sess.run(
            [self._train_op, self.loss],
            feed_dict={
                self.s: batch_memory[:, :n],
                # Memory is a float array; the action placeholder is int32.
                self.a: batch_memory[:, n].astype(np.int32),
                self.r: batch_memory[:, n + 1],
                self.s_: batch_memory[:, -n:],
            })

        self.cost_his.append(cost)

        self._anneal_epsilon()
        self.learn_step_counter += 1

    def plot_cost(self):
        """Plot the recorded loss of every learning step."""
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()

    def close(self):
        """Close the summary writer (if any) and the TensorFlow session."""
        if self._summary_writer is not None:
            self._summary_writer.close()
            self._summary_writer = None
        self.sess.close()

In [71]:
# Build the agent from the positional hyperparameter list; output_graph=True
# additionally writes the TensorFlow graph to logs/ for TensorBoard.
DQN = DeepQNetwork(*arg, output_graph=True)

In [72]:

# Training loop: run n_epochs episodes, storing every transition and training
# the agent once the replay memory has been filled.
env = gym.make('CartPole-v0')
# Use the underlying environment object; the loop reads x_threshold and
# theta_threshold_radians from it.
env = env.unwrapped
try:
    for i_episode in range(n_epochs):
        s = env.reset()
        ep_r = 0
        while True:
            env.render()
            a = DQN.choose_action(s)

            # Take the action.
            s_, r, done, info = env.step(a)

            # Replace the environment's reward with a shaped one.
            x, x_dot, theta, theta_dot = s_

            # The default reward is 1 for every step taken, including the
            # termination step, so it is redefined here.
            # env.x_threshold is the maximum allowed distance along x; a
            # larger r1 means the cart is closer to the center.
            # The -0.8 offset keeps the reward from inflating: even dead
            # center only earns 1 - 0.8 = 0.2. Training also works without it.
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8

            # The closer the pole is to vertical, the larger r2.
            # The -0.5 is a correction offset like the one above.
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            DQN.store_transition(s, a, r, s_)

            ep_r += r
            # Start learning only after more than memory_size transitions
            # have been stored; episode results are printed from then on.
            if DQN.memory_counter > DQN.memory_size:
                DQN.learn()
                if done:
                    print('Ep: ', i_episode,
                          '| Ep_r: ', round(ep_r, 2))

            if done:
                break
            s = s_
finally:
    # Close the environment even when the loop is interrupted or fails.
    env.close()


target_params_replaced

Ep:  15 | Ep_r:  1.29
Ep:  16 | Ep_r:  2.53
Ep:  17 | Ep_r:  1.44
Ep:  18 | Ep_r:  1.41
Ep:  19 | Ep_r:  3.71
Ep:  20 | Ep_r:  4.04
Ep:  21 | Ep_r:  2.39
Ep:  22 | Ep_r:  1.57
Ep:  23 | Ep_r:  4.19

target_params_replaced

Ep:  24 | Ep_r:  7.31

target_params_replaced

Ep:  25 | Ep_r:  66.31
Ep:  26 | Ep_r:  2.1
Ep:  27 | Ep_r:  2.21
Ep:  28 | Ep_r:  2.9
Ep:  29 | Ep_r:  3.17
Ep:  30 | Ep_r:  2.75

target_params_replaced

Ep:  31 | Ep_r:  2.72
Ep:  32 | Ep_r:  2.26
Ep:  33 | Ep_r:  3.09
Ep:  34 | Ep_r:  3.12
Ep:  35 | Ep_r:  2.38
Ep:  36 | Ep_r:  3.51
Ep:  37 | Ep_r:  2.14
Ep:  38 | Ep_r:  1.24
Ep:  39 | Ep_r:  0.81
Ep:  40 | Ep_r:  2.22

target_params_replaced

Ep:  41 | Ep_r:  1.87
Ep:  42 | Ep_r:  5.4
Ep:  43 | Ep_r:  2.99
Ep:  44 | Ep_r:  2.16
Ep:  45 | Ep_r:  2.72
Ep:  46 | Ep_r:  3.01
Ep:  47 | Ep_r:  2.92

target_params_replaced

Ep:  48 | Ep_r:  11.1
Ep:  49 | Ep_r:  4.58

target_params_replaced

Ep:  50 | Ep_r:  39.19

target_params_replaced

Ep:  51 |

In [73]:
# Explicit cleanup: close the environment, e.g. if the training cell was
# stopped before it finished.
env.close()