In [1]:
import tensorflow as tf
import numpy as np
import gym

  from ._conv import register_converters as _register_converters


In [2]:
# Build the CartPole-v0 environment and keep only the raw, unwrapped env
# object (the wrapper returned by gym.make is discarded).
env = gym.make('CartPole-v0').unwrapped

In [3]:
# Hyperparameters
BATCH_SIZE = 32             # number of stored transitions sampled per learn() call
LR = 0.01                   # learning rate passed to the Adam optimizer
EPSILON = 0.9               # probability of picking the greedy action; otherwise a random action is taken
GAMMA = 0.9                 # reward discount factor
TARGET_REPLACE_ITER = 100   # run the eval/target sync op every this many learn() calls
MEMORY_CAPACITY = 2000      # number of rows (transitions) in the replay memory array
N_ACTIONS = env.action_space.n              # size of the discrete action space reported by the env
N_STATES = env.observation_space.shape[0]   # length of one observation vector

In [4]:
def init_weight(shape):
    """Create a weight variable initialised from a normal distribution.

    Args:
        shape: shape of the variable, forwarded to ``tf.random_normal``.

    Returns:
        A ``tf.Variable`` whose initial value is sampled with stddev 0.02
        (mean is left at ``tf.random_normal``'s default).
    """
    initial_value = tf.random_normal(shape=shape, stddev=0.02)
    return tf.Variable(initial_value)
def init_bias(shape):
    """Create a bias variable initialised to all zeros.

    Args:
        shape: shape of the variable, forwarded to ``tf.zeros``.

    Returns:
        A ``tf.Variable`` of the given shape filled with zeros.
    """
    initial_value = tf.zeros(shape=shape)
    return tf.Variable(initial_value)

In [53]:
class Net():
    """Two-layer fully connected Q-network: N_STATES -> 50 (ReLU) -> N_ACTIONS.

    Attributes:
        hidden_dim: list with the width of the single hidden layer.
        w1, b1: weights / bias of the hidden layer.
        w2, b2: weights / bias of the output layer.
        var_list: dict mapping a stable, scope-independent key
            ("w1", "b1", "w2", "b2") to the corresponding ``tf.Variable``.
            Two ``Net`` instances always share the same keys, so their
            variables can be paired key by key.
    """

    def __init__(self, scope_name):
        """Create the network variables inside ``tf.variable_scope(scope_name)``.

        Args:
            scope_name: name of the variable scope the variables are created in.
        """
        self.hidden_dim = [50]
        with tf.variable_scope(scope_name):
            self.w1 = init_weight([N_STATES, self.hidden_dim[0]])
            self.b1 = init_bias([self.hidden_dim[0]])
            self.w2 = init_weight([self.hidden_dim[0], N_ACTIONS])
            self.b2 = init_bias([N_ACTIONS])

        # Build the mapping from the variables this instance just created
        # instead of querying the TRAINABLE_VARIABLES collection by scope:
        # the collection filter is a prefix match on the name, so it also
        # returns variables of sibling scopes such as "<scope_name>_1/..."
        # that appear when a cell is re-run without resetting the graph.
        self.var_list = {
            "w1": self.w1,
            "b1": self.b1,
            "w2": self.w2,
            "b2": self.b2,
        }

    def forward(self, X):
        """Build the forward pass for input tensor ``X``.

        Args:
            X: float tensor whose last dimension is N_STATES (it is
                matrix-multiplied with ``w1``).

        Returns:
            A tuple ``(out, var_list)`` where ``out`` holds one value per
            action for every row of ``X`` and ``var_list`` is this network's
            variable dict.
        """
        hidden = tf.nn.relu(tf.add(tf.matmul(X, self.w1), self.b1))
        # The output layer is linear: Q-values may be negative, and a ReLU
        # here would clamp them to >= 0 and zero the gradient for every
        # action whose pre-activation is negative.
        out = tf.add(tf.matmul(hidden, self.w2), self.b2)
        return out, self.var_list
    
    
    

In [None]:
class DQN():
    """Deep Q-Network agent with a replay memory and a separate target network.

    The whole TensorFlow graph, the session and the variable initialisation
    are created once in ``__init__``; ``choose_action`` and ``learn`` only run
    ops in that session.

    Attributes:
        eval_net / target_net: the online and the target ``Net``.
        memory: array of shape (MEMORY_CAPACITY, 2 * N_STATES + 2); each row
            is one transition laid out as ``[s, a, r, s_]``.
        memory_counter: total number of transitions stored so far.
        learn_step_counter: number of completed ``learn`` calls.
        sess: the ``tf.Session`` all ops are run in.
    """

    def __init__(self):
        self.eval_net = Net("q_eval")
        self.target_net = Net("q_target")
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))
        self.memory_counter = 0
        self.learn_step_counter = 0

        # ---- Define the graph ----
        self.X = tf.placeholder(tf.float32, [None, N_STATES])       # states s
        self.X_next = tf.placeholder(tf.float32, [None, N_STATES])  # next states s_
        self.A = tf.placeholder(tf.int32, [None])                   # actions taken
        self.R = tf.placeholder(tf.float32, [None])                 # rewards received

        self.eval_out, self.q_eval_var_dict = self.eval_net.forward(self.X)
        self.target_out, self.q_target_var_dict = self.target_net.forward(self.X_next)

        # Target-network sync: copy every eval variable INTO the matching
        # target variable (tf.assign(ref, value) writes value into ref).
        self.copy_op = [
            tf.assign(self.q_target_var_dict[var_name], eval_var)
            for var_name, eval_var in self.q_eval_var_dict.items()
        ]
        self.copy_eval_to_target = tf.group(*self.copy_op)

        # TD loss: (r + GAMMA * max_a' Q_target(s_, a') - Q_eval(s, a))^2.
        # Transitions carry no "done" flag, so terminal states are
        # bootstrapped like any other state.
        action_mask = tf.one_hot(self.A, N_ACTIONS, dtype=tf.float32)
        q_taken = tf.reduce_sum(self.eval_out * action_mask, axis=1)
        td_target = tf.stop_gradient(
            self.R + GAMMA * tf.reduce_max(self.target_out, axis=1))
        self.loss = tf.reduce_mean(tf.squared_difference(td_target, q_taken))

        # Only the eval network is trained; the target network changes
        # exclusively through the sync op above.
        self.step = tf.train.AdamOptimizer(learning_rate=LR).minimize(
            self.loss, var_list=list(self.q_eval_var_dict.values()))

        # Create the session and initialise variables exactly once, after the
        # optimizer has added its own variables to the graph. Doing this
        # inside learn() would reset all weights on every training step.
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        With probability EPSILON the action with the highest eval-network
        output is chosen, otherwise a uniformly random action.

        Args:
            state: a single observation of length N_STATES.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if np.random.uniform() < EPSILON:
            batch = np.asarray(state, dtype=np.float32).reshape(1, -1)
            action_value = self.sess.run(self.eval_out, feed_dict={self.X: batch})
            action = int(np.argmax(action_value, axis=1)[0])
        else:
            action = int(np.random.choice(N_ACTIONS))

        return action

    def store_transition(self, s, a, r, s_):
        """Store one transition, overwriting the oldest one when memory is full.

        Args:
            s: state before the action.
            a: action taken.
            r: reward received.
            s_: state after the action.
        """
        transition = np.hstack((s, a, r, s_))
        # Ring buffer: the write slot cycles through the rows of the memory.
        index = self.memory_counter % self.memory.shape[0]

        self.memory[index, :] = transition

        self.memory_counter += 1

    def learn(self):
        """Run one training step on a random batch from the replay memory.

        Returns:
            The loss of this step, or ``None`` if the memory is still empty
            and nothing was trained.
        """
        n_stored = min(self.memory_counter, self.memory.shape[0])
        if n_stored == 0:
            return None

        # Synchronise the target network with the eval network.
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.sess.run(self.copy_eval_to_target)
        self.learn_step_counter += 1

        # Randomly sample a batch of stored entries to train on. Only rows
        # that have actually been written are eligible.
        sample_index = np.random.choice(n_stored, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        # feed_dict values must be plain arrays, not tf.Tensor objects.
        b_s = b_memory[:, :N_STATES]
        b_a = b_memory[:, N_STATES].astype(np.int32)
        b_r = b_memory[:, N_STATES + 1]
        b_s_ = b_memory[:, -N_STATES:]

        loss, _ = self.sess.run(
            [self.loss, self.step],
            feed_dict={self.X: b_s, self.A: b_a, self.R: b_r, self.X_next: b_s_})
        return loss


In [49]:
# Net.__init__ requires a scope name; "hidden_weight" is the scope prefix
# visible in the variable names printed further down in this notebook.
q_net1 = Net("hidden_weight")

In [44]:
# Clear the default graph so that rebuilding the networks starts from an
# empty graph instead of adding duplicate scopes next to the old ones.
# NOTE(review): objects built before this call (e.g. q_net1) presumably
# still point at the old graph and should be re-created afterwards — confirm.
tf.reset_default_graph()

In [60]:
# Debug listing: for every trainable variable in the default graph print its
# name with the first 8 characters cut off, followed by the variable itself.
# The offset 8 is hard-coded and does not track the actual scope-name length.
trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
for variable in trainable_vars:
    name_tail = variable.name[8:]
    print(name_tail)
    print(variable)

eight/Variable:0
<tf.Variable 'hidden_weight/Variable:0' shape=(4, 50) dtype=float32_ref>
eight/Variable_1:0
<tf.Variable 'hidden_weight/Variable_1:0' shape=(50,) dtype=float32_ref>
eight/Variable_2:0
<tf.Variable 'hidden_weight/Variable_2:0' shape=(50, 2) dtype=float32_ref>
eight/Variable_3:0
<tf.Variable 'hidden_weight/Variable_3:0' shape=(2,) dtype=float32_ref>
eight_1/Variable:0
<tf.Variable 'hidden_weight_1/Variable:0' shape=(4, 50) dtype=float32_ref>
eight_1/Variable_1:0
<tf.Variable 'hidden_weight_1/Variable_1:0' shape=(50,) dtype=float32_ref>
eight_1/Variable_2:0
<tf.Variable 'hidden_weight_1/Variable_2:0' shape=(50, 2) dtype=float32_ref>
eight_1/Variable_3:0
<tf.Variable 'hidden_weight_1/Variable_3:0' shape=(2,) dtype=float32_ref>


In [52]:
# Probe of the `scope` filter: it matches on the beginning of the variable
# name, so "hidden_weight/Variable" selects Variable, Variable_1, Variable_2
# and Variable_3 of that scope (see the displayed result), not just one.
tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,scope="hidden_weight/Variable")

[<tf.Variable 'hidden_weight/Variable:0' shape=(4, 50) dtype=float32_ref>,
 <tf.Variable 'hidden_weight/Variable_1:0' shape=(50,) dtype=float32_ref>,
 <tf.Variable 'hidden_weight/Variable_2:0' shape=(50, 2) dtype=float32_ref>,
 <tf.Variable 'hidden_weight/Variable_3:0' shape=(2,) dtype=float32_ref>]

In [6]:
# Number of iterations of the outer training loop (one env.reset() each).
n_epochs = 200

In [None]:
# Recreate the CartPole-v0 environment for the training loop and keep only
# the raw, unwrapped env object.
env = gym.make('CartPole-v0').unwrapped

for i in range(n_epochs):
    s = env.reset()
    
    while True:
        action = dqn.choose_action(s)
        
        