In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import gym

  from ._conv import register_converters as _register_converters


In [2]:
tf.reset_default_graph()

In [3]:
class Policy_Gradient:
    
    def __init__(
        self, 
        n_actions, 
        n_states, 
        gamma = 0.9, #遠見程度
        epsilon = None,  #保守程度，越大就越容易用Q值大小來採取行動；越小則越容易產生隨機行動
        epsilon_increase = None,
        learning_rate = 0.001, #神經網路的更新率
        #memory_size = 50, #####
        #batch_size = 32, #####
        nueron_num = 10
    ):
    
        self.n_actions = n_actions
        self.n_states = n_states
        self.gamma = gamma
        #self.epsilon_max = epsilon #####
        #self.epsilon_increase = epsilon_increase #####
        #self.epsilon = 0 if epsilon_increase is not None else epsilon #####
        self.lr = learning_rate
        #self.memory_size = memory_size #####
        #self.memory_counter = 0 #####
        #self.batch_size = batch_size ####
        self.nueron_num = nueron_num
        
        ##### initialize memory
        """
        
        """
        self.past_state, self.past_action, self.past_reward = [], [], []
        self.action_one_hot = np.zeros(self.n_actions, dtype=np.int32)

        """

        """
        
        tf.reset_default_graph() ## 重新 build graph 需要跑這行
        self.sess = tf.Session() #宣告session
        #輸入current state
        self.state_input = tf.placeholder(shape = [None, self.n_states], 
                                          name = 'state_input',
                                          dtype = tf.float32)
        """
        輸入real action和神經網路的output act_proba算cross entropy當作更新方向
        以超級瑪莉的遊戲為例 action = [上, 下, 左, 右] 如果實際action為向左則
        action = [0, 0, 1, 0]。
        也可以將這四個動作用0, 1, 2 ,3代表，如此的話只需要用一維來存取動作，也就是輸
        入shape = [None, 1]，那後面再算cross entropy的話就要用tf.nn.sparse_
        softmax_cross_entropy_with_logits，大家也可以試著改寫看看。
        """    
        self.real_action = tf.placeholder(shape = [None, self.n_actions], 
                                          name = 'real_action',
                                          dtype = tf.float32)
        """
        但是有時候產生的動作會帶來好的效果或壞的效果並且程度不一，因此loss不能光用神經網路的
        輸出action_proba和real action的cross entropy代表，因此這邊乘上action_reward
        來校正loss。例如某個動作很有幫助那必然會產生很大的action_reward，因此乘上很大的
        action_reward即可加大loss讓此動作之後產生的機率被放大；相反的，某個動作如果產生很
        好的效果反而會帶來負的action_reward使得loss變負的，讓更新方向相反使得之後輸出此動
        作的機會減少。
        """
        self.Vt = tf.placeholder(shape= [None, ], 
                                            name="Vt",
                                            dtype = tf.float32)
        #搭建神經網路
        with tf.variable_scope('Actor'):
            self.act_proba = self.build_network(self.nueron_num, Trainable = True, \
                             scope = 'net_eval') 
            
        
        #管理神經網路的parameters
        self.Actor_eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/net_eval')
        
        
        #loss
        """
        算出 “神經網路輸出的動作機率”與 “實際動作”的cross entropy當作loss，但是更新的方向和力道就利用action
        reward來決定。例如，這一回合產生的所有動作組合如果得到很好的reward，那就應該讓神經網路的輸出機率更靠近實
        際輸出的結果，因此cross_entropy和action_reward相乘的到的loss就更大，更新力度就更大。相反的，這一回
        合產生的所有動作組合如果得到負的reward，那就應該讓神經網路輸出動作的機率更遠離實際輸出結果，在這樣的狀況
        下，cross_entropy和action_reward相乘的到的loss就會得到負的，神經網路的參數更新方向就會往反方向。
        """
        self.cross_entropy = tf.reduce_sum(-tf.log(self.act_proba)*self.real_action, axis=1)
        self.loss = tf.reduce_sum(self.cross_entropy*self.Vt)
 
        
        self.train = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss, var_list=self.Actor_eval_params)
    
        self.sess.run(tf.global_variables_initializer()) #將神經網路初始化
    
    def write_memory(self, current_state, reward, action): #####
        
        ## past_action 要存one hot
        action_one_hot = self.action_one_hot.copy()
        action_one_hot[action]=1
        
        self.past_state.append(current_state)
        self.past_action.append(action_one_hot)
        self.past_reward.append(reward)
    
    
    def build_network(self, neuron_num, Trainable, scope): 
         with tf.variable_scope(scope):
            init_w , init_b = tf.random_normal_initializer(0.,0.2) , tf.constant_initializer(0.1)
            
            fc1 = tf.layers.dense(self.state_input , neuron_num , activation=tf.nn.tanh , \
                                  kernel_initializer = init_w , bias_initializer = init_b ,trainable = Trainable)
            
            fc2 = tf.layers.dense(fc1 , neuron_num , activation=tf.nn.tanh , \
                                  kernel_initializer = init_w , bias_initializer = init_b ,trainable = Trainable)
 
            output = tf.layers.dense(inputs = fc2, units = self.n_actions, \
                activation = tf.nn.softmax, kernel_initializer=init_w, \
                bias_initializer=init_b, trainable=Trainable)
        
         return output
    
    
    def choose_action(self , current_state):
        act_prob = self.sess.run(self.act_proba, feed_dict={self.state_input : current_state[np.newaxis,:]})
        
   
        act = np.random.choice(range(act_prob.shape[1]) , p = np.reshape(act_prob,-1) )
        
        return act
        
    
    def learn(self): #####
        vt = self.calculate_Vt(self.past_reward)

        self.sess.run(self.train,feed_dict={
             self.state_input: self.past_state,  # shape=[None, n_state]
             self.real_action: self.past_action,  # shape=[None, n_actions]
             self.Vt: vt  # shape=[None, ]
        })
        
        #清空memory entry
        self.past_state , self.past_action , self.past_reward = [],[],[]
        
        
    
    def calculate_Vt(self,reward):
        
        vt = np.zeros_like(reward)
        reward_temp=0
        for i in reversed(range(len(reward))):
            reward_temp = reward[i] + reward_temp* self.gamma
            vt[i] = reward_temp
            
        ## normalize reward
        vt -= np.mean(vt)
        vt /= np.std(vt)
        
        
        return vt
        
    def model_save(self, model_name):
        
        saver = tf.train.Saver()
        saver.save(self.sess, "saved_models/{}.ckpt".format(model_name))
    
    def model_restore(self, model_name):
        
        saver = tf.train.Saver()
        saver.restore(self.sess, "saved_models/{}.ckpt".format(model_name))

In [4]:
def training(save_model, model_name):
    step_record = []
    #dead_record = []
    for episode in range(200):
        # initial environment並給出起始的state
        current_state = env.reset()
        step = 0
        total_reward = 0
        while True:
            # 產生環境視窗
            env.render()

            # 根據現在的狀態選擇動作
            action = RL.choose_action(current_state)

            # 產生動作和環境互動後產生下一個狀態、獎勵值及遊戲是否結束
            next_state, reward, done, _ = env.step(action)
            
            total_reward+= reward
            
            
            # 將資訊存至記憶體中以便進行experience replay
            RL.write_memory(current_state, reward, action)
            
            #if reward < 0 :
            #    dead+=1
            # swap state
            current_state = next_state

            # break while loop when end of this episode
            if done:
                RL.learn()
                print('episode:{} steps:{} total_reward:{}'.format(episode, step, total_reward))
                step_record.append(step)
                break
            step += 1

    # end of game
    if save_model:
        RL.model_save(model_name)
    print('game over')
    env.close()
    return step_record

In [5]:
step_result = []
env = gym.make('CartPole-v0')
env.seed(1)     # reproducible, general Policy gradient has high variance
env = env.unwrapped
RL = Policy_Gradient(n_actions = 2, 
                   n_states = 4,
                   gamma = 0.99,
                   learning_rate = 0.01,
                 )
step_record = training(save_model = True, model_name='PG_try')
step_result.append(pd.DataFrame(data = step_record))

episode:0 steps:18 total_reward:19.0
episode:1 steps:41 total_reward:42.0
episode:2 steps:15 total_reward:16.0
episode:3 steps:28 total_reward:29.0
episode:4 steps:36 total_reward:37.0
episode:5 steps:26 total_reward:27.0
episode:6 steps:56 total_reward:57.0
episode:7 steps:22 total_reward:23.0
episode:8 steps:15 total_reward:16.0
episode:9 steps:9 total_reward:10.0
episode:10 steps:12 total_reward:13.0
episode:11 steps:13 total_reward:14.0
episode:12 steps:26 total_reward:27.0
episode:13 steps:12 total_reward:13.0
episode:14 steps:19 total_reward:20.0
episode:15 steps:40 total_reward:41.0
episode:16 steps:49 total_reward:50.0
episode:17 steps:23 total_reward:24.0
episode:18 steps:10 total_reward:11.0
episode:19 steps:19 total_reward:20.0
episode:20 steps:19 total_reward:20.0
episode:21 steps:17 total_reward:18.0
episode:22 steps:38 total_reward:39.0
episode:23 steps:16 total_reward:17.0
episode:24 steps:9 total_reward:10.0
episode:25 steps:10 total_reward:11.0
episode:26 steps:10 tota

KeyboardInterrupt: 

In [7]:
reward_result = pd.DataFrame(reward_record)
reward_result.columns = ['Policy Gradient']
reward_result.plot()
plt.xlabel('episode')
plt.ylabel('Total Reward')
plt.show()