In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque
import gym
import matplotlib.pyplot as plt

In [2]:
env = gym.make('Pendulum-v0')

num_actions = 16
min_act = -2
max_act = 2
dis = 0.99
REPLAY_MEMORY = 10000
batch_size = 256
alpha = 0.6
beta_init = 0.4
eps = 0.01

input_size = env.observation_space.shape[0]
output_size = num_actions

[2018-02-09 02:12:50,087] Making new env: Pendulum-v0


In [3]:
class DQN :
    def __init__(self, session, input_size, output_size, name="main") :
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        self._build_network()
        
    def _build_network(self, h_size=64, l_rate=0.01) :
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")
            
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.relu(tf.matmul(self._X, W1))
            
            W2 = tf.get_variable("W2", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            W3 = tf.get_variable("W3", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer2 = tf.nn.relu(tf.matmul(layer1, W2))
            layer3 = tf.nn.relu(tf.matmul(layer1, W3))
            
            W_V = tf.get_variable("W_V", shape=[h_size, 1],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            W_A = tf.get_variable("W_A", shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            self.Value = tf.matmul(layer2, W_V)
            self.Advantage = tf.matmul(layer3, W_A)
            
            self._Qpred = self.Value + self.Advantage - tf.reduce_mean(self.Advantage,
                                                                       reduction_indices=1,keep_dims=True)
        
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
        
        self._WIS = tf.placeholder(shape=[1, 1], dtype=tf.float32)
        #self._WIS = tf.placeholder(shape=[1, self.output_size], dtype=tf.float32)
        
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
        #self._loss = tf.reduce_mean(tf.multiply(self._WIS, tf.square(self._Y - self._Qpred)))
        #self._loss = tf.reduce_mean(self._WIS * tf.square(self._Y - self._Qpred))
        
        self._train = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self._loss)
    
    def predict(self, state):
        x = np.reshape(state, [1,self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X : x})
    
    def update(self, x_stack, y_stack, w_stack):
        return self.session.run([self._loss, self._train],
                                feed_dict={self._X : x_stack, self._Y : y_stack, self._WIS : w_stack})

In [4]:
def softmax(x, scale = 1):
    x = np.array(x)/scale
    max_x = np.max(x)
    e_x = np.exp(x - max_x)
    p = e_x/e_x.sum()
    p = p/p.sum()
    return p

def softV(x, scale = 1):
    x = np.array(x)/scale
    max_x = np.max(x)
    e_x = np.exp(x - max_x)
    e_sum = e_x.sum()
    e_sum = scale * (np.log(e_sum) + max_x)
    return e_sum

In [5]:
def dec_action(a):
    return min_act + a * (max_act - min_act) / (num_actions - 1)

In [6]:
def replay_train (mainDQN, targetDQN, train_batch, w_batch) :
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    w_stack = np.empty(0).reshape(0, 0)
    
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        
        if done :
            Q[0,action] = reward
        else :
            action_mat = targetDQN.predict(next_state)
            action_V = softV(action_mat[0])
            Q[0,action] = reward + dis * action_V
            
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
        
    for w in w_batch:
        w_stack = np.vstack([w])
        
    return mainDQN.update(x_stack, y_stack, w_stack)

In [7]:
def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = src_scope_name)
    
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = dest_scope_name)
    
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    
    return op_holder

In [8]:
def bot_play(mainDQN) :
    s = env.reset()
    reward_sum = 0
    while True :
        env.render()
        a = np.argmax(mainDQN.predict(s))
        s,reward,done,_ = env.step(a)
        reward_sum += reward
        
        if done :
            print ("Total score : {}".format(reward_sum))
            break

In [9]:
def main():
    max_episodes = 5000
    end_episode = 0
    step_count_total = 0
    beta = beta_init
    
    replay_buffer = deque()
    TD_error_list = []
    steps_list = []
    step_avg_list = []
    
    with tf.Session() as sess :
        mainDQN = DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN(sess, input_size, output_size, name="target")
        
        tf.initialize_all_variables().run()
        copy_ops = get_copy_var_ops(dest_scope_name = "target",
                                                src_scope_name = "main")
        sess.run(copy_ops)
    
        for episode in range(1, max_episodes):
            done = False
            step_count = 0
            TD_error = 0
            state = env.reset()
            
            while not done:
                action_mat = mainDQN.predict(state)
                action_max = softmax(action_mat[0])
                action_max = np.cumsum(action_max)
                
                rand_batch = random.random()
                AC_index = np.nonzero(action_max >= rand_batch)[0][0]
                action = dec_action(AC_index)        
                
                next_state, reward, done, _ = env.step([action])
                step_count += reward
                
                if done:
                    #if step_count < 200:
                    #    reward = -100
                    TD_error = reward
                else:
                    action_mat = targetDQN.predict(next_state)
                    action_V = softV(action_mat[0])
                    TD_error = reward + dis * action_V
                    
                TD_error -= np.max(mainDQN.predict(state))
                TD_error = pow((abs(TD_error) + eps), alpha)
                TD_error_list.append(TD_error)
                
                if beta < 1:
                    beta +=(1 - beta_init)/REPLAY_MEMORY
                
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                
                state = next_state
                
            print("episode: {}   steps: {}".format(episode, step_count))
            steps_list.append(step_count)
            
            if episode < 100:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / episode)
                
            if episode == 100:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / 100)
                #print ("Step Average 100:  ", step_avg_list[episode - 1])
                
            if episode > 100:
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 101]
                step_avg_list.append(step_count_total / 100)
                #print ("Step Average 100:  ", step_avg_list[episode - 1])
            
            sample = 0
            if len(replay_buffer) < batch_size:
                sample = len(replay_buffer)
            else:
                sample = batch_size
            
            TD_copy = []
            TD_norm_list = []
            TD_accum_list = []
            W_is_list = []
                
            start = 0
            len_TD = len(TD_error_list)
            if(len_TD > REPLAY_MEMORY):
                start = len_TD - REPLAY_MEMORY
                TD_copy = TD_error_list[start : len_TD]
                len_TD = REPLAY_MEMORY
            else:
                TD_copy = TD_error_list[:]
                
            sum_TD = sum(TD_copy)
            TD_norm_list = [TD_copy[i] / sum_TD for i in range(len_TD)]
            TD_accum_list = np.cumsum(TD_norm_list)
                
            #W_is_list = [np.power((REPLAY_MEMORY * TD_norm_list[i]), -beta) for i in range(len_TD)]
            #maxW = np.max(W_is_list)
            #W_is_list = [W_is_list[i] / maxW for i in range(len_TD)]
                
            W_is_list = np.ones([len(TD_accum_list)])
                              
            minibatch = []
            w_batch = []
                
            TDT = np.zeros([len(TD_accum_list)])
            for i in range(sample):
                check = True
                while check:
                    rand_batch = random.random()
                    TD_index = np.nonzero(TD_accum_list >= rand_batch)[0][0]
                    if TDT[TD_index] == 0:
                        TDT[TD_index] = 1
                        check = False
                    
                w_batch.append(W_is_list[TD_index])
                minibatch.append(replay_buffer[TD_index])
                    
            loss, _ = replay_train(mainDQN, targetDQN, minibatch, w_batch)
                
            #print ("Loss :  ", loss)
            sess.run(copy_ops)
                
            print("{}           {}".format(episode, step_avg_list[episode - 1]))
            end_episode += 1
            if step_avg_list[episode - 1] > 195:
                break
        
        print("--------------------------------------------------")
        print("--------------------------------------------------")
        for episode in range(end_episode + 1, max_episodes):
            s = env.reset()
            reward_sum = 0
            while True :
                #env.render()
                a = np.argmax(mainDQN.predict(s))
                s,reward,done,_ = env.step(a)
                reward_sum += reward
        
                if done :
                    #print("episode: {}   steps: {}".format(episode, reward_sum))
                    steps_list.append(reward_sum)
                    step_count_total += steps_list[episode - 1]
                    step_count_total -= steps_list[episode - 101]
                    step_avg_list.append(step_count_total / 100)
                    print("{}           {}".format(episode, step_avg_list[episode - 1]))
                    break
        
        x_values = list(range(1, max_episodes))
        y_values = step_avg_list[:]
        plt.plot(x_values, y_values, c='green')
        plt.grid(True)
        plt.show()

In [10]:
main()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2018-02-09 02:12:51,900] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


episode: 1   steps: -957.776438553
1           -957.776438553
episode: 2   steps: -1006.35928089
2           -982.067859723
episode: 3   steps: -1586.42510377
3           -1183.5202744
episode: 4   steps: -1654.82377789
4           -1301.34615028
episode: 5   steps: -1350.69397943
5           -1311.21571611
episode: 6   steps: -1502.9998376
6           -1343.17973636
episode: 7   steps: -1301.41972172
7           -1337.21401998
episode: 8   steps: -1686.14067334
8           -1380.82985165
episode: 9   steps: -1135.94851345
9           -1353.62081407
episode: 10   steps: -1101.73227983
10           -1328.43196065
episode: 11   steps: -1465.5747279
11           -1340.89948494
episode: 12   steps: -1169.91784559
12           -1326.651015
episode: 13   steps: -1321.55110407
13           -1326.25871416
episode: 14   steps: -1652.02993006
14           -1349.52808672
episode: 15   steps: -1167.13322859
15           -1337.36842951
episode: 16   steps: -1315.35632834
16           -1335.99267319

episode: 129   steps: -1614.84478279
129           -1299.52677562
episode: 130   steps: -1123.32586206
130           -1299.07625516
episode: 131   steps: -1113.11368128
131           -1292.3991467
episode: 132   steps: -1300.56018164
132           -1289.14738395
episode: 133   steps: -1505.54772368
133           -1289.71690789
episode: 134   steps: -1468.13683743
134           -1291.52339455
episode: 135   steps: -1268.08684014
135           -1288.26241346
episode: 136   steps: -1653.20731676
136           -1290.50055237
episode: 137   steps: -1166.96475795
137           -1284.8848954
episode: 138   steps: -1076.48482739
138           -1284.44646446
episode: 139   steps: -1589.9268048
139           -1283.98689752
episode: 140   steps: -1345.04537374
140           -1284.83187347
episode: 141   steps: -1030.7798563
141           -1284.91393155
episode: 142   steps: -1518.22135198
142           -1284.98516812
episode: 143   steps: -1777.08602959
143           -1292.38585677
episode: 144  

253           -1291.53342858
episode: 254   steps: -1260.39030823
254           -1291.24422394
episode: 255   steps: -1149.06473929
255           -1286.24952528
episode: 256   steps: -1726.30440646
256           -1292.97888577
episode: 257   steps: -1104.49482479
257           -1292.32314964
episode: 258   steps: -1189.61852221
258           -1290.646037
episode: 259   steps: -1168.50746582
259           -1290.87773189
episode: 260   steps: -1006.74531289


KeyboardInterrupt: 