In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque
import gym
import matplotlib.pyplot as plt

In [2]:
env = gym.make('Pendulum-v0')

num_actions = 16
min_act = -2
max_act = 2
dis = 0.99
REPLAY_MEMORY = 10000
batch_size = 256
alpha = 0.6
beta_init = 0.4
eps = 0.01

input_size = env.observation_space.shape[0]
output_size = num_actions

[2018-02-09 03:20:53,741] Making new env: Pendulum-v0


In [3]:
class DQN :
    def __init__(self, session, input_size, output_size, name="main") :
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        self._build_network()
        
    def _build_network(self, h_size=64, l_rate=0.01) :
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")
            
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.relu(tf.matmul(self._X, W1))
            
            W2 = tf.get_variable("W2", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            W3 = tf.get_variable("W3", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer2 = tf.nn.relu(tf.matmul(layer1, W2))
            layer3 = tf.nn.relu(tf.matmul(layer1, W3))
            
            W_V = tf.get_variable("W_V", shape=[h_size, 1],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            W_A = tf.get_variable("W_A", shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            self.Value = tf.matmul(layer2, W_V)
            self.Advantage = tf.matmul(layer3, W_A)
            
            self._Qpred = self.Value + self.Advantage - tf.reduce_mean(self.Advantage,
                                                                       reduction_indices=1,keep_dims=True)
        
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
        
        self._WIS = tf.placeholder(shape=[1, 1], dtype=tf.float32)
        #self._WIS = tf.placeholder(shape=[1, self.output_size], dtype=tf.float32)
        
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
        #self._loss = tf.reduce_mean(tf.multiply(self._WIS, tf.square(self._Y - self._Qpred)))
        #self._loss = tf.reduce_mean(self._WIS * tf.square(self._Y - self._Qpred))
        
        self._train = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self._loss)
    
    def predict(self, state):
        x = np.reshape(state, [1,self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X : x})
    
    def update(self, x_stack, y_stack, w_stack):
        return self.session.run([self._loss, self._train],
                                feed_dict={self._X : x_stack, self._Y : y_stack, self._WIS : w_stack})

In [4]:
def replay_train (mainDQN, targetDQN, train_batch, w_batch) :
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    w_stack = np.empty(0).reshape(0, 0)
    
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        
        if done :
            Q[0,action] = reward
        else :
            action0 = np.argmax(mainDQN.predict(next_state))
            Q[0,action] = reward + dis * (targetDQN.predict(next_state)[0, action0])
    
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
        
    for w in w_batch:
        w_stack = np.vstack([w])
        
    return mainDQN.update(x_stack, y_stack, w_stack)

In [5]:
def dec_action(a):
    return min_act + a * (max_act - min_act) / (num_actions - 1)

In [6]:
def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = src_scope_name)
    
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = dest_scope_name)
    
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    
    return op_holder

In [7]:
def bot_play(mainDQN) :
    s = env.reset()
    reward_sum = 0
    while True :
        env.render()
        a = np.argmax(mainDQN.predict(s))
        s,reward,done,_ = env.step(a)
        reward_sum += reward
        
        if done :
            print ("Total score : {}".format(reward_sum))
            break

In [8]:
def main():
    max_episodes = 500
    end_episode = 0
    step_count_total = 0
    beta = beta_init
    
    replay_buffer = deque()
    TD_error_list = []
    steps_list = []
    step_avg_list = []
    
    with tf.Session() as sess :
        mainDQN = DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN(sess, input_size, output_size, name="target")
        
        tf.initialize_all_variables().run()
        copy_ops = get_copy_var_ops(dest_scope_name = "target",
                                                src_scope_name = "main")
        sess.run(copy_ops)
    
        for episode in range(1, max_episodes):
            e = 1. / (((episode - 1) / 5) + 1)
            done = False
            step_count = 0
            TD_error = 0
            state = env.reset()
            
            while not done:
                if np.random.rand(1) < e:
                    action = random.randrange(num_actions)
                else:
                    action = np.argmax(mainDQN.predict(state))
                
                action_1 = dec_action(action)        
                
                next_state, reward, done, _ = env.step([action_1])
                step_count += reward
                
                if done:
                    #if step_count < 200:
                    #    reward = -100
                    TD_error = reward
                else:
                    action0 = np.argmax(mainDQN.predict(next_state))
                    TD_error = reward + dis * (targetDQN.predict(next_state)[0, action0])
                    
                TD_error -= np.max(mainDQN.predict(state))
                TD_error = pow((abs(TD_error) + eps), alpha)
                TD_error_list.append(TD_error)
                
                if beta < 1:
                    beta +=(1 - beta_init)/REPLAY_MEMORY
                
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                
                state = next_state
                
                sample = 0
                if len(replay_buffer) < batch_size:
                    sample = len(replay_buffer)
                else:
                    sample = batch_size

                TD_copy = []
                TD_norm_list = []
                TD_accum_list = []
                W_is_list = []

                start = 0
                len_TD = len(TD_error_list)
                if(len_TD > REPLAY_MEMORY):
                    start = len_TD - REPLAY_MEMORY
                    TD_copy = TD_error_list[start : len_TD]
                    len_TD = REPLAY_MEMORY
                else:
                    TD_copy = TD_error_list[:]

                sum_TD = sum(TD_copy)
                TD_norm_list = [TD_copy[i] / sum_TD for i in range(len_TD)]
                TD_accum_list = np.cumsum(TD_norm_list)
                W_is_list = np.ones([len(TD_accum_list)])

                minibatch = []
                w_batch = []
                TDT = np.zeros([len(TD_accum_list)])
                for i in range(sample):
                    check = True
                    while check:
                        rand_batch = random.random()
                        TD_index = np.nonzero(TD_accum_list >= rand_batch)[0][0]
                        if TDT[TD_index] == 0:
                            TDT[TD_index] = 1
                            check = False

                    w_batch.append(W_is_list[TD_index])
                    minibatch.append(replay_buffer[TD_index])

                loss, _ = replay_train(mainDQN, targetDQN, minibatch, w_batch)

                #print ("Loss :  ", loss)
                sess.run(copy_ops)
                
            print("episode: {}   steps: {}".format(episode, step_count))
            steps_list.append(step_count)
            
            if episode < 100:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / episode)
                
            if episode == 100:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / 100)
                #print ("Step Average 100:  ", step_avg_list[episode - 1])
                
            if episode > 100:
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 101]
                step_avg_list.append(step_count_total / 100)
                #print ("Step Average 100:  ", step_avg_list[episode - 1])
             
            print("{}           {}".format(episode, step_avg_list[episode - 1]))
            end_episode += 1
            if step_avg_list[episode - 1] > 195:
                break
        
        print("--------------------------------------------------")
        print("--------------------------------------------------")
        for episode in range(end_episode + 1, max_episodes):
            s = env.reset()
            reward_sum = 0
            while True :
                #env.render()
                a = np.argmax(mainDQN.predict(s))
                s,reward,done,_ = env.step(a)
                reward_sum += reward
        
                if done :
                    #print("episode: {}   steps: {}".format(episode, reward_sum))
                    steps_list.append(reward_sum)
                    step_count_total += steps_list[episode - 1]
                    step_count_total -= steps_list[episode - 101]
                    step_avg_list.append(step_count_total / 100)
                    print("{}           {}".format(episode, step_avg_list[episode - 1]))
                    break
        
        x_values = list(range(1, max_episodes))
        y_values = step_avg_list[:]
        plt.plot(x_values, y_values, c='green')
        plt.grid(True)
        plt.show()

In [9]:
main()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2018-02-09 03:20:55,441] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


episode: 1   steps: -1675.56445292
1           -1675.56445292
episode: 2   steps: -1181.95429484
2           -1428.75937388
episode: 3   steps: -1731.03560191
3           -1529.51811656
episode: 4   steps: -1240.69784746
4           -1457.31304928
episode: 5   steps: -1024.75046692
5           -1370.80053281
episode: 6   steps: -1404.27443015
6           -1376.3795157
episode: 7   steps: -1312.4491193
7           -1367.24660193
episode: 8   steps: -1254.46543082
8           -1353.14895554
episode: 9   steps: -1623.96204066
9           -1383.23929833
episode: 10   steps: -1451.68372356
10           -1390.08374085
episode: 11   steps: -1369.91538574
11           -1388.25025403
episode: 12   steps: -1374.86145556
12           -1387.13452082
episode: 13   steps: -1388.34611994
13           -1387.22772075
episode: 14   steps: -1329.97654389
14           -1383.13835098
episode: 15   steps: -1425.4714365
15           -1385.96055668
episode: 16   steps: -1353.33420403
16           -1383.921409

episode: 129   steps: -1521.16299949
129           -1389.34052078
episode: 130   steps: -1350.03560642
130           -1386.71694802
episode: 131   steps: -1316.65079751
131           -1383.6939426
episode: 132   steps: -919.831048009
132           -1374.67362951
episode: 133   steps: -1387.42033224
133           -1371.33702524
episode: 134   steps: -1348.38554578
134           -1366.67291646
episode: 135   steps: -1265.60317985
135           -1360.15614573
episode: 136   steps: -1428.00648045
136           -1356.28681091
episode: 137   steps: -1446.14260704
137           -1352.0966527
episode: 138   steps: -1360.24781188
138           -1347.20922455
episode: 139   steps: -1419.5061172
139           -1345.13875945
episode: 140   steps: -1337.78126534
140           -1347.96994859
episode: 141   steps: -1288.38566352
141           -1346.31472534
episode: 142   steps: -1310.99479639
142           -1344.74566196
episode: 143   steps: -1169.5831687
143           -1341.05058288
episode: 144  

episode: 254   steps: -1174.33441569
254           -1210.20981643
episode: 255   steps: -1135.22789187
255           -1207.86582286
episode: 256   steps: -1135.85162148
256           -1206.38451708
episode: 257   steps: -1188.06607091
257           -1206.55166416
episode: 258   steps: -1190.17190027
258           -1206.23169255
episode: 259   steps: -1144.28603429
259           -1202.80677492
episode: 260   steps: -1145.43830253
260           -1202.7736265
episode: 261   steps: -1208.60478896
261           -1205.48443231
episode: 262   steps: -1343.45308309
262           -1208.96386508
episode: 263   steps: -1209.97857303
263           -1204.47050302
episode: 264   steps: -1344.1260714
264           -1202.41864186


KeyboardInterrupt: 