In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque
import gym

In [2]:
env = gym.make('CartPole-v0')

input_size = env.observation_space.shape[0]
output_size = env.action_space.n

dis = 0.99
REPLAY_MEMORY = 50000
batch_size = 256
alpha = 0.6
beta_init = 0.4
eps = 0.01

[2018-01-31 20:02:18,555] Making new env: CartPole-v0


In [3]:
class DQN :
    def __init__(self, session, input_size, output_size, name="main") :
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        self._build_network()
        
    def _build_network(self, h_size=32, l_rate=0.05) :
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")
            
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.relu(tf.matmul(self._X, W1))
            
            W2 = tf.get_variable("W2", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            W3 = tf.get_variable("W3", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer2 = tf.nn.relu(tf.matmul(layer1, W2))
            layer3 = tf.nn.relu(tf.matmul(layer1, W3))
            
            W_V = tf.get_variable("W_V", shape=[h_size, 1],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            W_A = tf.get_variable("W_A", shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            self.Value = tf.matmul(layer2, W_V)
            self.Advantage = tf.matmul(layer3, W_A)
            
            self._Qpred = self.Value + self.Advantage - tf.reduce_mean(self.Advantage,
                                                                       reduction_indices=1,keep_dims=True)
    
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
        
        self._WIS = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        
        #self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
        self._loss = tf.reduce_sum(tf.multiply(self._WIS, tf.square(self._Y - self._Qpred)))
        
        self._train = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self._loss)
    
    def predict(self, state):
        x = np.reshape(state, [1,self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X : x})
    
    def update(self, x_stack, y_stack, w_stack):
        return self.session.run([self._loss, self._train],
                                feed_dict={self._X : x_stack, self._Y : y_stack, self._WIS : w_stack})

In [4]:
def replay_train (mainDQN, targetDQN, train_batch, w_batch) :
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    w_stack = np.empty(0).reshape(0, 0)
    
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        
        if done :
            Q[0,action] = reward
        else :
            action0 = np.argmax(mainDQN.predict(next_state))
            Q[0,action] = reward + dis * (targetDQN.predict(next_state)[0, action0])
    
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
        
    for w in w_batch:
        w_stack = np.vstack([w])
        
    return mainDQN.update(x_stack, y_stack, w_stack)

In [5]:
def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = src_scope_name)
    
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = dest_scope_name)
    
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    
    return op_holder

In [6]:
def bot_play(mainDQN) :
    s = env.reset()
    reward_sum = 0
    while True :
        env.render()
        a = np.argmax(mainDQN.predict(s))
        s,reward,done,_ = env.step(a)
        reward_sum += reward
        
        if done :
            print ("Total score : {}".format(reward_sum))
            break

In [7]:
def main():
    
    max_episodes = 1000
    step_count_total = 0
    beta = beta_init
    
    replay_buffer = deque()
    TD_error_list = []
    steps_list = []
    
    with tf.Session() as sess :
        mainDQN = DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN(sess, input_size, output_size, name="target")
        
        tf.initialize_all_variables().run()
        
        
        copy_ops = get_copy_var_ops(dest_scope_name = "target",
                                                src_scope_name = "main")
        sess.run(copy_ops)
    
        for episode in range(1, max_episodes):
            e = 1. / (((episode - 1) / 20) + 1)
            done = False
            step_count = 0
            TD_error = 0
            state = env.reset()
            
            while not done:
                #env.render()
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))
                
                next_state, reward, done, _ = env.step(action)
                step_count += 1
                step_count_total += 1
                
                if done:
                    if step_count < 200:
                        reward = -100
                    TD_error = reward
                else:
                    action0 = np.argmax(mainDQN.predict(next_state))
                    TD_error = reward + dis * (targetDQN.predict(next_state)[0, action0])
                
                TD_error -= np.max(mainDQN.predict(state))
                TD_error = pow((abs(TD_error) + eps), alpha)
                TD_error_list.append(TD_error)
                
                if beta < 1:
                    beta +=(1 - beta_init)/REPLAY_MEMORY
                
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                
                state = next_state
                
                #if step_count_total % 100
                
            print("episode: {}   steps: {}".format(episode, step_count))
            steps_list.append(step_count)
            
            if episode >= 10:
                TD_norm_list = []
                TD_accum_list = []
                W_is_list = []
                
                start = 0
                len_TD = len(TD_error_list)
                if(len_TD > REPLAY_MEMORY):
                    start = len_TD - REPLAY_MEMORY
                    TD_error_list = TD_error_list[start : len_TD]
                
                TD_norm_list = TD_error_list / sum(TD_error_list)
                TD_accum_list = np.cumsum(TD_norm_list)
                W_is_list = np.power((REPLAY_MEMORY * TD_norm_list), -beta)
                W_is_list = W_is_list / np.max(W_is_list)
                
                for i in range(batch_size):
                    minibatch = []
                    w_batch = []
                    
                    rand_batch = random.random()
                    TD_index = np.nonzero(TD_accum_list >= rand_batch)[0][0]
                    w_batch.append(W_is_list[TD_index])
                    minibatch.append(replay_buffer[TD_index])
                
                loss, _ = replay_train(mainDQN, targetDQN, minibatch, w_batch)
                
                print ("Loss :  ", loss)
                sess.run(copy_ops)
            
            if episode == 100:
                for i in range(0, 100):
                    step_count_total += steps_list[episode - 1 - i]
                print ("Step Average 100:  ", step_count_total/100)
            
            if episode > 100:
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 101]
                print ("Step Average 100:  ", step_count_total/100)
            
                if step_count_total/100 > 195:
                    break
            
        print ("Simulation--------")
        bot_play(mainDQN)

In [8]:
main()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2018-01-31 20:02:19,962] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


episode: 1   steps: 14
episode: 2   steps: 12
episode: 3   steps: 11
episode: 4   steps: 18
episode: 5   steps: 21
episode: 6   steps: 18
episode: 7   steps: 22
episode: 8   steps: 15
episode: 9   steps: 19
episode: 10   steps: 16
('Loss :  ', 3239.383)
episode: 11   steps: 19
('Loss :  ', 3164.625)
episode: 12   steps: 10
('Loss :  ', 3222.2473)
episode: 13   steps: 15
('Loss :  ', 0.7223292)
episode: 14   steps: 23
('Loss :  ', 2975.9011)
episode: 15   steps: 49
('Loss :  ', 2670.4954)
episode: 16   steps: 22
('Loss :  ', 4.4068813)
episode: 17   steps: 15
('Loss :  ', 0.51100314)
episode: 18   steps: 14
('Loss :  ', 0.56919605)
episode: 19   steps: 12
('Loss :  ', 0.029228222)
episode: 20   steps: 23
('Loss :  ', 0.0328246)
episode: 21   steps: 10
('Loss :  ', 1904.877)
episode: 22   steps: 12
('Loss :  ', 1394.3854)
episode: 23   steps: 12
('Loss :  ', 1523.293)
episode: 24   steps: 10
('Loss :  ', 1377.2184)
episode: 25   steps: 11
('Loss :  ', 457.3029)
episode: 26   steps: 14
('

KeyboardInterrupt: 