In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque
import gym

In [2]:
env = gym.make('CartPole-v0')

input_size = env.observation_space.shape[0]
output_size = env.action_space.n

dis = 0.9
REPLAY_MEMORY = 50000


[2018-01-09 10:11:21,861] Making new env: CartPole-v0


In [3]:
class DQN :
    def __init__(self, session, input_size, output_size, name="main") :
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        self._build_network()
        
    def _build_network(self, h_size=32, l_rate=0.05) :
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")
            
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.tanh(tf.matmul(self._X, W1))
            
            W2 = tf.get_variable("W2", shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            self._Qpred = tf.matmul(layer1, W2)
    
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
    
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
    
        self._train = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self._loss)
    
    def predict(self, state):
        x = np.reshape(state, [1,self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X : x})
    
    def update(self, x_stack, y_stack):
        return self.session.run([self._loss, self._train], feed_dict={self._X : x_stack, self._Y : y_stack})    

In [4]:
def replay_train (mainDQN, targetDQN, train_batch) :
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        
        if done :
            Q[0,action] = reward
        else :
            Q[0,action] = reward + dis * np.max(targetDQN.predict(next_state))
    
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
    
    return mainDQN.update(x_stack, y_stack)

In [5]:
def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = src_scope_name)
    
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = dest_scope_name)
    
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    
    return op_holder

In [6]:
def bot_play(mainDQN) :
    s = env.reset()
    reward_sum = 0
    while True :
        env.render()
        a = np.argmax(mainDQN.predict(s))
        s,reward,done,_ = env.step(a)
        reward_sum += reward
        
        if done :
            print ("Total score : {}".format(reward_sum))
            break

In [7]:
def main():
    
    max_episodes = 5000
    
    replay_buffer = deque()
    
    with tf.Session() as sess :
        mainDQN = DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN(sess, input_size, output_size, name="target")
        
        tf.initialize_all_variables().run()
        
        
        copy_ops = get_copy_var_ops(dest_scope_name = "target",
                                   src_scope_name = "main")
        sess.run(copy_ops)
    
    
        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()
            
            while not done:
                env.render()
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))
    
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100
    
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
    
                state = next_state
                step_count += 1
                if step_count > 10000 :
                    break
                
            print("episode: {}   steps: {}".format(episode, step_count))
            if step_count > 10000:
                pass
    
            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
    
                print ("Loss :  ", loss)
                sess.run(copy_ops)
            
        bot_play(mainDQN)
        

In [8]:
main()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2018-01-09 10:11:23,118] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


episode: 0   steps: 14
episode: 1   steps: 10
('Loss :  ', 390.32095)
episode: 2   steps: 13
episode: 3   steps: 16
episode: 4   steps: 39
episode: 5   steps: 15
episode: 6   steps: 28
episode: 7   steps: 16
episode: 8   steps: 22
episode: 9   steps: 15
episode: 10   steps: 58
episode: 11   steps: 41
('Loss :  ', 955.16425)
episode: 12   steps: 12
episode: 13   steps: 20
episode: 14   steps: 11
episode: 15   steps: 12
episode: 16   steps: 13
episode: 17   steps: 12
episode: 18   steps: 21
episode: 19   steps: 13
episode: 20   steps: 13
episode: 21   steps: 12
('Loss :  ', 497.61075)
episode: 22   steps: 74
episode: 23   steps: 53
episode: 24   steps: 72
episode: 25   steps: 59
episode: 26   steps: 73
episode: 27   steps: 93
episode: 28   steps: 72
episode: 29   steps: 90
episode: 30   steps: 118
episode: 31   steps: 71
('Loss :  ', 32.610497)
episode: 32   steps: 78
episode: 33   steps: 89
episode: 34   steps: 88
episode: 35   steps: 36
episode: 36   steps: 48
episode: 37   steps: 36
e

episode: 303   steps: 22
episode: 304   steps: 20
episode: 305   steps: 16
episode: 306   steps: 19
episode: 307   steps: 23
episode: 308   steps: 17
episode: 309   steps: 18
episode: 310   steps: 13
episode: 311   steps: 21
('Loss :  ', 4.9213886)
episode: 312   steps: 23
episode: 313   steps: 28
episode: 314   steps: 23
episode: 315   steps: 31
episode: 316   steps: 33
episode: 317   steps: 23
episode: 318   steps: 21
episode: 319   steps: 23
episode: 320   steps: 66
episode: 321   steps: 47
('Loss :  ', 9.557764)
episode: 322   steps: 29
episode: 323   steps: 35
episode: 324   steps: 40
episode: 325   steps: 41
episode: 326   steps: 55
episode: 327   steps: 46
episode: 328   steps: 40
episode: 329   steps: 80
episode: 330   steps: 31
episode: 331   steps: 38
('Loss :  ', 8.832301)
episode: 332   steps: 86
episode: 333   steps: 106
episode: 334   steps: 74
episode: 335   steps: 129
episode: 336   steps: 99
episode: 337   steps: 99
episode: 338   steps: 88
episode: 339   steps: 103
ep

episode: 603   steps: 28
episode: 604   steps: 21
episode: 605   steps: 24
episode: 606   steps: 27
episode: 607   steps: 51
episode: 608   steps: 33
episode: 609   steps: 33
episode: 610   steps: 33
episode: 611   steps: 26
('Loss :  ', 3.0299091)
episode: 612   steps: 9
episode: 613   steps: 9
episode: 614   steps: 9
episode: 615   steps: 9
episode: 616   steps: 8
episode: 617   steps: 9
episode: 618   steps: 9
episode: 619   steps: 9
episode: 620   steps: 9
episode: 621   steps: 9
('Loss :  ', 1.2856123)
episode: 622   steps: 25
episode: 623   steps: 23
episode: 624   steps: 25
episode: 625   steps: 26
episode: 626   steps: 21
episode: 627   steps: 20
episode: 628   steps: 25
episode: 629   steps: 29
episode: 630   steps: 27
episode: 631   steps: 26
('Loss :  ', 1.2007945)
episode: 632   steps: 21
episode: 633   steps: 24
episode: 634   steps: 13
episode: 635   steps: 13
episode: 636   steps: 23
episode: 637   steps: 19
episode: 638   steps: 14
episode: 639   steps: 17
episode: 640 

KeyboardInterrupt: 