In [1]:
import numpy as np
import tensorflow as tf
import random

class DQN:
    """Small fully connected Q-network built with the TensorFlow 1.x graph API.

    The network maps a state vector of length ``input_size`` to one Q-value
    per action (``output_size`` values) through a single tanh hidden layer.
    Neither layer has a bias term.

    Args:
        session: Object whose ``run(fetches, feed_dict=...)`` executes the graph.
        input_size: Length of the state vector fed to the network.
        output_size: Number of Q-values (actions) the network outputs.
        name: Variable scope under which the weights ``W1``/``W2`` are created.
        h_size: Width of the hidden layer, forwarded to ``build_network``.
        l_rate: Adam learning rate, forwarded to ``build_network``.
    """

    def __init__(self, session, input_size, output_size, name="main",
                 h_size=10, l_rate=1e-1):
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name

        # Forward the hyper-parameters so callers can tune the network without
        # subclassing; the defaults match build_network's own defaults.
        self.build_network(h_size=h_size, l_rate=l_rate)

    def build_network(self, h_size=10, l_rate=1e-1):
        """Create placeholders, weights, loss and training op for this network.

        Args:
            h_size: Number of units in the hidden layer.
            l_rate: Learning rate passed to the Adam optimizer.
        """
        with tf.variable_scope(self.net_name):
            # Batch of states, one row per state.
            self._X = tf.placeholder(dtype=tf.float32, shape=[None, self.input_size])

            # First layer of weights (Xavier-initialised), tanh activation.
            W1 = tf.get_variable('W1', shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.tanh(tf.matmul(self._X, W1))

            # Second layer of weights (Xavier-initialised), linear output.
            W2 = tf.get_variable('W2', shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())

            # Q prediction: one value per action for every input row.
            self._Qpred = tf.matmul(layer1, W2)

        # The parts of the graph needed for learning a policy live outside the
        # variable scope, exactly as in the original graph layout.
        self._Y = tf.placeholder(dtype=tf.float32, shape=[None, self.output_size])

        # Loss function: sum of squared differences between target and prediction.
        self._loss = tf.reduce_sum(tf.square(self._Y - self._Qpred))

        # Learning: one Adam step on the loss.
        self._train = tf.train.AdamOptimizer(learning_rate=l_rate).minimize(self._loss)

    def predict(self, state):
        """Return the predicted Q-values for a single state.

        Args:
            state: Array-like with ``input_size`` elements.

        Returns:
            Whatever ``session.run`` returns for the Q prediction tensor when
            fed ``state`` reshaped to ``[1, input_size]``.
        """
        s_t = np.reshape(state, [1, self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X: s_t})

    def update(self, x_stack, y_stack):
        """Run one training step on a batch.

        Args:
            x_stack: Batch of states fed to the input placeholder.
            y_stack: Batch of target Q-values fed to the target placeholder.

        Returns:
            The result of running ``[loss, train_op]`` in the session.
        """
        return self.session.run([self._loss, self._train],
                                feed_dict={self._X: x_stack, self._Y: y_stack})

In [2]:
from collections import deque
import gym

# Shared CartPole environment used by both training (main) and evaluation (bot_play).
env = gym.make("CartPole-v0")
# NOTE(review): presumably lifts gym's built-in episode step limit above the
# 10000-step cut-off that main() applies itself - confirm against the gym version in use.
env._max_episode_steps = 10004

# Constants defining our neural network: size of one observation vector and
# number of discrete actions, both read from the environment.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Discount factor applied to the target network's best next-state Q-value in replay_train.
dis = 0.9
# Maximum number of transitions kept in the replay buffer by main().
REPLAY_MEMORY = 50000

def replay_train(mainDQN, targetDQN, train_batch):
    """Train ``mainDQN`` on a batch of stored transitions.

    For every transition the current prediction of ``mainDQN`` is used as the
    target row, with the entry of the taken action replaced by:

    * ``reward`` if the transition was terminal, otherwise
    * ``reward + dis * max(targetDQN.predict(next_state))``.

    Args:
        mainDQN: Network being trained; must provide ``predict`` and ``update``.
        targetDQN: Network providing the bootstrap value for non-terminal steps.
        train_batch: Iterable of ``(state, action, reward, next_state, done)``.

    Returns:
        Whatever ``mainDQN.update(x_stack, y_stack)`` returns.
    """
    # Start each list with an empty float64 (0, n) array: this keeps the
    # stacked shape and dtype promotion the same as growing the arrays row
    # by row, and makes an empty batch produce (0, n) arrays rather than fail.
    state_rows = [np.empty((0, input_size))]
    target_rows = [np.empty((0, output_size))]

    # Get stored information from the buffer.
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)

        if done:
            # Terminal transition: there is no future value to bootstrap from.
            Q[0, action] = reward
        else:
            # Bootstrap from the target network (Q').
            Q[0, action] = reward + dis * np.max(targetDQN.predict(next_state))

        # Snapshot the row so later predict calls cannot alter it.
        target_rows.append(np.array(Q))
        state_rows.append(state)

    # Stack once at the end instead of re-copying the arrays on every sample.
    x_stack = np.vstack(state_rows)
    y_stack = np.vstack(target_rows)

    # Train our network using target and predicted Q values.
    return mainDQN.update(x_stack, y_stack)

def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):
    """Build the assign ops that copy one scope's trainable variables to another.

    Args:
        dest_scope_name: Scope whose trainable variables are overwritten.
        src_scope_name: Scope whose trainable variables supply the values.

    Returns:
        A list with one assign op per (source, destination) variable pair.
        Variables are paired by their position in each scope's collection;
        pairing stops at the shorter of the two collections.
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(collection_key, scope=src_scope_name)
    destinations = tf.get_collection(collection_key, scope=dest_scope_name)

    return [
        destination.assign(source.value())
        for source, destination in zip(sources, destinations)
    ]

def bot_play(mainDQN):
    """Play one rendered episode with the greedy policy of ``mainDQN``.

    Each step renders the environment, picks the action with the largest
    predicted Q-value and accumulates the reward. When the environment
    reports the episode as done, the accumulated reward is printed.

    Args:
        mainDQN: Network providing ``predict(state)``.
    """
    observation = env.reset()
    total_reward = 0
    finished = False

    while not finished:
        env.render()
        greedy_action = np.argmax(mainDQN.predict(observation))
        observation, reward, finished, _ = env.step(greedy_action)
        total_reward += reward

    print("Total score: {}".format(total_reward))
            
def main():
    """Train a DQN on the module-level ``env`` and then play one greedy episode.

    Per episode the agent acts epsilon-greedily (epsilon decays with the
    episode index), stores every transition in a bounded replay buffer and
    stops early once an episode survives more than 10000 steps. On every
    episode whose index satisfies ``episode % 10 == 1`` the main network is
    trained on 50 random minibatches and then copied into the target network.
    """
    max_episodes = 5000
    batch_size = 10
    # Store the previous observations in replay memory. A bounded deque drops
    # the oldest transition automatically once REPLAY_MEMORY entries are held.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        # Initial copy q_net -> target_net.
        copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
        sess.run(copy_ops)

        for episode in range(max_episodes):
            # Exploration rate: 1.0 for episode 0, shrinking as episodes go by.
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    # Choose an action greedily from the Q-network.
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment.
                next_state, reward, done, _ = env.step(action)
                if done:  # Penalty for ending the episode.
                    reward = -100

                # Save the experience to our buffer.
                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                if step_count > 10000:  # Good enough, let's move on.
                    break

            print("Episode: {} steps: {}".format(episode, step_count))
            if step_count > 10000:
                break

            if episode % 10 == 1:  # Train every 10 episodes.
                # random.sample raises ValueError when asked for more items
                # than the buffer holds, so never request more than is stored.
                sample_size = min(batch_size, len(replay_buffer))
                for _ in range(50):
                    # Minibatch works better.
                    minibatch = random.sample(replay_buffer, sample_size)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss :", loss)

                # Copy q_net -> target_net.
                sess.run(copy_ops)

        bot_play(mainDQN)

[2017-08-04 05:34:10,778] Making new env: CartPole-v0


In [None]:
# Start training (followed by one rendered greedy episode) when run as the main module.
if __name__ == "__main__":
    main()

Episode: 0 steps: 13
Episode: 1 steps: 52
Loss : 173.43
Episode: 2 steps: 22
Episode: 3 steps: 46
Episode: 4 steps: 27
Episode: 5 steps: 45
Episode: 6 steps: 27
Episode: 7 steps: 33
Episode: 8 steps: 49
Episode: 9 steps: 29
Episode: 10 steps: 45
Episode: 11 steps: 25
Loss : 12276.2
Episode: 12 steps: 12
Episode: 13 steps: 11
Episode: 14 steps: 14
Episode: 15 steps: 13
Episode: 16 steps: 11
Episode: 17 steps: 13
Episode: 18 steps: 10
Episode: 19 steps: 9
Episode: 20 steps: 9
Episode: 21 steps: 9
Loss : 84.6495
Episode: 22 steps: 45
Episode: 23 steps: 28
Episode: 24 steps: 41
Episode: 25 steps: 53
Episode: 26 steps: 33
Episode: 27 steps: 51
Episode: 28 steps: 29
Episode: 29 steps: 55
Episode: 30 steps: 35
Episode: 31 steps: 35
Loss : 221.121
Episode: 32 steps: 26
Episode: 33 steps: 37
Episode: 34 steps: 30
Episode: 35 steps: 26
Episode: 36 steps: 26
Episode: 37 steps: 34
Episode: 38 steps: 25
Episode: 39 steps: 23
Episode: 40 steps: 33
Episode: 41 steps: 48
Loss : 67.9732
Episode: 42 ste

Loss : 101.511
Episode: 342 steps: 9
Episode: 343 steps: 9
Episode: 344 steps: 9
Episode: 345 steps: 9
Episode: 346 steps: 9
Episode: 347 steps: 9
Episode: 348 steps: 8
Episode: 349 steps: 9
Episode: 350 steps: 9
Episode: 351 steps: 10
Loss : 52.3689
Episode: 352 steps: 9
Episode: 353 steps: 9
Episode: 354 steps: 9
Episode: 355 steps: 9
Episode: 356 steps: 11
Episode: 357 steps: 8
Episode: 358 steps: 9
Episode: 359 steps: 9
Episode: 360 steps: 9
Episode: 361 steps: 8
Loss : 9396.76
Episode: 362 steps: 23
Episode: 363 steps: 36
Episode: 364 steps: 24
Episode: 365 steps: 24
Episode: 366 steps: 20
Episode: 367 steps: 20
Episode: 368 steps: 29
Episode: 369 steps: 23
Episode: 370 steps: 24
Episode: 371 steps: 24
Loss : 105.623
Episode: 372 steps: 39
Episode: 373 steps: 29
Episode: 374 steps: 37
Episode: 375 steps: 26
Episode: 376 steps: 29
Episode: 377 steps: 33
Episode: 378 steps: 29
Episode: 379 steps: 22
Episode: 380 steps: 22
Episode: 381 steps: 28
Loss : 199.755
Episode: 382 steps: 20


Loss : 64.8206
Episode: 682 steps: 8
Episode: 683 steps: 9
Episode: 684 steps: 9
Episode: 685 steps: 8
Episode: 686 steps: 8
Episode: 687 steps: 8
Episode: 688 steps: 13
Episode: 689 steps: 8
Episode: 690 steps: 19
Episode: 691 steps: 8
Loss : 38.6968
Episode: 692 steps: 118
Episode: 693 steps: 118
Episode: 694 steps: 102
Episode: 695 steps: 83
Episode: 696 steps: 121
Episode: 697 steps: 85
Episode: 698 steps: 113
Episode: 699 steps: 111
Episode: 700 steps: 107
Episode: 701 steps: 76
Loss : 99.4213
Episode: 702 steps: 17
Episode: 703 steps: 10
Episode: 704 steps: 9
Episode: 705 steps: 8
Episode: 706 steps: 9
Episode: 707 steps: 9
Episode: 708 steps: 11
Episode: 709 steps: 8
Episode: 710 steps: 8
Episode: 711 steps: 9
Loss : 9995.04
Episode: 712 steps: 56
Episode: 713 steps: 57
Episode: 714 steps: 58
Episode: 715 steps: 49
Episode: 716 steps: 53
Episode: 717 steps: 60
Episode: 718 steps: 51
Episode: 719 steps: 63
Episode: 720 steps: 55
Episode: 721 steps: 58
Loss : 70.0628
Episode: 722 

Loss : 118.312
Episode: 1022 steps: 8
Episode: 1023 steps: 9
Episode: 1024 steps: 10
Episode: 1025 steps: 9
Episode: 1026 steps: 9
Episode: 1027 steps: 9
Episode: 1028 steps: 9
Episode: 1029 steps: 9
Episode: 1030 steps: 9
Episode: 1031 steps: 8
Loss : 405.748
Episode: 1032 steps: 17
Episode: 1033 steps: 14
Episode: 1034 steps: 16
Episode: 1035 steps: 19
Episode: 1036 steps: 18
Episode: 1037 steps: 16
Episode: 1038 steps: 16
Episode: 1039 steps: 17
Episode: 1040 steps: 17
Episode: 1041 steps: 17
Loss : 118.47
Episode: 1042 steps: 27
Episode: 1043 steps: 21
Episode: 1044 steps: 39
Episode: 1045 steps: 24
Episode: 1046 steps: 36
Episode: 1047 steps: 29
Episode: 1048 steps: 26
Episode: 1049 steps: 23
Episode: 1050 steps: 28
Episode: 1051 steps: 21
Loss : 8294.12
Episode: 1052 steps: 14
Episode: 1053 steps: 12
Episode: 1054 steps: 13
Episode: 1055 steps: 11
Episode: 1056 steps: 12
Episode: 1057 steps: 13
Episode: 1058 steps: 14
Episode: 1059 steps: 15
Episode: 1060 steps: 12
Episode: 1061 

Loss : 62.6267
Episode: 1352 steps: 12
Episode: 1353 steps: 10
Episode: 1354 steps: 14
Episode: 1355 steps: 10
Episode: 1356 steps: 9
Episode: 1357 steps: 8
Episode: 1358 steps: 10
Episode: 1359 steps: 12
Episode: 1360 steps: 11
Episode: 1361 steps: 11
Loss : 31.6662
Episode: 1362 steps: 69
Episode: 1363 steps: 33
Episode: 1364 steps: 23
Episode: 1365 steps: 50
Episode: 1366 steps: 70
Episode: 1367 steps: 26
Episode: 1368 steps: 36
Episode: 1369 steps: 26
Episode: 1370 steps: 30
Episode: 1371 steps: 27
Loss : 101.017
Episode: 1372 steps: 9
Episode: 1373 steps: 10
Episode: 1374 steps: 10
Episode: 1375 steps: 8
Episode: 1376 steps: 8
Episode: 1377 steps: 8
Episode: 1378 steps: 10
Episode: 1379 steps: 8
Episode: 1380 steps: 10
Episode: 1381 steps: 9
Loss : 153.434
Episode: 1382 steps: 24
Episode: 1383 steps: 10001
