In [1]:
import numpy as np
import tensorflow as tf
import random

class DQN:
    """Small fully connected Q-network built on a TensorFlow 1.x graph.

    The network maps a state vector of length ``input_size`` to one Q-value
    per action (``output_size`` values) through a single tanh hidden layer.
    Both layers are plain matrix multiplications without bias terms.

    Attributes:
        session: TensorFlow session used for every ``run`` call.
        input_size: Length of the state vector fed to the network.
        output_size: Number of Q-values produced (one per action).
        net_name: Variable scope under which the weights are created.
    """

    def __init__(self, session, input_size, output_size, name="main"):
        """Store the configuration and build the graph right away."""
        self.net_name = name
        self.output_size = output_size
        self.input_size = input_size
        self.session = session

        self.build_network()

    def build_network(self, h_size=10, l_rate=1e-1):
        """Create placeholders, weights, loss and the training op.

        Args:
            h_size: Number of units in the hidden layer.
            l_rate: Learning rate handed to the Adam optimizer.
        """
        in_dim, out_dim = self.input_size, self.output_size

        with tf.variable_scope(self.net_name):
            # Batch of states, one row per sample.
            self._X = tf.placeholder(dtype=tf.float32, shape=[None, in_dim])

            # Hidden layer: Xavier-initialised weights followed by tanh.
            hidden_weights = tf.get_variable(
                'W1',
                shape=[in_dim, h_size],
                initializer=tf.contrib.layers.xavier_initializer(),
            )
            hidden = tf.nn.tanh(tf.matmul(self._X, hidden_weights))

            # Output layer: linear projection onto one Q-value per action.
            output_weights = tf.get_variable(
                'W2',
                shape=[h_size, out_dim],
                initializer=tf.contrib.layers.xavier_initializer(),
            )
            self._Qpred = tf.matmul(hidden, output_weights)

        # Everything below is only needed for learning a policy.
        # Target Q-values, same layout as the prediction.
        self._Y = tf.placeholder(dtype=tf.float32, shape=[None, out_dim])

        # Sum of squared differences between target and prediction.
        squared_error = tf.square(self._Y - self._Qpred)
        self._loss = tf.reduce_sum(squared_error)

        # One Adam step on the loss.
        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        self._train = optimizer.minimize(self._loss)

    def predict(self, state):
        """Return the Q-values for a single state as a ``[1, output_size]`` batch result.

        The state is reshaped to ``[1, input_size]`` before it is fed in.
        """
        batch_of_one = np.reshape(state, [1, self.input_size])
        feed = {self._X: batch_of_one}
        return self.session.run(self._Qpred, feed_dict=feed)

    def update(self, x_stack, y_stack):
        """Run one training step and return ``[loss, train_op_result]``.

        Args:
            x_stack: Batch of states fed to the input placeholder.
            y_stack: Batch of target Q-values fed to the target placeholder.
        """
        feed = {self._X: x_stack, self._Y: y_stack}
        fetches = [self._loss, self._train]
        return self.session.run(fetches, feed_dict=feed)

In [2]:
from collections import deque
import gym
from gym.envs.registration import register

# Build the CartPole-v0 environment shared by every function in this notebook.
env = gym.make('CartPole-v0')
# Override the episode step cap through a private attribute.
# NOTE(review): presumably this lifts gym's default time limit so long
# episodes are not cut short - confirm against the installed gym version.
env._max_episode_steps = 50000

# Network dimensions are read from the environment's observation/action spaces.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Discount factor applied to the bootstrapped next-state value in
# simple_replay_train.
dis = 0.9
# Maximum number of transitions main() keeps in its replay buffer.
REPLAY_MEMORY = 50000

def simple_replay_train(DQN, train_batch):
    """Fit the network on one batch of stored transitions.

    For every transition the network's current prediction for ``state`` is
    used as the target, except for the entry of the action that was taken:
    that entry becomes ``reward`` for terminal transitions and
    ``reward + dis * max(Q(next_state))`` otherwise.

    Args:
        DQN: Network object exposing ``input_size``, ``output_size``,
            ``predict(state)`` and ``update(x_stack, y_stack)``.
        train_batch: Iterable of
            ``(state, action, reward, next_state, done)`` tuples.

    Returns:
        Whatever ``DQN.update`` returns for the stacked states and targets.
    """
    # Rows are collected in lists and stacked once at the end instead of
    # calling np.vstack on every iteration, which re-allocated and copied
    # both stacks each time. The zero-row float64 seeds keep the result
    # dtype and the empty-batch shapes the same as the incremental version.
    state_rows = [np.empty(0).reshape(0, DQN.input_size)]
    target_rows = [np.empty(0).reshape(0, DQN.output_size)]

    # Get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = DQN.predict(state)

        if done:
            # Terminal transition: there is no future value to bootstrap from.
            Q[0, action] = reward
        else:
            # Obtain the Q' values by feeding the new state through the network
            Q[0, action] = reward + dis * np.max(DQN.predict(next_state))

        target_rows.append(Q)
        state_rows.append(state)

    # Train the network on the stacked states and their target Q-values
    return DQN.update(np.vstack(state_rows), np.vstack(target_rows))


def bot_play(mainDQN):
    """Play one rendered episode greedily with the given network.

    At every step the environment is rendered, the action with the highest
    predicted Q-value is taken, and the reward is accumulated. When the
    environment reports the episode as done, the total is printed.

    Args:
        mainDQN: Network object whose ``predict(state)`` returns Q-values.
    """
    observation = env.reset()
    total_reward = 0
    finished = False

    while not finished:
        env.render()
        greedy_action = np.argmax(mainDQN.predict(observation))
        observation, step_reward, finished, _ = env.step(greedy_action)
        total_reward += step_reward

    print("Total score: {}".format(total_reward))
            

def main():
    """Train a DQN on the module-level environment, then play one demo episode.

    Each episode is played with an epsilon-greedy policy whose epsilon is
    ``1 / (episode / 10 + 1)``. Every transition is stored in a bounded replay
    buffer; a transition that ends the episode is stored with a reward of
    -100 as a penalty. On episodes where ``episode % 10 == 1`` the network is
    trained on 50 random minibatches of 10 transitions and the last loss is
    printed. Training stops early once an episode runs for more than 10000
    steps. Afterwards ``bot_play`` is called with the trained network.
    """
    max_episodes = 5000
    max_steps = 10000        # an episode longer than this ends training
    train_interval = 10      # train on episodes where episode % interval == 1
    updates_per_round = 50   # minibatch updates per training round
    batch_size = 10          # transitions per minibatch

    # Store the previous observations in replay memory. `maxlen` makes the
    # deque drop its oldest entry on append once it is full, which replaces
    # the manual length check followed by popleft().
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episodes):
            # Exploration rate shrinks as more episodes are played.
            epsilon = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand(1) < epsilon:
                    # Explore: random action.
                    action = env.action_space.sample()
                else:
                    # Exploit: choose an action greedily from the Q-network.
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:  # Penalty
                    reward = -100

                # Save the experience to our buffer
                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                if step_count > max_steps:  # Enough
                    break

            print("Episode: {} Step: {}".format(episode, step_count))
            if step_count > max_steps:
                break

            # random.sample raises ValueError when asked for more items than
            # the buffer holds, so training waits until a full minibatch is
            # available.
            if episode % train_interval == 1 and len(replay_buffer) >= batch_size:
                # Train on random minibatches of stored experience.
                for _ in range(updates_per_round):
                    minibatch = random.sample(replay_buffer, batch_size)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss :", loss)

        bot_play(mainDQN)

[2017-08-04 13:26:20,779] Making new env: CartPole-v0


In [3]:
# Entry point: run the training loop and then the rendered demo episode
# (both happen inside main()).
if __name__ == "__main__":
    main()

Episode: 0 Step: 12
Episode: 1 Step: 10
Loss : 24.3523
Episode: 2 Step: 12
Episode: 3 Step: 13
Episode: 4 Step: 13
Episode: 5 Step: 9
Episode: 6 Step: 14
Episode: 7 Step: 12
Episode: 8 Step: 14
Episode: 9 Step: 13
Episode: 10 Step: 16
Episode: 11 Step: 16
Loss : 10583.7
Episode: 12 Step: 11
Episode: 13 Step: 11
Episode: 14 Step: 9
Episode: 15 Step: 10
Episode: 16 Step: 12
Episode: 17 Step: 9
Episode: 18 Step: 13
Episode: 19 Step: 11
Episode: 20 Step: 12
Episode: 21 Step: 11
Loss : 8289.72
Episode: 22 Step: 10
Episode: 23 Step: 9
Episode: 24 Step: 10
Episode: 25 Step: 10
Episode: 26 Step: 11
Episode: 27 Step: 9
Episode: 28 Step: 10
Episode: 29 Step: 9
Episode: 30 Step: 11
Episode: 31 Step: 11
Loss : 8377.24
Episode: 32 Step: 35
Episode: 33 Step: 25
Episode: 34 Step: 25
Episode: 35 Step: 23
Episode: 36 Step: 19
Episode: 37 Step: 44
Episode: 38 Step: 23
Episode: 39 Step: 30
Episode: 40 Step: 22
Episode: 41 Step: 25
Loss : 99.2013
Episode: 42 Step: 24
Episode: 43 Step: 53
Episode: 44 Step:

Loss : 8814.02
Episode: 362 Step: 19
Episode: 363 Step: 34
Episode: 364 Step: 44
Episode: 365 Step: 25
Episode: 366 Step: 17
Episode: 367 Step: 44
Episode: 368 Step: 15
Episode: 369 Step: 28
Episode: 370 Step: 13
Episode: 371 Step: 29
Loss : 88.0924
Episode: 372 Step: 23
Episode: 373 Step: 30
Episode: 374 Step: 27
Episode: 375 Step: 26
Episode: 376 Step: 29
Episode: 377 Step: 40
Episode: 378 Step: 33
Episode: 379 Step: 22
Episode: 380 Step: 22
Episode: 381 Step: 28
Loss : 147.662
Episode: 382 Step: 44
Episode: 383 Step: 55
Episode: 384 Step: 40
Episode: 385 Step: 57
Episode: 386 Step: 26
Episode: 387 Step: 42
Episode: 388 Step: 35
Episode: 389 Step: 29
Episode: 390 Step: 25
Episode: 391 Step: 25
Loss : 9262.38
Episode: 392 Step: 32
Episode: 393 Step: 26
Episode: 394 Step: 34
Episode: 395 Step: 34
Episode: 396 Step: 30
Episode: 397 Step: 99
Episode: 398 Step: 36
Episode: 399 Step: 67
Episode: 400 Step: 29
Episode: 401 Step: 75
Loss : 9833.96
Episode: 402 Step: 16
Episode: 403 Step: 66
E

Loss : 143.428
Episode: 712 Step: 36
Episode: 713 Step: 31
Episode: 714 Step: 26
Episode: 715 Step: 41
Episode: 716 Step: 43
Episode: 717 Step: 23
Episode: 718 Step: 53
Episode: 719 Step: 40
Episode: 720 Step: 32
Episode: 721 Step: 50
Loss : 11166.8
Episode: 722 Step: 21
Episode: 723 Step: 19
Episode: 724 Step: 44
Episode: 725 Step: 45
Episode: 726 Step: 27
Episode: 727 Step: 18
Episode: 728 Step: 18
Episode: 729 Step: 25
Episode: 730 Step: 77
Episode: 731 Step: 42
Loss : 319.883
Episode: 732 Step: 19
Episode: 733 Step: 24
Episode: 734 Step: 22
Episode: 735 Step: 15
Episode: 736 Step: 15
Episode: 737 Step: 19
Episode: 738 Step: 19
Episode: 739 Step: 26
Episode: 740 Step: 14
Episode: 741 Step: 26
Loss : 91.6251
Episode: 742 Step: 27
Episode: 743 Step: 23
Episode: 744 Step: 19
Episode: 745 Step: 29
Episode: 746 Step: 17
Episode: 747 Step: 25
Episode: 748 Step: 21
Episode: 749 Step: 17
Episode: 750 Step: 22
Episode: 751 Step: 19
Loss : 8446.46
Episode: 752 Step: 34
Episode: 753 Step: 24
E

Loss : 149.211
Episode: 1062 Step: 37
Episode: 1063 Step: 39
Episode: 1064 Step: 39
Episode: 1065 Step: 32
Episode: 1066 Step: 36
Episode: 1067 Step: 39
Episode: 1068 Step: 43
Episode: 1069 Step: 30
Episode: 1070 Step: 41
Episode: 1071 Step: 29
Loss : 43.102
Episode: 1072 Step: 27
Episode: 1073 Step: 28
Episode: 1074 Step: 50
Episode: 1075 Step: 27
Episode: 1076 Step: 25
Episode: 1077 Step: 26
Episode: 1078 Step: 27
Episode: 1079 Step: 22
Episode: 1080 Step: 32
Episode: 1081 Step: 31
Loss : 10345.5
Episode: 1082 Step: 61
Episode: 1083 Step: 23
Episode: 1084 Step: 23
Episode: 1085 Step: 25
Episode: 1086 Step: 65
Episode: 1087 Step: 22
Episode: 1088 Step: 23
Episode: 1089 Step: 27
Episode: 1090 Step: 27
Episode: 1091 Step: 23
Loss : 76.559
Episode: 1092 Step: 50
Episode: 1093 Step: 29
Episode: 1094 Step: 27
Episode: 1095 Step: 31
Episode: 1096 Step: 35
Episode: 1097 Step: 20
Episode: 1098 Step: 20
Episode: 1099 Step: 26
Episode: 1100 Step: 40
Episode: 1101 Step: 30
Loss : 102.103
Episode