In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque
import gym
import matplotlib.pyplot as plt

In [2]:
env = gym.make('CartPole-v0')

input_size = env.observation_space.shape[0]
output_size = env.action_space.n

dis = 0.99
REPLAY_MEMORY = 5000
batch_size = 256

[2018-02-01 19:46:58,299] Making new env: CartPole-v0


In [3]:
class DQN :
    def __init__(self, session, input_size, output_size, name="main") :
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        self._build_network()
        
    def _build_network(self, h_size=64, l_rate=0.01) :
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")
            
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.relu(tf.matmul(self._X, W1))
            
            W2 = tf.get_variable("W2", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            W3 = tf.get_variable("W3", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer2 = tf.nn.relu(tf.matmul(layer1, W2))
            layer3 = tf.nn.relu(tf.matmul(layer1, W3))
            
            W_V = tf.get_variable("W_V", shape=[h_size, 1],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            W_A = tf.get_variable("W_A", shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            self.Value = tf.matmul(layer2, W_V)
            self.Advantage = tf.matmul(layer3, W_A)
            
            self._Qpred = self.Value + self.Advantage - tf.reduce_mean(self.Advantage,
                                                                       reduction_indices=1,keep_dims=True)
    
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
    
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
    
        self._train = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self._loss)
    
    def predict(self, state):
        x = np.reshape(state, [1,self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X : x})
    
    def update(self, x_stack, y_stack):
        return self.session.run([self._loss, self._train], feed_dict={self._X : x_stack, self._Y : y_stack})

In [4]:
def replay_train (mainDQN, targetDQN, train_batch) :
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        
        if done :
            Q[0,action] = reward
        else :
            action0 = np.argmax(mainDQN.predict(next_state))
            Q[0,action] = reward + dis * (targetDQN.predict(next_state)[0, action0])
    
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
    
    return mainDQN.update(x_stack, y_stack)

In [5]:
def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = src_scope_name)
    
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = dest_scope_name)
    
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    
    return op_holder

In [6]:
def bot_play(mainDQN) :
    s = env.reset()
    reward_sum = 0
    while True :
        env.render()
        a = np.argmax(mainDQN.predict(s))
        s,reward,done,_ = env.step(a)
        reward_sum += reward
        
        if done :
            print ("Total score : {}".format(reward_sum))
            break

In [7]:
def main():
    
    max_episodes = 1000
    step_count_total = 0
    end_episode = 0
    
    replay_buffer = deque()
    steps_list = []
    step_avg_list = []
    
    with tf.Session() as sess :
        mainDQN = DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN(sess, input_size, output_size, name="target")
        
        tf.initialize_all_variables().run()
        
        
        copy_ops = get_copy_var_ops(dest_scope_name = "target",
                                   src_scope_name = "main")
        sess.run(copy_ops)
    
        for episode in range(1, max_episodes):
            e = 1. / (((episode - 1) / 5) + 1)
            done = False
            step_count = 0
            state = env.reset()
            
            while not done:
                #env.render()
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))
                
                next_state, reward, done, _ = env.step(action)
                
                if done:
                    if step_count < 200:
                        reward = -100
                
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
    
                state = next_state
                step_count += 1
                if step_count > 10000 :
                    break
                
            #print("episode: {}   steps: {}".format(episode, step_count))
            steps_list.append(step_count)
            
            if episode >= 30:
                minibatch = random.sample(replay_buffer, batch_size)
                loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                
                #print ("Loss :  ", loss)
                sess.run(copy_ops)
            
            if episode < 100:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / episode)
                
            if episode == 100:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / 100)
                #print ("Step Average 100:  ", step_avg_list[episode - 1])
                
            if episode > 100:
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 101]
                step_avg_list.append(step_count_total / 100)
                #print ("Step Average 100:  ", step_avg_list[episode - 1])
            
            print("{}           {}".format(episode, step_avg_list[episode - 1]))
            end_episode += 1
            if step_avg_list[episode - 1] > 195:
                break
        
        print("------")
        print("------")
        for episode in range(end_episode + 1, max_episodes):
            s = env.reset()
            reward_sum = 0
            while True :
                #env.render()
                a = np.argmax(mainDQN.predict(s))
                s,reward,done,_ = env.step(a)
                reward_sum += reward
        
                if done :
                    #print("episode: {}   steps: {}".format(episode, reward_sum))
                    steps_list.append(reward_sum)
                    step_count_total += steps_list[episode - 1]
                    step_count_total -= steps_list[episode - 101]
                    step_avg_list.append(step_count_total / 100)
                    print("{}           {}".format(episode, step_avg_list[episode - 1]))
                    break
        
        
        x_values = list(range(1, max_episodes))
        y_values = step_avg_list[:]
        plt.plot(x_values, y_values, c='red')
        plt.grid(True)
        plt.show()
        
        print ("Simulation--------")
        #bot_play(mainDQN)

In [8]:
main()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2018-02-01 19:46:59,821] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


1           12
2           13
3           14
4           13
5           15
6           15
7           14
8           16
9           17
10           16
11           16
12           15
13           15
14           15
15           15
16           14
17           14
18           14
19           14
20           14
21           13
22           13
23           13
24           13
25           13
26           13
27           13
28           13
29           12
30           12
31           16
32           16
33           16
34           16
35           17
36           20
37           25
38           30
39           34
40           35
41           37
42           41
43           41
44           44
45           47
46           50
47           53
48           53
49           56
50           56
51           56
52           56
53           56
54           57
55           59
56           59
57           61
58           61
59           61
60           61
61           61
62           61
63           61
6

KeyboardInterrupt: 