In [1]:
import numpy as np
import tensorflow as tf
import random
from collections import deque
import gym
import matplotlib.pyplot as plt

In [2]:
np.random.seed(0)
tf.set_random_seed(0)
random.seed(0)

env = gym.make('CartPole-v0')
env.seed(0)

input_size = env.observation_space.shape[0]
output_size = env.action_space.n

dis = 0.99
REPLAY_MEMORY = 50000
batch_size = 256
training_step = 10
copy_step = 40

[2018-02-16 03:21:41,824] Making new env: CartPole-v0


In [1]:
class DQN :
    def __init__(self, session, input_size, output_size, name="main") :
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        self._build_network()
        
    def _build_network(self, h_size=64, l_rate=0.01) :
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")
            
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer(seed = 0))
            layer1 = tf.nn.relu(tf.matmul(self._X, W1))
            
            W2 = tf.get_variable("W2", shape=[h_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer(seed = 0))
            layer2 = tf.nn.relu(tf.matmul(layer1, W2))
            
            W3 = tf.get_variable("W3", shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer(seed = 0))
            
            self._Qpred = tf.matmul(layer2, W3)
    
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
    
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
    
        self._train = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self._loss)
    
    def predict(self, state):
        x = np.reshape(state, [1,self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X : x})
    
    def update(self, x_stack, y_stack):
        return self.session.run([self._loss, self._train], feed_dict={self._X : x_stack, self._Y : y_stack})

In [None]:
def epsilonmax(x, eps):
    n = len(x)
    p = []
    for i in range(n):
        p.append(eps/n)
    p[np.argmax(x)] += 1 - eps
    return p

In [4]:
def replay_train (mainDQN, train_batch) :
    x_stack = np.empty(0).reshape(0, input_size)
    y_stack = np.empty(0).reshape(0, output_size)
    
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        
        if done :
            Q[0,action] = reward
        else :
            Q[0,action] = reward + dis * np.max(mainDQN.predict(next_state))
    
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
    
    return mainDQN.update(x_stack, y_stack)

In [5]:
def bot_play(mainDQN) :
    s = env.reset()
    reward_sum = 0
    while True :
        env.render()
        a = np.argmax(mainDQN.predict(s))
        s,reward,done,_ = env.step(a)
        reward_sum += reward
        
        if done :
            print ("Total score : {}".format(reward_sum))
            break

In [6]:
def main():
    
    max_episodes = 500
    step_count_total = 0
    global_step = 0
    end_episode = 0
    
    replay_buffer = deque()
    steps_list = []
    step_avg_list = []
    
    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name="main")
        
        tf.initialize_all_variables().run()
    
        for episode in range(1, max_episodes):
            e = 1. / (((episode-1) / 5) + 1)
            done = False
            step_count = 0
            state = env.reset()
            
            while not done:
                #env.render()
                #if np.random.rand(1) < e:
                #    action = env.action_space.sample()
                #else:
                #    action = np.argmax(mainDQN.predict(state))
                
                action_max = epsilonmax(mainDQN.predict(state)[0], e)
                action = np.random.choice(len(action_max),size=1,p=action_max)[0]
                
                next_state, reward, done, _ = env.step(action)
                if done:
                    if step_count < 200:
                        reward = -100
    
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
    
                state = next_state
                step_count += 1
                global_step += 1
                
                if global_step > batch_size and global_step % training_step == 0:
                    
                
            steps_list.append(step_count)
    
            if episode >= 30:
                minibatch = random.sample(replay_buffer, batch_size)
                loss, _ = replay_train(mainDQN, minibatch)
                
                #print ("Loss :  ", loss)
                
            if episode < 100:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / episode)
        
            if episode == 100:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / 100)
                #print ("Step Average 100:  ", step_avg_list[episode - 1])
            
            if episode > 100:
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 101]
                step_avg_list.append(step_count_total / 100)
                #print ("Step Average 100:  ", step_avg_list[episode - 1])
            
            end_episode += 1
            print("{}           {}".format(episode, step_avg_list[episode - 1]))
            if step_avg_list[episode - 1] > 195:
                break
                
        print("--------------------------------------------------")
        print("--------------------------------------------------")
        for episode in range(end_episode + 1, max_episodes):
            s = env.reset()
            reward_sum = 0
            while True :
                #env.render()
                a = np.argmax(mainDQN.predict(s))
                s,reward,done,_ = env.step(a)
                reward_sum += reward
        
                if done :
                    #print("episode: {}   steps: {}".format(episode, reward_sum))
                    steps_list.append(reward_sum)
                    step_count_total += steps_list[episode - 1]
                    step_count_total -= steps_list[episode - 101]
                    step_avg_list.append(step_count_total / 100)
                    print("{}           {}".format(episode, step_avg_list[episode - 1]))
                    break
        
        x_values = list(range(1, max_episodes))
        y_values = step_avg_list[:]
        plt.plot(x_values, y_values, c='blue')
        plt.grid(True)
        plt.show()

In [7]:
main()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2018-02-16 03:21:42,938] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


1           12
2           13
3           15
4           16
5           15
6           22
7           21
8           21
9           27
10           31
11           32
12           33
13           33
14           37
15           39
16           38
17           39
18           40
19           40
20           41
21           42
22           43
23           44
24           43
25           43
26           46
27           45
28           44
29           44
30           43
31           44
32           43
33           42
34           41
35           40
36           39
37           38
38           37
39           37
40           36
41           35
42           35
43           34
44           33
45           33
46           32
47           32
48           31
49           31
50           31
51           30
52           30
53           29
54           29
55           29
56           28
57           28
58           28
59           27
60           27
61           27
62           26
63           26
6

KeyboardInterrupt: 