In [1]:
import gym
import numpy as np
import random
import tensorflow as tf
from collections import deque
from skimage.transform import resize

In [2]:
env = gym.make('BreakoutDeterministic-v4')

input_size = 84*84*4
output_size = env.action_space.n

HEIGHT = 84
WIDTH = 84
HISTORY_SIZE = 4

dis = 0.99
REPLAY_MEMORY = 400000

[2018-01-16 18:56:03,240] Making new env: BreakoutDeterministic-v4


In [3]:
class DQN :
    def __init__(self, session, input_size, output_size, name="main") :
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.height = 84
        self.width = 84
        self.history_size = 4
        self.net_name = name
        self._build_network()
    
    def pre_proc(X):
        x = np.uint8(resize(rgb2gray(X), (84, 84), mode='reflect') * 255)
        return x
    
    def _build_network(self, l_rate=0.00025) :
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder('float', [None, self.height, self.width, self.history_size])
            self.a = tf.placeholder('int64', [None])
            
            f1 = tf.get_variable("f1", shape=[8, 8, 4, 16],
                                 initializer=tf.contrib.layers.xavier_initializer_conv2d())
            f2 = tf.get_variable("f2", shape=[4, 4, 16, 32],
                                 initializer=tf.contrib.layers.xavier_initializer_conv2d())
            w1 = tf.get_variable("w1", shape=[9*9*32, 256],
                                 initializer=tf.contrib.layers.xavier_initializer())
            w2 = tf.get_variable("w2", shape=[256, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            
            c1 = tf.nn.relu(tf.nn.conv2d(self._X, f1, strides=[1, 4, 4, 1], padding="VALID"))
            c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 2, 2, 1], padding="VALID"))
            
            l1 = tf.reshape(c2, [-1, w1.get_shape().as_list()[0]])
            l2 = tf.nn.relu(tf.matmul(l1, w1))
            
            self._Qpred = tf.matmul(l2, w2)
        
        action_one_hot = tf.one_hot(self.a, self.output_size, 1.0, 0.0)
        q_val = tf.reduce_sum(tf.multiply(self._Qpred, action_one_hot), axis=1)
        
        self._Y = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
    
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
    
        self._train = tf.train.AdamOptimizer(learning_rate = l_rate).minimize(self._loss)
    
    def predict(self, state):
        return self.session.run(self._Qpred, feed_dict={self._X : np.reshape(state, [-1, 84, 84, 4])})
    
    def update(self, x_stack, y_stack):
        return self.session.run([self._loss, self._train],
                                feed_dict={self._X : np.reshape(x_stack, [-1, 84, 84, 4]), self._Y : y_stack})    

In [4]:
def replay_train (mainDQN, targetDQN, train_batch) :
    x_stack = np.empty(0).reshape(0, 84, 84, 4)
    y_stack = np.empty(0).reshape(0, output_size)
    
    for history, action, reward, done in train_batch:
        Q = mainDQN.predict(history[:, :, :4])
        
        if done :
            Q[0,action] = reward
        else :
            action0 = np.argmax(mainDQN.predict(history[:, :, 1:]))
            Q[0,action] = reward + dis * (targetDQN.predict(history[:, :, 1:])[0, action0])
        
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, np.reshape(history[:, :, :4], [-1, 84, 84, 4])])
    
    return mainDQN.update(x_stack, y_stack)

In [5]:
def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    op_holder = []
    
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = src_scope_name)
    
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = dest_scope_name)
    
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    
    return op_holder

In [6]:
def rgb2gray(X) :
    return np.mean(X, axis=2).astype(np.uint8)

def conversion_image(X) :
    x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255)
    return x

def init_history_conv(history, state) :
    for i in range(5):
        history[:, :, i] = conversion_image(state)

In [7]:
def bot_play(mainDQN) :
    s = env.reset()
    history = np.zeros([84, 84, 5], dtype=np.uint8)
    init_history_conv(history, s)
    reward_sum = 0
    while True :
        env.render()
        a = np.argmax(mainDQN.predict(np.reshape(history[:, :, :4], [-1, 84, 84, 4])))
        
        s, reward, done, _ = env.step(a)
        history[:, :, 4] = conversion_image(s)
        history[:, :, :4] = history[:, :, 1:]
        
        if reward > 0 :
                    step_count += reward
        
        if done :
            print ("Total score : {}".format(reward_sum))
            break

In [None]:
def main():
    
    max_episodes = 10000
    
    replay_buffer = deque()
    
    with tf.Session() as sess :
        mainDQN = DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN(sess, input_size, output_size, name="target")
        
        tf.initialize_all_variables().run()
        
        
        copy_ops = get_copy_var_ops(dest_scope_name = "target",
                                   src_scope_name = "main")
        sess.run(copy_ops)
    
        
        step_count_total = 0
        frame = 0
        
        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            life = False
            step_count = 0
            
            state = env.reset()
            start_life = 5
            
            history = np.zeros([84, 84, 5], dtype=np.uint8)
            init_history_conv(history, state)
            
            while not done:
                env.render()
                
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(np.reshape(history[:, :, :4], [-1, 84, 84, 4])))
    
                next_state, reward, done, lives = env.step(action)
                history[:, :, 4] = conversion_image(next_state)
                
                if start_life > lives['ale.lives']:
                    life = True
                    start_life = lives['ale.lives']
                
                if done or life:
                    reward = -1
                    life = False
                
                if reward > 0 :
                    reward = 1
                    step_count += 1
                
                replay_buffer.append((np.copy(history[:,:,:]), action, reward, done))
                history[:, :, :4] = history[:, :, 1:]
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                
                frame += 1
                
                if step_count > 100 :
                    break
                
            print("episode: {}   steps: {}".format(episode, step_count))
            # print("frame:  ", frame)
            step_count_total += step_count
    
            if episode % 10 == 0 and episode > 0:
                if step_count_total == 1000:
                    break
                
                for _ in range(10):
                    minibatch = random.sample(replay_buffer, 32)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
    
                print ("Loss :  ", loss)
                print ("Step Total 10 :  ", step_count_total)
                step_count_total = 0
                sess.run(copy_ops)
        
        print ("Simulation--------")
        bot_play(mainDQN)

In [None]:
main()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2018-01-16 18:56:04,954] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py:107: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


episode: 0   steps: 1
episode: 1   steps: 2
episode: 2   steps: 1
episode: 3   steps: 2
episode: 4   steps: 0
episode: 5   steps: 2
episode: 6   steps: 0
episode: 7   steps: 2
episode: 8   steps: 3
episode: 9   steps: 0
episode: 10   steps: 0
('Loss :  ', 42.596924)
('Step Total 10 :  ', 13)
episode: 11   steps: 2
episode: 12   steps: 0
episode: 13   steps: 1
episode: 14   steps: 0
episode: 15   steps: 4
episode: 16   steps: 0
episode: 17   steps: 2
episode: 18   steps: 0
episode: 19   steps: 1
episode: 20   steps: 0
('Loss :  ', 35.571014)
('Step Total 10 :  ', 10)
episode: 21   steps: 0
episode: 22   steps: 2
episode: 23   steps: 0
episode: 24   steps: 0
episode: 25   steps: 0
episode: 26   steps: 0
episode: 27   steps: 0
episode: 28   steps: 0
episode: 29   steps: 0
episode: 30   steps: 2
('Loss :  ', 15.599367)
('Step Total 10 :  ', 4)
episode: 31   steps: 2
episode: 32   steps: 0
episode: 33   steps: 2
episode: 34   steps: 0
episode: 35   steps: 2
episode: 36   steps: 2
episode: 3

episode: 288   steps: 4
episode: 289   steps: 5
episode: 290   steps: 4
('Loss :  ', 0.22432059)
('Step Total 10 :  ', 40)
episode: 291   steps: 2
episode: 292   steps: 2
episode: 293   steps: 2
episode: 294   steps: 3
episode: 295   steps: 2
episode: 296   steps: 2
episode: 297   steps: 2
episode: 298   steps: 2
episode: 299   steps: 2
episode: 300   steps: 0
('Loss :  ', 0.26373342)
('Step Total 10 :  ', 19)
episode: 301   steps: 3
episode: 302   steps: 3
episode: 303   steps: 4
episode: 304   steps: 3
episode: 305   steps: 1
episode: 306   steps: 3
episode: 307   steps: 3
episode: 308   steps: 2
episode: 309   steps: 3
episode: 310   steps: 3
('Loss :  ', 0.06777519)
('Step Total 10 :  ', 28)
episode: 311   steps: 2
episode: 312   steps: 0
episode: 313   steps: 0
episode: 314   steps: 0
episode: 315   steps: 0
episode: 316   steps: 0
episode: 317   steps: 0
episode: 318   steps: 0
episode: 319   steps: 0
episode: 320   steps: 0
('Loss :  ', 2.183435)
('Step Total 10 :  ', 2)
episode

('Loss :  ', 0.0511049)
('Step Total 10 :  ', 13)
episode: 571   steps: 0
episode: 572   steps: 0
episode: 573   steps: 0
episode: 574   steps: 0
episode: 575   steps: 0
episode: 576   steps: 0
episode: 577   steps: 0
episode: 578   steps: 0
episode: 579   steps: 0
episode: 580   steps: 0
('Loss :  ', 0.080658294)
('Step Total 10 :  ', 0)
episode: 581   steps: 0
episode: 582   steps: 0
episode: 583   steps: 0
episode: 584   steps: 0
episode: 585   steps: 0
episode: 586   steps: 0
episode: 587   steps: 0
episode: 588   steps: 0
episode: 589   steps: 0
episode: 590   steps: 0
('Loss :  ', 0.07954642)
('Step Total 10 :  ', 0)
episode: 591   steps: 0
episode: 592   steps: 1
episode: 593   steps: 1
episode: 594   steps: 0
episode: 595   steps: 0
episode: 596   steps: 0
episode: 597   steps: 0
episode: 598   steps: 0
episode: 599   steps: 0
episode: 600   steps: 0
('Loss :  ', 0.027781734)
('Step Total 10 :  ', 2)
episode: 601   steps: 1
episode: 602   steps: 1
episode: 603   steps: 2
episod