In [1]:
import numpy as np
import random
import datetime
import time
import tensorflow as tf
from collections import deque
from mlagents.envs import UnityEnvironment

In [2]:
# ---------------------------------------------------------------------------
# Network input / output dimensions
# ---------------------------------------------------------------------------
state_size = [84, 84, 3]   # fed to the input placeholder as [None, 84, 84, 3]
action_size = 8            # width of the Q-value output layer

# ---------------------------------------------------------------------------
# Run switches
# ---------------------------------------------------------------------------
# NOTE: when load_model is True, DQNAgent.__init__ reads `load_path`, which is
# only present below as a commented-out line; define it before enabling this.
load_model, train_mode = False, True

# ---------------------------------------------------------------------------
# Learning hyper-parameters
# ---------------------------------------------------------------------------
batch_size = 32            # transitions sampled from replay memory per update
mem_maxlen = 50000         # maxlen of the replay-memory deque
discount_factor = 0.9      # multiplier on the max next-state Q value
learning_rate = 0.00025    # passed to the Adam optimizer

# ---------------------------------------------------------------------------
# Schedule (episodes unless stated otherwise)
# ---------------------------------------------------------------------------
run_episode = 15000        # the main loop runs run_episode + test_episode episodes
test_episode = 1000

start_train_episode = 1000  # train_model is only called once episode exceeds this

target_update_step = 10000  # target network sync period, in environment steps
print_interval = 100        # episodes between console / summary reports
save_interval = 5000        # episodes between checkpoints

# ---------------------------------------------------------------------------
# Exploration: starting value and floor of epsilon
# ---------------------------------------------------------------------------
epsilon_init, epsilon_min = 1.0, 0.1

# ---------------------------------------------------------------------------
# Unity environment reset parameters
# ---------------------------------------------------------------------------
env_config = dict(gridSize=7, numGoals=2, numBoxes=2, numObstacles=1)

# Timestamp used to give each run its own save directory.
date_time = "{:%Y%m%d-%H-%M-%S}".format(datetime.datetime.now())

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
game = "single-agent"
env_name = "../env/{}".format(game)

# Checkpoints and TensorBoard summaries are written under save_path.
save_path = "../saved_models/{}/{}_DQN".format(game, date_time)
# load_path = "../saved_models/" + game

In [3]:
class Model():
    """Convolutional Q-network built on the TensorFlow 1.x graph API.

    All layers are created under the variable scope ``model_name`` so that the
    trainable variables of one instance can be collected separately from
    those of another instance.

    Attributes:
        input: float32 placeholder of shape [None] + state_size.
        Q_Out: dense output with ``action_size`` units and no activation.
        predict: argmax of ``Q_Out`` along axis 1.
        target_Q: float32 placeholder of shape [None, action_size].
        loss: Huber loss between ``target_Q`` and ``Q_Out``.
        UpdateModel: Adam optimizer step minimizing ``loss``.
        trainable_var: trainable variables collected under ``model_name``.
    """

    def __init__(self, model_name):
        height, width, channels = state_size[0], state_size[1], state_size[2]
        self.input = tf.placeholder(dtype=tf.float32,
                                    shape=[None, height, width, channels])

        # Centre and scale raw pixel values: 0..255 maps onto -1..1.
        half_range = 255.0 / 2
        self.input_normalize = (self.input - half_range) / half_range

        # Three convolution layers followed by two fully connected layers.
        shared_conv_args = dict(activation=tf.nn.relu, padding="SAME")
        with tf.variable_scope(name_or_scope=model_name):
            self.conv1 = tf.layers.conv2d(inputs=self.input_normalize,
                                          filters=32,
                                          kernel_size=[8, 8],
                                          strides=[4, 4],
                                          **shared_conv_args)
            self.conv2 = tf.layers.conv2d(inputs=self.conv1,
                                          filters=64,
                                          kernel_size=[4, 4],
                                          strides=[2, 2],
                                          **shared_conv_args)
            self.conv3 = tf.layers.conv2d(inputs=self.conv2,
                                          filters=64,
                                          kernel_size=[3, 3],
                                          strides=[1, 1],
                                          **shared_conv_args)

            self.flat = tf.layers.flatten(self.conv3)

            self.fc1 = tf.layers.dense(self.flat, 512, activation=tf.nn.relu)
            self.Q_Out = tf.layers.dense(self.fc1, action_size, activation=None)

        # Index of the largest Q value for every row of the batch.
        self.predict = tf.argmax(self.Q_Out, 1)

        # Regression targets, one value per action.
        self.target_Q = tf.placeholder(dtype=tf.float32,
                                       shape=[None, action_size])

        # Loss and the optimizer step that minimizes it.
        self.loss = tf.losses.huber_loss(self.target_Q, self.Q_Out)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        self.UpdateModel = optimizer.minimize(self.loss)

        # Trainable variables that live under this model's scope.
        self.trainable_var = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, model_name)

In [4]:
class DQNAgent():
    """DQN agent holding an online network, a target network and replay memory.

    The agent owns the TensorFlow session, the checkpoint saver and the
    TensorBoard summary writer, and exposes epsilon-greedy action selection,
    replay-memory insertion, a single training step and target-network sync.
    """

    # Amount subtracted from epsilon each time train_model is called with
    # done=True (while epsilon is still above epsilon_min).
    epsilon_decay = 0.0004

    def __init__(self):
        self.model = Model("Q")  # online network (the one being trained)
        self.target_model = Model("target")  # target network

        # Build the target-sync assign ops exactly once. Creating them inside
        # update_target() would add new nodes to the graph on every call and
        # make the graph grow for the whole duration of training.
        self.update_target_ops = [
            target_var.assign(online_var)
            for online_var, target_var in zip(self.model.trainable_var,
                                              self.target_model.trainable_var)
        ]

        self.memory = deque(maxlen=mem_maxlen)  # replay memory

        self.sess = tf.Session()
        self.init = tf.global_variables_initializer()
        self.sess.run(self.init)

        self.epsilon = epsilon_init

        self.Saver = tf.train.Saver()
        self.Summary, self.Merge = self.Make_Summary()

        # Start with the target network equal to the online network.
        self.update_target()

        if load_model:
            # NOTE(review): `load_path` is not defined by the configuration
            # cell (only a commented-out line exists there); it must be set to
            # a checkpoint prefix before load_model is enabled.
            self.Saver.restore(self.sess, load_path)

    def get_action(self, state):
        """Select an action with an epsilon-greedy policy.

        Args:
            state: batch fed to the online network's input placeholder; the
                greedy branch expects it to produce exactly one prediction.

        Returns:
            int: a uniformly random action index with probability epsilon,
            otherwise the argmax action of the online network.
        """
        if self.epsilon > np.random.rand():
            return np.random.randint(0, action_size)

        predict = self.sess.run(self.model.predict,
                                feed_dict={self.model.input: state})
        # ndarray.item() replaces the deprecated/removed np.asscalar().
        return predict.item()

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition in replay memory.

        Only the first element along the leading axis of ``state`` and
        ``next_state`` is stored.
        """
        self.memory.append((state[0], action, reward, next_state[0], done))

    def save_model(self):
        """Write a checkpoint of the session under ``save_path``/model/."""
        self.Saver.save(self.sess, save_path + "/model/model")

    def train_model(self, done):
        """Run one optimization step on a random mini-batch.

        Args:
            done: whether the current episode just ended; if so epsilon is
                decayed by ``epsilon_decay`` but never below ``epsilon_min``.

        Returns:
            The loss value of the training step.

        Raises:
            ValueError: from ``random.sample`` if replay memory holds fewer
                than ``batch_size`` transitions.
        """
        # Decay epsilon once per finished episode, clamped at the floor so it
        # cannot overshoot below epsilon_min.
        if done and self.epsilon > epsilon_min:
            self.epsilon = max(epsilon_min, self.epsilon - self.epsilon_decay)

        # Mini-batch sampling for training.
        mini_batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        states = np.stack(states)
        next_states = np.stack(next_states)
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # Current Q estimates (used as the regression target for the actions
        # that were not taken) and target-network estimates for next states.
        target = self.sess.run(self.model.Q_Out,
                               feed_dict={self.model.input: states})
        target_val = self.sess.run(self.target_model.Q_Out,
                                   feed_dict={self.target_model.input: next_states})

        # Terminal transitions use the reward alone; the others bootstrap
        # from the best next-state value of the target network.
        max_next_q = np.amax(target_val, axis=1)
        td_target = np.where(dones, rewards,
                             rewards + discount_factor * max_next_q)
        target[np.arange(len(mini_batch)), actions] = td_target

        # Do training and calculate loss.
        _, loss = self.sess.run([self.model.UpdateModel, self.model.loss],
                                feed_dict={self.model.input: states,
                                           self.model.target_Q: target})
        return loss

    def update_target(self):
        """Copy the online network's trainable variables into the target network."""
        self.sess.run(self.update_target_ops)

    def Make_Summary(self):
        """Create the TensorBoard writer and the merged loss/reward summary op.

        Returns:
            tuple: (FileWriter logging to ``save_path``, merged summary op).
        """
        self.summary_loss = tf.placeholder(dtype=tf.float32)
        self.summary_reward = tf.placeholder(dtype=tf.float32)
        tf.summary.scalar("loss", self.summary_loss)
        tf.summary.scalar("reward", self.summary_reward)
        Summary = tf.summary.FileWriter(logdir=save_path, graph=self.sess.graph)
        Merge = tf.summary.merge_all()

        return Summary, Merge

    def Write_Summray(self, reward, loss, episode):
        """Write the given reward and loss scalars to TensorBoard at ``episode``."""
        self.Summary.add_summary(
            self.sess.run(self.Merge, feed_dict={self.summary_loss: loss,
                                                 self.summary_reward: reward}), episode)

In [None]:
if __name__ == '__main__':
    # Train for `run_episode` episodes, then evaluate for `test_episode`
    # episodes, logging to the console / TensorBoard and checkpointing
    # periodically.

    # create unity environment
    env = UnityEnvironment(file_name=env_name)

    # try/finally guarantees the Unity process is shut down even if training
    # is interrupted or raises.
    try:
        # set unity brain
        default_brain = env.brain_names[0]
        brain = env.brains[default_brain]

        # create agent
        agent = DQNAgent()

        step = 0
        rewards = []
        losses = []

        # environment setting
        env_info = env.reset(train_mode=train_mode, config=env_config)[default_brain]

        # run episodes
        for episode in range(run_episode + test_episode):
            # Episodes 0 .. run_episode-1 are training episodes; everything
            # from index run_episode onwards is evaluation (`>=`, not `>`,
            # so exactly run_episode episodes are trained).
            if episode >= run_episode:
                train_mode = False
                # Fixed exploration rate for evaluation, set before the first
                # test action is chosen.
                agent.epsilon = 0.05
                env_info = env.reset(train_mode=train_mode)[default_brain]

            # init. state, episode_rewards, done
            state = np.uint8(255 * np.array(env_info.visual_observations[0]))
            episode_rewards = 0
            done = False

            # one episode
            while not done:
                step += 1

                # get action and let the agent take the action
                action = agent.get_action(state)
                env_info = env.step(action)[default_brain]

                # get information of the next state
                next_state = np.uint8(255 * np.array(env_info.visual_observations[0]))
                reward = env_info.rewards[0]
                episode_rewards += reward
                done = env_info.local_done[0]

                # add data to replay memory
                if train_mode:
                    agent.append_sample(state, action, reward, next_state, done)
                else:
                    # short pause between steps while evaluating
                    time.sleep(0.01)

                # state update
                state = next_state

                if episode > start_train_episode and train_mode:
                    # training
                    loss = agent.train_model(done)
                    losses.append(loss)

                    # target network update at a certain step
                    if step % (target_update_step) == 0:
                        agent.update_target()

            rewards.append(episode_rewards)

            # print and write episode information
            if episode % print_interval == 0 and episode != 0:
                mean_reward = np.mean(rewards)
                # No losses exist before training starts or during testing;
                # report NaN explicitly instead of calling np.mean on an
                # empty list, which emits RuntimeWarnings.
                mean_loss = np.mean(losses) if losses else float("nan")
                print("step: {} / episode: {} / reward: {:.2f} / loss: {:.4f} / epsilon: {:.3f}".format
                      (step, episode, mean_reward, mean_loss, agent.epsilon))
                agent.Write_Summray(mean_reward, mean_loss, episode)
                rewards = []
                losses = []

            # save network model
            if episode % save_interval == 0 and episode != 0:
                agent.save_model()
                print("Save Model {}".format(episode))
    finally:
        env.close()


INFO:mlagents.envs:
'SokobanAcademy' started successfully!
Unity Academy name: SokobanAcademy
        Number of Brains: 1
        Number of Training Brains : 1
        Reset Parameters :
		numBoxes -> 2.0
		numGoals -> 2.0
		numObstacles -> 1.0
		gridSize -> 7.0
Unity brain name: SokobanLearning
        Number of Visual Observations (per agent): 1
        Vector Observation space size (per agent): 0
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): [8]
        Vector Action descriptions: 
INFO:mlagents.envs:Academy reset with parameters: gridSize -> 7, numGoals -> 2, numBoxes -> 2, numObstacles -> 1
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


step: 7229 / episode: 100 / reward: -1.11 / loss: nan / epsilon: 1.000
step: 15292 / episode: 200 / reward: -1.27 / loss: nan / epsilon: 1.000
step: 20187 / episode: 300 / reward: -1.13 / loss: nan / epsilon: 1.000
step: 27752 / episode: 400 / reward: -1.06 / loss: nan / epsilon: 1.000
step: 34587 / episode: 500 / reward: -1.18 / loss: nan / epsilon: 1.000
step: 40540 / episode: 600 / reward: -1.20 / loss: nan / epsilon: 1.000
step: 48176 / episode: 700 / reward: -1.00 / loss: nan / epsilon: 1.000
step: 54312 / episode: 800 / reward: -1.06 / loss: nan / epsilon: 1.000
step: 60949 / episode: 900 / reward: -1.19 / loss: nan / epsilon: 1.000
step: 67274 / episode: 1000 / reward: -1.15 / loss: nan / epsilon: 1.000
step: 75799 / episode: 1100 / reward: -1.18 / loss: 0.0009 / epsilon: 0.960
step: 82868 / episode: 1200 / reward: -1.14 / loss: 0.0004 / epsilon: 0.920
step: 90160 / episode: 1300 / reward: -1.06 / loss: 0.0003 / epsilon: 0.880
step: 97095 / episode: 1400 / reward: -0.85 / loss: 