In [1]:
import numpy as np
import pandas as pd
import random
import copy
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam, RMSprop
from collections import deque
from keras import backend as K

Using TensorFlow backend.


### 1.Load Maze

In [38]:
MAZE_DICT = {'S':0,' ':1,'#':2,'A':3,'B':4,'C':5,'D':6,'E':7,'G':8,'o':9}
INV_MAZE_DICT = dict((v, k) for k, v in MAZE_DICT.items()) 
def read_maze(maze_path):
        f = open(maze_path)
        line = f.read()
        f.close()

        maze = line.split('\n')
        maze = maze[:len(maze)-1]
        maze_num = []

        for i in maze:
            maze_num.append([MAZE_DICT[j] for j in i ])

        return np.array(maze_num)

maze_field = read_maze('./data/maze1.txt')


### 2.迷路の構造

In [39]:
class Maze_Function(object):
    def __init__(self,maze):
        self.maze = maze
        self.start_point = self.get_location(np.where(maze==0))
        self.goal_point = self.get_location(np.where(maze==8))
        self.item_points = self.get_location(np.where(maze==9))
        self.enemy_a = self.get_location(np.where(maze==3))
        self.enemy_b = self.get_location(np.where(maze==4))
        self.enemy_c = self.get_location(np.where(maze==5))
        self.enemy_d = self.get_location(np.where(maze==6))
        self.enemy_e = self.get_location(np.where(maze==7))
        self.item_num = len(self.item_points)
        self.movable_vec = [[1,0],[-1,0],[0,1],[0,-1],[0,0]]
        
    def display(self):
        for i in range(self.maze.shape[0]):
            line = ""
            for j in range(self.maze.shape[1]):
                line += INV_MAZE_DICT[self.maze[i,j]]
            print(line)
            
    def get_location(self,np_location):
        y_ = np_location[0]
        x_ = np_location[1]
        location = []
        for i in range(len(y_)):
            location.append([y_[i],x_[i]])
        return location
    
    def get_actions(self,state):
        movables = []

        for v in self.movable_vec:
            x = state[1] + v[1]
            y = state[0] + v[0]
            #print(v)
            #print([y,x])
            
            #print(self.maze[y,x])

            if not(0 < x <= self.maze.shape[1] and
                  0  < y <= self.maze.shape[0] and
                  self.maze[y,x] != 2):
                continue
            movables.append(v)
            #print("ok")

        return movables
        
    
    
    def enemy_action(self,state,count):
        
        if len(self.enemy_a) != 0:
            for i in range(len(self.enemy_a)):
                move_select = self.get_actions(self.enemy_a[i])
                if state[0] > self.enemy_a[i][0] and [1,0] in move_select:
                    self.enemy_a[i][0] += 1
                    continue
                elif state[0] < self.enemy_a[i][0] and [-1,0] in move_select:
                    self.enemy_a[i][0] -= 1
                    continue
                elif state[1] > self.enemy_a[i][1] and [0,1] in move_select:
                    self.enemy_a[i][1] += 1
                    continue
                elif state[1] < self.enemy_a[i][1] and [0,-1] in move_select:
                    self.enemy_a[i][1] -= 1
                    continue
                else:
                    if [-1,0] in move_select:
                        self.enemy_a[i][0] -= 1
                        continue
                    elif [0,-1] in move_select:
                        self.enemy_a[i][1] -= 1
                        continue
                    elif [1,0] in move_select:
                        self.enemy_a[i][0] += 1
                        continue
                    elif [0,1] in move_select:
                        self.enemy_a[i][0] += 1
        
        if len(self.enemy_b) != 0:
            for i in range(len(self.enemy_b)):
                move_select = self.get_actions(self.enemy_b[i])
                if state[1] > self.enemy_b[i][1] and [0,1] in move_select:
                    self.enemy_a[i][1] += 1
                    continue
                elif state[1] < self.enemy_b[i][1] and [0,-1] in move_select:
                    self.enemy_a[i][1] -= 1
                    continue
                elif state[0] > self.enemy_b[i][1] and [1,0] in move_select:
                    self.enemy_a[i][0] += 1
                    continue
                elif state[0] < self.enemy_b[i][1] and [-1,0] in move_select:
                    self.enemy_a[i][0] -= 1
                    continue
                else:
                    if [1,0] in move_select:
                        self.enemy_b[i][0] += 1
                        continue
                    elif [0,-1] in move_select:
                        self.enemy_b[i][1] -= 1
                        continue
                    elif [-1,0] in move_select:
                        self.enemy_b[i][0] -= 1
                        continue
                    elif [0,1] in move_select:
                        self.enemy_b[i][0] += 1  
                    
        if len(self.enemy_c) != 0:
            for i in range(len(self.enemy_c)):
                move_select = self.get_actions(self.enemy_c[i])
                if [0,-1] in move_select:
                    self.enemy_c[i][1] -= 1
                    continue
                elif [1,0] in move_select:
                    self.enemy_c[i][0] += 1
                    continue
                elif [0,1] in move_select:
                    self.enemy_c[i][1] += 1
                    continue
                elif [-1,0] in move_select:
                    self.enemy_c[i][0] -= 1           
   
        if len(self.enemy_d) != 0:
            for i in range(len(self.enemy_d)):
                move_select = self.get_actions(self.enemy_d[i])
                if [0,1] in move_select:
                    self.enemy_d[i][1] += 1
                    continue
                elif [1,0] in move_select:
                    self.enemy_d[i][0] += 1
                    continue
                elif [0,-1] in move_select:
                    self.enemy_d[i][1] -= 1
                    continue
                elif [-1,0] in move_select:
                    self.enemy_d[i][0] -= 1 

        if len(self.enemy_e) != 0:
            for i in range(len(self.enemy_e)):
                move_select = self.get_actions(self.enemy_e[i])
                if [0,1] in move_select:
                    self.enemy_e[i][1] += 1
                    continue
                elif [1,0] in move_select:
                    self.enemy_e[i][0] += 1
                    continue
                elif [0,1] in move_select:
                    self.enemy_e[i][1] -= 1
                    continue
                elif [-1,0] in move_select:
                    self.enemy_e[i][0] -= 1
    
    def get_value(self,state,action):
        value = 0
        done = False
       
        enemy_list = [self.enemy_a,self.enemy_b,self.enemy_c,self.enemy_d,self.enemy_e]
        if state in self.item_points:
            self.get_items(state)
            value += 100
        for j in enemy_list:
            if state in j:
                value -= 999
                done = True
                return value,done
        if state in self.goal_point:
            if len(self.item_points) == 0:
                value += 999
                done = True
                #print("#"*15)
                #print("     Goal     ")
                #print("#"*15)
        
        return value,done
            
            
    def get_items(self,state):
        self.item_points.remove(state)
        self.maze[state[0],state[1]] = 1
        
maze = Maze_Function(maze_field)
maze.display()

###########
#o   #   G#
# ##   ## #
# # o#o   #
# # ### # #
#S   o   o#
###########


In [63]:
class DQN_Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=100000)
        self.gamma = 0.9
        self.epsilon = 1.0
        self.e_decay = 0.9999
        self.e_min = 0.01
        self.learning_rate = 0.1
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(128, input_shape=(2,2), activation='tanh'))
        model.add(Flatten())
        model.add(Dense(128, activation='tanh'))
        model.add(Dense(64, activation='tanh'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss="mse", optimizer=RMSprop(lr=self.learning_rate))
        return model

    def remember_memory(self, state, action, reward, next_state, next_movables, done):
        self.memory.append((state, action, reward, next_state, next_movables, done))

    def choose_action(self, state, movables):
        if self.epsilon >= random.random():
            return random.choice(movables)
        else:
            return self.choose_best_action(state, movables)
        
    def choose_best_action(self, state, movables):
        best_actions = []
        max_act_value = -1000
        for action in movables:
            np_action = np.array([[state, action]])
            act_value = self.model.predict(np_action)
            if act_value > max_act_value:
                best_actions = [action,]
                max_act_value = act_value
            elif act_value == max_act_value:
                best_actions.append(action)
        return random.choice(best_actions)

    def replay_experience(self, batch_size):
        batch_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, batch_size)
        X = []
        Y = []
        for i in range(batch_size):
            state, action, reward, next_state, next_movables, done = minibatch[i]
            input_action = [state, action]
            if done:
                target_f = reward
            else:
                next_rewards = []
                for i in next_movables:
                    np_next_s_a = np.array([[next_state, i]])
                    next_rewards.append(self.model.predict(np_next_s_a))
                np_n_r_max = np.amax(np.array(next_rewards))
                target_f = reward + self.gamma * np_n_r_max
            X.append(input_action)
            Y.append(target_f)
        np_X = np.array(X)
        np_Y = np.array([Y]).T
        self.model.fit(np_X, np_Y, epochs=10, verbose=0)
        if self.epsilon > self.e_min:
            self.epsilon *= self.e_decay

In [None]:
state_size = 2
action_size = 2
dqn_agent = DQN_Agent(state_size,action_size)

episodes = 20000
times = 500

for e in range(episodes):
    maze_field = read_maze('./data/maze1.txt')
    maze = Maze_Function(maze_field)
    #print(maze.maze)
    state = maze.start_point[0]
    score = 0
    remain_S = 1
    for time in range(times):
        next_state = state.copy()
        moveables = maze.get_actions(state)
        action = dqn_agent.choose_action(state,moveables)
        next_state[0] += action[0]
        next_state[1] += action[1]
        #print(next_state)
        #print(state)
        #print(maze.maze)

        maze.enemy_action(state=next_state,count=time+1)

        """
        print(maze.maze)
        print("#"*10)
        print(state)
        print("#"*10)
        print("#"*10)
        print(action)
        print("#"*10)
        """
        reward,done = maze.get_value(state,action)
        score += reward
        next_movables = maze.get_actions(next_state)
        dqn_agent.remember_memory(state,action,reward,next_state,next_movables,done)
        maze.maze[state[0],state[1]] = remain_S
        remain_S = maze.maze[next_state[0],next_state[1]]
        if remain_S == 9:
            remain_S = 1
        maze.maze[next_state[0],next_state[1]] = 0
        if done or time == (times - 1):
            if e % 500 == 0:
                print("episode: {}/{}, score: {}, e{:.2} \t {}".format(e,episodes,score,dqn_agent.epsilon,time))
                #print(state)
                #print(maze.enemy_a)
                #print(maze.enemy_b)
                #print(maze.item_points)
                #maze.display()
            break
        state = next_state
    dqn_agent.replay_experience(64)

episode: 0/20000, score: 400, e1.0 	 499
episode: 500/20000, score: 400, e0.95 	 499
episode: 1000/20000, score: 400, e0.9 	 499
episode: 1500/20000, score: 500, e0.86 	 499
episode: 2000/20000, score: 1499, e0.82 	 495
episode: 2500/20000, score: 1499, e0.78 	 377
episode: 3000/20000, score: 1499, e0.74 	 249
episode: 3500/20000, score: 400, e0.7 	 499
episode: 4000/20000, score: 500, e0.67 	 499
episode: 4500/20000, score: 400, e0.64 	 499
episode: 5000/20000, score: 400, e0.61 	 499
episode: 5500/20000, score: 400, e0.58 	 499
episode: 6000/20000, score: 400, e0.55 	 499
episode: 6500/20000, score: 500, e0.52 	 499
episode: 7000/20000, score: 400, e0.5 	 499
episode: 7500/20000, score: 1499, e0.47 	 441
episode: 8000/20000, score: 1499, e0.45 	 288
episode: 8500/20000, score: 500, e0.43 	 499
episode: 9000/20000, score: 500, e0.41 	 499
episode: 9500/20000, score: 1499, e0.39 	 466
episode: 10000/20000, score: 1499, e0.37 	 390
episode: 10500/20000, score: 500, e0.35 	 499
episode: 