#### Q Learning

Here is a maze, with the entrance marked __E__ (coordinates 0,0) and the exit marked __X__ (coordinates 3,3); the spaces containing 1's are blocked, while the spaces containing 0's are open:
<pre>
[E 0 0 1]
[0 1 0 0]
[0 0 1 0]
[1 1 0 X]
</pre>

The goal is to traverse the maze by moving North, South, East, or West using a __Q Learning__ algorithm.

Note that there is one path through the maze:
<pre>
[E . . 1]
[0 1 . .]
[0 0 1 .]
[1 1 0 X]
</pre>

Let's create a class for the maze:

In [None]:
import numpy as np

class Maze():
    """A 4x4 grid maze in which 0 marks an open space and 1 a blocked one."""

    def __init__(self, random=False):
        """Create the maze grid and store it in ``self.maze``.

        random -- when false (the default) the fixed layout is used; when
                  true every cell is drawn at random from {0, 1}, except
                  that the entrance (0,0) and the exit (3,3) are forced open.
        """
        if not random:
            # the fixed layout, one inner list per maze row
            self.maze = np.array([
                [0, 0, 0, 1],
                [0, 1, 0, 0],
                [0, 0, 1, 0],
                [1, 1, 0, 0],
            ])
            return

        grid = np.random.randint(0, 2, size=(4, 4))
        grid[0][0] = 0  # keep the entrance open
        grid[3][3] = 0  # keep the exit open
        self.maze = grid
          
# Show the fixed layout first...
m = Maze()
print('== ORIGINAL ==', m.maze, sep='\n')
# ...then a randomly generated one.
m = Maze(random=True)
print('== RANDOM ====', m.maze, sep='\n')
print('note: the random maze may have no solution')

Let's set up the maze to act as an environment for machine learning...

In [1]:
import numpy as np

class Maze():
    """A 4x4 maze that can be stepped through like a learning environment.

    Attributes set by reset():
      maze   -- 4x4 grid, 0 = open space, non-zero = blocked space
      player -- current (row, column) position as a NumPy array
      path   -- 4x4 grid holding the step number at which each cell was
                entered (0 = never entered; the start cell holds 1)
      i      -- running step counter (1 right after a reset)
    """

    # action codes; each one indexes into `offset`
    N,S,E,W = 0,1,2,3
    # (row, column) deltas for North, South, East, West
    offset = [(-1,0),(1,0),(0,1),(0,-1)]

    def __init__(self, random=False):
        """Create the maze; all of the set-up work is done by reset()."""
        self.reset(random)

    def reset(self, random=False):
        """(Re)build the maze, put the player back at (0,0), clear the path.

        random -- when true, draw a fresh random grid (entrance and exit are
                  forced open); otherwise use the fixed layout.

        Returns the starting position.
        """
        if random:
            grid = np.random.randint(0, 2, size=(4, 4))
            grid[0][0] = 0  # the entrance must be open...
            grid[3][3] = 0  # ...and so must the exit
        else:
            grid = np.array([
                [0, 0, 0, 1],
                [0, 1, 0, 0],
                [0, 0, 1, 0],
                [1, 1, 0, 0],
            ])
        self.maze = grid
        self.player = np.array([0, 0])
        self.i = 1
        self.path = np.zeros([4, 4])
        self.path[0][0] = self.i  # the start cell counts as step 1
        return self.player

    def step(self, action):
        """Move the player one cell and report what happened.

        action -- one of Maze.N, Maze.S, Maze.E, Maze.W

        Returns (observation, reward, done):
          observation -- the player's new position (it lies outside the grid
                         after an out-of-bounds move)
          reward      -- +1 = reached the exit, -1 = failure, 0 = no outcome
          done        -- True when a terminal state was reached, else False
        """
        self.i += 1
        self.player = np.add(self.player, Maze.offset[action])
        row, col = self.player

        # left the grid: nothing to record on the path
        if not (0 <= row <= 3 and 0 <= col <= 3):
            return self.player, -1, True

        # the move is recorded even when it lands on a blocked space
        self.path[row][col] = self.i
        if self.maze[row][col] != 0:
            return self.player, -1, True   # moved onto a blocked space
        if row == 3 and col == 3:
            return self.player, 1, True    # reached the exit
        return self.player, 0, False       # still on an open space

    def sample(self):
        """Return a random action, uniformly distributed over the four actions."""
        return np.random.randint(4)

# First demo: a single move West from the entrance leaves the grid.
print('Here is what happens if you go out of bounds:')
e = Maze()
observation, reward, done = e.step(Maze.W)
print(observation, reward, done)
print('(the coordinates are off the maze,\n the reward is negative,\n and the episode is done)\n')

# Second demo: replay the winning sequence of moves on a fresh maze,
# printing the (observation, reward, done) tuple after every step.
print('...and here is the only safe path through the maze:')
e = Maze()
for action in (Maze.E, Maze.E, Maze.S, Maze.E, Maze.S, Maze.S):
    print(e.step(action))


Here is what happens if you go out of bounds:
[ 0 -1] -1 True
(the coordinates are off the maze,
 the reward is negative,
 and the episode is done)

...and here is the only safe path through the maze:
(array([0, 1]), 0, False)
(array([0, 2]), 0, False)
(array([1, 2]), 0, False)
(array([1, 3]), 0, False)
(array([2, 3]), 0, False)
(array([3, 3]), 1, True)


In [2]:
class QPlayer():
    """Table-based learner for a 4x4 maze environment.

    ``q_table`` has one row per grid cell (row * 4 + column) and one column
    per action (N, S, E, W).

    The environment passed to run() must provide reset(), step(action) and
    sample().
    """

    # default probability of replacing the chosen action with a random one
    EXPLORE = 0.01

    N,S,E,W = 0,1,2,3

    def __init__(self, explore=EXPLORE):
        """Start with an all-zero table.

        explore -- probability, per step, of taking a random action instead
                   of the one with the highest table value.
        """
        super().__init__()
        self.explore = explore
        self.q_table = np.zeros([4*4,4])

    @staticmethod
    def _state_index(position):
        """Flatten a (row, column) position into a q_table row index."""
        return position[0] * 4 + position[1]

    def run(self, environment):
        """Play one episode, updating the table along the way.

        The environment is reset first, then stepped until it reports a
        terminal state.

        Returns the reward of the final step.
        """
        position = environment.reset()
        while True:
            state = self._state_index(position)
            action = np.argmax(self.q_table[state])
            # argmax yields 0 for an all-zero row, so index 0 is treated as
            # "no preference yet" and replaced by a random action.  The random
            # draw for exploration is only made when the index is non-zero.
            # NOTE(review): a row whose best entry really is action 0 (North)
            # is also randomised by this test.
            if action == 0 or np.random.random() < self.explore:
                action = environment.sample()
            position, reward, done = environment.step(action)
            if done:
                # a terminal step overwrites the entry with the final reward
                self.q_table[state][action] = reward
                return reward
            next_state = self._state_index(position)
            # NOTE(review): the entry is incremented (no learning rate or
            # discount factor), so it keeps growing each time the step is
            # repeated.
            self.q_table[state][action] += reward + np.amax(self.q_table[next_state])

In [5]:
q = QPlayer()
e = Maze()
complete = 0  # number of episodes that ended with a positive reward
for n in range(100000):
    if q.run(e) > 0:
        complete += 1
    # stop early, showing the table, once its entries add up to more than 100
    if np.sum(q.q_table) > 100:
        print(q.q_table)
        break
print(complete)


[[-1.  0. 21. -1.]
 [-1. -1. 35.  0.]
 [-1. 35. -1.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0. -1. -1.]
 [ 0.  0.  0.  0.]
 [ 0. -1. 21. -1.]
 [-1.  7.  0.  0.]
 [ 0. -1.  0. -1.]
 [-1. -1. -1.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
8


In [6]:
# e.path holds, for every cell, the step number at which the player entered it
# since the last reset (0 = never entered; the start cell holds 1).
print(e.path)

[[1. 2. 3. 0.]
 [0. 0. 4. 5.]
 [0. 0. 0. 6.]
 [0. 0. 0. 7.]]


In [7]:
# Play one more episode with the trained table; run() returns the final reward.
q.run(e)

1