#### Q Learning

Here is a maze, with the entrance marked __e__ (coordinates 0,0) and the exit marked __x__ (coordinates 3,3); the spaces containing 1's are blocked, while the spaces containing 0's are open:
<pre>
[e 0 0 1]
[0 1 0 0]
[0 0 1 0]
[1 1 0 x]
</pre>

The goal is to traverse the maze by moving North, South, East,or West using a __Q Learning__ alogrithm.

Note that there is one path through the maze:
<pre>
[e . . 1]
[0 1 . .]
[0 0 1 .]
[1 1 0 x]
</pre>

Let's create a class for the maze:

In [8]:
import numpy as np

class Maze():
    
    def __init__(self, random=False):
        if random:
            self.maze = np.random.randint(0,2,size=(4,4))
            self.maze[0][0] = 0  # no obstacles allowed at starting point...
            self.maze[3][3] = 0  # ...or at ending point.
        else:
            self.maze = np.array([[0,0,0,1],[0,1,0,0],[0,0,1,0],[1,1,0,0]])  # the initial maze is hardcoded
          
m = Maze()
print('== ORIGINAL ==')
print(m.maze)  
m = Maze(random=True)
print('== RANDOM ====')
print(m.maze)
print('note: the random maze may have no solution')

== ORIGINAL ==
[[0 0 0 1]
 [0 1 0 0]
 [0 0 1 0]
 [1 1 0 0]]
== RANDOM ====
[[0 1 1 0]
 [0 0 1 0]
 [1 0 1 1]
 [0 0 1 0]]
note: the random maze may have no solution


Let's set up the maze to act as an environment for machine learning...

In [37]:
import numpy as np

class Maze():
    
    # offsets to move North, South, East, or West
    N,S,E,W = (-1,0),(1,0),(0,1),(0,-1)
    
    def __init__(self, random=False):   
        self.reset(random)
    
    # moved code from __init__ to here, to allow re-use of Maze instance
    def reset(self, random=False):
        if random:
            self.maze = np.random.randint(0,2,size=(4,4))
            self.maze[0][0] = 0  # no obstacles allowed at starting point...
            self.maze[3][3] = 0  # ...or at ending point.
        else:
            self.maze = np.array([[0,0,0,1],[0,1,0,0],[0,0,1,0],[1,1,0,0]])  # the initial maze is hardcoded
        self.player = np.array([0,0])
        return self.player
    
    # action should be one of: Maze.N, Maze.S, Maze.E, Maze.W
    # returns reward, done
    # rewards are: +1 = success, -1 = failure, 0 = no outcome
    # done = True if a terminal state is reached, otherwise False
    def step(self, action):
        self.player = np.add(self.player, action)
        if max(self.player) > 3 or min(self.player) < 0:      # out of bounds
            return self.player, -1, True
        elif self.maze[self.player[0]][self.player[1]] != 0:  # moved onto a blocked space
            return self.player, -1, True
        elif np.array_equal(self.player, (3,3)):              # reached the exit
            return self.player, 1, True
        else:
            return self.player, 0, False                      # no outcome (player is on an open space)
    
    # return a random action (equally distributed across the action space)
    def sample(self):
        n = np.random.randint(4)
        if n == 0:
            return Maze.N
        elif n == 1:
            return Maze.S
        elif n == 2:
            return Maze.E
        elif n == 3:
            return Maze.W

print('Here is what happens if you go out of bounds:')
e = Maze()
observation, reward, done = e.step(Maze.W)
print(observation, reward, done)

print('...and here is the only safe path through the maze:')
e = Maze()
print(e.step(Maze.E))
print(e.step(Maze.E))
print(e.step(Maze.S))
print(e.step(Maze.E))
print(e.step(Maze.S))
print(e.step(Maze.S))


Here is what happens if you go out of bounds:
[ 0 -1] -1 True
...and here is the only safe path through the maze:
(array([0, 1]), 0, False)
(array([0, 2]), 0, False)
(array([1, 2]), 0, False)
(array([1, 3]), 0, False)
(array([2, 3]), 0, False)
(array([3, 3]), 1, True)


In [38]:
class QPlayer():
    
    EXPLORE = 0.05
    ALPHA = 0.10
    GAMMA = 0.90
    
    def __init__(self, explore=EXPLORE, alpha=ALPHA, gamma=GAMMA):
        super().__init__()
        self.q_table = np.zeros([4*4,4])
        self.explore = explore
        self.a = alpha
        self.g = gamma
        
    def run(self, environment):
        observation = environment.reset()
        done = False
        while not done:
            state = observation[0] * 4 + observation[1]
            action = np.argmax(self.q_table[state])
            observation, reward, done = environment.step(action)
            future_state = observation[0] * 4 + observation[1]
            print(self.q_table)
            print(future_state)
            self.q_table[state][action] += ((1-self.a) * self.q_table[state] 
                                            + self.a * (reward + np.amax(self.q_table[future_state])))

In [36]:
q = QPlayer()
e = Maze()
for n in range(100):
    q.run(e)
    print(np.sum(q.q_table))

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
0


ValueError: setting an array element with a sequence.