# Reinforcement Learning - Maze

source: https://github.com/gkhayes/maze_reinforcement_learning

See also (to review later): https://samyzaf.com/ML/rl/qmaze.html

# Part 1 - Looking at the Environment

In [1]:
import numpy as np

In [2]:
# Create transition and reward matrices
def create_matrices(maze, reward, penalty_s, penalty_l, prob, verbose=True):
    """Create transition and reward matrices for input into the mdp QLearning
    function

    States are the maze cells numbered row by row: the cell at (row, col) is
    state ``row*c + col`` for a maze with ``c`` columns, and the bottom-right
    cell (the end of the maze) is the last state.

    Args:
    maze:  array. 0-1 numpy array giving the positions of the white cells
           (denoted 1) and the gray cells (denoted 0) in the maze;
    reward: float. Reward for reaching the end of the maze;
    penalty_s: float. Penalty for entering a white cell;
    penalty_l: float. Penalty for entering a gray cell;
    prob: float. Probability of moving in the intended direction. The
          remaining probability is split evenly between the two
          perpendicular directions;
    verbose: bool. If True (default), print the intermediate reward arrays
             and the wall indices.

    Returns:
    T: list. Four (states, states) transition matrices ordered
       [up, left, right, down]; T[a][i, j] is the probability of ending in
       state j after taking action a in state i. A move that would leave
       the grid or enter a gray cell leaves the agent in state i;
    R: array. Reward vector of length states.

    Raises:
    ValueError: if prob is not in the interval [0, 1].
    """
    if not 0 <= prob <= 1:
        raise ValueError("prob must be between 0 and 1, got %r" % (prob,))

    # Work on a float copy so an integer maze cannot truncate the rewards.
    maze = np.asarray(maze, dtype=float)
    r, c = np.shape(maze)
    states = r*c
    p = prob
    q = (1 - prob)*0.5

    # Create reward matrix
    path = maze*penalty_s  # each white cell ( =path) gets a penalty
    walls = (1 - maze)*penalty_l  # each wall cell gets a penalty
    combined = path + walls
    if verbose:
        print('path')
        print(path)
        print('\nwalls')
        print(walls)
        print('\ncombined')
        print(combined)
        print(combined[-1, -1])

    combined[-1, -1] = reward

    R = np.reshape(combined, states)

    # Take the walls from the maze layout itself rather than from
    # `R == penalty_l`: comparing rewards marks every cell as a wall when
    # penalty_s == penalty_l, and marks the goal when reward == penalty_l.
    is_wall = np.reshape(maze, states) == 0
    is_wall[-1] = False  # the end cell carries `reward` and must stay enterable
    wall_ind = np.where(is_wall)[0]
    if verbose:
        print('wall_ind')
        print(wall_ind)

    # Set gives O(1) membership tests inside the per-state loop.
    wall_set = set(wall_ind.tolist())

    # Action indices; they fix the order of the returned list T.
    UP, LEFT, RIGHT, DOWN = range(4)

    def destination(i, move):
        """Return the state reached from i by move, or i if the move is blocked."""
        if move == UP:
            target, off_grid = i - c, (i - c) < 0
        elif move == DOWN:
            target, off_grid = i + c, (i + c) > (states - 1)
        elif move == LEFT:
            target, off_grid = i - 1, i % c == 0
        else:
            target, off_grid = i + 1, i % c == (c - 1)
        if off_grid or target in wall_set:
            return i
        return target

    # Directions the agent may slip into (probability q each) per action.
    slips = {
        UP: (LEFT, RIGHT),
        DOWN: (LEFT, RIGHT),
        LEFT: (UP, DOWN),
        RIGHT: (UP, DOWN),
    }

    # Create transition matrix
    T = [np.zeros((states, states)) for _ in range(4)]
    for i in range(states):
        for action in (UP, LEFT, RIGHT, DOWN):
            T[action][i, destination(i, action)] += p
            for slip in slips[action]:
                T[action][i, destination(i, slip)] += q

    return T, R

In [3]:
# Maze layout: 1 marks a white cell, 0 marks a gray cell
maze = np.array(
    [
        [1, 0, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 1, 0],
        [0, 0, 0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 1],
        [1, 0, 0, 0, 1, 1, 1],
        [1, 0, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 1, 1, 1],
    ],
    dtype=float,
)

# Build the transition and reward matrices for this maze
T, R = create_matrices(
    maze=maze,
    reward=1,
    penalty_s=-0.04,
    penalty_l=-0.75,
    prob=0.8,
)

path
[[-0.04 -0.   -0.04 -0.04 -0.04 -0.04 -0.04]
 [-0.04 -0.04 -0.04 -0.   -0.   -0.04 -0.  ]
 [-0.   -0.   -0.   -0.04 -0.04 -0.04 -0.  ]
 [-0.04 -0.04 -0.04 -0.04 -0.   -0.   -0.04]
 [-0.04 -0.   -0.   -0.   -0.04 -0.04 -0.04]
 [-0.04 -0.   -0.04 -0.04 -0.04 -0.04 -0.04]
 [-0.04 -0.04 -0.04 -0.   -0.04 -0.04 -0.04]]

walls
[[-0.   -0.75 -0.   -0.   -0.   -0.   -0.  ]
 [-0.   -0.   -0.   -0.75 -0.75 -0.   -0.75]
 [-0.75 -0.75 -0.75 -0.   -0.   -0.   -0.75]
 [-0.   -0.   -0.   -0.   -0.75 -0.75 -0.  ]
 [-0.   -0.75 -0.75 -0.75 -0.   -0.   -0.  ]
 [-0.   -0.75 -0.   -0.   -0.   -0.   -0.  ]
 [-0.   -0.   -0.   -0.75 -0.   -0.   -0.  ]]

combined
[[-0.04 -0.75 -0.04 -0.04 -0.04 -0.04 -0.04]
 [-0.04 -0.04 -0.04 -0.75 -0.75 -0.04 -0.75]
 [-0.75 -0.75 -0.75 -0.04 -0.04 -0.04 -0.75]
 [-0.04 -0.04 -0.04 -0.04 -0.75 -0.75 -0.04]
 [-0.04 -0.75 -0.75 -0.75 -0.04 -0.04 -0.04]
 [-0.04 -0.75 -0.04 -0.04 -0.04 -0.04 -0.04]
 [-0.04 -0.04 -0.04 -0.75 -0.04 -0.04 -0.04]]
-0.04
wall_ind
[ 1 10 11 13 14 15 16 20 25 26 29 30 31 36 45]

In [4]:
R

array([-0.04, -0.75, -0.04, -0.04, -0.04, -0.04, -0.04, -0.04, -0.04,
       -0.04, -0.75, -0.75, -0.04, -0.75, -0.75, -0.75, -0.75, -0.04,
       -0.04, -0.04, -0.75, -0.04, -0.04, -0.04, -0.04, -0.75, -0.75,
       -0.04, -0.04, -0.75, -0.75, -0.75, -0.04, -0.04, -0.04, -0.04,
       -0.75, -0.04, -0.04, -0.04, -0.04, -0.04, -0.04, -0.04, -0.04,
       -0.75, -0.04, -0.04,  1.  ])

In [5]:
T

[array([[1. , 0. , 0. , ..., 0. , 0. , 0. ],
        [0.1, 0.8, 0.1, ..., 0. , 0. , 0. ],
        [0. , 0. , 0.9, ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 0.1, 0.1, 0. ],
        [0. , 0. , 0. , ..., 0.1, 0. , 0.1],
        [0. , 0. , 0. , ..., 0. , 0.1, 0.1]]),
 array([[0.9, 0. , 0. , ..., 0. , 0. , 0. ],
        [0.8, 0.1, 0. , ..., 0. , 0. , 0. ],
        [0. , 0. , 0.9, ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 0.9, 0. , 0. ],
        [0. , 0. , 0. , ..., 0.8, 0.1, 0. ],
        [0. , 0. , 0. , ..., 0. , 0.8, 0.1]]),
 array([[0.9, 0. , 0. , ..., 0. , 0. , 0. ],
        [0. , 0.1, 0.8, ..., 0. , 0. , 0. ],
        [0. , 0. , 0.1, ..., 0. , 0. , 0. ],
        ...,
        [0. , 0. , 0. , ..., 0.1, 0.8, 0. ],
        [0. , 0. , 0. , ..., 0. , 0.1, 0.8],
        [0. , 0. , 0. , ..., 0. , 0. , 0.9]]),
 array([[0.2, 0. , 0. , ..., 0. , 0. , 0. ],
        [0.1, 0. , 0.1, ..., 0. , 0. , 0. ],
        [0. , 0. , 0.1, ..., 0. , 0. , 0. ],
        ..

In [6]:
T[1]

array([[0.9, 0. , 0. , ..., 0. , 0. , 0. ],
       [0.8, 0.1, 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0.9, ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0.9, 0. , 0. ],
       [0. , 0. , 0. , ..., 0.8, 0.1, 0. ],
       [0. , 0. , 0. , ..., 0. , 0.8, 0.1]])

In [8]:
T[1].shape

(49, 49)

In [12]:
T[0][0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [13]:
T[0][1]

array([0.1, 0.8, 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])

In [14]:
T[0][2]

array([0. , 0. , 0.9, 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])

In [15]:
T[0][3]

array([0. , 0. , 0.1, 0.8, 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])