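Note: this is going to be a space with no impassable walls, so that we have multiple opportunities to move the car (the "wall" cells defined below carry a large negative reward, but the agent can still move through them).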

In [1]:
import numpy as np
import random

from sklearn.externals import joblib

In the gridworld we have only 4 possible actions

In [2]:
# The four moves available to the agent; a move's list position is its integer code.
actions = ['up', 'down', 'left', 'right']
actionNum = list(range(len(actions)))

# up -> 0, down -> 1, left -> 2, right -> 3
actionNum

[0, 1, 2, 3]

Let us consider the gridworld, and let it be a 9 by 9 pixelated world.

In [3]:
# Grid dimensions: every (row, col) cell is one state, so there are
# num_rows * num_cols states in total.
num_rows = 9
num_cols = 9

# Largest valid row / column index.
row_max = num_rows - 1
col_max = num_cols - 1

# Q-learning hyper-parameters.
epsilon = 0.5  # probability of taking a random (exploratory) action
alpha = 0.3    # step size applied to each Q-value correction
gamma = 0.9    # weight given to the best Q-value of the next state

# Row-major list of all states: (0, 0), (0, 1), ..., (row_max, col_max).
states = [(i, j) for i in range(num_rows) for j in range(num_cols)]

len(states)

81

In [4]:
# Terminal cells: the reward sits in the bottom-right corner and the penalty
# cell is its diagonal neighbour towards the interior.
negState = (row_max - 1, col_max - 1)
posState = (row_max, col_max)

# "Wall" blocks occupy every (odd, odd) cell except (7, 7), which is the
# negative terminal cell on the 9x9 grid.
wallStateMatrix = [
    (r, c)
    for r in (1, 3, 5, 7)
    for c in (1, 3, 5, 7)
    if (r, c) != (7, 7)
]

# Episode start states: a copy of all states minus terminals and walls.
# list.remove raises ValueError if a cell is not on the grid.
statesAvailable = list(states)
for blocked in [negState, posState] + wallStateMatrix:
    statesAvailable.remove(blocked)

print(wallStateMatrix, negState, posState, len(statesAvailable))

[(1, 1), (1, 3), (1, 5), (1, 7), (3, 1), (3, 3), (3, 5), (3, 7), (5, 1), (5, 3), (5, 5), (5, 7), (7, 1), (7, 3), (7, 5)] (7, 7) (8, 8) 64


In [5]:
# Reward table indexed by (row, col): the reward collected on ENTERING a cell.
# Ordinary cells give 0, wall cells -150, the negative terminal -100 and the
# positive terminal +100.
# `dtype=int` replaces the `np.int` alias, which was deprecated in NumPy 1.20
# and removed in 1.24 (it was always just the builtin int).
r_matrix = np.zeros((num_rows, num_cols), dtype=int)
for wallState in wallStateMatrix:
    r_matrix[wallState] = -150

# Terminal cells are written last.
r_matrix[posState] = 100
r_matrix[negState] = -100

r_matrix

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, -150,    0, -150,    0, -150,    0, -150,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, -150,    0, -150,    0, -150,    0, -150,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, -150,    0, -150,    0, -150,    0, -150,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0, -150,    0, -150,    0, -150,    0, -100,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,  100]])

In [6]:
def actionsAvailable(state, row_max, col_max):
    """Report which of the four moves keep the agent on the grid.

    Parameters
    ----------
    state : (row, col) tuple of the current cell.
    row_max, col_max : largest valid row / column index.

    Returns
    -------
    list of four ints ordered [up, down, left, right]; an entry is 0 when the
    move stays inside the grid and -1 when it would step off the edge.
    """
    row, col = state
    stays_on_grid = (
        row > 0,        # up
        row < row_max,  # down
        col > 0,        # left
        col < col_max,  # right
    )
    return [0 if ok else -1 for ok in stays_on_grid]

# Example: a cell in column 0 cannot move left, so the third entry is -1.
actionsAvailable((5,0),num_rows-1,num_cols-1)

[0, 0, -1, 0]

Here is the equation that will be used:

$$Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left[ R_{t+1} + \gamma \max_{a} Q(S_{t+1}, a) - Q(S_t, A_t) \right]$$

- Some things to be cognizant of:
1. $R_{t+1}$ is the reward for the state that you are going to.


In [7]:
def initializeQMatrix(states, row_max, col_max, dtype=int):
    """Build the initial Q-table: one row per state, one column per action.

    Each row is filled with the output of ``actionsAvailable``: 0 for moves
    that stay on the grid and -1 for moves that would leave it. The -1 entries
    act as "forbidden" markers for the rest of the notebook.

    Parameters
    ----------
    states : iterable of (row, col) tuples covering the grid.
    row_max, col_max : largest valid row / column index.
    dtype : NumPy dtype of the table. Defaults to ``int``, which matches the
        original behaviour. Note that with an integer table NumPy truncates
        any fractional value later written into it; pass ``float`` to keep
        fractional Q-values.

    Returns
    -------
    ndarray of shape (len(states), len(actionNum)); the row for state
    (row, col) is ``(col_max + 1) * row + col``.
    """
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    q_matrix = np.zeros((len(states), len(actionNum)), dtype=dtype)
    for state in states:
        qRowNum = int((col_max + 1)*state[0] + state[1])
        q_matrix[qRowNum] = actionsAvailable(state, row_max, col_max)

    return q_matrix
    
def getIndices(q):
    """Return a uniformly random action index among the entries of ``q`` that are not -1.

    ``q`` is one row of the Q-table (four action values); -1 marks a move
    that is not allowed from that state.
    """
    permitted = [position for position, value in enumerate(q) if value != -1]
    return np.random.choice(permitted)

def choose_action(state, row_max, col_max):
    '''
    Pick an action index for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` (module-level) a random permitted action is
    returned via ``getIndices``. Otherwise one of the permitted actions with
    the highest Q-value is returned, ties broken uniformly at random.

    Entries equal to -1 mark moves that would leave the grid. They are
    excluded from the greedy maximum as well: without that, a state whose
    permitted Q-values are all below -1 would "prefer" a forbidden move and
    send the agent off the grid.

    Parameters
    ----------
    state : (row, col) tuple of the current cell.
    row_max : unused; kept so existing callers keep working.
    col_max : largest column index; ``col_max + 1`` is the row stride of
        the module-level ``q_matrix``.

    Returns
    -------
    Integer action index (0=up, 1=down, 2=left, 3=right).
    '''
    qRowNum = int((col_max + 1)*state[0] + state[1])  # row of q_matrix for this state
    q = np.asarray(q_matrix[qRowNum])                 # one state - four action values

    # exploration: any direction that is not -1 (out of grid bounds)
    if random.random() < epsilon:
        return getIndices(q)

    # exploitation: best Q-value among the permitted directions only
    allowed = np.argwhere(q != -1).flatten()
    qMax = q[allowed].max()
    return np.random.choice(allowed[q[allowed] == qMax])

def stateTo(state, actionTaken):
    """Return the (row, col) reached by applying ``actionTaken`` to ``state``.

    Action codes: 0=up (row - 1), 1=down (row + 1), 2=left (col - 1),
    3=right (col + 1). Any other code leaves the position unchanged.
    No bounds check is done here.
    """
    offsets = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
    row, col = state
    d_row, d_col = offsets.get(actionTaken, (0, 0))
    return row + d_row, col + d_col

def getReward(state):
    """Look up the reward stored in the module-level ``r_matrix`` for cell ``state``."""
    row, col = state[0], state[1]
    return r_matrix[row, col]

# def updateQMatrix(state, actionChosen, row_max, reward):
#     qRowNum = int((16/(row_max + 1))*state[0] + state[1])
#     q_matrix[qRowNum, actionChosen] = reward
#     return q_matrix

def updateQMatrix(state, actionChosen, stateSentTo, col_max, reward, alpha, gamma):
    """Apply one Q-learning update to the module-level ``q_matrix`` in place.

    Implements
        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
    where the max runs only over entries of the next state's row that are not
    -1 (the marker for moves that leave the grid).

    Parameters
    ----------
    state : (row, col) the agent moved from.
    actionChosen : action index taken in ``state``.
    stateSentTo : (row, col) the agent arrived in.
    col_max : largest column index; ``col_max + 1`` is the row stride of ``q_matrix``.
    reward : reward received for entering ``stateSentTo``.
    alpha, gamma : step size and discount used in the update above.

    Returns
    -------
    The same ``q_matrix`` object, after the update.
    """
    # current estimate for the (state, action) pair being corrected
    qRowNum = int((col_max + 1)*state[0] + state[1])
    old_q = q_matrix[qRowNum, actionChosen]

    # best value reachable from the next state, ignoring forbidden (-1) moves.
    # The previous version drew a random index among the arg-maxes and read its
    # value back, which always equals this maximum, so the draw is dropped.
    qRowNum_latest_state = int((col_max + 1)*stateSentTo[0] + stateSentTo[1])
    next_q = np.asarray(q_matrix[qRowNum_latest_state])
    latest_q = next_q[next_q != -1].max()

    q_matrix[qRowNum, actionChosen] = old_q + alpha*(reward + (gamma * latest_q) - old_q)

    return q_matrix

In [8]:
# Build the initial Q-table: 0 for moves that stay on the grid, -1 for moves that leave it.
q_matrix = initializeQMatrix(states, row_max, col_max)

def process_qMatrix(state):
    """Run one episode starting at ``state`` and learn from every step.

    The agent repeatedly picks an action with ``choose_action``, moves with
    ``stateTo``, reads the reward of the cell it entered and updates the
    Q-table, until it reaches ``posState`` or ``negState``. If ``state`` is
    already one of those two cells nothing happens. Returns None; the
    learning happens through ``updateQMatrix``.
    """
    terminalStates = (posState, negState)

    while state not in terminalStates:
        actionChosen = choose_action(state, row_max, col_max)
        stateSentTo = stateTo(state, actionChosen)

        # reward belongs to the cell we are moving into
        updateQMatrix(state, actionChosen, stateSentTo, col_max,
                      getReward(stateSentTo), alpha, gamma)

        state = stateSentTo

In [9]:
# Seed both random sources used during training: `random` drives the
# explore/exploit coin flip in choose_action, `np.random` drives the start
# state and the tie-breaks. This makes a fresh "Run All" reproduce the same table.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

NUM_EPISODES = 500000  # number of training episodes

# Start from a clean table so re-running this cell does not continue from a
# previously trained one.
q_matrix = initializeQMatrix(states, row_max, col_max)
for _ in range(NUM_EPISODES):
    # each episode starts from a random non-terminal, non-wall cell
    stateIndex = np.random.choice(len(statesAvailable))
    process_qMatrix(statesAvailable[stateIndex])

In [10]:
def makeMoveMatrix(q_matrix):
    """Turn a Q-table into a grid of best-move symbols.

    For every cell the permitted action with the highest Q-value is shown as
    'U', 'D', 'L' or 'R' (first one wins on ties). Entries equal to -1 mark
    moves that leave the grid and are never reported as the best move. Wall
    cells are then overwritten with 'X', the positive terminal with '+' and
    the negative terminal with '-'.

    Parameters
    ----------
    q_matrix : array with one row per state in row-major order (row index
        ``(col_max + 1) * row + col``) and four columns ordered
        up, down, left, right. It is not modified.

    Returns
    -------
    ndarray of single-character strings with shape (row_max + 1, col_max + 1).
    Grid size and special cells come from the module-level ``row_max``,
    ``col_max``, ``wallStateMatrix``, ``posState`` and ``negState``.
    """
    actionAbbr = np.array(['U', 'D', 'L', 'R'])
    n_rows, n_cols = row_max + 1, col_max + 1

    # Work on a float copy so the forbidden markers can be pushed to -inf
    # without touching the caller's table.
    q = np.array(q_matrix, dtype=float)[:n_rows * n_cols]
    q[q == -1] = -np.inf

    # The table is laid out with a stride of n_cols per grid row, so a
    # row-major reshape restores the grid (also for non-square grids).
    bestAction = q.argmax(axis=1)
    moveMatrix = actionAbbr[bestAction].reshape(n_rows, n_cols)

    # putting in the wall, pos and neg states
    for wallState in wallStateMatrix:
        moveMatrix[wallState] = 'X'
    moveMatrix[posState] = '+'
    moveMatrix[negState] = '-'

    return moveMatrix
        

# Render the Q-table trained above as a grid of best-move symbols (U/D/L/R),
# with X for wall cells, + for the positive terminal and - for the negative one.
moveMatrix = makeMoveMatrix(q_matrix)

print('moveMatrix: \n', moveMatrix)

moveMatrix: 
 [['D' 'R' 'D' 'R' 'D' 'R' 'D' 'R' 'D']
 ['D' 'X' 'D' 'X' 'D' 'X' 'D' 'X' 'D']
 ['D' 'R' 'D' 'R' 'D' 'R' 'D' 'R' 'D']
 ['D' 'X' 'D' 'X' 'D' 'X' 'D' 'X' 'D']
 ['D' 'R' 'D' 'R' 'D' 'R' 'D' 'R' 'D']
 ['D' 'X' 'D' 'X' 'D' 'X' 'D' 'X' 'D']
 ['D' 'R' 'D' 'R' 'D' 'R' 'D' 'R' 'D']
 ['D' 'X' 'D' 'X' 'D' 'X' 'D' '-' 'D']
 ['R' 'R' 'R' 'R' 'R' 'R' 'R' 'R' '+']]


In [11]:
# Inspect the trained table: one row per state (row-major), columns are up, down, left, right.
q_matrix

array([[  -1,    0,   -1,    0],
       [  -1, -146,    0,    1],
       [  -1,    4,    0,    4],
       [  -1, -139,    1,    8],
       [  -1,   12,    4,   12],
       [  -1, -131,    8,   16],
       [  -1,   21,   12,   21],
       [  -1, -121,   16,   26],
       [  -1,   32,   21,   -1],
       [   0,    1,   -1, -146],
       [   0,    4,    0,    4],
       [   1,    8, -146, -139],
       [   4,   12,    4,   12],
       [   8,   16, -139, -131],
       [  12,   21,   12,   21],
       [  16,   26, -131, -121],
       [  21,   32,   21,   32],
       [  26,   39, -121,   -1],
       [   0,    4,   -1,    4],
       [-146, -139,    1,    8],
       [   4,   12,    4,   12],
       [-139, -131,    8,   16],
       [  12,   21,   12,   21],
       [-131, -121,   16,   26],
       [  21,   32,   21,   32],
       [-121, -108,   26,   39],
       [  32,   46,   32,   -1],
       [   1,    8,   -1, -139],
       [   4,   12,    4,   12],
       [   8,   16, -139, -131],
       [  

## ============== Don't Touch ================

In [12]:
# Persist the trained Q-table to a file in the current working directory.
joblib.dump(q_matrix, 'qMatrix_9by9_manhattan_blocks.pkl')

['qMatrix_9by9_manhattan_blocks.pkl',
 'qMatrix_9by9_manhattan_blocks.pkl_01.npy']

In [13]:
# Read the saved Q-table back to check the round trip.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
q_matrix_load = joblib.load('qMatrix_9by9_manhattan_blocks.pkl')

In [14]:
# Sanity check: the reloaded table should have one row per grid state.
len(q_matrix_load)

81

In [15]:
def makeMoveMatrix(q_matrix):
    """Turn a Q-table into a grid of best-move symbols.

    For every cell the permitted action with the highest Q-value is shown as
    'U', 'D', 'L' or 'R' (first one wins on ties). Entries equal to -1 mark
    moves that leave the grid and are never reported as the best move. Wall
    cells are then overwritten with 'X', the positive terminal with '+' and
    the negative terminal with '-'.

    Parameters
    ----------
    q_matrix : array with one row per state in row-major order (row index
        ``(col_max + 1) * row + col``) and four columns ordered
        up, down, left, right. It is not modified.

    Returns
    -------
    ndarray of single-character strings with shape (row_max + 1, col_max + 1).
    Grid size and special cells come from the module-level ``row_max``,
    ``col_max``, ``wallStateMatrix``, ``posState`` and ``negState``.
    """
    actionAbbr = np.array(['U', 'D', 'L', 'R'])
    n_rows, n_cols = row_max + 1, col_max + 1

    # Work on a float copy so the forbidden markers can be pushed to -inf
    # without touching the caller's table.
    q = np.array(q_matrix, dtype=float)[:n_rows * n_cols]
    q[q == -1] = -np.inf

    # The table is laid out with a stride of n_cols per grid row, so a
    # row-major reshape restores the grid (also for non-square grids).
    bestAction = q.argmax(axis=1)
    moveMatrix = actionAbbr[bestAction].reshape(n_rows, n_cols)

    # putting in the wall, pos and neg states
    for wallState in wallStateMatrix:
        moveMatrix[wallState] = 'X'
    moveMatrix[posState] = '+'
    moveMatrix[negState] = '-'

    return moveMatrix

In [37]:
# Render the best-move grid from the reloaded Q-table.
# NOTE(review): the saved output of this cell shows a 7x7 grid and an
# out-of-sequence execution count, so it looks stale -- re-run to refresh.
makeMoveMatrix(q_matrix_load)

array([['D', 'D', 'R', 'R', 'D', 'D', 'X'],
       ['R', 'D', 'U', 'X', 'R', 'D', 'D'],
       ['X', 'D', 'X', 'D', 'X', 'D', 'L'],
       ['R', 'R', 'R', 'D', 'R', 'D', 'X'],
       ['U', 'U', 'X', 'D', 'X', 'D', 'L'],
       ['U', 'U', 'X', 'D', 'X', 'D', '-'],
       ['X', 'U', 'X', 'R', 'R', 'R', '+']], 
      dtype='<U1')

In [16]:
# Bundle the special cells in a fixed order:
# [positive terminal, negative terminal, list of wall cells].
posNegWallState = [posState, negState, wallStateMatrix]
posNegWallState

[(8, 8),
 (7, 7),
 [(1, 1),
  (1, 3),
  (1, 5),
  (1, 7),
  (3, 1),
  (3, 3),
  (3, 5),
  (3, 7),
  (5, 1),
  (5, 3),
  (5, 5),
  (5, 7),
  (7, 1),
  (7, 3),
  (7, 5)]]

In [17]:
# Persist the special-cell layout to a file in the current working directory.
joblib.dump(posNegWallState, 'posNegWallState_9by9_manhattan_block.pkl')

['posNegWallState_9by9_manhattan_block.pkl']

In [18]:
# Read the saved layout back to check the round trip.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
posNegWall_load = joblib.load('posNegWallState_9by9_manhattan_block.pkl')

In [19]:
# Display the reloaded layout: [positive terminal, negative terminal, wall cells].
posNegWall_load

[(8, 8),
 (7, 7),
 [(1, 1),
  (1, 3),
  (1, 5),
  (1, 7),
  (3, 1),
  (3, 3),
  (3, 5),
  (3, 7),
  (5, 1),
  (5, 3),
  (5, 5),
  (5, 7),
  (7, 1),
  (7, 3),
  (7, 5)]]