In this iteration we calculate each new Q-value from the previous one, using
the previous Q-value, the learning rate (alpha) and the discount factor (gamma).

In [1]:
import numpy as np
import random

from sklearn.externals import joblib

In the gridworld we have only 4 possible actions

In [2]:
# The four moves available in the gridworld, in a fixed order.
actions = ['up', 'down', 'left', 'right']

# Integer code for each move, taken from its position in `actions`:
# up=0, down=1, left=2, right=3.
actionNum = list(range(len(actions)))

actionNum

[0, 1, 2, 3]

Let us consider the gridworld, and let it be a 5 by 5 pixelated world.

In [3]:
# Every cell of the 5 x 5 grid as a (row, col) tuple, listed row by row:
# 25 states in total.
states = [(row, col) for row in range(5) for col in range(5)]

len(states)

25

In [4]:
# 
# Work on a copy so the full `states` list is left untouched.
statesAvailable = list(states)

# Fixed positions of the three special cells.
# (Alternatives tried earlier: random picks from `states`, and
#  wall (2,1) / positive (3,3) / negative (2,3).)
wallState, posState, negState = (3, 2), (4, 4), (3, 4)

# Remove the special cells from the copy; 22 ordinary cells remain.
for special in (wallState, posState, negState):
    statesAvailable.remove(special)

print(wallState, posState, negState, len(statesAvailable))

(3, 2) (4, 4) (3, 4) 22


In [5]:
# With the wall, positive and negative states defined, build the reward
# matrix: one entry per grid cell, indexed as r_matrix[row, col].
# Every cell is worth 0 except the three special cells set below.
# The builtin `int` is used as the dtype because the `np.int` alias is not
# available in newer NumPy releases.
r_matrix = np.zeros((5, 5), dtype=int)
r_matrix[wallState] = -50
r_matrix[posState] = 100
r_matrix[negState] = -100

r_matrix

array([[   0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0],
       [   0,    0,  -50,    0, -100],
       [   0,    0,    0,    0,  100]])

In [6]:
# given coordinate and grid size it will tell if [up,down,left,right]
# is available
def actionsAvailable(state, row_max, col_max):
    """Report which of the four moves are possible from `state`.

    `state` is a (row, col) pair; `row_max` and `col_max` are the largest
    row and column indices of the grid.

    Returns a list ordered [up, down, left, right] holding 0 for a move that
    is possible and -1 for a move that is not.
    """
    row, col = state
    possible = (
        row > 0,          # up
        row < row_max,    # down
        col > 0,          # left
        col < col_max,    # right
    )
    return [0 if ok else -1 for ok in possible]

# Probe with a row index (5) larger than row_max (4) in column 0:
# 'down' and 'left' are reported as unavailable (-1), 'up' and 'right' as available (0).
actionsAvailable((5,0),4,4)

[0, -1, -1, 0]

Here is the equation that will be used:

$$Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left[ R_{t+1} + \gamma \max_{a} Q(S_{t+1}, a) - Q(S_t, A_t) \right]$$

- Some things to be cognizant of:
1. $R_{t+1}$ is the reward for the state that you are moving into.


In [11]:
# we are going to update q_matrix by putting -1 where we can't go
def initializeQMatrix(states, row_max, col_max):
    """Build the starting Q-table for the grid.

    The table has one row per entry in `states` and one column per action in
    the notebook-level `actionNum` list. The row for state (row, col) is
    ``row * (col_max + 1) + col``. Each row is filled with the result of
    `actionsAvailable`: 0 for a move that is possible and -1 for one that
    is not.

    Parameters
    ----------
    states : list of (row, col) tuples
        All grid cells; expected to cover rows 0..row_max and cols 0..col_max.
    row_max, col_max : int
        Largest valid row and column index of the grid.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (len(states), len(actionNum)).
    """
    # Derive the row width from col_max instead of a hard-coded 25 / (4 + 1),
    # so the function honours the grid size it is given.
    n_cols = col_max + 1

    # Builtin `int` keeps the integer dtype of the original `np.int` alias,
    # which newer NumPy releases no longer provide.
    # NOTE: because the table is integer-typed, fractional values written
    # into it later are truncated to whole numbers.
    q_matrix = np.zeros((len(states), len(actionNum)), dtype=int)

    for state in states:
        row, col = state[0], state[1]
        q_matrix[row * n_cols + col] = actionsAvailable(state, row_max, col_max)

    return q_matrix
    
def choose_action(state, row_max, col_max, q_table=None):
    """Pick the highest-valued action for `state`, breaking ties at random.

    Steps:
    1. Map `state` = (row, col) to its Q-table row: row * (col_max + 1) + col.
    2. Read that row; its columns are the actions up, down, left, right.
    3. Find the largest value in the row.
    4. Collect every column index holding that value.
    5. Return one of those indices chosen uniformly at random.

    Parameters
    ----------
    state : (row, col) tuple
    row_max : int
        Largest valid row index. Not needed for the lookup; kept so existing
        calls keep working.
    col_max : int
        Largest valid column index; fixes how many states each grid row has.
    q_table : numpy.ndarray, optional
        Table to read from. Defaults to the notebook-level `q_matrix`.

    Returns
    -------
    Integer action index (a NumPy integer scalar).

    NOTE: entries of -1 mark moves that are not possible. They take part in
    the max like any other entry, so such a move can be returned if every
    other entry in the row is below -1.
    """
    if q_table is None:
        q_table = q_matrix  # fall back to the notebook-level table

    # Row width comes from col_max rather than a hard-coded 25 states.
    row_index = state[0] * (col_max + 1) + state[1]
    action_values = np.asarray(q_table[row_index])

    best_actions = np.flatnonzero(action_values == action_values.max())
    return np.random.choice(best_actions)

# now that we know the action to take (from choose_action)
# get state where action leads us
def stateTo(state, actionTaken):
    """Return the (row, col) reached by taking `actionTaken` from `state`.

    Action codes: 0 = up (row - 1), 1 = down (row + 1),
    2 = left (col - 1), 3 = right (col + 1).
    Any other code leaves the position unchanged. No bounds check is done.
    """
    row, col = state
    # (action code, row change, col change)
    moves = ((0, -1, 0), (1, 1, 0), (2, 0, -1), (3, 0, 1))
    for code, d_row, d_col in moves:
        if actionTaken == code:
            return row + d_row, col + d_col
    return row, col

# Now get reward in this state
def getReward(state):
    """Look up the reward stored in the notebook-level `r_matrix` for the cell `state`."""
    row, col = state[0], state[1]
    return r_matrix[row, col]

# def updateQMatrix(state, actionChosen, row_max, reward):
#     qRowNum = int((16/(row_max + 1))*state[0] + state[1])
#     q_matrix[qRowNum, actionChosen] = reward
#     return q_matrix

def updateQMatrix(state, actionChosen, stateSentTo, row_max, reward, alpha, gamma, q_table=None):
    """Apply one Q-learning update for the move state -> stateSentTo.

    Update rule:
        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))

    Parameters
    ----------
    state : (row, col) tuple
        Cell the action was taken from.
    actionChosen : int
        Column index of the action taken.
    stateSentTo : (row, col) tuple
        Cell the action leads to.
    row_max : int
        Largest valid row index of the grid.
    reward : number
        Reward received for entering `stateSentTo`.
    alpha, gamma : float
        Learning rate and discount factor.
    q_table : numpy.ndarray, optional
        Table to update. Defaults to the notebook-level `q_matrix`.

    Returns
    -------
    The same table object, modified in place.

    NOTE: if the table has an integer dtype, the new value is truncated to a
    whole number when it is stored.
    """
    if q_table is None:
        q_table = q_matrix  # fall back to the notebook-level table

    # States per grid row, taken from the table size instead of a
    # hard-coded 25.
    n_cols = len(q_table) // (row_max + 1)
    current_row = state[0] * n_cols + state[1]
    next_row = stateSentTo[0] * n_cols + stateSentTo[1]

    old_q = q_table[current_row, actionChosen]

    # Only the largest value in the next state's row matters; which of
    # several tied actions holds it makes no difference, so no random
    # draw is needed here.
    best_next_q = np.max(q_table[next_row])

    q_table[current_row, actionChosen] = old_q + alpha * (reward + gamma * best_next_q - old_q)
    return q_table

In [12]:
# Largest valid row / column index of the 5 x 5 grid.
row_max = 4
col_max = 4

# Learning hyper-parameters.
epsilon = 0.1   # defined here, but not referenced by any function visible in this notebook
alpha = 0.2     # learning rate: weight given to each new estimate
gamma = 0.9     # discount factor applied to the next state's best value

# Seed NumPy's global generator so that the random start states and random
# tie-breaks repeat when the notebook is re-run from a fresh kernel.
SEED = 42
np.random.seed(SEED)

q_matrix = initializeQMatrix(states, row_max, col_max)

def process_qMatrix(state):
    """Run one episode from `state`, updating the notebook-level Q-table at each step.

    The episode ends as soon as the current state equals `posState` or
    `negState`. At each step the action comes from `choose_action`, the
    resulting state from `stateTo`, its reward from `getReward`, and the
    table is updated with `updateQMatrix` using the notebook-level
    `row_max`, `alpha` and `gamma`. A trace of every step is printed.
    """
    terminal_states = (posState, negState)

    while state not in terminal_states:
        action = choose_action(state, row_max, col_max)
        next_state = stateTo(state, action)
        reward = getReward(next_state)
        updateQMatrix(state, action, next_state, row_max, reward, alpha, gamma)

        # Step-by-step trace of the episode.
        print('start state: ', state)
        print('action taken: ', action)
        print('Action takes us to this state: ', next_state)
        print('Reward at state we are in: ', reward)
        print('Q matrix: ', q_matrix)

        state = next_state

In [13]:
# Run 200 episodes, each starting from a cell drawn at random from
# `statesAvailable`.
for _ in range(200):
    start_state = statesAvailable[np.random.choice(len(statesAvailable))]
    process_qMatrix(start_state)

start state:  (1, 4)
action taken:  2
Action takes us to this state:  (1, 3)
Reward at state we are in:  0
Q matrix:  [[-1  0 -1  0]
 [-1  0  0  0]
 [-1  0  0  0]
 [-1  0  0  0]
 [-1  0  0 -1]
 [ 0  0 -1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0 -1]
 [ 0  0 -1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0 -1]
 [ 0  0 -1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0 -1]
 [ 0 -1 -1  0]
 [ 0 -1  0  0]
 [ 0 -1  0  0]
 [ 0 -1  0  0]
 [ 0 -1  0 -1]]
start state:  (1, 3)
action taken:  0
Action takes us to this state:  (0, 3)
Reward at state we are in:  0
Q matrix:  [[-1  0 -1  0]
 [-1  0  0  0]
 [-1  0  0  0]
 [-1  0  0  0]
 [-1  0  0 -1]
 [ 0  0 -1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0 -1]
 [ 0  0 -1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0 -1]
 [ 0  0 -1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0 -1]
 [ 0 -1 -1  0]
 [ 0 -1  0  0]
 [ 0 -1  0  0]
 [ 0 -1  0  0]
 [ 0 -1  0 -1]]
start state:

In [10]:
# Show the Q-table: one row per state, columns ordered up, down, left, right.
q_matrix

array([[ -1,   0,  -1,  17],
       [ -1,  31,   0,   0],
       [ -1,  36,   0,   0],
       [ -1,  35,   0,   0],
       [ -1,  35,   0,  -1],
       [  0,   0,  -1,  14],
       [  0,   0,   0,  39],
       [  0,   0,   0,  48],
       [  0,  58,   0,   0],
       [  0,  48,   0,  -1],
       [  0,   0,  -1,  35],
       [  0,   0,   0,  48],
       [  0, -10,   0,  58],
       [  0,  69,   0,   0],
       [  0, -20,  58,  -1],
       [  8,   0,  -1,   0],
       [ 33,   0,   0, -10],
       [  0,   0,   0,   0],
       [  0,  82, -10, -20],
       [  0,   0,   0,  -1],
       [  0,  -1,  -1,  14],
       [  0,  -1,   0,  61],
       [-10,  -1,   0,  82],
       [  0,  -1,   0,  96],
       [  0,  -1,   0,  -1]])

In [56]:
# Save the Q-table to disk with joblib.
# NOTE(review): `joblib` is imported from `sklearn.externals` in this notebook; newer
# scikit-learn releases no longer ship that module — confirm the installed version.
joblib.dump(q_matrix, 'qMatrix.pkl')

['qMatrix.pkl', 'qMatrix.pkl_01.npy']

In [57]:
# Read the saved Q-table back from disk.
# joblib files are pickle-based, so only load files from a trusted source.
q_matrix_load = joblib.load('qMatrix.pkl')

In [58]:
# Inspect the reloaded table.
# NOTE(review): the recorded output below has 16 rows rather than 25 — it looks like it
# came from an earlier 4 x 4 run; re-run the notebook to refresh it.
q_matrix_load

array([[ -1,   0,  -1,  39],
       [ -1,  48,   0,   0],
       [ -1,  58,   0,   0],
       [ -1,  48,   0,  -1],
       [  0,  48,  -1,   0],
       [  0,   0,   0,  58],
       [  0,  69,   0,   0],
       [  0, -20,  58,  -1],
       [  0,  58,  -1,   0],
       [  0,   0,   0,   0],
       [  0,  82,   0, -20],
       [  0,   0,   0,  -1],
       [  0,  -1,  -1,  69],
       [  0,  -1,   0,  82],
       [  0,  -1,   0,  96],
       [  0,  -1,   0,  -1]])