In [1]:
import numpy as np

In [2]:
# Reward table indexed as r_matrix[state, action]; an action is the index of
# the state being moved to.  Entries of -1 are filtered out by
# positiveRewards() and therefore never chosen as a move; 0 is an allowed
# move with no reward and 100 is the reward for a move into state 5.
r_matrix = np.matrix('''-1 -1 -1 -1 0 -1;
                        -1 -1 -1 0 -1 100; 
                        -1 -1 -1 0 -1 -1; 
                        -1 0 0 -1 0 -1; 
                        0 -1 -1 0 -1 100; 
                        -1 0 -1 -1 0 100''')

print('reward matrix shape: ', r_matrix.shape)
# Bare last expression so the notebook renders the matrix itself.
r_matrix

reward matrix shape:  (6, 6)


matrix([[ -1,  -1,  -1,  -1,   0,  -1],
        [ -1,  -1,  -1,   0,  -1, 100],
        [ -1,  -1,  -1,   0,  -1,  -1],
        [ -1,   0,   0,  -1,   0,  -1],
        [  0,  -1,  -1,   0,  -1, 100],
        [ -1,   0,  -1,  -1,   0, 100]])

In [3]:
# Sanity check: the reward table has one row and one column per state (6 x 6).
r_matrix.shape

(6, 6)

In [4]:
# Q-table with the same 6 x 6 layout as the reward table, initialised to zero.
# The builtin `int` is used as the dtype: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.  `np.int`
# was only ever an alias for `int`, so the resulting array is unchanged.
# NOTE: because the table is integer-typed, fractional Q-values written into
# it later are truncated (e.g. 51.2 is stored as 51).
q_matrix = np.zeros((6, 6), dtype=int)

print('q matrix shape: ', q_matrix.shape)
# Bare last expression so the notebook renders the table.
q_matrix

q matrix shape:  (6, 6)


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [5]:
# State the step-by-step walkthrough below starts from.
initial_state = 1

# Discount factor: weight given to the best future Q-value in each update.
gamma = 0.8

In [25]:
def positiveRewards(state):
    '''
    Locate the moves that are available from `state`.

    Row `state` of the global `r_matrix` is compared against -1 and the
    positions of every entry greater than -1 are returned, so rewards of 0
    are included as well as strictly positive ones.

    A row taken from an np.matrix is still 2-D (shape 1 x n), so every
    returned pair has the form [0, column]; the column (second element) is
    the action.  For state=1 the row is [-1, -1, -1, 0, -1, 100] and the
    result is [[0, 3], [0, 5]].
    '''
    is_allowed = r_matrix[state] > -1
    return np.argwhere(is_allowed)


indices = positiveRewards(initial_state)
print('indices: ', indices)

indices:  [[0 3]
 [0 5]]


In [26]:
# Pick one row of `indices` at random (np.random.choice with no weights is
# uniform).  Each row is [0, column], so element 1 is the action to take.
actionChosen = indices[np.random.choice(range(len(indices)))][1]
actionChosen

5

In [27]:
# Immediate reward for taking `actionChosen` from `initial_state`.
reward = r_matrix[initial_state, actionChosen]
reward

100

In [28]:
# The chosen action is also the state we land in, so look up which moves are
# available from there; these are the candidates for the "best future value".
arrayOfNextStates = positiveRewards(actionChosen)
arrayOfNextStates

array([[0, 1],
       [0, 4],
       [0, 5]], dtype=int32)

In [30]:
def qValuesForOtherState(actionChosen, arrayOfNextStates):
    '''
    Return the largest Q-value currently stored for the state `actionChosen`.

    `arrayOfNextStates` is the output of positiveRewards(actionChosen): rows
    of [0, column].  Only the columns listed there are looked up in row
    `actionChosen` of the global `q_matrix`, and the maximum is returned.
    '''
    return np.max([q_matrix[actionChosen, nxt] for nxt in arrayOfNextStates[:, 1]])


qMax = qValuesForOtherState(actionChosen, arrayOfNextStates)
qMax

0

## Do not run the next cell, or Part 2 will not work

In [31]:
# Q-learning update for the single move taken above:
#   Q[state, action] = reward + gamma * (best Q-value from the next state)
# q_matrix was created with an integer dtype, so a fractional right-hand side
# is truncated when it is stored.
q_matrix[initial_state, actionChosen] = reward + gamma*qMax
q_matrix

array([[  0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0, 100],
       [  0,   0,   0,   0,   0,   0],
       [  0,  80,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0]])

# Part 2

### This section is a condensed version of the procedure above

In [1]:
import numpy as np

In [2]:
# Reward table indexed as r_matrix[state, action]; an action is the index of
# the state being moved to.  Entries of -1 are filtered out by
# positiveRewards() and therefore never chosen as a move.
r_matrix = np.matrix('''-1 -1 -1 -1 0 -1;
                        -1 -1 -1 0 -1 100; 
                        -1 -1 -1 0 -1 -1; 
                        -1 0 0 -1 0 -1; 
                        0 -1 -1 0 -1 100; 
                        -1 0 -1 -1 0 100''')

# Q-table, same 6 x 6 layout, initialised to zero.  The builtin `int` replaces
# the `np.int` alias, which was deprecated in NumPy 1.20 and removed in 1.24
# (AttributeError); both name the same dtype, so the array is unchanged.
# NOTE: an integer table truncates fractional Q-values written into it.
q_matrix = np.zeros((6, 6), dtype=int)

def positiveRewards(state):
    '''
    Locate the moves that are available from `state`.

    Row `state` of the global `r_matrix` is compared against -1 and the
    positions of every entry greater than -1 are returned, so rewards of 0
    are included as well as strictly positive ones.

    A row taken from an np.matrix is still 2-D (shape 1 x n), so every
    returned pair has the form [0, column]; the column (second element) is
    the action.  For state=1 the row is [-1, -1, -1, 0, -1, 100] and the
    result is [[0, 3], [0, 5]].
    '''
    is_allowed = r_matrix[state] > -1
    return np.argwhere(is_allowed)

def qValuesForOtherState(actionChosen, arrayOfNextStates):
    '''
    Return the largest Q-value currently stored for the state `actionChosen`.

    `arrayOfNextStates` is the output of positiveRewards(actionChosen): rows
    of [0, column].  Only the columns listed there are looked up in row
    `actionChosen` of the global `q_matrix`, and the maximum is returned.
    '''
    return np.max([q_matrix[actionChosen, nxt] for nxt in arrayOfNextStates[:, 1]])

In [3]:
def q_update(initial_state, q_matrix):
    '''
    Take one random allowed move from `initial_state` and update the Q-table.

    The entry q_matrix[initial_state, actionChosen] is overwritten in place
    with  reward + gamma * (best Q-value available from the next state).
    `gamma` and `r_matrix` are read from the notebook's global scope.

    NOTE: the look-ahead value comes from qValuesForOtherState(), which reads
    the global `q_matrix` rather than the table passed in here.

    Returns the tuple
    (indices, actionChosen, reward, arrayOfNextStates, q_matrix).
    '''
    indices = positiveRewards(initial_state)

    # Each row of `indices` is [0, column]; the column is the action.
    pick = np.random.choice(range(len(indices)))
    actionChosen = indices[pick][1]

    arrayOfNextStates = positiveRewards(actionChosen)
    reward = r_matrix[initial_state, actionChosen]

    best_future = qValuesForOtherState(actionChosen, arrayOfNextStates)
    q_matrix[initial_state, actionChosen] = reward + gamma * best_future

    return indices, actionChosen, reward, arrayOfNextStates, q_matrix

In [5]:
# Default start state for single-step experiments with q_update().
initial_state = 1

# Discount factor: weight given to the best future Q-value in each update.
gamma = 0.8

In [7]:
def episode(initial_state):
    '''
    Run one training episode starting from `initial_state`.

    Random moves are taken with q_update(), each one updating the global
    `q_matrix`, until state 5 is reached.  Starting in state 5 performs no
    update at all.  Returns the global `q_matrix`.
    '''
    state = initial_state
    while state != 5:
        # q_update returns (indices, actionChosen, ...); the chosen action is
        # the state the next step starts from.
        state = q_update(state, q_matrix)[1]
    return q_matrix

In [12]:
# Train the Q-table: 500 episodes from each of the 6 possible start states.
# Moves are drawn with np.random and no seed is set in this notebook, so the
# sequence of updates differs from run to run.
for i in range(6):
    for _ in range(500):
        episode(i)

In [15]:
# Display the Q-table after training.
q_matrix

array([[  0,   0,   0,   0,  80,   0],
       [  0,   0,   0,  64,   0, 100],
       [  0,   0,   0,  64,   0,   0],
       [  0,  80,  51,   0,  80,   0],
       [ 64,   0,   0,  64,   0, 100],
       [  0,   0,   0,   0,   0,   0]])

In [45]:
def pathToReward(start_index, q_matrix_input, goal_state=5):
    '''
    Follow the greedy policy of a Q-table from `start_index` to the goal.

    At every step the action with the largest Q-value in the current state's
    row is taken (the first one when several tie) and becomes the next state.

    Parameters
    ----------
    start_index : int
        State the walk starts from.
    q_matrix_input : 2-D array-like (np.ndarray or np.matrix)
        Q-table indexed as [state, action].
    goal_state : int, optional
        State at which the walk stops.  Defaults to 5.

    Returns
    -------
    list of int
        The visited states, beginning with `start_index` and ending with
        `goal_state`.

    Raises
    ------
    ValueError
        If the greedy walk comes back to a state it has already visited.
        Such a walk can never reach the goal (this happens, for example,
        with an untrained all-zero table) and previously looped forever.
    '''
    # A row of an np.matrix stays 2-D, which would make the position lookup
    # below return the row index instead of the column; a plain ndarray view
    # gives 1-D rows for both input types.
    q_table = np.asarray(q_matrix_input)

    path = [start_index]
    visited = {start_index}
    while start_index != goal_state:
        # np.argmax returns the first position of the maximum.
        new_index = int(np.argmax(q_table[start_index]))
        if new_index in visited:
            raise ValueError(
                'greedy path from state %r revisits state %r and never '
                'reaches goal state %r' % (path[0], new_index, goal_state)
            )
        visited.add(new_index)
        path.append(new_index)
        start_index = new_index
    return path
    
# Greedy route from state 2 to the goal according to the trained Q-table.
pathToReward(2, q_matrix)

[2, 3, 1, 5]

#### This section is for testing the output

In [5]:
# indices, actionChosen, reward, arrayOfNextStates, q_matrix = q_update(initial_state, q_matrix)

# print('indices: ', indices)
# print('action chosen: ', actionChosen)
# print('reward: ', reward)
# print('array of indices: ', arrayOfNextStates)
# print('q_matrix: ', q_matrix)

indices:  [[0 1]
 [0 2]
 [0 4]]
action chosen:  1
reward:  0
array of indices:  [[0 3]
 [0 5]]
q_matrix:  [[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]
