In [None]:
updated from: http://mnemstudio.org/path-finding-q-learning-tutorial.htm

In [1]:
import numpy as np

In [2]:
# Reward table indexed as r_matrix[state, action]; an action is also the
# state it leads to. Entries of -1 are filtered out by positiveRewards,
# 0 is an allowed move without reward, and 100 is the reward in column 5.
# It stays an np.matrix because selecting a single row of a matrix is still
# 2-D, which is the shape the later np.argwhere call is written for.
r_matrix = np.matrix([
    [-1, -1, -1, -1,  0,  -1],
    [-1, -1, -1,  0, -1, 100],
    [-1, -1, -1,  0, -1,  -1],
    [-1,  0,  0, -1,  0,  -1],
    [ 0, -1, -1,  0, -1, 100],
    [-1,  0, -1, -1,  0, 100],
])

print('reward matrix shape: ', r_matrix.shape)
r_matrix

reward matrix shape:  (6, 6)


matrix([[ -1,  -1,  -1,  -1,   0,  -1],
        [ -1,  -1,  -1,   0,  -1, 100],
        [ -1,  -1,  -1,   0,  -1,  -1],
        [ -1,   0,   0,  -1,   0,  -1],
        [  0,  -1,  -1,   0,  -1, 100],
        [ -1,   0,  -1,  -1,   0, 100]])

In [3]:
# Sanity check: the table is indexed as r_matrix[state, action], so it should be square.
r_matrix.shape

(6, 6)

In [4]:
# Q-table with the same [state, action] layout as r_matrix, starting at zero.
# `np.int` was just an alias for the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
# NOTE: an integer table truncates fractional updates (0.8 * 64 is stored as 51).
q_matrix = np.zeros((6, 6), dtype=int)

print('q matrix shape: ', q_matrix.shape)
q_matrix

q matrix shape:  (6, 6)


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [5]:
gamma = 0.8  # discount factor applied to the best Q-value of the next state
initial_state = 1  # row of r_matrix that the walkthrough starts from

In [6]:
def positiveRewards(state):
    '''
    Find the moves that are allowed from `state`.

    A move counts as allowed when its entry in the global `r_matrix` is
    greater than -1, so zero-reward moves are returned as well.

    Returns the `np.argwhere` result for the selected row: one
    `[row, column]` pair per allowed move. With `r_matrix` stored as an
    `np.matrix` the selected row is still 2-D, so the first entry of every
    pair is 0 and the second entry is the action index.
    '''
    # e.g. state=1 selects matrix([[-1, -1, -1, 0, -1, 100]]),
    # whose allowed moves come back as [[0, 3], [0, 5]]
    allowedMask = r_matrix[state] > -1
    return np.argwhere(allowedMask)

# Moves allowed out of the starting state, as [row, column] pairs.
indices = positiveRewards(initial_state)
print('indices: ', indices)

indices:  [[0 3]
 [0 5]]


In [7]:
# Pick one allowed move uniformly at random; entry 1 of the chosen pair is the action.
actionChosen = indices[np.random.choice(range(len(indices)))][1]
actionChosen

3

In [8]:
# Immediate reward for taking the chosen action from the starting state.
reward = r_matrix[initial_state, actionChosen]
reward

0

In [28]:
# The chosen action doubles as the next state, so list the moves allowed from there.
arrayOfNextStates = positiveRewards(actionChosen)
arrayOfNextStates

array([[0, 1],
       [0, 4],
       [0, 5]], dtype=int32)

In [30]:
def qValuesForOtherState(actionChosen, arrayOfNextStates):
    '''
    Return the largest Q-value among the moves allowed from `actionChosen`.

    `arrayOfNextStates` is a 2-D array of `[row, column]` pairs such as the
    one returned by `positiveRewards`; only its second column (the action
    indices) is used. Values are read from the global `q_matrix`.
    '''
    allowedActions = arrayOfNextStates[:, 1]
    return np.max([q_matrix[actionChosen, nxt] for nxt in allowedActions])

# Best Q-value currently recorded for the state we are about to enter.
qMax = qValuesForOtherState(actionChosen, arrayOfNextStates)
qMax

0

## Do not run the next line, or section 2 will not work

In [31]:
# Q-learning update without a learning rate: Q(s, a) = R(s, a) + gamma * max Q(s', .).
# q_matrix has an integer dtype, so the float result is cast to an integer on assignment.
q_matrix[initial_state, actionChosen] = reward + gamma*qMax
q_matrix

array([[  0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0, 100],
       [  0,   0,   0,   0,   0,   0],
       [  0,  80,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0]])

# Part 2

### This section is a condensed version of the procedure above

In [1]:
import numpy as np

In [2]:
# Reward table indexed as r_matrix[state, action]; entries of -1 are the
# moves that positiveRewards filters out.
r_matrix = np.matrix('''-1 -1 -1 -1 0 -1;
                        -1 -1 -1 0 -1 100;
                        -1 -1 -1 0 -1 -1;
                        -1 0 0 -1 0 -1;
                        0 -1 -1 0 -1 100;
                        -1 0 -1 -1 0 100''')

# Q-table with the same layout, starting at zero. `np.int` was an alias for
# the builtin `int` that was deprecated in NumPy 1.20 and removed in 1.24,
# so the builtin is used directly.
q_matrix = np.zeros((6, 6), dtype=int)

def positiveRewards(state):
    '''
    Find the moves that are allowed from `state`.

    A move counts as allowed when its entry in the global `r_matrix` is
    greater than -1, so zero-reward moves are returned as well.

    Returns the `np.argwhere` result for the selected row: one
    `[row, column]` pair per allowed move. With `r_matrix` stored as an
    `np.matrix` the selected row is still 2-D, so the first entry of every
    pair is 0 and the second entry is the action index.
    '''
    # e.g. state=1 selects matrix([[-1, -1, -1, 0, -1, 100]]),
    # whose allowed moves come back as [[0, 3], [0, 5]]
    allowedMask = r_matrix[state] > -1
    return np.argwhere(allowedMask)

def qValuesForOtherState(actionChosen, arrayOfNextStates):
    '''
    Return the largest Q-value among the moves allowed from `actionChosen`.

    `arrayOfNextStates` is a 2-D array of `[row, column]` pairs such as the
    one returned by `positiveRewards`; only its second column (the action
    indices) is used. Values are read from the global `q_matrix`.
    '''
    allowedActions = arrayOfNextStates[:, 1]
    return np.max([q_matrix[actionChosen, nxt] for nxt in allowedActions])

In [3]:
def q_update(initial_state, q_matrix):
    '''
    Take one random allowed move from `initial_state` and update `q_matrix`.

    The update has no learning rate:
        Q[s, a] = R[s, a] + gamma * max(Q[a, allowed moves from a])
    where the chosen action `a` is also the next state. `r_matrix` and
    `gamma` are read from the notebook globals; `q_matrix` is modified in
    place.

    Returns
    -------
    tuple
        (indices, actionChosen, reward, arrayOfNextStates, q_matrix), i.e.
        the allowed moves from `initial_state`, the move taken, its
        immediate reward, the allowed moves from the next state, and the
        updated table.
    '''
    indices = positiveRewards(initial_state)
    actionChosen = indices[np.random.choice(range(len(indices)))][1]
    reward = r_matrix[initial_state, actionChosen]
    arrayOfNextStates = positiveRewards(actionChosen)
    # Take the look-ahead maximum from the table that was passed in. Going
    # through qValuesForOtherState would read the *global* q_matrix, so the
    # value read and the table written could belong to different arrays.
    qMax = np.max(q_matrix[actionChosen, arrayOfNextStates[:, 1]])
    q_matrix[initial_state, actionChosen] = reward + gamma * qMax
    return indices, actionChosen, reward, arrayOfNextStates, q_matrix

In [4]:
gamma = 0.8  # discount factor read by q_update
initial_state = 1  # default starting state for single-step experiments

In [5]:
def episode(initial_state):
    '''
    Walk randomly from `initial_state` until state 5 is reached, applying
    `q_update` to the global `q_matrix` at every step.

    Starting at state 5 performs no update. Returns the global `q_matrix`.
    '''
    state = initial_state
    while True:
        if state == 5:
            return q_matrix
        # q_update returns (indices, actionChosen, ...); the action taken
        # is the state the walk continues from.
        state = q_update(state, q_matrix)[1]

In [6]:
# Seed NumPy's global RNG so that Restart & Run All replays the same random
# walks; q_update draws its moves with np.random.choice.
np.random.seed(0)

# Train with 500 random-walk episodes from each of the 6 states.
for i in range(6):
    for _ in range(500):
        episode(i)

In [7]:
# Learned Q-table. Entries are whole numbers because q_matrix has an integer
# dtype: 0.8 * 64 = 51.2 is stored as 51.
q_matrix

array([[  0,   0,   0,   0,  80,   0],
       [  0,   0,   0,  64,   0, 100],
       [  0,   0,   0,  64,   0,   0],
       [  0,  80,  51,   0,  80,   0],
       [ 64,   0,   0,  64,   0, 100],
       [  0,   0,   0,   0,   0,   0]])

In [8]:
def pathToReward(start_index, q_matrix_input, goal_state=5):
    '''
    Follow the greedy policy in `q_matrix_input` from `start_index` to the goal.

    At every state the action with the largest Q-value is taken (the lowest
    index wins ties) and that action becomes the next state.

    Parameters
    ----------
    start_index : int
        State to start from.
    q_matrix_input : 2-D array-like
        Q-table indexed as [state, action].
    goal_state : int, optional
        State at which the walk stops. Defaults to 5.

    Returns
    -------
    list
        States visited in order, starting with `start_index` and ending
        with `goal_state`.

    Raises
    ------
    ValueError
        If the greedy walk comes back to a state it has already visited.
        The policy is deterministic, so such a walk would never reach the
        goal (this happens, for example, with an untrained all-zero table).
    '''
    # asarray makes each selected row 1-D even when an np.matrix is passed,
    # so argmax below always yields a column (action) index.
    qTable = np.asarray(q_matrix_input)
    path = [start_index]
    visited = {start_index}
    current = start_index
    while current != goal_state:
        # argmax returns the first maximum, matching argwhere(...)[0].
        current = int(np.argmax(qTable[current]))
        if current in visited:
            raise ValueError(
                'greedy path from state {} revisits state {} and never '
                'reaches goal state {}'.format(start_index, current, goal_state)
            )
        visited.add(current)
        path.append(current)
    return path
    
pathToReward(2, q_matrix)

[2, 3, 1, 5]

#### This section is for testing the output

In [5]:
# indices, actionChosen, reward, arrayOfNextStates, q_matrix = q_update(initial_state, q_matrix)

# print('indices: ', indices)
# print('action chosen: ', actionChosen)
# print('reward: ', reward)
# print('array of indices: ', arrayOfNextStates)
# print('q_matrix: ', q_matrix)

indices:  [[0 1]
 [0 2]
 [0 4]]
action chosen:  1
reward:  0
array of indices:  [[0 3]
 [0 5]]
q_matrix:  [[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


# Part 3
#### Defining a new class

In [9]:
import random


class QLearn:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are kept in a dict keyed by ``(state, action)``. Pairs that have
    never been written read back as 0.0.
    """

    def __init__(self, actions, epsilon=0.1, alpha=0.2, gamma=0.9):
        """Store the hyper-parameters and start with an empty Q-table.

        actions -- sequence of actions the agent can choose from
        epsilon -- probability of choosing a random action
        alpha   -- learning rate used by learnQ
        gamma   -- discount factor used by learn
        """
        self.actions = actions
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.q = {}

    def getQ(self, state, action):
        """Return the stored Q-value for the pair, or 0.0 if there is none."""
        key = (state, action)
        # (a default of 1.0 instead of 0.0 was left commented out in an earlier version)
        return self.q.get(key, 0.0)

    def learnQ(self, state, action, reward, value):
        """Move the Q-value of the pair towards `value`.

        The first write for a pair stores `reward` as-is; later writes apply
        ``old + alpha * (value - old)``.
        """
        key = (state, action)
        previous = self.q.get(key)
        if previous is None:
            self.q[key] = reward
            return
        self.q[key] = previous + self.alpha * (value - previous)

    def chooseAction(self, state):
        """Return a random action with probability epsilon, otherwise a
        greedy one; ties between greedy actions are broken at random."""
        if random.random() < self.epsilon:
            return random.choice(self.actions)

        values = [self.getQ(state, candidate) for candidate in self.actions]
        top = max(values)
        tied = [pos for pos, val in enumerate(values) if val == top]
        # Only draw from the RNG when there really is a tie.
        pick = random.choice(tied) if len(tied) > 1 else values.index(top)
        return self.actions[pick]

    def learn(self, state1, action1, reward, state2):
        """Update Q(state1, action1) after receiving `reward` and landing in
        `state2`, using ``reward + gamma * max_a Q(state2, a)`` as target."""
        bestNext = max([self.getQ(state2, candidate) for candidate in self.actions])
        target = reward + self.gamma * bestNext
        self.learnQ(state1, action1, reward, target)

import math
def ff(f,n):
    """Format the number `f` as a fixed-point string exactly `n` characters wide.

    `f` is rendered with six decimals ("{:f}"). A result shorter than `n` is
    padded with spaces on the right; a longer one is cut to its first `n`
    characters.
    """
    fs = "{:f}".format(f)
    if len(fs) < n:
        # Pass the width as a nested format field: `n` is an int, so building
        # the spec with "{:" + n + "s}" raised TypeError.
        return "{:{width}s}".format(fs, width=n)
    else:
        return fs[:n]

# Borrowing from the Q-learning post on studywolf.wordpress.com

In [1]:
import numpy as np

In the gridworld we have only 4 possible actions

In [2]:
# The four moves of the gridworld and their integer codes:
# up -> 0, down -> 1, left -> 2, right -> 3.
actions = ['up','down','left','right']
actionNum = list(range(len(actions)))

actionNum

[0, 1, 2, 3]

Let us consider the gridworld, and let it be a 4 by 4 pixelated world.

In [3]:
# The 16 grid cells as (row, col) tuples, listed row by row.
states = [(row, col) for row in range(4) for col in range(4)]

states

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 3)]

In [4]:
# Working copy of the grid cells; the three special cells are taken out
# below so that only the ordinary cells remain.
statesAvailable = list(states)

# Fixed layout. Random placement was tried earlier:
# wallState = states[np.random.randint(0,len(statesAvailable))]
# posState = states[np.random.randint(0,len(statesAvailable))]
# negState = states[np.random.randint(0,len(statesAvailable))]
wallState, posState, negState = (2,1), (3,3), (2,3)

statesAvailable.remove(wallState)
statesAvailable.remove(posState)
statesAvailable.remove(negState)

print(wallState, posState, negState, len(statesAvailable))

(2, 1) (3, 3) (2, 3) 13


In [5]:
# Reward grid for the 4x4 world: 0 everywhere except the wall (-1), the
# positive cell (100) and the negative cell (-100). Each special state is a
# (row, col) tuple, so it indexes a single cell of the array.
# `np.int` was an alias for the builtin `int` that was deprecated in NumPy
# 1.20 and removed in 1.24, so the builtin is used directly.
r_matrix = np.zeros((4, 4), dtype=int)
r_matrix[wallState] = -1
r_matrix[posState] = 100
r_matrix[negState] = -100

r_matrix

array([[   0,    0,    0,    0],
       [   0,    0,    0,    0],
       [   0,   -1,    0, -100],
       [   0,    0,    0,  100]])

In [6]:
# Starting cell of the car as (row, col).
carState = (0,0)

In [7]:
def actionsAvailable(state, row_max, col_max):
    '''
    Report which of [up, down, left, right] stay inside the grid.

    `state` is a (row, col) pair; `row_max` and `col_max` are the largest
    valid row and column indices. Returns a list of four flags in the order
    up, down, left, right: 0 when the move stays on the grid, -1 when it
    would leave it.
    '''
    row, col = state
    staysInside = (
        row > 0,        # up
        row < row_max,  # down
        col > 0,        # left
        col < col_max,  # right
    )
    return [0 if ok else -1 for ok in staysInside]

actionsAvailable((3,3),3,3)

[0, -1, 0, -1]

Here is the equation that will be used:

$$Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left[ R_{t+1} + \gamma \max_{a} Q(S_{t+1}, a) - Q(S_t, A_t) \right]$$

- Some things to be cognizant of:
1. $R_{t+1}$ is the reward for the state that you are going to


In [8]:
# Q-table with one row per grid cell and one column per action.
# `np.int` was an alias for the builtin `int` that was deprecated in NumPy
# 1.20 and removed in 1.24, so the builtin is used directly.
q_matrix = np.zeros((len(states), len(actionNum)), dtype=int)

array([[-1,  0, -1,  0],
       [-1,  0,  0,  0],
       [-1,  0,  0,  0],
       [-1,  0,  0, -1],
       [ 0,  0, -1,  0],
       [ 0,  0,  0,  0],
       [ 0,  0,  0,  0],
       [ 0,  0,  0, -1],
       [ 0,  0, -1,  0],
       [ 0,  0,  0,  0],
       [ 0,  0,  0,  0],
       [ 0,  0,  0, -1],
       [ 0, -1, -1,  0],
       [ 0, -1,  0,  0],
       [ 0, -1,  0,  0],
       [ 0, -1,  0, -1]])

In [77]:
# Largest valid row index of the 4x4 grid.
row_max = 3

# we are going to update q_matrix by putting -1 where we can't go
# NOTE(review): the literal 3s below repeat row_max and the column limit,
# and 16 is the number of cells; they only agree for this 4x4 grid.
for state in states:
    # Row-major position of the cell in q_matrix: (16 / 4) * row + col.
    qRowNum = int((16/(3 + 1))*state[0] + state[1])
    # Whole row at once: 0 for moves that stay on the grid, -1 otherwise.
    q_matrix[qRowNum] = actionsAvailable(state,3,3)
    
def choose_action(state, row_max, col_max):
    '''
    Pick a greedy action for `state`, breaking ties uniformly at random.

    1. map `state` = (row, col) to its row in the global `q_matrix`
    2. find the largest Q-value in that row
    3. collect every action index that attains it
    4. return one of those indices, chosen at random

    Parameters
    ----------
    state : (row, col) pair
    row_max : int
        Largest valid row index. Kept for signature compatibility; the
        row-major offset only depends on the number of columns.
    col_max : int
        Largest valid column index; each grid row holds col_max + 1 cells.

    Returns
    -------
    Index into `q_matrix`'s action axis.
    '''
    # Row-major offset: each grid row holds col_max + 1 cells. This replaces
    # the hard-coded 16 / (row_max + 1), which is only correct for the
    # square 16-cell grid.
    qRowNum = (col_max + 1) * state[0] + state[1]
    # Compare array against scalar directly instead of comparing a Python
    # list to a NumPy scalar.
    qRow = np.asarray(q_matrix[qRowNum])
    bestActions = np.argwhere(qRow == qRow.max()).flatten()
    return np.random.choice(bestActions)

# Translate an action index (as returned by choose_action) into the grid
# cell that the action leads to.
def stateTo(state, actionTaken):
    '''
    Return the (row, col) reached from `state` by `actionTaken`.

    Actions: 0 = up (row - 1), 1 = down (row + 1), 2 = left (col - 1),
    3 = right (col + 1). Any other value leaves the position unchanged.
    No bounds check is performed.
    '''
    offsets = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
    dRow, dCol = offsets.get(actionTaken, (0, 0))
    row, col = state
    return row + dRow, col + dCol

# Look up the reward of the cell we ended up in.
def getReward(state):
    '''
    Return the entry of the global `r_matrix` at `state` = (row, col).

    No bounds check is done here.
    NOTE(review): a coordinate of -1 (which stateTo can produce from an
    edge cell) would presumably wrap to the last row/column under NumPy
    indexing -- confirm callers never pass one.
    '''
    row, col = state[0], state[1]
    return r_matrix[row, col]

def updateQMatrix(state, actionChosen, row_max, reward):
    '''
    Overwrite the Q-value of (`state`, `actionChosen`) in the global
    `q_matrix` with `reward` and return the table.

    The row is located with `(16 / (row_max + 1)) * row + col`. The reward
    is stored as-is: no learning rate or discounted look-ahead is applied.
    '''
    row, col = state[0], state[1]
    cellsPerRow = 16 / (row_max + 1)
    qRowNum = int(cellsPerRow * row + col)
    q_matrix[qRowNum, actionChosen] = reward
    return q_matrix
    

# !!!!! STARTING HERE !!!!
# One hand-run step from cell (3, 2): choose an action, work out the cell it
# leads to, read that cell's reward, and write it into the Q-table.
state = (3,2)
actionChosen = choose_action(state,3,3)
stateSentTo = stateTo(state, actionChosen)
rewardInNewState = getReward(stateSentTo)
updateQMatrix(state, actionChosen, row_max, rewardInNewState)

# Report every intermediate value of the step.
print('start state: ', state)
print('action taken: ', actionChosen)
print('Action takes us to this state: ', stateSentTo)
print('Reward at state we are in: ', rewardInNewState)
print('Q matrix: ', q_matrix)

start state:  (3, 2)
action taken:  2
Action takes us to this state:  (3, 1)
Reward at state we are in:  0
Q matrix:  [[-1  0 -1  0]
 [-1  0  0  0]
 [-1  0  0  0]
 [-1  0  0 -1]
 [ 0  0 -1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0 -1]
 [ 0  0 -1  0]
 [ 0  0  0  0]
 [ 0  0  0  0]
 [ 0  0  0 -1]
 [ 0 -1 -1  0]
 [ 0 -1  0  0]
 [ 0 -1  0  0]
 [ 0 -1  0 -1]]
