Exploration into changing + and - states to make dynamic environment

1) first iteration produced:
- qMatrix_9by9_changing_rewards_v1.pkl
- posNegState_9by9_changing_rewards_v1



In [114]:
import numpy as np
import random

from sklearn.externals import joblib

In the gridworld we have only 4 possible actions

In [115]:
actions = ['up','down','left','right']
actionNum = [index for index, _ in enumerate(actions)]

# actionNum encodes: up, down, left and right as 0, 1, 2 and 3 respectively.
actionNum

[0, 1, 2, 3]

Let us consider the gridworld. And let it be a 4 by 4 pixaleted world

In [11]:
# our states will be 16 set of tuples 
num_rows = 9
num_cols = 9

row_max = num_rows - 1
col_max = num_cols - 1

epsilon = 0.5
alpha = 0.3
gamma = 0.9

states = []

for i in range(num_rows):
    for j in range(num_cols):
        states.append((i,j))
        
len(states)

81

In [12]:
statesAvailable = states[:]

negState = (4,7)
statesAvailable.pop(statesAvailable.index(negState))

posState = (4,1)
statesAvailable.pop(statesAvailable.index(posState))

# wallStateMatrix = []
# for i in [1,3,5,7]:
#     for j in [1,3,5,7]:
#         if (i,j) != (7,7):
#             wallState = (i,j)
#             statesAvailable.pop(statesAvailable.index(wallState))
#             wallStateMatrix.append(wallState)
            
#print(wallStateMatrix, negState, posState, len(statesAvailable))
print(posState, negState, len(statesAvailable))

(4, 1) (4, 7) 79


In [13]:
# now that we have wall, pos and negative states defined
# we are going to create and update the reward matrix
# -------------- just going to put pos and neg states
r_matrix = np.zeros((num_rows, num_cols), dtype=np.int)
# for wallState in wallStateMatrix:
#     r_matrix[wallState] = -150
    
r_matrix[posState] = 100
r_matrix[negState] = -100

r_matrix

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,  100,    0,    0,    0,    0,    0, -100,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [14]:
# given coordinate and grid size it will tell if [up,down,left,right]
# is available
def actionsAvailable(state, row_max, col_max):
    row, col = state
    # directions = [up,down,left,right]
    # initialize direction -1 implies cant move to stated direction
    directions = [-1,-1,-1,-1] 
    if row < row_max: directions[1] = 0
    if row > 0: directions[0] = 0
    if col < col_max: directions[3] = 0
    if col > 0: directions[2] = 0
    return directions

actionsAvailable((5,0),num_rows-1,num_cols-1)

[0, 0, -1, 0]

Here is the equation that will be used:

Q(S_t,A_t) = Q(S_t,A_t) + alpha X [R_t+1 + gamma X max{Q(S_t+1,A_t+1)}-Q(S_t,A_t)]

- some things to be congnicent off:
1. R_t+1 is the reward for the state that you are going to


In [15]:
# we are going to update q_matrix by putting -1 where we can't go
def initializeQMatrix(states, row_max, col_max):
    q_matrix = np.zeros((len(states),len(actionNum)), dtype=np.int)
    for state in states:
        qRowNum = int((col_max + 1)*state[0] + state[1])
        q_matrix[qRowNum] = actionsAvailable(state,row_max, col_max)
        
    return q_matrix
    
def getIndices(q):
    indexList = []
    for index,val in enumerate(q):
        if val != -1:
            indexList.append(index)
    chooseActionIndex = np.random.choice(indexList)
    return chooseActionIndex

def choose_action(state, row_max, col_max):
    '''
    1. given state get its row number [qRowNum]
    2. for qRowNum get all column items (remember columnms signify up,down,left, right in that order)
    3. find max value amongnst all column(action) elements [qMax]
    4. get indices in from q where column elements == qMax [indices]
    5. random choice on any element in 'indices'
    '''
    qRowNum = int((col_max + 1)*state[0] + state[1]) # get the row number of q-matrix
    q = list(q_matrix[qRowNum]) # one state - four action
    
    # exploration phase where we randomly choose any direction that is not -1 (out of grid bounds)
    if random.random() < epsilon:
        chooseActionIndex = getIndices(q)
    
    # exploitation phase, take existing rewards and do with max
    else:
        qMax = np.max(q)
        indices = np.argwhere(q == qMax)
        chooseActionIndex = np.random.choice(indices.flatten('F'))
    return chooseActionIndex

# now that we know the action to take (from choose_action)
# get state where action leads us
def stateTo(state, actionTaken):
    row, col = state
    if actionTaken == 0: row -= 1
    if actionTaken == 1: row += 1
    if actionTaken == 2: col -= 1
    if actionTaken == 3: col += 1
        
    return row,col

# Now get reward in this state
def getReward(state):
    return r_matrix[state[0], state[1]]

# def updateQMatrix(state, actionChosen, row_max, reward):
#     qRowNum = int((16/(row_max + 1))*state[0] + state[1])
#     q_matrix[qRowNum, actionChosen] = reward
#     return q_matrix

def updateQMatrix(state, actionChosen, stateSentTo, col_max, reward, alpha, gamma):
    
    # getting previous q value
    qRowNum = int((col_max + 1)*state[0] + state[1])
    old_q = q_matrix[qRowNum, actionChosen]
    
    # getting best action from new q value
    qRowNum_latest_state = int((col_max + 1)*stateSentTo[0] + stateSentTo[1])
    q = list(q_matrix[qRowNum_latest_state])
    
    #--- new section
    new_q = []
    for val in q:
        if val != -1:
            new_q.append(val)
    qMax = np.max(new_q)
    #--- new section
    
    indices = np.argwhere(q == qMax)
    chooseActionIndex = np.random.choice(indices.flatten('F'))
    
    latest_q = q_matrix[qRowNum_latest_state, chooseActionIndex]
    q_matrix[qRowNum,actionChosen] = old_q + alpha*(reward + (gamma * latest_q) - old_q)
    
    return q_matrix

In [16]:
q_matrix = initializeQMatrix(states, row_max, col_max)

def process_qMatrix(state):
    
    while (state != posState) and (state != negState):
        actionChosen = choose_action(state, row_max, col_max)
        stateSentTo = stateTo(state, actionChosen)
        rewardInNewState = getReward(stateSentTo)
        #updateQMatrix(state, actionChosen, stateSentTo, row_max, reward, alpha, gamma):
        updateQMatrix(state, actionChosen, stateSentTo, col_max, rewardInNewState, alpha, gamma)

        #print('start state: ', state)
        #print('action taken: ', actionChosen)
        #print('Action takes us to this state: ', stateSentTo)
        #print('Reward at state we are in: ', rewardInNewState)
        #print('Q matrix: ', q_matrix)
        
        state = stateSentTo

In [12]:
q_matrix = initializeQMatrix(states, row_max, col_max)
for _ in range(500000):
    stateIndex = np.random.choice(len(statesAvailable))
    process_qMatrix(statesAvailable[stateIndex])

In [17]:
def makeMoveMatrix(q_matrix):
    
    actionAbbr = ['U','D','L','R']
    movementGrid = []

    for i in range(row_max+1):
        #x = q_matrix[i*5 : i*5+5].flatten()
        x = q_matrix[i*num_rows : i*num_rows+num_rows].flatten()
        row = []
        for j in range(col_max+1):
            row.append(actionAbbr[np.argmax(x[j*4:j*4+4])])
        movementGrid.append(row)
    moveMatrix = np.array(movementGrid)
    
    #putting in the wall, pos and neg states
#     for wallState in wallStateMatrix:
#         moveMatrix[wallState] = 'X'
    moveMatrix[posState] = '+'
    moveMatrix[negState] = '-'
    
    return moveMatrix
        

moveMatrix = makeMoveMatrix(q_matrix)

#print('q_Matrix: \n', q_matrix)
print('moveMatrix: \n', moveMatrix)

moveMatrix: 
 [['D' 'D' 'D' 'D' 'D' 'D' 'D' 'D' 'D']
 ['U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U']
 ['U' '+' 'U' 'U' 'U' 'U' 'U' '-' 'U']
 ['U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U']
 ['U' 'U' 'U' 'U' 'U' 'U' 'U' 'U' 'U']]


In [32]:
# investigation into how to get neighbour q's given initial state
#num_rows = 9
#num_cols = 9
def neighbourStates(state, q_matrix):
    '''
    - Depending on 'state' we will get the one [above_it, below_it, left_of_it, on_its_right]
    - if no such state exists we will get -1
    - example:
      neighbourStates((0,0)) ==> [-1, (1, 0), -1, (0, 1)]
    '''
    neighbours = actionsAvailable(state, row_max, col_max)
    if neighbours[0] != -1:
        neighbours[0] = (state[0]-1, state[1])
    if neighbours[1] != -1:
        neighbours[1] = (state[0]+1, state[1])
    if neighbours[2] != -1:
        neighbours[2] = (state[0], state[1]-1)
    if neighbours[3] != -1:
        neighbours[3] = (state[0], state[1]+1)
        
    # adding extra section to give me q_matrix
    qForNeighbours = []
    correctValForQ = [1,0,3,2]
    for index, state in enumerate(neighbours):
        if state != -1:
            getRowNum = (num_cols)*state[0]+state[1]
            qForNeighbours.append(q_matrix[getRowNum][correctValForQ[index]])
        else:
            qForNeighbours.append(-1)
    
    return neighbours, qForNeighbours

neighbourStates(negState, q_matrix)

([(3, 7), (5, 7), (4, 6), (4, 8)], [0, 0, 0, 0])

In [48]:
q_matrix[4]

array([-1, 39, 39, 26])

## ============== Don't Touch ================

In [17]:
joblib.dump(q_matrix, 'qMatrix_9by9_changing_rewards_v1.pkl')

['qMatrix_9by9_changing_rewards_v1.pkl',
 'qMatrix_9by9_changing_rewards_v1.pkl_01.npy']

In [5]:
q_matrix_load = joblib.load('qMatrix_9by9_changing_rewards_v1.pkl')

In [6]:
len(q_matrix_load)

81

In [7]:
def makeMoveMatrix(q_matrix):
    
    actionAbbr = ['U','D','L','R']
    movementGrid = []

    for i in range(row_max+1):
        #x = q_matrix[i*5 : i*5+5].flatten()
        x = q_matrix[i*num_rows : i*num_rows+num_rows].flatten()
        row = []
        for j in range(col_max+1):
            row.append(actionAbbr[np.argmax(x[j*4:j*4+4])])
        movementGrid.append(row)
    moveMatrix = np.array(movementGrid)
    
    #putting in the wall, pos and neg states
#     for wallState in wallStateMatrix:
#         moveMatrix[wallState] = 'X'
    moveMatrix[posState] = '+'
    moveMatrix[negState] = '-'
    
    return moveMatrix

In [8]:
makeMoveMatrix(q_matrix_load)

array([['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'L', 'L'],
       ['R', '+', 'L', 'L', 'L', 'L', 'L', '-', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'L', 'L'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U']], 
      dtype='<U1')

In [20]:
# posNegWallState = [posState, negState, wallStateMatrix]
# posNegWallState
posNegState = [posState, negState]
posNegState

[(4, 1), (4, 7)]

In [22]:
joblib.dump(posNegState, 'posNegState_9by9_changing_rewards_v1.pkl')

['posNegState_9by9_changing_rewards_v1.pkl']

In [9]:
posNegWall_load = joblib.load('posNegState_9by9_changing_rewards_v1.pkl')

In [10]:
posNegWall_load

[(4, 1), (4, 7)]

# Using the above saved states to state processing

In [82]:
q_matrix_load = joblib.load('qMatrix_9by9_changing_rewards_v1.pkl')

In [83]:
posNegWall_load = joblib.load('posNegState_9by9_changing_rewards_v1.pkl')

In [84]:
q_matrix_load[:10]

array([[-1, 54, -1, 54],
       [-1, 63, 46, 46],
       [-1, 54, 54, 39],
       [-1, 46, 46, 32],
       [-1, 39, 39, 26],
       [-1, 32, 32, 21],
       [-1, 26, 26, 16],
       [-1, 21, 21, 12],
       [-1, 16, 16, -1],
       [46, 63, -1, 63]])

In [85]:
posState, negState = posNegWall_load

In [86]:
makeMoveMatrix(q_matrix_load)

array([['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'L', 'L'],
       ['R', '+', 'L', 'L', 'L', 'L', 'L', '-', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'L', 'L'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U']], 
      dtype='<U1')

In [112]:
q_matrix_load

array([[ -1,  54,  -1,  54],
       [ -1,  63,  46,  46],
       [ -1,  54,  54,  39],
       [ -1,  46,  46,  32],
       [ -1,  39,  39,  26],
       [ -1,  32,  32,  21],
       [ -1,  26,  26,  16],
       [ -1,  21,  21,  12],
       [ -1,  16,  16,  -1],
       [ 46,  63,  -1,  63],
       [ 54,  73,  54,  54],
       [ 46,  63,  63,  46],
       [ 39,  54,  54,  39],
       [ 32,  46,  46,  32],
       [ 26,  39,  39,  26],
       [ 21,  32,  32,  21],
       [ 16,  26,  26,  16],
       [ 12,  21,  21,  -1],
       [ 54,  73,  -1,  73],
       [ 63,  84,  63,  63],
       [ 54,  73,  73,  54],
       [ 46,  63,  63,  46],
       [ 39,  54,  54,  39],
       [ 32,  46,  46,  32],
       [ 26,  39,  39,  26],
       [ 21,  32,  32,  21],
       [ 16,  26,  26,  -1],
       [ 63,  84,  -1,  84],
       [ 73,  97,  73,  73],
       [ 63,  84,  84,  63],
       [ 54,  73,  73,  54],
       [ 46,  63,  63,  46],
       [ 39,  54,  54,  39],
       [ 32,  46,  46,  32],
       [ 26, -

In [87]:
neighbourStates(posState, q_matrix_load)

([(3, 1), (5, 1), (4, 0), (4, 2)], [97, 97, 97, 97])

In [101]:
q = np.copy(q_matrix_load[36])
-1*q

array([-73, -73,   1, -97])

In [121]:
# first order is to change the values of pos and neg states and then apply flip
def flipPosAndNeg(posNegStates, q_mat):
    qTemp = q_mat[:]
    correctValForQ = [1,0,3,2]
    for element in posNegStates:
        neighbours = neighbourStates(element, qTemp)
        state, stateVal = neighbours
        for index, st in enumerate(state):
            #print(index, st)
            getRowForState = num_cols*st[0]+st[1]
            qTemp[getRowForState, correctValForQ[index]] = -1*qTemp[getRowForState, correctValForQ[index]]
            
    return qTemp
            
            

def q_X_Transform(lst, direction=0):
    # type=0 implies x, 1 implies y flip
    temp = lst[:]
    if direction == 0:
        if -1 not in temp[2:]:
            valAtIndex3 = temp[2]
            temp[2] = temp[3]
            temp[3] = valAtIndex3
    return temp

temp = q_X_Transform([1,2,-1,4])
temp

qTemp = flipPosAndNeg([posState, negState], q_matrix_load)
qTemp[36:44]

array([[ 73,  73,  -1,  97],
       [  0,   0,   0,   0],
       [ 73,  73,  97,  73],
       [ 63,  63,  84,  63],
       [ 54,  54,  73,  54],
       [ 46,  46,  63,  46],
       [ 39,  39,  54, -97],
       [  0,   0,   0,   0]])

In [93]:
qTemp = []
for element in q_matrix_load:
    qTemp.append(q_X_Transform(list(element)))
    
qMatrix_transformed = np.array(qTemp)
flipPosAndNeg([posState, negState], q_matrix_load)

([(3, 1), (5, 1), (4, 0), (4, 2)], [97, 97, 97, 97])
([(3, 7), (5, 7), (4, 6), (4, 8)], [-97, -97, -97, -97])


array([[ -1,  54,  -1,  54],
       [ -1,  63,  46,  46],
       [ -1,  54,  54,  39],
       [ -1,  46,  46,  32],
       [ -1,  39,  39,  26],
       [ -1,  32,  32,  21],
       [ -1,  26,  26,  16],
       [ -1,  21,  21,  12],
       [ -1,  16,  16,  -1],
       [ 46,  63,  -1,  63],
       [ 54,  73,  54,  54],
       [ 46,  63,  63,  46],
       [ 39,  54,  54,  39],
       [ 32,  46,  46,  32],
       [ 26,  39,  39,  26],
       [ 21,  32,  32,  21],
       [ 16,  26,  26,  16],
       [ 12,  21,  21,  -1],
       [ 54,  73,  -1,  73],
       [ 63,  84,  63,  63],
       [ 54,  73,  73,  54],
       [ 46,  63,  63,  46],
       [ 39,  54,  54,  39],
       [ 32,  46,  46,  32],
       [ 26,  39,  39,  26],
       [ 21,  32,  32,  21],
       [ 16,  26,  26,  -1],
       [ 63,  84,  -1,  84],
       [ 73,  97,  73,  73],
       [ 63,  84,  84,  63],
       [ 54,  73,  73,  54],
       [ 46,  63,  63,  46],
       [ 39,  54,  54,  39],
       [ 32,  46,  46,  32],
       [ 26, -

In [69]:
makeMoveMatrix(qMatrix_transformed)

array([['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
       ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'L', 'L'],
       ['R', '+', 'L', 'L', 'L', 'L', 'L', '-', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'L', 'L'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U'],
       ['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U']], 
      dtype='<U1')

In [96]:
# don't remove
q_matrix_load[36:45]

array([[ 73,  73,  -1,  97],
       [  0,   0,   0,   0],
       [ 73,  73,  97,  73],
       [ 63,  63,  84,  63],
       [ 54,  54,  73,  54],
       [ 46,  46,  63,  46],
       [ 39,  39,  54, -97],
       [  0,   0,   0,   0],
       [ 26,  26, -97,  -1]])