In [1]:
import numpy as np

In [2]:
R = np.matrix([
    [-1,-1,-1,-1,0,-1],
    [-1,-1,-1,0,-1,100],
    [-1,-1,-1,0,-1,-1],
    [-1,0,0,-1,0,-1],
    [-1,0,0,-1,-1,100],
    [-1,0,-1,-1,0,100]
])

print(R)

[[ -1  -1  -1  -1   0  -1]
 [ -1  -1  -1   0  -1 100]
 [ -1  -1  -1   0  -1  -1]
 [ -1   0   0  -1   0  -1]
 [ -1   0   0  -1  -1 100]
 [ -1   0  -1  -1   0 100]]


In [3]:
# Q matrix 
Q = np.matrix(np.zeros([6,6]))
print(Q)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [4]:
# learning parameter
gamma = 0.8

# inital state (usually to be chosen at random)
initial_state = 1

In [5]:
# this function returns all available actions in the state given ans # an argument 
def available_actions(state):
    current_state_row = R[state,]
    av_act = np.where(current_state_row >=0)[1]
    return av_act

In [6]:
# get available actions in the current state
available_act = available_actions(initial_state)

In [8]:
# this function chooses at random which action to be performed withing the range of all the available actions 
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_act, 1))
    return next_action

In [9]:
# sample next action to be performed
action = sample_next_action(available_act)

In [14]:
# this function updates the Q matrix according to the path selected and the Q learning algorithm
def update(current_state, action, gamma):
    max_index = np.where(Q[action,]== np.max(Q[action,]))[1]
    if max_index.shape[0]>1:
        max_index=int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]
    Q[current_state, action] = R[current_state, action] + gamma + max_value

In [15]:
update(initial_state, action, gamma)
print(Q)

[[  0.    0.    0.    0.    0.    0. ]
 [  0.    0.    0.    0.    0.  100.8]
 [  0.    0.    0.    0.    0.    0. ]
 [  0.    0.    0.    0.    0.    0. ]
 [  0.    0.    0.    0.    0.    0. ]
 [  0.    0.    0.    0.    0.    0. ]]


In [16]:
# train over 10000 iteraion (re-iterate the # process above)

for i in range(10000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)
    
print(Q)

[[    0.      0.      0.      0.  96737.6     0. ]
 [    0.      0.      0.  96536.8     0.  96837.6]
 [    0.      0.      0.  96636.      0.      0. ]
 [    0.  96635.2 96636.8     0.  96637.6     0. ]
 [    0.  96635.2 96636.8     0.      0.  96736.8]
 [    0.  96635.2     0.      0.  96636.  96736.8]]


In [17]:
# normalize the 'trained' q matrix
print('Trained Q matrix')
print(Q/np.max(Q)*100)

Trained Q matrix
[[  0.           0.           0.           0.          99.89673433
    0.        ]
 [  0.           0.           0.          99.68937685   0.
  100.        ]
 [  0.           0.           0.          99.7918164    0.
    0.        ]
 [  0.          99.79099028  99.79264253   0.          99.79346865
    0.        ]
 [  0.          99.79099028  99.79264253   0.           0.
   99.8959082 ]
 [  0.          99.79099028   0.           0.          99.7918164
   99.8959082 ]]


In [18]:
current_state = 2
steps = [current_state]

while current_state != 5:
    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
    if next_step_index.shape[0]>1:
        next_step_index = int(np.random.choice(next_step_index, size=1))
    else:
        next_step_index = int(next_step_index)
    steps.append(next_step_index)
    current_state = next_step_index

print('selected path')
print(steps)

selected path
[2, 3, 4, 5]


In [None]:
# the main goal is max reward and max value with rainforcement learning 