In [1]:
import numpy as np

## INITIALIZATION

In [2]:
# Reward matrix: R[s, a] is the reward for moving from state s to state a.
#    -1 -> move not available
#     0 -> move available (no reward)
#   100 -> move reaches the final state
R = np.matrix([
    [-1, -1, -1, -1,  0,  -1],
    [-1, -1, -1,  0, -1, 100],
    [-1, -1, -1,  0, -1,  -1],
    [-1,  0,  0, -1,  0,  -1],
    [-1,  0,  0, -1, -1, 100],
    [-1,  0, -1, -1,  0, 100],
])

# Quality matrix: same shape as R, all zeros before any training has happened
Q = np.matrix(np.zeros(R.shape))

# Discount factor applied to the best future value in the Q-learning update
gamma = 0.8

## FUNCTIONS

In [3]:
def get_next_action(state):
    """Pick a random action among those available from ``state``.

    An action is available when its entry in row ``state`` of the global
    reward matrix ``R`` is greater than or equal to 0.

    Parameters
    ----------
    state : int
        Row index into ``R`` of the state the agent is currently in.

    Returns
    -------
    int
        Column index of the randomly chosen available action.

    Raises
    ------
    ValueError
        If no action is available from ``state``.
    """
    # Flatten the row so this works whether R is an np.matrix (row slices
    # stay 2-D) or a plain ndarray (row slices are 1-D).
    rewards = np.asarray(R[state]).ravel()
    available_actions = np.flatnonzero(rewards >= 0)
    if available_actions.size == 0:
        raise ValueError("state {} has no available actions".format(state))
    # choice() without a size returns a scalar; calling int() on a 1-element
    # array instead is deprecated since NumPy 1.25.
    return int(np.random.choice(available_actions))

In [4]:
def update(current_state, action, gamma):
    """Apply one Q-learning update to the global ``Q`` matrix in place.

    Implements ``Q[s, a] = R[s, a] + gamma * max(Q[a, :])``: the row of ``Q``
    indexed by ``action`` is the row consulted for the best future value.

    Parameters
    ----------
    current_state : int
        Row index of the entry of ``Q`` (and ``R``) being updated.
    action : int
        Column index of the entry being updated; also selects the row of
        ``Q`` whose maximum is used as the future value.
    gamma : float
        Discount factor multiplied with that maximum.

    Returns
    -------
    None
        ``Q`` is modified in place.
    """
    # Only the maximum *value* of the row is needed. Picking a random index
    # among tied maxima is unnecessary because every tied index holds the
    # same value, and it required a deprecated array-to-int conversion.
    max_value = np.max(Q[action, :])

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value


## TRAINING

In [5]:
# Seed NumPy's global RNG so a fresh "Restart & Run All" repeats the same
# sequence of random states and actions.
np.random.seed(42)

n_iterations = 10000  # number of single (state, action) updates to perform
for i in range(n_iterations):
    # Random state index in [0, number of states); randint's upper bound is exclusive
    current_state = np.random.randint(0, int(Q.shape[0]))
    action = get_next_action(current_state) # Random available action from that state
    update(current_state, action, gamma) # Update Q[current_state, action] with the Q formula

# Normalize the "trained" Q matrix so its largest entry becomes 100
q_max = np.max(Q)
print("Trained Q matrix:")
# Guard the division: an all-zero Q is printed unscaled instead of as NaNs
print(Q/q_max*100 if q_max > 0 else Q)

Trained Q matrix:
[[  0.    0.    0.    0.   80.    0. ]
 [  0.    0.    0.   64.    0.  100. ]
 [  0.    0.    0.   64.    0.    0. ]
 [  0.   80.   51.2   0.   80.    0. ]
 [  0.   80.   51.2   0.    0.  100. ]
 [  0.   80.    0.    0.   80.  100. ]]


## TESTING

In [6]:
# Goal state = 5
# Best path starting from state 2: 2 -> 3 -> 1 -> 5 (2 -> 3 -> 4 -> 5 is equally short, so either may be selected)

In [7]:
current_state = 2 # Start the walk from state 2
steps = [current_state]

while current_state != 5: # Keep moving until state 5 is reached
    # Greedy step: candidates are the columns holding the maximum of the
    # current row of Q. Flattening the row makes this work whether Q is an
    # np.matrix (row slices stay 2-D) or a plain ndarray.
    q_row = np.asarray(Q[current_state]).ravel()
    best_next_states = np.flatnonzero(q_row == q_row.max())

    # choice() without a size returns a scalar: ties are broken uniformly at
    # random and a single candidate is returned as-is. Calling int() on a
    # 1-element array instead is deprecated since NumPy 1.25.
    next_step_index = int(np.random.choice(best_next_states))

    steps.append(next_step_index) # Record the chosen state in the path
    current_state = next_step_index

# Print selected sequence of steps
print("Selected path:")
print(steps)


Selected path:
[2, 3, 1, 5]
