<a href="https://colab.research.google.com/github/oamerl/machine-learning-projects/blob/main/Reinforcement-Learning/grid-world/grid_world.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Needed Imports

In [None]:
import numpy as np

# Configure IPython to display the value of every expression statement that is
# run in a cell (not only the last one), so that several results can be shown
# from a single coding block without wrapping each of them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Policy Evaluation Algorithm

In [None]:
grid_cols = 5  # number of columns in grid (x-axis)
grid_rows = 5  # number of rows in grid (y-axis)

discount_factors = [0.75, 0.85, 0.9]  # discount factors to be tried
theta_thres = 0.01  # theta threshold: maximum total change between two sweeps

# deterministic environment: an action always leads to the same next state
state_transition_prob = 1

# uniform policy with probability of 0.25 for each action to be chosen
# (entries are aligned by position with the `actions` list below)
actions_prob_policy = [0.25, 0.25, 0.25, 0.25]

# possible actions as (row, column) offsets, stored as numpy arrays
actions = [np.array([-1, 0]),  # North
           np.array([1, 0]),  # South
           np.array([0, -1]),  # West
           np.array([0, 1])]  # East

# special cells, given as [row, column]: every action taken in A (resp. B)
# teleports the agent to A' (resp. B')
cell_A = [0, 1]
cell_A_prime = [4, 1]
cell_B = [0, 3]
cell_B_prime = [2, 3]


def evaluate_policy(discount_factor, initial_value=None, theta=theta_thres,
                    rows=grid_rows, cols=grid_cols):
    """Run iterative policy evaluation for the grid world and return the value function.

    The policy, actions and special cells are read from the module-level
    ``actions``, ``actions_prob_policy``, ``state_transition_prob``,
    ``cell_A``/``cell_A_prime`` and ``cell_B``/``cell_B_prime``.

    Args:
        discount_factor: discount factor (gamma) applied to the next state's value.
        initial_value: optional starting value function of shape ``(rows, cols)``;
            it is copied, never modified. Defaults to all zeros.
        theta: sweeping stops once the summed absolute change between two
            consecutive sweeps drops below this threshold.
        rows: number of grid rows (y-axis).
        cols: number of grid columns (x-axis).

    Returns:
        A ``(rows, cols)`` numpy array indexed as ``[y, x]``.

    Raises:
        ValueError: if ``initial_value`` does not have shape ``(rows, cols)``.

    Note:
        The grid must be large enough to contain the special cells and their
        teleport targets; otherwise indexing the value array fails.
    """
    # The array is shaped (rows, cols) because every access below is [y, x].
    if initial_value is None:
        value = np.zeros((rows, cols))
    else:
        value = np.array(initial_value, dtype=float)
        if value.shape != (rows, cols):
            raise ValueError(
                f"initial_value has shape {value.shape}, expected {(rows, cols)}"
            )

    while True:
        # values computed during the current sweep
        new_value = np.zeros((rows, cols))

        for y in range(rows):
            for x in range(cols):
                state = [y, x]

                for action, action_prob in zip(actions, actions_prob_policy):
                    if state == cell_A:
                        # any action in A teleports to A' with reward +10
                        (next_y, next_x), reward = cell_A_prime, 10.0
                    elif state == cell_B:
                        # any action in B teleports to B' with reward +5
                        (next_y, next_x), reward = cell_B_prime, 5.0
                    else:
                        next_y, next_x = y + action[0], x + action[1]
                        if 0 <= next_y < rows and 0 <= next_x < cols:
                            reward = 0.0
                        else:
                            # stepping off the grid: stay in place, reward -1
                            next_y, next_x = y, x
                            reward = -1.0

                    # expected-update contribution of this action
                    new_value[y, x] += action_prob * (
                        state_transition_prob
                        * (reward + discount_factor * value[next_y, next_x])
                    )

        # Stop when the summed absolute change is below theta. The sum is never
        # smaller than the largest single change, so this is at least as strict
        # as the usual "delta = max |v - V(s)| < theta" test.
        if np.sum(np.abs(value - new_value)) < theta:
            return new_value

        value = new_value


# initial value function initialized with zeros; shared, unmodified starting
# point for every discount factor
previous_value = np.zeros((grid_rows, grid_cols))

# loop over different discount factors and get the value function for each one
for discount_factor in discount_factors:

    # each gamma starts from the same zero initial value, so its result does
    # not depend on the gammas evaluated before it
    current_iter_value = evaluate_policy(discount_factor, initial_value=previous_value)

    print("Value function in case of discount factor gamma = ", discount_factor, "\n")
    # printed explicitly so the array is shown regardless of notebook display settings
    print(repr(current_iter_value))
    print("\n")
    print("-----------------------------------------------------------------------------")

Value function in case of discount factor gamma =  0.75 



array([[ 2.21211313,  9.3812912 ,  3.34519602,  5.11400177,  0.74879611],
       [ 0.65807248,  2.20408747,  1.33343379,  1.24674753,  0.05075425],
       [-0.23139657,  0.3822621 ,  0.31572576,  0.1523317 , -0.44037806],
       [-0.70940966, -0.25041636, -0.18331346, -0.30985432, -0.77723292],
       [-1.25998414, -0.82538043, -0.73503355, -0.84494532, -1.28590421]])



-----------------------------------------------------------------------------
Value function in case of discount factor gamma =  0.85 



array([[ 2.91309001,  9.05344334,  4.0499772 ,  5.23293169,  1.18652542],
       [ 1.18021666,  2.70545595,  1.89784016,  1.64391587,  0.33127614],
       [-0.06964679,  0.59874796,  0.53098714,  0.27390503, -0.42583896],
       [-0.861861  , -0.3502687 , -0.27273241, -0.46063032, -1.00715864],
       [-1.59910894, -1.11414074, -1.00500664, -1.16323629, -1.67085218]])



-----------------------------------------------------------------------------
Value function in case of discount factor gamma =  0.9 



array([[ 3.31283295,  8.79309665,  4.43105838,  5.32555202,  1.49505291],
       [ 1.52534712,  2.99596941,  2.25352774,  1.91069663,  0.55032164],
       [ 0.05456092,  0.74180281,  0.67654928,  0.36141026, -0.40006032],
       [-0.96980851, -0.43179294, -0.35132735, -0.58220167, -1.17977415],
       [-1.85386334, -1.34146119, -1.22560997, -1.41938165, -1.97171949]])



-----------------------------------------------------------------------------


In [None]:
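# Scratch notes: unused alternatives to the convergence test in the
# policy-evaluation cell above (kept for reference, not executed).

# Alternative 1: stop once every state's change is below the threshold
#cond = np.abs(previous_value - current_iter_value) < theta_thres
#if False not in cond:
#    break

# Alternative 2: track the largest per-state change (delta)
#delta = np.maximum(delta, np.abs(previous_value - current_iter_value))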