In [51]:
import numpy as np

In [52]:
grid = np.zeros((4, 3))
policy = np.zeros_like(grid, dtype=str)
print(grid)
print(policy)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[['' '' '']
 ['' '' '']
 ['' '' '']
 ['' '' '']]


In [53]:
grid[:] = -0.04
grid[0,1]=1     # THE REWARD STATE
grid[0,2]=-1;   #THE RED STATE
grid[2,1]=np.nan
policy[2,1]="N.A"
grid

array([[-0.04,  1.  , -1.  ],
       [-0.04, -0.04, -0.04],
       [-0.04,   nan, -0.04],
       [-0.04, -0.04, -0.04]])

In [54]:
reward_Step=-0.04
reward_red=-1
reward_goal=1
p_indirec_action=0.7
p_perp=0.15

In [55]:
gamma=0.95 #discount factor 

threshold=0.0001 #this is the convergenve threshold

In [56]:
# Define a function to get the next state based on the current state and action
def get_next_state(row, col, action):
    if action == "up":
        next_row = max(row - 1, 0)
        next_col = col
    elif action == "down":
        next_row = min(row + 1, grid.shape[0] - 1)
        next_col = col
    elif action == "left":
        next_row = row
        next_col = max(col - 1, 0)
    elif action == "right":
        next_row = row
        next_col = min(col + 1, grid.shape[1] - 1)
    
    if(np.isnan(grid[next_row,next_col])):
        return row,col
    return next_row, next_col

In [57]:
# Perform Value Iteration
V = np.zeros_like(grid)
V[0,1]=1
V[0,2]=-1
print(V)

[[ 0.  1. -1.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]


In [58]:

def get_utility(grid,V,row,col):
    expected_utility = [] #this will take max of multiply with gamma and add it to the reward
    perp_actions = {"up": ["left", "right"], "down": ["left", "right"], "left": ["up", "down"], "right": ["up", "down"]}
    for action in perp_actions.keys():
        next_row, next_col = get_next_state(row, col, action)
        perp_row_col = [(get_next_state(row, col, perp_action)) for perp_action in perp_actions[action]]
        expected_utility.append(p_indirec_action*V[next_row, next_col] + p_perp*(V[perp_row_col[0][0], perp_row_col[0][1]] + V[perp_row_col[1][0], perp_row_col[1][1]]))
        
#     print(expected_utility)
    policy[row, col] = max(perp_actions.keys(), key=lambda x: expected_utility[list(perp_actions.keys()).index(x)])
    if grid[row,col] == 1:
        policy[row,col]="N.A"
        return reward_goal
    elif grid[row,col] == -1:
        policy[row,col]="N.A"
        return reward_red 
    else:
        return reward_Step + gamma*max(expected_utility)

           

In [59]:
iterations=0
while True:
    delta = 0
    iterations=iterations+1
    V_temp = np.zeros_like(V)
    for row in range(grid.shape[0]):
        for col in range(grid.shape[1]):
            V_temp[row][col]=V[row][col]
            
    for row in range(grid.shape[0]-1, -1, -1):
        for col in range(grid.shape[1]):
            if np.isnan(grid[row, col]):
                V[row,col]=0
                continue    
            V[row, col] = get_utility(grid, V_temp, row, col)
            delta = max(delta, np.abs(V_temp[row][col] - V[row, col]))

    if delta < threshold:
        break
    print("Iteration:" + str(iterations))
    print("\n")

    print(V)
    print("\n")


        

Iteration:1


[[ 0.625  1.    -1.   ]
 [-0.04   0.625 -0.04 ]
 [-0.04   0.    -0.04 ]
 [-0.04  -0.04  -0.04 ]]


Iteration:2


[[ 0.7083625  1.        -1.       ]
 [ 0.4589875  0.6136     0.227425 ]
 [-0.078      0.        -0.078    ]
 [-0.078     -0.078     -0.078    ]]


Iteration:3


[[ 0.79134737  1.         -1.        ]
 [ 0.58390478  0.72281378  0.214429  ]
 [ 0.24299669  0.          0.08900762]
 [-0.1141     -0.1141     -0.1141    ]]


Iteration:4


[[ 0.82097343  1.         -1.        ]
 [ 0.6724534   0.73876256  0.31085475]
 [ 0.41755074  0.          0.12796246]
 [ 0.0890743  -0.148395   -0.01332843]]


Iteration:5


[[ 0.83781332  1.         -1.        ]
 [ 0.70704561  0.76512141  0.32701176]
 [ 0.52618347  0.          0.20318771]
 [ 0.22921804 -0.02305817  0.02204945]]


Iteration:6


[[ 0.8451424   1.         -1.        ]
 [ 0.72692966  0.77235317  0.35525999]
 [ 0.58014762  0.          0.23537131]
 [ 0.33928979  0.10585842  0.09497608]]


Iteration:7


[[ 0.84902027  1.   

In [60]:
print("Utility Values: "+ str(iterations))
print("\n")

print(V)
print("\n")
print("Optimal Policy\n")
print(policy)


Utility Values: 22


[[ 0.85270162  1.         -1.        ]
 [ 0.74520539  0.78573618  0.38277098]
 [ 0.6371478   0.          0.30005495]
 [ 0.51825584  0.42603817  0.33356905]]


Optimal Policy

[['r' 'N' 'N']
 ['u' 'u' 'l']
 ['u' 'N' 'u']
 ['u' 'l' 'l']]
