In [1]:
import numpy as np

def Value_Iteration(V,P,r,discount):
    ''' One synchronous Bellman optimality backup for a finite MDP.

    # Arguments:
        V - a 1D np.array. V[y] is the current value estimate of state y
        P - indexed as P[a][x][y]: probability of moving x -> y under action a.
            Only len() and [] indexing are used, so a 3D np.array or a
            sequence of 2D arrays both work
        r - indexed as r[a][x][y]: reward for the transition x -> y under action a
        discount - a float. discount factor applied to next-state values

    # Returns:
        (V_new, pi_new) from **one** sweep over all states:
        V_new[x]  - maximum over actions of the expected one-step return from x
        pi_new[x] - index of the action attaining that maximum
                    (the lowest index wins on ties)
    '''
    action_count = len(P)
    state_count = len(P[0])

    # q_values[a, x] = sum_y P[a][x][y] * (r[a][x][y] + discount * V[y])
    q_values = np.zeros((action_count, state_count))
    for a, x in np.ndindex(action_count, state_count):
        q_values[a, x] = np.dot(P[a][x], r[a][x] + discount * V)

    # Greedy step: best achievable value per state and the action achieving it.
    return q_values.max(axis=0), q_values.argmax(axis=0)

In [2]:
# Define the transition model P for the 13-state MDP.
#
# Each action starts from a deterministic successor table: entry x is the
# state reached from state x. Indexing the 13x13 identity matrix with that
# table gives the one-hot matrix whose row x has its single 1 in the
# successor's column.
#
# In every table, states 3, 5 and 11 move to state 12, and state 12 maps to
# itself (absorbing).
#
# NOTE(review): state 5 is entered only by Down from state 9. If the states
# form a 4-wide grid (0-3 / 4-7 / 8-11), then Right from 4, Left from 6 and
# Up from 1 all leave the state unchanged instead of entering 5 - confirm
# this asymmetry is intended.

P_Left = np.eye(13)[[0, 0, 1, 12, 4, 12, 6, 6, 8, 8, 9, 12, 12]]

P_Right = np.eye(13)[[1, 2, 3, 12, 4, 12, 7, 7, 9, 10, 11, 12, 12]]

P_Up = np.eye(13)[[4, 1, 6, 12, 8, 12, 10, 11, 8, 9, 10, 12, 12]]

P_Down = np.eye(13)[[0, 1, 2, 12, 0, 12, 2, 3, 4, 5, 6, 12, 12]]

# Uniform mixture of the four deterministic moves.
P_Random = 0.25 * (P_Left + P_Right + P_Up + P_Down)

# Blend each intended move with the uniform mixture: the chosen action is
# followed with weight (1 - Epsilon), the random mixture with weight Epsilon.
Epsilon = 0.8
P_Left, P_Right, P_Up, P_Down = (
    (1 - Epsilon) * intended + Epsilon * P_Random
    for intended in (P_Left, P_Right, P_Up, P_Down)
)

# Action index order: 0 = Left, 1 = Right, 2 = Up, 3 = Down.
P = [P_Left, P_Right, P_Up, P_Down]

In [3]:
# Dimensions of the MDP, listed in the same order as the axes of P and r
# (action first, then state).
number_of_actions = 4   # Left, Right, Up, Down
number_of_states = 13   # including exit state

In [4]:
# Reward model r[a][x][y]: the reward depends only on the destination state y,
# so one 13-entry row is repeated for every (action, source state) pair.
# Entering state 3 pays -1, state 5 pays -2, state 11 pays +2; all others pay 0.
r = np.tile(
    np.array([0, 0, 0, -1, 0, -2, 0, 0, 0, 0, 0, +2, 0]),
    (number_of_actions, number_of_states, 1),
)

discount = 0.9
# Same value as the one used to mix the transition matrices; it is not read
# again in this cell.
Epsilon = 0.8

In [5]:
# Start from the all-zero value function. Its length is taken from the
# transition model (one entry per row of P[0]) instead of a hard-coded 13, so
# it always matches the state count Value_Iteration derives from P.
V = np.zeros(len(P[0]))

# Apply 100 Bellman backups; after the loop, V and pi are the value function
# and greedy policy returned by the final sweep.
for _ in range(100):
    V, pi = Value_Iteration(V,P,r,discount)

In [6]:
V

array([0.03957056, 0.0564322 , 0.08053864, 0.        , 0.02783092,
       0.        , 0.71078858, 0.88773408, 0.01981325, 0.01478528,
       1.13488207, 0.        , 0.        ])

In [7]:
pi

array([1, 1, 2, 0, 3, 0, 2, 2, 3, 1, 1, 0, 0])