## Value Iteration on GRID Game Example

First, let's import the standard numeric library (NumPy).
<img src="Pictures/Grid.png" alt="Drawing" style="width: 400px;"/>

In [8]:
import numpy as np
from IPython.display import clear_output

Below we define the values (rewards or costs) of the cells in the grid, $\epsilon$, the probability of a random action, and $\beta$, the discount factor.

In [2]:
# Value attached to each of the 12 cells, indexed 0-11 (four per line below).
Grid = np.matrix([
    0,  0, 0, -1,
    0, -2, 0,  0,
    0,  0, 0,  2,
])

# Probability that a uniformly random action replaces the chosen one.
Epsilon = 0.8

# Factor applied to the best successor value at every update.
beta = 0.9

We define the jumps taken from following each action: Left, Right, Up and Down. Row $i$ of each matrix has a single 1 in the column of the cell reached from cell $i$.
(This could probably be automated for bigger examples.)

In [3]:
def _jump_matrix(successors):
    """Return a square 0/1 matrix whose row i has its only 1 in column successors[i]."""
    identity = np.eye(len(successors), dtype=int)
    # Picking rows of the identity puts the 1 of row i at column successors[i].
    return np.matrix(identity[successors])


# Each list gives, for cells 0..11 in order, the cell reached by that action.
# Cells 3, 5 and 11 map to themselves under every action.
# NOTE(review): Down from cell 9 lands in cell 5, whereas Right from 4,
# Left from 6 and Up from 1 keep the agent in place instead of entering
# cell 5 -- confirm against the grid picture which behaviour is intended.
P_Left = _jump_matrix([0, 0, 1, 3, 4, 5, 6, 6, 8, 8, 9, 11])

P_Right = _jump_matrix([1, 2, 3, 3, 4, 5, 7, 7, 9, 10, 11, 11])

P_Up = _jump_matrix([4, 1, 6, 3, 8, 5, 10, 11, 8, 9, 10, 11])

P_Down = _jump_matrix([0, 1, 2, 3, 0, 5, 2, 3, 4, 5, 6, 11])

Define initial values of the value function

In [4]:
# Start the value function from the cell values, as a column vector.
V = Grid.T

The following updates the value function once (allowing for random actions).

In [9]:
def Update_V(V,P_Left,P_Right,P_Up,P_Down,Epsilon,beta,terminal_states=(3,5,11)):
    """Apply one value-iteration update and return the new value vector.

    For every action the chosen jump is followed with weight ``1 - Epsilon``
    and a uniformly random one of the four jumps with weight ``Epsilon``.
    The new value of a state is ``beta`` times the best of the four resulting
    action values. ``V`` itself is not modified.

    Parameters
    ----------
    V : column vector of current state values (one row per state).
    P_Left, P_Right, P_Up, P_Down : square jump matrices, one per action.
    Epsilon : weight given to the uniformly random action.
    beta : factor multiplying the best action value.
    terminal_states : iterable of state indices whose value is copied from
        ``V`` unchanged. Defaults to ``(3, 5, 11)``, the indices previously
        hard-coded for the 12-cell grid.

    Returns
    -------
    The updated value vector, with the same shape as ``V``.
    """
    P_Random = 0.25*P_Left + 0.25*P_Right + 0.25*P_Up + 0.25*P_Down

    # One action-value vector per action, in the order Left, Right, Up, Down.
    action_values = [
        ( ( 1 - Epsilon ) * P_Action + Epsilon * P_Random ) @ V
        for P_Action in (P_Left, P_Right, P_Up, P_Down)
    ]

    # Pairwise np.maximum (rather than np.maximum.reduce on the list) keeps
    # the result an np.matrix when the inputs are np.matrix.
    best = action_values[0]
    for Q_Action in action_values[1:]:
        best = np.maximum(best, Q_Action)
    V_new = beta * best

    # Terminal states keep the value they already had.
    for i in terminal_states:
        V_new[i] = V[i]

    return V_new

We perform value iteration 1000 times.

In [10]:
# Number of value-iteration updates to apply.
N_ITERATIONS = 1000

for _ in range(N_ITERATIONS):
    V = Update_V(V,P_Left,P_Right,P_Up,P_Down,Epsilon,beta)
    # Overwrite the previous printout instead of appending a new one, so the
    # cell shows only the latest values rather than N_ITERATIONS copies.
    clear_output(wait=True)
    print(np.transpose(V))

[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
[[ 0.0

Finally output the values found.

In [11]:
# Show the value vector as a single row.
print(V.T)

[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
