## Value Iteration on GRID Game Example

First, let's import the standard numeric library (NumPy).
<img src="Pictures/Grid.png" alt="Drawing" style="width: 400px;"/>

In [1]:
import numpy as np
from IPython.display import clear_output

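Below we define the rewards of the cells in the grid (negative values are costs), $\epsilon$, the probability that a uniformly random action is taken instead of the chosen one, and the discount factor $\beta$.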

In [2]:
# Cell values in state order 0..11, written four states per line (the row width
# implied by the transition matrices) and then flattened back into the 1x12 row
# vector used by the rest of the notebook.
Grid = np.matrix(
    [
        [0,  0, 0, -1],   # states 0-3
        [0, -2, 0,  0],   # states 4-7
        [0,  0, 0,  2],   # states 8-11
    ]
).reshape(1, 12)
Epsilon = 0.8   # weight given to the uniformly random action in Update_V
beta = 0.9      # factor multiplying the best action value in Update_V

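We define the transition matrix for each action: Left, Right, Up and Down. Row $i$ of a matrix has a single 1 in the column of the state reached by taking that action in state $i$.
(This could probably be automated for bigger examples.)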

In [3]:
# Action Left: entry k of the list is the state reached from state k (a state
# maps to itself when the move leaves it in place). Picking those rows of the
# identity matrix gives the 12x12 one-hot transition matrix.
P_Left = np.matrix(np.eye(12, dtype=int)[[0, 0, 1, 3, 4, 5, 6, 6, 8, 8, 9, 11]])

# Action Right: entry k of the list is the state reached from state k (a state
# maps to itself when the move leaves it in place). Picking those rows of the
# identity matrix gives the 12x12 one-hot transition matrix.
P_Right = np.matrix(np.eye(12, dtype=int)[[1, 2, 3, 3, 4, 5, 7, 7, 9, 10, 11, 11]])
                   
# Action Up: entry k of the list is the state reached from state k (a state
# maps to itself when the move leaves it in place). Picking those rows of the
# identity matrix gives the 12x12 one-hot transition matrix.
P_Up = np.matrix(np.eye(12, dtype=int)[[4, 1, 6, 3, 8, 5, 10, 11, 8, 9, 10, 11]])
                   
# Action Down: entry k of the list is the state reached from state k (a state
# maps to itself when the move leaves it in place). Picking those rows of the
# identity matrix gives the 12x12 one-hot transition matrix.
# NOTE(review): Down from state 9 leads into state 5, whereas Left from 6,
# Right from 4 and Up from 1 leave the agent where it is - confirm that this
# asymmetry around state 5 is intended.
P_Down = np.matrix(np.eye(12, dtype=int)[[0, 1, 2, 3, 0, 5, 2, 3, 4, 5, 6, 11]])

In [7]:
# Inspect the transition row of the Up action for state 0.
P_Up[0, :]

0

Define the initial values of the value function: the cell values of the grid, as a column vector.

In [12]:
# Start value iteration from the cell values, as a 12x1 column.
V = Grid.T

The following updates the value function once (allowing for random actions).

In [13]:
def Update_V(V,P_Left,P_Right,P_Up,P_Down,Epsilon,beta,terminal_states=(3,5,11)):
    """Apply one value-iteration update and return the new value function.

    For every state the chosen action's transition matrix is mixed with the
    uniform average of all four action matrices (weight ``Epsilon`` on the
    average), the resulting expected next-state value is computed for each
    action, and the best of the four is multiplied by ``beta``.

    Parameters
    ----------
    V : column of current state values with one row per state (12x1 in this
        notebook). It is not modified.
    P_Left, P_Right, P_Up, P_Down : square transition matrices, one per
        action; row i holds the next-state distribution for state i.
    Epsilon : weight of the uniformly random action (0 means the chosen
        action is always executed).
    beta : factor applied to the best action value.
    terminal_states : iterable of state indices whose value is copied from
        ``V`` unchanged. Defaults to the three fixed cells of the 12-state
        grid; pass another collection (or ``()``) for other problems.

    Returns
    -------
    A new column with the updated values, of the same shape as ``V``.
    """
    # Transition matrix of an action drawn uniformly at random.
    P_Random = 0.25*P_Left + 0.25*P_Right + 0.25*P_Up + 0.25*P_Down

    # Expected next-state value of each action under the Epsilon mixture.
    Q_Left, Q_Right, Q_Up, Q_Down = (
        ( ( 1 - Epsilon ) * P_action + Epsilon * P_Random ) @ V
        for P_action in (P_Left, P_Right, P_Up, P_Down)
    )

    # Element-wise best action value per state.
    Left_OR_Right = np.maximum(Q_Left,Q_Right)
    Up_OR_Down = np.maximum(Q_Up,Q_Down)
    V_new = beta * np.maximum(Left_OR_Right,Up_OR_Down)

    # Terminal cells keep the value they had on entry.
    for i in terminal_states:
        V_new[i] = V[i]

    return V_new

We perform 100 iterations of value iteration, printing the value function after each update.

In [14]:
# Run 100 sweeps of value iteration, showing the values as a row after each sweep.
for sweep in range(100):
    V = Update_V(V, P_Left, P_Right, P_Up, P_Down, Epsilon, beta)
    print(V.T)

[[ 0.    0.   -0.18 -1.    0.   -2.    0.    0.54  0.   -0.36  0.72  2.  ]]
[[ 0.     -0.0324 -0.2124 -1.      0.     -2.      0.324   0.6372 -0.0648
  -0.1656  0.7848  2.    ]]
[[-0.005832 -0.049896 -0.107424 -1.       -0.011664 -2.        0.417312
   0.713016 -0.053136 -0.118944  0.889776  2.      ]]
[[-0.01423008 -0.0393984  -0.05808528 -1.         -0.01586304 -2.
   0.50444208  0.74345904 -0.04473792 -0.07065504  0.93386592  2.        ]]
[[-0.0176313  -0.0297616  -0.01594791 -1.         -0.01888635 -2.
   0.55035858  0.7646222  -0.03453425 -0.044579    0.96617753  2.        ]]
[[-0.01827753 -0.01962906  0.00990138 -1.         -0.01936252 -2.
   0.58164983  0.77669654 -0.02725564 -0.02641647  0.98495228  2.        ]]
[[-0.01688835 -0.00679192  0.02764296 -1.         -0.01845643 -2.
   0.60086742  0.78450235 -0.0215375  -0.01507816  0.99723341  2.        ]]
[[-0.01184706  0.00446647  0.04006546 -1.         -0.01660087 -2.
   0.61334632  0.78936656 -0.0165038  -0.00758679  1.00494408 

Finally, output the values found.

In [15]:
# Show the final value function as a single row.
print(V.T)

[[ 0.0356135   0.05078898  0.07248478 -1.          0.02504783 -2.
   0.63970972  0.79896067  0.01783193  0.01330675  1.02139386  2.        ]]
