# Gridworld
* Very simple gridworld experiments

In [213]:
gridworld = [
         ["", "b", "b", "", "", "eH"],
         ["", "", "", "", "", "eU"],
         ["", "", "", "b", "", ""],
         ["", "", "", "b", "", ""],
         ["s", "", "", "", "", ""]
]
m = len(world)
n = len(world[0])
print("mxn shape", m,n)

mxn shape 5 6


In [214]:
reward_f = np.array([[-1.0, -1.0, -1.0, -1.0, -1.0, 10.0],
                   [-1.0, -1.0, -1.0, -1.0, -1.0, -10.0],
                   [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
                   [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
                   [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0]]) 

print("value shape", value.shape)
print(value[0][0])

value shape (5, 6)
-1.0


---

# Value function in a **Markov Random Process**
* 0.25% chance of going UP, DOWN, LEFT or RIGHT.
* Value iteration algorithm:
   * $ V_{t+1}(s) = R_t + \gamma * \sum_{s'}{ P(s'|s) * V_t(s') } $

In [215]:

def calc_value_function(gridworld, reward_f, rate, k_epochs):
  m,n = len(gridworld), len(gridworld[0])
  value_f = np.zeros((m,n))
  value_backup = np.zeros((m,n))
  

  for k in range(k_epochs):
    for i in range(m):
      for j in range(n):
        if gridworld[i][j] in ["", "s"]:
          # for possible ways to move
          c_left = (i, 0 if j == 0 else j-1)
          c_right = (i, n-1 if j == n-1 else j+1)
          c_up = (0 if i==0 else i-1, j)
          c_down = (m-1 if i==(m-1) else i+1, j)

          # keep in the same cell if neighbour is blocked
          c_left = (i,j) if gridworld[c_left[0]][c_left[1]] == 'b' else c_left
          c_right = (i,j) if gridworld[c_right[0]][c_right[1]] == 'b' else c_right
          c_up = (i,j) if gridworld[c_up[0]][c_up[1]] == 'b' else c_up
          c_down = (i,j) if gridworld[c_down[0]][c_down[1]] == 'b' else c_down

          value_backup[i][j] = reward_f[i][j] + rate * (  (1/4) * value_f[c_left[0], c_left[1]] + 
                                                          (1/4) * value_f[c_right[0], c_right[1]] + 
                                                          (1/4) * value_f[c_up[0], c_up[1]] + 
                                                          (1/4) * value_f[c_down[0], c_down[1]])
          
          value_f = value_backup.copy()
  return value_f




# **lambda rate** low
* myopic evaluation (see only immediate reward)

In [216]:
rate = 0.001
v = calc_value_function(gridworld, reward_f, rate, 3000)

for i in range(v.shape[0]):
  print(np.around(v[i],2))

[-1.  0.  0. -1. -1.  0.]
[-1. -1. -1. -1. -1.  0.]
[-1. -1. -1.  0. -1. -1.]
[-1. -1. -1.  0. -1. -1.]
[-1. -1. -1. -1. -1. -1.]


In [217]:
rate = 0.2
v = calc_value_function(gridworld, reward_f, rate, 3000)

for i in range(v.shape[0]):
  print(np.around(v[i],2))

[-1.25  0.    0.   -1.25 -1.18  0.  ]
[-1.25 -1.25 -1.25 -1.25 -1.18  0.  ]
[-1.25 -1.25 -1.25  0.   -1.24 -1.18]
[-1.25 -1.25 -1.25  0.   -1.25 -1.25]
[-1.25 -1.25 -1.25 -1.25 -1.25 -1.25]


In [218]:
rate = 0.5
v = calc_value_function(gridworld, reward_f, rate, 3000)

for i in range(v.shape[0]):
  print(np.around(v[i],2))

[-2.    0.    0.   -1.93 -1.66  0.  ]
[-2.   -2.   -1.99 -1.95 -1.69  0.  ]
[-2.   -2.   -2.    0.   -1.91 -1.69]
[-2.   -2.   -2.    0.   -1.98 -1.95]
[-2.   -2.   -2.   -2.   -2.   -1.99]


# lambda **rate high**
* far-sighted evaluation



In [221]:
rate = 0.9999999999
v = calc_value_function(gridworld, reward_f, rate, 3000)

for i in range(v.shape[0]):
  print(np.around(v[i],2))

[-125.29    0.      0.    -45.89  -27.93    0.  ]
[-121.29 -112.78  -95.8   -59.86  -33.89    0.  ]
[-121.79 -117.25 -110.76    0.    -43.77  -34.18]
[-122.85 -119.66 -115.22    0.    -59.23  -54.79]
[-123.09 -119.33 -111.25  -95.18  -75.12  -66.95]


# Trapped path


In [222]:
gridworld = [
         ["", "b", "b", "", "", "eH"],
         ["", "", "", "", "", "eU"],
         ["", "b", "b", "b", "", ""],
         ["", "b", "", "b", "", ""],
         ["s", "b", "", "b", "", ""]
]

* trapped states show no difference in evaluation for not high enogh lambda rates

In [223]:
rate = 0.8
v = calc_value_function(gridworld, reward_f, rate, 3000)

for i in range(v.shape[0]):
  print(np.around(v[i],2))

[-4.98  0.    0.   -4.13 -3.1   0.  ]
[-4.97 -4.9  -4.73 -4.29 -3.29  0.  ]
[-4.99  0.    0.    0.   -4.05 -3.37]
[-4.99  0.   -5.    0.   -4.56 -4.41]
[-5.    0.   -5.    0.   -4.76 -4.72]


* here the trapped states are more visible due to a higher **lambda rate**:

In [226]:
rate = 0.99999
v = calc_value_function(gridworld, reward_f, rate, 3000)

for i in range(v.shape[0]):
  print(np.around(v[i],2))

[-122.5     0.      0.    -36.16  -21.74    0.  ]
[-118.5   -98.53  -74.56  -46.59  -25.05    0.  ]
[-130.49    0.      0.      0.    -27.88  -21.16]
[ -138.48     0.   -3370.16     0.     -33.44   -31.6 ]
[ -142.47     0.   -3370.71     0.     -36.82   -36.21]


---