<a href="https://colab.research.google.com/github/nilesh0109/ML_SoSe19/blob/master/Ex08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

# N x N Grid world

In [0]:
class Environment:
  
  
  actions = ['left', 'right', 'up', 'down']
  
  def __init__(self, grid_size=(4,4)):
    self.grid_size = grid_size 
    self.s = np.arange(self.grid_size[0] * self.grid_size[1])
    self.v = np.zeros(self.grid_size)
    self.terminal_state_indices = [self.s[0], self.s[-1]]
    # we are considering a random policy i.e. uniform probablity for each action
  
  def isTerminalState(self, state_index):
    return state_index in self.terminal_state_indices
    
  def step(self, state_index, action_index):
    num_rows, num_cols = self.grid_size
    if state_index % num_cols == 0 and self.actions[action_index] == 'left':
      return state_index, 0
    elif (state_index + 1) % num_cols == 0 and self.actions[action_index] == 'right':
      return state_index, 0
    elif state_index + 1 < num_cols and self.actions[action_index] == 'up':
      return state_index, 0
    elif state_index + 1 > (num_rows - 1) * num_cols and self.actions[action_index] == 'down':
      return state_index, 0
    else:      
      getNextState = {
          'left': -1,
          'right': 1,
          'up': -num_cols,
          'down': num_cols
      }
      next_state = getNextState[self.actions[action_index]] + state_index
      reward = -1 
      return next_state,reward
   
  def update_vtable(self, state_index): 
    num_rows, num_cols = self.grid_size
    num_valid_actions = 0
    v_sum = 0
    for action_index in np.arange(len(self.actions)):
      next_state, reward = self.step(state_index, action_index)
      if next_state == state_index:
        #it means the agent went outside the grid so this action is not possible
        continue
        
      v_sum += reward + self.v[int(next_state/num_rows), next_state%num_rows]
      num_valid_actions += 1
      # counting the number of valid actions to get the correct expectation value
    return np.round((1/num_valid_actions) * v_sum, 1)
      
      
  
  def run_iterative_Policy_Eval(self, num_iterations):
    self.v_copy = np.zeros((num_iterations, self.v.shape[0], self.v.shape[1]))
    for i in range(num_iterations):
      for state_ind in self.s:
        v_row_index, v_col_index = int(state_ind/self.grid_size[0]) , state_ind % self.grid_size[0]
        if not self.isTerminalState(state_ind):
          self.v_copy[i, v_row_index, v_col_index] = self.update_vtable(state_ind)
      self.v = self.v_copy[i]

# 4 X 4 Grid Environment

In [0]:
A = Environment()

In [4]:
print('Num States')
print(A.s)
print('termainal States')
print(A.terminal_state_indices)
print('Num Actions')
print(A.actions)
print('initial Value function')
A.v

Num States
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
termainal States
[0, 15]
Num Actions
['left', 'right', 'up', 'down']
initial Value function


array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [0]:
A.run_iterative_Policy_Eval(5)

In [6]:
print('Final Value function')
A.v

Final Value function


array([[ 0. , -3.5, -4.5, -4.8],
       [-3.5, -4.4, -4.7, -4.5],
       [-4.5, -4.7, -4.4, -3.5],
       [-4.8, -4.5, -3.5,  0. ]])

In [7]:
for i in range(A.v_copy.shape[0]):
  print('----- Value Iteration after step ',i, '-------------------')
  print(A.v_copy[i])

----- Value Iteration after step  0 -------------------
[[ 0. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -1.  0.]]
----- Value Iteration after step  1 -------------------
[[ 0.  -1.7 -2.  -2. ]
 [-1.7 -2.  -2.  -2. ]
 [-2.  -2.  -2.  -1.7]
 [-2.  -2.  -1.7  0. ]]
----- Value Iteration after step  2 -------------------
[[ 0.  -2.3 -2.9 -3. ]
 [-2.3 -2.8 -3.  -2.9]
 [-2.9 -3.  -2.8 -2.3]
 [-3.  -2.9 -2.3  0. ]]
----- Value Iteration after step  3 -------------------
[[ 0.  -2.9 -3.8 -3.9]
 [-2.9 -3.6 -3.8 -3.8]
 [-3.8 -3.8 -3.6 -2.9]
 [-3.9 -3.8 -2.9  0. ]]
----- Value Iteration after step  4 -------------------
[[ 0.  -3.5 -4.5 -4.8]
 [-3.5 -4.4 -4.7 -4.5]
 [-4.5 -4.7 -4.4 -3.5]
 [-4.8 -4.5 -3.5  0. ]]


In [0]:
def getArrows(A, row_index, col_index):
  neighbours = np.array([-10000, -10000, -10000, -10000], dtype=np.float32)
  arrows = ''
  num_rows, num_cols = A.shape

  if col_index - 1 >= 0:
    neighbours[0] = A[row_index, col_index - 1]
  if row_index - 1 >= 0:
    neighbours[1] = A[row_index - 1, col_index]
  if row_index + 1 < num_rows:
    neighbours[2] = A[row_index + 1, col_index]
  if col_index + 1 < num_cols:
    neighbours[3] = A[row_index, col_index + 1]

  max_indices = np.argwhere(neighbours == neighbours.max())
  
  if 0 in max_indices:
    arrows += '←'
  if 1 in max_indices:
    if 2 in max_indices:
      arrows += '↕'
    else:
      arrows += '↑'
  elif 2 in max_indices:
    arrows += '↓'
  
  if 3 in max_indices:
    arrows += '→'
  return arrows
    
def showGreedyAction(A):
  action_matrix = np.full(A.shape, '')
  print(action_matrix.dtype)
  for i in range(A.shape[0]):
    for j in range(A.shape[1]):
      action_matrix[i,j] = getArrows(A, i, j)
  return action_matrix
  

**Printing the optimal action policy**

In [9]:
print(A.v)
print(showGreedyAction(A.v))

[[ 0.  -3.5 -4.5 -4.8]
 [-3.5 -4.4 -4.7 -4.5]
 [-4.5 -4.7 -4.4 -3.5]
 [-4.8 -4.5 -3.5  0. ]]
<U1
[['↓' '←' '←' '←']
 ['↑' '←' '←' '↓']
 ['↑' '↑' '↓' '↓']
 ['↑' '→' '→' '←']]


# 10 x 10 Grid Experiment

In [0]:
B = Environment(grid_size= (10,10))

In [11]:
print('Num States')
print(B.s)
print('termainal States')
print(B.terminal_state_indices)
print('Num Actions')
print(B.actions)
print('initial Value function')
B.v

Num States
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
termainal States
[0, 99]
Num Actions
['left', 'right', 'up', 'down']
initial Value function


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [0]:
B.run_iterative_Policy_Eval(25)

In [13]:
print('Final Value function')
B.v

Final Value function


array([[  0. , -12.3, -18.4, -21.6, -23.3, -24.2, -24.6, -24.9, -25. ,
        -25. ],
       [-12.3, -16.7, -20.2, -22.4, -23.7, -24.4, -24.7, -24.8, -25. ,
        -25. ],
       [-18.4, -20.2, -22. , -23.2, -24. , -24.5, -24.7, -24.8, -24.8,
        -24.9],
       [-21.6, -22.4, -23.2, -23.9, -24.4, -24.6, -24.8, -24.7, -24.7,
        -24.6],
       [-23.3, -23.6, -24. , -24.4, -24.6, -24.7, -24.6, -24.5, -24.4,
        -24.2],
       [-24.1, -24.4, -24.5, -24.6, -24.7, -24.5, -24.4, -24.1, -23.6,
        -23.3],
       [-24.6, -24.6, -24.7, -24.7, -24.6, -24.4, -23.9, -23.2, -22.4,
        -21.6],
       [-24.8, -24.8, -24.8, -24.7, -24.5, -24. , -23.2, -22. , -20.2,
        -18.4],
       [-25. , -24.9, -24.8, -24.6, -24.3, -23.6, -22.4, -20.2, -16.7,
        -12.3],
       [-25. , -25. , -24.8, -24.6, -24.1, -23.3, -21.6, -18.4, -12.3,
          0. ]])

In [14]:
for i in range(B.v_copy.shape[0]):
  print('----- Value Iteration after step ',i, '-------------------')
  print(B.v_copy[i])

----- Value Iteration after step  0 -------------------
[[ 0. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1.  0.]]
----- Value Iteration after step  1 -------------------
[[ 0.  -1.7 -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2. ]
 [-1.7 -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2. ]
 [-2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2. ]
 [-2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2. ]
 [-2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2. ]
 [-2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2. ]
 [-2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2. ]
 [-2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2.  -2. ]
 [-2.  -2.  -2.  -2.  -2.  -2.  -

In [15]:
print(showGreedyAction(B.v))

<U1
[['↓' '←' '←' '←' '←' '←' '←' '←' '←' '←']
 ['↑' '←' '←' '←' '←' '←' '←' '←' '←' '↓']
 ['↑' '↑' '←' '←' '←' '←' '←' '←' '↓' '↓']
 ['↑' '↑' '↑' '←' '←' '←' '←' '↓' '↓' '↓']
 ['↑' '↑' '↑' '↑' '←' '↓' '↓' '↓' '↓' '↓']
 ['↑' '↑' '↑' '↑' '→' '↓' '↓' '↓' '↓' '↓']
 ['↑' '↑' '↑' '↑' '→' '→' '↓' '↓' '↓' '↓']
 ['↑' '↑' '↑' '→' '→' '→' '→' '↓' '↓' '↓']
 ['↑' '↑' '→' '→' '→' '→' '→' '→' '↓' '↓']
 ['↑' '→' '→' '→' '→' '→' '→' '→' '→' '←']]


# Simplified Environment Impelmentation(4X4 Grid )

In [0]:

class Environment2:
  num_columns = 4
  grid_size = num_columns * num_columns
  
  actions = ['left', 'right', 'up', 'down']
  
  def __init__(self):
    self.s = np.arange(self.grid_size)
    self.v = np.zeros(self.grid_size)
    self.pi_i = 1 / len(self.actions)
    # we are considering a random policy i.e. uniform probablity for each action
  
  def isTerminalState(self, state_index):
    return state_index == 0 or state_index == 15
    
  def step(self, state_index, action_index):
    if state_index % self.num_columns == 0 and self.actions[action_index] == 'left':
      return state_index, 0
    elif (state_index + 1) % self.num_columns == 0 and self.actions[action_index] == 'right':
      return state_index, 0
    elif state_index + 1 < self.num_columns and self.actions[action_index] == 'up':
      return state_index, 0
    elif state_index + 1 > self.grid_size - self.num_columns and self.actions[action_index] == 'down':
      return state_index, 0
    else:      
      getNextState = {
          'left': -1,
          'right': 1,
          'up': -self.num_columns,
          'down': self.num_columns
      }
      next_state = getNextState[self.actions[action_index]] + state_index
      reward = -1 
      return next_state,reward
   
  def update_vtable(self, state_index): 
    num_valid_actions = 0
    v_sum = 0
    for action_index in np.arange(len(self.actions)):
      next_state, reward = self.step(state_index, action_index)
      if next_state == state_index:
        #it means the agent went outside the grid so this action is not possible
        continue
      v_sum += reward + self.v[next_state]
      num_valid_actions += 1
      # counting the number of valid actions to get the correct expectation value
    return (1/num_valid_actions) * v_sum
      
      
  
  def run_iterative_Policy_Eval(self, num_iterations):
    v_copy = np.zeros_like(self.v)
    for i in range(num_iterations):
      for state_ind in self.s:
        if not self.isTerminalState(state_ind):
          v_copy[state_ind] = self.update_vtable(state_ind)
      self.v = v_copy

In [0]:
F = Environment2()

In [18]:
F.v

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [0]:
F.run_iterative_Policy_Eval(1)

In [20]:
F.v

array([ 0., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1.,  0.])

In [0]:
F.run_iterative_Policy_Eval(1)

In [22]:
F.v

array([ 0.        , -1.66666667, -2.        , -1.66666667, -1.66666667,
       -2.        , -2.        , -2.        , -2.        , -2.        ,
       -2.        , -1.66666667, -2.        , -2.        , -1.66666667,
        0.        ])