In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import random

### Implementation of The Environment

In [2]:
class gridworld_env:
    """A 5x5 gridworld with two special (teleporting) states.

    Positions are ``(row, column)`` tuples. Moving NORTH decreases the row
    index and moving WEST decreases the column index, so ``(0, 0)`` is the
    north-west corner of the board.

    Instances are treated as immutable states: ``take_action`` never
    modifies the environment it is called on; it returns a new one.
    """

    def __init__(self, y, x):
        """Create an environment with the agent at row ``y``, column ``x``."""

        # The board of the grid world
        self.world = np.arange(0, 25).reshape((5, 5))

        # Current position of the agent on the board, as (row, column)
        self.current_position = (y, x)

        # Gamma coefficient
        self.gamma = 0.9

        # Possible moves
        self.WEST = 0
        self.NORTH = 1
        self.EAST = 2
        self.SOUTH = 3
        self.action_spaces = [self.WEST, self.NORTH, self.EAST, self.SOUTH]

        # Special states: every action taken from one of them yields a
        # fixed reward and moves the agent to a fixed cell
        self.aprime = (0, 1)
        self.bprime = (0, 3)

    def take_action(self, action):
        """Apply ``action`` and return the reward and the resulting state.

        Parameters
        ----------
        action : int
            One of ``WEST``, ``NORTH``, ``EAST``, ``SOUTH`` (0, 1, 2, 3).

        Returns
        -------
        (reward, new_env)
            ``reward`` is 10 when leaving ``aprime``, 5 when leaving
            ``bprime``, -1 when the move would leave the board (the agent
            then stays where it is) and 0 otherwise. ``new_env`` is a fresh
            ``gridworld_env`` positioned at the resulting cell.

        Raises
        ------
        ValueError
            If ``action`` is not one of the four known moves.
        """
        # (row, column) displacement for each move
        offsets = {
            self.WEST: (0, -1),
            self.NORTH: (-1, 0),
            self.EAST: (0, 1),
            self.SOUTH: (1, 0),
        }
        if action not in offsets:
            raise ValueError(
                f"unknown action {action!r}; expected one of {self.action_spaces}"
            )

        # Special states override the ordinary dynamics, so handle them
        # before computing any movement
        if self.current_position == self.aprime:
            return 10, gridworld_env(4, 1)
        if self.current_position == self.bprime:
            return 5, gridworld_env(2, 3)

        n_rows, n_cols = self.world.shape
        row, col = self.current_position
        d_row, d_col = offsets[action]
        new_row, new_col = row + d_row, col + d_col

        # Stepping off the board is penalised and leaves the agent in place
        if not (0 <= new_row < n_rows and 0 <= new_col < n_cols):
            return -1, gridworld_env(row, col)

        return 0, gridworld_env(new_row, new_col)

### Implementation of The State Value Function
Estimate the state-value function, which measures how good it is for an agent to be in a given state when it follows the policy $\pi$.

$$v_\pi(s) = \sum_a \pi(a|s) \sum_{s', r}P(s',r|s,a)\Big[\ r + \gamma v_\pi(s') \Big] \quad\forall s\in S$$

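s: given state <br>
s': future state <br>
a: an action the agent can make in that state <br>
r: possible reward <br>
$\pi(a|s)$: probability that the policy chooses action a in state s <br>
$P(s',r|s,a)$: probability of reaching state s' with reward r after taking action a in state s <br>
$\gamma$: discount rate <br>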

In [17]:
def approx_state_val(gamma=0.9, tol=1e-5, env_factory=None):
    """Evaluate the equiprobable random policy on the 5x5 gridworld.

    Repeatedly applies the Bellman expectation update to every state
    (synchronously: each sweep reads only the previous sweep's values)
    until no state value changes by ``tol`` or more.

    Parameters
    ----------
    gamma : float, optional
        Discount rate, ``0 <= gamma < 1``. Defaults to 0.9.
    tol : float, optional
        Convergence threshold on the largest absolute change of any state
        value between two sweeps. Must be positive. Defaults to 1e-5.
    env_factory : callable, optional
        Called as ``env_factory(y, x)`` to build the environment with the
        agent at row ``y``, column ``x``. The returned object must provide
        ``take_action(action) -> (reward, new_env)`` where ``new_env`` has a
        ``current_position`` of the form ``(row, column)``. Defaults to
        ``gridworld_env``.

    Returns
    -------
    numpy.ndarray
        Array of shape (5, 5); entry ``[y, x]`` is the estimated value of
        the state at row ``y``, column ``x``.

    Raises
    ------
    ValueError
        If ``gamma`` is outside ``[0, 1)`` or ``tol`` is not positive;
        either would keep the loop from terminating reliably.
    """
    if not 0 <= gamma < 1:
        raise ValueError(f"gamma must satisfy 0 <= gamma < 1, got {gamma!r}")
    if not tol > 0:
        raise ValueError(f"tol must be positive, got {tol!r}")
    if env_factory is None:
        env_factory = gridworld_env

    # The environment is a 5x5 grid world
    n_rows, n_cols = 5, 5

    # Possible actions an agent can make, and the probability of choosing
    # each of them (uniform random policy)
    actions = [0, 1, 2, 3]
    actions_p = np.array([0.25, 0.25, 0.25, 0.25])

    # Table displaying the value of the states
    state_val_table = np.zeros((n_rows, n_cols))

    while True:

        # State values produced by the current sweep
        tmp_table = np.zeros_like(state_val_table)

        # For each state in the state space, approximate the value of that state
        for y in range(n_rows):
            for x in range(n_cols):
                env = env_factory(y, x)

                new_val = 0.0
                for action in actions:
                    reward, new_env = env.take_action(action)
                    new_y, new_x = new_env.current_position

                    # Transitions are deterministic, so the inner sum over
                    # (s', r) in the Bellman equation has a single term
                    # with probability 1
                    new_val += actions_p[action] * (
                        reward + gamma * state_val_table[new_y, new_x]
                    )

                tmp_table[y, x] = new_val

        # Largest per-state change in this sweep. Comparing the sums of the
        # two tables instead would let changes of opposite sign cancel and
        # stop the iteration before the values have actually converged.
        delta = np.max(np.abs(tmp_table - state_val_table))

        state_val_table = tmp_table
        if delta < tol:
            break

    return state_val_table

### Approximate the Value of The States

In [19]:
# Run approx_state_val and print the table of state values it returns
# (one entry per grid cell, indexed [row, column]).
print(approx_state_val())

[[ 3.30899965  8.78929517  4.42762249  5.3223709   1.49218207]
 [ 1.52159138  2.99232117  2.25014326  1.90757502  0.54740602]
 [ 0.0508258   0.7381739   0.67311657  0.35818953 -0.40313783]
 [-0.97358899 -0.43549212 -0.35487896 -0.58560178 -1.18307177]
 [-1.85769724 -1.34522795 -1.22926395 -1.42291484 -1.97517574]]
