In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import random

### Implementation of The Environment

In [3]:
class gridworld_env:
    def __init__(self, y,x):
        
        # The board of the grid world
        self.world = np.arange(0,25).reshape((5,5))
        
        # Current position of the agent on the board
        self.current_position = (y,x)
        
        # Gamma coefficient
        self.gamma = 0.9
        
        # Possible moves
        self.WEST = 0
        self.NORTH = 1
        self.EAST = 2
        self.SOUTH = 3
        self.action_spaces = [self.WEST, self.NORTH, self.EAST, self.SOUTH]
        
        # Special states
        self.aprime = (0,1)
        self.bprime = (0,3)
    '''
    Agent take an action to move to new state, the environment returns 
    the reward for the action
    '''
    def take_action(self, action):
        
        # Compute the reward and new position of the agent for the action
        reward = 0
        new_position = self.current_position
        if action == self.NORTH:
            new_position = (max(self.current_position[0]-1,0), self.current_position[1])
            if new_position == self.current_position:
                reward = -1
        elif action == self.WEST:
            new_position = (self.current_position[0], max(self.current_position[1]-1,0))
            if new_position == self.current_position:
                reward = -1
        elif action == self.EAST:
            new_position = (self.current_position[0], (self.current_position[1]+1)%5)
            if new_position[1] == 0:
                reward = -1
                new_position = self.current_position
        elif action == self.SOUTH:
            new_position = ((self.current_position[0]+1)%5, self.current_position[1])
            if new_position[0] == 0:
                reward = -1
                new_position = self.current_position
        
        # Special reward if the agent is in A prime or B prime, then
        # we can ignore the computation of the reward above
        if self.current_position == self.aprime:
            reward = 10
            new_position = (4,1)
        elif self.current_position == self.bprime:
            reward = 5
            new_position = (2,3)
            
        # Create a new state of the gridworld
        new_env = gridworld_env(new_position[0], new_position[1])
        
        # Return the reward of the action and the new state of the gridworld
        return reward, new_env

### Implementation of The Tabular TD(0) for Estimating $v_\pi$

Input: the policy $\pi$ to be evaluated <br>
Algorithm parameters: step size $\alpha \in (0,1]$, small $\epsilon>0$ <br>
Initialize $V(s)$, for all $s\in S^+$, arbitrarily except that $V(terminal) = 0$ <br>
<br>
Loop for each episode: <br>
$\quad$ Initialize $s$ <br>
$\quad$ Loop for each step of episode: <br>
$\quad\quad$ $a \leftarrow$ action given by $\pi$ for state $s$ <br>
$\quad\quad$ Take action $a$, observe reward $r$ and next state $s'$ <br>
$\quad\quad$ $V(s) \leftarrow V(s) + \alpha\Big[ R + \gamma V(s')-V(s) \Big]$ <br>
$\quad\quad$ $s \leftarrow s'$ <br>
$\quad$ Until $s$ is terminal <br>

In [13]:
def td0_learning(episodes, steps):
    
    # State value talbe for every states in the grid world environment
    s_table = np.zeros((5,5))
    
    # Learning rate
    alpha = 0.001
    
    # Discount rate
    gamma = 0.9
    
    for episode in range(0, episodes):
        
        # Initialize new environment with a random starting position
        new_x = random.randint(0, 3)
        new_y = random.randint(0, 3)
        env = gridworld_env(new_y, new_x)
        
        for step in range(0, steps):
            
            # Take a random action a
            a = random.randint(0, 3)
            reward, new_env = env.take_action(a)
            
            # Get the state of the agent prior to taking action a
            prev_y = env.current_position[0]
            prev_x = env.current_position[1]
            
            # Get the state of the agent after taking action a
            curr_y = new_env.current_position[0]
            curr_x = new_env.current_position[1]
            
            # Update the value of the state prior to taking action a
            s_table[prev_y, prev_x] += alpha * (reward + gamma * s_table[curr_y, curr_x] - s_table[prev_y, prev_x]) 
            
            # Update the new environment
            env = new_env
            
    # Return the approximation of the value of states
    return s_table

### Test the TD(0) Algorithm with 100,000 Episodes of Length 100

In [14]:
print(td0_learning(100000, 100))

[[ 3.3505652   8.79149056  4.49336496  5.32596168  1.49893126]
 [ 1.52869309  2.98058435  2.26885413  1.98484334  0.53863336]
 [ 0.06655148  0.73839183  0.68221542  0.39102632 -0.38323434]
 [-0.98744136 -0.41182561 -0.32071835 -0.58421933 -1.18358308]
 [-1.83778317 -1.34291497 -1.20144778 -1.43113267 -1.9777177 ]]
