<a href="https://colab.research.google.com/github/m7saikat/IE-7374/blob/master/lab_4_1_Policy_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Policy evaluation**




In [0]:
import os
import numpy as np
from tabulate import tabulate

Defining a class that constructs a basic MDP problem. A MDP has the following attributes.

1. State,
2. Possible actions,
3. Rewards after each of the action taken.

The class initializes a basic grid world as a MDP porblem, with certain features, such as `height` `width` of the grid, which represents the state. Starting position of the agent, `start_state`, `terminal_state` of the agent.

In [0]:
class BaseGridworld:
    """
    Defines the base class for the Gridworld MDP.
    State representation: (x,y); width and height coordinate standard cartisian.
    Action representation: (x_offset, y_offset); where offset is distance in the x or y direction from the starting point (0,0).
    E.g. (0, 0) is starting point for the agent; x increases in right direction, y increases in up direction.
         Action (1, 0) is increments x with +1 and y with 0; Thus state (0,0) + action (1,0) = next_state (1,0). ## RIGHT ACTIONmaention
         Action (0, 1) is increments x with 0 and y with +1; Thus state (0,0) + action (0,1) = next_state (0,1).
         state (1,2) + action (0,1) = next_state (1,3)
    """
    def __init__(self, width, height, start_state=None, terminal_states=[]):
        """
        Args
            width, height -- ints; dimensions of the grid for x and y.
            start_state -- tuple; agent start state.
            terminal_states -- list of tuples; special states (if any) with specified reward and action (e.g. cliff in Ch 6)
        """
        # note: all coordinates in 0-indexed cartesian x-y; with origin (0,0) in the bottom left
        self.width = width
        self.height = height
        self.start_state = start_state
        self.terminal_states = terminal_states

        self.reset_state()

    def get_possible_actions(self, state):
        # default actions: north, west, south, east
        all_actions = [(0,1), (-1,0), (0,-1), (1,0)]
        return all_actions

    def get_states(self):
        return [(x,y) for x in range(self.width) for y in range(self.height)]

    def get_state_reward_transition(self, state, action):
        # perform action
        next_state = np.array(state) + np.array(action)

        # clip to grid in case action resulted in off-the-grid state
        next_state = self._clip_state_to_grid(next_state)

        # make into tuple of ints
        next_state = int(next_state[0]), int(next_state[1])

        # get reward
        reward = self.get_reward(state, action, next_state)
        # print (reward)

        return next_state, reward
    
    def get_reward(self, state, action, next_state):
        if state in self.terminal_states:
            return 0
        else:
            return -1

    def _clip_state_to_grid(self, state):
        '''
          Given an interval, values outside the interval are clipped to the interval edges.
          For example, if an interval of [0, 1] is specified, values smaller than 0 become 0, and values larger than 1 become 1.
        '''
        x, y = state
        return np.clip(x, 0, self.width-1), np.clip(y, 0, self.height-1)

    def is_terminal(self, state):
        return tuple(state) in self.terminal_states

    def reset_state(self):
        self.state = self.start_state
        return self.state


Mapping integers to directions

In [0]:
# --------------------
# Display functions
# --------------------

def action_to_nwse(action):
    """ translate an action from tuple (e.g. (1,0)) to letter coordinates (e.g. '→') """
    x, y = action
    ret = ''
    if y == +1: ret += '↑'
    if y == -1: ret += '↓'
    if x == +1: ret += '← '
    if x == -1: ret += '→'
    return ret

In [0]:
class UniformPolicyAgent:
    def __init__(self, mdp, γ=0.9, eps=1e-2, n_iterations=1000):
        self.mdp = mdp
        self.γ = γ

        # initialize values
        self.values = np.zeros((self.mdp.width, self.mdp.height))
        self.policy = {}

        # Iterative policy evaluation algorithm (Ch 4, p 59)
        for i in range(n_iterations):
            # Variable to hold the values, V(k+1), i.e updated values. 
            # The function zeros_like is an numpy function that creates a 
            # similar(same size and shape) zero matrix as that that of the argument 
            new_values = np.zeros_like(self.values)

            for state in self.mdp.get_states():
                if state in self.mdp.terminal_states:
                    continue

                q_values = {}
                for action in self.mdp.get_possible_actions(state):

                    # uniform action probability: 
                    action_prob = 1/len(self.mdp.get_possible_actions(state))

                    # compute q_value and update value estimate
                    q_values[action] = self.compute_q_value(state, action)

                    # Bellman equation
                    new_values[state] += action_prob * q_values[action]

            # if improvement less then eps (after at least 1 iteration), stop iteration
            if np.sum(np.abs(new_values - self.values)) < eps:
                break

            # update values with new_values for the next iteration loop
            self.values = new_values

            # record optimal policy
            self.policy = self.update_policy()

    def compute_q_value(self, state, action):
        # get next state and reward from the transition model
        next_state, reward = self.mdp.get_state_reward_transition(state, action)
        return reward + self.γ * self.values[next_state]

    def update_policy(self):
        policy = {}
        for state in self.mdp.get_states():
            if state in self.mdp.terminal_states:
                continue
            q_values = {}
            for action in self.mdp.get_possible_actions(state):
                q_values[action] = self.compute_q_value(state, action)
            policy[state] = [a for a, v in q_values.items() if round(v, 5) == round(max(q_values.values()), 5)]
        return policy

In [0]:
def compute_state_value_and_policy(iterations = [], γ = 1):
    mdp = BaseGridworld(width=4, height=4, terminal_states=[(0,3), (3,0)])

    for n_iter in iterations:
        agent = UniformPolicyAgent(mdp=mdp, γ=γ, n_iterations=n_iter)


        print('#'*120)
        print('#', ' '*10, 'k = {}'.format(n_iter))
        print('#'*120)

        print('V(k) for the random policy:')

        print (np.flipud(agent.values.T))
      
        formatted_grid = tabulate(np.flipud(agent.values.T), tablefmt='grid')
        print(formatted_grid)  # transform so (0,0) is bottom-left

        grid = [['' for x in range(mdp.width)] for y in range(mdp.height)]
        for (x,y), v in agent.policy.items():
            grid[y][x] = [action_to_nwse(v_i) for v_i in v]
        # invert vertical coordinate so (0,0) is bottom left of the displayed grid
        grid = grid[::-1]

        print('Greedy policy wrt v(k):')
        print(tabulate(grid, tablefmt='grid'))



In [45]:
# List of iterations after which you want to know the policy and the state-values
iteration_all = [0, 1, 2, 3, 10, 1000]
iteration_0 = [0]
iteration_1 = [1]
iteration_3 = [3]

discount= γ = 1

compute_state_value_and_policy(iterations=iteration_all, γ=γ)
print ("""
# --------------------
# Figure 4.1: Convergence of iterative policy evaluation on a small gridworld.
# The left column is the sequence of approximations of the state-value function for the random policy
# (all actions equally likely). The right column is the sequence of greedy policies corresponding to
# the value function estimates (arrows are shown for all actions achieving the maximum, and the numbers
# shown are rounded to two significant digits). The last policy is guaranteed only to be an improvement
# over the random policy, but in this case it, and all policies after the third iteration, are optimal.
# --------------------
""")

########################################################################################################################
#            k = 0
########################################################################################################################
V(k) for the random policy:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
+---+---+---+---+
| 0 | 0 | 0 | 0 |
+---+---+---+---+
| 0 | 0 | 0 | 0 |
+---+---+---+---+
| 0 | 0 | 0 | 0 |
+---+---+---+---+
| 0 | 0 | 0 | 0 |
+---+---+---+---+
Greedy policy wrt v(k):
+--+--+--+--+
|  |  |  |  |
+--+--+--+--+
|  |  |  |  |
+--+--+--+--+
|  |  |  |  |
+--+--+--+--+
|  |  |  |  |
+--+--+--+--+
########################################################################################################################
#            k = 1
########################################################################################################################
V(k) for the random policy:
[[ 0. -1. -1. -1.]
 [-1. -1. -1. -1.]
 [-1. -1. -