# Policy Evaluation

This is a full implementation of the policy-evaluation algorithm. All we need is the policy we’re trying to evaluate, the MDP, the discount factor gamma (which defaults to 1), and theta (a small threshold that we use to check for convergence).

In [None]:
import numpy as np

def policy_evaluation(pi, P, gamma=1.0, theta=1e-10):
    """Iteratively estimate the state-value function of policy ``pi``.

    Each sweep recomputes every state's value from the previous sweep's
    estimates, and sweeps repeat until the largest change between two
    consecutive sweeps is smaller than ``theta``. There is no cap on the
    number of sweeps.

    Args:
        pi: Callable mapping a state index to the action taken there.
        P: Transition model indexed as ``P[state][action]``, giving an
            iterable of ``(prob, next_state, reward, done)`` tuples. States
            are addressed as ``0 .. len(P) - 1``.
        gamma: Discount factor applied to the next state's value.
        theta: Convergence threshold on the maximum absolute change.

    Returns:
        A NumPy array of length ``len(P)`` holding the estimated values.
    """
    n_states = len(P)

    # Estimates from the previous sweep; everything starts at zero.
    previous = np.zeros(n_states)

    while True:
        # Fresh accumulator for this sweep.
        current = np.zeros(n_states)

        for state in range(n_states):
            # Only the transitions of the action chosen by the policy count.
            for p, successor, r, terminal in P[state][pi(state)]:
                # Expected one-step reward plus the discounted value of the
                # successor; `not terminal` zeroes the bootstrap term when
                # the transition ends the episode.
                current[state] += p * (r + gamma * previous[successor] * (not terminal))

        # Converged once no state moved by theta or more during this sweep.
        if np.max(np.abs(previous - current)) < theta:
            return current

        # Otherwise this sweep becomes the baseline for the next one.
        previous = current.copy()

We can use the implemented algorithm to iteratively evaluate the state-value function for a random policy in the taxi environment.

In [None]:
import gym

# Build the Taxi environment; gym.make returns it wrapped in helper wrappers.
env = gym.make('Taxi-v3')

# Read the transition table from the base environment via `unwrapped`, the
# documented way to reach it, rather than peeling off a single wrapper layer
# and relying on attribute forwarding for the rest.
# P[state][action] is a list of (prob, next_state, reward, done) tuples.
P = env.unwrapped.P

# reset() returns (observation, info); only the initial state is kept.
init_state, _ = env.reset()

In [None]:
def rand_dict(n, n_actions=6):
    """Build a random deterministic policy table.

    Args:
        n: Number of states; the table has one entry per state ``0 .. n - 1``.
        n_actions: Number of available actions. Each state is assigned an
            action drawn uniformly from ``0 .. n_actions - 1``. Defaults to 6.

    Returns:
        A dict mapping each state index to its randomly chosen action.
    """
    # Imported locally so the function is self-contained.
    from random import randrange

    return {state: randrange(n_actions) for state in range(n)}

# Draw one fixed random action per state. The table is sized from the
# transition model so it covers exactly the states 0 .. len(P) - 1 that the
# evaluation and printing helpers iterate over.
rnd_policy = rand_dict(len(P))


def rnd_pi(s):
    """Return the action the random policy assigns to state ``s``."""
    return rnd_policy[s]

We can prepare some functions to show a policy and to estimate its probability of success and its mean return using simulation.

In [None]:
import numpy as np

def print_policy(pi, P, action_symbols=('<', 'v', '>', '^', 'P', 'D'), n_cols=5, title='Policy:'):
    """Print the action chosen by ``pi`` in every state as a grid.

    Each state takes one cell showing its zero-padded index and the symbol
    of the chosen action. States whose every transition, under every action,
    has the done flag set are printed as blank cells.

    NOTE(review): the default symbols read left, down, right, up for actions
    0-3; confirm this matches the environment's action numbering, or pass
    ``action_symbols`` explicitly.

    Args:
        pi: Callable mapping a state index to an action index.
        P: Transition model indexed as ``P[state][action]``, giving an
            iterable of ``(prob, next_state, reward, done)`` tuples.
        action_symbols: Symbol to show for each action, by action index.
        n_cols: Number of cells per printed row.
        title: Heading printed before the grid.
    """
    print(title)
    symbol_of = dict(enumerate(action_symbols))
    for state in range(len(P)):
        action = pi(state)
        print("| ", end="")
        # Blank cell only when every transition of every action is done.
        every_move_ends = np.all(
            [flag for outcomes in P[state].values() for _, _, _, flag in outcomes]
        )
        if not every_move_ends:
            print(f"{state:03d}", symbol_of[action].rjust(6), end=" ")
        else:
            print(" " * 9, end=" ")
        # Close the row after every n_cols cells.
        if (state + 1) % n_cols == 0:
            print("|")

In [None]:
import random

def probability_success(env, pi, n_episodes=100, max_steps=200):
    """Estimate by simulation how often following ``pi`` terminates an episode.

    Runs ``n_episodes`` episodes of at most ``max_steps`` steps each. An
    episode counts as a success when the environment reports termination; an
    episode that is truncated or runs out of steps counts as a failure.

    The Python and NumPy global RNGs are seeded with a fixed value, and the
    same seed is passed to the environment's first ``reset`` so that repeated
    calls simulate the same sequence of episodes.

    Args:
        env: Environment whose ``reset(seed=...)`` returns ``(state, info)``
            and whose ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        pi: Callable mapping a state to the action to take.
        n_episodes: Number of episodes to simulate.
        max_steps: Maximum number of steps per episode.

    Returns:
        The fraction of episodes that ended in termination.
    """
    seed = 123
    random.seed(seed)
    np.random.seed(seed)

    results = []
    for episode in range(n_episodes):
        # Seed the environment only on the first reset; re-seeding on every
        # reset would replay the same episode each time.
        state, _ = env.reset(seed=seed) if episode == 0 else env.reset()

        terminated = truncated = False
        steps = 0
        # Stop on truncation too, so a finished episode is not stepped again.
        while not (terminated or truncated) and steps < max_steps:
            state, _, terminated, truncated, _ = env.step(pi(state))
            steps += 1
        results.append(terminated)

    return np.mean(results)

In [None]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    """Estimate by simulation the average undiscounted return of ``pi``.

    Runs ``n_episodes`` episodes of at most ``max_steps`` steps each, sums
    the rewards collected in each episode, and averages those sums.

    The Python and NumPy global RNGs are seeded with a fixed value, and the
    same seed is passed to the environment's first ``reset`` so that repeated
    calls simulate the same sequence of episodes.

    Args:
        env: Environment whose ``reset(seed=...)`` returns ``(state, info)``
            and whose ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        pi: Callable mapping a state to the action to take.
        n_episodes: Number of episodes to simulate.
        max_steps: Maximum number of steps per episode.

    Returns:
        The mean of the per-episode reward sums.
    """
    seed = 123
    random.seed(seed)
    np.random.seed(seed)

    results = []
    for episode in range(n_episodes):
        # Seed the environment only on the first reset; re-seeding on every
        # reset would replay the same episode each time.
        state, _ = env.reset(seed=seed) if episode == 0 else env.reset()

        terminated = truncated = False
        steps = 0
        episode_return = 0.0
        # Stop on truncation too, so a finished episode is not stepped again.
        while not (terminated or truncated) and steps < max_steps:
            state, reward, terminated, truncated, _ = env.step(pi(state))
            episode_return += reward
            steps += 1
        results.append(episode_return)

    return np.mean(results)

First, we can show the random policy and its simulated performance:

In [None]:
# Taxi-v3 numbers its actions 0=south, 1=north, 2=east, 3=west, 4=pickup,
# 5=dropoff. The symbols are passed explicitly in that order so the printed
# arrows point the way the taxi actually moves.
print_policy(
    pi=rnd_pi,
    P=P,
    action_symbols=('v', '^', '>', '<', 'P', 'D'),
    title='random policy',
)

# Simulated success rate (as a percentage) and average undiscounted return.
ps = probability_success(env, rnd_pi)*100
mr = mean_return(env, rnd_pi)

print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(ps,mr))

Let’s now run policy evaluation for the policy and prepare a function to show the result.

In [None]:
def print_state_value_function(V, P, n_cols=5, prec=3, title='State-value function:'):
    """Print the value of every state as a grid.

    Each state takes one cell showing its zero-padded index and its value
    rounded to ``prec`` decimals. States whose every transition, under every
    action, has the done flag set are printed as blank cells.

    Args:
        V: Sequence of state values, indexed by state.
        P: Transition model indexed as ``P[state][action]``, giving an
            iterable of ``(prob, next_state, reward, done)`` tuples.
        n_cols: Number of cells per printed row.
        prec: Number of decimals the values are rounded to.
        title: Heading printed before the grid.
    """
    print(title)
    for state in range(len(P)):
        value = V[state]
        print("| ", end="")
        # Blank cell only when every transition of every action is done.
        every_move_ends = np.all(
            [flag for outcomes in P[state].values() for _, _, _, flag in outcomes]
        )
        if not every_move_ends:
            rounded = f"{np.round(value, prec)}"
            print(f"{state:03d}", rounded.rjust(6), end=" ")
        else:
            print(" " * 9, end=" ")
        # Close the row after every n_cols cells.
        if (state + 1) % n_cols == 0:
            print("|")

In [None]:
# Evaluate the random policy. gamma=0.99 overrides the default of 1.0;
# discounting below 1 keeps the values bounded even for a policy that never
# reaches a terminal state.
V = policy_evaluation(rnd_pi, P, gamma=0.99)
# Show the resulting values as a grid, rounded to three decimals.
print_state_value_function(V, P, prec=3)

It seems being a random policy doesn’t pay well in the taxi environment! Fascinating results, but a question arises: are there any better policies for this environment?