> In policy iteration, we compute the value function by following a given policy, whereas in the value iteration method we compute the value function by taking the maximum over the Q values.

In [1]:
import gym
import numpy as np


In [2]:
# Build the FrozenLake environment. compute_value_function and extract_policy
# below read this module-level `env` directly.
env = gym.make('FrozenLake-v0')

> **Algorithm – policy iteration**


The steps of the policy iteration algorithm are given as follows:

1. Initialize a random policy
2. Compute the value function using the given policy
3. Extract a new policy using the value function obtained from step 2
4. If the extracted policy is the same as the policy used in step 2, then stop; otherwise, send the extracted new policy back to step 2 and repeat steps 2 to 4

In [3]:
def compute_value_function(policy, gamma=1.0, threshold=1e-20,
                           num_iterations=1000, environment=None):
  """Evaluate a fixed policy by repeated sweeps over all states.

  Each sweep replaces the value of every state with the expected
  `reward + gamma * value(next_state)` of the action the policy picks there,
  using the values from the previous sweep.

  Args:
    policy: Sequence indexed by state holding the action to take in that
      state. Entries may be floats (e.g. from `np.zeros`); they are cast to
      int before being used as a lookup key.
    gamma: Discount factor applied to the value of the next state.
    threshold: Sweeps stop once the summed absolute change of the value
      table is at or below this number.
    num_iterations: Upper bound on the number of sweeps.
    environment: Object exposing `observation_space.n` and a transition
      table `P[state][action]` of `(prob, next_state, reward, _)` tuples.
      When omitted, the notebook-level `env` is used.

  Returns:
    NumPy array with one value per state.
  """
  # Fall back to the notebook-level environment when none is given explicitly.
  mdp = env if environment is None else environment
  num_states = mdp.observation_space.n

  value_table = np.zeros(num_states)

  for _ in range(num_iterations):

    # Snapshot of the previous sweep: every state in this sweep is backed up
    # from these values, not from values already refreshed in this sweep.
    previous_value_table = np.copy(value_table)

    for s in range(num_states):

      # Selecting the action in the state according to the policy; cast to
      # int because policy arrays are typically float-typed.
      a = int(policy[s])

      value_table[s] = sum(prob * (r + gamma * previous_value_table[s_])
                           for prob, s_, r, _ in mdp.P[s][a])

    # Converged: the whole sweep changed the table by no more than threshold.
    if np.sum(np.fabs(previous_value_table - value_table)) <= threshold:
      break

  return value_table









In [4]:
def extract_policy(value_table, gamma=1.0, environment=None):
  """Derive the greedy policy for a given state-value table.

  For every state, the Q value of each action is computed as the expected
  `reward + gamma * value_table[next_state]`, and the action with the
  largest Q value is chosen (the first one on ties, as `np.argmax` does).

  Args:
    value_table: Sequence indexed by state holding the state values.
    gamma: Discount factor applied to the value of the next state.
    environment: Object exposing `observation_space.n`, `action_space.n` and
      a transition table `P[state][action]` of
      `(prob, next_state, reward, _)` tuples. When omitted, the
      notebook-level `env` is used.

  Returns:
    NumPy float array with one action index per state.
  """
  # Fall back to the notebook-level environment when none is given explicitly.
  mdp = env if environment is None else environment
  num_states = mdp.observation_space.n
  num_actions = mdp.action_space.n

  # Kept as a float array (np.zeros default) so the returned dtype is unchanged.
  policy = np.zeros(num_states)

  for s in range(num_states):

    Q_values = [
        sum(prob * (r + gamma * value_table[s_])
            for prob, s_, r, _ in mdp.P[s][a])
        for a in range(num_actions)
    ]

    # extracting policy by selecting the action which has maximum Q value
    policy[s] = np.argmax(np.array(Q_values))

  return policy

In [8]:
def policy_iteration(env):
  """Alternate policy evaluation and greedy improvement until the policy is stable.

  Starts from the all-zeros policy, evaluates it with
  `compute_value_function`, derives a new policy with `extract_policy`, and
  stops as soon as the new policy equals the one just evaluated (or after
  1000 rounds).

  Note: `env` is used here only for the number of states; the two helpers
  are called with just the policy / value table.

  Returns:
    The final policy array, one action per state.
  """
  max_rounds = 1000
  current = np.zeros(env.observation_space.n)

  for _ in range(max_rounds):
    # Evaluate the current policy, then act greedily on the resulting values.
    candidate = extract_policy(compute_value_function(current))

    # An unchanged policy means improvement has reached a fixed point.
    stable = np.all(current == candidate)
    if stable:
      return current

    current = candidate

  return current

In [9]:
# Run policy iteration on the environment created above and keep the resulting policy.
optimal_policy = policy_iteration(env)

In [10]:
# Display the policy: entry s is the action index chosen in state s.
optimal_policy

array([0., 3., 3., 3., 0., 0., 0., 0., 3., 1., 0., 0., 0., 2., 1., 0.])