<a href="https://colab.research.google.com/github/maggieliuzzi/reinforcement_learning/blob/master/dynamic_programming/control/PolicyIteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Dynamic Programming** | Control Problem | Policy Iteration

Find the optimal policy and its value function.

- State transitions (the next state and reward given a state-action pair): deterministic (every $p(s', r \mid s, a)$ is either 1 or 0)


In [0]:
from __future__ import print_function, division
from builtins import range
import numpy as np
!wget "https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/rl/grid_world.py"
from grid_world import standard_grid, negative_grid
!wget "https://raw.githubusercontent.com/maggieliuzzi/reinforcement_learning/master/environments/utils.py"
from utils import print_values, print_policy

In [0]:
# Convergence threshold: policy evaluation stops sweeping once the largest
# per-state change in V within a sweep falls below this value.
SMALL_ENOUGH = 1e-3
# Discount factor multiplied into the successor state's value in every backup.
GAMMA = 0.9
# Action labels: sampled from to build the initial policy and enumerated for
# every state during policy improvement.
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

In [3]:
# Environment used by every later cell (policy initialisation, evaluation,
# improvement and the final printout).
grid = negative_grid()  # reward of -0.1 for every non-terminal state

# Display the per-state rewards laid out on the grid.
print("rewards:")
print_values(grid.rewards, grid)

rewards:
---------------------------
-0.10|-0.10|-0.10| 1.00|
---------------------------
-0.10| 0.00|-0.10|-1.00|
---------------------------
-0.10|-0.10|-0.10|-0.10|


In [4]:
## Random initial policy
# One action per state that has actions available, each drawn uniformly from
# ALL_POSSIBLE_ACTIONS. No seed is set in this notebook, so the starting
# policy differs from run to run.
policy = {
  state: np.random.choice(ALL_POSSIBLE_ACTIONS)
  for state in grid.actions.keys()
}

print("initial policy:")
print_policy(policy, grid)

initial policy:
---------------------------
  L  |  L  |  R  |     |
---------------------------
  L  |     |  R  |     |
---------------------------
  U  |  R  |  U  |  L  |


In [0]:
# Initial value estimates: a random number in [0, 1) for every state that has
# actions available, and exactly 0 for all remaining (terminal) states.
states = grid.all_states()
V = {
  state: (np.random.random() if state in grid.actions else 0)
  for state in states
}

In [0]:
def one_step_return(state, action):
  """Return the reward for taking `action` from `state` plus GAMMA times the
  current value estimate of the state the grid ends up in.

  Moves the shared `grid` as a side effect and reads the global `V`.
  """
  grid.set_state(state)
  reward = grid.move(action)
  return reward + GAMMA * V[grid.current_state()]


# Policy iteration: alternate evaluation and improvement until an improvement
# pass leaves every state's action unchanged.
policy_stable = False
while not policy_stable:

  # --- policy evaluation ---
  # Sweep V in place under the current policy; stop after the first sweep whose
  # largest per-state change is below SMALL_ENOUGH.
  while True:
    max_delta = 0
    for state in states:
      if state not in policy:
        continue  # states without a policy entry (terminal) keep their value
      previous_value = V[state]
      V[state] = one_step_return(state, policy[state])
      max_delta = max(max_delta, np.abs(previous_value - V[state]))

    if max_delta < SMALL_ENOUGH:
      break

  # --- policy improvement ---
  # Make the policy greedy with respect to the freshly evaluated V.
  policy_stable = True
  for state in states:
    if state not in policy:
      continue
    previous_action = policy[state]
    greedy_action = None
    greedy_value = float('-inf')
    # Strict '>' keeps the earliest action in ALL_POSSIBLE_ACTIONS on ties.
    for candidate in ALL_POSSIBLE_ACTIONS:
      candidate_value = one_step_return(state, candidate)
      if candidate_value > greedy_value:
        greedy_value, greedy_action = candidate_value, candidate
    policy[state] = greedy_action
    if greedy_action != previous_action:
      # at least one state changed its action, so run another round
      policy_stable = False

In [7]:
# Show the value table and the policy as they stand after the policy-iteration
# loop has terminated (i.e. after an improvement pass changed no action).
print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)

values:
---------------------------
 0.62| 0.80| 1.00| 0.00|
---------------------------
 0.46| 0.00| 0.80| 0.00|
---------------------------
 0.31| 0.46| 0.62| 0.46|
policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  U  |  R  |  U  |  L  |
