<a href="https://colab.research.google.com/github/maggieliuzzi/reinforcement_learning/blob/master/dynamic_programming/control/ValueIteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Dynamic Programming** | Control Problem | Value Iteration

Find the optimal policy and the optimal value function using value iteration.

State transitions (the next state and reward for a given state–action pair) are probabilistic: probability 0.5 of moving to the desired position, and 0.5/3 for each of the other three.

In [0]:
from __future__ import print_function, division
from builtins import range
import numpy as np
!wget "https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/rl/grid_world.py"
from grid_world import windy_grid, ACTION_SPACE
!wget "https://raw.githubusercontent.com/maggieliuzzi/reinforcement_learning/master/environments/utils.py"
from utils import print_values, print_policy

In [0]:
# convergence threshold: value iteration stops once the largest change in V
# over a full sweep of the states is below this
SMALL_ENOUGH = 1e-3
# discount factor multiplying the next state's value in the backup
GAMMA = 0.9

In [0]:
def get_transition_probs_and_rewards(grid):
  """Flatten a grid's dynamics into two dicts keyed by (s, a, s').

  Reads `grid.probs`, a mapping (s, a) -> {s': p(s' | s, a)}, and
  `grid.rewards`, a mapping s' -> reward.

  Returns:
    transition_probs: transition_probs[(s, a, s')] = p(s' | s, a).
      Any key that is absent is to be treated as impossible (probability 0).
    rewards: rewards[(s, a, s')] = grid.rewards[s'], or 0 when s' has no
      entry. The reward is deterministic and depends only on s'; it is keyed
      by the full triple so it can be looked up alongside transition_probs.
  """
  # one ((s, a, s'), p) pair per listed transition, in grid.probs order
  transitions = [
    ((state, action, next_state), prob)
    for (state, action), next_state_probs in grid.probs.items()
    for next_state, prob in next_state_probs.items()
  ]

  transition_probs = dict(transitions)
  # key[2] is s', the only thing the reward depends on
  rewards = {key: grid.rewards.get(key[2], 0) for key, _ in transitions}

  return transition_probs, rewards

In [20]:
# construct the environment (windy_grid is imported from grid_world.py)
grid = windy_grid()

# flatten the grid's dynamics into lookup tables keyed by (s, a, s')
transition_probs, rewards = get_transition_probs_and_rewards(grid)

# display the grid's own reward table (grid.rewards), not the (s, a, s')
# `rewards` dict built above
print("rewards:")
print_values(grid.rewards, grid)

rewards:
---------------------------
 0.00| 0.00| 0.00| 1.00|
---------------------------
 0.00| 0.00| 0.00|-1.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|
rewards:
---------------------------
 0.00| 0.00| 0.00| 1.00|
---------------------------
 0.00| 0.00| 0.00|-1.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|


In [21]:
# policy maps state -> action
# start from an arbitrary policy: one randomly drawn action for each key of
# grid.actions; the entries are replaced once the values have been learned
policy = {
  state: np.random.choice(ACTION_SPACE)
  for state in grid.actions.keys()
}

# show the starting point
print("initial policy:")
print_policy(policy, grid)

initial policy:
---------------------------
  R  |  R  |  U  |     |
---------------------------
  D  |     |  U  |     |
---------------------------
  L  |  R  |  L  |  L  |
initial policy:
---------------------------
  D  |  U  |  U  |     |
---------------------------
  D  |     |  R  |     |
---------------------------
  D  |  D  |  L  |  L  |


In [0]:
# initialize V(s) = 0 for every state returned by the grid
states = grid.all_states()
V = dict.fromkeys(states, 0)

In [0]:
# value iteration: sweep over the states, updating V in place, until the
# largest change seen in a sweep drops below SMALL_ENOUGH
# V[s] = max[a]{ sum[s',r] { p(s',r|s,a)[r + gamma*V[s']] } }
it = 0
converged = False
while not converged:
  biggest_change = 0
  for s in grid.all_states():
    if grid.is_terminal(s):
      continue  # terminal states are skipped; their V is left as initialized

    old_v = V[s]

    # best expected return over all actions, under the current V
    new_v = float('-inf')
    for a in ACTION_SPACE:
      v = 0
      for s2 in grid.all_states():
        # missing (s, a, s') keys mean probability 0 and reward 0
        p = transition_probs.get((s, a, s2), 0)
        r = rewards.get((s, a, s2), 0)
        v += p * (r + GAMMA * V[s2])
      new_v = max(new_v, v)

    V[s] = new_v
    biggest_change = max(biggest_change, np.abs(old_v - V[s]))

  it += 1  # number of completed sweeps
  converged = biggest_change < SMALL_ENOUGH

In [0]:
# extract the greedy policy with respect to V: for each state in the policy,
# pick the action with the highest expected return
for s in policy.keys():
  best_a, best_value = None, float('-inf')
  for a in ACTION_SPACE:
    # expected return of taking a in s, then following V
    q_sa = 0
    for s2 in grid.all_states():
      # missing (s, a, s') keys mean probability 0 and reward 0
      p = transition_probs.get((s, a, s2), 0)
      r = rewards.get((s, a, s2), 0)
      q_sa += p * (r + GAMMA * V[s2])

    # strict comparison: on ties the earliest action in ACTION_SPACE wins
    if q_sa > best_value:
      best_a, best_value = a, q_sa
  policy[s] = best_a

In [25]:
# display the final value function V and the policy, laid out on the grid
print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)

values:
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00| 0.48| 0.00|
---------------------------
 0.66| 0.59| 0.53| 0.48|
policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  D  |     |
---------------------------
  U  |  L  |  L  |  L  |
values:
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00| 0.48| 0.00|
---------------------------
 0.66| 0.59| 0.53| 0.48|
policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  D  |     |
---------------------------
  U  |  L  |  L  |  L  |
