<a href="https://colab.research.google.com/github/maggieliuzzi/reinforcement_learning/blob/master/monte_carlo/prediction/PolicyEvaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Monte Carlo** | Prediction Problem | Policy Evaluation

Given a policy, estimate its value function by averaging the returns observed over many sampled episodes.

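- A policy decides which action to take in a given state. It can be 1) deterministic or 2) probabilistic. This notebook evaluates a probabilistic one: the action prescribed for a state is taken with probability 0.5, otherwise one of the other actions is taken at random.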

In [0]:
from __future__ import print_function, division
from builtins import range
import numpy as np
!wget "https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/rl/grid_world.py"
from grid_world import standard_grid, negative_grid
!wget "https://raw.githubusercontent.com/maggieliuzzi/reinforcement_learning/master/environments/utils.py"
from utils import print_values, print_policy

In [0]:
# Notebook-wide constants.
# SMALL_ENOUGH is not referenced by any other cell visible in this notebook.
# NOTE(review): looks like a convergence threshold left over from an iterative method - confirm or remove.
SMALL_ENOUGH = 1e-3
# Discount factor applied to the accumulated return in play_game (G = r + GAMMA*G).
GAMMA = 0.9
# Action labels that random_action samples from.
# NOTE(review): presumably Up / Down / Left / Right as interpreted by grid_world - confirm.
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

In [0]:
## For random policy
def random_action(a):
  """Return `a` half of the time, otherwise one of the remaining actions.

  With probability 0.5 the given action is returned unchanged. Otherwise one
  of the other entries of ALL_POSSIBLE_ACTIONS is drawn with equal
  probability (0.5/3 each for the four actions defined in this notebook).
  """
  if np.random.random() < 0.5:
    return a
  # exploratory branch: every action except the requested one is a candidate
  others = list(ALL_POSSIBLE_ACTIONS)
  others.remove(a)
  return np.random.choice(others)

In [0]:
def play_game(grid, policy):
  """Play one episode and return the discounted return seen from each state.

  The episode starts from a state picked at random among the keys of
  `grid.actions`, so that states the policy would rarely lead to still get
  sampled. At every step the action prescribed by `policy` for the current
  state is passed through `random_action` before being applied to the grid.

  Returns a list of (state, return) tuples in the order the states were
  visited. The final state of the episode is not included.
  """
  # pick the starting position at random
  candidate_starts = list(grid.actions.keys())
  grid.set_state(candidate_starts[np.random.choice(len(candidate_starts))])

  # roll the episode forward, recording (state reached, reward received)
  state = grid.current_state()
  trajectory = [(state, 0)]
  while not grid.game_over():
    action = random_action(policy[state])
    reward = grid.move(action)
    state = grid.current_state()
    trajectory.append((state, reward))

  # walk the trajectory backwards, accumulating the discounted return
  states_and_returns = []
  G = 0
  for step, (state, reward) in enumerate(reversed(trajectory)):
    # step 0 is the final state of the episode: its value is 0 by definition,
    # so it gets no entry. The G computed after the start state is discarded.
    if step > 0:
      states_and_returns.append((state, G))
    G = reward + GAMMA*G

  # hand the result back in visiting order
  return states_and_returns[::-1]

In [19]:
# Build the grid world environment used by the rest of the notebook.
grid = standard_grid()

# Show the reward attached to each cell of the grid.
print("rewards:")
print_values(grid.rewards, grid)

rewards:
---------------------------
 0.00| 0.00| 0.00| 1.00|
---------------------------
 0.00| 0.00| 0.00|-1.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|


In [0]:
# state -> action
# Alternative policy. It used to be assigned to `policy` and then immediately
# overwritten, so it never took effect. It now has its own name; to evaluate
# it instead, set `policy = policy_alt` before running the cells below.
policy_alt = {
  (2, 0): 'U',
  (1, 0): 'U',
  (0, 0): 'R',
  (0, 1): 'R',
  (0, 2): 'R',
  (1, 2): 'R',
  (2, 1): 'R',
  (2, 2): 'R',
  (2, 3): 'U',
}

# Policy evaluated in this notebook.
policy = {
  (2, 0): 'U',
  (1, 0): 'U',
  (0, 0): 'R',
  (0, 1): 'R',
  (0, 2): 'R',
  (1, 2): 'U',
  (2, 1): 'L',
  (2, 2): 'U',
  (2, 3): 'L',
}

In [0]:
# Set up the value table and the per-state store of sampled returns
V, returns = {}, {}  # V: state -> value estimate; returns: state -> list of returns received
states = grid.all_states()
for s in states:
  if s not in grid.actions:
    # no actions available here (terminal, or a state we can't otherwise get to)
    V[s] = 0
    continue
  returns[s] = []

In [0]:
# First-visit Monte Carlo policy evaluation.
N_EPISODES = 5000  # number of episodes to sample (100 was the alternative noted originally)

for t in range(N_EPISODES):

  # generate an episode using pi
  states_and_returns = play_game(grid, policy)
  seen_states = set()
  for s, G in states_and_returns:
    # "first-visit": only the first occurrence of s within an episode
    # contributes a sample
    if s not in seen_states:
      returns[s].append(G)
      seen_states.add(s)

# Average once after all episodes have been sampled. Nothing reads V while
# sampling, so this gives the same final estimates as re-averaging the
# growing list on every visit, without the repeated passes over it.
for state, sampled in returns.items():
  # states that were never visited keep no entry in V, as before
  if sampled:
    V[state] = np.mean(sampled)

In [23]:
# Display the estimated state values, followed by the policy they were estimated under.
print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)

values:
---------------------------
 0.43| 0.56| 0.73| 0.00|
---------------------------
 0.33| 0.00| 0.19| 0.00|
---------------------------
 0.26| 0.18| 0.09|-0.20|
policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  U  |  L  |  U  |  L  |
