In [1]:
from __future__ import print_function
#import numpy as np
%pylab
import sys
if "../" not in sys.path:
  sys.path.append("../") 
#from lib.envs.gridworld import GridworldEnv
from GridWorldEnv import GridworldEnv

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


In [2]:
# Build the grid-world environment with its default constructor arguments.
# The cells below read env.P, env.nS, env.nA and env.shape from this object.
env = GridworldEnv()

In [3]:
def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.

    Repeatedly applies a synchronous Bellman optimality backup to every
    state (each sweep reads only the previous sweep's values) until no
    state value moves by more than ``theta``, then derives a greedy policy
    from the converged values.

    Args:
        env: OpenAI environment. ``env.P[s][a]`` is an iterable of
            4-tuples ``(probability, next_state, reward, _)`` describing the
            transitions for taking action ``a`` in state ``s`` (the fourth
            element is ignored here). ``env.nS`` and ``env.nA`` are the
            number of states and actions.
        theta: Stopping threshold. Iteration ends once the absolute change
            of every state value in one sweep is at most theta.
        discount_factor: Gamma, the time discount factor.

    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
        ``policy`` has shape [env.nS, env.nA]; each row spreads probability
        uniformly over the actions whose value ties for the maximum.
        ``V`` has shape [env.nS].
    """

    def action_values(state, values):
        """Expected one-step return of each action in `state` under `values`."""
        action_value = np.zeros(env.nA)
        for ac in range(env.nA):
            for probability, next_state, reward, _ in env.P[state][ac]:
                action_value[ac] += \
                    probability * (reward + discount_factor * values[next_state])
        return action_value

    V = np.zeros(env.nS)

    while True:
        oldV = np.copy(V)
        for st in range(env.nS):
            V[st] = np.max(action_values(st, oldV))
        # Absolute per-state change test. np.allclose(oldV, V, theta) would
        # treat theta as a *relative* tolerance (its third positional
        # parameter is rtol), stopping far too early for large values.
        if np.all(np.abs(V - oldV) <= theta):
            break

    policy = np.zeros([env.nS, env.nA])
    for st in range(env.nS):
        ac_vs = action_values(st, V)
        # Exact ties for the best action share the probability mass equally.
        is_best = ac_vs == np.max(ac_vs)
        policy[st] = is_best.astype(float) / np.count_nonzero(is_best)

    return policy, V

In [4]:
# Run value iteration on the environment, then show the resulting policy and
# state values both as flat arrays and laid out with the grid's shape.
# Each print emits a heading, the array, and a trailing blank line.
policy, v = value_iteration(env)

print("Policy Probability Distribution:", policy, "", sep="\n")

print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):",
      policy.argmax(axis=1).reshape(env.shape),
      "",
      sep="\n")

print("Value Function:", v, "", sep="\n")

print("Reshaped Grid Value Function:", np.reshape(v, env.shape), "", sep="\n")

Policy Probability Distribution:
[[ 0.25  0.25  0.25  0.25]
 [ 0.    0.    0.    1.  ]
 [ 0.    0.    0.    1.  ]
 [ 0.    0.    0.5   0.5 ]
 [ 1.    0.    0.    0.  ]
 [ 0.5   0.    0.    0.5 ]
 [ 0.25  0.25  0.25  0.25]
 [ 0.    0.    1.    0.  ]
 [ 1.    0.    0.    0.  ]
 [ 0.25  0.25  0.25  0.25]
 [ 0.    0.5   0.5   0.  ]
 [ 0.    0.    1.    0.  ]
 [ 0.5   0.5   0.    0.  ]
 [ 0.    1.    0.    0.  ]
 [ 0.    1.    0.    0.  ]
 [ 0.25  0.25  0.25  0.25]]

Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):
[[0 3 3 2]
 [0 0 0 2]
 [0 0 1 2]
 [0 1 1 0]]

Value Function:
[ 0. -1. -2. -3. -1. -2. -3. -2. -2. -3. -2. -1. -3. -2. -1.  0.]

Reshaped Grid Value Function:
[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]



In [5]:
# Test the value function against the expected state values, written here
# row by row in grid layout and flattened to match v's 1-D shape.
expected_v = np.array([
    [ 0, -1, -2, -3],
    [-1, -2, -3, -2],
    [-2, -3, -2, -1],
    [-3, -2, -1,  0],
]).flatten()
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)