In [1]:
from env import create_standard_grid
from algorithms import compute_policy_from_values
from env import create_custom_grid_1, create_custom_grid_2

In [14]:
# from page 83 of Sutton and Barto, RL 2nd. Ed.
def value_iteration(gw, gamma=0.25, epsilon=0.000001):
    """Run in-place value iteration on ``gw`` until the values stop changing.

    Every sweep walks all nodes yielded by iterating ``gw``. For each state
    that is neither terminal nor a barrier, the stored value is replaced by
    the best one-step backup ``reward + gamma * value_at_destination`` over
    the actions listed by ``gw.valid_decisions_and_rewards(state)``. Sweeps
    repeat until the largest absolute change seen in a sweep is below
    ``epsilon``.

    Args:
        gw: grid-world object. It must be iterable over nodes exposing a
            ``state`` attribute and provide ``is_terminal``, ``is_barrier``,
            ``get_value``, ``set_value``, ``valid_decisions_and_rewards``,
            ``get_reward_for_action`` and ``get_value_at_destination``.
        gamma: discount factor applied to the destination state's value.
        epsilon: convergence threshold on the largest per-sweep value change.

    Returns:
        int: the number of full sweeps that were performed (always >= 1).
    """
    no_backup = float('-inf')
    count = 0
    while True:
        count += 1
        biggest_change_in_value = 0
        for node in gw:
            state = node.state
            if gw.is_terminal(state) or gw.is_barrier(state):
                continue

            old_value = gw.get_value(state)
            new_value = no_backup
            # valid decisions and rewards at current state
            dr = gw.valid_decisions_and_rewards(state)
            for action, _ in dr.items():
                reward = gw.get_reward_for_action(state, action)
                value_at_dest = gw.get_value_at_destination(state, action)
                value = reward + gamma*value_at_dest
                if value > new_value:
                    new_value = value

            if new_value == no_backup:
                # No action produced a backup (e.g. the state has no valid
                # decisions). Keep the stored value: writing -inf here would
                # make the change infinite and the outer loop would never end.
                continue

            # Write the value only after ALL actions were evaluated, so that
            # an action leading back to this same state is backed up from the
            # previous value rather than from a half-finished maximum.
            gw.set_value(state, new_value)
            biggest_change_in_value = max(biggest_change_in_value,
                                          abs(new_value - old_value))
        if biggest_change_in_value < epsilon:
            break
    return count

In [15]:
# Start from the first custom grid world.
gw = create_custom_grid_1()

# Show the state values before any sweep has been run.
print("", "Initial Values", sep="\n")
gw.print_values()

# Run value iteration; it writes the updated values back onto gw.
value_iteration(gw)

print("", "Values after Value Iteration", sep="\n")
gw.print_values()

# Turn the resulting values into a policy and display it.
policy = compute_policy_from_values(gw)

print("", "New Policy", sep="\n")
gw.print_policy(policy)


Initial Values
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------

Values after Value Iteration
-------------------------------------
|   0.02 |   0.06 |   0.25 |   0.06 |
-------------------------------------
|   0.06 |   0.25 |   1.00 |   0.25 |
-------------------------------------
|   0.02 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.06 |   0.25 |   1.00 |   0.25 |
-------------------------------------

New Policy
-------------------------------------
|  Right |  Right |   Down |   Left |
-------------------------------------
|  Right |  Right |   Down |   Left |
-------------------------------------
|   Down |        |        |        |
------------------------------