In [6]:
from env import create_custom_grid_1, create_custom_grid_2

In [7]:
# Build the first custom grid world. Later cells pass this same object to
# iterative_policy_evaluation, which overwrites its state values in place,
# so the values it holds depend on which cells have already been run.
gw1 = create_custom_grid_1()

In [49]:
# Hand-written policy for gw1, keyed by (row, col).
# Each tuple below is one row, listed from row 0 upward; judging by the
# printed grid, row 0 is drawn at the bottom.
# An empty string means "no action" -- presumably terminal/barrier cells,
# which the evaluation routine skips (TODO confirm against env).
policy1 = {
    (row, col): action
    for row, actions in enumerate([
        ('right', 'right', 'up', 'up'),
        ('up', '', '', ''),
        ('right', 'right', 'down', 'down'),
        ('right', 'right', 'down', 'down'),
    ])
    for col, action in enumerate(actions)
}

In [50]:
# Show the policy grid, then the values currently stored in gw1.
# NOTE(review): print_values shows whatever gw1 holds right now; the
# "Initial Values" label is only accurate if no evaluation has been run on
# gw1 since it was created -- confirm by running from a fresh kernel.
print("Policy")
gw1.print_policy(policy1)
print("Initial Values")
gw1.print_values()

Policy
-------------------------------------
|  Right |  Right |   Down |   Down |
-------------------------------------
|  Right |  Right |   Down |   Down |
-------------------------------------
|     Up |        |        |        |
-------------------------------------
|  Right |  Right |     Up |     Up |
-------------------------------------
Initial Values
-------------------------------------
|   0.73 |   0.81 |   0.90 |   0.81 |
-------------------------------------
|   0.81 |   0.90 |   1.00 |   0.90 |
-------------------------------------
|   0.73 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.81 |   0.90 |   1.00 |  -1.00 |
-------------------------------------


In [93]:
def iterative_policy_evaluation(gw, policy, gamma, epsilon, alpha=1.0):
    """Evaluate a fixed policy by sweeping the grid until the values settle.

    Every sweep iterates over the nodes yielded by ``gw``. For each state that
    is neither terminal nor a barrier, the stored value is overwritten with
    ``alpha * (reward + gamma * value_at_destination)``. Values are written
    back immediately via ``gw.set_value``, so the function works in place.

    Args:
        gw: Grid world. Must be iterable over nodes exposing ``.state`` and
            provide ``is_terminal``, ``is_barrier``, ``get_value``,
            ``set_value``, ``get_reward_for_action`` and
            ``get_value_at_destination``.
        policy: Mapping from state to the action taken in that state. It is
            indexed for every state that is not terminal and not a barrier.
        gamma: Factor applied to the value at the destination state.
        epsilon: Stopping threshold. Sweeping ends after the first sweep whose
            largest absolute per-state change is below this value. Must be
            greater than zero.
        alpha: Factor applied to the whole backed-up value. Defaults to 1.0,
            which leaves the backup unscaled; being optional, both the
            four-argument and the five-argument calls in this notebook work
            with this definition.

    Returns:
        None. Results are stored in ``gw``.

    Raises:
        ValueError: If ``epsilon`` is not greater than zero.
    """
    # The loop exits only when biggest_change < epsilon; a change is never
    # negative, so a non-positive (or NaN) epsilon would spin forever.
    if not epsilon > 0:
        raise ValueError("epsilon must be greater than 0")

    while True:
        biggest_change = 0
        for node in gw:
            state = node.state
            # Terminal and barrier states keep whatever value they have.
            if gw.is_terminal(state) or gw.is_barrier(state):
                continue

            old_value = gw.get_value(state)
            action = policy[state]
            reward = gw.get_reward_for_action(state, action)
            value_at_dest = gw.get_value_at_destination(state, action)

            new_value = alpha * (reward + gamma * value_at_dest)
            gw.set_value(state, new_value)

            # Track the largest change seen in this sweep.
            biggest_change = max(biggest_change, abs(new_value - old_value))

        # A full sweep is done; stop once nothing moved by epsilon or more.
        if biggest_change < epsilon:
            break

In [69]:
# Evaluate policy1 on gw1 with gamma=0.9 and epsilon=0.99, then print the
# resulting values. The evaluation starts from whatever values gw1 already
# holds and overwrites them.
# NOTE(review): epsilon=0.99 is a very loose stopping threshold -- the sweep
# loop exits as soon as the largest per-state change in a sweep is below it,
# so the printed values may be far from converged. Confirm this is intended.
print("Policy")
gw1.print_policy(policy1)
iterative_policy_evaluation(gw1, policy1, 0.9, 0.99)
print("Values for the policy")
gw1.print_values()

Policy
-------------------------------------
|  Right |  Right |   Down |   Down |
-------------------------------------
|  Right |  Right |   Down |   Down |
-------------------------------------
|     Up |        |        |        |
-------------------------------------
|  Right |  Right |     Up |     Up |
-------------------------------------
Values for the policy
-------------------------------------
|   0.06 |   0.23 |   0.90 |  -0.90 |
-------------------------------------
|   0.23 |   0.90 |   1.00 |  -1.00 |
-------------------------------------
|   0.06 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.23 |   0.90 |   1.00 |  -1.00 |
-------------------------------------


In [75]:
# Second hand-written policy, keyed by (row, col).
# Each tuple below is one row, listed from row 0 upward; judging by the
# printed grid, row 0 is drawn at the bottom.
# An empty string means "no action" -- presumably terminal/barrier cells
# (TODO confirm against env).
policy2 = {
    (row, col): action
    for row, actions in enumerate([
        ('right', 'left', 'up', 'up'),
        ('up', '', '', ''),
        ('right', 'right', 'down', 'down'),
        ('right', 'left', 'right', 'left'),
    ])
    for col, action in enumerate(actions)
}
print("Policy")
gw1.print_policy(policy2)

Policy
-------------------------------------
|  Right |   Left |  Right |   Left |
-------------------------------------
|  Right |  Right |   Down |   Down |
-------------------------------------
|     Up |        |        |        |
-------------------------------------
|  Right |   Left |     Up |     Up |
-------------------------------------


In [83]:
# Evaluate with gamma=0.5 and epsilon=0.01, then print the stored values.
# NOTE(review): this evaluates policy1, although policy2 was defined in the
# preceding cell and is never passed to the evaluation routine in the visible
# notebook -- confirm which policy was intended here.
iterative_policy_evaluation(gw1, policy1, 0.5, 0.01)
print("Values for the policy")
gw1.print_values()

Values for the policy
-------------------------------------
|   0.12 |   0.25 |   0.50 |  -0.50 |
-------------------------------------
|   0.25 |   0.50 |   1.00 |  -1.00 |
-------------------------------------
|   0.12 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.25 |   0.50 |   1.00 |  -1.00 |
-------------------------------------


In [84]:
def iterative_policy_evaluation(gw, policy, gamma, epsilon, alpha=1.0):
    """Evaluate a fixed policy in place, scaling every backup by ``alpha``.

    Sweeps over the nodes yielded by ``gw`` until a whole sweep changes no
    state value by ``epsilon`` or more. Terminal and barrier states are left
    untouched; every other state is overwritten with
    ``alpha * (reward + gamma * value_at_destination)``.

    NOTE(review): ``alpha`` multiplies the entire backed-up value. It does not
    blend the old and new values the way an incremental step size
    (``old + alpha * (target - old)``) would -- confirm this is the intended
    meaning.

    Args:
        gw: Grid world. Must be iterable over nodes exposing ``.state`` and
            provide ``is_terminal``, ``is_barrier``, ``get_value``,
            ``set_value``, ``get_reward_for_action`` and
            ``get_value_at_destination``.
        policy: Mapping from state to the action taken in that state.
        gamma: Factor applied to the value at the destination state.
        epsilon: Stopping threshold on the largest absolute per-state change
            within one sweep. Must be greater than zero.
        alpha: Factor applied to the whole backed-up value. Defaults to 1.0
            so that the four-argument calls made by earlier cells keep
            working after this cell redefines the function.

    Returns:
        None. Results are stored in ``gw`` via ``gw.set_value``.

    Raises:
        ValueError: If ``epsilon`` is not greater than zero.
    """
    # A sweep's largest change is never negative, so with a non-positive (or
    # NaN) epsilon the exit test below could never succeed.
    if not epsilon > 0:
        raise ValueError("epsilon must be greater than 0")

    converged = False
    while not converged:
        biggest_change = 0
        for node in gw:
            state = node.state
            if gw.is_terminal(state) or gw.is_barrier(state):
                continue

            old_value = gw.get_value(state)
            action = policy[state]
            reward = gw.get_reward_for_action(state, action)
            value_at_dest = gw.get_value_at_destination(state, action)

            new_value = alpha * (reward + gamma * value_at_dest)
            # Written back immediately: the grid is updated during the sweep.
            gw.set_value(state, new_value)

            biggest_change = max(biggest_change, abs(new_value - old_value))

        # Checked once per full sweep, after all states were visited.
        converged = biggest_change < epsilon

In [92]:
# Evaluate policy1 with gamma=0.9, epsilon=0.01 and alpha=0.9 (positional
# arguments, in that order), then print the stored values. The fifth
# argument needs a definition of iterative_policy_evaluation that accepts
# alpha to be the one currently live in the kernel.
iterative_policy_evaluation(
    gw1, policy1, 0.9, 0.01, 0.9)
print("Values for the policy")
gw1.print_values()

Values for the policy
-------------------------------------
|   0.48 |   0.59 |   0.73 |  -0.73 |
-------------------------------------
|   0.59 |   0.73 |   0.90 |  -0.90 |
-------------------------------------
|   0.48 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.59 |   0.73 |   0.90 |  -0.90 |
-------------------------------------
