In [1]:
import numpy as np
from matplotlib import pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

In [2]:
# Convergence threshold: value iteration stops once the largest change of
# V(s) within one sweep falls below this value (0.001).
SMALL_ENOUGH = 1e-3
# Discount factor applied to the value of the successor state.
GAMMA = 0.9
# Actions tried in every non-terminal state: Up, Down, Left, Right.
ALL_POSSIBLE_ACTIONS = ("U", "D", "L", "R")

In [26]:
def deterministic_main(grid_name):
    '''Run value iteration on a grid world whose moves are deterministic.

    A random initial policy is printed, V(s) is then updated with the Bellman
    optimality backup until the largest change within one sweep is below
    SMALL_ENOUGH, and finally the converged values and the policy that is
    greedy with respect to them are printed.

    Args:
        grid_name: "negative" selects negative_grid(step_cost=-.1) and also
            prints its rewards; the step cost is intended to show whether a
            per-step penalty encourages the agent to find a shorter path to
            the goal. Any other value selects standard_grid().

    Returns:
        None. All results are printed.
    '''

    if grid_name == "negative":
        grid = negative_grid(step_cost=-.1)

        # print rewards
        print("rewards: ")
        print_values(grid.rewards, grid)
    else:
        grid = standard_grid()

    # Random starting policy: one randomly chosen action per state that has
    # actions. It is only displayed; value iteration does not read it, and it
    # is overwritten by the greedy policy at the end.
    policy = {s: np.random.choice(ALL_POSSIBLE_ACTIONS) for s in grid.actions.keys()}

    print("Initial Policy: ")
    print_policy(policy, grid)

    # Initialise V(s): a random number in [0, 1) for states that have actions,
    # 0 for every other state.
    states = grid.all_states()
    V = {s: (np.random.random() if s in grid.actions else 0) for s in states}

    def action_value(state, action):
        # One-step lookahead: reward of taking `action` in `state` plus the
        # discounted value of the state the grid ends up in.
        grid.set_state(state)
        reward = grid.move(action)
        return reward + GAMMA * V.get(grid.current_state())

    # Value iteration: sweep over all states until the value function converges.
    while True:
        biggest_change = 0
        for s in states:
            # Only states that appear in the policy are updated.
            if s not in policy:
                continue
            old_v = V.get(s)
            new_v = max(
                (action_value(s, a) for a in ALL_POSSIBLE_ACTIONS),
                default=float('-inf'),
            )
            # Updated in place, so later states in the same sweep already see
            # the new value.
            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - new_v))

        if biggest_change < SMALL_ENOUGH:
            break

    # Greedy policy extraction: pick the action with the highest one-step
    # lookahead value. max() returns the first maximal action, matching a
    # strict ">" comparison scan over ALL_POSSIBLE_ACTIONS.
    for s in policy.keys():
        policy[s] = max(
            ALL_POSSIBLE_ACTIONS,
            key=lambda a: action_value(s, a),
            default=None,
        )

    print("Values: ")
    print_values(V, grid)

    print("Policy: ")
    print_policy(policy, grid)

In [31]:
def random_main(grid_name):
    '''Run value iteration on a grid world whose moves are stochastic.

    When the agent chooses an action, that action is executed with
    probability 0.5; otherwise one of the remaining actions is executed,
    each with equal probability (0.5/3 for the four actions used here).
    V(s) is updated with the expected Bellman optimality backup until the
    largest change within one sweep is below SMALL_ENOUGH. The converged
    values and the greedy policy are then printed.

    Args:
        grid_name: "negative" selects negative_grid(step_cost=-.1) and also
            prints its rewards; the step cost is intended to show whether a
            per-step penalty encourages the agent to find a shorter path to
            the goal. Any other value selects standard_grid().

    Returns:
        None. All results are printed.
    '''

    if grid_name == "negative":
        grid = negative_grid(step_cost=-.1)

        # print rewards
        print("rewards: ")
        print_values(grid.rewards, grid)
    else:
        grid = standard_grid()

    # Transition model: probability that the chosen action is the one actually
    # executed, and the probability of each other action being executed instead.
    p_intended = 0.5
    p_other = 0.5 / (len(ALL_POSSIBLE_ACTIONS) - 1)

    # Random starting policy: one randomly chosen action per state that has
    # actions. It is only displayed; value iteration does not read it, and it
    # is overwritten by the greedy policy at the end.
    policy = {s: np.random.choice(ALL_POSSIBLE_ACTIONS) for s in grid.actions.keys()}

    print("Initial Policy: ")
    print_policy(policy, grid)

    # Initialise V(s): a random number in [0, 1) for states that have actions,
    # 0 for every other state.
    states = grid.all_states()
    V = {s: (np.random.random() if s in grid.actions else 0) for s in states}

    def expected_value(state, action):
        # Expected one-step return of choosing `action` in `state`: every
        # action that could actually be executed is simulated from `state`
        # and weighted by its probability.
        total = 0
        for outcome in ALL_POSSIBLE_ACTIONS:
            p = p_intended if outcome == action else p_other
            grid.set_state(state)
            # Simulate the executed action (`outcome`), not the chosen one.
            reward = grid.move(outcome)
            total += p * (reward + (GAMMA * V.get(grid.current_state())))
        return total

    # Value iteration: sweep over all states until the value function converges.
    while True:
        biggest_change = 0
        for s in states:
            # Only states that appear in the policy are updated.
            if s not in policy:
                continue
            old_v = V.get(s)
            new_v = max(
                (expected_value(s, a) for a in ALL_POSSIBLE_ACTIONS),
                default=float('-inf'),
            )
            # Updated in place, so later states in the same sweep already see
            # the new value.
            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - new_v))

        if biggest_change < SMALL_ENOUGH:
            break

    # Greedy policy extraction, using the same expectation over executed
    # actions as the value-iteration backup above. max() returns the first
    # maximal action, matching a strict ">" comparison scan.
    for s in policy.keys():
        policy[s] = max(
            ALL_POSSIBLE_ACTIONS,
            key=lambda a: expected_value(s, a),
            default=None,
        )

    print("Values: ")
    print_values(V, grid)

    print("Policy: ")
    print_policy(policy, grid)

In [28]:
# Solve the step-cost ("negative") grid with deterministic moves.
if __name__ == "__main__":
    deterministic_main(grid_name="negative")

rewards: 
--------------------
 -0.1 -0.1 -0.1  1
--------------------
 -0.1  0 -0.1 -1
--------------------
 -0.1 -0.1 -0.1 -0.1
Initial Policy: 
--------------------
 U | U | R |   |
--------------------
 R |   | L |   |
--------------------
 D | R | R | U |
Values: 
--------------------
  0.62  0.8  1.0  0
--------------------
  0.46  0  0.8  0
--------------------
  0.31  0.46  0.62  0.46
Policy: 
--------------------
 R | R | R |   |
--------------------
 U |   | U |   |
--------------------
 U | R | U | L |


In [32]:
# Solve the same step-cost ("negative") grid with stochastic moves.
random_main(grid_name="negative")

rewards: 
--------------------
 -0.1 -0.1 -0.1  1
--------------------
 -0.1  0 -0.1 -1
--------------------
 -0.1 -0.1 -0.1 -0.1
Initial Policy: 
--------------------
 L | D | L |   |
--------------------
 D |   | U |   |
--------------------
 L | R | R | L |
Values: 
--------------------
 -0.08  0.2  0.55  0
--------------------
 -0.28  0 -0.06  0
--------------------
 -0.42 -0.44 -0.33 -0.57
Policy: 
--------------------
 R | R | R |   |
--------------------
 U |   | U |   |
--------------------
 U | R | U | L |
