# **Policy Iteration - Draft**
### 2022/04/22, A. J. Zerouali

This is a debug file. Objective is to implement policy iteration for an environment such as Windy GridWorld.

In [1]:
#####################################################
##### IMPORTANT: ALWAYS EXECUTE THIS CELL FIRST #####
#####################################################
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## I - Windy GridWorld

Next cell contains the main test environment class, along with a helper function for building the environment. This section also contains the printing functions and the random policy generator.

### 1) Windy GridWorld Class and helper function

In [2]:
##### WINDY GRIDWORLD #####
# Updated: 22/04/07, A. J. Zerouali
# The Windy GridWorld environment used in Lazy Programmer's course.
# This is a 3x4 grid, with wall at (1,1), +1 reward at the terminal 
# square (0,3), and -1 reward at the terminal square (1,3).
# For the "windy" variant, the main changes occur in the move() method.
# States are (i,j) tuples, actions are characters, containers are dictionaries.


# GridWorld_simple with only 3x4 grid. This is the environment.
class GridWorld_Windy_small():
    """Small "windy" GridWorld environment.

    States are (row, col) tuples, actions are single characters, and all
    lookup tables (rewards, admissible actions, transition probabilities)
    are dictionaries. The tables are empty after construction and must be
    supplied through set() before move() is used.
    """

    def __init__(self, rows, cols, ini_state, non_term_states, term_states, actions):
        """Store the grid dimensions, the agent's start position and the state/action spaces.

        rows, cols:       dimensions of the grid.
        ini_state:        (row, col) starting position of the agent.
        non_term_states:  set of non-terminal states.
        term_states:      set of terminal states.
        actions:          the action space (not to be confused with adm_actions).
        """
        # Grid dimensions
        self.rows = rows
        self.cols = cols
        # Agent coordinates
        self.i = ini_state[0]
        self.j = ini_state[1]
        # State and action spaces
        self.non_term_states = non_term_states
        self.term_states = term_states
        self.actions = actions
        # Populated later by set()
        self.adm_actions = {}
        self.rewards = {}
        self.transition_probs = {}

    def set(self, rewards, adm_actions, transition_probs):
        """Attach the reward, admissible-action and transition tables.

        rewards:          {(row, col): reward}
        adm_actions:      {(row, col): iterable of actions available in that state}
        transition_probs: {((row, col), action): {(row_new, col_new): probability}}

        WARNING: self.adm_actions (per-state accessible actions) is distinct
        from self.actions (the whole action space).
        """
        self.rewards = rewards
        self.adm_actions = adm_actions
        self.transition_probs = transition_probs

    def set_state(self, s):
        """Place the agent at state s = (row, col)."""
        self.i = s[0]
        self.j = s[1]

    def current_state(self):
        """Return the agent's position as a (row, col) tuple."""
        return (self.i, self.j)

    def is_terminal(self, s):
        """Return True if the given state s belongs to the terminal states."""
        return (s in self.term_states)

    def move(self, action):
        """Execute `action` from the current state and return the resulting reward.

        If the action is admissible in the current state, the next state is
        sampled from self.transition_probs[(state, action)] and the agent is
        moved there. Otherwise (inadmissible action, or a state with no entry
        in self.adm_actions such as a terminal state) the agent stays put.

        The reward returned is the one stored for the state the agent is in
        after the move; states missing from self.rewards yield 0.
        """
        state = self.current_state()

        # Use the instance's own table. The previous code read a bare global
        # `adm_actions`, which fails (or silently uses another environment's
        # table) unless such a global happens to exist in the kernel.
        if action in self.adm_actions.get(state, ()):

            # np.random.choice() does not accept dictionaries, so split the
            # {next_state: prob} mapping into two aligned lists and sample an index.
            outcomes = self.transition_probs[(state, action)]
            next_states = list(outcomes.keys())
            next_states_probs = list(outcomes.values())

            rand_ind = np.random.choice(a=len(next_states), p=next_states_probs)
            self.set_state(next_states[rand_ind])

        # Reward of the state the agent now occupies (0 if not listed)
        return self.rewards.get((self.i, self.j), 0)

    def game_over(self):
        """Return True if the agent currently sits on a terminal state."""
        return ((self.i, self.j) in self.term_states)

    def all_states(self):
        """Return the set of all reachable states (non-terminal and terminal)."""
        return (self.non_term_states | self.term_states)
# END CLASS

# Helper function to construct an environment.
# Consists mainly of initializations.
def windy_standard_grid(penalty=0):
    """Build the standard 3x4 Windy GridWorld environment.

    Input:  penalty: Float. Reward assigned to every non-terminal state
                     (nothing is stored for them when penalty is 0).
    Output: a fully configured GridWorld_Windy_small object, with the agent
            starting at the bottom-left square (2,0).

    Terminal squares are (0,3) with reward +1 and (1,3) with reward -1.
    The only stochastic ("windy") transition is 'U' from (1,2).
    """
    # State and action spaces
    start = (2, 0)
    action_space = {"U", "D", "L", "R"}
    non_terminal = {(0,0), (0,1), (0,2), (1,0), (1,2), (2,0), (2,1), (2,2), (2,3)}
    terminal = {(0,3), (1,3)}

    # Rewards: terminal squares always, non-terminal squares only for a non-zero penalty
    reward_table = {(0,3):1, (1,3): -1}
    if penalty != 0:
        reward_table.update({state: penalty for state in non_terminal})

    # Actions that can be taken from each non-terminal square
    allowed = {
        (0,0): ("D", "R"),
        (0,1): ("L", "R"),
        (0,2): ("L", "R", "D"),
        (1,0): ("D", "U"),
        (1,2): ("U", "D", "R"),
        (2,0): ("U", "R"),
        (2,1): ("L", "R"),
        (2,2): ("U", "R", "L"),
        (2,3): ("U", "L"),
    }

    # Deterministic moves, listed as (state, action, next state).
    # Self-transitions are never listed: the agent does not stay in place.
    sure_moves = [
        ((2, 0), 'U', (1, 0)),
        ((2, 0), 'R', (2, 1)),
        ((1, 0), 'U', (0, 0)),
        ((1, 0), 'D', (2, 0)),
        ((0, 0), 'D', (1, 0)),
        ((0, 0), 'R', (0, 1)),
        ((0, 1), 'L', (0, 0)),
        ((0, 1), 'R', (0, 2)),
        ((0, 2), 'D', (1, 2)),
        ((0, 2), 'L', (0, 1)),
        ((0, 2), 'R', (0, 3)),
        ((2, 1), 'L', (2, 0)),
        ((2, 1), 'R', (2, 2)),
        ((2, 2), 'U', (1, 2)),
        ((2, 2), 'L', (2, 1)),
        ((2, 2), 'R', (2, 3)),
        ((2, 3), 'U', (1, 3)),
        ((2, 3), 'L', (2, 2)),
    ]
    transitions = {(origin, act): {dest: 1.0} for origin, act, dest in sure_moves}

    # The windy square: going up from (1,2) lands on (0,2) or (1,3) with equal odds
    transitions[((1, 2), 'U')] = {(0, 2): 0.5, (1, 3): 0.5}
    transitions[((1, 2), 'D')] = {(2, 2): 1.0}
    transitions[((1, 2), 'R')] = {(1, 3): 1.0}

    # Instantiate the environment and attach its tables
    env = GridWorld_Windy_small(3, 4, start, non_terminal, terminal, action_space)
    env.set(reward_table, allowed, transitions)
    return env

# END DEF windy_standard_grid()

### 2) Printing functions

In [3]:
##### PRINTING FUNCTIONS #####
# 2022/04/06, AJ Zerouali
# Modified from Lazy Prog's GitHub

def print_values(Val_fn, env):
    """Print a value function as a grid of env.rows x env.cols cells.

    Val_fn: dictionary {(row, col): value}; squares missing from it print as 0.
    env:    object exposing the grid dimensions as `rows` and `cols`.
    """
    rule = "------------------------"
    print("## VALUE FUNCTION ##")
    for row in range(env.rows):
        print(rule)
        for col in range(env.cols):
            value = Val_fn.get((row, col), 0)
            # Non-negative numbers get a leading blank so they line up with
            # negative ones, whose minus sign takes up that column.
            template = " %.2f|" if value >= 0 else "%.2f|"
            print(template % value, end="")
        print("")
    print(rule)

def print_policy(Pi_fn, env):
    """Print a deterministic policy as a grid of env.rows x env.cols cells.

    Pi_fn: dictionary {(row, col): {action: 1.0}}. Only the first key of each
           inner dictionary is shown, so the output is meaningful for
           deterministic policies only.
    env:   object exposing the grid dimensions as `rows` and `cols`.

    The square (1,1) is printed as an empty cell and the squares (0,3) and
    (1,3) are skipped entirely; every other square must be a key of Pi_fn.
    """
    rule = "------------------------"
    blank_square = (1, 1)
    skipped_squares = ((0, 3), (1, 3))

    print("##  POLICY  ##")
    for row in range(env.rows):
        print(rule)
        for col in range(env.cols):
            square = (row, col)
            if square == blank_square:
                print("  %s  |" % " ", end="")
            elif square not in skipped_squares:
                # Relies on the inner dictionary holding exactly one action
                chosen = tuple(Pi_fn[square].keys())[0]
                print("  %s  |" % chosen, end="")
        print("")
    print(rule)

# Illustrative example of the policy dictionary layout that print_policy() expects.
# This is a bare string literal, not an assignment: it defines no variable, and
# because it is the last expression of the cell it is echoed as the cell output.
'''
The policy looks like this:

pi = {
    (2, 0): {'U': 1.0},
    (1, 0): {'U': 1.0},
    (0, 0): {'R': 1.0},
    (0, 1): {'R': 1.0},
    (0, 2): {'R': 1.0},
    (1, 2): {'U': 1.0},
    (2, 1): {'R': 1.0},
    (2, 2): {'U': 1.0},
    (2, 3): {'L': 1.0},
  }

'''

"\nThe policy looks like this:\n\npi = {\n    (2, 0): {'U': 1.0},\n    (1, 0): {'U': 1.0},\n    (0, 0): {'R': 1.0},\n    (0, 1): {'R': 1.0},\n    (0, 2): {'R': 1.0},\n    (1, 2): {'U': 1.0},\n    (2, 1): {'R': 1.0},\n    (2, 2): {'U': 1.0},\n    (2, 3): {'L': 1.0},\n  }\n\n"

### 3) Random policy generator

The following generates a random deterministic policy.

In [4]:
##### RANDOM DETERMINISTIC POLICY GENERATOR #####
## 2022/04/08, AJ Zerouali
# Recall: rand_ind = np.random.choice(a = len(next_states), p = next_states_probs)

def gen_random_policy(env):
    """Draw a random deterministic policy for the given environment.

    Input:  env, an object exposing `non_term_states` (iterable of states)
            and `adm_actions` ({state: iterable of admissible actions}).
    Output: dictionary {state: {action: 1.0}} with one uniformly drawn
            admissible action per non-terminal state.
    """
    admissible = env.adm_actions

    def draw(options):
        # One np.random.randint() call per state, indexing into the action list
        options = list(options)
        return options[np.random.randint(len(options))]

    return {state: {draw(admissible[state]): 1.0} for state in env.non_term_states}

### 4) Value function comparison

**22/04/22**
Let's write a function that compares two value functions.

In [5]:
# Compare two value functions #

def compare_value_fns(V_old, V_new, non_term_states):
    """Return the sup-norm distance between two value functions.

    ARGUMENTS: - V_old and V_new: dictionaries {state: value} to compare
               - non_term_states: the states over which the comparison runs
    OUTPUT: max over s in non_term_states of |V_old(s) - V_new(s)|,
            or 0 when non_term_states is empty.
    """
    gaps = [abs(V_old[state] - V_new[state]) for state in non_term_states]
    # The leading 0 is the baseline returned when there is nothing to compare
    return max([0] + gaps)

## II - Policy Iteration 

This section is divided into the following parts (implemented as functions below):

1) Iterative policy evaluation

2) Policy improvement

3) Policy iteration

The "main" is in the next section. Each of the parts above is implemented in a distinct cell. There is an appendix below with a test case for the iterative policy eval function.

### 1) Iterative policy evaluation

A more general function for stochastic policies and non-trivial transitions.

In [6]:
##### ITERATIVE POLICY EVALUATION #####
## 2022/04/08, AJ Zerouali
## REMARK: This function takes into account stochastic policies and non-trivial transition probabilities.

def iter_policy_eval(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma):
    """Iterative policy evaluation for a (possibly stochastic) policy.

    ARGUMENTS:
     Pi: Dict. Policy to evaluate, {state: {action: probability}}.
     V_ini: Dict. Initial guess {state: value}. It is NOT modified; states
            missing from it start at 0.
     P_trans: Dict. Transition probabilities {(state, action): {next_state: prob}}.
     Rwds: Dict. Rewards keyed by the state being entered, {next_state: reward};
           states missing from it yield 0.
     adm_actions: Dict. Admissible actions {state: iterable of actions}.
     non_term_states: Set. Non-terminal states.
     term_states: Set. Terminal states; their value is fixed to 0.
     epsilon: Float > 0. Convergence threshold on sup|V_(k+1) - V_k|.
     gamma: Float. Discount factor.

    OUTPUT:
     V_pi: Dict. Value function corresponding to Pi.
     k: Number of sweeps performed until convergence.

    RAISES:
     ValueError if epsilon is not strictly positive (the stopping test
     Delta_V < epsilon could then never succeed).
    """
    if not epsilon > 0:
        raise ValueError("epsilon must be strictly positive")

    # INITIALIZATIONS
    # Work on a private copy so the caller's V_ini is left untouched, and make
    # sure every state has a starting value before the first sweep reads it.
    V_new = dict(V_ini)
    for s in non_term_states:
        V_new.setdefault(s, 0)
    for s in term_states:
        V_new[s] = 0

    # Iteration counter
    k = 0

    # MAIN LOOP (one synchronous sweep per pass)
    while True:

        # V_k becomes the reference, V_(k+1) is rebuilt from scratch
        V_old = V_new
        V_new = {s: 0 for s in term_states}
        # sup|V_(k+1) - V_k| over the current sweep
        Delta_V = 0

        for s in non_term_states:

            # COMPUTE V_(k+1)(s)
            V_s_new = 0

            for a in adm_actions[s]:

                # Skip actions the policy never takes in s
                pi_sa = Pi[s].get(a, 0)
                if pi_sa == 0:
                    continue

                # Only transitions with an entry in P_trans contribute
                for s_next, p in P_trans[(s, a)].items():
                    V_s_new += pi_sa*p \
                                *( Rwds.get(s_next, 0) + gamma*V_old[s_next] )

            V_new[s] = V_s_new
            Delta_V = max(Delta_V, abs(V_s_new - V_old[s]))

        # END FOR OVER s

        k += 1

        # Stop once the largest update falls below the threshold
        if Delta_V < epsilon:
            break
    # END WHILE

    # Return V_pi and number of iterations
    return V_new, k

**22/04/22 - 19:11**
So I finally got this to work and recovered the first result (zero penalty) of Lazy Programmer. I had to change the way I broke the main while loop in policy iteration. I have to clarify where my mistake was. 

Secondly, I need to modify the code so as to compare two value functions and use the epsilon threshold to break the main loop. On the first successful try, I limited *N_iter* to 31. 

**22/04/22 - 21:01**
The cells below reproduce the results of Lazy Programmer. I made a mistake in the update line of the policy improvement step, it was the wrong value function variable.

### 2) Policy improvement by policy iteration


**Functions that we'll need:**
1) Need a function that will implement policy improvement.

2) Policy iteration will call iterative policy evaluation and policy improvement function in (1).

3) Need a function that generates a random initial policy.


For policy improvement function, keep the following in mind:
* Create a dictionary for $Q_\pi$.
* Use the *max()* function to extract the argmax from $Q_\pi(s,\cdot)$ for each $s$. Syntax is as follows:

                {argmax in dict} = max(dict, key = dict.get)
* Major flaw/complication of present design: The algorithm is valid for a deterministic policy which is implemented as a stochastic policy.


In [7]:
###########################################
## POLICY IMPROVEMENT - improve_policy() ##
###########################################
# 2022/04/22 - A. J. Zerouali
# This function is called in the main loop of the policy iteration algorithm.

def improve_policy(Pi, V_pi, P_trans, Rwds, adm_actions, non_term_states, term_states, gamma):
    """Greedy policy improvement step of policy iteration.

    ARGUMENTS:
     Pi: Dict. Current deterministic policy {state: {action: 1.0}}.
         It is NOT modified; the improved policy is returned as a new dictionary.
     V_pi: Dict. Value function of Pi, {state: value}.
     P_trans: Dict. Transition probabilities {(state, action): {next_state: prob}}.
     Rwds: Dict. Rewards keyed by the state being entered; missing states yield 0.
     adm_actions: Dict. Admissible actions {state: iterable of actions}.
     non_term_states: Set. States whose action is (re)selected.
     term_states: Set. Unused here; kept for a signature parallel to iter_policy_eval().
     gamma: Float. Discount factor.

    OUTPUT:
     Pi_new: Dict. Policy that is greedy with respect to V_pi.
     policy_is_stable: Bool. True iff no state changed its action.
    """
    # Shallow copy is enough: entries are replaced wholesale, never edited in place
    Pi_new = dict(Pi)
    policy_is_stable = True

    for s in non_term_states:

        # Action currently prescribed in s (first and only key for a deterministic policy)
        a_old = list(Pi[s].keys())[0]

        # Q_pi(s, a) for every admissible action a
        Q_s = {}
        for a in adm_actions[s]:
            q_sa = 0
            # Only transitions with an entry in P_trans contribute
            for s_next, p in P_trans[(s, a)].items():
                q_sa += p*\
                        ( Rwds.get(s_next, 0) + gamma*V_pi[s_next] )
            Q_s[a] = q_sa

        # Argmax over actions; ties go to the action listed first in adm_actions[s]
        a_new = max(Q_s, key=Q_s.get)
        Pi_new[s] = {a_new: 1.0}

        # A single changed action is enough to require another evaluation pass
        if a_old != a_new:
            policy_is_stable = False

    # END FOR s in non_term_states

    return Pi_new, policy_is_stable
###########################################
## END OF improve_policy()               ##
###########################################

### 3) Policy Iteration Algorithm

This function is the culmination of all the above. It implements the famous "Policy Iteration Algorithm" of dynamic programming, and calls both the policy evaluation and policy improvement functions.

**Remarks:** 
1) The first execution was around 2:30 on 2022/04/09 and crashed.

2) This function is called in the "main" section below. Didn't work

3) 2022/04/22 - 21:25 - Finally debugged. Gives correct result with penalty 0. See test below function

In [10]:
################################
## POLICY ITERATION ALGORITHM ##
################################

def Policy_Iteration(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma):
    """Policy iteration: alternate policy evaluation and greedy improvement.

    ARGUMENTS:
     Pi: Dict. Initial deterministic policy {state: {action: 1.0}} (not modified).
     V_ini: Dict. Initial value function {state: value} (not modified);
            non-terminal states missing from it start at 0.
     P_trans, Rwds, adm_actions, non_term_states, term_states:
            MDP description, passed through to iter_policy_eval() and improve_policy().
     epsilon: Float. Convergence threshold.
     gamma: Float. Discount factor.

    OUTPUT:
     V_new: Dict. Value function from the last policy evaluation.
     Pi: Dict. Final policy.
     N_iter: Number of evaluation/improvement rounds performed.
    """
    # Private copies, so neither this function nor the helpers it calls can
    # alter the caller's initial policy or initial value function.
    Pi = {s: dict(dist) for s, dist in Pi.items()}
    V_old = dict(V_ini)
    for s in non_term_states:
        V_old.setdefault(s, 0)

    # Policy iteration counter
    N_iter = 0

    while True:
        #######################
        ## POLICY EVALUATION ##
        #######################
        V_new, k = iter_policy_eval(Pi, V_old, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)

        # DEBUG:
        print(f"Policy evaluation fn iter_policy_eval() converged after {k} iterations.")

        ########################
        ## POLICY IMPROVEMENT ##
        ########################
        Pi, policy_is_stable = improve_policy(Pi, V_new, P_trans, Rwds, adm_actions, non_term_states, term_states, gamma)

        N_iter += 1

        # Change in value function between two successive policies
        delta_V = compare_value_fns(V_old, V_new, non_term_states)
        V_old = V_new

        # BREAK conditions
        if policy_is_stable:
            break
        # Value-based exit: guards against flip-flopping between equally good
        # policies. It is only meaningful from the second round on, when V_old
        # is the value of the previous policy. On the first round V_old is just
        # the initial guess; if it happens to coincide with the value of the
        # starting policy (e.g. all zeros with zero penalty and a policy that
        # never reaches a terminal square) the loop would stop after a single
        # improvement step with a non-optimal policy.
        if N_iter > 1 and delta_V <= epsilon:
            break

    # END WHILE

    return V_new, Pi, N_iter

# END DEF Policy_Iteration

#### Test with zero penalty. 

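The optimal policy for zero penalty is (Lecture 57); note that the test cell below currently sets `pen = -2`, so its output differs from this reference unless `pen` is changed to `0`: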

        ##  POLICY  ##
        ------------------------
          R  |  R  |  R  |
        ------------------------
          U  |     |  D  |
        ------------------------
          U  |  L  |  L  |  L  |
        ------------------------
        
The optimal value function is:

        ## VALUE FUNCTION ##
        ------------------------
         0.81| 0.90| 1.00| 0.00|
        ------------------------
         0.73| 0.00| 0.48| 0.00|
        ------------------------
         0.66| 0.59| 0.53| 0.48|
        ------------------------

In [67]:
# Discard results left over from a previous run so the test below starts clean.
# pop() with a default is used instead of `del` so that this cell also runs on
# a fresh kernel, where these names do not exist yet.
for _stale_name in ("grid", "Pi_star", "V_star"):
    globals().pop(_stale_name, None)

In [12]:
######################
## INIT ENVIRONMENT ##
######################

# Reward assigned to every non-terminal square
pen = -2
# Discount factor and convergence threshold
gamma = 0.9
epsilon = 1e-3

# Build the environment
grid = windy_standard_grid(penalty=pen)
print(f"Windy GridWorld environment with penalty = {pen} created ... \n")

# Draw a random deterministic starting policy and display it
Pi = gen_random_policy(grid)
print("Printing initial policy ...")
print_policy(Pi, grid)
print("\n")

# Start from a value function that is zero on every state
V_ini = {}
for s in grid.all_states():
    V_ini[s] = 0

##############################
## EXECUTE POLICY ITERATION ##
##############################

print("Executing policy iteration algorithm ...")
# SIGNATURE: Policy_Iteration(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)
(V_star, Pi_star, N_iter) = Policy_Iteration(
    Pi, V_ini,
    grid.transition_probs, grid.rewards, grid.adm_actions,
    grid.non_term_states, grid.term_states,
    epsilon, gamma,
)


###################
## PRINT RESULTS ##
###################

# Number of policy iteration rounds
print(f"Policy_Iteration() converged after {N_iter} iterations ...\n")

# Resulting value function
print("Printing optimal value function ...")
print_values(V_star, grid)

# Resulting (deterministic) policy
print("Printing optimal policy ...")
print_policy(Pi_star, grid)

Windy GridWorld environment with penalty = -2 created ... 

Printing initial policy ...
##  POLICY  ##
------------------------
  D  |  R  |  R  |
------------------------
  U  |     |  U  |
------------------------
  U  |  L  |  R  |  U  |
------------------------


Executing policy iteration algorithm ...
Policy evaluation fn iter_policy_eval() converged after 74 iterations.
Policy evaluation fn iter_policy_eval() converged after 2 iterations.
Policy evaluation fn iter_policy_eval() converged after 2 iterations.
Policy_Iteration() converged after 3 iterations ...

Printing optimal value function ...
## VALUE FUNCTION ##
------------------------
-2.99|-1.10| 1.00| 0.00|
------------------------
-4.69| 0.00|-1.00| 0.00|
------------------------
-6.15|-4.61|-2.90|-1.00|
------------------------
Printing optimal policy ...
##  POLICY  ##
------------------------
  R  |  R  |  R  |
------------------------
  U  |     |  R  |
------------------------
  R  |  R  |  U  |  U  |
--------------

### Windy GridWorld with various penalties

In this part I'm attempting to reproduce the results of Lecture 57.

In [13]:
### Windy GridWorld with various penalties
# 2022/04/22
# Runs policy iteration once per penalty value, each time from a fresh
# environment and a fresh random deterministic policy.
### SIGNATURES:
# windy_standard_grid(penalty=0)
# Policy_Iteration(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)
# print_values(Val_fn, env)
# print_policy(Pi_fn, env)

# Discount factor and convergence threshold
gamma = 0.9
epsilon = 1e-3

# Penalties to try
penalties = [0.0, -0.1, -0.2, -0.4, -0.5, -2]

for pen in penalties:

    # Build the environment
    grid = windy_standard_grid(penalty=pen)
    print(f"Windy GridWorld environment with penalty = {pen} created ... \n")

    # Draw a random deterministic starting policy and display it
    Pi = gen_random_policy(grid)
    print("Printing initial policy ...")
    print_policy(Pi, grid)
    print("\n")

    # Start from a value function that is zero on every state
    V_ini = {}
    for s in grid.all_states():
        V_ini[s] = 0

    ##############################
    ## EXECUTE POLICY ITERATION ##
    ##############################

    print("Executing policy iteration algorithm ...")
    (V_star, Pi_star, N_iter) = Policy_Iteration(
        Pi, V_ini,
        grid.transition_probs, grid.rewards, grid.adm_actions,
        grid.non_term_states, grid.term_states,
        epsilon, gamma,
    )

    ###################
    ## PRINT RESULTS ##
    ###################

    # Number of policy iteration rounds
    print(f"Policy_Iteration() converged after {N_iter} iterations ...\n")

    # Resulting value function
    print("Printing optimal value function ...")
    print_values(V_star, grid)

    # Resulting (deterministic) policy
    print("Printing optimal policy ...")
    print_policy(Pi_star, grid)

    # Separator between penalties
    print("_____________________________________________\n\n")

Windy GridWorld environment with penalty = 0.0 created ... 

Printing initial policy ...
##  POLICY  ##
------------------------
  R  |  L  |  R  |
------------------------
  U  |     |  U  |
------------------------
  U  |  R  |  L  |  U  |
------------------------


Executing policy iteration algorithm ...
Policy evaluation fn iter_policy_eval() converged after 3 iterations.
Policy evaluation fn iter_policy_eval() converged after 2 iterations.
Policy evaluation fn iter_policy_eval() converged after 2 iterations.
Policy evaluation fn iter_policy_eval() converged after 4 iterations.
Policy evaluation fn iter_policy_eval() converged after 3 iterations.
Policy_Iteration() converged after 5 iterations ...

Printing optimal value function ...
## VALUE FUNCTION ##
------------------------
 0.81| 0.90| 1.00| 0.00|
------------------------
 0.73| 0.00| 0.48| 0.00|
------------------------
 0.66| 0.59| 0.53| 0.48|
------------------------
Printing optimal policy ...
##  POLICY  ##
------------

## Policy iteration debug



### Policy improvement and policy iteration - Scrap

This is getting more and more complex. 

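The next cell sets up a test case with "wind" in state (1,2) (Lecture 57). Note that it currently uses `pen = -0.5`; set `pen = 0` to reproduce the zero-penalty reference results shown further below.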

In [48]:
# Reward assigned to every non-terminal square
pen = -0.5
# Discount factor and convergence threshold
gamma = 0.9
epsilon = 1e-3

# Build the environment
grid = windy_standard_grid(penalty=pen)
print(f"Windy GridWorld environment with penalty = {pen} created ... \n")

# Draw a random deterministic starting policy and display it
Pi = gen_random_policy(grid)
print("Printing initial policy ...")
print_policy(Pi, grid)
print("\n")

# Start from a value function that is zero on every state
V_ini = {}
for s in grid.all_states():
    V_ini[s] = 0

# DEBUG: expose the environment tables under the argument names used by
# iter_policy_eval(), improve_policy() and Policy_Iteration()
P_trans, Rwds, adm_actions = grid.transition_probs, grid.rewards, grid.adm_actions
non_term_states, term_states = grid.non_term_states, grid.term_states

# Starting value function for the inline loop (might have to suppress this later)
V_old = V_ini

Windy GridWorld environment with penalty = -0.5 created ... 

Printing initial policy ...
##  POLICY  ##
------------------------
  R  |  R  |  L  |
------------------------
  D  |     |  U  |
------------------------
  U  |  R  |  L  |  L  |
------------------------




#### Policy iteration

Next comes the main loop of policy iteration. Upon convergence, we're supposed to get that the optimal policy is (Lecture 57):

        ##  POLICY  ##
        ------------------------
          R  |  R  |  R  |
        ------------------------
          U  |     |  D  |
        ------------------------
          U  |  L  |  L  |  L  |
        ------------------------
        
The optimal value function is:

        ## VALUE FUNCTION ##
        ------------------------
         0.81| 0.90| 1.00| 0.00|
        ------------------------
         0.73| 0.00| 0.48| 0.00|
        ------------------------
         0.66| 0.59| 0.53| 0.48|
        ------------------------


In [49]:
########################
### POLICY ITERATION ###
########################
# Inline (debug) version of the policy iteration loop, with the policy
# improvement step written out in full instead of calling improve_policy().
# Reads from the kernel: Pi, V_old, P_trans, Rwds, adm_actions,
# non_term_states, term_states, epsilon, gamma.
# Leaves behind: V_new (last evaluated value function), Pi (final policy,
# updated in place) and N_iter (number of rounds).

print(f"Executing policy iteration algorithm ...")

# REMARK: V_ini becomes V_old

# Policy iteration counter
N_iter = 0

while True:
    #######################
    ## POLICY EVALUATION ##
    #######################

    V_new, k = iter_policy_eval(Pi, V_old, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)

    # DEBUG:
    print(f"Policy evaluation fn iter_policy_eval() converged after {k} iterations.")

    ###########################################
    ## POLICY IMPROVEMENT (inlined)          ##
    ###########################################

    # Becomes False as soon as one state changes its action
    policy_is_stable = True

    for s in non_term_states:

        # Action currently prescribed in s
        a_old = list(Pi[s].keys())[0]

        # Vs_dict holds Q_pi(s, a) for every admissible action a
        Vs_dict = {}

        for a in adm_actions[s]:

            V_temp = 0

            # Only transitions with an entry in P_trans contribute
            for s_ind in P_trans[(s,a)].keys():
                V_temp += P_trans[(s,a)].get(s_ind,0)*\
                            ( Rwds.get(s_ind,0) + gamma*V_new[s_ind] )
            # END FOR over s_ind

            Vs_dict[a] = V_temp

        # END FOR over a in adm_actions[s]

        # Argmax over actions; ties go to the action listed first
        a_new = max(Vs_dict, key = Vs_dict.get)

        # Make the policy greedy in s (V is re-evaluated at the top of the loop)
        Pi[s] = {a_new:1.0}

        if a_old != a_new:
            policy_is_stable = False

    # END FOR s in non_term_states

    ###########################################
    ## END OF POLICY IMPROVEMENT             ##
    ###########################################

    # Update policy iteration counter
    N_iter += 1

    # Change in value function between two successive policies
    delta_V = compare_value_fns(V_old, V_new, non_term_states)
    # Update value function
    V_old = V_new

    # BREAK WHILE conditions
    if policy_is_stable:
        break
    # The value-based exit is skipped on the first round: there V_old is only
    # the initial guess, and if it coincides with the value of the starting
    # policy the loop would stop after one improvement step with a policy
    # that is not optimal yet.
    elif N_iter > 1 and delta_V<=epsilon:
        break

# END WHILE


Executing policy iteration algorithm ...
Policy evaluation fn iter_policy_eval() converged after 60 iterations.
Policy evaluation fn iter_policy_eval() converged after 3 iterations.
Policy evaluation fn iter_policy_eval() converged after 4 iterations.
Policy evaluation fn iter_policy_eval() converged after 3 iterations.
Policy evaluation fn iter_policy_eval() converged after 2 iterations.
Policy evaluation fn iter_policy_eval() converged after 2 iterations.


In [50]:
###################
## PRINT RESULTS ##
###################
# Results come from the inline loop above, hence V_new / Pi rather than
# the V_star / Pi_star returned by Policy_Iteration().

# Number of policy iteration rounds
print(f"Policy_Iteration() converged after {N_iter} iterations ...\n")

# Resulting value function
print("Printing optimal value function ...")
print_values(V_new, grid)

# Resulting (deterministic) policy
print("Printing optimal policy ...")
print_policy(Pi, grid)

Policy_Iteration() converged after 6 iterations ...

Printing optimal value function ...
## VALUE FUNCTION ##
------------------------
-0.14| 0.40| 1.00| 0.00|
------------------------
-0.63| 0.00|-0.30| 0.00|
------------------------
-1.06|-1.19|-0.77|-1.00|
------------------------
Printing optimal policy ...
##  POLICY  ##
------------------------
  R  |  R  |  R  |
------------------------
  U  |     |  U  |
------------------------
  U  |  R  |  U  |  U  |
------------------------


In [19]:
# First action key stored for state (0,1) in Pi_star
# (iterating a dict yields its keys, so no explicit .keys() is needed)
list(Pi_star[(0, 1)])[0]

'R'

## III - Policy Iteration in Windy GridWorld

In this part I'm attempting to reproduce the results of Lecture 57 of Lazy Programmer's 1st RL course. The idea is to visualize the value function and the deterministic policy for several penalties. The first execution on 22/04/09 failed (miserably).


**Comments:**
1) First bug: The line 

            V_pi, k = iter_policy_eval(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)
was initially:
            
            V_pi = iter_policy_eval(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)

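C++ would've never overlooked this... (*iter_policy_eval()* returns a pair *(value function, number of iterations)*, so without unpacking, *V_pi* was bound to the whole tuple instead of the value-function dictionary.)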

2) The output variables *V_star* and *Pi_star* in *Policy_Iteration()* should actually be ***V_pi*** and ***Pi***.

3) The first successful execution of the cell below was on 22/04/21 at 19:20. The final policy and value function obtained are completely wrong. I generate random initial policies, which could be causing some issue, so I'll start with a well-defined policy. The fact that the value function ... *(TODO: this sentence was left unfinished.)*

In [13]:
### Windy GridWorld with various penalties
# 2022/04/21
# Runs policy iteration once per penalty value and prints, for each run,
# the iteration count, the resulting value function and the resulting policy.
### SIGNATURES:
# windy_standard_grid(penalty=0)
# gen_random_policy(env)
# Policy_Iteration(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)
# print_values(Val_fn, env)
# print_policy(Pi, env)

# Penalty values passed to windy_standard_grid()
penalties = [0.0, -0.1, -0.2, -0.4, -0.5, -2]

# Discount factor and error threshold
gamma = 0.9
epsilon = 1e-3

# Loop over penalties
for pen in penalties:

    # Create environment
    grid = windy_standard_grid(penalty=pen)
    print(f"Windy GridWorld environment with penalty = {pen} created ...")

    # Initialize policy (random) and value function (all zeros, over every state)
    Pi = gen_random_policy(grid)
    V_ini = {s: 0 for s in (grid.non_term_states | grid.term_states)}

    # Execute policy iteration
    print("Executing policy iteration algorithm ...")
    V_star, Pi_star, N_iter = Policy_Iteration(
        Pi, V_ini, grid.transition_probs, grid.rewards, grid.adm_actions,
        grid.non_term_states, grid.term_states, epsilon, gamma,
    )

    # Report how many policy-iteration sweeps were needed (useful when debugging
    # runs that stop suspiciously early)
    print(f"Policy_Iteration() converged after {N_iter} iterations ...")

    # Print optimal value function
    print("Printing optimal value function ...")
    print_values(V_star, grid)

    # Print optimal (deterministic) policy
    print("Printing optimal policy ...")
    print_policy(Pi_star, grid)

    # Separator
    print("_____________________________________________")
    


Windy GridWorld environment with penalty = 0.0 created ...
Executing policy iteration algorithm ...
Printing optimal value function ...
## VALUE FUNCTION ##
------------------------
 0.00| 0.00| 0.00| 0.00|
------------------------
 0.00| 0.00| 0.00| 0.00|
------------------------
 0.00| 0.00| 0.00|-1.00|
------------------------
Printing optimal policy ...
##  POLICY  ##
------------------------
  D  |  L  |  R  |
------------------------
  D  |     |  D  |
------------------------
  U  |  L  |  U  |  L  |
------------------------
_____________________________________________
Windy GridWorld environment with penalty = -0.1 created ...
Executing policy iteration algorithm ...
Printing optimal value function ...
## VALUE FUNCTION ##
------------------------
-0.99|-0.99|-0.99| 0.00|
------------------------
-0.99| 0.00|-1.00| 0.00|
------------------------
-0.99|-0.99|-0.99|-0.99|
------------------------
Printing optimal policy ...
##  POLICY  ##
------------------------
  D  |  L  |  R

## Appendix A: Test case for iterative policy evaluation

Below is a test case with a deterministic policy on the non-windy GridWorld. It's a sanity check taken from the first few lectures on DP. The test should give the following table:

        Value function
        
        ------------------------
         0.81| 0.90| 1.00| 0.00|
        ------------------------
         0.73| 0.00| 0.90| 0.00|
        ------------------------
         0.66| 0.73| 0.81| 0.73|
        ------------------------
        
**Remark:** Before running this test, execute all cells up to and including the one defining *iter_policy_eval()*.

In [6]:
##### (NON)WINDY GRIDWORLD (TEST) #####
# Updated: 22/04/08, A. J. Zerouali
# Test with non-windy case and deterministic policy

def test_standard_grid():
    """Build the non-windy 3x4 GridWorld used as a sanity-check environment.

    Every (state, action) pair listed below leads to exactly one successor
    state with probability 1.0.

    Returns:
        A GridWorld_Windy_small instance whose rewards, admissible actions and
        transition probabilities have been assigned through its set() method.
    """
    def lands_on(row, col):
        # Deterministic transition: all probability mass on a single successor
        return {(row, col): 1.0}

    # Agent starts at the bottom-left square (randomize later)
    start = (2, 0)
    # Action space
    action_space = {"U", "D", "L", "R"}
    # Non-terminal and terminal squares
    non_terminal = {(0,0), (0,1), (0,2), (1,0), (1,2), (2,0), (2,1), (2,2), (2,3)}
    terminal = {(0,3), (1,3)}

    # Instantiate the environment on a 3x4 grid
    world = GridWorld_Windy_small(3, 4, start, non_terminal, terminal, action_space)

    # Rewards for entering a square; squares with reward 0 are not stored
    payoff = {(0,3):1, (1,3): -1}

    # Admissible actions for each non-terminal state
    allowed = {
        (0,0): ("D", "R"),
        (0,1): ("L", "R"),
        (0,2): ("L", "R", "D"),
        (1,0): ("D", "U"),
        (1,2): ("U", "D", "R"),
        (2,0): ("U", "R"),
        (2,1): ("L", "R"),
        (2,2): ("U", "R", "L"),
        (2,3): ("U", "L"),
    }

    # Deterministic transitions: (state, action) -> {next_state: 1.0}
    moves = {
        ((2, 0), 'U'): lands_on(1, 0),
        ((2, 0), 'R'): lands_on(2, 1),

        ((1, 0), 'U'): lands_on(0, 0),
        ((1, 0), 'D'): lands_on(2, 0),

        ((0, 0), 'D'): lands_on(1, 0),
        ((0, 0), 'R'): lands_on(0, 1),

        ((0, 1), 'L'): lands_on(0, 0),
        ((0, 1), 'R'): lands_on(0, 2),

        ((0, 2), 'D'): lands_on(1, 2),
        ((0, 2), 'L'): lands_on(0, 1),
        ((0, 2), 'R'): lands_on(0, 3),

        ((2, 1), 'L'): lands_on(2, 0),
        ((2, 1), 'R'): lands_on(2, 2),

        ((2, 2), 'U'): lands_on(1, 2),
        ((2, 2), 'L'): lands_on(2, 1),
        ((2, 2), 'R'): lands_on(2, 3),

        ((2, 3), 'U'): lands_on(1, 3),
        ((2, 3), 'L'): lands_on(2, 2),

        ((1, 2), 'U'): lands_on(0, 2),
        ((1, 2), 'D'): lands_on(2, 2),
        ((1, 2), 'R'): lands_on(1, 3),
    }

    # Fill in the attributes left empty by the constructor
    world.set(payoff, allowed, moves)

    return world

# END DEF test_standard_grid()

# Build the test environment
# (adm_actions, rewards and transition_probs are attributes of grid)
grid = test_standard_grid()


### The policy dictionary ###
# Deterministic policy: each state maps to {action: 1.0}
pi = {
    state: {action: 1.0}
    for state, action in (
        ((2, 0), 'U'),
        ((1, 0), 'U'),
        ((0, 0), 'R'),
        ((0, 1), 'R'),
        ((0, 2), 'R'),
        ((1, 2), 'U'),
        ((2, 1), 'R'),
        ((2, 2), 'U'),
        ((2, 3), 'L'),
    )
}

### Initial value function ###
# Zero for every state returned by grid.all_states()
V = {s: 0 for s in grid.all_states()}

# Discount factor and convergence threshold
gamma = 0.9
epsilon = 1e-3

# Compute V_pi
# Signature: iter_policy_eval(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)
V_pi, N_iter = iter_policy_eval(
    pi, V, grid.transition_probs, grid.rewards, grid.adm_actions,
    grid.non_term_states, grid.term_states, epsilon, gamma,
)

# Report the iteration count and show the value function obtained
print(f"The Windy GridWorld deterministic test converged after N_iter={N_iter} iterations. V_pi is:")
print_values(V_pi, grid)

The Windy GridWorld deterministic test converged after N_iter=6 iterations. V_pi is:
## VALUE FUNCTION ##
------------------------
 0.81| 0.90| 1.00| 0.00|
------------------------
 0.73| 0.00| 0.90| 0.00|
------------------------
 0.66| 0.73| 0.81| 0.73|
------------------------


## Appendix B: Some bug documentation

**22/04/21 - 19:20**

I got the following results:

            Windy GridWorld environment with penalty = 0.0 created ...
            Executing policy iteration algorithm ...
            Printing optimal value function ...
            ## VALUE FUNCTION ##
            ------------------------
             0.00| 0.00| 0.00| 0.00|
            ------------------------
             0.00| 0.00| 0.00| 0.00|
            ------------------------
             0.00| 0.00| 0.00|-1.00|
            ------------------------
            Printing optimal policy ...
            ##  POLICY  ##
            ------------------------
              D  |  L  |  R  |
            ------------------------
              D  |     |  D  |
            ------------------------
              U  |  L  |  U  |  L  |
            ------------------------
            _____________________________________________
            Windy GridWorld environment with penalty = -0.1 created ...
            Executing policy iteration algorithm ...
            Printing optimal value function ...
            ## VALUE FUNCTION ##
            ------------------------
            -0.99|-0.99|-0.99| 0.00|
            ------------------------
            -0.99| 0.00|-1.00| 0.00|
            ------------------------
            -0.99|-0.99|-0.99|-0.99|
            ------------------------
            Printing optimal policy ...
            ##  POLICY  ##
            ------------------------
              D  |  L  |  R  |
            ------------------------
              D  |     |  D  |
            ------------------------
              U  |  L  |  R  |  L  |
            ------------------------
            _____________________________________________
            Windy GridWorld environment with penalty = -0.2 created ...
            Executing policy iteration algorithm ...
            Printing optimal value function ...
            ## VALUE FUNCTION ##
            ------------------------
            -1.99|-1.99|-1.99| 0.00|
            ------------------------
            -1.99| 0.00|-1.99| 0.00|
            ------------------------
            -1.99|-1.99|-1.99|-1.99|
            ------------------------
            Printing optimal policy ...
            ##  POLICY  ##
            ------------------------
              D  |  L  |  R  |
            ------------------------
              D  |     |  R  |
            ------------------------
              U  |  L  |  U  |  U  |
            ------------------------
            _____________________________________________
            Windy GridWorld environment with penalty = -0.4 created ...
            Executing policy iteration algorithm ...
            Printing optimal value function ...
            ## VALUE FUNCTION ##
            ------------------------
            -3.99|-3.99|-1.73| 0.00|
            ------------------------
            -3.99| 0.00|-1.48| 0.00|
            ------------------------
            -3.99|-3.99|-1.73|-1.96|
            ------------------------
            Printing optimal policy ...
            ##  POLICY  ##
            ------------------------
              D  |  R  |  R  |
            ------------------------
              D  |     |  R  |
            ------------------------
              U  |  R  |  U  |  U  |
            ------------------------
            _____________________________________________
            Windy GridWorld environment with penalty = -0.5 created ...
            Executing policy iteration algorithm ...
            Printing optimal value function ...
            ## VALUE FUNCTION ##
            ------------------------
            -0.14| 0.40| 1.00| 0.00|
            ------------------------
            -4.99| 0.00|-1.76| 0.00|
            ------------------------
            -4.99|-4.99|-1.40|-1.00|
            ------------------------
            Printing optimal policy ...
            ##  POLICY  ##
            ------------------------
              R  |  R  |  R  |
            ------------------------
              U  |     |  U  |
            ------------------------
              U  |  R  |  R  |  U  |
            ------------------------
            _____________________________________________
            Windy GridWorld environment with penalty = -2 created ...
            Executing policy iteration algorithm ...
            Printing optimal value function ...
            ## VALUE FUNCTION ##
            ------------------------
            -19.99|-19.99| 1.00| 0.00|
            ------------------------
            -19.99| 0.00|-19.99| 0.00|
            ------------------------
            -19.99|-19.99|-19.99|-19.99|
            ------------------------
            Printing optimal policy ...
            ##  POLICY  ##
            ------------------------
              D  |  R  |  R  |
            ------------------------
              D  |     |  R  |
            ------------------------
              U  |  L  |  U  |  U  |
            ------------------------
            _____________________________________________
            ​
            ​

#### Backup...

In [34]:
########################
### POLICY ITERATION ###
########################
# Inlined (debug) version of Policy_Iteration(): alternates policy evaluation
# and greedy policy improvement until the policy stops changing or the value
# function changes by at most epsilon between two sweeps.
#
# NOTE: this cell reads Pi, V_old, P_trans, Rwds, adm_actions, non_term_states,
# term_states, epsilon and gamma from the kernel namespace; they must exist
# before it is run (V_old plays the role of V_ini). Pi is updated in place.

print(f"Executing policy iteration algorithm ...")
# SIGNATURE: Policy_Iteration(Pi, V_ini, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)
# Function-call equivalent of this cell:
#(V_star, Pi_star, N_iter) = Policy_Iteration(Pi, V_ini, grid.transition_probs, grid.rewards, grid.adm_actions, \
#                                            grid.non_term_states, grid.term_states, epsilon, gamma)

# Policy-iteration sweep counter
N_iter = 0

# Loop until the policy is stable or the value function has stopped moving
while True:
    #######################
    ## POLICY EVALUATION ##
    #######################

    # Evaluate the current policy, starting from the previous value function
    V_new, k = iter_policy_eval(Pi, V_old, P_trans, Rwds, adm_actions, non_term_states, term_states, epsilon, gamma)
    # DEBUG:
    print(f"Policy evaluation fn iter_policy_eval() converged after {k} iterations.")

    ###########################################
    ## POLICY IMPROVEMENT - improve_policy() ##
    ###########################################
    # DEBUG/REMINDER This block normally starts with the following line
    # def improve_policy(Pi, V_pi, P_trans, Rwds, adm_actions, non_term_states, term_states, gamma):

    # Assume stability; any state whose greedy action changes clears the flag
    policy_is_stable = True

    for s in non_term_states:

        # Action currently stored for s (first key of Pi[s]).
        # NOTE(review): if Pi[s] holds several actions (e.g. a stochastic
        # initial policy) only the first key is compared - confirm intended.
        a_old = list(Pi[s].keys())[0]

        # Q_pi(s,a) for every admissible action, backed up from the value
        # function just evaluated. FIX: this used to read V_pi, a name this
        # cell never assigns (stale kernel variable or NameError).
        Q_pi_s = {}
        for a in adm_actions[s]:
            Q_pi_s[a] = sum(
                p * (Rwds.get(s_next, 0) + gamma * V_new[s_next])
                for s_next, p in P_trans[(s, a)].items()
            )
        # END FOR over a

        # Greedy action: argmax over the dictionary keys
        a_new = max(Q_pi_s, key=Q_pi_s.get)

        # Make the policy deterministic and greedy at s
        Pi[s] = {a_new: 1.0}

        # One changed action is enough to require another sweep
        if a_old != a_new:
            policy_is_stable = False

    # END FOR s in non_term_states

    # DEBUG/REMINDER: This block ends with
    #return Pi, policy_is_stable

    ###########################################
    ## END OF improve_policy()               ##
    ###########################################

    # Update policy iteration counter
    N_iter += 1

    # FIX: compare the value functions once per sweep, AFTER the loop over
    # states. Inside that loop, V_old was rebound to V_new on the first state,
    # so every later comparison received the same object twice and the
    # delta_V <= epsilon test below ended the while loop after one sweep.
    delta_V = compare_value_fns(V_old, V_new, non_term_states)
    # Next evaluation starts from the value function just computed
    V_old = V_new

    # DEBUG/REMINDER: In function, should finish with
    #return V_pi, Pi, N_iter

    # BREAK WHILE condition
    if policy_is_stable or delta_V <= epsilon:
        break

# END WHILE

Executing policy iteration algorithm ...
Policy evaluation fn iter_policy_eval() converged after 2 iterations.
