# averaging n-step return

An implementation of "averaging n-step returns" using a gridworld.

Based on: https://youtu.be/PnHCvfgC_ZA?t=5005 

More information about this can be found in "Reinforcement Learning: An Introduction", 2nd edition, by Sutton and Barto.


The gridworld has shape (3, 4), with a winning state "w" at (0, 3), a losing state "l" at (1, 3), a non-valid (wall) state "x" at (1, 1) and a start state "s" at (2, 0).

|  |  |  |  |
|---|---|---|---|
|  |  |  | w |
|  | x |  | l |
| s |  |  |  |

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import pixiedust

import grid_world

Pixiedust database opened successfully


### Discount factor and step size

In [2]:

GAMMA = 0.9  # discount factor
ALPHA = 0.1  # step size

### Auxiliary function to display the values of a policy after finishing iterative policy evaluation

In [3]:
def print_values(V, grid):
    """Print the numbers stored in ``V`` as a text table.

    ``V`` is looked up with ``V.get((i, j), 0)``, so cells without an entry
    are shown as 0.  ``i`` runs over ``range(grid.width)`` (one printed line
    per value of ``i``) and ``j`` over ``range(grid.height)``.
    """
    for row in range(grid.width):
        print("--------------------------")
        cells = []
        for col in range(grid.height):
            value = V.get((row, col), 0)
            # Non-negative numbers get a leading space so that they line up
            # with the minus sign of negative ones.
            template = " %.2f|" if value >= 0 else "%.2f|"
            cells.append(template % value)
        print("".join(cells))

### Auxiliary function to display a stochastic policy

In [4]:
def print_policy(P, grid):
    """Print the action stored in ``P`` for every cell as a text table.

    ``P`` is looked up with ``P.get((i, j), ' ')``, so cells without an entry
    are left blank.  When the entry is a dict, only its first key is shown.
    ``i`` runs over ``range(grid.width)`` (one printed line per value of
    ``i``) and ``j`` over ``range(grid.height)``.
    """
    for row in range(grid.width):
        print("---------------------------")
        line = ""
        for col in range(grid.height):
            action = P.get((row, col), ' ')
            if isinstance(action, dict):
                # Show only the first key of a dict-valued entry.
                action = list(action)[0]
            line += "  %s  |" % action
        print(line)

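### From our grid world module, load the standard grid, retrieve all states and actions, and print the grid rewards
A negative grid (a small penalty on every step) would encourage the agent to find the shortest path to the goal; the standard grid loaded here has non-zero rewards only at the winning and losing states.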

In [5]:
# Build the standard grid and list its states.
grid = grid_world.Grid.standard_grid()
states = grid.all_states()
# Flatten the per-entry action collections of grid.actions into one list of
# distinct actions (the order is whatever the set yields).
actions = list({action for allowed in grid.actions.values() for action in allowed})

In [6]:
# Notebook display of the distinct actions collected from grid.actions.
actions

['U', 'L', 'R', 'D']

In [7]:
# Show the reward attached to each cell; cells without a reward print as 0.
print("Rewards of grid")
print_values(V=grid.rewards, grid=grid)

Rewards of grid
--------------------------
 0.00| 0.00| 0.00| 1.00|
--------------------------
 0.00| 0.00| 0.00|-1.00|
--------------------------
 0.00| 0.00| 0.00| 0.00|


### Initialize  policy

In [8]:
# Deterministic policy: one action per state.  Cells without an entry
# ((0, 3), (1, 1) and (1, 3)) are printed blank by print_policy.
policy = {cell: 'U' for cell in [(2, 0), (1, 0)]}
policy.update({cell: 'R' for cell in [(0, 0), (0, 1), (0, 2), (1, 2), (2, 1), (2, 2)]})
policy[(2, 3)] = 'U'

# Render the policy as a table, one row of the grid per line.
print_policy(P=policy, grid=grid)

---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |


In [24]:
#%%pixie_debugger
def average_n_step_returns(policy, episodes, gamma=1, alpha=0.9, steps_to_average=(2, 3, 4), env=None):
    """Evaluate ``policy`` with TD updates towards an average of n-step returns.

    Every episode starts in state (2, 0) and follows ``policy`` until the
    environment reports that the game is over.  The state visited at time
    ``tau`` is moved towards the plain average of its n-step returns, one
    return per entry of ``steps_to_average``.

    Known limitation (kept on purpose): a state is updated only when the
    longest horizon fits inside the episode (``tau + n <= T`` with
    ``n = max(steps_to_average)``).  States closer than ``n`` steps to the end
    of the episode are never updated and keep the value 0.

    Args:
        policy: mapping from state to the action taken in that state.
        episodes: number of episodes to run.
        gamma: discount factor.
        alpha: step size of the value update.
        steps_to_average: the n values whose n-step returns are averaged.
            Must not be empty (``max`` raises ValueError otherwise).
        env: object providing ``set_state``, ``move`` (returns the reward),
            ``current_state`` and ``game_over``.  Defaults to the
            module-level ``grid``.

    Returns:
        defaultdict mapping state -> estimated value (0 for states that were
        never updated).
    """
    world = grid if env is None else env
    horizons = tuple(steps_to_average)
    n = max(horizons)  # updates lag n - 1 steps behind the interaction
    V = defaultdict(lambda: 0)

    for _ in range(episodes):
        s = (2, 0)
        world.set_state(s)
        T = float("inf")  # episode length; unknown until the game is over
        t = 0

        rewards = {}  # rewards[k]: reward received when moving into time k
        visited = {0: s}  # visited[k]: state occupied at time k

        finished = False
        while not finished:  # stops once tau == T - 1 (last state before the end)
            if t < T:
                r = world.move(policy[s])  # act as the policy says, observe reward
                s = world.current_state()  # observe the new state
                rewards[t + 1] = r
                visited[t + 1] = s
                if world.game_over():  # S(t+1) is terminal: the episode length is known
                    T = t + 1

            tau = t - n + 1  # time whose state is updated ("tau" in the book)

            # Update only when n steps have been taken AND the longest horizon
            # still fits inside the episode (see the limitation in the docstring).
            if tau >= 0 and tau + n <= T:
                G = 0
                for horizon in horizons:
                    # Discounted rewards of the next `horizon` steps ...
                    n_step_return = 0
                    for i in range(tau + 1, tau + horizon + 1):
                        n_step_return = n_step_return + (gamma ** (i - tau - 1)) * rewards[i]
                    # ... plus the bootstrapped value of the state reached.
                    n_step_return = n_step_return + (gamma ** horizon) * V[visited[tau + horizon]]
                    G += n_step_return
                G /= len(horizons)

                V[visited[tau]] = V[visited[tau]] + alpha * (G - V[visited[tau]])

            t += 1
            finished = tau == T - 1

    return V

# Evaluate the policy, averaging the 1- to 4-step returns.
V = average_n_step_returns(policy, episodes=10000, gamma=0.9, alpha=0.5, steps_to_average=[1, 2, 3, 4])

In [15]:
# Notebook display of the policy dictionary.
policy


{(0, 0): 'R',
 (0, 1): 'R',
 (0, 2): 'R',
 (1, 0): 'U',
 (1, 2): 'R',
 (2, 0): 'U',
 (2, 1): 'R',
 (2, 2): 'R',
 (2, 3): 'U'}

In [25]:
# Render the policy as a table under a "Policy" heading.
print("Policy")
print_policy(P=policy, grid=grid)

Policy
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |


In [26]:
# Show the estimated state values V as a table.
print_values(V, grid)

--------------------------
 0.00| 0.00| 0.00| 0.00|
--------------------------
 0.18| 0.00| 0.00| 0.00|
--------------------------
 0.04| 0.00| 0.00| 0.00|


## Conclusions
* The average of n-step returns works better than simple n-step return.
* One of the main problems in this implementation is that states close to the terminal state are never updated: if the average includes an n-step return whose n is never reached before the episode ends, neither the average nor the update is performed.
* A possible solution is to average only the returns for valid values of n. **Experiment in the next cell**

In [31]:
#%%pixie_debugger
def average_n_step_returns(policy, episodes, gamma=1, alpha=0.9, steps_to_average=(2, 3, 4), env=None):
    """Evaluate ``policy`` with TD updates towards an average of n-step returns.

    Every episode starts in state (2, 0) and follows ``policy`` until the
    environment reports that the game is over.  The state visited at time
    ``tau`` is moved towards the plain average of those n-step returns whose
    horizon still fits inside the episode (``tau + n_step <= T``).  When no
    requested horizon fits, the full discounted return up to the end of the
    episode is used as the target, so states next to the terminal state are
    updated too and the average never divides by zero.

    Args:
        policy: mapping from state to the action taken in that state.
        episodes: number of episodes to run.
        gamma: discount factor.
        alpha: step size of the value update.
        steps_to_average: the n values whose n-step returns are averaged.
            Must not be empty (``max`` raises ValueError otherwise).
        env: object providing ``set_state``, ``move`` (returns the reward),
            ``current_state`` and ``game_over``.  Defaults to the
            module-level ``grid``.

    Returns:
        defaultdict mapping state -> estimated value (0 for states that were
        never updated).
    """
    world = grid if env is None else env
    horizons = tuple(steps_to_average)
    n = max(horizons)  # updates lag n - 1 steps behind the interaction
    V = defaultdict(lambda: 0)

    for _ in range(episodes):
        s = (2, 0)
        world.set_state(s)
        T = float("inf")  # episode length; unknown until the game is over
        t = 0

        rewards = {}  # rewards[k]: reward received when moving into time k
        visited = {0: s}  # visited[k]: state occupied at time k

        finished = False
        while not finished:  # stops once tau == T - 1 (last state before the end)
            if t < T:
                r = world.move(policy[s])  # act as the policy says, observe reward
                s = world.current_state()  # observe the new state
                rewards[t + 1] = r
                visited[t + 1] = s
                if world.game_over():  # S(t+1) is terminal: the episode length is known
                    T = t + 1

            tau = t - n + 1  # time whose state is updated ("tau" in the book)

            if 0 <= tau < T:
                # Horizons that end at or before the end of the episode.
                fitting = [horizon for horizon in horizons if tau + horizon <= T]
                if fitting:
                    G = 0
                    for horizon in fitting:
                        # Discounted rewards of the next `horizon` steps ...
                        n_step_return = 0
                        for i in range(tau + 1, tau + horizon + 1):
                            n_step_return = n_step_return + (gamma ** (i - tau - 1)) * rewards[i]
                        # ... plus the bootstrapped value of the state reached.
                        n_step_return = n_step_return + (gamma ** horizon) * V[visited[tau + horizon]]
                        G += n_step_return
                    G /= len(fitting)
                else:
                    # Every requested horizon runs past the end of the episode
                    # (only possible once T is known), so each n-step return is
                    # just the full remaining return.  Use it directly instead
                    # of dividing by a count of zero.
                    G = 0
                    for i in range(tau + 1, T + 1):
                        G = G + (gamma ** (i - tau - 1)) * rewards[i]

                V[visited[tau]] = V[visited[tau]] + alpha * (G - V[visited[tau]])

            t += 1
            finished = tau == T - 1

    return V

# Evaluate the policy, averaging the 1- to 5-step returns.
V = average_n_step_returns(policy, episodes=10000, gamma=0.9, alpha=0.5, steps_to_average=[1, 2, 3, 4, 5])

In [32]:
# Show the estimated state values V as a table.
print_values(V, grid)

--------------------------
 0.81| 0.90| 1.00| 0.00|
--------------------------
 0.73| 0.00| 0.00| 0.00|
--------------------------
 0.66| 0.00| 0.00| 0.00|


In [None]:
## Conclusions
* This