# n-step SARSA Control

An implementation of "n-step SARSA" for control using a gridworld.

More information about "n-step SARSA" can be found in Section 7.2 of "Reinforcement Learning: An Introduction" (2nd edition) by Sutton and Barto.


The gridworld has the shape (3,4) with a winning state "w" (0,3), a losing state "l" (1,3), a non-valid state "x" (1,1) and a start state "s" (2,0)

|  |  |  |  |
|---|---|---|---|
|  |  |  | w |
|  | x |  | l |
| s |  |  |  |

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

import grid_world

### Discount factor and step size

In [2]:

# Discount factor (gamma).
# NOTE(review): the n_step_SARSA call further down passes literal values
# instead of these constants -- confirm which values are intended.
GAMMA = 0.9
# Step size (alpha).
ALPHA =0.1

### Auxiliary function to display a table of per-state values (e.g. the grid rewards or the learned state values)

In [3]:
def print_values(V, grid):
    """Print the values stored in ``V`` as a text table laid out like the grid.

    Args:
        V: mapping from an ``(i, j)`` state tuple to a number; states that
            are missing from the mapping are shown as 0.
        grid: object exposing ``width`` and ``height``; ``i`` runs over
            ``range(grid.width)`` (one printed line each) and ``j`` over
            ``range(grid.height)`` (one cell each).
    """
    for row in range(grid.width):
        print("--------------------------")
        cells = []
        for col in range(grid.height):
            value = V.get((row, col), 0)
            # Non-negative numbers get a leading space so that they line up
            # with negative ones, whose minus sign takes that column.
            template = " %.2f|" if value >= 0 else "%.2f|"
            cells.append(template % value)
        print("".join(cells))

### Auxiliary function to display a stochastic policy

In [4]:
def print_policy(P, grid):
    """Print the action chosen by policy ``P`` in every cell of the grid.

    Args:
        P: mapping from an ``(i, j)`` state tuple to either an action label
            or a dict keyed by action labels; for a dict, the first key is
            the one displayed. States missing from ``P`` are left blank.
        grid: object exposing ``width`` and ``height``; ``i`` runs over
            ``range(grid.width)`` (one printed line each) and ``j`` over
            ``range(grid.height)`` (one cell each).
    """
    for row in range(grid.width):
        print("---------------------------")
        cells = []
        for col in range(grid.height):
            entry = P.get((row, col), ' ')
            label = list(entry)[0] if isinstance(entry, dict) else entry
            cells.append("  %s  |" % label)
        print("".join(cells))

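### From our grid world module, load the standard grid and retrieve all of its states and actions
Note: the standard grid gives zero reward for non-terminal steps (see the printed rewards below). A negative grid, which penalizes every step, could be used instead to encourage the agent to find a shortest path to the goal.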

In [5]:
# Build the standard grid world and keep a handle on its state collection.
grid = grid_world.Grid.standard_grid()
states = grid.all_states()

# Distinct action labels found across every state's action tuple. They pass
# through a set, so the resulting list order is not guaranteed to be stable
# from one interpreter run to the next.
actions = list({move for moves in grid.actions.values() for move in moves})

In [6]:
def argmax_dict(dictionary):
    """Return ``(key, value)`` for the entry of ``dictionary`` with the largest value.

    Used to pick the greedy action out of a ``Q[state]`` dict. The comparison
    is strict, so on ties the first entry in iteration order wins. When no
    value is greater than negative infinity (for instance with an empty
    dict) the result is ``(None, float("-inf"))``.
    """
    best = (None, float("-inf"))
    for key, value in dictionary.items():
        if value > best[1]:
            best = (key, value)
    return best


# Quick sanity check of the helper.
argmax_dict({"a":1,"b":2})

('b', 2)

In [7]:
# Display the list of distinct actions collected from the grid.
actions

['L', 'D', 'U', 'R']

In [8]:
def epsilon_greedy_action(Q, state, epsilon=0.1):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        Q: mapping ``state -> {action: value}``.
        state: key into ``Q`` for the state being acted in.
        epsilon: threshold compared against one uniform random draw; when
            the draw falls below it, a random action is taken.

    Returns:
        A random element of the module-level ``actions`` list when exploring,
        otherwise the key with the largest value in ``Q[state]`` as reported
        by ``argmax_dict``.
    """
    # A single uniform draw decides between exploring and exploiting.
    if np.random.random() < epsilon:
        # explore
        return np.random.choice(actions)

    # exploit
    greedy_action, _ = argmax_dict(Q[state])
    return greedy_action

In [9]:
# Show the reward attached to each cell of the grid.
print("Rewards of grid")
print_values(grid.rewards,grid)

Rewards of grid
--------------------------
 0.00| 0.00| 0.00| 1.00|
--------------------------
 0.00| 0.00| 0.00|-1.00|
--------------------------
 0.00| 0.00| 0.00| 0.00|


### Initialize  policy

In [10]:
# Initial deterministic policy: one action label per state listed here.
# n_step_SARSA later overwrites these entries with the greedy actions it learns.
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}

print_policy(policy, grid)

---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |


In [11]:
# Show the grid rewards again (same output as the earlier rewards cell).
print("Rewards of grid")
print_values(grid.rewards,grid)

Rewards of grid
--------------------------
 0.00| 0.00| 0.00| 1.00|
--------------------------
 0.00| 0.00| 0.00|-1.00|
--------------------------
 0.00| 0.00| 0.00| 0.00|


In [13]:
def n_step_SARSA(policy, episodes, gamma=1, alpha=0.9, n=2):
    """Learn action values with n-step SARSA and extract a greedy policy.

    Relies on the module-level ``grid`` and ``actions`` objects and on the
    ``epsilon_greedy_action`` and ``argmax_dict`` helpers. Every episode
    starts in state ``(2, 0)`` and acts epsilon-greedily with respect to the
    current ``Q``, using ``epsilon = 1 / episode``.

    Args:
        policy: dict mapping state -> action. It does not influence the
            behaviour during learning; its keys select the states for which
            a greedy action is extracted at the end. It is updated in place
            and also returned.
        episodes: number of episodes to run.
        gamma: discount factor.
        alpha: step size of the Q update.
        n: number of rewards accumulated before bootstrapping from ``Q``.

    Returns:
        Tuple ``(Q, V, policy)`` where ``Q[state][action]`` holds the learned
        action values, ``V[state]`` the value of the greedy action for each
        key of ``policy``, and ``policy`` the greedy policy.
    """
    # Every (state, action) pair starts at 0.
    Q = {state: {action: 0 for action in actions} for state in grid.all_states()}
    V = defaultdict()  # no default factory, so it behaves like a plain dict

    for episode in range(1, episodes + 1):
        epsilon = 1 / episode  # exploration shrinks as episodes go by

        # Trajectory of the current episode, indexed by time step.
        visited = {0: (2, 0)}
        chosen = {}
        rewards = {}

        grid.set_state(visited[0])
        action = epsilon_greedy_action(Q, visited[0], epsilon)
        chosen[0] = action

        # The book's pseudocode starts with T = infinity. A large finite
        # value is used here, which also stops the agent from moving once
        # t reaches it without having hit a terminal state.
        T = 100000
        t = 0

        while True:
            if t < T:
                # Take A_t, then record R_{t+1} and S_{t+1}.
                rewards[t + 1] = grid.move(action)
                visited[t + 1] = grid.current_state()

                if grid.game_over():
                    # S_{t+1} is terminal: the episode length is now known.
                    T = t + 1
                else:
                    action = epsilon_greedy_action(Q, visited[t + 1], epsilon)
                    chosen[t + 1] = action

            # "tau" in the book: the time step whose estimate is updated now.
            tau = t - n + 1

            if tau >= 0:  # nothing to update until n steps have been observed
                # Discounted sum of R_{tau+1} .. R_{min(tau+n, T)}.
                G = 0
                for k in range(min(tau + n, T) - tau):
                    G += (gamma ** k) * rewards[tau + 1 + k]

                # Bootstrap from Q only when the n-step window stops before
                # the end of the episode.
                if tau + n < T:
                    G = G + (gamma ** n) * Q[visited[tau + n]][chosen[tau + n]]

                action_values = Q[visited[tau]]
                old_estimate = action_values[chosen[tau]]
                action_values[chosen[tau]] = old_estimate + alpha * (G - old_estimate)

            t += 1

            # The last update of an episode is the one for tau = T - 1.
            if tau == T - 1:
                break

    # Greedy policy and state values for the states listed in ``policy``.
    for state in policy:
        best_action, _ = argmax_dict(Q[state])

        if best_action is not None:
            policy[state] = best_action
            V[state] = Q[state][best_action]

    return Q, V, policy

# Train for 500 episodes with gamma=0.9, alpha=0.25 and n=5; `policy` is
# rebound to the greedy policy returned by the call.
Q,V,policy = n_step_SARSA(policy,500,0.9,0.25,5)

In [14]:
# Display the learned greedy policy as a raw dict.
policy

{(0, 0): 'R',
 (0, 1): 'R',
 (0, 2): 'R',
 (1, 0): 'U',
 (1, 2): 'D',
 (2, 0): 'U',
 (2, 1): 'L',
 (2, 2): 'L',
 (2, 3): 'L'}

In [15]:
# Show the learned greedy policy laid out like the grid.
print("Policy")
print_policy(policy,grid)

Policy
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  D  |     |
---------------------------
  U  |  L  |  L  |  L  |


In [16]:
# Show the value of the greedy action in each state; states without an
# entry in V are printed as 0.00.
print_values(grid=grid,V=V)

--------------------------
 0.81| 0.90| 1.00| 0.00|
--------------------------
 0.73| 0.00| 0.08| 0.00|
--------------------------
 0.66| 0.20| 0.10| 0.00|


## Conclusions
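* n-step SARSA speeds up policy learning because each observed reward is propagated back through the last "n" state-action pairs instead of only the most recent one.
* It is similar to n-step TD prediction combined with SARSA: the update target uses "n" steps of actual rewards (and actions) before bootstrapping from Q.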