# Temporal Difference Control: Double Q-learning

An implementation of "Double Q-learning"  using a gridworld.

More info about "Double Q-learning" can be found in section 6.7 of "Reinforcement Learning: An Introduction", 2nd edition, by Sutton and Barto


The gridworld has shape (3,4), with a winning state "w" at (0,3), a losing state "l" at (1,3), a non-valid state "x" at (1,1) and a start state "s" at (2,0)

|  |  |  |  |
|---|---|---|---|
|  |  |  | w |
|  | x |  | l |
| s |  |  |  |

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

import grid_world

### Discount factor and step size

In [2]:

# Discount factor; passed as `gamma` to double_Q_learning.
GAMMA = 0.9
# Step size (learning rate); passed as `alpha` to double_Q_learning.
ALPHA =0.1

### Auxiliary function to display per-state values (e.g. the grid rewards or a learned value function) on the grid

In [3]:
def print_values(V,grid):
    """Print one number per grid cell, formatted with two decimals.

    V is looked up with ``V.get((i, j), 0)``, so cells without an entry
    are shown as 0.00. The outer loop runs over ``grid.width`` and the
    inner loop over ``grid.height``, so ``grid.width`` is the number of
    printed rows.
    """
    for row in range(grid.width):
        print("--------------------------")
        for col in range(grid.height):
            value = V.get((row, col), 0)
            # Non-negative numbers get a leading space so they line up
            # with negative ones, whose minus sign takes that column.
            template = " %.2f|" if value >= 0 else "%.2f|"
            print(template % value, end="")
        print("")

### Auxiliary function to display a stochastic policy

In [4]:
def print_policy(P,grid):
    """Print the action stored for each grid cell.

    P is looked up with ``P.get((i, j), ' ')``, so cells without an
    entry are shown blank. When an entry is itself a dict, only its
    first key is displayed.
    """
    for row in range(grid.width):
        print("---------------------------")
        for col in range(grid.height):
            entry = P.get((row, col), ' ')
            # A dict entry is reduced to its first key for display.
            symbol = list(entry)[0] if isinstance(entry, dict) else entry
            print("  %s  |" % symbol, end="")
        print("")

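### From our grid world file, create the standard grid, retrieve all states and actions, and print the grid rewards
A negative grid would encourage the agent to find the shortest path to the goal; this notebook uses the standard grid, whose only non-zero rewards are at the winning and losing states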

In [5]:
# Build the standard grid from the local grid_world module.
grid = grid_world.Grid.standard_grid()
states = grid.all_states()
# Every distinct action found in any state's action tuple. The list order
# is whatever order the set iterates in.
actions = list({action for action_tup in grid.actions.values() for action in action_tup})

In [6]:
def argmax_dict(dictionary):
    """Return ``(key, value)`` for the largest value in ``dictionary``.

    Used for policy improvement from Q. Ties go to the key seen first,
    because the comparison is strict. If no value is greater than
    negative infinity (for example an empty dictionary), the result is
    ``(None, float("-inf"))``.
    """
    best_key, best_value = None, float("-inf")

    for key in dictionary:
        candidate = dictionary[key]
        if candidate > best_value:
            best_key, best_value = key, candidate

    return best_key, best_value
        
argmax_dict({"a":1,"b":2})

('b', 2)

In [7]:
actions

['U', 'L', 'R', 'D']

In [8]:
def epsilon_greedy_action(Q,state,epsilon=0.1):
    """Choose an action for ``state`` with an epsilon-greedy strategy.

    With probability ``epsilon`` an action is drawn uniformly from the
    module-level ``actions`` list (explore); otherwise the action with
    the highest value in ``Q[state]`` is returned (exploit).
    """
    # Explore: one uniform draw decides the branch, a second picks the action.
    if np.random.random() < epsilon:
        return np.random.choice(actions)

    # Exploit: take the greedy action according to Q.
    greedy_action, _ = argmax_dict(Q[state])
    return greedy_action

In [9]:
# Show the reward stored for each grid cell; cells without an entry in
# grid.rewards are printed as 0.00 by print_values.
print("Rewards of grid")
print_values(grid.rewards,grid)

Rewards of grid
--------------------------
 0.00| 0.00| 0.00| 1.00|
--------------------------
 0.00| 0.00| 0.00|-1.00|
--------------------------
 0.00| 0.00| 0.00| 0.00|


### Initialize  policy

In [10]:
# Initial policy: one action letter per (i, j) state key. Cells that have
# no key here are printed blank by print_policy. double_Q_learning later
# overwrites these entries in place with the greedy actions it learns.
policy = {(2,0):'U',
         (1,0):'U',
         (0,0):'R',
         (0,1):'R',
         (0,2):'R',
         (1,2):'R',
         (2,1):'R',
         (2,2):'R',
         (2,3):'U'}

# Display the initial policy on the grid.
print_policy(policy,grid)

---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |


In [11]:
def double_Q_learning(grid,policy,episodes,gamma =1,alpha=1,epsilon=0.5,start_state=(2,0)):
    """Learn action values with tabular Double Q-learning.

    Two independent estimates, Q1 and Q2, are kept. On every step one of
    them is picked at random and updated: the picked table selects the
    greedy action in the next state, and the other table supplies that
    action's value for the target. The behaviour policy is epsilon-greedy
    with respect to the average of the two tables.

    Relies on the module-level ``actions`` list and on the helpers
    ``epsilon_greedy_action`` and ``argmax_dict``.

    Parameters
    ----------
    grid : environment exposing ``all_states()``, ``set_state(s)``,
        ``move(a)`` (its return value is used as the reward),
        ``current_state()`` and ``game_over()``.
    policy : dict mapping state -> action. Updated IN PLACE with the greedy
        action for each of its keys, and also returned.
    episodes : number of episodes to run.
    gamma : discount factor.
    alpha : step size.
    epsilon : exploration probability of the behaviour policy.
    start_state : state every episode starts from.

    Returns
    -------
    (policy, Q) where Q[state][action] is the average of Q1 and Q2.
    """
    all_states = grid.all_states()

    # Double learning needs twice the memory: two estimates of Q, plus their
    # average, which drives the epsilon-greedy action selection. Starting at
    # 0.0 keeps every entry a float, including ones that are never visited.
    Q1 = {state: {action: 0.0 for action in actions} for state in all_states}
    Q2 = {state: {action: 0.0 for action in actions} for state in all_states}
    Q = {state: {action: 0.0 for action in actions} for state in all_states}

    for _ in range(episodes):
        s = start_state
        grid.set_state(s)
        finished = False

        while not finished:
            a = epsilon_greedy_action(Q,s,epsilon)
            r = grid.move(a)
            s1 = grid.current_state()

            # Double learning update: randomly choose which table to update,
            # using the other one to evaluate the selected greedy action.
            if np.random.random() < 0.5:
                learner, evaluator = Q1, Q2
            else:
                learner, evaluator = Q2, Q1

            best_next_action = argmax_dict(learner[s1])[0]
            target = r + gamma*evaluator[s1][best_next_action]
            learner[s][a] = learner[s][a] + alpha*(target - learner[s][a])

            # Only (s, a) changed in Q1/Q2, so only that entry of the
            # combined table needs refreshing.
            Q[s][a] = (Q1[s][a] + Q2[s][a])/2

            # Other TD control updates, left for comparison:
            #   expected sarsa: Q[s][a] += alpha*(r + gamma*expected_Qs1 - Q[s][a])
            #   Q learning:     Q[s][a] += alpha*(r + gamma*max_a' Q[s1][a'] - Q[s][a])
            #   sarsa:          Q[s][a] += alpha*(r + gamma*Q[s1][a1] - Q[s][a])

            finished = grid.game_over()
            s = s1

    # Policy improvement: make the policy greedy with respect to the combined Q.
    for s in policy:
        greedy_action = argmax_dict(Q[s])[0]

        if greedy_action is not None:
            policy[s] = greedy_action

    return policy,Q

# Train for 100 episodes with the discount factor and step size defined
# above; the policy dict is replaced by the greedy policy that is returned.
policy,Q = double_Q_learning(grid,policy,100,GAMMA,ALPHA)

In [12]:
policy

{(0, 0): 'R',
 (0, 1): 'R',
 (0, 2): 'R',
 (1, 0): 'U',
 (1, 2): 'U',
 (2, 0): 'U',
 (2, 1): 'R',
 (2, 2): 'U',
 (2, 3): 'L'}

In [13]:
# Display the policy returned by double_Q_learning on the grid.
print("Policy")
print_policy(policy,grid)

Policy
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  U  |  R  |  U  |  L  |


In [14]:
# For each state in the policy, take the Q value of the policy's action
# as that state's value. States never assigned default to 0.
V = defaultdict(int)
for state, action in policy.items():
    V[state] = Q[state][action]
    # The header line is printed once per state, above each row.
    print("state  |policy action| state value")
    print(state,"|      ",action , "    |", V[state] )
    

state  |policy action| state value
(2, 0) |       U     | 0.349539822247958
state  |policy action| state value
(1, 0) |       U     | 0.5284008771870028
state  |policy action| state value
(0, 0) |       R     | 0.7194453806022407
state  |policy action| state value
(0, 1) |       R     | 0.866922356031626
state  |policy action| state value
(0, 2) |       R     | 0.9924546754953881
state  |policy action| state value
(1, 2) |       U     | 0.6013999512870056
state  |policy action| state value
(2, 1) |       R     | 0.04121736874435202
state  |policy action| state value
(2, 2) |       U     | 0.1978937221361523
state  |policy action| state value
(2, 3) |       L     | 0.006862111536491047


In [15]:
# Display V on the grid; cells with no entry in V are printed as 0.00.
print_values(grid=grid,V=V)

--------------------------
 0.72| 0.87| 0.99| 0.00|
--------------------------
 0.53| 0.00| 0.60| 0.00|
--------------------------
 0.35| 0.04| 0.20| 0.01|


## Conclusions
* Double Q-learning helps avoid the "maximization bias" that Q-learning suffers from because it uses the maximum of its value estimates as the estimate of the maximum value
* It allows learning action values Q that are not positively biased
* It does so by keeping two Q estimates and, at each step, randomly choosing one to update using the other
* There are also double Sarsa and double Expected Sarsa methods