# Maximization Bias
- States: Terminal 1 <-> B <-> A <-> Terminal 2 
- Actions: Left, Right
- Rewards: 0 on every transition, except that reaching Terminal 1 yields a reward drawn from a normal distribution with mean -0.1 and variance 1.0

In [46]:
import random

In [47]:
# Epsilon: exploration probability; passed to double_Q_learning, which forwards it to e_greedy
epsilon = 0.1

# Episodes: number of training episodes passed to double_Q_learning
episodes = 300



In [48]:
def e_greedy(Q1, Q2, policy, state, epsilon):
    """
    Return an action chosen e-greedily with respect to Q1 + Q2

    Double Q-learning behaves greedily on the *sum* of its two estimates.
    Picking the greedy action from whichever table currently has the larger
    maximum (the previous rule) takes a max over two noisy estimates and so
    pulls the behaviour policy towards over-estimated actions.

    Args:
        Q1: mapping state -> list of action values (first estimate).
        Q2: mapping state -> list of action values (second estimate).
        policy: mapping state -> list of available actions, index-aligned
            with the value lists in Q1 and Q2.
        state: the current state (a key of Q1, Q2 and policy).
        epsilon: probability of taking a uniformly random action.

    Returns:
        One element of policy[state].
    """

    # Exploration: with probability epsilon ignore the estimates entirely
    if random.random() < epsilon:

        # Explorative action
        return random.choice(policy[state])

    # Exploitation: combine both estimates action by action
    combined = [q1 + q2 for q1, q2 in zip(Q1[state], Q2[state])]

    # Arg max a of Q1 + Q2 (ties resolve to the first action, as list.index does)
    argmax = combined.index(max(combined))

    # Greedy action
    return policy[state][argmax]

In [49]:
def move(state, action):
    """
    Return the environment observation (S', R) for taking `action` in `state`

    The next state is state + action. Arriving in state 0 (terminal one)
    pays a reward sampled from a normal distribution with mean -0.1 and
    standard deviation 1.0; every other arrival pays 0.
    """

    # S' is the current state shifted by the action
    next_state = state + action

    # R: Gaussian sample only when terminal one is reached, otherwise 0
    reward = random.gauss(-0.1, 1.0) if next_state == 0 else 0

    # S', R
    return next_state, reward

In [50]:
def double_Q_learning(episodes, epsilon, alpha=0.1, gamma=1.0):
    """
    Return the two action-value tables learned by Double Q-learning (e-greedy)

    The environment is a four-state chain: 0 and 3 are terminal, 1 and 2 are
    non-terminal, and every episode starts in state 2. Actions are -1 (left)
    and +1 (right).

    Args:
        episodes: number of episodes to run.
        epsilon: exploration probability forwarded to e_greedy.
        alpha: step size in (0, 1]; the default is the value that used to be
            hard-coded.
        gamma: discount factor; the default 1.0 reproduces the original
            undiscounted update exactly.

    Returns:
        (Q1, Q2): two lists of [left, right] value pairs, ordered by state
        0, 1, 2, 3.
    """

    ###########################################
    # Double Q-Learning for estimating Q = q* #
    ###########################################

    # Available actions per state (terminals only carry placeholders) and terminal states
    policy = {0: [0, 0], 1: [-1, 1], 2: [-1, 1], 3: [0, 0]}
    terminal = [0, 3]

    # Initialize Q1(s, a) and Q2(s, a) to 0 for every state, so Q(terminal, .) = 0
    Q1 = {state: [0, 0] for state in policy}
    Q2 = {state: [0, 0] for state in policy}

    # Loop for each episode
    for _ in range(episodes):

        # Initialize S
        S = 2

        # Loop for each step of episode, until S is terminal
        while S not in terminal:

            # Choose A from S using the policy derived from Q1 & Q2 (e-greedy)
            A = e_greedy(Q1, Q2, policy, S, epsilon)

            # Take action A, observe R, S'
            S_prime, R = move(S, A)

            # Position of A inside the per-state value lists
            a = policy[S].index(A)

            # With 0.5 probability update Q1 using Q2 as evaluator, otherwise the reverse
            if random.choice((0, 1)) == 1:
                learner, evaluator = Q1, Q2
            else:
                learner, evaluator = Q2, Q1

            # The table being updated selects the best next action ...
            best = learner[S_prime].index(max(learner[S_prime]))

            # ... and the other table supplies its value, decoupling selection from evaluation
            target = R + gamma * evaluator[S_prime][best]

            # Q(S, A) <-- Q(S, A) + alpha * [target - Q(S, A)]
            learner[S][a] = learner[S][a] + alpha * (target - learner[S][a])

            # S <-- S'
            S = S_prime

    # Output both estimates of q*
    return list(Q1.values()), list(Q2.values())

In [51]:
# Train with the notebook-level settings, then print both tables separated by a blank line
Q1, Q2 = double_Q_learning(episodes, epsilon)
print(Q1, Q2, sep="\n\n")

[[0, 0], [-0.1872525738735224, 0.0], [-0.017257507212251496, 0.0], [0, 0]]

[[0, 0], [-0.26303165999468825, 0.0], [-0.004797619023098102, 0.0], [0, 0]]
