# Game 4

In [1]:
import random
import matplotlib.pyplot as plt

In [2]:
# --- Environment ---
# Terminal states of the chain: [left end, right end]
terminal = [0, 5]

# --- Learning hyper-parameters ---
# Step size for the incremental updates, expected in (0, 1]
alpha = 0.1

# Discount factor applied to future rewards
gamma = 0.9

# Look-ahead length: the "n" of the n-step methods
n = 2

# --- Experiment ---
# Number of episodes (runs) each learner performs
episodes = 1

In [3]:
def n_step_td(alpha, gamma, n, terminal, episodes, verbose=True):
    """
    n-step TD prediction for estimating V = v_pi on a chain of states.

    The policy is fixed: every episode starts in state 1 and each step moves
    one state to the right (S <- S + 1). The transition that enters
    terminal[1] yields reward 1; every other transition yields reward 0.

    Parameters
    ----------
    alpha : float
        Step size used in V(S) <- V(S) + alpha * (G - V(S)).
    gamma : float
        Discount factor applied to rewards and to the bootstrapped value.
    n : int
        Number of rewards accumulated before bootstrapping; must be >= 1.
    terminal : sequence of int
        Terminal states; entering any of them ends the episode and
        terminal[1] is the rewarding one.
    episodes : int
        Number of episodes to run.
    verbose : bool, default True
        When true, print T after every environment step and the n-step
        return before bootstrapping (the original debugging trace).

    Returns
    -------
    list
        Value estimates indexed by state.

    Raises
    ------
    ValueError
        If n is smaller than 1.
    """

    # n = 0 would make tau run ahead of t and update states not yet visited
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Input: a policy pi (always move one state to the right)
    step = 1

    # Initialize V(s) for all s; at least 6 entries (the original table size),
    # more when a terminal state lies beyond index 5
    V = [0] * max(6, max(terminal, default=0) + 1)

    # Loop for each episode:
    for _ in range(episodes):

        # Initialize S0 != terminal
        S = 1

        # trajectory[k] == (S_k, R_k). It is rebuilt for every episode because
        # t and tau restart at 0, so indices must refer to this episode only.
        trajectory = [(S, 0)]

        # T <-- inf
        T = float('inf')
        t = tau = 0

        # Loop for t = 0, 1, 2,... : Until tau = T - 1
        while tau != T - 1:

            # If t < T, the episode is still running: take one more step
            if t < T:

                # Take an action according to pi(.|St)
                S += step

                # Only the transition into terminal[1] is rewarded
                R = 1 if S == terminal[1] else 0

                # Store the next reward as Rt+1 and the next state as St+1
                trajectory.append((S, R))

                # If St+1 is terminal, then T <-- t + 1
                if S in terminal:
                    T = t + 1
                if verbose:
                    print(T)

            # tau is the time whose state's estimate is being updated
            tau = t - n + 1

            if tau >= 0:

                # G <-- sum_{k=tau+1}^{min(tau+n, T)} gamma^(k-tau-1) * R_k
                G = sum([(gamma ** (k - tau - 1)) * trajectory[k][1]
                         for k in range(tau + 1, min(tau + n, T) + 1)])
                if verbose:
                    print(f"{t}: G={G} tau={tau}")

                # If tau+n < T, bootstrap: G <-- G + gamma^n * V(S_{tau+n})
                if tau + n < T:
                    G = G + gamma ** n * V[trajectory[tau + n][0]]

                # V(S_tau) <-- V(S_tau) + alpha * [G - V(S_tau)]
                state = trajectory[tau][0]
                V[state] = V[state] + alpha * (G - V[state])

            t += 1

    # Return the estimate of v_pi
    return V

In [4]:
# Run n-step TD prediction with the notebook's configuration and show V
print(n_step_td(alpha=alpha, gamma=gamma, n=n, terminal=terminal, episodes=episodes))

inf
inf
1: G=0.0 tau=0
inf
2: G=0.0 tau=1
4
3: G=0.9 tau=2
4: G=1.0 tau=3
[0, 0.0, 0.0, 0.09000000000000001, 0.1, 0]


In [5]:
def n_step_q_lol(alpha, gamma, n, terminal, episodes, verbose=True):
    """
    n-step Q-learning on a chain of states.

    Each state has two action slots, index 0 for the step -1 and index 1 for
    the step +1. The behaviour is fixed: every episode starts in state 1 and
    always takes the +1 action, so only slot 1 of each row is ever updated.
    The transition that enters terminal[1] yields reward 1; every other
    transition yields reward 0. The n-step return bootstraps from
    max_a Q(S_{tau+n}, a).

    Parameters
    ----------
    alpha : float
        Step size used in Q(S, A) <- Q(S, A) + alpha * (G - Q(S, A)).
    gamma : float
        Discount factor applied to rewards and to the bootstrapped value.
    n : int
        Number of rewards accumulated before bootstrapping; must be >= 1.
    terminal : sequence of int
        Terminal states; entering any of them ends the episode and
        terminal[1] is the rewarding one.
    episodes : int
        Number of episodes to run.
    verbose : bool, default True
        When true, print T after every environment step and the n-step
        return before bootstrapping (the original debugging trace).

    Returns
    -------
    dict
        Maps each state to its two action values [Q(s, -1), Q(s, +1)].

    Raises
    ------
    ValueError
        If n is smaller than 1.
    """

    # n = 0 would make tau run ahead of t and update states not yet visited
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Available steps; the behaviour below always uses the second one (+1)
    policy = (-1, 1)
    action = 1  # index into policy and into each Q row

    # Initialize Q(s, a) for all s, a; at least 6 states (the original table
    # size), more when a terminal state lies beyond index 5
    num_states = max(6, max(terminal, default=0) + 1)
    Q = {s: [0, 0] for s in range(num_states)}

    # Loop for each episode:
    for _ in range(episodes):

        # Initialize S0 != terminal
        S = 1

        # trajectory[k] == (S_k, R_k). It is rebuilt for every episode because
        # t and tau restart at 0, so indices must refer to this episode only.
        trajectory = [(S, 0)]

        # T <-- inf
        T = float('inf')
        t = tau = 0

        # Loop for t = 0, 1, 2,... : Until tau = T - 1
        while tau != T - 1:

            # If t < T, the episode is still running: take one more step
            if t < T:

                # Take the fixed action (+1)
                S += policy[action]

                # Only the transition into terminal[1] is rewarded
                R = 1 if S == terminal[1] else 0

                # Store the next reward as Rt+1 and the next state as St+1
                trajectory.append((S, R))

                # If St+1 is terminal, then T <-- t + 1
                if S in terminal:
                    T = t + 1
                if verbose:
                    print(T)

            # tau is the time whose state-action estimate is being updated
            tau = t - n + 1

            if tau >= 0:

                # G <-- sum_{k=tau+1}^{min(tau+n, T)} gamma^(k-tau-1) * R_k
                G = sum([(gamma ** (k - tau - 1)) * trajectory[k][1]
                         for k in range(tau + 1, min(tau + n, T) + 1)])
                if verbose:
                    print(f"{t}: G={G} tau={tau}")

                # If tau+n < T: G <-- G + gamma^n * max_a Q(S_{tau+n}, a)
                if tau + n < T:
                    G = G + gamma ** n * max(Q[trajectory[tau + n][0]])

                # Q(S_tau, A_tau) <-- Q(S_tau, A_tau) + alpha * [G - Q(S_tau, A_tau)]
                row = Q[trajectory[tau][0]]
                row[action] = row[action] + alpha * (G - row[action])

            t += 1

    # Return the action-value estimates
    return Q

In [6]:
# Run n-step Q-learning with the notebook's configuration and show Q
print(n_step_q_lol(alpha=alpha, gamma=gamma, n=n, terminal=terminal, episodes=episodes))

inf
inf
1: G=0.0 tau=0
inf
2: G=0.0 tau=1
4
3: G=0.9 tau=2
4: G=1.0 tau=3
{0: [0, 0], 1: [0, 0.0], 2: [0, 0.0], 3: [0, 0.09000000000000001], 4: [0, 0.1], 5: [0, 0]}


In [7]:
def e_greedy(Q1, Q2, policy, state, epsilon):
    """
    Pick an action for `state` with an epsilon-greedy rule over two Q tables.

    The greedy choice comes from whichever table holds the larger maximum for
    this state (Q2 wins ties); within a table the first maximal entry is used.
    With probability `epsilon` the greedy choice is replaced by a uniformly
    random entry of policy[state].

    Q1, Q2  : mappings state -> list of action values
    policy  : mapping state -> list of actions, aligned with the Q rows
    state   : key of the current state
    epsilon : exploration probability
    """

    q1_row = Q1[state]
    q2_row = Q2[state]
    best_q1 = max(q1_row)
    best_q2 = max(q2_row)

    # Greedy index: Q1 is used only when its best value strictly beats Q2's
    if best_q1 > best_q2:
        greedy_index = q1_row.index(best_q1)
    else:
        greedy_index = q2_row.index(best_q2)

    candidates = policy[state]
    greedy_action = candidates[greedy_index]

    # Explore: one uniform draw decides, a second one selects the action
    if random.random() < epsilon:
        return random.choice(candidates)

    # Exploit
    return greedy_action

In [8]:
def move(state, action):
    """
    Apply `action` to `state` and return the environment's observation.

    Returns the pair (next state, reward). Arriving in state 0 gives a reward
    drawn from a Gaussian with mean -0.1 and standard deviation 1.0; every
    other transition gives reward 0.
    """

    next_state = state + action

    # Only the arrival in state 0 is rewarded (with noise); the draw happens
    # only in that case
    reward = random.gauss(-0.1, 1.0) if next_state == 0 else 0

    return next_state, reward

In [9]:
def double_Q_learning(episode, epsilon = 0.1, alpha = 0.1, gamma = 1.0):
    """
    Double Q-learning (epsilon-greedy behaviour) for estimating Q = q*.

    The environment is a four-state chain: states 0 and 3 are terminal,
    states 1 and 2 offer the actions -1 and +1, and every episode starts in
    state 2. Actions are chosen by e_greedy and transitions and rewards come
    from move.

    Parameters
    ----------
    episode : int
        Number of episodes to run.
    epsilon : float, default 0.1
        Exploration probability handed to e_greedy.
    alpha : float, default 0.1
        Step size of the updates.
    gamma : float, default 1.0
        Discount factor applied to the bootstrapped value; 1.0 is the
        undiscounted update.

    Returns
    -------
    tuple of two lists
        The rows of Q1 and of Q2, ordered by state 0..3; each row holds the
        values for the actions [-1, +1].
    """

    # Actions available in each state, and the terminal states
    policy = {0: [0, 0], 1: [-1, 1], 2: [-1, 1], 3: [0, 0]}
    terminal = [0, 3]

    # Initialize Q1(s, a) and Q2(s, a) to zero, so Q(terminal, .) = 0
    Q1 = {state: [0, 0] for state in policy}
    Q2 = {state: [0, 0] for state in policy}

    # Loop for each episode; the count comes from the `episode` argument
    for _ in range(episode):

        # Initialize S
        S = 2

        # Loop for each step of episode: until S is terminal
        while S not in terminal:

            # Choose A from S using the policy derived from Q1 & Q2 (e-greedy)
            A = e_greedy(Q1, Q2, policy, S, epsilon)

            # Take action A, observe R, S'
            S_prime, R = move(S, A)

            # Column of the chosen action in the Q rows
            a = policy[S].index(A)

            # With probability 0.5 update Q1 using Q2 as evaluator, else the
            # other way round
            if random.choice((0, 1)) == 1:
                learner, evaluator = Q1, Q2
            else:
                learner, evaluator = Q2, Q1

            # The learner picks the best next action, the evaluator values it
            best_next = learner[S_prime].index(max(learner[S_prime]))
            target = R + gamma * evaluator[S_prime][best_next]

            # Q(S, A) <-- Q(S, A) + alpha * [target - Q(S, A)]
            learner[S][a] = learner[S][a] + alpha * (target - learner[S][a])

            # S <-- S'
            S = S_prime

    # Output the two Q estimates of q*
    return list(Q1.values()), list(Q2.values())