# Cliff Walking

In [88]:
import random

In [89]:
# Number of episodes each learning algorithm is run for
episodes = 170

# Start and goal cells, both given as (row, column)
start, goal = (3, 0), (3, 11)

# Probability of taking a random (exploratory) action under e-greedy
epsilon = 0.1

In [90]:
def create(width=4, length=12):
    """
    Create and return Q and arbitrary policy

    Args:
        width: number of grid rows (default 4).
        length: number of grid columns (default 12).

    Returns:
        (Q, policy), two dicts keyed by (row, column) in row-major order.
        Q[state] is a list of action values initialised to 0 and
        policy[state] is the list of actions available in that state;
        both lists share the same action ordering.

    Note:
        boundary() in this file hard-codes a 4 x 12 grid, so non-default
        sizes only make sense together with a matching bounds check.
    """

    # Set of equiprobable actions as (row delta, column delta):
    # up, down, left, right
    actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]

    # Policy and Q
    policy = {}
    Q = {}

    # Visit every cell row by row
    for row in range(width):
        for column in range(length):
            state = (row, column)

            # Each state gets its own copy so that editing one state's
            # action list cannot silently change every other state
            policy[state] = list(actions)

            # Create Q(s,a)
            Q[state] = [0] * len(actions)

    # Q and policy
    return Q, policy

In [91]:
def boundary(state):
    """
    Return True when state lies OUTSIDE the 4 x 12 grid, False otherwise

    state is indexed as (row, column); valid rows are 0..3 and valid
    columns are 0..11.
    """
    row, column = state[0], state[1]

    # Off the top or bottom edge
    if row < 0 or row >= 4:
        return True

    # Otherwise the answer depends only on the left/right edges
    return column < 0 or column >= 12

In [92]:
def e_greedy(Q, policy, state, goal, epsilon):
    """
    Pick an action for state with an e-greedy rule over Q

    Returns the null move (0, 0) when state is the goal or is out of
    bounds. Otherwise returns the action with the highest Q value (the
    first one on ties), replaced with probability epsilon by a uniformly
    random action from policy[state].
    """

    # Nothing to choose at the goal or off the grid
    if state == goal or boundary(state):
        return (0, 0)

    choices = policy[state]
    values = Q[state]

    # Greedy candidate: the action sharing the index of the largest value
    greedy = choices[values.index(max(values))]

    # One draw decides between exploring and exploiting
    return random.choice(choices) if random.random() < epsilon else greedy

In [93]:
def move(state, action, goal, start):
    """
    Apply action to state and return the observation (S', R)

    Outcomes, checked in this order:
      * landing on goal      -> (goal, 0)
      * landing on the cliff -> (start, -100)
      * leaving the grid     -> (state, -1), the agent stays put
      * any other move       -> (new cell, -1)
    """

    # The cliff occupies the bottom row, columns 1 through 10
    cliff = [(3, column) for column in range(1, 11)]

    # Candidate next cell: state shifted by the action offsets
    landed = (state[0] + action[0], state[1] + action[1])

    # Reaching the goal ends the episode with no penalty
    if landed == goal:
        return landed, 0

    # Falling off the cliff sends the agent back to the start
    if landed in cliff:
        return start, -100

    # A move off the grid leaves the agent where it was
    if boundary(landed):
        return state, -1

    # Ordinary step
    return landed, -1

In [94]:
def sarsa_on_policy(episodes, start, goal, epsilon):
    """
    Return q* from SARSA On-Policy (e-greedy)

    Args:
        episodes: number of episodes to run.
        start: (row, column) cell every episode begins in.
        goal: (row, column) terminal cell.
        epsilon: exploration probability passed to e_greedy.

    Returns:
        Q, a dict mapping each (row, column) state to its list of action
        values, ordered like the action list from create().

    The update uses step size alpha = 0.5 and no discounting (gamma = 1).
    """

    ########################################################
    # Sarsa (on-policy TD control) for estimating Q = q_pi #
    ########################################################

    # Algorithm parameters: step size alpha in (0, 1]
    alpha = 0.5

    # Initialize Q(s, a) for all states and actions; everything starts at 0,
    # which also satisfies Q(terminal, .) = 0
    Q, policy = create()

    # Loop for each episode:
    for _ in range(episodes):

        # Initialize S
        S = start

        # Choose A from S using policy derived from Q (e-greedy)
        A = e_greedy(Q, policy, S, goal, epsilon)

        # Loop for each step of episode: until S is terminal
        while S != goal:

            # Take action A, observe R, S'
            S_prime, R = move(S, A, goal, start)

            # Choose A' from S' using policy derived from Q (e-greedy)
            A_prime = e_greedy(Q, policy, S_prime, goal, epsilon)

            # Position of the taken action inside Q[S]
            a = policy[S].index(A)

            if S_prime == goal:
                # At the goal e_greedy returns the null move (0, 0), which
                # is not in policy[goal]; looking it up would raise
                # ValueError. Q(terminal, .) is 0 by definition.
                next_value = 0
            else:
                next_value = Q[S_prime][policy[S_prime].index(A_prime)]

            # Q(S, A) <-- Q(S, A) + alpha*[R + Q(S', A') - Q(S, A)]
            Q[S][a] = Q[S][a] + alpha * (R + next_value - Q[S][a])

            # S <-- S'; A <-- A';
            S, A = S_prime, A_prime

    # Output Q estimate of q*
    return Q

In [95]:
def Q_learning(episodes, start, goal, epsilon):
    """
    Estimate q* with Q-Learning (off-policy TD control)

    Actions are chosen e-greedily, while the update bootstraps from the
    best action value in the next state. Runs the given number of
    episodes from start until goal is reached and returns the resulting
    Q table. Step size is 0.5 and there is no discounting.
    """

    # Step size of the TD update
    alpha = 0.5

    # Fresh, all-zero action values plus the per-state action lists
    Q, policy = create()

    for _ in range(episodes):

        # Every episode restarts from the same cell
        S = start

        while S != goal:

            # Behave e-greedily with respect to the current Q
            A = e_greedy(Q, policy, S, goal, epsilon)

            # Act and observe reward and next state
            S_prime, R = move(S, A, goal, start)

            # Slot of the chosen action in this state's value list
            a = policy[S].index(A)
            values = Q[S]

            # TD target uses the greedy value of S', whatever is done next
            td_target = R + max(Q[S_prime])

            # Q(S, A) <-- Q(S, A) + alpha*[R + max_a Q(S', a) - Q(S, A)]
            values[a] = values[a] + alpha * (td_target - values[a])

            # Advance to the next state
            S = S_prime

    # Learned estimate of q*
    return Q

In [96]:
def expected_sarsa(episodes, start, goal, epsilon):
    """
    Estimate q* with Expected Sarsa TD control (e-greedy)

    Like Sarsa, but the update bootstraps from the expected value of Q in
    the next state under the e-greedy policy instead of from one sampled
    next action. Runs the given number of episodes from start until goal
    is reached and returns the resulting Q table. Step size is 0.5 and
    there is no discounting.
    """

    # Step size of the TD update
    alpha = 0.5

    # Fresh, all-zero action values plus the per-state action lists
    Q, policy = create()

    for _ in range(episodes):

        # Every episode restarts from the same cell
        S = start

        while S != goal:

            # Behave e-greedily with respect to the current Q
            A = e_greedy(Q, policy, S, goal, epsilon)

            # Act and observe reward and next state
            S_prime, R = move(S, A, goal, start)

            # Slot of the chosen action in this state's value list
            a = policy[S].index(A)
            values = Q[S]
            next_values = Q[S_prime]

            # Expectation over the e-greedy policy in S': weight 1-epsilon
            # on the best value plus epsilon spread evenly over all actions
            greedy_part = (1 - epsilon) * max(next_values)
            explore_part = epsilon / len(policy[S_prime]) * sum(next_values)
            expected = greedy_part + explore_part

            # Q(S, A) <-- Q(S, A) + alpha*[R + E[Q(S', a)] - Q(S, A)]
            values[a] = values[a] + alpha * (R + expected - values[a])

            # Advance to the next state
            S = S_prime

    # Learned estimate of q*
    return Q

In [97]:
# Train each learner in turn and print its Q table under a heading,
# with one blank line separating the reports
for position, (title, learner) in enumerate(
    (("Q-Learning", Q_learning), ("Expected-Sarsa", expected_sarsa))
):
    if position:
        print()
    print(title)
    print(learner(episodes, start, goal, epsilon))

Q-Learning
{(0, 0): [-9.921661376953125, -10.423448341756739, -9.923095703125, -10.118081634747796], (0, 1): [-9.9388427734375, -10.074306839224846, -10.091433328267158, -9.762237860122696], (0, 2): [-9.063621520996094, -9.127956374548376, -9.651172772690188, -9.036303657921962], (0, 3): [-8.28125, -8.799790023360401, -9.040111972950399, -8.371845092624426], (0, 4): [-7.8125, -8.003346993820742, -8.242163620889187, -7.744209345430136], (0, 5): [-7.0, -6.958580732345581, -8.080220684409142, -6.9318785555660725], (0, 6): [-6.5, -6.305429560656194, -7.114945277571678, -6.214316293597221], (0, 7): [-5.5, -5.508627311606691, -5.44888973236084, -5.491674304008484], (0, 8): [-4.71875, -4.727063780534081, -4.9317779541015625, -4.696332883089781], (0, 9): [-4.0, -3.8745597056112855, -4.359535217285156, -3.8137969970703125], (0, 10): [-3.0, -2.939502716064453, -3.4501953125, -2.90887451171875], (0, 11): [-2.0, -1.988433837890625, -2.59375, -2.25], (1, 0): [-10.518830832530057, -10.52937337298042