# Windy Gridworld
- Gridworld, with start and goal states.
- Crosswind running upward through the middle of the grid.
- Actions: North, South, East, West, NE, NW, SE, SW, plus a ninth "stay in place" action.
- In the middle region, the next state reached by an action is shifted upward by a “wind”.
- Strength of wind varies from column to column.

In [1]:
import matplotlib.pyplot as plt
import random

In [2]:
# Number of training episodes
episodes = 170

# Start and goal cells, each given as (row, column)
start, goal = (3, 0), (3, 7)


In [3]:
def create():
    """
    Build the initial action-value table and the per-state action lists.

    Returns a tuple (Q, policy). Both are dicts keyed by (row, column) for
    every cell of a 10 x 10 grid, in row-major order. Q maps each cell to a
    fresh list of zeros (one entry per action); policy maps each cell to the
    list of available moves.
    """
    # Side length of the square grid
    side = 10

    # Moves as (row delta, column delta); index 4 is the "stay" move
    moves = [(-1, 0), (1, 0), (0, 1), (0, -1), (0, 0),
             (-1, 1), (1, 1), (-1, -1), (1, -1)]

    # Every cell of the grid, row by row
    cells = [(row, column) for row in range(side) for column in range(side)]

    # All cells refer to the same moves list (it is only ever read)
    policy = {cell: moves for cell in cells}

    # Each cell gets its own list of action values, all starting at 0
    Q = {cell: [0] * len(moves) for cell in cells}

    return Q, policy

In [4]:
def boundary(state):
    """
    Return True when state lies OUTSIDE the 10 x 10 grid, False otherwise.

    state is indexed as (row, column); both must be in 0..9 to be in bounds.
    """
    row, column = state[0], state[1]

    # A coordinate is valid when it falls in the half-open range [0, 10)
    rows_ok = 0 <= row < 10
    columns_ok = 0 <= column < 10

    # Out of bounds as soon as either coordinate is invalid
    return not (rows_ok and columns_ok)

In [5]:
def e_greedy(Q, policy, state, goal):
    """
    Pick an action for state with an e-greedy rule (exploration rate 0.1).

    Q maps a state to its list of action values and policy maps a state to
    the matching list of moves. Returns the "stay" move (0, 0) when state is
    the goal or is off the grid; otherwise returns one move from
    policy[state].
    """
    # Nothing to choose at the goal or off the grid: stay put
    if state == goal or boundary(state):
        return (0, 0)

    # Greedy move: first action holding the highest value (ties -> lowest index)
    values = Q[state]
    greedy = policy[state][values.index(max(values))]

    # With probability 0.1 replace it with a uniformly random move
    explore = random.random() < 0.1
    return random.choice(policy[state]) if explore else greedy

In [6]:
def wind(state, action):
    """
    Return the candidate next state for (state, action) under a random gust.

    The gust is added to the row coordinate only. The returned state may be
    off the grid; the caller is expected to handle that.
    """
    # Wind strength for each column index
    strength_by_column = {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 2, 7: 2, 8: 1, 9: 0}
    strength = strength_by_column[state[1]]

    # Gust drawn uniformly from: strength, strength + 1, negated strength.
    # NOTE(review): the third option negates the strength rather than using
    # strength - 1 -- confirm this is the intended distribution.
    gust = random.choice([strength, strength + 1, -strength])

    # Where the action alone would lead
    row = state[0] + action[0]
    column = state[1] + action[1]

    # States in row 0 (or above) never receive a gust.
    # NOTE(review): the gate tests the row, not the column's wind strength --
    # confirm against the intended environment.
    if state[0] <= 0:
        return (row, column)

    # Where the action plus the gust would lead
    blown = (row + gust, column)

    # Keep the gusted cell only when it is on the grid; otherwise fall back
    # to the wind-free destination
    return (row, column) if boundary(blown) else blown

In [7]:
def move(state, action, goal):
    """
    Apply action in state and return the environment observation (S', R).

    R is 0 when the goal is reached and -1 on every other transition. A move
    that would leave the grid keeps the agent in its current state.
    """
    # Destination after the wind has been applied
    landed = wind(state, action)

    # Reaching the goal terminates the episode with reward 0
    if landed == goal:
        return landed, 0

    # Leaving the grid is not allowed: stay where we were
    if boundary(landed):
        return state, -1

    # Ordinary transition
    return landed, -1

In [8]:
def sarsa_on_policy(episodes, start, goal):
    """
    Run SARSA (on-policy TD control, e-greedy) and return the learned Q table.

    episodes is the number of episodes to run; start and goal are
    (row, column) states. Each episode begins at start and ends when the goal
    is reached. The returned Q maps each state to its list of action values.
    """
    # Step size in (0, 1]; the discount factor is implicitly 1
    step_size = 0.5

    # Q starts at zero everywhere, so Q(terminal, .) = 0 holds
    Q, policy = create()

    for _ in range(episodes):

        # Initial state and the first action chosen from it
        S = start
        A = e_greedy(Q, policy, S, goal)

        # Step until the terminal state is reached
        while S != goal:

            # Take A, observe the reward and successor state
            S_prime, R = move(S, A, goal)

            # Choose the successor action with the same e-greedy policy
            A_prime = e_greedy(Q, policy, S_prime, goal)

            # Positions of both actions inside their value lists
            a = policy[S].index(A)
            a_prime = policy[S_prime].index(A_prime)

            # Q(S, A) <- Q(S, A) + step_size * [R + Q(S', A') - Q(S, A)]
            td_error = R + Q[S_prime][a_prime] - Q[S][a]
            Q[S][a] = Q[S][a] + step_size * td_error

            # Advance to the successor pair
            S, A = S_prime, A_prime

    return Q

In [9]:
# Run SARSA with the configured settings and print the resulting Q table
print(sarsa_on_policy(episodes=episodes, start=start, goal=goal))

{(0, 0): [-9.57138217317273, -9.99588324022009, -9.868394540141715, -10.038537712732538, -9.832247816111874, -9.857052393344938, -10.169705443298465, -10.287631711595601, -10.309564080641625], (0, 1): [-9.622259146532372, -9.931853442858262, -9.774214995982058, -9.500228542235236, -9.495468982088282, -10.162875788292322, -9.888128586403651, -10.119183485585266, -9.799646029866523], (0, 2): [-9.889126191693853, -9.708621279929346, -9.106591924363965, -9.422399656557436, -10.074337868199748, -10.074251348205038, -9.508653914480956, -10.078725708193996, -9.703328619901804], (0, 3): [-9.126171217921272, -8.992542822123905, -6.968883304734781, -8.859257544406809, -8.846682340120191, -8.692802064098071, -9.034434966240784, -9.227965660875176, -9.218516716896739], (0, 4): [-8.538081306533712, -8.6401980268653, -5.306436165417003, -8.228996048597125, -8.091019324785025, -8.101533853019314, -8.076331573383918, -8.452360268792287, -8.678307601416911], (0, 5): [-7.5558098311741535, -7.58961999578