In [1]:
import gym
import random
import numpy as np

In [3]:
# Create the CliffWalking environment and report its observation and
# action spaces.
env = gym.make('CliffWalking-v0')

obs_space, action_space = env.observation_space, env.action_space
print("The observation space: {}".format(obs_space))
print("The action space: {}".format(action_space))

The observation space: Discrete(48)
The action space: Discrete(4)


In [4]:
def epsilon_greedy_policy(state,Q,epsilon,action_space):
    """Pick an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random element of `action_space`
    is returned. Otherwise the action with the largest `Q[state, action]`
    is returned; on ties the action listed first in `action_space` wins.
    """
    explore = random.uniform(0,1) < epsilon
    if explore:
        return random.choice(action_space)

    # Exploit: look up the value of every candidate action and keep the best.
    action_values = [Q[state,candidate] for candidate in action_space]
    best_index = np.argmax(action_values)
    return action_space[best_index]
    

In [5]:
def get_greedy_policy(Q,action_space,n_states=48):
    """Extract the greedy (argmax) policy from a tabular Q function.

    Parameters
    ----------
    Q : mapping
        Action values indexed as ``Q[state, action]``.
    action_space : sequence
        The actions compared in every state.
    n_states : int, optional
        Number of states, enumerated as ``0 .. n_states - 1``. Defaults to
        48, the size of the CliffWalking observation space.

    Returns
    -------
    dict
        Maps each state to the element of `action_space` with the highest
        Q value; on ties the action listed first wins.
    """
    policy = {}
    for state in range(n_states):
        q_values = [Q[state, action] for action in action_space]
        policy[state] = action_space[np.argmax(q_values)]
    return policy

In [6]:
def print_cliffwalk_policy(policy):
    """Print `policy` as a grid of 4 rows by 12 columns.

    The entry for state ``12*row + col`` appears at position (row, col).
    The entries of one row are printed back to back with no separator, and
    a blank line follows the grid.
    """
    n_rows, n_cols = 4, 12
    for row in range(n_rows):
        first = n_cols * row
        cells = [str(policy[state]) for state in range(first, first + n_cols)]
        print(''.join(cells))
    print()

In [8]:
# SARSA (on-policy TD control)
#
# Uses the tuple-keyed helpers (`Q[state, action]`) defined in the cells
# above, so run this cell before any cell that redefines
# `epsilon_greedy_policy` / `get_greedy_policy` for another Q layout.

action_space = [0,1,2,3]
alpha = 0.5      # step size of the TD update
epsilon = 0.1    # exploration rate of the epsilon-greedy policy
gamma = 0.9      # discount factor
Q = {(s,a):0 for s in range(48) for a in range(4)}

for i in range(50000):
    # reset() returns (observation, info); only the observation is needed.
    S = env.reset()[0]
    A = epsilon_greedy_policy(S,Q,epsilon,action_space)

    while True:
        # Take the action and get the new observation
        S2, R, terminated, truncated, info = env.step(A)
        # On-policy: the bootstrap action is the one that will be executed
        # on the next step.
        A2 = epsilon_greedy_policy(S2,Q,epsilon,action_space)

        Q[S,A] = Q[S,A] + alpha * (R + gamma * Q[S2,A2] - Q[S,A])

        S = S2
        A = A2

        # An episode ends on a terminal state OR when it is cut short
        # (e.g. by a time limit); in both cases the env must be reset
        # before stepping again.
        if terminated or truncated:
            break

SARSA_policy = get_greedy_policy(Q,action_space)
SARSA_Q = Q
print_cliffwalk_policy(SARSA_policy)

111111111122
003300310112
003000000302
000000000000



In [9]:
# Q Learning (SARSA MAX) -- off-policy TD control
#
# Uses the tuple-keyed helpers (`Q[state, action]`) defined in the cells
# above, so run this cell before any cell that redefines
# `epsilon_greedy_policy` / `get_greedy_policy` for another Q layout.

action_space = [0,1,2,3]
alpha = 0.5      # step size of the TD update
epsilon = 0.1    # exploration rate of the epsilon-greedy behaviour policy
gamma = 0.9      # discount factor
Q = {(s,a):0 for s in range(48) for a in range(4)}

for i in range(50000):
    # reset() returns (observation, info); only the observation is needed.
    S = env.reset()[0]

    while True:
        # get action from epsilon greedy
        A = epsilon_greedy_policy(S,Q,epsilon,action_space)
        # Take the action and get the new observation
        S2, R, terminated, truncated, info = env.step(A)

        # Bootstrap from the best action in the next state, regardless of
        # which action the behaviour policy will actually take there.
        # Q holds an entry for every (state, action) pair, so plain
        # indexing is enough.
        max_Q = np.max([Q[S2,action] for action in action_space])
        Q[S,A] = (1 - alpha) * Q[S,A] + alpha * (R + gamma * max_Q)

        S = S2

        # An episode ends on a terminal state OR when it is cut short
        # (e.g. by a time limit); in both cases the env must be reset
        # before stepping again.
        if terminated or truncated:
            break

QLearning_policy = get_greedy_policy(Q,action_space)
QLearning_Q = Q
print_cliffwalk_policy(QLearning_policy)

111111111112
111111111112
111111111112
000000000000



In [11]:
# Expected SARSA

def epsilon_greedy_policy(state,Q,epsilon,action_space):
    """Epsilon-greedy action selection for a Q table holding one list per state.

    With probability `epsilon` a uniformly random element of `action_space`
    is returned; otherwise the index of the largest entry of `Q[state]`
    (the first such index on ties).

    NOTE: this redefines the earlier `epsilon_greedy_policy`, which indexes
    the table as `Q[state, action]`. The two versions are not
    interchangeable.
    """
    explore = random.uniform(0,1) < epsilon
    return random.choice(action_space) if explore else np.argmax(Q[state])

def get_greedy_policy(Q,action_space,n_states=48):
    """Extract the greedy policy from a Q table holding one list per state.

    Parameters
    ----------
    Q : mapping
        ``Q[state]`` is the sequence of action values for that state.
    action_space : sequence
        Unused; kept so the signature matches the tuple-keyed variant.
    n_states : int, optional
        Number of states, enumerated as ``0 .. n_states - 1``. Defaults to
        48, the size of the CliffWalking observation space.

    Returns
    -------
    dict
        Maps each state to the index of the largest entry of ``Q[state]``
        (the first such index on ties).

    NOTE: this redefines the earlier `get_greedy_policy`, which indexes the
    table as `Q[state, action]`. The two versions are not interchangeable.
    """
    return {state: np.argmax(Q[state]) for state in range(n_states)}

# Q is stored as one list of action values per state (Q[state][action]),
# matching the helper definitions in this cell.
action_space = [0,1,2,3]
alpha = 0.5      # step size of the TD update
epsilon = 0.1    # exploration rate of the epsilon-greedy policy
gamma = 0.9      # discount factor
Q = {s:[0 for a in range(4)] for s in range(48)}

for i in range(50000):
    # reset() returns (observation, info); only the observation is needed.
    S = env.reset()[0]

    while True:
        # get action from epsilon greedy
        A = epsilon_greedy_policy(S,Q,epsilon,action_space)
        # Take the action and get the new observation
        S2, R, terminated, truncated, info = env.step(A)

        next_q = Q[S2]
        Q_max = np.max(next_q)

        # count number of greedy actions possible if there are equally greedy actions
        greedy_actions = sum(1 for a in action_space if next_q[a] == Q_max)
        # Every action gets epsilon/|A|; the remaining (1 - epsilon) mass is
        # split evenly between the tied greedy actions.
        non_greedy_action_probability = epsilon / len(action_space)
        greedy_action_probability = ((1 - epsilon) / greedy_actions) + non_greedy_action_probability

        # Expected next-state value under the epsilon-greedy policy.
        expected_Q = sum(
            next_q[A2] * (greedy_action_probability if next_q[A2] == Q_max else non_greedy_action_probability)
            for A2 in action_space
        )

        td_target = R + gamma * expected_Q
        td_error = td_target - Q[S][A]
        Q[S][A] += alpha * (td_error)

        S = S2
        # An episode ends on a terminal state OR when it is cut short
        # (e.g. by a time limit); in both cases the env must be reset
        # before stepping again.
        if terminated or truncated:
            break

E_SARSA_policy = get_greedy_policy(Q,action_space)
E_SARSA_Q = Q
print_cliffwalk_policy(E_SARSA_policy)

111111111112
111111111112
000000000012
000000000000

