In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing in the visible cells reads from this mount — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import gymnasium as gym
import numpy as np

# Fixed seed so that Restart & Run All reproduces the same exploration sequence.
SEED = 0

# FrozenLake with slipping disabled.
env = gym.make("FrozenLake-v1", is_slippery=False)
n_states = env.observation_space.n
n_actions = env.action_space.n

# Seed both sources of randomness used for exploration: NumPy's global RNG
# (np.random.rand in epsilon_greedy) and the action space's own sampler
# (env.action_space.sample()).
np.random.seed(SEED)
env.action_space.seed(SEED)

# One tabular action-value estimate per algorithm, indexed [state, action] and initialised to zero.
Q_sarsa = np.zeros((n_states, n_actions))
Q_qlearn = np.zeros((n_states, n_actions))

alpha = 0.1     # learning rate (step size of each TD update)
gamma = 0.99    # discount factor
epsilon = 0.1   # probability of taking a random action
episodes = 500  # number of training episodes per algorithm

def epsilon_greedy(Q, state, eps=None):
    """Pick an action for `state` with an epsilon-greedy policy over `Q`.

    With probability `eps` a random action is sampled from the environment's
    action space; otherwise one of the highest-valued actions in `Q[state]`
    is returned. Higher epsilon -> more exploration, slower convergence.

    Ties between equally valued actions are broken uniformly at random.
    `np.argmax` always returns the first maximiser, so on a zero-initialised
    table the greedy choice would be action 0 in every state until some
    reward has been observed.

    Args:
        Q: 2-D array of action values indexed as Q[state, action].
        state: Row index into `Q`.
        eps: Exploration probability. Defaults to the module-level `epsilon`.

    Returns:
        The chosen action index.
    """
    if eps is None:
        eps = epsilon
    if np.random.rand() < eps:
        return env.action_space.sample()
    action_values = Q[state]
    # All actions that share the maximum value are equally greedy.
    best_actions = np.flatnonzero(action_values == action_values.max())
    return int(np.random.choice(best_actions))


# Each iteration runs one SARSA episode followed by one Q-learning episode on
# the same environment; the two Q-tables are updated independently.
for ep in range(episodes):
    # SARSA
    # Reset exactly once per rollout so state_sarsa is the observation of the
    # episode that is actually stepped below.
    state_sarsa, _ = env.reset()
    action_sarsa = epsilon_greedy(Q_sarsa, state_sarsa)
    done = False
    while not done:
        next_state, reward, terminated, truncated, _ = env.step(action_sarsa)
        done = terminated or truncated
        next_action = epsilon_greedy(Q_sarsa, next_state)
        # On-policy target: bootstraps from Q[next_state, next_action], i.e. the
        # action the behaviour policy will really take next on this trajectory.
        # A terminated episode has no future return, so nothing is bootstrapped;
        # a merely truncated one (time limit) still bootstraps from next_state.
        bootstrap = 0.0 if terminated else gamma * Q_sarsa[next_state, next_action]
        Q_sarsa[state_sarsa, action_sarsa] += alpha * (reward + bootstrap - Q_sarsa[state_sarsa, action_sarsa])
        state_sarsa, action_sarsa = next_state, next_action

    # Q-learning
    state_q, _ = env.reset()
    done = False
    while not done:
        action = epsilon_greedy(Q_qlearn, state_q)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        # Off-policy target: bootstraps from np.max(Q[next_state]), the greedy
        # value of next_state, regardless of which action is taken next.
        # As above, no bootstrap once the episode has terminated.
        bootstrap = 0.0 if terminated else gamma * np.max(Q_qlearn[next_state])
        Q_qlearn[state_q, action] += alpha * (reward + bootstrap - Q_qlearn[state_q, action])
        state_q = next_state