In [1]:
import gymnasium as gym
import numpy as np

In [2]:
SEED = 42    # Fixed seed so that Restart & Run All reproduces the same run

# Exploration draws from NumPy's global RNG (np.random.random) and from the
# environment's action space (action_space.sample), so both are seeded here.
np.random.seed(SEED)

env = gym.make("FrozenLake-v1", is_slippery=False, render_mode=None)
state, info = env.reset(seed=SEED)
env.action_space.seed(SEED)

# Sizes of the discrete observation and action spaces; used to shape the Q-table.
state_size = env.observation_space.n
action_size = env.action_space.n

print("States:", state_size)
print("Actions:", action_size)

States: 16
Actions: 4


## Parameters:

- Learning rate (alpha) = 0.80 
- Discount factor (gamma) = 0.95  

In [3]:
# Q-learning update: learning rate and discount factor
alpha, gamma = 0.80, 0.95

# Exploration schedule for the epsilon-greedy policy
eps_start = 1.0      # Initial epsilon: fully random actions at the start
eps_end = 0.05       # Lower bound that epsilon is clipped to
eps_decay = 0.9995   # Per-episode multiplicative decay; closer to 1 means slower decay

# Training budget: number of episodes and step cap per episode
episodes, max_steps = 5000, 50

## Epsilon-Greedy Strategy:

Epsilon is decayed exponentially using the formula:

{ epsilon = max(eps_end, eps_start × ((eps_decay)^episode)) }

In the epsilon-greedy strategy, the agent explores the environment by taking a random action with probability eps; otherwise it takes the action with the highest Q-value for the current state. As training progresses, eps decays exponentially, so the agent increasingly exploits the learned Q-values.

In [4]:
def selectAction(Q, state, epsilon):
    """Pick an action for `state` using the epsilon-greedy rule.

    Args:
        Q: Q-table indexed as Q[state, action].
        state: Index of the current state (row of Q).
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        A sample from the notebook-level `env.action_space` when exploring,
        otherwise the index of the largest Q-value in the row for `state`.
    """
    explore = np.random.random() < epsilon
    if not explore:
        # Exploitation: best known action for this state
        return np.argmax(Q[state])

    # Exploration: uniformly sampled action
    return env.action_space.sample()

## Training Using Q-Learning:

The Q-value update rule is:

{ Q[state, action] = Q[state, action] + alpha*(reward + gamma*max(Q[next_state]) - Q[state, action]) }


In [5]:
def epsGreedyQLearning(alpha, gamma):
    """Train a tabular Q-function with epsilon-greedy Q-learning.

    Reads the notebook-level `env`, `state_size`, `action_size`, `episodes`,
    `max_steps`, `eps_start`, `eps_end` and `eps_decay`, and selects actions
    through `selectAction`.

    Args:
        alpha: Learning rate applied to the temporal-difference error.
        gamma: Discount factor applied to the best Q-value of the next state.

    Returns:
        NumPy array of shape (state_size, action_size) with the learned Q-values.
    """
    q_table = np.zeros((state_size, action_size))

    for episode in range(episodes):
        # Epsilon for this episode:
        #   epsilon = max(eps_end, eps_start * eps_decay**episode)
        # Computed before the episode runs so that episode k uses exponent k
        # (episode 0 therefore runs at eps_start).
        epsilon = max(eps_end, eps_start * (eps_decay ** episode))

        state, info = env.reset()

        for step in range(max_steps):
            action = selectAction(q_table, state, epsilon)

            new_state, reward, terminated, truncated, info = env.step(action)

            # A terminal transition has no future return, so the target is the
            # reward alone. A truncated (time-limited) step still bootstraps.
            max_future_q = 0.0 if terminated else np.max(q_table[new_state, :])
            old_q_value = q_table[state, action]

            # Q[s, a] <- Q[s, a] + alpha * (r + gamma * max_a' Q[s', a'] - Q[s, a])
            q_table[state, action] = old_q_value + alpha * (
                reward + gamma * max_future_q - old_q_value
            )

            state = new_state

            if terminated or truncated:
                break

    return q_table

In [6]:
# Run training with the configured learning rate and discount factor.
q_table = epsGreedyQLearning(alpha=alpha, gamma=gamma)

In [7]:
# Persist the learned Q-table to disk in NumPy's .npy format.
np.save(file="frozenlake_Assignment2_Q1_qtable.npy", arr=q_table)

In [8]:
# Read the stored Q-table back from disk.
q_table = np.load(file="frozenlake_Assignment2_Q1_qtable.npy")

In [9]:
# Replay the greedy policy from the Q-table for one episode in a rendered
# environment. Note that this rebinds the notebook-level `env`.
env = gym.make("FrozenLake-v1", is_slippery=False, render_mode="human")
success = False

try:
    state, info = env.reset()
    done = False

    while not done:
        # Always act greedily: no exploration during the test run
        action = np.argmax(q_table[state, :])
        state, reward, terminated, truncated, info = env.step(action)

        # A reward of 1 is treated as a successful episode
        if reward == 1:
            success = True

        done = terminated or truncated
finally:
    # Release the render window even if the rollout raises
    env.close()

if success:
    print("--> Test successful!! <--")
else:
    print("Test failed!!")


--> Test successful!! <--
