 we studied the Q-Learning algorithm, let's implement it from scratch and train our Q-Learning agent in the Frozen Lake environment

In [1]:
import numpy as np
import gymnasium as gym
import random
import imageio
import os
import tqdm
import time

import pickle
from tqdm.notebook import tqdm

In [2]:
# Create the 4x4, non-slippery FrozenLake environment used by the rest of the notebook.
# NOTE(review): render_mode="human" presumably renders every step, which would slow
# the training loop considerably -- confirm whether rendering is wanted during training.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode = "human")

# Show the observation space and one value sampled from it.
print("Observation Space", env.observation_space)
print("Sample observation", env.observation_space.sample()) 

Observation Space Discrete(16)
Sample observation 5


In [3]:
# Show how many discrete actions exist and one action sampled at random.
print("Action Space Shape", env.action_space.n) # left, down, right, up
print("Action Space Sample", env.action_space.sample())  # Take a random action

Action Space Shape 4
Action Space Sample 2


Rewards: reaching the goal gives +1, reaching a hole gives 0, and reaching a frozen tile gives 0.

Now we will initialize the Q-table.

In [4]:
# Q-table dimensions: the number of discrete states and actions reported by the environment.
state_space = env.observation_space.n
action_space = env.action_space.n

In [5]:
def initialize_q_table(state_space, action_space):
    """Create an all-zero Q-table.

    Args:
        state_space: Number of rows (one per state).
        action_space: Number of columns (one per action).

    Returns:
        A NumPy array of shape ``(state_space, action_space)`` filled with zeros.
    """
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)

In [6]:
# Build the zero-initialised Q-table for FrozenLake and confirm its (states, actions) shape.
q_table_frozenlake = initialize_q_table(state_space, action_space)
print("Q Table Shape", q_table_frozenlake.shape)

Q Table Shape (16, 4)


Define the greedy policy and the epsilon-greedy policy. Training uses the epsilon-greedy policy; evaluation uses the greedy policy.

In [7]:
def greedy_policy(state, q_table):
    """Pick the action with the largest Q-value in the row for ``state``.

    Args:
        state: Row index into ``q_table``.
        q_table: 2-D array of Q-values indexed as ``[state, action]``.

    Returns:
        The selected action index as a plain Python ``int``.
    """
    action_values = q_table[state, :]
    best_action = np.argmax(action_values)
    return int(best_action)

def epsilon_greedy_policy(state, q_table, epsilon):
    """Choose an action, exploring at random with probability about ``epsilon``.

    A uniform draw from ``random.random()`` is compared with ``epsilon``: when the
    draw is greater, the greedy action for ``state`` is returned; otherwise an
    action is sampled from the module-level ``env``'s action space.

    Args:
        state: Current state index.
        q_table: 2-D array of Q-values indexed as ``[state, action]``.
        epsilon: Exploration threshold compared against the random draw.

    Returns:
        The chosen action.
    """
    exploit = random.random() > epsilon
    if not exploit:
        # Exploration branch: relies on the global `env`, not on an argument.
        return env.action_space.sample()
    return greedy_policy(state, q_table)

In [8]:
# --- Environment ---
env_id = "FrozenLake-v1"  # Environment identifier
max_steps = 99  # Step cap for a single episode

# --- Training ---
n_training_episodes = 1000  # Number of episodes used for learning
learning_rate = 0.7  # Step size of each Q-value update
gamma = 0.95  # Discount applied to future rewards

# --- Exploration schedule ---
max_epsilon, min_epsilon = 1.0, 0.05  # Starting and floor exploration probabilities
decay_rate = 0.0005  # Exponential decay rate of the exploration probability

# --- Evaluation ---
n_eval_episodes = 100  # Number of episodes used for testing
eval_seed = None  # Seed handed to evaluation; None means no seed is supplied

In [9]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    """Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The step size and discount factor are NOT parameters: they are read from the
    module-level ``learning_rate`` and ``gamma``. Actions are chosen through the
    module-level ``epsilon_greedy_policy``.

    Args:
        n_training_episodes: Number of episodes to run.
        min_epsilon: Floor of the exploration probability.
        max_epsilon: Exploration probability at episode 0.
        decay_rate: Exponential decay rate applied per episode index.
        env: Environment exposing ``reset()`` -> ``(state, info)`` and
            ``step(action)`` -> ``(state, reward, terminated, truncated, info)``.
        max_steps: Maximum number of steps taken in one episode.
        Qtable: 2-D array indexed as ``[state, action]``; updated in place.

    Returns:
        A tuple ``(Qtable, rewards_all_episodes)`` where ``Qtable`` is the same
        (mutated) array that was passed in and ``rewards_all_episodes`` holds the
        summed reward of each episode.
    """
    rewards_all_episodes = []

    for episode in range(n_training_episodes):
        # Exponentially anneal exploration from max_epsilon toward min_epsilon.
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
        state, _ = env.reset()
        state = int(state)
        total_rewards = 0

        for _ in range(max_steps):
            action = epsilon_greedy_policy(state, Qtable, epsilon)
            new_state, reward, terminated, truncated, _ = env.step(action)
            new_state = int(new_state)

            # A terminal transition has no future return, so do not bootstrap from
            # the next state's Q-values. (On truncation the episode was merely cut
            # short, so bootstrapping is still appropriate.) With a zero-initialised
            # table this yields the same numbers as always bootstrapping, but it
            # stays correct for any other initialisation.
            future_value = 0.0 if terminated else np.max(Qtable[new_state, :])
            td_target = reward + gamma * future_value
            Qtable[state, action] += learning_rate * (td_target - Qtable[state, action])

            state = new_state
            total_rewards += reward
            if terminated or truncated:
                break

        rewards_all_episodes.append(total_rewards)

    return Qtable, rewards_all_episodes


In [10]:
# Run training. train() returns the table it was given (updated in place) together
# with the list of per-episode reward totals.
q_table_frozenlake, rewards_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, q_table_frozenlake)


In [11]:
q_table_frozenlake

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77377986],
       [0.77377873, 0.857375  , 0.77377814, 0.81450613],
       [0.81450561, 0.        , 0.77359421, 0.77277433],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.814231  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.90249999, 0.95      , 0.85735345],
       [0.90249998, 0.94999995, 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [12]:
rewards_frozenlake


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [13]:
import numpy as np

# Print summary statistics of the per-episode training rewards, one line per statistic.
for _label, _stat in (("avg", np.mean), ("max", np.max), ("min", np.min), ("std", np.std)):
    print(f"{_label} reward:", _stat(rewards_frozenlake))

avg reward: 0.062
max reward: 1
min reward: 0
std reward: 0.24115555146004827


In [14]:
# evaluate the learned agent
def evaluate_agent(env, max_steps, n_eval_episodes, q_table, eval_seed=42):
    rewards = []

    for ep in range(n_eval_episodes):
        state, _ = env.reset(seed=eval_seed)
        total_reward = 0
        done = False

        for step in range(max_steps):
            action = greedy_policy(state, q_table)  # CORRECT ORDER
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            state = next_state
            done = terminated or truncated
            if done:
                break

        rewards.append(total_reward)

    mean_reward = np.mean(rewards)
    std_reward = np.std(rewards)
    return mean_reward, std_reward



In [15]:
# Evaluate our Agent
# Score the trained table with the greedy policy and report mean +/- std of the
# episode rewards. eval_seed is None in the hyperparameter cell, so no seed is supplied.
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, q_table_frozenlake, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Mean_reward=1.00 +/- 0.00


In [None]:
import pickle

# Bundle the learned Q-table with the settings used to produce and evaluate it.
# pickle is imported here, but nothing in this cell serialises the dict yet.
model = dict(
    qtable=q_table_frozenlake,
    env_id="FrozenLake-v1",
    max_steps=max_steps,
    n_eval_episodes=n_eval_episodes,
    eval_seed=eval_seed,
    gamma=gamma,
    learning_rate=learning_rate,
)


: 