Now that we have studied the Q-Learning algorithm, let’s implement it from scratch and train our Q-Learning agent in the FrozenLake environment.

In [1]:
import numpy as np
import gymnasium as gym
import random
import imageio
import os
import tqdm
import time

import pickle
from tqdm.notebook import tqdm

In [2]:
# Create the deterministic (non-slippery) 4x4 FrozenLake environment.
# render_mode="rgb_array" only produces frames when env.render() is called.
# The previous "human" mode drew every single step in a window, which makes a
# 1000-episode training run impractically slow.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")

print("Observation Space", env.observation_space)
print("Sample observation", env.observation_space.sample())

Observation Space Discrete(16)
Sample observation 14


In [3]:
print("Action Space Shape", env.action_space.n)  # left, down, right, up
print("Action Space Sample", env.action_space.sample())  # Take a random action

Action Space Shape 4
Action Space Sample 1


Rewards: reaching the goal gives +1, falling into a hole gives 0, and stepping on a frozen tile gives 0.

Now we will initialize the Q-table.

In [4]:
# Number of discrete states and actions; these become the Q-table's
# row and column counts respectively.
state_space = env.observation_space.n
action_space = env.action_space.n

In [5]:
def initialize_q_table(state_space, action_space):
    """
    Create a Q-table filled with zeros.

    :param state_space: Number of states (rows of the table)
    :param action_space: Number of actions (columns of the table)
    :return: NumPy array of shape ``(state_space, action_space)`` containing zeros
    """
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)

In [6]:
# Build the all-zero Q-table for FrozenLake and confirm its (states, actions) shape.
q_table_frozenlake = initialize_q_table(state_space, action_space)
print("Q Table Shape", q_table_frozenlake.shape)

Q Table Shape (16, 4)


Define the greedy policy and the epsilon-greedy policy. Training uses the epsilon-greedy policy; the greedy policy is used for evaluation.

In [7]:
def greedy_policy(state, q_table):
    """
    Select the action with the highest Q-value for ``state``.

    :param state: Row index of the state in the Q-table
    :param q_table: The Q-table (2-D NumPy array indexed ``[state, action]``)
    :return: Index of the best action as a plain ``int``; ties resolve to the
        lowest index, as ``np.argmax`` does
    """
    action_values = q_table[state, :]
    best_action = np.argmax(action_values)
    return int(best_action)

def epsilon_greedy_policy(state, q_table, epsilon):
    """
    Choose an action with an epsilon-greedy rule.

    A random number in ``[0, 1)`` is drawn; if it is greater than ``epsilon``
    the greedy action is returned, otherwise an action is sampled from the
    action space of the module-level ``env``.

    :param state: Current state index
    :param q_table: The Q-table
    :param epsilon: Exploration threshold
    :return: The chosen action
    """
    # Argument order is (state, q_table, epsilon).
    exploit = random.random() > epsilon
    if not exploit:
        return env.action_space.sample()
    return greedy_policy(state, q_table)

In [55]:
# Training parameters
n_training_episodes = 1000  # Total training episodes
learning_rate = 0.7  # Learning rate (step size of the Q-value update in train)

# Evaluation parameters
n_eval_episodes = 100  # Total number of test episodes

# Environment parameters
env_id = "FrozenLake-v1"  # Name of the environment
max_steps = 99  # Max steps per episode
gamma = 0.95  # Discounting rate
eval_seed = None  # The evaluation seed of the environment (None: reset without a seed)

# Exploration parameters
# NOTE(review): with 1000 episodes and decay_rate = 0.0005, the schedule in
# train() only lowers epsilon to about 0.63 by the final episode, so the agent
# is still mostly exploring at the end of training — confirm this is intended.
max_epsilon = 1.0  # Exploration probability at start
min_epsilon = 0.05  # Minimum exploration probability
decay_rate = 0.0005  # Exponential decay rate for exploration prob

In [45]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    """
    Run tabular Q-Learning and update ``Qtable`` in place.

    Actions are picked with ``epsilon_greedy_policy``; epsilon decays
    exponentially with the episode index from ``max_epsilon`` towards
    ``min_epsilon``. The update uses the module-level ``learning_rate`` and
    ``gamma``.

    :param n_training_episodes: Number of episodes to train for
    :param min_epsilon: Lower bound of the exploration probability
    :param max_epsilon: Exploration probability at episode 0
    :param decay_rate: Exponential decay rate applied per episode
    :param env: The training environment
    :param max_steps: Maximum number of steps per episode
    :param Qtable: The Q-table to train (modified in place)
    :return: ``(Qtable, rewards_all_episodes)`` where the second item holds
        the summed reward of each episode
    """
    episode_returns = []

    for episode_idx in range(n_training_episodes):
        # Exponentially decayed exploration probability for this episode.
        decay = np.exp(-decay_rate * episode_idx)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * decay

        observation, _ = env.reset()
        current = int(observation)
        episode_return = 0

        for _ in range(max_steps):
            action = epsilon_greedy_policy(current, Qtable, epsilon)
            observation, reward, terminated, truncated, _ = env.step(action)
            successor = int(observation)

            # Temporal-difference update towards reward + discounted best next value.
            td_target = reward + gamma * np.max(Qtable[successor, :])
            td_error = td_target - Qtable[current, action]
            Qtable[current, action] += learning_rate * td_error

            current = successor
            episode_return += reward
            if terminated or truncated:
                break

        episode_returns.append(episode_return)

    return Qtable, episode_returns


In [46]:
# Train the agent. `train` updates the Q-table it is given and returns that same
# table together with the list of per-episode total rewards.
q_table_frozenlake, rewards_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, q_table_frozenlake)


KeyboardInterrupt: 

In [15]:
# Display the Q-table: one row per state, one column per action.
q_table_frozenlake

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [28]:
# Display the total reward collected in each training episode.
rewards_frozenlake


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [31]:
import numpy as np
# Summary statistics over the per-episode training rewards.
print("avg reward:", np.mean(rewards_frozenlake))
print("max reward:", np.max(rewards_frozenlake))
print("min reward:", np.min(rewards_frozenlake))
print("std reward:", np.std(rewards_frozenlake))

avg reward: 0.112
max reward: 1
min reward: 0
std reward: 0.3153664535108324


In [None]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and return the mean and std of the episode rewards.

    Every action is chosen with ``greedy_policy`` (no exploration).

    :param env: The evaluation environment
    :param max_steps: Maximum number of steps per episode
    :param n_eval_episodes: Number of episodes to evaluate the agent
    :param Q: The Q-table
    :param seed: The evaluation seed array (for taxi-v3), indexed by episode
        number; when falsy (e.g. ``None``) the environment is reset without a seed
    :return: ``(mean_reward, std_reward)`` over all evaluation episodes
    """
    episode_rewards = []
    for episode in tqdm(range(n_eval_episodes)):
        if seed:
            state, info = env.reset(seed=seed[episode])
        else:
            state, info = env.reset()
        total_rewards_ep = 0

        for step in range(max_steps):
            # Take the action (index) that has the maximum expected future reward given that state.
            # greedy_policy expects (state, q_table): the state comes first.
            action = greedy_policy(state, Q)
            new_state, reward, terminated, truncated, info = env.step(action)
            total_rewards_ep += reward

            if terminated or truncated:
                break
            state = new_state
        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

n_eval_episodes: 100
rewards_frozenlake sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
rewards (first 20): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
mean/min/max: 1.0 1 1


In [1]:
# Evaluate our Agent: run n_eval_episodes episodes and report the
# mean and standard deviation of the per-episode reward.
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, q_table_frozenlake, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

NameError: name 'evaluate_agent' is not defined

In [34]:
import pickle

# Bundle the trained Q-table with the settings used to train and evaluate it,
# so the agent can be saved and reloaded as a single object.
model = {
    "qtable": q_table_frozenlake,
    # Reuse the env_id constant from the hyperparameter cell instead of repeating the literal.
    "env_id": env_id,
    "max_steps": max_steps,
    "n_eval_episodes": n_eval_episodes,
    "eval_seed": eval_seed,
    "gamma": gamma,
    "learning_rate": learning_rate,
}
