In [1]:
import gymnasium as gym
import pandas as pd
import numpy as np
import random
import time
from IPython.display import clear_output

In [2]:
# Training environment: the 4x4 FrozenLake map with slippery ice enabled.
# render_mode="ansi" makes render() produce text instead of opening a window.
# For a deterministic, windowed variant use is_slippery=False and render_mode="human".
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=True,
    render_mode="ansi",
)

In [3]:
# --- Hyperparameters read by the Q-learning and deployment cells ---
num_episodes = 10000          # number of training episodes
max_steps_per_episode = 100   # cap on steps taken within a single episode
learning_rate = 0.1           # step size of the Q-value update
discount_rate = 0.99          # weight applied to the next state's best Q-value
epsilon = 0.2                 # a uniform draw <= epsilon triggers a random action

# --- Constants not referenced by any cell visible in this notebook ---
# NOTE(review): EPSILON / ALPHA / GAMMA overlap with epsilon / learning_rate /
# discount_rate above and partly disagree in value (0.9 vs 0.2, 0.9 vs 0.99);
# confirm which set is intended before relying on them.
THETA = 1e-8            # convergence threshold
N_STATES = 16           # number of cells in the 4x4 grid
# NOTE(review): this order differs from FrozenLake's documented action encoding
# (0=left, 1=down, 2=right, 3=up); do not use it to label Q-table columns.
ACTIONS = ['up', 'down', 'left', 'right']
EPSILON = 0.9           # greedy-policy parameter
ALPHA = 0.1             # learning rate
GAMMA = 0.9             # discount factor
MAX_EPISODES = 50       # maximum episodes
FRESH_TIME = 0.3        # pause per move, intended for time.sleep

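Using Q-Learning: the cells below build a table of Q-values (one row per state, one column per action) and train it with an epsilon-greedy policy.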

In [4]:
# Table dimensions are taken from the environment's observation and action spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# One row per state, one column per action; every entry starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads:  env, q_table, num_episodes, max_steps_per_episode, epsilon,
#         learning_rate, discount_rate.
# Writes: q_table (updated in place), then prints it and closes env.
for episode in range(num_episodes):
    # reset() returns (observation, info); only the observation is used here.
    state, info = env.reset()

    for step in range(max_steps_per_episode):
        # Epsilon-greedy: exploit the current estimates when the uniform draw
        # exceeds epsilon, otherwise sample a random action.
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # A terminated transition has no future return, so it must not bootstrap
        # from the next state's Q-values. A merely truncated transition (time
        # limit) still bootstraps, since the task itself has not ended.
        next_value = 0.0 if terminated else np.max(q_table[new_state, :])
        td_target = reward + discount_rate * next_value

        # Move Q(s, a) towards the TD target by a fraction learning_rate.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
                                learning_rate * td_target

        state = new_state

        if done:
            break

# Print updated Q-table
print("\n\n********Q-table********\n")
print(q_table)
env.close()



********Q-table********

[[0.56158702 0.54039618 0.54875347 0.5320196 ]
 [0.42597189 0.33800784 0.30264525 0.51533354]
 [0.45234214 0.43637341 0.43324064 0.46686914]
 [0.32572237 0.30345231 0.19633274 0.44792198]
 [0.58695087 0.39937495 0.40472511 0.44515438]
 [0.         0.         0.         0.        ]
 [0.36495413 0.16749159 0.2899012  0.03966152]
 [0.         0.         0.         0.        ]
 [0.21154067 0.40289163 0.25186766 0.61547727]
 [0.48661555 0.6898726  0.45925472 0.38634512]
 [0.61291206 0.47149277 0.3548813  0.29258604]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.4883374  0.55881689 0.76684467 0.47399651]
 [0.72340175 0.8608417  0.81567354 0.80430538]
 [0.         0.         0.         0.        ]]


In [6]:
# Recreate the environment for playback: same 4x4 slippery map, but with
# render_mode="human" instead of the "ansi" mode used by the earlier env.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=True,
    render_mode="human",
)

In [7]:
# Deployment: play one rendered episode, always taking the highest-valued
# action from q_table, and record every chosen action in actionLog.
actionLog = []
for episode in range(1):
    # reset() returns (observation, info); only the observation is used here.
    state, info = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Purely greedy: no exploration during deployment.
        action = np.argmax(q_table[state,:])
        actionLog.append(action)
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        if done:
            clear_output(wait=True)
            env.render()
            # Tell the three ways an episode can end apart; a time-limit
            # truncation is neither a success nor a fall into a hole.
            if reward == 1:
                print("****You reached the goal!****")
            elif terminated:
                print("****You fell through a hole!****")
            else:
                print("****Episode was truncated before reaching the goal or a hole.****")
            time.sleep(3)
            clear_output(wait=True)
            break

        state = new_state
    else:
        # The step budget ran out without the environment ending the episode.
        print("****Step limit reached before the episode finished.****")

env.close()

****You reached the goal!****


In [8]:
# Number of actions recorded in actionLog (one is appended per step in the deployment cell).
len(actionLog)

25