In [1]:
# imports
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import random

# Setting up the environment for training (no rendering to speed up)
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode=None)
action_size = env.action_space.n
state_size = env.observation_space.n

# the Q-table
qtable = np.zeros((state_size, action_size))
print("Initial Q-table:")
print(qtable)

# Hyperparameters
total_episodes = 10000        # Increased episodes for more exploration
max_steps = 100               # Max steps per episode
learning_rate = 0.5           # Learning rate for faster convergence
gamma = 0.99                  # Discounting rate
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.001            # Slower decay to ensure sufficient exploration

# Training
rewards = []
success_count = 0  # Track goal reaches

# Training loop
for episode in range(total_episodes):
    state, info = env.reset()
    done = False
    total_rewards = 0
    print(f"Episode {episode + 1}")

    for step in range(max_steps):
        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(qtable[state, :])  # Exploit

        # Take action
        new_state, reward, done, truncated, info = env.step(action)

        # Optional reward shaping (uncomment to use)
        # if not (done or truncated):  # Small negative reward for non-terminal steps
        #     reward = -0.01
        # elif new_state == 15:  # Ensure goal reward is 1.0
        #     reward = 1.0

        # Update Q-table
        qtable[state, action] = qtable[state, action] + learning_rate * (
            reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action]
        )

        total_rewards += reward
        state = new_state

        # Debug print every 100 episodes
        if episode % 100 == 0:
            print(f"Step {step}: State {state}, Action {action}, New State {new_state}, Reward {reward}")

        if done or truncated:
            if reward == 1.0:
                success_count += 1
                print(f"Goal reached in episode {episode + 1}!")
            print(f"Episode {episode + 1} terminated. State: {state}, Reward: {reward}, Done: {done}, Truncated: {truncated}")
            break

    # Update epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

# Print training results
print("\nTraining completed.")
print(f"Success rate: {success_count / total_episodes:.3f} ({success_count} successful episodes)")
print(f"Score over time (average reward): {sum(rewards) / total_episodes:.3f}")
print("Final Q-table:")
print(qtable)

# Save Q-table for later use
np.save("qtable.npy", qtable)

# Evaluation with rendering
env.close()  # Close training environment
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="human")
eval_max_steps = 100

for episode in range(20):
    state, info = env.reset()
    done = False
    print("\n****************************************************")
    print(f"EVALUATION EPISODE {episode + 1}")

    for step in range(eval_max_steps):
        action = np.argmax(qtable[state, :])
        new_state, reward, done, truncated, info = env.step(action)
        print(f"Step {step}: State {state}, Action {action}, New State {new_state}, Reward {reward}")

        if done or truncated:
            env.render()
            print(f"Number of steps: {step + 1}")
            print(f"Reward: {reward}")
            break
        state = new_state
    else:
        print(f"Episode did not terminate within {eval_max_steps} steps.")
        print(f"Final state: {state}")

env.close()

Initial Q-table:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 1
Step 0: State 4, Action 1, New State 4, Reward 0.0
Step 1: State 4, Action 0, New State 4, Reward 0.0
Step 2: State 4, Action 0, New State 4, Reward 0.0
Step 3: State 4, Action 0, New State 4, Reward 0.0
Step 4: State 8, Action 1, New State 8, Reward 0.0
Step 5: State 9, Action 2, New State 9, Reward 0.0
Step 6: State 13, Action 1, New State 13, Reward 0.0
Step 7: State 13, Action 1, New State 13, Reward 0.0
Step 8: State 12, Action 0, New State 12, Reward 0.0
Episode 1 terminated. State: 12, Reward: 0.0, Done: True, Truncated: False
Episode 2
Episode 2 terminated. State: 5, Reward: 0.0, Done: True, Truncated: False
Episode 3
Episode 3 terminated. State: 5, Reward: 0.0, Done: True, Truncated: False
Episode 4
Episode 4 term