In [34]:
import numpy as np
import gymnasium as gym

# Create the FrozenLake environment (8x8 grid, slippery surface)
env = gym.make("FrozenLake-v1", is_slippery=True, map_name="8x8")


In [35]:
# Hyperparameters
#   gamma          - discount factor
#   theta          - convergence threshold
#   max_iterations - optional iteration limit
gamma, theta, max_iterations = 0.99, 1e-8, 1_000


In [36]:
def value_iteration(env, gamma=0.99, theta=1e-8, max_iterations=1000):
    """
    Perform value iteration to compute the optimal state-value function.

    Repeatedly sweeps over every state, replacing its value in place with
    the best one-step lookahead value, until the largest change seen in a
    sweep drops below ``theta`` or ``max_iterations`` sweeps have run.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table
            ``env.unwrapped.P[state][action]`` that yields
            ``(prob, next_state, reward, done)`` tuples.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold on the largest per-sweep value change.
        max_iterations: Maximum number of sweeps to perform. Taken as an
            explicit argument (rather than read from a global) so the
            function is self-contained.

    Returns:
        np.ndarray of length ``env.observation_space.n`` holding the
        estimated value of each state.
    """
    n_states = env.observation_space.n
    value_table = np.zeros(n_states)

    for i in range(max_iterations):
        delta = 0

        for state in range(n_states):
            old_value = value_table[state]

            action_values = []
            for action in range(env.action_space.n):
                value = 0
                for prob, next_state, reward, done in env.unwrapped.P[state][action]:
                    # (not done) zeroes the bootstrap term for transitions
                    # flagged as ending the episode.
                    value += prob * (reward + gamma * value_table[next_state] * (not done))
                action_values.append(value)

            # In-place update: later states in this sweep already see the new value.
            value_table[state] = max(action_values)
            delta = max(delta, abs(old_value - value_table[state]))

        if delta < theta:
            print(f"Converged in {i+1} iterations.")
            break
    else:
        # Loop ran out without hitting the threshold; say so instead of
        # silently returning a possibly inaccurate table.
        print(f"Did not converge within {max_iterations} iterations.")

    return value_table

In [37]:
def extract_policy(env, value_table, gamma=0.99):
    """
    Build the greedy policy implied by a state-value table.

    Every action in every state is scored by its expected one-step return
    under the transition table ``env.unwrapped.P``; the index of the
    best-scoring action is stored for that state.

    Returns an integer array with one action index per state.
    """

    def expected_return(s, a):
        # Probability-weighted reward plus discounted successor value; the
        # successor term is dropped for transitions flagged as done.
        return sum(
            p * (r + gamma * value_table[s_next] * (not terminal))
            for p, s_next, r, terminal in env.unwrapped.P[s][a]
        )

    n_states = env.observation_space.n
    policy = np.zeros(n_states, dtype=int)

    for s in range(n_states):
        scores = [expected_return(s, a) for a in range(env.action_space.n)]
        policy[s] = np.argmax(scores)

    return policy

In [38]:
# Solve for the state values using the configured discount factor and tolerance
optimal_value_table = value_iteration(env, gamma=gamma, theta=theta)

Converged in 347 iterations.


In [39]:
# Greedy policy with respect to the converged value table
optimal_policy = extract_policy(env, value_table=optimal_value_table, gamma=gamma)

In [40]:
# Show the state values laid out as the 8x8 grid
print("\nOptimal Value Function:", optimal_value_table.reshape(8, 8), sep="\n")



Optimal Value Function:
[[0.41464029 0.42720516 0.44614817 0.46832032 0.49244367 0.51656979
  0.53526148 0.54097518]
 [0.41168636 0.42120777 0.43749567 0.45838851 0.48324009 0.51353174
  0.54576783 0.55736838]
 [0.39675202 0.39384048 0.37549622 0.         0.42167796 0.49381917
  0.56121205 0.58585888]
 [0.36927222 0.35298248 0.30653119 0.20040369 0.30075272 0.
  0.56901586 0.62825901]
 [0.33266384 0.2913753  0.19730914 0.         0.28929024 0.36195179
  0.53481943 0.6896973 ]
 [0.30613619 0.         0.         0.08627638 0.21393258 0.27271393
  0.         0.77203551]
 [0.28888542 0.         0.05769637 0.04751101 0.         0.25052147
  0.         0.87776873]
 [0.28038877 0.20081497 0.12732648 0.         0.23959086 0.48644205
  0.7371033  0.        ]]


In [41]:
# Show the chosen action per cell, laid out as the 8x8 grid
print("\nOptimal Policy (0=Left, 1=Down, 2=Right, 3=Up):", optimal_policy.reshape(8, 8), sep="\n")


Optimal Policy (0=Left, 1=Down, 2=Right, 3=Up):
[[3 2 2 2 2 2 2 2]
 [3 3 3 3 3 2 2 1]
 [3 3 0 0 2 3 2 1]
 [3 3 3 1 0 0 2 2]
 [0 3 0 0 2 1 3 2]
 [0 0 0 1 3 0 0 2]
 [0 0 1 0 0 0 0 2]
 [0 1 0 0 1 2 1 0]]


In [42]:
# # For Colab

# def run_policy(env, policy, episodes=3):
#     """
#     Simulates the given policy in the FrozenLake environment.
#     Uses text rendering (ANSI) for each step.
#     """
#     # Re-create the environment with rendering enabled
#     env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="ansi", map_name="8x8")

#     for ep in range(episodes):
#         state, info = env.reset()
#         done = False
#         total_reward = 0
#         steps = 0

#         print(f"\n--- Episode {ep + 1} ---")

#         while not done:
#             action = policy[state]
#             next_state, reward, terminated, truncated, _ = env.step(action)
#             done = terminated or truncated
#             total_reward += reward
#             steps += 1

#             # Render text output of the environment after each action
#             print(env.render())

#             state = next_state

#         print(f"Episode finished in {steps} steps with reward: {total_reward}")


# # Simulate the optimal policy
# run_policy(env, optimal_policy, episodes=3)

In [43]:
import time

def run_policy(env, policy, episodes=3, delay=0.5):
    """
    Simulates the given policy in the FrozenLake environment.
    Uses graphical rendering (render_mode='human').

    Args:
        env: Accepted for interface compatibility but not used: the name is
            immediately rebound to a freshly created 8x8 slippery FrozenLake
            environment with ``render_mode='human'``.
        policy: Indexable by state; ``policy[state]`` is the action taken.
        episodes: Number of episodes to play.
        delay: Seconds to sleep after each step so the rendering can be followed.

    Returns:
        None. Per-episode step counts and rewards are printed.
    """
    # Create a new environment with graphical rendering
    env = gym.make("FrozenLake-v1", map_name="8x8", is_slippery=True, render_mode='human')

    try:
        for ep in range(episodes):
            state, info = env.reset()
            done = False
            total_reward = 0
            steps = 0

            print(f"\n--- Episode {ep + 1} ---")

            while not done:
                action = policy[state]
                state, reward, terminated, truncated, _ = env.step(action)
                # An episode ends on a terminal state or when the step limit truncates it.
                done = terminated or truncated
                total_reward += reward
                steps += 1

                # Add delay so you can visually follow each step
                time.sleep(delay)

            print(f"Episode finished in {steps} steps with reward: {total_reward}")
    finally:
        # Always release the render window, even if an episode raises or the
        # run is interrupted from the keyboard.
        env.close()


# Watch three rendered episodes of the greedy policy, pausing 0.75 s per step
run_policy(env=env, policy=optimal_policy, episodes=3, delay=0.75)


--- Episode 1 ---
Episode finished in 100 steps with reward: 0.0

--- Episode 2 ---
Episode finished in 100 steps with reward: 0.0

--- Episode 3 ---
Episode finished in 43 steps with reward: 1.0
