In [1]:
import numpy as np
import gymnasium as gym
import time

In [2]:
# Slippery FrozenLake with an on-screen renderer. Only the base (unwrapped)
# environment is kept, since its transition table `P` is what the planner reads.
env = gym.make("FrozenLake-v1", is_slippery=True, render_mode="human").unwrapped

# Sizes of the discrete state/action spaces and the discount factor.
num_states, num_actions = env.observation_space.n, env.action_space.n
gamma = 0.9

In [3]:
def value_iteration(env, gamma=0.9, theta=1e-6):
    """Compute state values and a greedy policy with in-place value iteration.

    Each sweep replaces V[s] with the maximum over actions of the expected
    one-step return, and stops once the largest change in a sweep is below
    ``theta``.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``P[state][action]`` that yields
            ``(prob, next_state, reward, done)`` tuples. When the object has an
            ``unwrapped`` attribute, these are read from the unwrapped env.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold on the largest per-sweep value change.

    Returns:
        Tuple ``(policy, V)``: an integer array holding the greedy action for
        each state, and a float array holding the value of each state.
    """
    # Read sizes and dynamics from the environment that was passed in, instead
    # of relying on notebook-level globals that may be stale or undefined.
    model = getattr(env, "unwrapped", env)
    n_states = model.observation_space.n
    n_actions = model.action_space.n
    transitions = model.P

    V = np.zeros(n_states)
    policy = np.zeros(n_states, dtype=int)
    while True:
        delta = 0
        for state in range(n_states):
            action_values = np.zeros(n_actions)
            for action in range(n_actions):
                for prob, next_state, reward, done in transitions[state][action]:
                    # Do not bootstrap from the next state when the transition ends the episode.
                    action_values[action] += prob * (reward + gamma * V[next_state] * (not done))
            best_action_value = np.max(action_values)
            delta = max(delta, np.abs(best_action_value - V[state]))
            V[state] = best_action_value
            policy[state] = np.argmax(action_values)
        if delta < theta:
            break
    return policy, V

In [4]:
def run_agent(env, policy, sleep_time=0.5):
    """Run one episode following ``policy``, printing every transition.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Indexable mapping from state to action.
        sleep_time: Seconds to pause after each render call.

    The environment is closed when the episode ends, including when an
    exception interrupts it.
    """
    try:
        state, _ = env.reset()
        done = False
        while not done:
            env.render()
            time.sleep(sleep_time)
            action = policy[state]
            next_state, reward, terminated, truncated, _ = env.step(action)
            print(f"State: {state} -> Action: {action} -> Next State: {next_state} -> Reward: {reward}")
            state = next_state
            # Either flag ends the episode; stepping past truncation is not valid.
            done = terminated or truncated
    finally:
        env.close()

In [5]:
# Plan with the discount factor from the configuration cell so that changing
# `gamma` there actually affects the result.
policy_vi, values_vi = value_iteration(env, gamma=gamma)

# Take the grid shape from the environment's map instead of hard-coding 4x4.
grid_shape = env.unwrapped.desc.shape

print("\nOptimal Policy (Value Iteration):")
print(policy_vi.reshape(grid_shape))

print("\nOptimal State Values (Value Iteration):")
print(values_vi.reshape(grid_shape))

print("\nRunning agent using Value Iteration policy...\n")
run_agent(env, policy_vi)


Optimal Policy (Value Iteration):
[[0 3 0 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]

Optimal State Values (Value Iteration):
[[0.06888624 0.06141117 0.07440763 0.05580502]
 [0.09185097 0.         0.11220727 0.        ]
 [0.14543392 0.24749561 0.29961676 0.        ]
 [0.         0.37993504 0.63901974 0.        ]]

Running agent using Value Iteration policy...

State: 0 -> Action: 0 -> Next State: 0 -> Reward: 0.0
State: 0 -> Action: 0 -> Next State: 0 -> Reward: 0.0
State: 0 -> Action: 0 -> Next State: 0 -> Reward: 0.0
State: 0 -> Action: 0 -> Next State: 4 -> Reward: 0.0
State: 4 -> Action: 0 -> Next State: 8 -> Reward: 0.0
State: 8 -> Action: 3 -> Next State: 4 -> Reward: 0.0
State: 4 -> Action: 0 -> Next State: 0 -> Reward: 0.0
State: 0 -> Action: 0 -> Next State: 4 -> Reward: 0.0
State: 4 -> Action: 0 -> Next State: 4 -> Reward: 0.0
State: 4 -> Action: 0 -> Next State: 8 -> Reward: 0.0
State: 8 -> Action: 3 -> Next State: 9 -> Reward: 0.0
State: 9 -> Action: 1 -> Next State: 13 -> Rewar