In [1]:
import numpy as np

def value_iteration_from_env(env, gamma=0.99, theta=1e-8, max_iters=10_000):
    """
    Run in-place value iteration on a tabular environment.

    Args:
        env: Environment exposing ``env.unwrapped.P`` where ``P[s][a]`` is an
            iterable of ``(prob, s_next, reward, done)`` tuples, together with
            ``env.observation_space.n`` and ``env.action_space.n``.
        gamma: Discount factor applied to the value of the successor state.
        theta: Convergence threshold; iteration stops after the first sweep
            whose largest per-state change is below this value.
        max_iters: Maximum number of sweeps over the state space.

    Returns:
        A float64 ``np.ndarray`` of length ``env.observation_space.n`` holding
        the state values. Only the value function is returned, no policy.
    """
    transitions = env.unwrapped.P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # All state values start at zero.
    V = np.zeros(n_states, dtype=np.float64)

    def bellman_backup(state):
        """Return max_a sum_{s'} p(s'|s,a) [r + gamma * V(s')] for the current V."""
        top = -np.inf
        for action in range(n_actions):
            total = 0.0
            for prob, nxt, reward, terminal in transitions[state][action]:
                # Transitions flagged as done contribute no bootstrapped value.
                bootstrap = 0.0 if terminal else gamma * V[nxt]
                total += prob * (reward + bootstrap)
            if total > top:
                top = total
        return top

    for _sweep in range(max_iters):
        largest_change = 0.0
        for state in range(n_states):
            previous = V[state]
            # Updated in place, so later states in this sweep see the new value.
            V[state] = bellman_backup(state)
            change = abs(previous - V[state])
            if change > largest_change:
                largest_change = change

        # Converged: no state moved by theta or more during this sweep.
        if largest_change < theta:
            return V

    # Sweep budget exhausted; return the current estimate.
    return V


In [2]:
import gymnasium as gym
# Build the 4x4 FrozenLake map with slipping disabled (is_slippery=False).
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)
# Solve for the state values using the function's defaults (gamma=0.99, theta=1e-8).
V = value_iteration_from_env(env)
# One value per state, printed as a flat array.
print(V)


[0.95099005 0.96059601 0.970299   0.96059601 0.96059601 0.
 0.9801     0.         0.970299   0.9801     0.99       0.
 0.         0.99       1.         0.        ]


In [4]:
import numpy as np

# Gymnasium action order for FrozenLake:
# 0 = Left, 1 = Down, 2 = Right, 3 = Up

def extract_policy_from_env(env, V, gamma=0.99):
    """
    Derive the greedy policy for a value function via one-step lookahead.

    Args:
        env: Environment exposing ``env.unwrapped.P`` where ``P[s][a]`` is an
            iterable of ``(prob, s_next, reward, done)`` tuples, together with
            ``env.observation_space.n`` and ``env.action_space.n``.
        V: State values, indexable by state.
        gamma: Discount factor applied to the value of the successor state.

    Returns:
        An integer ``np.ndarray`` with one action index per state. Ties go to
        the lowest action index, as with ``np.argmax``.
    """
    transitions = env.unwrapped.P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    def action_value(state, action):
        """Return sum_{s'} p(s'|s,a) [r + gamma * V(s')] for one state-action pair."""
        # float64 accumulator, matching accumulation into a float64 array slot.
        total = np.float64(0.0)
        for prob, nxt, reward, terminal in transitions[state][action]:
            # Transitions flagged as done contribute no bootstrapped value.
            bootstrap = 0.0 if terminal else gamma * V[nxt]
            total += prob * (reward + bootstrap)
        return total

    pi = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        q = np.zeros(n_actions, dtype=float)
        for action in range(n_actions):
            q[action] = action_value(state, action)
        pi[state] = int(np.argmax(q))
    return pi


In [5]:
# Greedy policy with respect to the value function V computed earlier.
pi = extract_policy_from_env(env, V)

# Optional: print the policy as arrows laid out on the map grid.
# Keys follow the action encoding 0 = Left, 1 = Down, 2 = Right, 3 = Up.
arrow = {0:"←", 1:"↓", 2:"→", 3:"↑"}
# Side length of the grid; assumes the states form a square map.
n = int(np.sqrt(env.observation_space.n))
# Map layout converted to a string array; arrows are written into this copy.
grid = env.unwrapped.desc.astype(str).copy()
for s,a in enumerate(pi):
    # Convert the flat state index into (row, column), row-major.
    r,c = divmod(s, n)
    # Cells marked H, G or S keep their map letter (presumably hole, goal and
    # start tiles); every other cell shows its greedy action.
    if grid[r,c] not in ("H","G","S"):
        grid[r,c] = arrow[a]
print("\nPolicy als Pfeile:")
# Print one grid row per line, cells separated by spaces.
for r in range(n):
    print(" ".join(grid[r]))



Policy als Pfeile:
S → ↓ ←
↓ H ↓ H
→ ↓ ↓ H
H → → G
