In [1]:
import gym
import numpy as np

In [2]:
def value_iteration(env, gamma=0.99, theta=1e-6):
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = V[s]
            q_values = [
                sum([prob * (r + gamma * V[s_]) for prob, s_, r, _ in env.P[s][a]])
                for a in range(env.nA)
            ]
            V[s] = max(q_values)
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        q_values = [
            sum([prob * (r + gamma * V[s_]) for prob, s_, r, _ in env.P[s][a]])
            for a in range(env.nA)
        ]
        best_action = np.argmax(q_values)
        policy[s, best_action] = 1
    return policy, V

In [4]:
def policy_iteration(env, gamma=0.99, theta=1e-6):
    policy = np.ones([env.nS, env.nA]) / env.nA
    V = np.zeros(env.nS)
    while True:
        while True:
            delta = 0
            for s in range(env.nS):
                v = V[s]
                V[s] = sum([
                    policy[s, a] * sum([prob * (r + gamma * V[s_]) for prob, s_, r, _ in env.P[s][a]])
                    for a in range(env.nA)
                ])
                delta = max(delta, abs(v - V[s]))
            if delta < theta:
                break

        stable = True
        for s in range(env.nS):
            old_action = np.argmax(policy[s])
            q_values = [
                sum([prob * (r + gamma * V[s_]) for prob, s_, r, _ in env.P[s][a]])
                for a in range(env.nA)
            ]
            best_action = np.argmax(q_values)
            policy[s] = np.eye(env.nA)[best_action]
            if old_action != best_action:
                stable = False
        if stable:
            break
    return policy, V

In [5]:
env = gym.make("CliffWalking-v0")

policy_vi, V_vi = value_iteration(env)
policy_pi, V_pi = policy_iteration(env)

  deprecation(
  deprecation(


In [6]:
print("Optimal Policy from Value Iteration:")
print(policy_vi.reshape((4, 12, 4)))  # 4 rows x 12 columns x 4 actions

print("\nOptimal Policy from Policy Iteration:")
print(policy_pi.reshape((4, 12, 4)))

Optimal Policy from Value Iteration:
[[[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]]

Optimal Policy from Policy Iteration:
[[[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1