In [1]:
import numpy as np

# Toy four-state MDP with two actions available in every state.
states = list(range(4))
actions = ["a", "b"]

# transition_prob[state][action] is a list of (probability, next_state, reward)
# triples. Every action here has a single outcome with probability 1.0.
transition_prob = {
    0: {
        "a": [(1.0, 1, 0)],
        "b": [(1.0, 2, 0)],
    },
    1: {
        "a": [(1.0, 3, 1)],
        "b": [(1.0, 0, 0)],
    },
    2: {
        "a": [(1.0, 0, 0)],
        "b": [(1.0, 3, 1)],
    },
    # State 3 loops back to itself with zero reward under both actions.
    3: {
        "a": [(1.0, 3, 0)],
        "b": [(1.0, 3, 0)],
    },
}

In [2]:
# Discount factor handed to both solvers; it scales the value of the next
# state inside every backup.
disc_fact = 0.9

def value_iteration(states, actions, transition_prob, disc_fact, theta=1e-6):
    """Solve a tabular MDP by value iteration and read off a greedy policy.

    States are addressed by position: the solver walks the indices
    ``0 .. len(states) - 1`` and uses each index both as a key into
    ``transition_prob`` and as a position in the value array.

    Args:
        states: Sized collection of states; only its length is used.
        actions: Action labels, iterated afresh for every state.
        transition_prob: ``transition_prob[s][a]`` yields
            ``(probability, next_state, reward)`` triples.
        disc_fact: Discount factor multiplying the next state's value.
        theta: Sweeps stop once the largest per-state change within one
            sweep drops below this threshold.

    Returns:
        ``(v, policy)`` where ``v`` is a NumPy array of state values and
        ``policy`` maps each state index to the first action (in ``actions``
        order) with the highest backed-up value.
    """
    n_states = len(states)
    values = np.zeros(n_states)

    def backup(state, action):
        # Expected one-step return of `action` in `state` under the current values.
        return sum(
            p * (r + disc_fact * values[nxt])
            for p, nxt, r in transition_prob[state][action]
        )

    converged = False
    while not converged:
        largest_change = 0
        for state in range(n_states):
            previous = values[state]
            # Values are overwritten in place, so states later in the sweep
            # already see this update. The -inf seed is the running-max start.
            values[state] = max(
                [float("-inf"), *(backup(state, a) for a in actions)]
            )
            largest_change = max(largest_change, abs(previous - values[state]))
        converged = largest_change < theta

    # Greedy policy extraction from the final values.
    policy = {}
    for state in range(n_states):
        scores = {a: backup(state, a) for a in actions}
        policy[state] = max(scores, key=scores.get)
    return values, policy

In [3]:
def policy_iteration(states, actions, transition_prob, disc_fact, theta=1e-6):
    """Solve a tabular MDP by policy iteration.

    Starting from a random policy, the function alternates between
    evaluating the current policy and making it greedy with respect to the
    resulting values, until an improvement pass changes no action.

    States are addressed by position: the indices ``0 .. len(states) - 1``
    are used both as keys into ``transition_prob`` and as positions in the
    value array.

    Args:
        states: Sized collection of states; only its length is used.
        actions: Indexable sequence of action labels. Labels are used as
            given (strings, ints, tuples, ...) and must be valid keys of
            ``transition_prob[s]``.
        transition_prob: ``transition_prob[s][a]`` yields
            ``(probability, next_state, reward)`` triples.
        disc_fact: Discount factor multiplying the next state's value.
        theta: An evaluation phase stops once the largest per-state change
            within one sweep drops below this threshold.

    Returns:
        ``(v, policy)`` where ``v`` is a NumPy array of state values and
        ``policy`` maps each state index to its selected action. Ties go to
        the action that appears first in ``actions``.

    Raises:
        ValueError: If there is at least one state but ``actions`` is empty.
    """
    n_states = len(states)
    if n_states and len(actions) == 0:
        raise ValueError("actions must contain at least one action")

    v = np.zeros(n_states)

    def action_value(s, a):
        # Expected one-step return of action `a` in state `s` under current `v`.
        return sum(
            prob * (reward + disc_fact * v[next_state])
            for prob, next_state, reward in transition_prob[s][a]
        )

    # Draw a random *index* instead of calling np.random.choice(actions):
    # choice() first converts `actions` to a NumPy array, which rejects tuple
    # labels ("a must be 1-dimensional") and coerces mixed-type labels to
    # strings that no longer match the keys of `transition_prob`.
    policy = {
        s: actions[np.random.randint(len(actions))] for s in range(n_states)
    }

    while True:
        # Policy evaluation: sweep until the values stop moving. Values are
        # updated in place, so later states in a sweep see fresh estimates.
        while True:
            delta = 0
            for s in range(n_states):
                old_v = v[s]
                v[s] = action_value(s, policy[s])
                delta = max(delta, abs(old_v - v[s]))
            if delta < theta:
                break

        # Policy improvement: act greedily with respect to the evaluated values.
        policy_stable = True
        for s in range(n_states):
            old_action = policy[s]
            action_values = {a: action_value(s, a) for a in actions}
            policy[s] = max(action_values, key=action_values.get)
            if old_action != policy[s]:
                policy_stable = False

        if policy_stable:
            return v, policy

In [4]:
# Solve the MDP with value iteration and report what it found.
V_vi, policy_vi = value_iteration(
    states=states,
    actions=actions,
    transition_prob=transition_prob,
    disc_fact=disc_fact,
)
print("Value Iteration Results:")
print("Optimal Values:", V_vi)
print("Optimal Policy:", policy_vi)

Value Iteration Results:
Optimal Values: [0.9 1.  1.  0. ]
Optimal Policy: {0: 'a', 1: 'a', 2: 'b', 3: 'a'}


In [5]:
# Solve the same MDP with policy iteration and report what it found.
V_pi, policy_pi = policy_iteration(
    states=states,
    actions=actions,
    transition_prob=transition_prob,
    disc_fact=disc_fact,
)
print("Policy Iteration Results:")
print("Optimal Values:", V_pi)
print("Optimal Policy:", policy_pi)

Policy Iteration Results:
Optimal Values: [0.9 1.  1.  0. ]
Optimal Policy: {0: 'a', 1: 'a', 2: 'b', 3: 'a'}
