Value Iteration 2 - Hung

Importing libraries and defining the MDP parameters

In [1]:
import numpy as np

# MDP definition: state space, action space, and discount factor.
states = [
    'S0',
    'S1',
    'S2',
]
actions = [
    'a0',
    'a1',
    'a2',
]
# Discount factor applied to the value of the next state in each backup
gamma = 0.9

Transition probabilities and rewards (based on the diagram)

In [2]:
# Transition model of the MDP (based on the diagram).
# Layout: transitions[state][action][next_state] = (probability, reward)
# An action that is missing under a state is not available in that state.
transitions = {
    'S0': {
        # Reward +10 for S0->S0, 0 for S0->S1
        'a0': {
            'S0': (0.7, 10),
            'S1': (0.3, 0),
        },
        # Reward +40 for S0->S1
        'a1': {
            'S1': (1.0, 40),
        },
        # No explicit reward shown, assuming 0
        'a2': {
            'S1': (0.8, 0),
            'S2': (0.2, 0),
        },
    },
    'S1': {
        # Reward -50 for S1->S2; the other two outcomes carry reward 0
        'a1': {
            'S1': (0.1, 0),
            'S2': (0.8, -50),
            'S0': (0.1, 0),
        },
        # No explicit reward shown, assuming 0
        'a2': {
            'S2': (1.0, 0),
        },
    },
    # Terminal state: no actions, so its value stays at 0
    'S2': {},
}

Value iteration

In [3]:
# Value iteration: repeatedly apply the Bellman optimality backup
#   V(s) <- max_a sum_{s'} P(s'|s,a) * (R(s,a,s') + gamma * V(s'))
# to every state until the largest per-state change drops below `theta`.

# Initialize value function V(s) to zeros
V = {state: 0.0 for state in states}

# Stopping criteria
max_iterations = 1000  # Hard cap on the number of sweeps
theta = 0.0001  # Convergence threshold on the largest per-state change

for i in range(max_iterations):
    new_V = V.copy()
    for state in states:
        # One-step lookahead value of every action available in this state.
        # All backups read the previous sweep's V (synchronous update).
        action_values = [
            sum(
                prob * (reward + gamma * V[next_state])
                for next_state, (prob, reward) in transitions[state][action].items()
            )
            for action in actions
            if action in transitions.get(state, {})
        ]

        # States with no available actions (terminal states such as S2)
        # keep their current value instead of being special-cased by name.
        if action_values:
            new_V[state] = max(action_values)

    # Largest change produced by this sweep (0.0 if there are no states)
    delta = max((abs(new_V[state] - V[state]) for state in states), default=0.0)

    # Commit the sweep BEFORE testing for convergence so that V always holds
    # the most recent estimate, including the final sweep.
    V = new_V

    if delta < theta:
        print(f"Converged after {i + 1} iterations")
        break
else:
    # Loop ended without `break`: the values below are not converged.
    print(f"Warning: no convergence within {max_iterations} iterations (theta={theta})")

# Print the optimal value function
print("Optimal Value Function:")
for state, value in V.items():
    print(f"V({state}) = {value:.2f}")

Converged after 2 iterations
Optimal Value Function:
V(S0) = 40.00
V(S1) = 0.00
V(S2) = 0.00


Derive the optimal policy

In [4]:
# Extract the greedy policy from V: in each state, pick the action whose
# one-step lookahead value is largest.
policy = {}
for s in states:
    # The terminal state S2 gets no action; otherwise consider the actions
    # defined for this state, in `actions` order.
    candidates = [] if s == 'S2' else [a for a in actions if a in transitions[s]]

    # Expected return of each candidate: sum over outcomes of
    # probability * (reward + discounted value of the next state)
    q_values = {
        a: sum(
            p * (r + gamma * V[s_next])
            for s_next, (p, r) in transitions[s][a].items()
        )
        for a in candidates
    }

    # max() returns the first maximal key, so ties go to the earliest action;
    # a state without candidates maps to None.
    policy[s] = max(q_values, key=q_values.get) if q_values else None

# Print the optimal policy
print("\nOptimal Policy:")
for state, action in policy.items():
    print(f"π({state}) = {action}")


Optimal Policy:
π(S0) = a1
π(S1) = a2
π(S2) = None
