<a href="https://colab.research.google.com/github/mostafadentist/python-ipynb/blob/main/Markov_Decision_Processes_(MDP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import numpy as np

# Two-state, two-action machine-maintenance MDP.
#   States : 0 = Good, 1 = Bad
#   Actions: 0 = Do nothing, 1 = Repair

# P[a] is the transition matrix under action a: row s holds the
# probabilities of the next state given the current state s.
P = {
    0: np.array([      # "do nothing"
        [0.7, 0.3],
        [0.4, 0.6],
    ]),
    1: np.array([      # "repair"
        [0.9, 0.1],
        [0.6, 0.4],
    ]),
}

# R[(s, a)] is the immediate reward for taking action a in state s.
R = {
    (0, 0): 5,
    (1, 0): 1,
    (0, 1): 3,
    (1, 1): 2,
}

In [55]:
def value_iteration(P, R, states, actions, gamma=0.9, theta=1e-6, max_iter=None):
    """Solve a discounted MDP with in-place value iteration.

    Each sweep visits the states in order and overwrites ``V[s]`` immediately,
    so states later in the sweep already see the updated values of earlier
    states.

    Parameters
    ----------
    P : mapping
        ``P[a][s]`` is the vector of next-state probabilities when action
        ``a`` is taken in state ``s``.
    R : mapping
        ``R[(s, a)]`` is the immediate reward for action ``a`` in state ``s``.
    states : sequence of int
        State indices. They index directly into the value array, so they must
        be ``0 .. len(states) - 1``.
    actions : iterable of int
        Action labels (keys of ``P``). They may be listed in any order.
    gamma : float, optional
        Discount factor.
    theta : float, optional
        Stop once the largest change of any ``V[s]`` in a sweep is below this.
    max_iter : int or None, optional
        Upper bound on the number of sweeps. ``None`` (the default) iterates
        until the ``theta`` criterion is met, which may never happen if the
        values do not converge (e.g. ``gamma >= 1`` with non-zero rewards).

    Returns
    -------
    V : numpy.ndarray
        Value estimate per state.
    policy : numpy.ndarray of int
        For every state, the action *label* that was greedy when that state
        was last updated.
    """
    actions = list(actions)  # need positional access to map argmax -> label
    V = np.zeros(len(states))
    policy = np.zeros(len(states), dtype=int)

    sweeps = 0
    while max_iter is None or sweeps < max_iter:
        sweeps += 1
        delta = 0
        for s in states:
            action_values = [
                R[(s, a)] + gamma * np.dot(P[a][s], V) for a in actions
            ]
            best_value = max(action_values)
            delta = max(delta, abs(best_value - V[s]))
            V[s] = best_value
            # np.argmax yields a position in `actions`; store the label itself
            # so the policy is correct even when actions != [0, 1, ..., k-1].
            policy[s] = actions[int(np.argmax(action_values))]
        if delta < theta:
            break
    return V, policy

# Index sets for the two states and two actions; later cells reuse them.
states = [0, 1]
actions = [0, 1]

# Run value iteration with its default discount factor and tolerance.
V, policy = value_iteration(P, R, states=states, actions=actions)
print("Optimal Values:", V)
print("Optimal Policy:", policy)

Optimal Values: [41.09889406 37.80219136]
Optimal Policy: [0 1]


In [56]:
def policy_iteration(P, R, states, actions, gamma=0.9, max_iter=100):
    """Solve a discounted MDP with policy iteration.

    Alternates exact policy evaluation (solving the linear system
    ``(I - gamma * P_pi) V = R_pi``) with greedy policy improvement until the
    policy stops changing or ``max_iter`` rounds have been performed.

    Parameters
    ----------
    P : mapping
        ``P[a][s]`` is the vector of next-state probabilities when action
        ``a`` is taken in state ``s``.
    R : mapping
        ``R[(s, a)]`` is the immediate reward for action ``a`` in state ``s``.
    states : sequence of int
        State indices. They index directly into the arrays, so they must be
        ``0 .. len(states) - 1``.
    actions : iterable of int
        Action labels (keys of ``P``). They may be listed in any order; the
        first one is used as the initial policy for every state.
    gamma : float, optional
        Discount factor.
    max_iter : int, optional
        Maximum number of evaluate/improve rounds.

    Returns
    -------
    V : numpy.ndarray
        Values from the most recent policy evaluation. If the loop stopped
        because the policy was stable, these are the values of the returned
        policy; if ``max_iter`` ran out first, they belong to the policy from
        before the last improvement step.
    policy : numpy.ndarray of int
        Action label chosen in each state.

    Raises
    ------
    numpy.linalg.LinAlgError
        Propagated from ``np.linalg.solve`` if the evaluation system is
        singular.
    """
    actions = list(actions)  # need positional access to map argmax -> label
    n_states = len(states)
    # Start from a label that is guaranteed to be a valid action, rather than
    # assuming that an action named 0 exists.
    policy = np.full(n_states, actions[0], dtype=int)
    V = np.zeros(n_states)

    for _ in range(max_iter):
        # Policy evaluation: build (I - gamma * P_pi) and R_pi row by row.
        A = np.eye(n_states)
        b = np.zeros(n_states)
        for s in states:
            a = policy[s]
            A[s] -= gamma * P[a][s]
            b[s] = R[(s, a)]
        V = np.linalg.solve(A, b)

        # Policy improvement: act greedily with respect to the fresh values.
        stable = True
        for s in states:
            action_values = [
                R[(s, a)] + gamma * np.dot(P[a][s], V) for a in actions
            ]
            # np.argmax yields a position in `actions`; translate it to the
            # action label before comparing with / storing into the policy.
            best_action = actions[int(np.argmax(action_values))]
            if best_action != policy[s]:
                policy[s] = best_action
                stable = False
        if stable:
            break
    return V, policy

# Solve the same MDP with policy iteration (default gamma and max_iter).
V, policy = policy_iteration(P, R, states=states, actions=actions)
print("Optimal Values (Policy Iteration):", V)
print("Optimal Policy:", policy)

Optimal Values (Policy Iteration): [41.0989011 37.8021978]
Optimal Policy: [0 1]


In [57]:
T = 3  # horizon
gamma = 0.9

# V[t, s] is the best discounted reward collectable from stage t onward when
# starting in state s; the extra row V[T] stays at zero.
V = np.zeros((T + 1, len(states)))
# policy[t, s] is the maximizing action at stage t in state s.
policy = np.zeros((T, len(states)), dtype=int)

# Backward induction: stage t only depends on the values of stage t + 1.
for t in range(T - 1, -1, -1):
    for s in states:
        action_values = [
            R[(s, a)] + gamma * np.dot(P[a][s], V[t + 1]) for a in actions
        ]
        policy[t, s] = np.argmax(action_values)
        V[t, s] = max(action_values)

print("Finite Horizon Values:\n", V)
print("Finite Horizon Policy:\n", policy)

Finite Horizon Values:
 [[11.9381  8.6438]
 [ 8.69    5.42  ]
 [ 5.      2.    ]
 [ 0.      0.    ]]
Finite Horizon Policy:
 [[0 1]
 [0 1]
 [0 1]]
