<a href="https://colab.research.google.com/github/nosadchiy/public/blob/main/PolicyIteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# ----------------------------
# PARAMETERS
# ----------------------------
K = 20.0         # fixed ordering cost (charged whenever the order quantity is > 0)
c = 2.0          # per-unit ordering cost
h = 1.0          # holding cost per unit left over
p = 20.0         # penalty cost per unit shortage
beta = 0.95      # discount factor

# State space: inventory levels 0, 1, ..., X_max
X_max = 50
states = np.arange(X_max + 1)

# Action space: possible order quantities 0, 1, ..., Q_max
Q_max = 50
actions = np.arange(Q_max + 1)

# Demand distribution: d in {0, 1, ..., d_max}
d_max = 10
d_vals = np.arange(d_max + 1)
P_d = np.array([0.005, 0.005, 0.01, 0.03, 0.05, 0.1, 0.15, 0.2, 0.2, 0.15, 0.1])

# Fail fast on a malformed demand distribution instead of silently computing
# expectations with the wrong weights.
if P_d.shape != d_vals.shape:
    raise ValueError("P_d must hold exactly one probability per demand value in d_vals.")
if np.any(P_d < 0) or not np.isclose(P_d.sum(), 1.0):
    raise ValueError("P_d must be non-negative and sum to 1.")

# ----------------------------
# POLICY ITERATION PARAMETERS
# ----------------------------
tol_policy_eval = 1e-4  # tolerance for policy evaluation convergence
max_policy_eval_iter = 1000
# Cap on outer iterations: policy evaluation is only approximate, so the greedy
# policy could in principle keep flipping between near-tied actions forever.
max_policy_iter = 100

# ----------------------------
# MODEL TABLES (computed once)
# ----------------------------
# Everything except the fixed/per-unit ordering cost depends on the state x and
# the order q only through the post-order inventory level y = x + q.
post_order_levels = np.arange(X_max + Q_max + 1)
net_inventory = post_order_levels[:, None] - d_vals[None, :]  # y - d, shape (n_y, n_d)
leftover = np.maximum(net_inventory, 0)    # units still on hand after demand
shortage = np.maximum(-net_inventory, 0)   # units of demand that could not be met

# Expected holding + shortage cost for each post-order level y.
expected_inventory_cost = (h * leftover + p * shortage) @ P_d

# Next state for each (y, d). Lost sales: a shortage leaves 0 inventory.
# Leftover stock above X_max is truncated to X_max for the transition, while
# the holding cost above is still charged on the untruncated leftover.
next_state_table = np.minimum(leftover, X_max)

# Ordering cost for each order quantity: fixed cost (if ordering) plus per-unit cost.
order_cost = np.where(actions > 0, K, 0.0) + c * actions


def action_values(V):
    """Return the expected discounted cost of every (state, order) pair.

    Args:
        V: array of length ``X_max + 1`` with the current value estimate of
            each inventory level.

    Returns:
        Array ``Q`` of shape ``(len(states), len(actions))`` where ``Q[x, q]``
        is the ordering cost of ``q`` plus the expected inventory/shortage
        cost and the discounted expected value of the next state.
    """
    expected_next_value = V[next_state_table] @ P_d  # one entry per post-order level
    cost_after_ordering = expected_inventory_cost + beta * expected_next_value
    return order_cost[None, :] + cost_after_ordering[states[:, None] + actions[None, :]]


def evaluate_policy(policy, V):
    """Iteratively evaluate ``policy`` starting from the estimate ``V``.

    Uses synchronous sweeps: every state is updated from the previous sweep's
    values. Stops once the largest change in a sweep drops below
    ``tol_policy_eval``; prints a warning if that does not happen within
    ``max_policy_eval_iter`` sweeps.

    Args:
        policy: integer array giving the order quantity for each state.
        V: starting value estimate (not modified).

    Returns:
        The value estimate after the final sweep.
    """
    levels = states + policy
    # Both of these are fixed for a given policy, so compute them once.
    immediate_cost = order_cost[policy] + expected_inventory_cost[levels]
    transitions = next_state_table[levels]  # shape (n_states, n_d)

    for _ in range(max_policy_eval_iter):
        V_new = immediate_cost + beta * (V[transitions] @ P_d)
        delta = np.max(np.abs(V_new - V))
        V = V_new
        if delta < tol_policy_eval:
            break
    else:
        print("Policy evaluation did not converge within the max iterations.")
    return V


def improve_policy(V):
    """Return the greedy policy for ``V`` (ties go to the smallest order)."""
    return actions[np.argmin(action_values(V), axis=1)]


# ----------------------------
# INITIALIZATION
# ----------------------------
# Initialize an arbitrary policy. Here we start with "order nothing" for all states.
policy = np.zeros(len(states), dtype=int)
# Initialize the value function arbitrarily (zeros)
V = np.zeros(len(states))

# ----------------------------
# POLICY ITERATION ALGORITHM
# ----------------------------
policy_stable = False
iteration = 0

while not policy_stable and iteration < max_policy_iter:
    iteration += 1
    # Policy Evaluation: warm-start from the previous policy's value estimate.
    V = evaluate_policy(policy, V)
    # Policy Improvement: act greedily with respect to the evaluated values.
    improved_policy = improve_policy(V)
    policy_stable = bool(np.array_equal(improved_policy, policy))
    policy = improved_policy

    print(f"Policy improvement iteration {iteration} completed.")

if policy_stable:
    print(f"\nPolicy iteration converged after {iteration} iterations.")
else:
    print(f"\nPolicy iteration stopped after {iteration} iterations without a stable policy.")

# ----------------------------
# OUTPUT THE OPTIMAL POLICY
# ----------------------------
print("\nOptimal Policy (order quantity) for each inventory level:")
for x, q in enumerate(policy):
    print(f"Inventory level {x:2d} -> Order {q:2d} units")


Policy improvement iteration 1 completed.
Policy improvement iteration 2 completed.
Policy improvement iteration 3 completed.
Policy improvement iteration 4 completed.
Policy improvement iteration 5 completed.

Policy iteration converged after 5 iterations.

Optimal Policy (order quantity) for each inventory level:
Inventory level  0 -> Order 18 units
Inventory level  1 -> Order 17 units
Inventory level  2 -> Order 16 units
Inventory level  3 -> Order 15 units
Inventory level  4 -> Order 14 units
Inventory level  5 -> Order 13 units
Inventory level  6 -> Order 12 units
Inventory level  7 -> Order  0 units
Inventory level  8 -> Order  0 units
Inventory level  9 -> Order  0 units
Inventory level 10 -> Order  0 units
Inventory level 11 -> Order  0 units
Inventory level 12 -> Order  0 units
Inventory level 13 -> Order  0 units
Inventory level 14 -> Order  0 units
Inventory level 15 -> Order  0 units
Inventory level 16 -> Order  0 units
Inventory level 17 -> Order  0 units
Inventory level 1