In [1]:
import numpy as np
import matplotlib.pyplot as plt

## Markov Decision Process (MDP)

An MDP is defined by the tuple $\langle \mathcal{S}, \mathcal{A}, \mathcal{T}, \mathcal{R}, \gamma  \rangle$ where,

$\mathcal{S}$ is the state space

$\mathcal{A}$ is the action space

$\mathcal{T}$ is the transition model

$\mathcal{R}$ is the reward model

$\gamma$ is the discount factor


## State Space

In [2]:
# Planning horizon (number of time-steps) and number of damage levels.
TIME_HORIZON = 10
num_damage_states = 5

# Every (time, damage state) pair is a distinct state of the MDP.
num_states = TIME_HORIZON * num_damage_states

# Map 1-based state keys to (time, damage_state) pairs, ordered by time first
# and by damage state second.
state_space = dict(
    enumerate(
        (
            (t, d)
            for t in range(1, TIME_HORIZON + 1)
            for d in range(1, num_damage_states + 1)
        ),
        start=1,
    )
)

print(state_space)

{1: (1, 1), 2: (1, 2), 3: (1, 3), 4: (1, 4), 5: (1, 5), 6: (2, 1), 7: (2, 2), 8: (2, 3), 9: (2, 4), 10: (2, 5), 11: (3, 1), 12: (3, 2), 13: (3, 3), 14: (3, 4), 15: (3, 5), 16: (4, 1), 17: (4, 2), 18: (4, 3), 19: (4, 4), 20: (4, 5), 21: (5, 1), 22: (5, 2), 23: (5, 3), 24: (5, 4), 25: (5, 5), 26: (6, 1), 27: (6, 2), 28: (6, 3), 29: (6, 4), 30: (6, 5), 31: (7, 1), 32: (7, 2), 33: (7, 3), 34: (7, 4), 35: (7, 5), 36: (8, 1), 37: (8, 2), 38: (8, 3), 39: (8, 4), 40: (8, 5), 41: (9, 1), 42: (9, 2), 43: (9, 3), 44: (9, 4), 45: (9, 5), 46: (10, 1), 47: (10, 2), 48: (10, 3), 49: (10, 4), 50: (10, 5)}


## Action Space

The action space has 3 actions. The chosen action is carried out at the beginning of the time-step, after which the component deteriorates for one time-step.

0: Do nothing - component undergoes deterioration due to the environment

1: Minor repair - component moves back by 1 damage state, but never below damage state 1; it has no effect on a failed component in damage state 5 (+ undergoes deterioration due to the environment)

2: Replace - component is replaced, i.e. it returns to damage state 1 (+ undergoes deterioration due to the environment)

In [3]:
# Integer codes of the three maintenance actions.
DO_NOTHING, MINOR_REPAIR, REPLACE = 0, 1, 2

# All available actions, ordered by action code.
action_space = [DO_NOTHING, MINOR_REPAIR, REPLACE]
num_actions = len(action_space)

## Transition Model

In [4]:
# Deterioration probabilities over one time-step: entry [i, j] is the
# probability of moving from damage state i+1 to damage state j+1.
# The component never improves on its own and the last state is absorbing.
TRANSITION_MODEL = np.array(
    [
        [0.7, 0.3, 0.0, 0.0, 0.0],  # from damage state 1
        [0.0, 0.6, 0.4, 0.0, 0.0],  # from damage state 2
        [0.0, 0.0, 0.5, 0.5, 0.0],  # from damage state 3
        [0.0, 0.0, 0.0, 0.1, 0.9],  # from damage state 4
        [0.0, 0.0, 0.0, 0.0, 1.0],  # from damage state 5
    ]
)

## Reward model

In [5]:
# Costs are expressed as negative rewards.
REPAIR_COST, REPLACE_COST = -25, -50

# Immediate reward of each action, indexed by action code
# (0: do nothing, 1: minor repair, 2: replace).
REWARDS = [0, REPAIR_COST, REPLACE_COST]

# Additional reward incurred when the next damage state is 5.
PENALTY = -500

## Discount Factor

In [6]:
# Discount factor (gamma) applied to the value of the next state.
DISCOUNT_FACTOR = 0.9

In [7]:
def MDP_model(current_state, action):
    """Return the possible one-step outcomes of taking ``action`` in a state.

    The action is applied first and the component then deteriorates for one
    time-step according to ``TRANSITION_MODEL``.

    Parameters
    ----------
    current_state : tuple
        ``(time, damage_state)`` with a 1-based damage state.
    action : int
        One of ``DO_NOTHING``, ``MINOR_REPAIR`` or ``REPLACE``.

    Returns
    -------
    list of tuple
        One ``(probability, next_state, reward)`` entry per damage state,
        in increasing order of next damage state. Zero-probability
        transitions are included. ``reward`` is the cost of the action
        plus ``PENALTY`` when the next damage state is the failure state.
    """
    current_time, current_damage_state = current_state
    next_time = current_time + 1

    # The last damage state is the failure state.
    failure_state = num_damage_states

    if action == MINOR_REPAIR:
        # Move back by one damage state, but not below 1.
        # A failed component cannot be fixed by a minor repair.
        if current_damage_state != failure_state:
            current_damage_state = max(1, current_damage_state - 1)
    elif action == REPLACE:
        # Replacing leads to the initial undamaged state.
        current_damage_state = 1
    # DO_NOTHING leaves the damage state untouched before deterioration.

    output = []
    for next_damage_state in range(1, num_damage_states + 1):
        next_state = (next_time, next_damage_state)
        prob = TRANSITION_MODEL[current_damage_state - 1, next_damage_state - 1]

        reward = REWARDS[action]
        if next_damage_state == failure_state:
            reward += PENALTY

        output.append((prob, next_state, reward))

    return output

In [8]:
# Outcomes of replacing the component in state 5, i.e. (time 1, damage state 5).
MDP_model(state_space[5], REPLACE)

[(0.7, (2, 1), -50),
 (0.3, (2, 2), -50),
 (0.0, (2, 3), -50),
 (0.0, (2, 4), -50),
 (0.0, (2, 5), -550)]

## Policy iteration

In [9]:
def evaluate_policy(policy):
    """Evaluate a deterministic policy by iterative policy evaluation.

    Parameters
    ----------
    policy : sequence of int
        Action taken in each state, indexed by the 0-based state index
        (the ``state_space`` key minus one).

    Returns
    -------
    numpy.ndarray
        Array of length ``num_states`` holding the value of every state
        under ``policy``. States at the final time-step are terminal and
        keep the value 0.
    """

    def state_index(state):
        # 0-based position of a (time, damage_state) pair in the value array.
        time, damage_state = state
        return (time - 1) * num_damage_states + (damage_state - 1)

    # Initialise values of all states with 0
    value_function_pi = np.zeros(num_states)

    delta_threshold = 1e-5
    num_non_terminal = num_states - num_damage_states

    delta = np.inf
    while delta > delta_threshold:
        delta = 0.0

        # Terminal states are skipped, since their value is 0. Transitions
        # only lead to the next time-step, so sweeping backwards in time
        # means every update already sees up-to-date successor values and
        # the loop converges after two sweeps.
        for idx_state in reversed(range(num_non_terminal)):
            state = state_space[idx_state + 1]
            old_value = value_function_pi[idx_state]

            # get action from policy
            action = policy[idx_state]

            new_value = 0
            for prob, next_state, reward in MDP_model(state, action):
                next_value = value_function_pi[state_index(next_state)]
                new_value += prob * (reward + DISCOUNT_FACTOR * next_value)

            value_function_pi[idx_state] = new_value
            delta = max(delta, abs(old_value - new_value))

    return value_function_pi

In [10]:
# Policy iteration: alternate between evaluating the current policy and
# greedily improving it, until the policy no longer changes.
q_values = np.zeros((num_states, len(action_space)))

# Start from the policy that always does nothing.
policy = np.full(num_states, DO_NOTHING, dtype=int)

policy_stable = False

while not policy_stable:

    value_function_pi = evaluate_policy(policy)
    policy_stable = True

    # ignore terminal states, since value is 0
    for idx_state in range(num_states - num_damage_states):

        # get old action from policy
        old_action = policy[idx_state]
        state = state_space[idx_state + 1]

        # store Q values of all actions
        for idx_action, action in enumerate(action_space):
            q_val = 0
            for prob, next_state, reward in MDP_model(state, action):
                idx_next_state = (next_state[0] - 1) * num_damage_states + next_state[1] - 1
                q_val += prob * (reward + DISCOUNT_FACTOR * value_function_pi[idx_next_state])
            q_values[idx_state, idx_action] = q_val

        # Greedy actions for this state (there may be several on a tie).
        best_indices = np.flatnonzero(q_values[idx_state] == q_values[idx_state].max())
        best_actions = [action_space[idx] for idx in best_indices]

        # Keep the current action whenever it is already greedy. Switching
        # between equally good actions would flag the policy as unstable
        # without improving it; otherwise take the first greedy action so
        # that the result is reproducible.
        if old_action not in best_actions:
            policy[idx_state] = best_actions[0]
            policy_stable = False

In [11]:
# One row per time-step, one column per damage state.
policy.reshape((TIME_HORIZON, num_damage_states))

array([[0, 0, 0, 2, 2],
       [0, 0, 0, 2, 2],
       [0, 0, 0, 2, 2],
       [0, 0, 0, 2, 2],
       [0, 0, 0, 2, 2],
       [0, 0, 0, 2, 2],
       [0, 0, 0, 1, 2],
       [0, 0, 0, 1, 2],
       [0, 0, 0, 1, 2],
       [0, 0, 0, 0, 0]])

In [13]:
# State values laid out with one row per time-step and one column per
# damage state, rounded to 3 decimals for display.
value_table = value_function_pi.reshape((TIME_HORIZON, num_damage_states)).round(3)
print(f"Value of states: [time_horizon, num_states] \n {value_table}")

Value of states: [time_horizon, num_states] 
 [[-16.602 -33.016 -49.573 -66.602 -66.602]
 [-13.485 -30.024 -46.677 -63.485 -63.485]
 [-10.054 -26.484 -43.672 -60.054 -60.054]
 [ -6.519 -22.024 -40.531 -56.519 -56.519]
 [ -3.357 -16.312 -36.711 -53.357 -53.357]
 [ -1.094  -9.882 -30.488 -51.094 -51.094]
 [  0.     -4.05  -21.375 -46.375 -50.   ]
 [  0.      0.    -11.25  -36.25  -50.   ]
 [  0.      0.      0.    -25.    -50.   ]
 [  0.      0.      0.      0.      0.   ]]
