In [2]:
import numpy as np
import matplotlib.pyplot as plt

## Markov Decision Process (MDP)

An MDP is defined by the 5-tuple $\langle \mathcal{S}, \mathcal{A}, \mathcal{T}, \mathcal{R}, \gamma  \rangle$ where,

$\mathcal{S}$ is the state space

$\mathcal{A}$ is the action space

$\mathcal{T}$ is the transition model

$\mathcal{R}$ is the reward model

$\gamma$ is the discount factor


## State Space

In [3]:
# Problem size: number of time steps and number of damage states per time step.
TIME_HORIZON = 3
num_damage_states = 5
num_states = TIME_HORIZON * num_damage_states

# state_space maps a 1-based integer key to a (time, damage state) tuple.
# Keys enumerate every damage state of time 1 first, then time 2, and so on.
state_space = {}
key = 1

print('state \t time \t damage state')
for time, damage_state in (
    (t, d)
    for t in range(1, TIME_HORIZON + 1)
    for d in range(1, num_damage_states + 1)
):
    state_space[key] = (time, damage_state)
    print(f'  {key}:  \t  {time} \t     {damage_state}')
    key += 1

state 	 time 	 damage state
  1:  	  1 	     1
  2:  	  1 	     2
  3:  	  1 	     3
  4:  	  1 	     4
  5:  	  1 	     5
  6:  	  2 	     1
  7:  	  2 	     2
  8:  	  2 	     3
  9:  	  2 	     4
  10:  	  2 	     5
  11:  	  3 	     1
  12:  	  3 	     2
  13:  	  3 	     3
  14:  	  3 	     4
  15:  	  3 	     5


## Action Space

The action space has 3 actions and we carry out these actions at the beginning of the time-step

0: Do nothing - component undergoes deterioration due to the environment

1: Repair - component moves back by 1 damage state but remains failed if it has failed (+undergoes deterioration due to the environment)

2: Replace - component is replaced (+undergoes deterioration due to the environment)

In [4]:
# Integer codes of the three maintenance actions.
DO_NOTHING, MINOR_REPAIR, REPLACE = 0, 1, 2

# Listed in code order, so each action's code is also its position in the list.
action_space = [DO_NOTHING, MINOR_REPAIR, REPLACE]
num_actions = len(action_space)

## Transition Model

The transition model specifies the probability of transitioning from state ($s$) to state ($s'$) for a given action ($a$). 

For action $a =$ Do-Nothing, the transition model summarises the deterioration process.

Mathematically, the probability of transitioning from state ($s$) to state ($s'$) for given action ($a$)
is written as: $\mathbb{P}(s' | s, a)$

The transition model for discrete state spaces can be written as a matrix and the elements of the matrix are the probabilities of transitioning from state ($s$) to state ($s'$) for given action ($a$).

More formally, $T^a_{i, j} = \mathbb{P}(s_j | s_i, a)$

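Example: Consider the action $a=$ Do-Nothing. The probability of transitioning from damage state $s$ to damage state $s'$ is the entry in row $s$ and column $s'$ of `TRANSITION_MODEL[0]`. For instance, a component in damage state 1 stays in damage state 1 with probability 0.7 and moves to damage state 2 with probability 0.3.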


**Quiz 2: Can you explain why the matrix element (4,4) (zero-based indices, i.e. `TRANSITION_MODEL[1][4, 4]`) is 1 for the action 'minor-repair'?** 

(Hint: Look at the definition of the action 'minor-repair')

In [5]:
# Row i / column j of each matrix is the probability of moving from damage
# state i+1 to damage state j+1 under the corresponding action.

# action 0 = do-nothing: the deterioration process
do_nothing_matrix = [
    [0.7, 0.3, 0.0, 0.0, 0.0],
    [0.0, 0.6, 0.4, 0.0, 0.0],
    [0.0, 0.0, 0.5, 0.5, 0.0],
    [0.0, 0.0, 0.0, 0.2, 0.8],
    [0.0, 0.0, 0.0, 0.0, 1.0],
]

# action 1 = minor-repair: one damage state back; the first state stays
# where it is and the last (failed) state stays failed
minor_repair_matrix = [
    [1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0],
    [0, 1, 0, 0, 0],
    [0, 0, 1, 0, 0],
    [0, 0, 0, 0, 1],
]

# action 2 = replace: every damage state returns to the first one
replace_matrix = [
    [1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0],
]

# Stack the per-action matrices into one array indexed as [action, from, to].
TRANSITION_MODEL = np.zeros((num_actions, num_damage_states, num_damage_states))
TRANSITION_MODEL[0] = do_nothing_matrix
TRANSITION_MODEL[1] = minor_repair_matrix
TRANSITION_MODEL[2] = replace_matrix

## Reward model

In [6]:
# Costs are expressed as negative rewards.
REPAIR_COST, REPLACE_COST = -25, -50

# Reward of each action, indexed by action code:
# 0 (do-nothing) costs nothing, 1 is the repair cost, 2 is the replacement cost.
REWARDS = [0, REPAIR_COST, REPLACE_COST]

# Added to the reward in MDP_model when the next damage state is 5.
PENALTY = -500

## Discount Factor

In [7]:
# gamma, the discount factor of the MDP 5-tuple.
# NOTE(review): not read by any cell shown here - presumably used by a later solver; confirm.
DISCOUNT_FACTOR = 0.9

In [8]:
def MDP_model(current_state, action):

    """
    Enumerate the one-step outcomes of taking `action` in `current_state`.

    The action is applied first (at the beginning of the time step); the
    component then deteriorates according to the do-nothing transition
    matrix TRANSITION_MODEL[0].

    Inputs
    ------
    current_state : tuple
        (current_time, current_damage_state), with the damage state in
        1..num_damage_states. The last damage state is the failed state.

    action : int
        0 (do-nothing), 1 (minor-repair) or 2 (replace)


    Returns
    -------
    output: list of tuples
            One tuple per possible next damage state, of the form
            (p(s'|s, a), s', r)
            s' is the next state (current_time + 1, next_damage_state)
            p(s'|s, a) is the probability of transitioning to state s' given the current state s and the action a
            r is the reward: REWARDS[action], plus PENALTY when s' is the failed state

            The time index is not bounded here: the next time is always
            current_time + 1, whatever the time horizon.

    Raises
    ------
    IndexError
        If `action` is not a valid index into REWARDS, or the damage state
        lies outside 1..num_damage_states.

    """

    current_time, current_damage_state = current_state
    next_time = current_time + 1

    # The failed state is the last damage state; derive it from the model
    # size so it stays consistent with the loop bound below.
    failed_state = num_damage_states

    # Reject out-of-range inputs explicitly. A negative action or a damage
    # state <= 0 would otherwise wrap around through negative indexing and
    # silently return the transitions of a different action/state.
    if not 0 <= action < len(REWARDS):
        raise IndexError(
            f"action must be in 0..{len(REWARDS) - 1}, got {action}"
        )
    if not 1 <= current_damage_state <= failed_state:
        raise IndexError(
            f"damage state must be in 1..{failed_state}, got {current_damage_state}"
        )

    # Apply the action to get the damage state from which deterioration starts.
    # action = 'do-nothing' (0): the starting damage state is left as it is.
    if action == 1:
        # action = 'minor-repair': move back by one state, but not below 1;
        # a failed component cannot be repaired and stays failed.
        if current_damage_state != failed_state:
            current_damage_state = max(1, current_damage_state - 1)
    elif action == 2:
        # action = 'replace': back to the initial undamaged state.
        current_damage_state = 1

    # Deterioration always follows the do-nothing matrix, whatever the action.
    deterioration_probs = TRANSITION_MODEL[0][current_damage_state - 1]
    action_reward = REWARDS[action]

    output = []
    for next_damage_state in range(1, failed_state + 1):
        prob = deterioration_probs[next_damage_state - 1]

        # Ending the step in the failed state incurs the failure penalty.
        reward = action_reward
        if next_damage_state == failed_state:
            reward += PENALTY

        output.append((prob, (next_time, next_damage_state), reward))

    return output

### Example

Below is an example of how the above function can be used.

Assume that we are in the state 5 (time: 1, damage_state:5) and take action (2: replace).

We know that actions are taken at the beginning of the time step, i.e. the component is replaced 
and the new component undergoes deterioration at the end of the time step. 

Therefore, the next time step is 2 and the possible damage states are 1 or 2 with probability 0.7 and 0.3 respectively (why?).

In [9]:
# Look up the state by its key and choose an action code:
# 0 = do nothing, 1 = minor-repair, 2 = replace
current_state = state_space[5]
action = 2

print(f"Current state: {current_state}")
print(f"Action: {action} \n")

# Each entry of the model output is a tuple (p(s'|s,a), s', r).
output = MDP_model(current_state, action)

for transition in output:
    prob, next_state, reward = transition
    print(f"Next state: {next_state}, probability: {prob}, reward: {reward}")

Current state: (1, 5)
Action: 2 

Next state: (2, 1), probability: 0.7, reward: -50
Next state: (2, 2), probability: 0.3, reward: -50
Next state: (2, 3), probability: 0.0, reward: -50
Next state: (2, 4), probability: 0.0, reward: -50
Next state: (2, 5), probability: 0.0, reward: -550


In [11]:
# Display the full mapping from state key to (time, damage state).
state_space

{1: (1, 1),
 2: (1, 2),
 3: (1, 3),
 4: (1, 4),
 5: (1, 5),
 6: (2, 1),
 7: (2, 2),
 8: (2, 3),
 9: (2, 4),
 10: (2, 5),
 11: (3, 1),
 12: (3, 2),
 13: (3, 3),
 14: (3, 4),
 15: (3, 5)}

In [15]:
# Same experiment from the last entry of state_space (time 3, damage state 5).
# The model still advances the time by one, so the next states have time 4,
# which has no key in state_space.
# Action codes: 0 = do nothing, 1 = minor-repair, 2 = replace
current_state = state_space[15]
action = 2

print(f"Current state: {current_state}")
print(f"Action: {action} \n")

# Each entry of the model output is a tuple (p(s'|s,a), s', r).
output = MDP_model(current_state, action)

for transition in output:
    prob, next_state, reward = transition
    print(f"Next state: {next_state}, probability: {prob}, reward: {reward}")

Current state: (3, 5)
Action: 2 

Next state: (4, 1), probability: 0.7, reward: -50
Next state: (4, 2), probability: 0.3, reward: -50
Next state: (4, 3), probability: 0.0, reward: -50
Next state: (4, 4), probability: 0.0, reward: -50
Next state: (4, 5), probability: 0.0, reward: -550
