# MDP

In [None]:
import random
import numpy as np
from numpy.random import choice

In [None]:
# Problem Definition

# State identifiers of the problem (two-letter codes; `reward` below keys off
# the first letter).
states = (
    'PU',
    'PF',
    'RU',
    'RF',
)

# Actions; the transition table defines both of them for every state.
actions = (
    'save_money',
    'advertise',
)

def reward(state):
    """Return the immediate reward attached to `state`.

    The reward depends only on the first character of the state name:
    names starting with 'P' yield 0 and names starting with 'R' yield 10.
    Any other name (including the empty string) yields None.
    """
    # Prefixes are checked in the same order as the payoff table lists them.
    for prefix, payoff in (('P', 0), ('R', 10)):
        if state.startswith(prefix):
            return payoff
    return None

def transition_function(state, action, resulted_state):
    """Return the probability of reaching `resulted_state` from `state` via `action`.

    The model is a fixed lookup table over the states 'PU', 'PF', 'RU', 'RF'
    and the actions 'save_money', 'advertise'. Any state or action outside
    that table raises KeyError.
    """
    # Position of each successor state inside the rows below.
    column = {'PU': 0, 'PF': 1, 'RU': 2, 'RF': 3}

    # One row per (state, action); entries are ordered PU, PF, RU, RF and
    # every row sums to 1.
    rows = {
        ('PU', 'save_money'): (1.0, 0, 0, 0),
        ('PU', 'advertise'): (0.5, 0.5, 0, 0),

        ('PF', 'save_money'): (0.5, 0, 0, 0.5),
        ('PF', 'advertise'): (0, 1.0, 0, 0),

        ('RU', 'save_money'): (0.5, 0, 0.5, 0),
        ('RU', 'advertise'): (0.5, 0.5, 0, 0),

        ('RF', 'save_money'): (0, 0, 0.5, 0.5),
        ('RF', 'advertise'): (0, 1.0, 0, 0),
    }
    return rows[state, action][column[resulted_state]]


def get_next_state(state, action):
    """Sample a successor state for taking `action` in `state`.

    The successor is drawn from the module-level `states`, weighted by
    `transition_function(state, action, s)` for each candidate `s`.

    Returns:
        The sampled element of `states`, as returned by `np.random.choice`.

    Raises:
        KeyError: if `transition_function` does not know `state` or `action`.
    """
    probabilities = [transition_function(state, action, s) for s in states]
    # Go through `np` explicitly: the bare name `choice` is re-bound to
    # `random.choice` by a later import in this file, and that function has
    # no `p` argument, so calling it here would raise TypeError.
    resulted_state = np.random.choice(states, p=probabilities)
    return resulted_state

In [None]:
# MDP class implementation

import numpy as np
from copy import deepcopy
from random import choice

class MDP:
    '''
    Solver for a finite Markov decision process, assuming P and R are known.

    Parameters:
        states:  sequence of state identifiers.
        actions: indexable sequence of action identifiers.
        P:       callable P(s, a, s2) returning the probability of moving
                 from s to s2 when taking action a.
        R:       callable R(s) returning the immediate reward of state s.
        gamma:   discount constant.

    After `value_iteration` or `policy_iteration` has run, `V` maps each
    state to its estimated value and `optimal_policy` maps each state to
    the chosen action. Both are empty dicts until then.
    '''

    def __init__(self, states=states, actions=actions, P=transition_function, R=reward, gamma=0.9):
        self.gamma = gamma  # discount constant
        self.states = states
        self.actions = actions
        self.P = P  # transition function
        self.R = R  # reward function
        self.V = {}  # current est. V*
        self.optimal_policy = {}

    def choose_action(self, state):
        '''
        Return the action to take in `state`.

        Uses the computed policy when there is one; before any policy has
        been computed, an action is picked uniformly at random.
        '''
        if not self.optimal_policy:
            return random.choice(self.actions)
        return self.optimal_policy[state]

    def _greedy(self, state, values):
        '''
        Return (action, expected next-state value) maximising
        sum ( P(s' | state, a) * values[s'] ) over the actions.

        Ties go to the action listed first in `self.actions`.
        '''
        best_action = None
        best_value = float("-inf")
        for a in self.actions:
            expected = sum(self.P(state, a, s2) * values[s2] for s2 in self.states)
            if expected > best_value:
                best_action, best_value = a, expected
        return best_action, best_value

    def policy_evaluation(self, policy):
        '''
        Return the exact value of every state under the fixed `policy`.

        `policy` maps each state to an action. The values solve the linear
        system V = R + gamma * P_policy V, written here as
        (gamma * P_policy - I) V = -R.

        Returns a dict mapping each state to its value. `np.linalg.solve`
        raises `numpy.linalg.LinAlgError` if the system is singular.
        '''
        n = len(self.states)
        A = np.zeros((n, n))
        b = np.array([-self.R(s) for s in self.states])
        for i, s in enumerate(self.states):
            A[i, i] = -1
            for j, s2 in enumerate(self.states):
                A[i, j] += self.gamma * self.P(s, policy[s], s2)
        values = np.linalg.solve(A, b)
        return dict(zip(self.states, values))

    def policy_iteration(self, T=100):
        '''
        Run policy iteration for at most `T` rounds, starting from a random
        policy, and store the result in `self.optimal_policy` / `self.V`.

        Each round evaluates the current policy exactly and then makes it
        greedy with respect to those values. Iteration stops early once a
        round leaves the policy unchanged.
        '''
        # init random policy
        policy = {s: random.choice(self.actions) for s in self.states}
        for _ in range(T):
            self.V = self.policy_evaluation(policy)
            # argmax_a { sum ( P(s' | s , a) * V(s') ) }
            improved = {s: self._greedy(s, self.V)[0] for s in self.states}
            if improved == policy:
                break  # stable policy: self.V already describes it
            policy = improved
        else:
            # Round budget used up (or T == 0): the last improvement has not
            # been evaluated yet, so make self.V match the returned policy.
            self.V = self.policy_evaluation(policy)
        self.optimal_policy = dict(policy)

    def value_iteration(self, T=100):
        '''
        Run `T` sweeps of value iteration from V = 0 and store the result
        in `self.V` / `self.optimal_policy`.

        Values are updated in place: states later in a sweep already see
        the values written earlier in the same sweep. With `T == 0` the
        values stay at 0 and the policy stays empty.
        '''
        policy = {}
        values = {s: 0 for s in self.states}
        for _ in range(T):
            for s in self.states:
                policy[s], best = self._greedy(s, values)
                values[s] = self.R(s) + self.gamma * best
        self.V = values
        self.optimal_policy = policy

    def print_policy(self):
        '''
        Print one line per state with its action and value, followed by a
        blank line; print '{}' when no policy has been computed.
        '''
        if not self.optimal_policy:
            print('{}')
            return
        # Iterate this instance's states, not the module-level default.
        for s in self.states:
            a = self.optimal_policy[s]
            print(f's: {s}, a: {a}, v*(s): {self.V[s]}')
        print()

In [None]:
# Usage (VI, PI, policy eval)

# Build the solver with its default arguments (the problem defined above).
mdp = MDP()
# No policy has been computed yet, so this prints '{}'.
mdp.print_policy()

# Solve with 100 sweeps of value iteration and show the resulting policy.
print('value iteration result:')
mdp.value_iteration(100)
mdp.print_policy()

# Solve again on the same object with policy iteration; this overwrites the
# policy and values stored by the value-iteration run above.
print('policy iteration result:')
mdp.policy_iteration(100)
mdp.print_policy()

# A hand-written policy (state -> action) to evaluate directly.
phi = {
    'PU': 'advertise',
    'PF': 'save_money',
    'RU': 'save_money',
    'RF': 'save_money'
}
# policy_evaluation returns the state values under phi without modifying mdp.
print('policy evaluation on policy phi:')
print(mdp.policy_evaluation(phi))

{}
value iteration result:
s: PU, a: advertise, v*(s): 31.58508953413495
s: PF, a: save_money, v*(s): 38.60400287377479
s: RU, a: save_money, v*(s): 44.02416232966445
s: RF, a: save_money, v*(s): 54.20158563176306

policy iteration result:
s: PU, a: advertise, v*(s): 31.58510430883212
s: PF, a: save_money, v*(s): 38.60401637746148
s: RU, a: save_money, v*(s): 44.024176252680824
s: RF, a: save_money, v*(s): 54.20159875219339

policy evaluation on policy phi:
{'PU': 31.58510430883212, 'PF': 38.60401637746148, 'RU': 44.024176252680824, 'RF': 54.20159875219339}
