In [1]:
import sys
sys.path.append("../../RL-book")
from dataclasses import dataclass
from typing import Tuple, Dict
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.markov_decision_process import FinitePolicy, StateActionMapping
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical, Constant
from scipy.stats import poisson


@dataclass(frozen=True)
class InventoryState:
    on_hand: int
    on_order: int

    def inventory_position(self) -> int:
        return self.on_hand + self.on_order


InvOrderMapping = StateActionMapping[InventoryState, int]


class SimpleInventoryMDPCap(FiniteMarkovDecisionProcess[InventoryState, int]):

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_cost: float,
        stockout_cost: float
    ):
        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        self.holding_cost: float = holding_cost
        self.stockout_cost: float = stockout_cost

        self.poisson_distr = poisson(poisson_lambda)
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[InventoryState, Dict[int, Categorical[Tuple[InventoryState,
                                                            float]]]] = {}

        for alpha in range(self.capacity + 1):
            for beta in range(self.capacity + 1 - alpha):
                state: InventoryState = InventoryState(alpha, beta)
                ip: int = state.inventory_position()
                base_reward: float = - self.holding_cost * alpha
                d1: Dict[int, Categorical[Tuple[InventoryState, float]]] = {}

                for order in range(self.capacity - ip + 1):
                    sr_probs_dict: Dict[Tuple[InventoryState, float], float] =\
                        {(InventoryState(ip - i, order), base_reward):
                         self.poisson_distr.pmf(i) for i in range(ip)}

                    probability: float = 1 - self.poisson_distr.cdf(ip - 1)
                    reward: float = base_reward - self.stockout_cost *\
                        (probability * (self.poisson_lambda - ip) +
                         ip * self.poisson_distr.pmf(ip))
                    sr_probs_dict[(InventoryState(0, order), reward)] = \
                        probability
                    d1[order] = Categorical(sr_probs_dict)

                d[state] = d1
        return d

In [3]:
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

user_gamma = 0.9

si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
    SimpleInventoryMDPCap(
        capacity=user_capacity,
        poisson_lambda=user_poisson_lambda,
        holding_cost=user_holding_cost,
        stockout_cost=user_stockout_cost
    )


In [4]:
si_mdp.get_action_transition_reward_map()
#{State:{Action 1:{(Next_State,Reward):Probability},{Action 2:{(Next_State,Reward):Probability}}}}

{InventoryState(on_hand=0, on_order=0): {0: {(InventoryState(on_hand=0, on_order=0), -10.0): 1.0},
  1: {(InventoryState(on_hand=0, on_order=1), -10.0): 1.0},
  2: {(InventoryState(on_hand=0, on_order=2), -10.0): 1.0}},
 InventoryState(on_hand=0, on_order=1): {0: {(InventoryState(on_hand=1, on_order=0), -0.0): 0.3678794411714424, (InventoryState(on_hand=0, on_order=0), -3.6787944117144233): 0.6321205588285577},
  1: {(InventoryState(on_hand=1, on_order=1), -0.0): 0.3678794411714424, (InventoryState(on_hand=0, on_order=1), -3.6787944117144233): 0.6321205588285577}},
 InventoryState(on_hand=0, on_order=2): {0: {(InventoryState(on_hand=2, on_order=0), -0.0): 0.36787944117144233, (InventoryState(on_hand=1, on_order=0), -0.0): 0.36787944117144233, (InventoryState(on_hand=0, on_order=0), -1.0363832351432696): 0.26424111765711533}},
 InventoryState(on_hand=1, on_order=0): {0: {(InventoryState(on_hand=1, on_order=0), -1.0): 0.3678794411714424, (InventoryState(on_hand=0, on_order=0), -4.6787944

In [2]:
if __name__ == '__main__':
    from pprint import pprint

    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    user_gamma = 0.9

    si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        )

    print("MDP Transition Map")
    print("------------------")
    print(si_mdp)

MDP Transition Map
------------------
From State InventoryState(on_hand=0, on_order=0):
  With Action 0:
    To [State InventoryState(on_hand=0, on_order=0) and Reward -10.000] with Probability 1.000
  With Action 1:
    To [State InventoryState(on_hand=0, on_order=1) and Reward -10.000] with Probability 1.000
  With Action 2:
    To [State InventoryState(on_hand=0, on_order=2) and Reward -10.000] with Probability 1.000
From State InventoryState(on_hand=0, on_order=1):
  With Action 0:
    To [State InventoryState(on_hand=1, on_order=0) and Reward -0.000] with Probability 0.368
    To [State InventoryState(on_hand=0, on_order=0) and Reward -3.679] with Probability 0.632
  With Action 1:
    To [State InventoryState(on_hand=1, on_order=1) and Reward -0.000] with Probability 0.368
    To [State InventoryState(on_hand=0, on_order=1) and Reward -3.679] with Probability 0.632
From State InventoryState(on_hand=0, on_order=2):
  With Action 0:
    To [State InventoryState(on_hand=2, on_order=

In [6]:
fdp: FinitePolicy[InventoryState, int] = FinitePolicy(
    {InventoryState(alpha, beta):
     Constant(user_capacity - (alpha + beta)) for alpha in
     range(user_capacity + 1) for beta in range(user_capacity + 1 - alpha)}
)

print("Policy Map")
print("----------")
print(fdp)

implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
    si_mdp.apply_finite_policy(fdp)
print("Implied MP Transition Map")
print("--------------")
print(FiniteMarkovProcess(implied_mrp.transition_map))

print("Implied MRP Transition Reward Map")
print("---------------------")
print(implied_mrp)

print("Implied MP Stationary Distribution")
print("-----------------------")
implied_mrp.display_stationary_distribution()
print()

print("Implied MRP Reward Function")
print("---------------")
implied_mrp.display_reward_function()
print()

print("Implied MRP Value Function")
print("--------------")
implied_mrp.display_value_function(gamma=user_gamma)
print()

from rl.dynamic_programming import evaluate_mrp_result
from rl.dynamic_programming import policy_iteration_result
from rl.dynamic_programming import value_iteration_result

print("Implied MRP Policy Evaluation Value Function")
print("--------------")
pprint(evaluate_mrp_result(implied_mrp, gamma=user_gamma))
print()

print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_pi, opt_policy_pi = policy_iteration_result(
    si_mdp,
    gamma=user_gamma
)
pprint(opt_vf_pi)
print(opt_policy_pi)
print()

print("MDP Value Iteration Optimal Value Function and Optimal Policy")
print("--------------")
opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
pprint(opt_vf_vi)
print(opt_policy_vi)
print()


Policy Map
----------
For State InventoryState(on_hand=0, on_order=0):
  Do Action 2 with Probability 1.000
For State InventoryState(on_hand=0, on_order=1):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
  Do Action 0 with Probability 1.000

Implied MP Transition Map
--------------
From State InventoryState(on_hand=0, on_order=0):
  To State InventoryState(on_hand=0, on_order=2) with Probability 1.000
From State InventoryState(on_hand=0, on_order=1):
  To State InventoryState(on_hand=1, on_order=1) with Probability 0.368
  To State InventoryState(on_hand=0, on_order=1) with Probability 0.632
From State InventoryState(on_hand=0, on_order=2):
  To State InventoryState(on_hand=2, on_order=0) with 

In [7]:
fdp.states()

dict_keys([InventoryState(on_hand=0, on_order=0), InventoryState(on_hand=0, on_order=1), InventoryState(on_hand=0, on_order=2), InventoryState(on_hand=1, on_order=0), InventoryState(on_hand=1, on_order=1), InventoryState(on_hand=2, on_order=0)])