In [1]:
from dataclasses import dataclass
from typing import Tuple, Dict, Mapping
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical
from scipy.stats import poisson


@dataclass(frozen=True)
class InventoryState:
    """Immutable (hashable) single-store state: on-hand and on-order counts."""

    on_hand: int
    on_order: int

    def inventory_position(self) -> int:
        """Return ``on_hand + on_order``."""
        position = self.on_hand + self.on_order
        return position


# Transition structure of the single-store MDP: every InventoryState maps each
# integer action available in it to a Categorical distribution over
# (next state, reward) pairs.
# NOTE: the same module-level name is bound again further down for the
# two-store model.
InvOrderMapping = Mapping[
    InventoryState,
    Mapping[int, Categorical[Tuple[InventoryState, float]]]
]


class SimpleInventoryMDPCap(FiniteMarkovDecisionProcess[InventoryState, int]):
    """Finite MDP for one store whose inventory position is capped.

    States are ``InventoryState`` values with ``on_hand + on_order`` at most
    ``capacity``; an action is the integer number of units to order.
    """

    def __init__(
        self,
        capacity: int,
        poisson_lambda: float,
        holding_cost: float,
        stockout_cost: float
    ):
        """Store the parameters and build the full transition/reward map.

        Args:
            capacity: upper bound on ``on_hand + on_order`` (orders included).
            poisson_lambda: rate of the Poisson distribution used for demand.
            holding_cost: cost charged per unit on hand.
            stockout_cost: cost multiplier applied to expected unmet demand.
        """
        self.capacity: int = capacity
        self.poisson_lambda: float = poisson_lambda
        self.holding_cost: float = holding_cost
        self.stockout_cost: float = stockout_cost

        # Frozen scipy distribution; must exist before the map is built.
        self.poisson_distr = poisson(poisson_lambda)
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        """Return state -> order -> Categorical over (next state, reward)."""
        demand = self.poisson_distr
        mapping: Dict[InventoryState,
                      Dict[int, Categorical[Tuple[InventoryState, float]]]] = {}

        for on_hand in range(self.capacity + 1):
            for on_order in range(self.capacity + 1 - on_hand):
                current: InventoryState = InventoryState(on_hand, on_order)
                position: int = current.inventory_position()
                holding_reward: float = - self.holding_cost * on_hand

                # None of the following depends on the order size, so it is
                # computed once per state.
                stockout_prob: float = 1 - demand.cdf(position - 1)
                stockout_reward: float = holding_reward - self.stockout_cost *\
                    (stockout_prob * (self.poisson_lambda - position) +
                     position * demand.pmf(position))
                # (units left, probability) for every demand below the
                # inventory position, i.e. no stock-out.
                in_stock = [(position - sold, demand.pmf(sold))
                            for sold in range(position)]

                by_order: Dict[int,
                               Categorical[Tuple[InventoryState, float]]] = {}
                for order in range(self.capacity - position + 1):
                    outcomes: Dict[Tuple[InventoryState, float], float] = {
                        (InventoryState(left, order), holding_reward): prob
                        for left, prob in in_stock
                    }
                    # Demand at or above the position empties the store.
                    outcomes[(InventoryState(0, order), stockout_reward)] = \
                        stockout_prob
                    by_order[order] = Categorical(outcomes)

                mapping[current] = by_order
        return mapping



@dataclass(frozen=True)
class TwoStoreInventoryState:
    """Immutable (hashable) state holding on-hand/on-order counts for two stores."""

    on_hand_1: int
    on_order_1: int
    on_hand_2: int
    on_order_2: int

    def inventory_position(self) -> Tuple[int, int]:
        """Return ``(on_hand_1 + on_order_1, on_hand_2 + on_order_2)``."""
        position_1 = self.on_hand_1 + self.on_order_1
        position_2 = self.on_hand_2 + self.on_order_2
        return position_1, position_2


# Transition structure of the two-store MDP: every TwoStoreInventoryState maps
# each (gamma, order1, order2) action tuple to a Categorical distribution over
# (next state, reward) pairs.  The action type is a 3-tuple, not a bare int,
# because that is how the two-store action map is keyed.
# NOTE: this rebinds the module-level name InvOrderMapping already used for
# the single-store alias defined earlier in the file.
InvOrderMapping = Mapping[
    TwoStoreInventoryState,
    Mapping[Tuple[int, int, int],
            Categorical[Tuple[TwoStoreInventoryState, float]]]
]

class TwoStoreInventory(
    FiniteMarkovDecisionProcess[TwoStoreInventoryState, Tuple[int, int, int]]
):
    """Finite MDP for two capacity-constrained stores with Poisson demand.

    A state is a ``TwoStoreInventoryState``.  An action is a tuple
    ``(gamma, order1, order2)``: ``gamma`` units are moved from store 1 to
    store 2 (a negative ``gamma`` moves stock from store 2 to store 1), and
    ``order1`` / ``order2`` units are ordered for store 1 / store 2.
    """

    def __init__(
        self,
        capacity_1: int,
        capacity_2: int,
        poisson_lambda_1: float,
        poisson_lambda_2: float,
        holding_cost_1: float,
        holding_cost_2: float,
        stockout_cost_1: float,
        stockout_cost_2: float,
        supplier_cost: float,
        between_stores_cost: float
    ):
        """Store the parameters and build the full transition/reward map.

        Args:
            capacity_1, capacity_2: per-store upper bound on
                ``on_hand + on_order`` (after transfers and new orders).
            poisson_lambda_1, poisson_lambda_2: rates of the per-store
                Poisson distributions used for demand.
            holding_cost_1, holding_cost_2: cost per unit on hand.
            stockout_cost_1, stockout_cost_2: cost multiplier applied to the
                expected unmet demand of each store.
            supplier_cost: stored on the instance only (see note below).
            between_stores_cost: stored on the instance only (see note below).
        """
        self.capacity_1: int = capacity_1
        self.capacity_2: int = capacity_2
        self.poisson_lambda_1: float = poisson_lambda_1
        self.poisson_lambda_2: float = poisson_lambda_2
        self.holding_cost_1: float = holding_cost_1
        self.holding_cost_2: float = holding_cost_2
        self.stockout_cost_1: float = stockout_cost_1
        self.stockout_cost_2: float = stockout_cost_2
        # NOTE(review): supplier_cost and between_stores_cost are not applied
        # to any reward below.  The file does not show whether they are fixed
        # per-shipment or per-unit charges -- confirm before wiring them in.
        self.supplier_cost: float = supplier_cost
        self.between_stores_cost: float = between_stores_cost

        # Frozen scipy distributions; must exist before the map is built.
        self.poisson_distr_1 = poisson(poisson_lambda_1)
        self.poisson_distr_2 = poisson(poisson_lambda_2)
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> Mapping[
        TwoStoreInventoryState,
        Mapping[Tuple[int, int, int],
                Categorical[Tuple[TwoStoreInventoryState, float]]]
    ]:
        """Return state -> (gamma, order1, order2) -> Categorical.

        Each Categorical is over ``(next state, reward)`` pairs.  Every state
        with ``on_hand + on_order <= capacity`` in both stores is enumerated.
        """
        d: Dict[TwoStoreInventoryState,
                Dict[Tuple[int, int, int],
                     Categorical[Tuple[TwoStoreInventoryState, float]]]] = {}

        for alpha1 in range(self.capacity_1 + 1):
            for beta1 in range(self.capacity_1 + 1 - alpha1):
                for alpha2 in range(self.capacity_2 + 1):
                    for beta2 in range(self.capacity_2 + 1 - alpha2):
                        state = TwoStoreInventoryState(
                            alpha1, beta1, alpha2, beta2
                        )
                        d[state] = self._actions_from(state)
        return d

    def _actions_from(self, state: TwoStoreInventoryState) -> Dict[
        Tuple[int, int, int],
        Categorical[Tuple[TwoStoreInventoryState, float]]
    ]:
        """Build the action -> Categorical map for a single state."""
        ip1, ip2 = state.inventory_position()
        base_reward: float = (-self.holding_cost_1 * state.on_hand_1
                              - self.holding_cost_2 * state.on_hand_2)

        # A transfer can only move stock that is physically on hand, and the
        # receiving store's inventory position must stay within its capacity.
        # gamma == 0 always satisfies both bounds.
        lowest = max(-state.on_hand_2, ip1 - self.capacity_1)
        highest = min(state.on_hand_1, self.capacity_2 - ip2)

        actions: Dict[Tuple[int, int, int],
                      Categorical[Tuple[TwoStoreInventoryState, float]]] = {}
        for gamma in range(lowest, highest + 1):
            # Derive the post-transfer positions from the untouched base
            # positions so one gamma never leaks into the next iteration.
            moved_ip1 = ip1 - gamma
            moved_ip2 = ip2 + gamma

            # Per-store demand outcomes do not depend on the order sizes.
            outcomes_1 = self._store_outcomes(
                self.poisson_distr_1, self.poisson_lambda_1,
                self.stockout_cost_1, moved_ip1
            )
            outcomes_2 = self._store_outcomes(
                self.poisson_distr_2, self.poisson_lambda_2,
                self.stockout_cost_2, moved_ip2
            )

            for order1 in range(self.capacity_1 - moved_ip1 + 1):
                for order2 in range(self.capacity_2 - moved_ip2 + 1):
                    sr_probs_dict: Dict[
                        Tuple[TwoStoreInventoryState, float], float
                    ] = {}
                    # The two demands are treated as independent, so joint
                    # probabilities are products of the per-store ones.
                    for left_1, prob_1, penalty_1 in outcomes_1:
                        for left_2, prob_2, penalty_2 in outcomes_2:
                            next_state = TwoStoreInventoryState(
                                left_1, order1, left_2, order2
                            )
                            reward = base_reward - penalty_1 - penalty_2
                            sr_probs_dict[(next_state, reward)] = \
                                prob_1 * prob_2
                    actions[(gamma, order1, order2)] = \
                        Categorical(sr_probs_dict)
        return actions

    @staticmethod
    def _store_outcomes(distr, poisson_lambda: float,
                        stockout_cost: float, ip: int):
        """List one store's ``(next on-hand, probability, penalty)`` outcomes.

        Demands ``0 .. ip - 1`` leave ``ip - demand`` units and carry no
        penalty.  Any demand of ``ip`` or more leaves 0 units; that single
        outcome carries ``stockout_cost`` times the expected unmet demand
        ``E[max(D - ip, 0)]``, written in the same closed form as the
        single-store model in this file.
        """
        outcomes = [(ip - sold, distr.pmf(sold), 0.0) for sold in range(ip)]
        stockout_prob = 1 - distr.cdf(ip - 1)
        expected_unmet = (stockout_prob * (poisson_lambda - ip)
                          + ip * distr.pmf(ip))
        outcomes.append((0, stockout_prob, stockout_cost * expected_unmet))
        return outcomes


if __name__ == '__main__':
    from pprint import pprint

    def _heading(title, rule):
        """Print a section title followed by the rule line beneath it."""
        print(title)
        print(rule)

    # Parameters of the single-store example.
    user_capacity = 2
    user_poisson_lambda = 1.0
    user_holding_cost = 1.0
    user_stockout_cost = 10.0

    user_gamma = 0.9

    si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
        SimpleInventoryMDPCap(
            capacity=user_capacity,
            poisson_lambda=user_poisson_lambda,
            holding_cost=user_holding_cost,
            stockout_cost=user_stockout_cost
        )

    _heading("MDP Transition Map", "------------------")
    print(si_mdp)

    # Policy under evaluation: in every state, order up to capacity.
    order_up_to_capacity = {
        InventoryState(alpha, beta): user_capacity - (alpha + beta)
        for alpha in range(user_capacity + 1)
        for beta in range(user_capacity + 1 - alpha)
    }
    fdp: FiniteDeterministicPolicy[InventoryState, int] = \
        FiniteDeterministicPolicy(order_up_to_capacity)

    _heading("Deterministic Policy Map", "------------------------")
    print(fdp)

    implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
        si_mdp.apply_finite_policy(fdp)

    _heading("Implied MP Transition Map", "--------------")
    # Rebuild a plain Markov process from the MRP's transition map by
    # unwrapping the `.state` of every source and destination entry.
    implied_mp_map = {
        s.state: Categorical({s1.state: p for s1, p in v.table().items()})
        for s, v in implied_mrp.transition_map.items()
    }
    print(FiniteMarkovProcess(implied_mp_map))

    _heading("Implied MRP Transition Reward Map", "---------------------")
    print(implied_mrp)

    _heading("Implied MP Stationary Distribution", "-----------------------")
    implied_mrp.display_stationary_distribution()
    print()

    _heading("Implied MRP Reward Function", "---------------")
    implied_mrp.display_reward_function()
    print()

    _heading("Implied MRP Value Function", "--------------")
    implied_mrp.display_value_function(gamma=user_gamma)
    print()

    from rl.dynamic_programming import (
        evaluate_mrp_result,
        policy_iteration_result,
        value_iteration_result,
    )

    _heading("Implied MRP Policy Evaluation Value Function", "--------------")
    pprint(evaluate_mrp_result(implied_mrp, gamma=user_gamma))
    print()

    _heading(
        "MDP Policy Iteration Optimal Value Function and Optimal Policy",
        "--------------"
    )
    opt_vf_pi, opt_policy_pi = policy_iteration_result(
        si_mdp,
        gamma=user_gamma
    )
    pprint(opt_vf_pi)
    print(opt_policy_pi)
    print()

    _heading(
        "MDP Value Iteration Optimal Value Function and Optimal Policy",
        "--------------"
    )
    opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
    pprint(opt_vf_vi)
    print(opt_policy_vi)
    print()


MDP Transition Map
------------------
From State InventoryState(on_hand=0, on_order=0):
  With Action 0:
    To [State InventoryState(on_hand=0, on_order=0) and Reward -10.000] with Probability 1.000
  With Action 1:
    To [State InventoryState(on_hand=0, on_order=1) and Reward -10.000] with Probability 1.000
  With Action 2:
    To [State InventoryState(on_hand=0, on_order=2) and Reward -10.000] with Probability 1.000
From State InventoryState(on_hand=0, on_order=1):
  With Action 0:
    To [State InventoryState(on_hand=1, on_order=0) and Reward -0.000] with Probability 0.368
    To [State InventoryState(on_hand=0, on_order=0) and Reward -3.679] with Probability 0.632
  With Action 1:
    To [State InventoryState(on_hand=1, on_order=1) and Reward -0.000] with Probability 0.368
    To [State InventoryState(on_hand=0, on_order=1) and Reward -3.679] with Probability 0.632
From State InventoryState(on_hand=0, on_order=2):
  With Action 0:
    To [State InventoryState(on_hand=2, on_order=