In [1]:
from dataclasses import dataclass
from typing import Tuple, Dict, Mapping
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical
from scipy.stats import poisson

In [2]:
@dataclass(frozen=True)
class InventoryState:
    A_on_hand: int
    A_on_order: int
    
    B_on_hand: int
    B_on_order: int

    def A_inventory_position(self) -> int:
        return self.A_on_hand + self.A_on_order
    
    def B_inventory_position(self) -> int:
        return self.B_on_hand + self.B_on_order


InvOrderMapping = Mapping[
    InventoryState,
    Mapping[int, Categorical[Tuple[InventoryState, float]]]
]

In [3]:
class TwoStoreMDP(FiniteMarkovDecisionProcess[InventoryState, int]):

    def __init__(
        self,
        A_capacity: int,
        A_poisson_lambda: float,
        A_holding_cost: float,
        A_stockout_cost: float,
        
        B_capacity: int,
        B_poisson_lambda: float,
        B_holding_cost: float,
        B_stockout_cost: float,
        
        supplier_cost: float,
        transfer_cost: float
        
    ):
        self.A_capacity: int = A_capacity
        self.A_poisson_lambda: float = A_poisson_lambda
        self.A_holding_cost: float = A_holding_cost
        self.A_stockout_cost: float = A_stockout_cost
        self.A_poisson_distr = poisson(A_poisson_lambda)
        
        self.B_capacity: int = B_capacity
        self.B_poisson_lambda: float = B_poisson_lambda
        self.B_holding_cost: float = B_holding_cost
        self.B_stockout_cost: float = B_stockout_cost
        self.B_poisson_distr = poisson(B_poisson_lambda)
        
        self.supplier_cost: float = supplier_cost
        self.transfer_cost: float = transfer_cost
        
        
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        
        # Instead of State X order -->
        
        # State X Tuple(A_order_supplier, B_order_supplier, A_to_B_transfer)
        
        d: Dict[InventoryState, Dict[Tuple[int, int, int], Categorical[Tuple[InventoryState,
                                                            float]]]] = {}
        # A_alpha = Inventory in Store A
        for A_alpha in range(self.A_capacity + 1):
            # A_beta = inventory on truck from supplier (order from prev day)
            for A_beta in range(self.A_capacity + 1 - A_alpha):
                # B_alpha = Inventory in Store B
                for B_alpha in range(self.B_capacity + 1):
                    #B_beta = inventory on truck from supplier (order from prev day)
                    for B_beta in range(self.B_capacity + 1 - B_alpha):
                        state: InventoryState = InventoryState(A_alpha, A_beta, B_alpha, B_beta)
                        
                        A_ip: int = state.A_inventory_position()
                        B_ip: int = state.B_inventory_position()
                        
                        A_base_reward: float = -self.A_holding_cost * A_alpha + \
                                                -self.supplier_cost * A_beta
                        B_base_reward: float = -self.B_holding_cost * B_alpha + \
                                                -self.supplier_cost * B_beta
                    
                        
                        # We can transfer between stores, and this happens 
                        # "instantaneously" between periods.
                        # For C-(A+B), we can split this up between an order to
                        # the supplier, and order to the other store. 
                        # Each combination incurs a different price. 
                        
                        max_from_A: int = A_ip
                        max_to_A: int = B_ip
                        
                        max_from_B: int = B_ip
                        max_to_B: int = A_ip
                        
                        for A_to_B in range(-min(max_from_B, max_to_A), min(max_from_A,max_to_B)):
                            transfer_reward = -self.transfer_cost * A_to_B
                            
                            A_ip_new: int = A_ip - A_to_B
                            B_ip_new: int = B_ip + A_to_B
                            
                            total_base_reward = A_base_reward + B_base_reward + transfer_reward
                            
                           
                            d1: Dict[Tuple[int,int,int], Categorical[Tuple[InventoryState, float]]] = {}

                            for A_order in range(self.A_capacity - A_ip_new + 1):
                                for B_order in range(self.B_capacity - B_ip_new + 1):
                                    
                                    group_order = (A_order, B_order, A_to_B)
                                    
                                    # Probabilities of next states with NO stockout
                                    sr_probs_dict: Dict[Tuple[InventoryState, float], float] =\
                                        {(InventoryState(A_ip_new - i, A_order, B_ip_new - j, B_order), total_reward):
                                         self.A_poisson_distr.pmf(i) * self.B_poisson_distr.pmf(j) 
                                         for i in range(A_ip_new) for j in range(B_ip_new)}
                                    
                                    # Probabilities of stockout
                                    A_probability: float = 1 - self.A_poisson_distr.cdf(A_ip_new - 1) 
                                    B_probability: float = 1 - self.B_poisson_distr.cdf(B_ip_new - 1) 
                                    
                                    total_reward: float = total_base_reward - self.A_stockout_cost *\
                                        (A_probability * (self.A_poisson_lambda - A_ip_new) +
                                         A_ip_new * self.A_poisson_distr.pmf(A_ip_new)) +\
                                        - self.B_stockout_cost *\
                                        (B_probability * (self.B_poisson_lambda - B_ip_new) +
                                         B_ip_new * self.B_poisson_distr.pmf(B_ip_new))
                                    
                                    
                                    # Add in stockout for A
                                    sr_probs_dict.update({(InventoryState(0, A_order, j, B_order), total_reward):
                                        A_probability * self.B_poisson_distr.pmf(j) for j in range(B_ip_new)})
                                                
                                    # Add stockout for B
                                    sr_probs_dict.update({(InventoryState(i, A_order, 0, B_order), total_reward):
                                        B_probability * self.A_poisson_distr.pmf(i) for i in range(A_ip_new)})
                                     
                                    # Combined stockout
                                    sr_probs_dict.update({(InventoryState(0, A_order, 0, B_order), total_reward):
                                        B_probability * A_probability})
                                    
                                                      
                                                      
                                                      
                                    d1[group_order] = Categorical(sr_probs_dict)

                            d[state] = d1
        return d

In [5]:
if __name__ == '__main__':
    from pprint import pprint

    user_A_capacity = 2
    user_A_poisson_lambda = 1.0
    user_A_holding_cost = 1.0
    user_A_stockout_cost = 10.0
    
    user_B_capacity = 3
    user_B_poisson_lambda = 0.8
    user_B_holding_cost = 0.8
    user_B_stockout_cost = 12.0
    
    user_supplier_cost = 0.4
    user_transfer_cost = 0.1

    user_gamma = 0.9

    si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] =\
        TwoStoreMDP(
            A_capacity=user_A_capacity,
            A_poisson_lambda=user_A_poisson_lambda,
            A_holding_cost=user_A_holding_cost,
            A_stockout_cost=user_A_stockout_cost,
        
            B_capacity=user_B_capacity,
            B_poisson_lambda=user_B_poisson_lambda,
            B_holding_cost=user_B_holding_cost,
            B_stockout_cost=user_B_stockout_cost,
        
            supplier_cost=user_supplier_cost,
            transfer_cost=user_transfer_cost
        )

    print("MDP Transition Map")
    print("------------------")
    print(si_mdp)

    # Boy this gets complicated...
    # Across all combinations of demand for both stores
    # Then across all possible existing orders for each store
    # Then finally across all possible betwee-store transfers
    fdp: FiniteDeterministicPolicy[InventoryState, Tuple[int,int,int]] =\
        FiniteDeterministicPolicy(
            {InventoryState(A_alpha, A_beta, B_alpha, B_beta): 
             [user_A_capacity - (A_alpha + A_beta), 
              user_B_capacity - (B_alpha + B_beta),
              transfer]
            for A_alpha in range(user_A_capacity + 1)
            for B_alpha in range(user_B_capacity + 1)
            for A_beta in range(user_A_capacity + 1 - A_alpha)
            for B_beta in range(user_B_capacity + 1 - B_alpha)
            for transfer in range(-min(B_alpha, user_A_capacity+1-A_alpha-A_beta),
                                   min(A_alpha, user_B_capacity+1-B_alpha-B_beta))
            }
    )
    
    
    # print("Keys:")
    # print(si_mdp.mapping.keys())
    
    print("Deterministic Policy Map")
    print("------------------------")
    print(fdp)

    implied_mrp: FiniteMarkovRewardProcess[InventoryState] =\
        si_mdp.apply_finite_policy(fdp)
    print("Implied MP Transition Map")
    print("--------------")
    print(FiniteMarkovProcess(
        {s.state: Categorical({s1.state: p for s1, p in v.table().items()})
         for s, v in implied_mrp.transition_map.items()}
    ))

    print("Implied MRP Transition Reward Map")
    print("---------------------")
    print(implied_mrp)

    print("Implied MP Stationary Distribution")
    print("-----------------------")
    implied_mrp.display_stationary_distribution()
    print()

    print("Implied MRP Reward Function")
    print("---------------")
    implied_mrp.display_reward_function()
    print()

    print("Implied MRP Value Function")
    print("--------------")
    implied_mrp.display_value_function(gamma=user_gamma)
    print()

    from rl.dynamic_programming import evaluate_mrp_result
    from rl.dynamic_programming import policy_iteration_result
    from rl.dynamic_programming import value_iteration_result

    print("Implied MRP Policy Evaluation Value Function")
    print("--------------")
    pprint(evaluate_mrp_result(implied_mrp, gamma=user_gamma))
    print()

    print("MDP Policy Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_pi, opt_policy_pi = policy_iteration_result(
        si_mdp,
        gamma=user_gamma
    )
    pprint(opt_vf_pi)
    print(opt_policy_pi)
    print()

    print("MDP Value Iteration Optimal Value Function and Optimal Policy")
    print("--------------")
    opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
    pprint(opt_vf_vi)
    print(opt_policy_vi)
    print()

MDP Transition Map
------------------
From State InventoryState(A_on_hand=0, A_on_order=0, B_on_hand=0, B_on_order=1):
  With Action (0, 0, -1):
    To [Terminal State InventoryState(A_on_hand=0, A_on_order=0, B_on_hand=0, B_on_order=0) and Reward -13.579] with Probability 1.000
  With Action (0, 1, -1):
    To [State InventoryState(A_on_hand=0, A_on_order=0, B_on_hand=0, B_on_order=1) and Reward -13.579] with Probability 1.000
  With Action (0, 2, -1):
    To [State InventoryState(A_on_hand=0, A_on_order=0, B_on_hand=0, B_on_order=2) and Reward -13.579] with Probability 1.000
  With Action (0, 3, -1):
    To [State InventoryState(A_on_hand=0, A_on_order=0, B_on_hand=0, B_on_order=3) and Reward -13.579] with Probability 1.000
  With Action (1, 0, -1):
    To [State InventoryState(A_on_hand=0, A_on_order=1, B_on_hand=0, B_on_order=0) and Reward -13.579] with Probability 1.000
  With Action (1, 1, -1):
    To [State InventoryState(A_on_hand=0, A_on_order=1, B_on_hand=0, B_on_order=1) and

KeyError: InventoryState(A_on_hand=0, A_on_order=0, B_on_hand=0, B_on_order=1)