# In [1]:
import gymnasium as gym
import numpy as np
from scipy.stats import poisson

class JacksCarRentalEnv(gym.Env):
    """
    Jack's Car Rental environment following the Gym interface.

    Two locations, A and B, each hold between 0 and ``max_cars`` cars. The
    state is the tuple ``(cars_at_A, cars_at_B)``. On every step the agent
    first moves cars between the locations, then Poisson-distributed rental
    requests and returns are sampled for each location.

    Actions are integers in ``[0, 2 * max_move_cars]`` and are decoded as
    ``action - max_move_cars``: a positive value moves that many cars from
    A to B, a negative value moves cars from B to A, and zero moves nothing.
    """

    def __init__(self, max_cars=20, max_move_cars=5, rental_credit=10, move_car_cost=2,
                 request_lambda=None, return_lambda=None):
        """
        Initialize the environment.

        Args:
            max_cars: Maximum number of cars a single location can hold.
            max_move_cars: Maximum number of cars moved in one step.
            rental_credit: Reward earned for every car rented out.
            move_car_cost: Cost charged for every car moved.
            request_lambda: Poisson means of the rental requests at
                locations A and B. Defaults to ``[3, 4]``.
            return_lambda: Poisson means of the returns at locations A
                and B. Defaults to ``[3, 2]``.
        """
        super().__init__()

        self.max_cars = max_cars
        self.max_move_cars = max_move_cars
        self.rental_credit = rental_credit
        self.move_car_cost = move_car_cost

        # Build the default lists per instance: a list literal in the
        # signature would be one object shared by every environment.
        # Poisson distribution lambda for requests
        self.request_lambda = [3, 4] if request_lambda is None else request_lambda
        # Poisson distribution lambda for returns
        self.return_lambda = [3, 2] if return_lambda is None else return_lambda

        # Define the action and observation space
        self.action_space = gym.spaces.Discrete(2 * max_move_cars + 1)
        self.observation_space = gym.spaces.Tuple(
            (gym.spaces.Discrete(max_cars + 1), gym.spaces.Discrete(max_cars + 1))
        )

        # Stays None until reset() is called.
        self.state = None

    def step(self, action):
        """
        Execute one time step within the environment.

        Args:
            action: Integer action; ``action - max_move_cars`` is the signed
                number of cars to move (positive: A to B, negative: B to A).

        Returns:
            A tuple ``(state, reward, terminated, truncated, info)``. The two
            flags are always ``False`` and ``info`` is an empty dict.

        Raises:
            RuntimeError: If called before ``reset()``.
        """
        if self.state is None:
            raise RuntimeError("step() called before reset(): the environment has no state yet")

        # Decode the action: action 0 moves max_move_cars cars from B to A,
        # action max_move_cars moves nothing, and the largest action moves
        # max_move_cars cars from A to B.
        requested_move = action - self.max_move_cars
        cars_at_A, cars_at_B = self.state

        # Move cars overnight. The number moved is limited by the cars
        # available at the source and by max_move_cars; the receiving
        # location is capped at max_cars, so surplus cars are dropped.
        if requested_move > 0:
            cars_moved = min(cars_at_A, requested_move, self.max_move_cars)
            cars_at_A -= cars_moved
            cars_at_B = min(cars_at_B + cars_moved, self.max_cars)
        elif requested_move < 0:
            cars_moved = min(cars_at_B, -requested_move, self.max_move_cars)
            cars_at_B -= cars_moved
            cars_at_A = min(cars_at_A + cars_moved, self.max_cars)
        else:
            cars_moved = 0

        # Simulate rental requests and returns for both locations. The
        # sampling order is kept fixed so seeded runs stay reproducible.
        requests_A = int(poisson.rvs(self.request_lambda[0]))
        requests_B = int(poisson.rvs(self.request_lambda[1]))
        returns_A = int(poisson.rvs(self.return_lambda[0]))
        returns_B = int(poisson.rvs(self.return_lambda[1]))

        # A location cannot rent out more cars than it currently holds.
        rentals_A = min(cars_at_A, requests_A)
        rentals_B = min(cars_at_B, requests_B)

        # Returned cars only become available after today's rentals, and
        # each location is capped at max_cars.
        cars_at_A = min(cars_at_A - rentals_A + returns_A, self.max_cars)
        cars_at_B = min(cars_at_B - rentals_B + returns_B, self.max_cars)

        # Rental income minus the cost of the cars that were actually moved
        # (not the requested amount, which may exceed what was available).
        reward = (rentals_A + rentals_B) * self.rental_credit - cars_moved * self.move_car_cost

        # Store plain Python ints so NumPy scalars do not leak into the state.
        self.state = (int(cars_at_A), int(cars_at_B))

        return self.state, reward, False, False, {}

    def reset(self, option="random"):
        """
        Reset the state of the environment to an initial state.

        Args:
            option: ``"random"`` draws each location's car count uniformly
                from ``0..max_cars``; ``"equal"`` puts ``max_cars // 2`` cars
                at both locations.

        Returns:
            The new state ``(cars_at_A, cars_at_B)``.

        Raises:
            ValueError: If ``option`` is not one of the supported values.
        """
        if option == "random":
            self.state = (int(np.random.randint(0, self.max_cars + 1)),
                          int(np.random.randint(0, self.max_cars + 1)))
        elif option == "equal":
            initial_cars = self.max_cars // 2
            self.state = (initial_cars, initial_cars)
        else:
            # Fail loudly instead of silently keeping a stale (or None) state.
            raise ValueError(f"Unknown reset option: {option!r} (expected 'random' or 'equal')")

        return self.state

    def render(self, mode='human'):
        """
        Render the environment.

        Args:
            mode: ``'human'`` prints the car counts; ``'ansi'`` returns the
                same text as a string.

        Raises:
            NotImplementedError: If ``mode`` is not supported.
        """
        if mode == 'human':
            print(f"Location A: {self.state[0]} cars, Location B: {self.state[1]} cars")
        elif mode == 'ansi':
            return f"Location A: {self.state[0]} cars, Location B: {self.state[1]} cars"
        else:
            # An f-string keeps this a NotImplementedError even when mode is
            # not a str (concatenation would raise TypeError instead).
            raise NotImplementedError(f"Render mode not supported: {mode}")


# Example usage: build the environment with its default parameters.
env = JacksCarRentalEnv()

# The constructor leaves `state` as None; it is only populated once reset()
# is called, so this expression evaluates to None at this point.
env.state