In [None]:
import numpy as np

In [None]:
class CarRental():
    """Simulator for a car rental business operating two locations.

    ``state[i]`` holds the number of cars currently parked at location ``i``
    and ``reward`` is the running balance: rental income minus transfer costs.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Restore the default parameters, empty both locations and zero the reward."""
        self.nb_location = 2
        # Poisson means of the rental requests / returns drawn per step, per location
        self.car_request = (3, 4)
        self.car_return = (3, 2)
        self.state = [0, 0]
        self.rent_reward = 10       # income per car rented out
        self.transfer_cost = 2      # cost per car moved between locations
        self.reward = 0
        self.location_capacity = 20 # cars above this count are dropped

    def rent_and_return(self, log = False):
        """Simulate one step of returns and rentals at every location.

        For each location a Poisson number of returns is added first (capped
        at ``location_capacity``), then a Poisson number of rental requests is
        served as far as the stock allows. ``self.state`` and ``self.reward``
        are updated in place.

        Args:
            log: when true, print a trace of the draws and resulting state.
        """
        for i in range(self.nb_location):
            nb_rent_request = np.random.poisson(self.car_request[i], 1)[0]
            nb_return = np.random.poisson(self.car_return[i], 1)[0]
            if log: print("Rent, Return:", nb_rent_request, nb_return)
            # first return cars
            self.state[i] = min(self.location_capacity, self.state[i] + nb_return)
            if log: print("Return", nb_return, "on location", i, "nb cars:", self.state[i])
            # rent out cars; requests beyond the available stock are refused
            nb_rent = nb_rent_request
            if (nb_rent_request > self.state[i]):
                if log: print("!!! Refusing", nb_rent_request - self.state[i], "rentals on location", i)
                nb_rent = self.state[i]
            if log: print("Renting", nb_rent, "from location ", i, "nb cars:", self.state[i])
            self.state[i] -= nb_rent
            self.reward += self.rent_reward * nb_rent
            if log: print("State:", self.state)
            if log: print("Reward:", self.reward)

    def perform_action(self, action):
        """Apply a transfer: update ``self.state`` and deduct its cost from ``self.reward``."""
        state, cost = self.get_action_output(action)
        self.state = state
        self.reward -= cost

    def get_action_output(self, action):
        """Compute the outcome of a transfer without modifying the simulator.

        Args:
            action: number of cars to move; positive moves cars from location
                0 to location 1, negative moves them the other way.

        Returns:
            ``(state, cost)`` where ``state`` is the new ``[cars_0, cars_1]``
            list and ``cost`` is the transfer cost. The request is clipped to
            the cars available at the source, the destination is capped at
            ``location_capacity``, and only cars actually moved are charged.
            One car moved from location 0 to location 1 is free of charge.
        """
        if (action > 0):
            effective_action = min(action, self.state[0])
        else:
            effective_action = -min(-action, self.state[1])
        state = [
            min(self.state[0] - effective_action, self.location_capacity),
            min(self.state[1] + effective_action, self.location_capacity),
        ]
        # Charge for the cars that really move, not for the requested amount:
        # a request that cannot be honoured must not cost anything.
        cost = self.transfer_cost * abs(effective_action)
        if (effective_action >= 1):
            # the first car going from location 0 to location 1 travels for free
            cost -= self.transfer_cost
        return state, cost


car_rental = CarRental()

In [None]:
# Baseline run: 100 independent episodes of 30 steps each in which cars are
# never moved between locations; the rewards of all episodes are summed.
total_reward = 0
for _episode in range(100):
    car_rental.reset()
    for _step in range(30):
        car_rental.rent_and_return()
        # Alternative hand-written policies that can be tried here:
        # car_rental.perform_action(1)
        # car_rental.perform_action((car_rental.state[0] - car_rental.state[1] ) // 2)
    # print(car_rental.reward)
    total_reward = total_reward + car_rental.reward
print("Total:", total_reward)

In [None]:
import math # compute Poisson's probabilities
# State-value table: maps (cars at location 1, cars at location 2) to its
# estimated value. States missing from the table are treated as value 0.
V_s = {}
# Discount factor applied to the value of the next state.
gamma = 0.9


def policy(s1, s2):
    """Return the transfer that leads to the best-valued state from ``(s1, s2)``.

    Every feasible transfer is tried, from moving all ``s2`` cars of the
    second location to the first (``-s2``) up to moving all ``s1`` cars of the
    first location to the second (``+s1``). The action whose resulting state
    has the highest value in ``V_s`` wins; on ties the first candidate
    (smallest action) is kept. The transfer cost is not taken into account.

    Args:
        s1: number of cars at the first location.
        s2: number of cars at the second location.

    Returns:
        The selected action (positive: first -> second location).
    """
    # get_action_output() works from car_rental.state, so evaluate the
    # candidates from the state we were asked about and put the simulator
    # back the way we found it afterwards.
    saved_state = car_rental.state
    car_rental.state = [s1, s2]
    try:
        max_Vs = float("-inf")
        best_action = 0
        for a in range(-s2, s1 + 1):
            state_n, _cost = car_rental.get_action_output(a)
            value = V_s.get((state_n[0], state_n[1]), 0)
            if value > max_Vs:
                max_Vs = value
                best_action = a
    finally:
        car_rental.state = saved_state
    return best_action

In [None]:
def _expected_rentals(nb_cars, l_ambda):
    """Return E[min(X, nb_cars)] for X ~ Poisson(l_ambda).

    This is the expected number of cars rented out when ``nb_cars`` are
    available: demand above the stock is served with ``nb_cars`` cars only.
    """
    expected = 0.0
    cumulative = 0.0
    for k in range(nb_cars):
        p_k = math.pow(l_ambda, k) * math.exp(-l_ambda) / math.factorial(k)
        expected += k * p_k
        cumulative += p_k
    # all remaining probability mass (demand >= nb_cars) rents out every car;
    # max() guards against a tiny negative value from float rounding
    return expected + nb_cars * max(0.0, 1.0 - cumulative)


# Expected rentals for every possible stock level, per location (loop invariant).
_expected_rentals_table = [
    [_expected_rentals(n, l_ambda) for n in range(car_rental.location_capacity + 1)]
    for l_ambda in car_rental.car_request
]

for _sweep in range(100):
    # Policy evaluation: in-place sweeps over every (s1, s2) state.
    for s1, s2 in np.ndindex((car_rental.location_capacity + 1, car_rental.location_capacity + 1)):
        # The simulator must be in the evaluated state before the policy and
        # get_action_output() are queried, since both read car_rental.state.
        car_rental.state = [s1, s2]
        action = policy(s1, s2)
        state_n, cost = car_rental.get_action_output(action)
        sn1, sn2 = state_n[0], state_n[1]
        if (sn1 < 0 or sn2 < 0):
            print(action, car_rental.state, state_n)
            raise IndexError
        # Read the successor value before overwriting V_s[(s1, s2)]: both keys
        # are the same whenever the action leaves the state unchanged.
        next_value = V_s.setdefault((sn1, sn2), 0.0)
        # Expected rental income after the transfer, summed over both locations.
        expected_income = car_rental.rent_reward * (
            _expected_rentals_table[0][sn1] + _expected_rentals_table[1][sn2]
        )
        # One-step backup: the transfer cost and the discounted successor value
        # are counted exactly once. The successor is the post-transfer state;
        # rentals and returns are not propagated into it.
        V_s[(s1, s2)] = expected_income - cost + gamma * next_value

In [None]:
# Dump the whole value table: one entry per (cars at location 1, cars at location 2) key.
print(V_s)

In [None]:
# Same experiment as the baseline (100 episodes of 30 steps), but after each
# step of rentals and returns the cars are moved as chosen by policy().
total_reward = 0
for _episode in range(100):
    car_rental.reset()
    for _step in range(30):
        car_rental.rent_and_return()
        cars_1, cars_2 = car_rental.state[0], car_rental.state[1]
        action = policy(cars_1, cars_2)
        car_rental.perform_action(action)
        # print("From state", car_rental.state, "Action", action)
    # print(car_rental.reward)
    total_reward = total_reward + car_rental.reward
print("Total:", total_reward)