In [4]:
import numpy as np

In [5]:
class CarRental():
    """Simulator of a car rental business operating two locations.

    Each call to `rent_and_return` simulates one round of Poisson-distributed
    returns and rental requests at every location; `perform_action` moves
    cars between the two locations. Earnings minus transfer costs accumulate
    in `self.reward`.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """(Re)initialise every parameter and empty both locations."""
        self.nb_location = 2
        # Poisson means of the rental requests / returns, indexed by location.
        self.car_request = (3, 4)
        self.car_return = (3, 2)
        # Number of cars currently parked at each location.
        self.state = [0, 0]
        # Money earned per rented car and paid per transferred car.
        self.rent_reward = 10
        self.transfer_cost = 2
        # Running total: rental income minus transfer costs.
        self.reward = 0
        # Maximum number of cars a single location can hold.
        self.location_capacity = 20

    def rent_and_return(self, log = False):
        """Simulate one round of returns followed by rentals at each location.

        Returned cars are added first (capped at `location_capacity`), then
        rental requests are served from the resulting stock; requests beyond
        the stock are refused. `self.reward` grows by `rent_reward` per car
        actually rented.

        Args:
            log: when true, print a trace of every step.
        """
        for i in range(self.nb_location):
            # Draw order (requests, then returns) is kept stable so seeded
            # runs stay reproducible.
            nb_rent_request = np.random.poisson(self.car_request[i], 1)[0]
            nb_return = np.random.poisson(self.car_return[i], 1)[0]
            if log: print("Rent, Return:", nb_rent_request, nb_return)
            # first return cars
            self.state[i] = min(self.location_capacity, self.state[i] + nb_return)
            if log: print("Return", nb_return, "on location", i, "nb cars:", self.state[i])
            # rent out cars, never more than what is parked here
            if log and nb_rent_request > self.state[i]:
                print("!!! Refusing", nb_rent_request - self.state[i], "rentals on location", i)
            nb_rent = min(nb_rent_request, self.state[i])
            if log: print("Renting", nb_rent, "from location ", i, "nb cars:", self.state[i])
            self.state[i] -= nb_rent
            self.reward += self.rent_reward * nb_rent
            if log: print("State:", self.state)
            if log: print("Reward:", self.reward)

    def perform_action(self, action):
        """Apply a transfer to the current state and pay for it.

        Args:
            action: number of cars to move from location 0 to location 1
                (negative values move cars the other way).
        """
        self.state, cost = self.get_action_output(self.state, action)
        self.reward -= cost

    def get_action_output(self, state, action):
        """Compute the result of a transfer without mutating the simulator.

        Args:
            state: pair `[cars at location 0, cars at location 1]`.
            action: requested number of cars to move from location 0 to
                location 1; negative values move cars from 1 to 0.

        Returns:
            `(out_state, cost)` where `out_state` is the new pair of car
            counts (each capped at `location_capacity`) and `cost` is the
            price of the transfer.
        """
        # A location cannot ship more cars than it holds: clamp the request.
        if action > 0:
            effective_action = min(action, state[0])
        else:
            effective_action = -min(-action, state[1])
        out_state = [
            min(state[0] - effective_action, self.location_capacity),
            min(state[1] + effective_action, self.location_capacity),
        ]
        # Only the cars that really moved are charged; charging for the
        # requested amount would bill transfers that never happened.
        cost = self.transfer_cost * abs(effective_action)
        if effective_action >= 1:
            # The first car moved from location 0 to location 1 is free.
            cost -= self.transfer_cost
        return out_state, cost
        
# Module-level simulator instance; the simulation loops and the policy/value
# helpers defined in later cells read it as a global.
car_rental = CarRental()

In [6]:
# Baseline: ten independent 200-step episodes in which no car is ever
# transferred; the rewards of all episodes are summed.
total_reward = 0
for episode in range(10):
    car_rental.reset()
    for day in range(200):
        car_rental.rent_and_return()
        # Alternative transfer rules that can be enabled for comparison:
        # car_rental.perform_action(1)
        # car_rental.perform_action((car_rental.state[0] - car_rental.state[1] ) // 2)
    total_reward = total_reward + car_rental.reward
print("Total:", total_reward)

Total: 97160


In [7]:
import math # compute Poisson's probabilities
from random import randrange

def do_nothing_policy(*_ignored):
    """Policy that never transfers a car, whatever arguments it receives."""
    no_transfer = 0
    return no_transfer

def random_policy(_, s1, s2):
    """Pick a uniformly random feasible transfer.

    Args:
        _: ignored (keeps the signature aligned with the other policies).
        s1: number of cars at the first location.
        s2: number of cars at the second location.

    Returns:
        An integer in `[-s2, s1]`: positive values move cars from the first
        location to the second, negative values the other way. Balanced
        states (`s1 == s2`) always yield 0.
    """
    if s1 == s2:
        return 0
    # randrange excludes its stop value, hence the +1: without it "move all
    # s1 cars" could never be drawn and, when s1 == 0, neither could
    # "do nothing".
    return randrange(-s2, s1 + 1)

# Policy that moves to the reachable state s' with the highest estimated value V_s[s'].
def greedy_policy(V_s, s1, s2):
    """Choose the transfer leading to the best-valued reachable state.

    Args:
        V_s: dict mapping `(cars at location 1, cars at location 2)` to a
            value estimate; states missing from it count as 0.
        s1: number of cars at the first location.
        s2: number of cars at the second location.

    Returns:
        The transfer in `[-s2, s1]` whose resulting state has the highest
        value (the first such transfer on ties). Balanced states
        (`s1 == s2`) always yield 0.
    """
    # NOTE(review): balanced states skip the search entirely; confirm that
    # this shortcut is intended and not just a guard for the (0, 0) state.
    if s1 == s2:
        return 0
    best_action = 0
    # Start below any possible value so that strongly negative estimates are
    # still comparable (a fixed sentinel would silently ignore them).
    best_value = float("-inf")
    # range() excludes its stop value, hence the +1 so that "move all s1
    # cars" (and "do nothing" when s1 == 0) are considered too.
    for a in range(-s2, s1 + 1):
        state_n, _cost = car_rental.get_action_output([s1, s2], a)
        value = V_s.get((state_n[0], state_n[1]), 0)
        if value > best_value:
            best_value = value
            best_action = a
    return best_action

def _expected_rentals(stock, rate):
    """Return E[min(X, stock)] for a demand X ~ Poisson(rate)."""
    expected = 0.0
    cumulative = 0.0
    for k in range(stock):
        p_k = math.pow(rate, k) * math.exp(-rate) / math.factorial(k)
        expected += k * p_k
        cumulative += p_k
    # Any demand of `stock` or more rents out every available car; the max()
    # only guards against a tiny negative tail caused by float rounding.
    expected += stock * max(0.0, 1.0 - cumulative)
    return expected


def q(V_s, action, s1, s2):
    """Estimate the value of applying `action` in state `(s1, s2)`.

    The transfer is simulated with `car_rental.get_action_output`, then the
    estimate is

        rent_reward * (expected rentals at both locations)
        - transfer cost
        + gamma * V_s[state after the transfer]

    Expected rentals are computed from the Poisson request rates in
    `car_rental.car_request`, truncated at the number of cars available.
    The transfer cost and the discounted next value are each counted exactly
    once. Car returns are not modelled and the state after the transfer
    stands in for the next state.

    Relies on the module-level `car_rental` and `gamma`.

    Args:
        V_s: dict mapping `(cars at location 1, cars at location 2)` to a
            value estimate; states missing from it count as 0.
        action: cars to move from the first location to the second
            (negative values move cars the other way).
        s1: number of cars at the first location.
        s2: number of cars at the second location.

    Returns:
        The estimated action value as a float.

    Raises:
        IndexError: if the state after the transfer has a negative car count.
    """
    state_n, cost = car_rental.get_action_output([s1, s2], action)
    sn1, sn2 = state_n[0], state_n[1]
    if sn1 < 0 or sn2 < 0:
        raise IndexError(
            "action %r on state %r leads to invalid state %r"
            % (action, [s1, s2], state_n)
        )
    v_next = V_s.get((sn1, sn2), 0.0)
    expected_rentals = sum(
        _expected_rentals(stock, rate)
        for stock, rate in zip((sn1, sn2), car_rental.car_request)
    )
    return car_rental.rent_reward * expected_rentals - cost + gamma * v_next

def value_of_state(V_s, s1, s2, policy):
    """Evaluate state `(s1, s2)` under `policy`.

    Repeatedly asks `policy(s1, s2)` for an action and scores it with `q`
    until two successive estimates differ by less than the module-level
    `theta`. The first comparison is made against `V_s[(s1, s2)]`, or 0.0
    when the state is not in `V_s`.

    Returns:
        The last estimate produced by `q`.
    """
    previous = V_s.get((s1, s2), 0.0)
    # print(s1, s2)
    while True:
        chosen = policy(s1, s2)
        v = q(V_s, chosen, s1, s2)
        gap = abs(v - previous)
        previous = v
        if not (gap >= theta):
            return v
    

# Policy improvement
def optimal_policy(V_s, s1, s2):
    """Return the transfer with the highest `q` estimate in state `(s1, s2)`.

    Args:
        V_s: dict of state-value estimates, forwarded to `q`.
        s1: number of cars at the first location.
        s2: number of cars at the second location.

    Returns:
        The best transfer in `[-s2, s1]` as a Python int (the first one on
        ties); 0 when both locations are empty.
    """
    if s1 == 0 and s2 == 0:
        # Nothing can be moved, so skip the evaluation altogether.
        return 0
    # range() excludes its stop value, hence the +1: otherwise "move all s1
    # cars" is never evaluated and "do nothing" is unavailable when s1 == 0.
    actions = range(-s2, s1 + 1)
    qs = [q(V_s, a, s1, s2) for a in actions]
    return actions[int(np.argmax(qs))]


In [None]:
# State-value table, keyed by (cars at location 1, cars at location 2).
V_s = {}
# Discount factor read by q().
gamma = 0.9
car_rental.reset()
# Convergence threshold read by value_of_state().
theta = 2.0

for k in range(100):
    # Freeze the values of the previous sweep: every state of this sweep is
    # improved and evaluated against the same snapshot. (An always-empty
    # table here would make q() see a next-state value of 0 forever, so the
    # estimates would never propagate from one sweep to the next.)
    fixed_V_s = dict(V_s)
    # Bind the snapshot as a default argument so the policy cannot pick up a
    # later rebinding of the global name.
    running_policy = lambda s1, s2, values=fixed_V_s: optimal_policy(values, s1, s2)
    # Policy evaluation
    for s1, s2 in np.ndindex((car_rental.location_capacity + 1,car_rental.location_capacity + 1)):
        vs = value_of_state(fixed_V_s, s1, s2, running_policy)
        V_s[(s1, s2)] = vs

In [None]:
# Find the state with the largest strictly positive value (index_m stays 0
# when no value is positive), then print it followed by the whole table.
m = 0
index_m = 0
for state_key, state_value in V_s.items():
    if state_value > m:
        m, index_m = state_value, state_key
print(index_m)
print(V_s)

In [None]:
# Same experiment as the baseline (ten 200-step episodes), but after every
# rent/return round the transfer chosen by optimal_policy is applied.
total_reward = 0
for episode in range(10):
    car_rental.reset()
    for day in range(200):
        car_rental.rent_and_return()
        action = optimal_policy(V_s, *car_rental.state[:2])
        car_rental.perform_action(action)
    total_reward = total_reward + car_rental.reward
print("Total:", total_reward)

In [None]:
# Spot check: transfer chosen by optimal_policy for s1 = 2 and s2 = 14.
print(optimal_policy(V_s, 2, 14))