-
Notifications
You must be signed in to change notification settings - Fork 0
/
car_rental.py
169 lines (141 loc) · 7.22 KB
/
car_rental.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#######################################################################
# Copyright (C) #
# 2016 Shangtong Zhang(zhangshangtong.cpp@gmail.com) #
# 2016 Kenta Shimada(hyperkentakun@gmail.com) #
# 2017 Aja Rangaswamy (aja004@gmail.com) #
# Permission given to modify the code as long as you keep this #
# declaration at the top #
#######################################################################
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from math import exp, factorial
import seaborn as sns
# maximum # of cars in each location
MAX_CARS = 20
# maximum # of cars to move during night
MAX_MOVE_OF_CARS = 5
# expectation for rental requests in first location
RENTAL_REQUEST_FIRST_LOC = 3
# expectation for rental requests in second location
RENTAL_REQUEST_SECOND_LOC = 4
# expectation for # of cars returned in first location
RETURNS_FIRST_LOC = 3
# expectation for # of cars returned in second location
RETURNS_SECOND_LOC = 2
DISCOUNT = 0.9
# credit earned by a car
RENTAL_CREDIT = 10
# cost of moving a car
MOVE_CAR_COST = 2
# all possible actions
actions = np.arange(-MAX_MOVE_OF_CARS, MAX_MOVE_OF_CARS + 1)
# An up bound for poisson distribution
# If n is greater than this value, then the probability of getting n is truncated to 0
POISSON_UPPER_BOUND = 11
# Probability for poisson distribution
# @lam: lambda should be less than 10 for this function
poisson_cache = dict()
def poisson(n, lam):
global poisson_cache
key = n * 10 + lam
if key not in poisson_cache.keys():
poisson_cache[key] = exp(-lam) * pow(lam, n) / factorial(n)
return poisson_cache[key]
# @state: [# of cars in first location, # of cars in second location]
# @action: positive if moving cars from first location to second location,
# negative if moving cars from second location to first location
# @stateValue: state value matrix
# @constant_returned_cars: if set True, model is simplified such that
# the # of cars returned in daytime becomes constant
# rather than a random value from poisson distribution, which will reduce calculation time
# and leave the optimal policy/value state matrix almost the same
def expected_return(state, action, state_value, constant_returned_cars):
# initailize total return
returns = 0.0
# cost for moving cars
returns -= MOVE_CAR_COST * abs(action)
# go through all possible rental requests
for rental_request_first_loc in range(0, POISSON_UPPER_BOUND):
for rental_request_second_loc in range(0, POISSON_UPPER_BOUND):
# moving cars
num_of_cars_first_loc = int(min(state[0] - action, MAX_CARS))
num_of_cars_second_loc = int(min(state[1] + action, MAX_CARS))
# valid rental requests should be less than actual # of cars
real_rental_first_loc = min(num_of_cars_first_loc, rental_request_first_loc)
real_rental_second_loc = min(num_of_cars_second_loc, rental_request_second_loc)
# get credits for renting
reward = (real_rental_first_loc + real_rental_second_loc) * RENTAL_CREDIT
num_of_cars_first_loc -= real_rental_first_loc
num_of_cars_second_loc -= real_rental_second_loc
# probability for current combination of rental requests
prob = poisson(rental_request_first_loc, RENTAL_REQUEST_FIRST_LOC) * \
poisson(rental_request_second_loc, RENTAL_REQUEST_SECOND_LOC)
if constant_returned_cars:
# get returned cars, those cars can be used for renting tomorrow
returned_cars_first_loc = RETURNS_FIRST_LOC
returned_cars_second_loc = RETURNS_SECOND_LOC
num_of_cars_first_loc = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
num_of_cars_second_loc = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
returns += prob * (reward + DISCOUNT * state_value[num_of_cars_first_loc, num_of_cars_second_loc])
else:
for returned_cars_first_loc in range(0, POISSON_UPPER_BOUND):
for returned_cars_second_loc in range(0, POISSON_UPPER_BOUND):
num_of_cars_first_loc_ = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
num_of_cars_second_loc_ = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
prob_ = poisson(returned_cars_first_loc, RETURNS_FIRST_LOC) * \
poisson(returned_cars_second_loc, RETURNS_SECOND_LOC) * prob
returns += prob_ * (reward + DISCOUNT * state_value[num_of_cars_first_loc_, num_of_cars_second_loc_])
return returns
def figure_4_2(constant_returned_cars=True):
value = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
policy = np.zeros(value.shape, dtype=np.int)
iterations = 0
_, axes = plt.subplots(2, 3, figsize=(40, 20))
plt.subplots_adjust(wspace=0.1, hspace=0.2)
axes = axes.flatten()
while True:
fig = sns.heatmap(np.flipud(policy), cmap="YlGnBu", ax=axes[iterations])
fig.set_ylabel('# cars at first location', fontsize=30)
fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
fig.set_xlabel('# cars at second location', fontsize=30)
fig.set_title('policy %d' % (iterations), fontsize=30)
# policy evaluation (in-place)
while True:
new_value = np.copy(value)
for i in range(MAX_CARS + 1):
for j in range(MAX_CARS + 1):
new_value[i, j] = expected_return([i, j], policy[i, j], new_value,
constant_returned_cars)
value_change = np.abs((new_value - value)).sum()
print('value change %f' % (value_change))
value = new_value
if value_change < 1e-4:
break
# policy improvement
new_policy = np.copy(policy)
for i in range(MAX_CARS + 1):
for j in range(MAX_CARS + 1):
action_returns = []
for action in actions:
if (action >= 0 and i >= action) or (action < 0 and j >= abs(action)):
action_returns.append(expected_return([i, j], action, value, constant_returned_cars))
else:
action_returns.append(-float('inf'))
new_policy[i, j] = actions[np.argmax(action_returns)]
policy_change = (new_policy != policy).sum()
print('policy changed in %d states' % (policy_change))
policy = new_policy
if policy_change == 0:
fig = sns.heatmap(np.flipud(value), cmap="YlGnBu", ax=axes[-1])
fig.set_ylabel('# cars at first location', fontsize=30)
fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
fig.set_xlabel('# cars at second location', fontsize=30)
fig.set_title('optimal value', fontsize=30)
break
iterations += 1
plt.savefig('../images/figure_4_2.png')
plt.close()
if __name__ == '__main__':
figure_4_2()