In [7]:
import pyquery
import requests
import urllib
Q = pyquery.PyQuery
import collections
import numpy as np

%matplotlib
from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter, FuncFormatter

Using matplotlib backend: TkAgg


In [None]:
# In-memory page cache: get_url stores each downloaded response body here,
# keyed by the fully prepared request URL, so repeated scans reuse pages.
cache = dict()

In [80]:
def get_cars(car_list, results):
    """Scrape every listing under ``car_list`` and file it into ``results``.

    Each child of ``car_list`` is wrapped in a PyQuery object and three
    values are pulled out by CSS class: the odometer reading, the model year
    (first word of the listing title) and the asking price. The pair
    ``(miles, price)`` is appended to ``results[year]``.

    Args:
        car_list: element whose ``getchildren()`` yields one node per listing.
        results: mapping from year to a list of ``(miles, price)`` tuples.
            It is indexed with ``results[year]`` before appending, so it must
            either create missing keys itself or already contain them.

    Returns:
        None. ``results`` is updated in place.
    """
    for node in car_list.getchildren():
        select = Q(node)

        # The page markup is awkward to query, so a few child positions are
        # hardcoded instead of issuing another selector one level down.
        description = select(".gv-description")[0]
        odometer_text = description[0][1].text
        # The odometer text must split into exactly two tokens (number, unit).
        miles_text, _unit = odometer_text.split()
        miles = int(miles_text.replace(",", ""))

        title_text = select(".inventory-title.single-line-text")[0][0].text
        model_year = int(title_text.split()[0])

        price_node = select("span.askingPrice.finalPrice")[0]
        price_text = price_node[1].text.replace("\"", "").strip()
        price_text = price_text.replace("$", "").replace(",", "")
        asking_price = int(price_text)

        results[model_year].append((miles, asking_price))
            
def get_url(url, params):
    """Fetch ``url`` with ``params`` as its query string, memoised in ``cache``.

    The query parameters are folded into the final URL first, and that full
    URL is the cache key, so calls that differ only in ``params`` are cached
    separately.

    Args:
        url: base address of the page.
        params: mapping of query-string parameters for this request.

    Returns:
        The response body (``.content`` of the ``requests`` response), either
        freshly downloaded or taken from ``cache``.
    """
    # Build the URL from the caller's ``params`` rather than the
    # notebook-level ``qp2`` so the function honours its own argument.
    full_url = requests.Request("GET", url, params=params).prepare().url
    if full_url not in cache:
        cache[full_url] = requests.get(full_url).content
    return cache[full_url]
            
def get_listing(url, params):
    """Retrieve a page through ``get_url`` and wrap its body in a PyQuery object.

    Args:
        url: base address of the page.
        params: query-string parameters, forwarded to ``get_url``.

    Returns:
        ``Q(content)`` for the retrieved page content.
    """
    html = get_url(url, params=params)
    document = Q(html)
    return document

def get_car_list(q):
    """Return the first element matching ``.gv-inventory-list`` in ``q``.

    Raises whatever indexing the selector result raises when nothing matches.
    """
    matches = q(".gv-inventory-list")
    return matches[0]

def get_num_results(q):
    """Read the total result count from the page as an ``int``.

    The number is taken from the text of the first child of the first
    ``.total-count`` element.
    """
    counter = q(".total-count")[0]
    count_text = counter[0].text
    return int(count_text)

def c2l(cc):
    """Transpose a sequence of tuples into a list of column tuples.

    ``[(x0, y0), (x1, y1)]`` becomes ``[(x0, x1), (y0, y1)]``; an empty input
    yields an empty list.
    """
    columns = zip(*cc)
    return [column for column in columns]

def ann(ax, xx, yy):
    """Annotate each ``(x, y)`` point on ``ax`` with its own ``y`` value.

    Points are formed by zipping ``xx`` with ``yy``, so the shorter of the two
    determines how many annotations are added.
    """
    for point in zip(xx, yy):
        _, label = point
        ax.annotate(label, point)
        
def mmy(params):
    """Return the ``(make, model, year)`` values of ``params`` as a tuple."""
    wanted = ('make', 'model', 'year')
    return tuple(params[key] for key in wanted)

In [81]:
# Query-string parameters for the inventory search; every value is a string.
# 'start' is the paging offset and is rewritten by do_scan while it pages.
qp2 = dict(
    geoRadius='500',
    geoZip='93012',
    compositeType='certified, Rent2Buy',
    model='Corolla',
    make='Toyota',
    normalBodyStyle='Sedan',
    normalExteriorColor='Black,Gray',
    odometer='10-200000',
    sortBy='odometer asc',
    trim='LE',
    year='2015-2018',
    start='0',
)

In [82]:
def do_scan(qp2):
    """Page through the inventory search and collect every listing.

    Starting at offset 0, each page is fetched and parsed with ``get_cars``
    until the number of listings seen reaches the total advertised on the
    first page.

    Args:
        qp2: query-parameter dict for the search. Its ``'start'`` entry is
            overwritten in place with the current paging offset, so the
            caller's dict is modified.

    Returns:
        A ``defaultdict(list)`` mapping model year to a list of
        ``(miles, price)`` tuples.
    """
    qp2['start'] = '0'
    url = "https://www.hertzcarsales.com/all-inventory/index.htm"
    pq = get_listing(url, qp2)
    nresults = get_num_results(pq)
    cars_listing = get_car_list(pq)
    got = 0
    results = collections.defaultdict(list)
    while True:
        page_count = len(cars_listing)
        got += page_count
        print("parsing cars 1-%d of %d"%(got, nresults))
        get_cars(cars_listing, results)
        if got >= nresults:
            break
        if page_count == 0:
            # An empty page cannot advance the offset; without this guard the
            # loop would request the same offset forever whenever the site
            # returns fewer listings than the advertised total.
            print("no listings returned at offset %d; stopping early" % got)
            break
        qp2['start'] = str(got)
        pq = get_listing(url, qp2)
        cars_listing = get_car_list(pq)
    return results


In [92]:
# Run the same search twice, changing only the radius around the zip code:
# once wide ('500') and once local ('30'). do_scan returns a fresh result
# mapping each time, so the two scans do not share data.
qp2.update(geoRadius='500')
big_data = do_scan(qp2)

qp2.update(geoRadius='30')
local_data = do_scan(qp2)

def sort_results(res):
    """Return the ``(key, value)`` pairs of ``res`` ordered by key, largest first.

    The mapping itself is left untouched; a new list of pairs is returned.
    """
    pairs = list(res.items())
    pairs.sort(key=lambda pair: pair[0], reverse=True)
    return pairs

# Re-order both scans by year, newest first. From here on each one is a list
# of (year, [(miles, price), ...]) pairs rather than a dict.
big_data = sort_results(big_data)
local_data = sort_results(local_data)

# Listings priced this many dollars or more away from the first-pass line are
# treated as outliers. Hand-picked from manual review of the data per 1/1/2019.
OUTLIER_TOLERANCE = 300

# One (year, price-vs-miles line) pair per model year in the wide scan.
fitfns = []
for year, cars in big_data:
    xx, yy = c2l(cars)
    xr = np.array(xx)
    yr = np.array(yy)

    # First pass: straight line through every listing of this year.
    fit = np.polyfit(xr, yr, 1)
    fitfn = np.poly1d(fit)

    # Second pass: remove outliers. Rather than statistical analysis, keep
    # only the listings whose price is within the tolerance of the first-pass
    # line, then fit again.
    diff = fitfn(xr) - yr
    idx = np.where(abs(diff) < OUTLIER_TOLERANCE)
    xr2 = xr[idx]
    yr2 = yr[idx]
    if len(xr2) >= 2:
        fit2 = np.polyfit(xr2, yr2, 1)
    else:
        # np.polyfit rejects empty input and a single point does not
        # determine a line, so keep the first-pass fit for this year.
        fit2 = fit
    fitfn2 = np.poly1d(fit2)
    fitfns.append((year, fitfn2))


# The plotting cell draws whatever `results` points at.
results = big_data

parsing cars 1-35 of 193
parsing cars 1-70 of 193
parsing cars 1-105 of 193
parsing cars 1-140 of 193
parsing cars 1-175 of 193
parsing cars 1-194 of 193
parsing cars 1-4 of 4


  app.launch_new_instance()


In [96]:
# Select which scan the plotting cell draws: it iterates `results` as
# (year, cars) pairs. Here that is the sorted wide-radius scan.
results = big_data

In [100]:
# X-axis window, in miles, shared by the fitted lines and the axis limits.
X_MIN, X_MAX = 10000, 45000
# Years whose fitted line is not drawn: 2016 had only 1 data point.
SKIP_FIT_YEARS = {2016}

# NameError guard so the cell runs correctly on first execution, when there
# is no figure from a previous run to dispose of.
try:
    fig.clear()
    plt.close(fig)
except NameError:
    pass

fig = plt.figure()
ax = fig.add_subplot(1,1,1)

# One scatter series per year, in the order `results` already has.
# allx/ally accumulate every point across all years.
allx,ally = [],[]
for year, cars in results:
    x, y = c2l(cars)
    ax.scatter(x,y, label=str(year))
    allx.extend(x)
    ally.extend(y)

# Every fitted line is sampled over the same mileage range, so build it once.
xr = np.arange(X_MIN, X_MAX)
for year, fn in fitfns:
    if year in SKIP_FIT_YEARS:
        continue
    yr = fn(xr)
    ax.plot(xr, yr, label="Fit %d"%year)

# Add "," separator and $ sign
xf = FuncFormatter(lambda x, p: format(int(x), ','))
yf = FuncFormatter(lambda y, p: "$%s"%format(int(y), ","))
ax.xaxis.set_major_formatter(xf)
ax.yaxis.set_major_formatter(yf)

ax.xaxis.set_label_text("Miles Driven")
ax.yaxis.set_label_text("Price (No haggle)")

ax.set_title("%s %s %s"%(mmy(qp2)))

# Yaxis label doesn't quite fit, so fudge it over a bit
b = ax.get_position()
shift = 0.02
ax.set_position([b.x0+shift, b.y0, b.width*(1-shift), b.height])
ax.set_xlim(X_MIN, X_MAX)

ax.grid()

# Assign the return value so the Legend object is not echoed as cell output.
_=ax.legend()

In [103]:
def miles2cost(year, mi):
    """Evaluate the fitted price line of ``year`` at ``mi`` miles.

    Args:
        year: model year to look up in the module-level ``fitfns`` list.
        mi: mileage passed to that year's fit function.

    Returns:
        Whatever the fit function returns for ``mi``. When ``fitfns`` holds
        several entries for the same year, the first one is used.

    Raises:
        ValueError: if no entry of ``fitfns`` has the requested year.
    """
    candidates = [fn for y, fn in fitfns if y == year]
    if not candidates:
        raise ValueError("Failed to find year %d" % year)
    return candidates[0](mi)

def deal(year, mi, cost):
    """Return the fitted price for ``year`` at ``mi`` miles minus ``cost``.

    A positive value means ``cost`` is below the fitted line.
    """
    return miles2cost(year, mi) - cost

In [104]:
# Difference between the 2018 and 2017 fitted prices at the same mileage
# (25,000 miles).
miles2cost(2018, 25000) - miles2cost(2017, 25000)

1031.0119670115364

In [102]:
# deal() is the fitted price minus the given cost, so a positive result means
# this 2017 listing (40,418 miles, cost 13088) sits below the fitted line.
# NOTE(review): the TypeError recorded under this cell appears to predate the
# current miles2cost, which returns a value (execution count 102 precedes
# 103) - re-run to refresh the output.
deal(2017, 40418, 13088)

TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'

In [75]:
# Rank every plotted listing by how far its price sits from a single fitted
# line, most-below-the-line first.
# NOTE(review): `fit` is whatever coefficient array an earlier cell last left
# in the kernel, and allx/ally come from the plotting cell - confirm that is
# the intended line before trusting the ranking.
all2x, all2y = np.array(allx), np.array(ally)
predicted = np.poly1d(fit)
ddeal = all2y - predicted(all2x)
ranked = sorted(zip(all2x, ddeal), key=lambda pair: pair[1])
for mi, cost in ranked:
    print(mi, cost)

28672 -874.0963775258606
30572 -716.1227139263992
22386 -696.7397719396577
25270 -656.9523794023698
29067 -641.2544843038668
31679 -624.0822688713451
26925 -619.3490040038923
27220 -594.8215141292385
35594 -498.57337782298055
29621 -495.1926897596022
28552 -484.07366154266856
32361 -467.37803804248506
32716 -437.86190615942905
30369 -433.00095272150065
34109 -422.0422675309819
34116 -421.4602592966676
32984 -415.57930518855574
34512 -388.53522204120054
33321 -387.5597659080213
34616 -379.8882425599659
35917 -371.71785501107297
33536 -369.68379871123943
33697 -356.29760932202225
19304 -352.989683104679
37574 -333.9481915456472
35226 -329.170382141192
36531 -320.6674184584044
37763 -318.23396921917447
35435 -311.7932791452513
36666 -309.44297393949455
37987 -299.60970572113365
35598 -298.2408016890877
36850 -294.1444717803897
34496 -289.86552657677385
36977 -283.5851795292674
36992 -282.3380190271673
34756 -268.2480778736899
39622 -263.6692109921223
36033 -262.07314712815787
34938 -253.1