In [1]:
import pyquery
import requests
import urllib
Q = pyquery.PyQuery
import collections
import numpy as np

%matplotlib
from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter, FuncFormatter

Using matplotlib backend: TkAgg


In [2]:
# Response bodies that were already downloaded, keyed by the fully prepared
# request URL. get_url() reads from and writes to this mapping.
cache = dict()

In [3]:
def get_cars(car_list, results):
    """Parse every listing under ``car_list`` and record it in ``results``.

    For each child of ``car_list`` the odometer reading, model year and asking
    price are pulled out of the listing markup, and the pair
    ``(miles, price)`` is appended to ``results[year]``.

    Each field is located through a CSS class that appeared to be unique to it
    in the Hertz listing HTML; a few hard-coded child indices are used instead
    of building another query object to descend one more level.

    car_list: element whose ``getchildren()`` yields the individual listings.
    results:  mapping of year -> list of ``(miles, price)``; it has to supply
              a list for years it has not seen yet (e.g. a
              ``collections.defaultdict(list)``).
    """
    for node in car_list.getchildren():
        find = Q(node)

        # Odometer text must consist of exactly two whitespace-separated
        # tokens (number, unit); only the number is used.
        odo_text = find(".gv-description")[0][0][1].text
        miles_text, _unit = odo_text.split()
        miles = int(miles_text.replace(",", ""))

        # The first word of the listing title is the model year.
        title_text = find(".inventory-title.single-line-text")[0][0].text
        model_year = int(title_text.split()[0])

        # Asking price: drop quotes, surrounding blanks, "$" and the
        # thousands separators before converting.
        price_text = find("span.askingPrice.finalPrice")[0][1].text
        price_text = price_text.replace("\"", "").strip()
        price_text = price_text.replace("$", "").replace(",", "")
        asking = int(price_text)

        results[model_year].append((miles, asking))
            
def get_url(url, params, timeout=30):
    """Return the raw response body for ``url`` queried with ``params``.

    The fully prepared URL (base URL plus encoded query string) is the cache
    key, so an identical query is downloaded only once per kernel session.

    url:     base URL to request.
    params:  mapping of query-string parameters.
    timeout: seconds to wait for the server before giving up; without it a
             stalled connection would block the notebook indefinitely.

    Returns the response body as returned by ``requests`` (``.content``).
    Raises ``requests`` exceptions on timeout, connection failure or an HTTP
    error status.
    """
    full_url = requests.Request("GET", url, params=params).prepare().url
    if full_url in cache:
        return cache[full_url]

    response = requests.get(full_url, timeout=timeout)
    # Fail loudly on 4xx/5xx: otherwise the error page would be cached and
    # silently re-served for every later call with the same query.
    response.raise_for_status()
    content = response.content
    cache[full_url] = content
    return content
            
def get_listing(url, params):
    """Download (or reuse from the cache) the page for ``url``/``params`` and
    wrap its body in a ``Q`` query object."""
    return Q(get_url(url, params=params))

def get_car_list(q):
    """Return the first element matching the ``.gv-inventory-list`` selector.

    Raises IndexError when the selector matches nothing.
    """
    matches = q(".gv-inventory-list")
    return matches[0]

def get_num_results(q):
    """Return the total number of search results reported by the page.

    The count is read from the text of the first child of the first
    ``.total-count`` element. Thousands separators are removed before the
    conversion so that a count such as "1,234" parses as well, matching how
    mileage and price are handled in get_cars.
    """
    count_text = q(".total-count")[0][0].text
    return int(count_text.replace(",", ""))

def c2l(cc):
    """Transpose a sequence of tuples into a list of per-column tuples.

    ``[(x1, y1), (x2, y2)]`` becomes ``[(x1, x2), (y1, y2)]``; an empty input
    gives an empty list.
    """
    columns = zip(*cc)
    return [column for column in columns]

def ann(ax, xx, yy):
    """Annotate each point ``(x, y)`` on ``ax``, using its y value as the label."""
    for point in zip(xx, yy):
        ax.annotate(point[1], point)
        
def mmy(params):
    """Return the ``(make, model, year)`` values of a query-parameter mapping."""
    return tuple(params[key] for key in ('make', 'model', 'year'))

In [4]:
# Query-string parameters for the inventory search; all values are strings.
# 'start' is the result offset, which hertz_scan overrides while paging.
qp2 = dict(
    geoRadius='500',
    geoZip='91360',
    compositeType='certified, Rent2Buy',
    model='Corolla',
    make='Toyota',
    normalBodyStyle='Sedan',
    normalExteriorColor='Black,Gray',
    odometer='10-200000',
    sortBy='odometer asc',
    trim='LE',
    year='2015-2018',
    start='0',
)

In [5]:
def hertz_scan(params):
    """Page through the inventory search and collect every listing.

    params: query-parameter mapping; it is copied, so the caller's dict is
            not modified. Its 'start' entry is ignored and paging always
            begins at offset 0.

    Returns a ``collections.defaultdict(list)`` filled by get_cars, i.e.
    year -> list of ``(miles, price)``.

    Paging stops once the number of listings seen reaches the total reported
    by the first page. It also stops when a page comes back empty before that
    total is reached; without that guard the same offset would be requested
    forever.
    """
    query = params.copy()
    query['start'] = '0'
    url = "https://www.hertzcarsales.com/all-inventory/index.htm"
    pq = get_listing(url, query)
    nresults = get_num_results(pq)
    results = collections.defaultdict(list)
    got = 0
    while True:
        cars_listing = get_car_list(pq)
        page_size = len(cars_listing)
        got += page_size
        print("parsing cars 1-%d of %d"%(got, nresults))
        get_cars(cars_listing, results)
        if got >= nresults:
            break
        if page_size == 0:
            # The site returned fewer listings than it announced; asking for
            # the same offset again cannot make progress.
            print("stopping early: empty page after %d of %d cars"%(got, nresults))
            break
        query['start'] = str(got)
        pq = get_listing(url, query)
    return results

def make_fits(data, outlier=300):
    """Fit a straight line price = m * miles + b for every year in ``data``.

    data:    iterable of ``(year, cars)`` where ``cars`` is a sequence of
             ``(miles, price)`` pairs.
    outlier: points whose price differs from the first-pass fit line by this
             amount or more are dropped before the second fit.

    Returns a list of ``(year, fit_function, coefficients)`` tuples, where
    ``fit_function`` is an ``np.poly1d`` and ``coefficients`` is the
    ``[m, b]`` array from ``np.polyfit``. Years with fewer than two cars are
    skipped with a message.
    """
    fitfns = []
    for year, cars in data:
        if len(cars) < 2:
            print("skipping year %s: insufficient data"%year)
            continue
        xx, yy = zip(*cars)
        xr = np.array(xx)
        yr = np.array(yy)
        fit = np.polyfit(xr, yr, 1)
        fitfn = np.poly1d(fit)

        # Second pass: rather than a statistical test, keep only the points
        # that lie within `outlier` dollars of the first-pass line. The
        # default of 300 comes from a manual review of the data on 1/1/2019.
        keep = np.abs(fitfn(xr) - yr) < outlier
        if np.count_nonzero(keep) >= 2:
            fit = np.polyfit(xr[keep], yr[keep], 1)
            fitfn = np.poly1d(fit)
        # Otherwise too few points survive the filter to define a line
        # (polyfit would raise on an empty selection), so the first-pass fit
        # is kept.
        fitfns.append((year, fitfn, fit))
    return fitfns

def sort_results(res):
    """Return the ``(key, value)`` pairs of ``res`` as a list ordered by key,
    largest key first."""
    def by_key(entry):
        return entry[0]

    ordered = list(res.items())
    ordered.sort(key=by_key, reverse=True)
    return ordered

In [13]:
# Scan the wide radius first (used for the fit lines), then the local radius
# (used for the scatter points). Both scans reuse qp2 and differ only in
# 'geoRadius'; qp2 is left at the local radius afterwards.
qp2['geoRadius'] = '500'
big_data = hertz_scan(qp2)
qp2['geoRadius'] = '30'
local_data = hertz_scan(qp2)
big_data = sort_results(big_data)
local_data = sort_results(local_data)
fitfns = make_fits(big_data, 300)

# NameError trap so it runs correctly on first execution
try:
    fig.clear()
    plt.close(fig)
except NameError:
    pass

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
allx,ally = [],[]
for year, cars in local_data: # sort by year
    x, y = c2l(cars)
    ax.scatter(x,y, label=str(year))
    allx.extend(x)
    ally.extend(y)

# make_fits returns (year, fit function, coefficients) triples; only the
# first two are needed to draw the line.
for year, fn, _fit in fitfns:
    xr = np.arange(10000, 45000)
    yr = fn(xr)
    ax.plot(xr, yr, label="Fit %d"%year)

# Add "," separator and $ sign
xf = FuncFormatter(lambda x, p: format(int(x), ','))
yf = FuncFormatter(lambda y, p: "$%s"%format(int(y), ","))
ax.xaxis.set_major_formatter(xf)
ax.yaxis.set_major_formatter(yf)

ax.xaxis.set_label_text("Miles Driven")
ax.yaxis.set_label_text("Price (No haggle)")

ax.set_title("%s %s %s"%(mmy(qp2)))

# Yaxis label doesn't quite fit, so fudge it over a bit
b = ax.get_position()
shift = 0.02
ax.set_position([b.x0+shift, b.y0, b.width*(1-shift), b.height])
ax.set_xlim(10000, 45000)

ax.grid()

# assign just so it doesn't get stuck in global Jupyter memory
_=ax.legend()

NameError: name 'do_scan' is not defined

In [6]:
def miles2cost(year, mi):
    """Return the price predicted by the fit line of ``year`` at ``mi`` miles.

    Looks the year up in the notebook-level ``fitfns`` list, whose entries are
    the ``(year, fit function, coefficients)`` triples produced by make_fits.

    Raises ValueError when no fit exists for ``year``.
    """
    for y, fn, _fit in fitfns:
        if y == year:
            break
    else:
        raise ValueError("Failed to find year %d" % year)
    return fn(mi)

def deal(year, mi, cost):
    """Return how far ``cost`` lies below the price predicted by miles2cost
    for ``year`` at ``mi`` miles (a negative result means ``cost`` is above
    the prediction)."""
    return miles2cost(year, mi) - cost

In [13]:
# Query parameters for this analysis. This rebinds the notebook-level qp2;
# the 'geoZip', 'geoRadius' and 'normalExteriorColor' entries are overwritten
# again further down before the scans run.
qp2 = {
     'geoRadius': '500',
     'geoZip': '93012',
     'compositeType': 'certified, Rent2Buy',
     'model': 'Corolla',
     'make': 'Toyota',
     'normalBodyStyle': 'Sedan',
     'normalExteriorColor': 'Black,Gray',
     'odometer': '10-200000',
     'sortBy': 'odometer asc',
     'trim': 'LE',
     'year': '2017-2018',
     'start': '0'
}

# Upper end of the plotted mileage range; read as a global by doplot.
maxx = 45000

# Search radius for the reference scan and for the local scan, and the zip
# code both scans are centred on.
ref_radius = 500
local_radius = 30
geozip = '93012'

# Reference scan: wide radius, colors as listed in qp2 above.
qp2['geoZip'] = geozip
qp2['geoRadius'] = str(ref_radius)
big_data = hertz_scan(qp2)
# Local scan: smaller radius and gray cars only. qp2 keeps these values after
# this point, so the order of these statements matters.
qp2['geoRadius'] = str(local_radius)
qp2['normalExteriorColor'] = 'Gray'
local_data = hertz_scan(qp2)



# Both result sets become lists of (year, cars) with the newest year first.
big_data = sort_results(big_data)
local_data = sort_results(local_data)
# The fit lines are computed from the wide-radius data only.
fitfns = make_fits(big_data, 300)

# Discard the figure left over from an earlier run of this cell; on the first
# run fig does not exist yet, hence the NameError guard.
try:
    fig.clear()
    plt.close(fig)
except NameError: 
    pass

def doplot(ax, data, axtitle="", skip_years=(2016,)):
    """Draw one scatter series per year plus the fitted price lines on ``ax``.

    ax:         matplotlib axes to draw on.
    data:       iterable of ``(year, cars)`` where ``cars`` is a sequence of
                ``(miles, price)`` pairs.
    axtitle:    title for the axes.
    skip_years: years whose fit line is not drawn. The default keeps the
                previous behaviour of hiding 2016, which had only a single
                data point when this was written.

    Reads the notebook-level ``fitfns`` (triples from make_fits) and ``maxx``
    (upper end of the mileage range).
    """
    for year, cars in data: # sort by year
        x, y = c2l(cars)
        # picker=5 makes the points clickable for the 'pick_event' handler.
        ax.scatter(x, y, label=str(year), picker=5)

    # Axis decoration is done once per axes, after the scatter series. Doing
    # it per year would apply the position shift below once for every year.

    # Add "," separator and $ sign
    xf = FuncFormatter(lambda x, p: format(int(x), ','))
    yf = FuncFormatter(lambda y, p: "$%s"%format(int(y), ","))
    ax.xaxis.set_major_formatter(xf)
    ax.yaxis.set_major_formatter(yf)

    ax.xaxis.set_label_text("Miles Driven")
    ax.yaxis.set_label_text("Price (No haggle)")

    ax.set_title(axtitle)

    # Yaxis label doesn't quite fit, so fudge it over a bit
    b = ax.get_position()
    xshift = 0.02
    ax.set_position([b.x0+xshift, b.y0, b.width*(1-xshift), b.height])
    ax.set_xlim(10000, maxx)

    # The same mileage grid is used for every fit line.
    xr = np.arange(10000, maxx)
    for year, fn, fit in fitfns:
        if year in skip_years:
            continue
        ax.plot(xr, fn(xr), label="Fit %d"%year)
        
# Two stacked subplots: local listings on top, the wide reference scan below.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(2,1,1)
ax2 = fig.add_subplot(2,1,2)
# Titles are built from the radii that were actually scanned instead of
# repeating the numbers as literals.
doplot(ax, local_data, "Within %s miles of zip=%s"%(local_radius, qp2['geoZip']))
doplot(ax2, big_data, "Within %s miles of zip=%s"%(ref_radius, qp2['geoZip']))

# Nudge the two axes apart vertically and shrink them slightly.
b = ax.get_position()
b2 = ax2.get_position()
yshift = 0.01
ax.set_position([b.x0, b.y0+yshift, b.width, b.height*(1-yshift)])
ax2.set_position([b2.x0, b2.y0-yshift, b2.width, b2.height*(1-yshift)])

fig.suptitle("%s %s %s"%(mmy(qp2)))

# Print the equation of every fit line, one row per year, in figure
# coordinates. Separate names are used for the coefficients so that the
# bounding box `b` above is not overwritten.
for i, (year, fn, fit) in enumerate(fitfns):
    slope, intercept = fit
    fig.text(0.7, 0.95-0.02*i, "%s: y = %.4f*miles + %.0f"%(year, slope, intercept))

def onpick(e):
    """Pick-event handler: print the x and y offsets of the first picked
    point of the artist that triggered the event."""
    offsets = e.artist.get_offsets()
    first_hit = e.ind[0]
    px, py = offsets[first_hit]
    print(px, py)
    
def mouseover(e):
    """Motion-event handler: show the cursor position and, for every fit line,
    the predicted price at that mileage.

    The text goes into the notebook-level ``mtext`` artist. It is cleared
    while the cursor is outside the axes. The value in parentheses on each
    line is the cursor price minus the predicted price.
    """
    if not e.inaxes:
        txt = ""
    else:
        y1 = e.ydata
        lines = ["mouse: {:,.0f}miles, ${:,.0f}".format(e.xdata, y1)]
        for year, fn, _ in fitfns:
            y2 = fn(e.xdata)
            lines.append("{} @ {:,.0f} miles: ${:,.0f} ({:,.0f})".format(year, e.xdata, y2, (y1-y2)))
        txt = "\n".join(lines)

    mtext.set_text(txt)
    # Motion events arrive in rapid succession, so only schedule a redraw
    # rather than forcing a full synchronous one for every event.
    fig.canvas.draw_idle()
    
# Text artist (figure coordinates) that mouseover() fills with the hover
# read-out; it starts out empty.
mtext = fig.text(0.02, 0.90, "")

# Register the handlers for pick events and mouse-motion events on the canvas.
fig.canvas.mpl_connect('pick_event', onpick)
fig.canvas.mpl_connect('motion_notify_event', mouseover)
    
    
ax.grid()
ax2.grid()
# Only the upper axes (ax) gets a legend. The result is bound to _ so the
# legend object is not echoed as the cell's output.
_=ax.legend()

parsing cars 1-35 of 192
parsing cars 1-70 of 192
parsing cars 1-105 of 192
parsing cars 1-140 of 192
parsing cars 1-175 of 192
parsing cars 1-192 of 192
parsing cars 1-4 of 4
20347.0 14100.0
21045.0 14100.0
21489.0 14100.0
20347.0 14100.0
17186.0 14300.0


from types import MethodType
def pd(o):
    """Debug helper: print ``name: value`` for every attribute of ``o`` whose
    name does not start with a double underscore. Bound methods are shown as
    ``<method>`` instead of their repr."""
    visible = [name for name in dir(o) if not name.startswith("__")]
    for name in visible:
        value = getattr(o, name)
        shown = "<method>" if isinstance(value, MethodType) else value
        print("%s: %s" % (name, shown))