In [2]:
# Aggregate vehicle supply and rental demand into hourly counts for a single hex,
# then merge the two hourly series on their timestamp.

In [3]:
import numpy as np
import pandas as pd
from bokeh.util.hex import cartesian_to_axial

In [35]:
# settings
# Per-region bounding box and IANA timezone name, keyed by region name.
# NOTE(review): the x/y bounds look like Web Mercator (EPSG:3857) coordinates — confirm.
regions_info = {
    'oakland': dict(
        x_min=-13618976.4221,
        x_max=-13605638.1607,
        y_min=4549035.0828,
        y_max=4564284.2700,
        timezone='America/Los_Angeles'),
    'madrid': dict(
        x_min=-416448.0394,
        x_max=-406912.5201,
        y_min=4921025.4356,
        y_max=4931545.0816,
        timezone='Europe/Madrid')
}
REGION = 'oakland'
# Look the region up through REGION so that changing REGION switches both together
# (previously this was hard-coded to 'oakland').
selected_region = regions_info[REGION]
# Passed as `size=` to cartesian_to_axial when binning points into hexes.
HEX_SIZE = 10000
# Axial (q, r) coordinates of the single hex that the analysis is restricted to.
Q = -634
R = -304

In [5]:
def convert_datetime_columns(df, columns, timezone=None):
    """Convert datetime columns of ``df`` to a local timezone, in place.

    Timezone-naive columns are interpreted as UTC before conversion. Columns
    that are already timezone-aware are converted directly, so calling this
    twice on the same frame (e.g. when a cell is re-run) does not raise.

    Args:
        df: DataFrame whose listed columns are overwritten in place.
        columns: iterable of names of datetime-typed columns to convert.
        timezone: target timezone name. Defaults to the timezone configured
            for ``REGION`` in ``regions_info``.

    Returns:
        None. ``df`` is modified in place.
    """
    if timezone is None:
        timezone = regions_info[REGION]['timezone']
    for col in columns:
        values = df[col]
        # tz_localize raises on tz-aware data, so only localize naive columns.
        if values.dt.tz is None:
            values = values.dt.tz_localize('UTC')
        df[col] = values.dt.tz_convert(timezone)

In [7]:
def convert_to_mercator(lngs, lats):
    """Project paired longitude/latitude degrees to mercator x/y coordinates.

    Args:
        lngs: iterable of longitudes in degrees.
        lats: iterable of latitudes in degrees, paired positionally with ``lngs``.

    Returns:
        Tuple ``(xs, ys)`` of two lists holding the projected coordinates.
        Inputs are paired with ``zip``, so the result is as long as the
        shorter of the two inputs.
    """
    from pyproj import Proj  # put here for clarity

    to_mercator = Proj(init='epsg:3857')
    projected = [to_mercator(lng, lat) for lng, lat in zip(lngs, lats)]
    xs = [x for x, _ in projected]
    ys = [y for _, y in projected]
    return xs, ys

In [55]:
# Load vehicle availability records and count them per hourly bucket inside the
# selected hex (Q, R).
VEHICLE_DATAFILE = 'vehicle_availability_data_darwin_20190715_20190802_with_dow_hour_mask.csv'
VEHICLE_DT_COLS = ['available_at', 'unavailable_at']

supply_df = pd.read_csv(
    VEHICLE_DATAFILE,
    parse_dates=VEHICLE_DT_COLS,
    infer_datetime_format=True
).dropna()  # drops every row that has a missing value in any column

# Bucket each record by `available_at` rounded to the NEAREST hour (not floored).
# NOTE(review): timestamps are used as parsed; no timezone conversion is applied here.
supply_df = supply_df.assign(Date=supply_df.available_at.dt.round('H'))

supply_df['start_merc_lng'], supply_df['start_merc_lat'] = convert_to_mercator(
    supply_df['lng'], supply_df['lat'])

# cartesian_to_axial is imported once in the notebook's import cell.
supply_df['q'], supply_df['r'] = cartesian_to_axial(
    supply_df['start_merc_lng'],
    supply_df['start_merc_lat'],
    size=HEX_SIZE,
    orientation='pointytop'
)

# Keep only the records that fall in the hex under study.
supply_df = supply_df[(supply_df['q'] == Q) & (supply_df['r'] == R)]

# One row per hourly bucket: Date, count, hour.
summed_df = supply_df.groupby(['Date']).size().reset_index(name='count')
summed_df['hour'] = summed_df['Date'].dt.hour
# index=False: the RangeIndex carries no information and would otherwise be
# written as an unnamed leading column.
summed_df.to_csv('supply_data_darwin_20190715_20190802.csv', index=False)
summed_df.head()

Unnamed: 0,Date,count,hour
0,2019-07-15 00:00:00,32,0
1,2019-07-15 01:00:00,51,1
2,2019-07-15 02:00:00,45,2
3,2019-07-15 03:00:00,38,3
4,2019-07-15 04:00:00,32,4


In [56]:
# import rental count data
# Load rental records, bucket them by reservation hour, and keep only those that
# start inside the selected hex (Q, R).

rental_datafile = 'rental_data_darwin_20190715_20190802_datalake.csv'
rental_df = pd.read_csv(
    rental_datafile,
    parse_dates=['reserved_at', 'booked_at', 'ended_at'],
    infer_datetime_format=True
)

# Bucket each rental by `reserved_at` rounded to the NEAREST hour (not floored).
rental_df = rental_df.assign(Date=rental_df.reserved_at.dt.round('H'))

rental_df['start_merc_lng'], rental_df['start_merc_lat'] = convert_to_mercator(
    rental_df['start_location_lng'], rental_df['start_location_lat'])

# cartesian_to_axial is already imported earlier in the notebook; the redundant
# re-import that used to sit here has been removed.
rental_df['q'], rental_df['r'] = cartesian_to_axial(
    rental_df['start_merc_lng'],
    rental_df['start_merc_lat'],
    size=HEX_SIZE,
    orientation='pointytop'
)

# Keep only the rentals whose start location falls in the hex under study.
rental_df = rental_df[(rental_df['q'] == Q) & (rental_df['r'] == R)]

In [57]:
# Approximate the hex center as the midpoint of the bounding box of the rental
# start locations that fell inside it: (max + min) / 2 on each axis.
start_lats = rental_df['start_location_lat']
start_lngs = rental_df['start_location_lng']

lat_centerish = (start_lats.max() + start_lats.min()) / 2
lng_centerish = (start_lngs.max() + start_lngs.min()) / 2

print(f'lat_centerish: {lat_centerish}')
print(f'lng_centerish: {lng_centerish}')

lat_centerish: 37.8601415
lng_centerish: -122.27936


In [58]:
# Collapse the hex's rentals into one row per hourly bucket: Date, count, hour.
# NOTE: this rebinds `rental_df` from the per-rental rows to the aggregate, so any
# cell that needs the raw rental columns has to run before this one.
rental_df = (
    rental_df
    .groupby(['Date'])
    .size()
    .reset_index(name='count')
    .assign(hour=lambda hourly: hourly['Date'].dt.hour)
)
rental_df.head()

Unnamed: 0,Date,count,hour
0,2019-01-05 19:00:00,1,19
1,2019-02-17 01:00:00,1,1
2,2019-02-17 16:00:00,1,16
3,2019-03-09 16:00:00,1,16
4,2019-03-10 22:00:00,1,22


In [59]:
# Join hourly demand (rentals) with hourly supply (vehicle availability) on Date.
# NOTE(review): merge defaults to an inner join, so an hour that is missing from
# either side (e.g. no rentals in that hour) is dropped rather than kept as 0.
merged_df = (
    rental_df
    .merge(summed_df, on='Date', suffixes=('_demand', '_supply'))
    .drop(columns=['hour_demand'])  # same value as hour_supply; both derive from Date
    # Rename by name instead of overwriting `.columns` positionally, so a change
    # in column order upstream cannot silently mislabel the counts.
    .rename(columns={
        'count_demand': 'Demand_Count',
        'count_supply': 'Supply_Count',
        'hour_supply': 'hour',
    })
)
# index=False: avoid writing the meaningless RangeIndex as an unnamed column.
merged_df.to_csv(f'supply_demand_counts_20190715_20190802_hex_({Q},{R}).csv', index=False)