### Data Filtering and Calculation Libraries

In [1]:
import pandas as pd
import calendar
from settings import region
from bokeh.util.hex import cartesian_to_axial

pd.options.mode.chained_assignment = None

### Input Files
The input files are intermediate outputs produced by the `hexbin_data_analysis.ipynb` notebook.

In [2]:
# Input CSV files; bare file names, so they are looked up relative to the working directory.
VEHICLE_DATAFILE = 'vehicle_update_data_darwin_20190501_20190606_with_dow_hour_mask.csv'  # read into supply_df
RENTAL_DATAFILE = 'rental_data_prod_db_darwin_20190501_20190606_with_dow_hour_mask.csv'  # read into rental_df
APP_EVENTS_DATAFILE = 'nearest_vehicle_app_data_darwin_20190501_20190606_with_threshold_mask.csv'  # read into app_events_df

In [3]:
# constants
# Columns handed to `pd.read_csv(parse_dates=...)` for each of the input files.
VEHICLE_DT_COLS = ['available_at', 'unavailable_at']
RENTAL_DT_COLS = ['reserved_at', 'booked_at', 'ended_at']
APP_EVENTS_DT_COLS = ['event_datetime']
# Columns the app events are grouped by; groups are looked up as (day name, hour) pairs.
APP_EVENTS_MASKS = ['event_dow', 'event_hour']
# NOTE(review): presumably the length of one analysis window in minutes (180 min = 3 h)
# -- confirm against how `update` sizes its hour range.
PERIOD_MINUTES = 180

### Analysis Variables
Change these as appropriate.

- `PERIODS`: the number of weeks of data covered by the input files.
- `HEX_SIZE`: the size of each hexagonal bin, in meters.
- `selected_region`: the region to analyze. Only `oakland` and `madrid` are available right now.

In [4]:
# variables for change, based on input data
# NOTE(review): PERIODS is not referenced by any code visible in this notebook -- confirm it is still needed.
PERIODS = 5
# Passed as `size` to both `cartesian_to_axial` (binning) and `hex_tile` (drawing).
HEX_SIZE = 500 # meters (some distortion due to mercator proj)
# Mapping with 'x_min', 'x_max', 'y_min', 'y_max' keys; used as the spatial filter for
# the loaded data and as the starting view of the map.
selected_region = region['oakland']

In [5]:
def convert_to_mercator(lngs, lats):
    """Project paired longitude/latitude degrees using the EPSG:3857 projection.

    Parameters
    ----------
    lngs, lats : iterable
        Longitudes and latitudes in degrees. They are consumed pairwise with
        ``zip``, so surplus items in the longer iterable are ignored.

    Returns
    -------
    tuple of (list, list)
        The projected x values and the projected y values, in input order.
    """
    from pyproj import Proj  # put here for clarity

    # NOTE(review): the `init=` form is reported as deprecated by newer pyproj
    # releases -- confirm against the installed version before changing it.
    to_mercator = Proj(init='epsg:3857')

    # project one coordinate pair at a time, then split the pairs into columns
    projected = [to_mercator(lng, lat) for lng, lat in zip(lngs, lats)]
    xs = [point[0] for point in projected]
    ys = [point[1] for point in projected]
    return xs, ys

### function to collect data for idle vehicle minutes

In [6]:
# input the supply df, get back the merged idle minutes df
idle_minutes_df = pd.DataFrame()

def get_idle_supply_minutes(df, dow, hour_start, hour_end):
    """Aggregate idle vehicle minutes per hex bin for one analysis window.

    The result is stored in the module-level ``idle_minutes_df`` (nothing is
    returned). It has one row per (q, r) hex with the count/mean/median/sum/
    min/max of the idle minutes inside the window, followed by the
    mean/median/sum/min/max of ``idle_duration_minutes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Supply data. Must contain one column per (day, hour) slot named like
        ``"('Tuesday', 15)"``, plus ``idle_duration_minutes``, ``merc_lng``
        and ``merc_lat``. An ``analysis_hours`` column is written onto this
        frame in place.
    dow : str
        Day name of the first hour of the window; must be one of
        ``calendar.day_name``.
    hour_start, hour_end : int
        Half-open hour range ``[hour_start, hour_end)``. Hours past 23 roll
        over into the following day (e.g. Tuesday 24 is Wednesday 0).

    Raises
    ------
    ValueError
        If ``dow`` is not one of ``calendar.day_name``.
    KeyError
        If a required slot column is missing from ``df``.
    """
    global idle_minutes_df

    # Map every hour of the window onto a real (day, hour) slot. Without the
    # wrap-around, a window such as 22 -> 25 would ask for the non-existent
    # column "('Tuesday', 24)".
    day_names = list(calendar.day_name)
    first_day = day_names.index(dow)
    hour_columns = [
        "('{0}', {1})".format(day_names[(first_day + hour // 24) % 7], hour % 24)
        for hour in range(hour_start, hour_end)
    ]

    # idle minutes inside the window; kept on the caller's frame as before
    df['analysis_hours'] = df[hour_columns].sum(axis=1)

    # only rows that were idle during the window; copy so that adding the bin
    # columns below never touches (or warns about) the source frame
    supply_figure_df = df.loc[df['analysis_hours'] != 0].copy()

    # ===== BIN THE SUPPLY =====
    supply_figure_df['q'], supply_figure_df['r'] = cartesian_to_axial(
        supply_figure_df['merc_lng'],
        supply_figure_df['merc_lat'],
        size=HEX_SIZE,
        orientation='pointytop'
    )

    # ===== GROUP THE SUPPLY DATA FOR AGGREGATION =====
    binned = supply_figure_df.groupby(['q', 'r'])

    # ===== AGGREGATE THE TOTAL IDLE MINUTES =====
    total_idle_minutes_df = binned['idle_duration_minutes'].agg(
        ['mean', 'median', 'sum', 'min', 'max']).reset_index()
    total_idle_minutes_df.columns = ['q', 'r',
                                     'total_idle_mins_mean', 'total_idle_mins_median',
                                     'total_idle_mins_sum', 'total_idle_mins_min',
                                     'total_idle_mins_max']

    # ===== AGGREGATE THE BLOCK IDLE MINUTES =====
    block_idle_minutes_df = binned['analysis_hours'].agg(
        ['count', 'mean', 'median', 'sum', 'min', 'max']).reset_index()
    block_idle_minutes_df.columns = ['q', 'r', 'idle_events_count', 'idle_minutes_per_block_mean',
                                     'idle_minutes_per_block_median', 'idle_minutes_per_block_sum',
                                     'idle_minutes_per_block_min', 'idle_minutes_per_block_max']

    # join on the hex keys rather than on row position
    idle_minutes_df = block_idle_minutes_df.merge(total_idle_minutes_df, on=['q', 'r'])

### collect rental stats

In [7]:
# input rental df and get back aggregate data
rental_analysis_df_counts = pd.DataFrame()
revenue_aggregates_df = pd.DataFrame()

def get_rental_stats(df_groupby, dow, hour_start, hour_end):
    """Count rentals and aggregate revenue per hex bin for one analysis window.

    Results are stored in two module-level frames (nothing is returned):

    * ``rental_analysis_df_counts`` -- columns ``q``, ``r``, ``rental_count``
    * ``revenue_aggregates_df`` -- columns ``q``, ``r`` and the
      mean/median/sum/min/max of ``total_to_charge``

    Parameters
    ----------
    df_groupby : pandas GroupBy
        Rentals grouped by (day name, hour). The grouped rows must contain
        ``start_merc_lng``, ``start_merc_lat``, ``reserved_at`` and
        ``total_to_charge``.
    dow : str
        Day name of the first hour of the window; must be one of
        ``calendar.day_name``.
    hour_start, hour_end : int
        Half-open hour range ``[hour_start, hour_end)``. Hours past 23 roll
        over into the following day.

    Raises
    ------
    ValueError
        If ``dow`` is not one of ``calendar.day_name``.
    """
    global rental_analysis_df_counts
    global revenue_aggregates_df

    count_columns = ['q', 'r', 'rental_count']
    revenue_columns = ['q', 'r', 'revenue_per_block_per_rental_mean',
                       'revenue_per_block_median', 'revenue_per_block_sum',
                       'revenue_per_block_min', 'revenue_per_block_max']

    # Map every hour of the window onto a real (day, hour) slot, rolling over
    # into the next day once the hour passes 23.
    day_names = list(calendar.day_name)
    first_day = day_names.index(dow)
    slots = [(day_names[(first_day + hour // 24) % 7], hour % 24)
             for hour in range(hour_start, hour_end)]

    # get_group raises KeyError for a slot without any rentals, so only fetch
    # the slots that actually exist in the grouping
    populated = df_groupby.groups
    frames = [df_groupby.get_group(slot) for slot in slots if slot in populated]

    if not frames:
        # no rentals at all in this window: publish empty, correctly typed
        # frames so the later merge on (q, r) still works
        key_dtypes = {'q': 'int64', 'r': 'int64'}
        rental_analysis_df_counts = pd.DataFrame(
            columns=count_columns, dtype='float64').astype(key_dtypes)
        revenue_aggregates_df = pd.DataFrame(
            columns=revenue_columns, dtype='float64').astype(key_dtypes)
        return

    rental_figure_df = pd.concat(frames)

    # ===== BIN THE RENTALS =====
    rental_figure_df['q'], rental_figure_df['r'] = cartesian_to_axial(
        rental_figure_df['start_merc_lng'],
        rental_figure_df['start_merc_lat'],
        size=HEX_SIZE,
        orientation='pointytop'
    )
    binned = rental_figure_df.groupby(['q', 'r'])

    # ===== COUNT THE BINNED RENTALS =====
    rental_analysis_df_counts = binned['reserved_at'].count().reset_index(name='rental_count')

    # ===== GET AGGREGATE REVENUE FIGURES =====
    revenue_aggregates_df = binned['total_to_charge'].agg(
        ['mean', 'median', 'sum', 'min', 'max']).reset_index()
    revenue_aggregates_df.columns = revenue_columns

### collect app event stats

In [8]:
in_range_app_events = pd.DataFrame()
out_of_range_app_events = pd.DataFrame()

def get_app_events_stats(df_groupby, dow, hour_start, hour_end):
    """Count app events per hex bin for one analysis window.

    Events are split on the ``vehicle_nearby`` flag and counted per (q, r)
    hex. Results are stored in two module-level frames (nothing is returned):

    * ``in_range_app_events`` -- columns ``q``, ``r``,
      ``in_range_app_events_count`` (``vehicle_nearby`` is True)
    * ``out_of_range_app_events`` -- columns ``q``, ``r``,
      ``out_of_range_app_events_count`` (``vehicle_nearby`` is False)

    Parameters
    ----------
    df_groupby : pandas GroupBy
        App events grouped by (day name, hour). The grouped rows must contain
        ``merc_lng``, ``merc_lat``, ``vehicle_nearby`` and ``event_datetime``.
    dow : str
        Day name of the first hour of the window; must be one of
        ``calendar.day_name``.
    hour_start, hour_end : int
        Half-open hour range ``[hour_start, hour_end)``. Hours past 23 roll
        over into the following day.

    Raises
    ------
    ValueError
        If ``dow`` is not one of ``calendar.day_name``.
    """
    global in_range_app_events
    global out_of_range_app_events

    # Map every hour of the window onto a real (day, hour) slot, rolling over
    # into the next day once the hour passes 23.
    day_names = list(calendar.day_name)
    first_day = day_names.index(dow)
    slots = [(day_names[(first_day + hour // 24) % 7], hour % 24)
             for hour in range(hour_start, hour_end)]

    # get_group raises KeyError for a slot without any events, so only fetch
    # the slots that actually exist in the grouping
    populated = df_groupby.groups
    frames = [df_groupby.get_group(slot) for slot in slots if slot in populated]

    if not frames:
        # no events at all in this window: publish empty, correctly typed
        # frames so the later merge on (q, r) still works
        key_dtypes = {'q': 'int64', 'r': 'int64'}
        in_range_app_events = pd.DataFrame(
            columns=['q', 'r', 'in_range_app_events_count'], dtype='float64').astype(key_dtypes)
        out_of_range_app_events = pd.DataFrame(
            columns=['q', 'r', 'out_of_range_app_events_count'], dtype='float64').astype(key_dtypes)
        return

    window_events = pd.concat(frames)

    # ===== BIN THE APP EVENTS =====
    window_events['q'], window_events['r'] = cartesian_to_axial(
        window_events['merc_lng'],
        window_events['merc_lat'],
        size=HEX_SIZE,
        orientation='pointytop'
    )

    # ===== SEPARATE THE APP EVENTS BASED ON DISTANCE TO NEAREST VEHICLE =====
    nearby_events = window_events[window_events['vehicle_nearby'].eq(True)]
    distant_events = window_events[window_events['vehicle_nearby'].eq(False)]

    # ===== GROUP THE APP EVENTS AND COUNT THEM =====
    in_range_app_events = nearby_events.groupby(['q', 'r'])['event_datetime'].count().reset_index(
        name='in_range_app_events_count')
    out_of_range_app_events = distant_events.groupby(['q', 'r'])['event_datetime'].count().reset_index(
        name='out_of_range_app_events_count')

## Get Data Loaded and Ready

In [9]:
# Load the vehicle supply records, parsing the datetime columns and
# discarding every row that has a missing value.
supply_df = pd.read_csv(
    VEHICLE_DATAFILE,
    parse_dates=VEHICLE_DT_COLS,
    infer_datetime_format=True,
)
supply_df = supply_df.dropna()

# add the mercator-projected vehicle position
supply_df['merc_lng'], supply_df['merc_lat'] = convert_to_mercator(
    supply_df['lng'],
    supply_df['lat'],
)

# spatial filtering: keep only the rows inside the region's bounding box
# (both edges inclusive)
supply_df = supply_df[
    supply_df['merc_lng'].between(selected_region['x_min'], selected_region['x_max'])
    & supply_df['merc_lat'].between(selected_region['y_min'], selected_region['y_max'])
]

In [10]:
# Load the rental records, parsing the datetime columns and discarding
# every row that has a missing value.
rental_df = pd.read_csv(
    RENTAL_DATAFILE,
    parse_dates=RENTAL_DT_COLS,
    infer_datetime_format=True,
)
rental_df = rental_df.dropna()

# add the mercator-projected rental start position
rental_df['start_merc_lng'], rental_df['start_merc_lat'] = convert_to_mercator(
    rental_df['start_location_lng'],
    rental_df['start_location_lat'],
)

# spatial filtering: keep only the rows inside the region's bounding box
# (both edges inclusive)
rental_df = rental_df[
    rental_df['start_merc_lng'].between(selected_region['x_min'], selected_region['x_max'])
    & rental_df['start_merc_lat'].between(selected_region['y_min'], selected_region['y_max'])
]

# group by reservation day-of-week and hour so a time window can be pulled
# out group by group later on
rental_df_grouped = rental_df.groupby(['reserved_at_dow', 'reserved_at_hour'])

In [11]:
# Load the app events, parsing the datetime column and discarding every
# row that has a missing value.
app_events_df = pd.read_csv(
    APP_EVENTS_DATAFILE,
    parse_dates=APP_EVENTS_DT_COLS,
    infer_datetime_format=True,
)
app_events_df = app_events_df.dropna()

# add the mercator-projected user position
app_events_df['merc_lng'], app_events_df['merc_lat'] = convert_to_mercator(
    app_events_df['data-user_location-lng'],
    app_events_df['data-user_location-lat'],
)

# spatial filtering: keep only the rows inside the region's bounding box
# (both edges inclusive)
app_events_df = app_events_df[
    app_events_df['merc_lng'].between(selected_region['x_min'], selected_region['x_max'])
    & app_events_df['merc_lat'].between(selected_region['y_min'], selected_region['y_max'])
]

# group by the mask columns so a time window can be pulled out group by group
grouped_app_events = app_events_df.groupby(APP_EVENTS_MASKS)

## Merge Collected Stats

In [12]:
# merge the data and do some calculations
_hexbin_source = pd.DataFrame()
def merge_data():
    """Combine the per-hex statistics into the module-level ``_hexbin_source``.

    Reads the module-level frames ``idle_minutes_df``,
    ``rental_analysis_df_counts``, ``in_range_app_events``,
    ``out_of_range_app_events`` and ``revenue_aggregates_df``, outer-joins
    them on the hex keys ``q`` and ``r``, adds the derived ratio columns and
    replaces every missing value with 0. Nothing is returned.
    """
    global _hexbin_source

    # outer joins so that a hex present in any one of the inputs is kept
    merged = idle_minutes_df
    for stats in (rental_analysis_df_counts, in_range_app_events,
                  out_of_range_app_events, revenue_aggregates_df):
        merged = merged.merge(stats, on=['q', 'r'], how='outer')

    rentals = merged['rental_count']
    in_range = merged['in_range_app_events_count']
    out_of_range = merged['out_of_range_app_events_count']

    merged['rental_vehicle_ratio'] = rentals.div(merged['idle_events_count'])
    merged['rentals_per_available_minutes'] = rentals.div(merged['idle_minutes_per_block_sum'])
    merged['propensity_to_rent'] = rentals.div(in_range)
    merged['out_in_app_events_ratio'] = out_of_range.div(in_range)
    # the two projections build on the columns derived just above
    merged['projected_demand_increase'] = out_of_range.mul(merged['propensity_to_rent'])
    merged['potential_lost_revenue'] = merged['projected_demand_increase'].mul(
        merged['revenue_per_block_per_rental_mean'])

    # hexes missing from one of the inputs end up with 0 instead of NaN
    _hexbin_source = merged.fillna(0)

### Data Visualization Libraries

In [13]:
from ipywidgets import interactive
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
# from bokeh.tile_providers import get_provider, Vendors  # future
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.transform import linear_cmap, log_cmap
# import colorcet as cc  # other colorset


## Update/Redraw Map

In [14]:

def update(dow='Tuesday', hour_start=15, analysis_column='potential_lost_revenue'):
    """Recompute the hex statistics for one time window and draw the map.

    Parameters
    ----------
    dow : str
        Day name the analysis window starts on.
    hour_start : int
        First hour of the window. The window spans ``PERIOD_MINUTES // 60``
        hours from there.
    analysis_column : str
        Column of ``_hexbin_source`` that drives the hex fill colour.
    """
    # one shared window for all three data sets, sized from the configured
    # period instead of a repeated literal
    hour_end = hour_start + PERIOD_MINUTES // 60

    # refilter and regroup
    get_idle_supply_minutes(supply_df, dow, hour_start, hour_end)
    get_rental_stats(rental_df_grouped, dow, hour_start, hour_end)
    get_app_events_stats(grouped_app_events, dow, hour_start, hour_end)

    # remerge
    merge_data()
    map_figure = figure(
        x_range=(selected_region['x_min'], selected_region['x_max']),  # bounding box for starting view
        y_range=(selected_region['y_min'], selected_region['y_max']),
        x_axis_type='mercator',
        y_axis_type='mercator',
        plot_width=750,
        plot_height=750,
        title='HexBin Representation',
        tooltips=[('(q, r)', '(@q, @r)'),
                  ('Projected Rental Count Increase', '@projected_demand_increase'),
                  ('Projected Revenue Increase', '@potential_lost_revenue'),
                  ('Propensity to Rent', '@propensity_to_rent'),
                  ('vehicles', '@idle_events_count'),
                  ('rentals', '@rental_count'),
                  ('out_of_range_app_events', '@out_of_range_app_events_count'),
                  ('in_range_app_events', '@in_range_app_events_count'),
                  ('rental_vehicle_ratio', '@rental_vehicle_ratio'),
                  ('mean idle minutes/block', '@idle_minutes_per_block_mean'),
                  ('median idle minutes/block', '@idle_minutes_per_block_median'),
                  ('sum idle minutes/block', '@idle_minutes_per_block_sum'),
                  ('max idle minutes/block', '@idle_minutes_per_block_max'),
                  ('mean total idle minutes', '@total_idle_mins_mean'),
                  ('median total idle minutes', '@total_idle_mins_median'),
                  ('rev/block/rental_mean', '@revenue_per_block_per_rental_mean'),
                  ('rev/block/rental_median', '@revenue_per_block_median')]
    )

    # add background of streets for context
    # map_figure.add_tile(get_provider(Vendors.CARTODBPOSITRON)) # future
    map_figure.add_tile(CARTODBPOSITRON)

    # colour every hex by the selected column on a log scale from 0 up to the
    # largest value currently in the data
    map_figure.hex_tile(q='q', r='r', size=HEX_SIZE, source=_hexbin_source,
                        hover_color='pink', hover_alpha=0.8, fill_alpha=0.3,
                        fill_color=log_cmap(analysis_column, 'Viridis256',
                                            0, max(_hexbin_source[analysis_column])))

    # A fresh figure is shown on every call. No plot is ever shown with
    # notebook_handle=True, so there is no comms handle for push_notebook()
    # to update and calling it here would only emit a warning.
    show(map_figure)

In [15]:
# render bokeh output inside the notebook
output_notebook()

# Build the widget controls around `update`: a day-of-week dropdown, an
# hour slider (0-23) and a dropdown for the column that colours the hexes.
interactive_plot = interactive(
    update,
    **{
        'dow': list(calendar.day_name),
        'hour_start': (0, 23),
        'analysis_column': [
            'potential_lost_revenue',
            'out_in_app_events_ratio',
            'revenue_per_block_per_rental_mean',
            'idle_minutes_per_block_mean',
            'propensity_to_rent',
        ],
    }
)

# the last child is the output area; give it a fixed height
output = interactive_plot.children[-1]
output.layout.height = '800px'
interactive_plot

interactive(children=(Dropdown(description='dow', index=1, options=('Monday', 'Tuesday', 'Wednesday', 'Thursda…