In [169]:
import h3
import pandas as pd
import geopandas as gpd
from shapely import geometry

import seaborn as sns

from scipy import stats

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import PoissonRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GroupKFold, GroupShuffleSplit, RandomizedSearchCV

from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 0. Data Read & Cleanup
Here we'll read in the data, drop a bunch of unnecessary columns, and also rename some columns to have cleaner names.

## Read data

In [204]:
gcs_path = 'gs://smart4'

count_data = pd.read_csv(f'{gcs_path}/count_file/Final_data_march18.csv')

unwanted_columns = [
    'Unnamed: 0',
    'X', 'ID',
    'ATT.in.Million',
    'bgarea_t', 'bgcliparea_t', 'pctofbgarea_t',
    'bgarea_q', 'bgcliparea_q', 'pctofbgarea_q',
    'bgarea_h', 'bgcliparea_h', 'pctofbgarea_h',
    'rt_i_shd_tot_width', 'lt_i_shd_tot_width',
    'near_strava_id', 
    'ATT.in.Thousands',
    'rowIndex', 'primary', 'secondary', 'tertiary', 'residential', 'trunk', 'secondary_link', 'unclassified',
    'speed_0_25', 'speed_21_35', 'speed_greater_than_35',
    'path', 'bike_lane',
    'bike_route', 'cycle_track', 'trail', 'Interstate', 'Freeway', 'Principal_Arterial',
    'Minor_Arterial', 'Major_Collector', 'Minor_Collector', 'Local'
]
count_data = count_data.drop(columns=unwanted_columns)

count_data['centroid'] = count_data.apply(lambda x: geometry.Point(x['Long'], x['Lat']), axis=1)
count_data = gpd.GeoDataFrame(count_data, geometry='centroid', crs=4326)

  arr = construct_1d_object_array_from_listlike(values)


## Data cleanup

In [205]:
count_data = count_data.rename(columns={
    'Stv_commute_adb': 'strava_commute_adb',
    'Stv_leisure_adb': 'strava_leisure_adb',
    'Stv_Ave_speed': 'strava_average_speed'})

count_data['total_lanes'] = count_data['rt_lanes_amt'] + count_data['lt_lanes_amt']
count_data['strava_leisure_pct'] = \
    count_data['strava_leisure_adb']/(count_data[['strava_leisure_adb', 'strava_commute_adb']].sum(axis=1))

count_data['bike_facs'] = count_data['bike_facs'].replace(
    {'Not Collected': 'Unknown', 
     'Class V': 'Unknown',
     'Class VI': 'Unknown'})

In [206]:
count_data['bike_facs'] = count_data['bike_facs'].fillna('Unknown')
count_data['fclass'] = count_data['fclass'].fillna('unknown')

## Make matrices

In [207]:
count_data.columns

Index(['location', 'Lat', 'Long', 'year', 'ATT', 'no_of_months_data_collected',
       'type', 'AADB', 'matched_seg_id', 'segment_id', 'street_name', 'county',
       'tdg_id', 'lrs_cal_id', 'bikes_proh', 'bike_facs', 'int_tdg_id',
       'loc_id', 'seg_counter', 'fclass', 'tasas_ids', 'ataip_ids', 'fc_draft',
       'speed', 'slope', 'empnum_density_t', 'pctwhite_t', 'totwhitepersqmi_t',
       'pctbiketowork_t', 'totbiketoworkpersqmi_t', 'pctatleastbachelors_t',
       'totatleastbachelorspersqmi_t', 'pctnoveh_t', 'totnovehpersqmi_t',
       'popdensitysqmi_t', 'hshlddensitysqmi_t', 'pctwhite_q',
       'totwhitepersqmi_q', 'pctbiketowork_q', 'totbiketoworkpersqmi_q',
       'pctatleastbachelors_q', 'totatleastbachelorspersqmi_q', 'pctnoveh_q',
       'totnovehpersqmi_q', 'popdensitysqmi_q', 'hshlddensitysqmi_q',
       'pctwhite_h', 'totwhitepersqmi_h', 'pctbiketowork_h',
       'totbiketoworkpersqmi_h', 'pctatleastbachelors_h',
       'totatleastbachelorspersqmi_h', 'pctnoveh_h', '

In [208]:
demographic_col_templates = [
    'popdensitysqmi_{}', 'hshlddensitysqmi_{}',
    'empnum_density_{}', 'pctwhite_{}', 'pctbiketowork_{}', 'pctatleastbachelors_{}', 
    'pctnoveh_{}']

demographic_cols = []
for distance in ('t', 'q', 'h'):
    demographic_cols += [col.format(distance) for col in demographic_col_templates]
    
strava_cols = ['strava_commute_adb', 'strava_leisure_adb', 'strava_average_speed', 'strava_leisure_pct']


y_col = 'AADB'
x_cols = ['bike_facs', 'fclass', 'fc_draft', 'speed', 'slope', 
          # 'adt_amt', 'total_lanes', ## - these were included, but are almost entirely Null
          'near_univ_miles', 'near_large_univ_miles'] + strava_cols + demographic_cols 


X = count_data[x_cols]
y = count_data[y_col]


ADT and number of lanes are almost entirely missing data.

Everything else likely makes sense to fill with 0s.

In [209]:
X = X.fillna(0)

# 1. Set up train-test splits
Here we're going to use a grouped train/test split, grouped based on the `h3` index at resolution 7. This is to try to prevent information leakage from locations that are spatially adjacent. It's not perfect, but it should help (other than in edge cases of counts right on either side of a grid cell line).

In [210]:
TEST_SIZE = 0.2
GROUPER_COL = 'h3_7'


In [211]:
count_data['h3_7'] = count_data.apply(lambda x: h3.geo_to_h3(x['Lat'], x['Long'], 7), axis=1)
count_data['h3_8'] = count_data.apply(lambda x: h3.geo_to_h3(x['Lat'], x['Long'], 8), axis=1)

grouper = count_data[GROUPER_COL]

In [212]:
train_indx, test_indx = next(GroupShuffleSplit(random_state=42,test_size=TEST_SIZE).split(X, y, grouper))

X_train, X_test, y_train, y_test = X.loc[train_indx], X.loc[test_indx], y[train_indx], y[test_indx]

grouper_train = count_data[GROUPER_COL][train_indx]

In [213]:
print(f"Test set is {len(test_indx)/(len(test_indx) + len(train_indx)) * 100 :.2f}% of sample")

Test set is 20.26% of sample


# 2. Model Fitting
Now we'll start estimating models. First let's set up a container for storing results.


In [214]:
model_results = {}

def evaluate_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    
    results = {
        'in_sample_rmse': mean_squared_error(y_train, model.predict(X_train), squared=False),
        'out_sample_rmse': mean_squared_error(y_test, model.predict(X_test), squared=False),
        'model': model
    }
    print(results)
    return results


Make a set of folds for doing cross-validation for hyperparam tuning

In [216]:
gkf = GroupKFold()

## 2a. Dummy Model
As usual, the first thing we'll do is set a baseline for model accuracy by estimating a no-skill model.

In [217]:
dummy_model = DummyRegressor()
dummy_model.fit(X_train, y_train)

model_results['dummy'] = evaluate_model(dummy_model)

{'in_sample_rmse': 276.74668661055586, 'out_sample_rmse': 272.6618123376978, 'model': DummyRegressor()}


## 2b. Vanilla Poisson Regression
Here we'll one-hot encode categoricals, standard scale everything else, and dump into a Poisson regression. We'll also tune the regularization on the Poisson.

In [219]:
categorical_cols = ['bike_facs', 'fclass', 'fc_draft']
categorical_drop_vals = ['Unknown', 'unknown', 1]

numeric_cols = [col for col in X_train.columns if col not in categorical_cols]

base_column_transformers = [
    ('categoricals', OneHotEncoder(drop=categorical_drop_vals), categorical_cols),
    ('standard_scale', StandardScaler(), numeric_cols)
]

In [220]:
vanilla_poisson_model = Pipeline(
    [
        ('transform', ColumnTransformer(base_column_transformers,)),
        ('poisson', PoissonRegressor(max_iter=1000))
    ])

In [224]:
vanilla_poisson_cv = RandomizedSearchCV(
    estimator=vanilla_poisson_model,
    param_distributions={
        'poisson__alpha': stats.uniform(0, 3),
    },
    cv=gkf.split(X_train, y_train, grouper_train),
    n_iter=25)

In [225]:
vanilla_poisson_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f7171a1af50>,
                   estimator=Pipeline(steps=[('transform',
                                              ColumnTransformer(transformers=[('categoricals',
                                                                               OneHotEncoder(drop=['Unknown',
                                                                                                   'unknown',
                                                                                                   1]),
                                                                               ['bike_facs',
                                                                                'fclass',
                                                                                'fc_draft']),
                                                                              ('standard_scale',
                                                                       

In [229]:
model_results['vanilla_poisson'] = evaluate_model(vanilla_poisson_cv.best_estimator_)

{'in_sample_rmse': 186.65047464211733, 'out_sample_rmse': 320.42087239595185, 'model': Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('categoricals',
                                                  OneHotEncoder(drop=['Unknown',
                                                                      'unknown',
                                                                      1]),
                                                  ['bike_facs', 'fclass',
                                                   'fc_draft']),
                                                 ('standard_scale',
                                                  StandardScaler(),
                                                  ['speed', 'slope',
                                                   'near_univ_miles',
                                                   'near_large_univ_miles',
                                                   'strava_commute_adb',
                 

Yikes, the out-of-sample RMSE is horrible! Let's try a tree-based model, then we'll try some interactions.

## 2c. Random Forest Regression