In [1]:
import pandas as pd
import geopandas as gpd

import seaborn as sns

import gcsfs

from scipy import stats

import arviz as az
import pymc as pm
import patsy

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GroupKFold, GroupShuffleSplit, RandomizedSearchCV

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from count_modeling.constants import GCS_PATH, MODEL_DATA_PATH, GROUPER_COL, CURRENT_MODEL_DATA_VERSION, MODEL_PATH
from count_modeling.utils import get_model_data, get_timestamp

import pickle

## Read data

In [2]:
X_train, X_test, y_train, y_test, context_train, context_test = get_model_data(CURRENT_MODEL_DATA_VERSION)
grouper_train = context_train[GROUPER_COL]

# 2. Model Fitting
Now we'll start estimating models. First let's set up a container for storing results.


In [3]:
model_results = {}

def evaluate_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    
    results = {
        'in_sample_rmse': mean_squared_error(y_train, model.predict(X_train), squared=False),
        'out_sample_rmse': mean_squared_error(y_test, model.predict(X_test), squared=False),
        'model': model
    }
    print(results)
    return results


def model_cv(model, cv_params, n_iter=25):
    """Convenience function for doing cross-validation and evaluating the best model"""
    
    cv = RandomizedSearchCV(
        estimator=model,
        param_distributions=cv_params,
        cv=gkf.split(X_train, y_train, grouper_train),
        n_iter=n_iter,
        verbose=1
    )
    
    cv.fit(X_train, y_train)
    
    best_model = cv.best_estimator_
    
    return evaluate_model(best_model)

Make a set of folds for doing cross-validation for hyperparam tuning

In [4]:
gkf = GroupKFold()

## 2a. Dummy Model
As usual, the first thing we'll do is set a baseline for model accuracy by estimating a no-skill model.

In [5]:
dummy_model = DummyRegressor()
dummy_model.fit(X_train, y_train)

model_results['dummy'] = evaluate_model(dummy_model)

{'in_sample_rmse': 287.411110343937, 'out_sample_rmse': 172.04738719266348, 'model': DummyRegressor()}


## 2b. Vanilla Poisson Regression
Here we'll one-hot encode categoricals, standard scale everything else, and dump into a Poisson regression. We'll also tune the regularization on the Poisson.

In [6]:
categorical_cols = ['bike_facs', 'fc_draft']
categorical_drop_vals = ['Unknown', 7]

numeric_cols = [col for col in X_train.columns if col not in categorical_cols]

base_column_transformers = [
    ('categoricals', OneHotEncoder(drop=categorical_drop_vals), categorical_cols),
    ('standard_scale', StandardScaler(), numeric_cols)
]

In [7]:
vanilla_poisson_model = Pipeline(
    [
        ('transform', ColumnTransformer(base_column_transformers,)),
        ('poisson', PoissonRegressor(max_iter=1000))
    ])

vanilla_poisson_params = {'poisson__alpha': stats.uniform(0, 3),}

In [8]:
model_results['vanilla_poisson'] = model_cv(vanilla_poisson_model, vanilla_poisson_params)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
{'in_sample_rmse': 152.61787599425284, 'out_sample_rmse': 160.5048157364552, 'model': Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('categoricals',
                                                  OneHotEncoder(drop=['Unknown',
                                                                      7]),
                                                  ['bike_facs', 'fc_draft']),
                                                 ('standard_scale',
                                                  StandardScaler(),
                                                  ['speed', 'slope',
                                                   'dist_water',
                                                   'near_univ_miles',
                                                   'near_large_univ_miles',
                                                   'strava_commute_adb',
                                 

In [9]:
p = model_results['vanilla_poisson']['model'].named_steps['poisson']

In [10]:
pd.Series(
    data=p.coef_, 
    index=model_results['vanilla_poisson']['model'][:-1].get_feature_names_out())

categoricals__bike_facs_Class I      0.428874
categoricals__bike_facs_Class II     0.039765
categoricals__bike_facs_Class III    0.047300
categoricals__bike_facs_Class IV    -0.246080
categoricals__bike_facs_Class V      0.147913
                                       ...   
standard_scale__d5cei               -0.188428
standard_scale__d5dr                -0.005240
standard_scale__d5dri               -0.005236
standard_scale__d5de                -0.005240
standard_scale__d5dei               -0.005239
Length: 104, dtype: float64

Yikes, the out-of-sample RMSE is horrible! Let's try a tree-based model, then we'll try some interactions.

## 2c. Random Forest Regression

In [11]:
rfr_model = Pipeline(
    [
        ('transform', ColumnTransformer(base_column_transformers,)),
        ('rfr', RandomForestRegressor(n_jobs=-1))
    ])

rfr_params = {
        'rfr__n_estimators': stats.distributions.randint(2, 500),
        'rfr__min_samples_split': stats.distributions.randint(2, 25),
        'rfr__min_samples_leaf': stats.distributions.randint(1, 25),
        'rfr__max_features': ['sqrt',] + [0.1 * i for i in range(1, 10)],
        'rfr__max_depth': stats.distributions.randint(2, 1000),
    }

model_results['rfr'] = model_cv(rfr_model, rfr_params, 100)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'in_sample_rmse': 113.75992035931222, 'out_sample_rmse': 126.17392659866637, 'model': Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('categoricals',
                                                  OneHotEncoder(drop=['Unknown',
                                                                      7]),
                                                  ['bike_facs', 'fc_draft']),
                                                 ('standard_scale',
                                                  StandardScaler(),
                                                  ['speed', 'slope',
                                                   'dist_water',
                                                   'near_univ_miles',
                                                   'near_large_univ_miles',
                                                   'strava_commute_adb',
                               

## 2d. Gradient Boosting Regressor

In [12]:
gbr_model = Pipeline(
    [
        ('transform', ColumnTransformer(base_column_transformers,)),
        ('gbr', GradientBoostingRegressor(random_state=42))
    ])

gbr_params = {
    'gbr__learning_rate': stats.expon(1, 10),
    'gbr__n_estimators': [10, 100, 250, 500, 1000],
    'gbr__min_samples_split': stats.uniform(),
    'gbr__max_depth': stats.randint(1, 10),
#    'gbr__min_impurity_decrease': stats.expon(1, 10),
}

model_results['gbr'] = model_cv(gbr_model, gbr_params, 10)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sample_weight * ((y - raw_predictions.ravel()) ** 2))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  * np.sum(sam

{'in_sample_rmse': 59.57845723600397, 'out_sample_rmse': 165.37066034380135, 'model': Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('categoricals',
                                                  OneHotEncoder(drop=['Unknown',
                                                                      7]),
                                                  ['bike_facs', 'fc_draft']),
                                                 ('standard_scale',
                                                  StandardScaler(),
                                                  ['speed', 'slope',
                                                   'dist_water',
                                                   'near_univ_miles',
                                                   'near_large_univ_miles',
                                                   'strava_commute_adb',
                                                   'strava_leisure_adb',
                      

## 2. Comparison of models

In [13]:
display(pd.DataFrame(model_results).T)

Unnamed: 0,in_sample_rmse,out_sample_rmse,model
dummy,287.41111,172.047387,DummyRegressor()
vanilla_poisson,152.617876,160.504816,(ColumnTransformer(transformers=[('categorical...
rfr,113.75992,126.173927,(ColumnTransformer(transformers=[('categorical...
gbr,59.578457,165.37066,(ColumnTransformer(transformers=[('categorical...


In [21]:
fs = gcsfs.GCSFileSystem()

In [23]:
MODEL_VERSION = get_timestamp()
print(f"Writing out to version {MODEL_VERSION}")

Writing out to version 2023-05-24T08:35:37.020376-07:00


In [29]:
export_path = MODEL_PATH.format(version=MODEL_VERSION)
with fs.open(export_path, 'wb') as fi:
    pickle.dump(model_results, fi)
    print(f"Wrote model comparison dict to {export_path}")

Wrote model comparison dict to gs://smart4/models/2023-05-24T08:35:37.020376-07:00.pkl
