# CaBi ML fitting - Random Forest

Trying out Random Forest here since it seems so effective and quick to compute.



## 0. Data load, shaping, and split
* Read in data from AWS
* Encode time variable (day_of_year) as cyclical
* Split into Xtrain, Xtest, ytrain, ytest based on date
  * Specify feature and target columns

In [1]:
# Read in data from AWS

from util_functions import *
import numpy as np
import pandas as pd
import time
start_time = time.perf_counter()

set_env_path()
conn, cur = aws_connect()

# fullquery contains pretty much everything

fullquery = """
SELECT 
EXTRACT(DOY FROM date) as day_of_year,
date,
year,
quarter,
month,
day_of_week,
daylight_hours,
apparenttemperaturehigh,
apparenttemperaturehightime,
apparenttemperaturelow,
apparenttemperaturelowtime,
precipintensitymaxtime,
sunrisetime,
sunsettime,
cloudcover,
dewpoint,
humidity,
precipaccumulation,
precipintensitymax,
precipprobability,
rain,
snow,
visibility,
windspeed,
us_holiday,
nats_single,
nats_double,
nats_attendance,
dc_bike_event,
dc_pop,
cabi_bikes_avail,
cabi_stations_alx,
cabi_stations_arl,
cabi_stations_ffx,
cabi_stations_mcn,
cabi_stations_mcs,
cabi_stations_wdc,
cabi_docks_alx,
cabi_docks_arl,
cabi_docks_ffx,
cabi_docks_mcn,
cabi_docks_mcs,
cabi_docks_wdc,
cabi_stations_tot,
cabi_docks_tot,
cabi_dur_empty_wdc,
cabi_dur_full_wdc,
cabi_dur_empty_arl,
cabi_dur_full_arl,
cabi_dur_full_alx,
cabi_dur_empty_alx,
cabi_dur_empty_mcs,
cabi_dur_full_mcs,
cabi_dur_full_mcn,
cabi_dur_empty_mcn,
cabi_dur_full_ffx,
cabi_dur_empty_ffx,
cabi_dur_empty_tot,
cabi_dur_full_tot,
cabi_active_members_day_key,
cabi_active_members_monthly,
cabi_active_members_annual,
cabi_trips_wdc_to_wdc,
cabi_trips_wdc_to_wdc_casual
from final_db"""

query = """
SELECT 
EXTRACT(DOY FROM date) as day_of_year,
date,
year,
quarter,
month,
day_of_week,
daylight_hours,
apparenttemperaturehigh,
apparenttemperaturehightime,
apparenttemperaturelow,
apparenttemperaturelowtime,
precipintensitymaxtime,
sunrisetime,
sunsettime,
cloudcover,
dewpoint,
humidity,
precipaccumulation,
precipintensitymax,
precipprobability,
rain,
snow,
visibility,
windspeed,
us_holiday,
nats_single,
nats_double,
nats_attendance,
dc_bike_event,
dc_pop,
cabi_trips_wdc_to_wdc,
cabi_trips_wdc_to_wdc_casual
from final_db"""

pd.options.display.max_rows = None
pd.options.display.max_columns = None

df = pd.read_sql(fullquery, con=conn)

# Setting date to index for easier splitting
df.set_index(df.date, drop=True, inplace=True)
df.index = pd.to_datetime(df.index)

print("We have {} instances and {} features".format(*df.shape))

We have 2780 instances and 64 features


In [2]:
# Summary statistics

df.describe(percentiles=[.5]).round(3).transpose()

Unnamed: 0,count,mean,std,min,50%,max
day_of_year,2780.0,182.697,107.702,1.0,182.0,366.0
year,2780.0,2014.024,2.228,2010.0,2014.0,2018.0
quarter,2780.0,2.505,1.14,1.0,3.0,4.0
month,2780.0,6.507,3.522,1.0,7.0,12.0
day_of_week,2780.0,2.999,2.0,0.0,3.0,6.0
daylight_hours,2780.0,12.077,2.021,9.0,12.0,15.0
apparenttemperaturehigh,2780.0,64.306,20.594,2.24,65.89,113.67
apparenttemperaturehightime,2780.0,15.46,2.352,7.0,16.0,19.0
apparenttemperaturelow,2780.0,48.574,19.685,-10.84,50.885,90.06
apparenttemperaturelowtime,2780.0,6.783,4.354,0.0,6.0,23.0


In [3]:
def print_highly_correlated(df, features, threshold=0.75):
    """ 
    Prints highly correlated feature pairs in df.
    """
    corr_df = df[features].corr()
    # Select pairs above threshold
    correlated_features = np.where(np.abs(corr_df) > threshold)
    # Avoid duplication
    correlated_features = [(corr_df.iloc[x,y], x, y) for x, y in zip(*correlated_features) if x != y and x < y]
    # Sort by abs(correlation)
    s_corr_list = sorted(correlated_features, key=lambda x: -abs(x[0]))
    print("There are {} feature pairs with pairwise correlation above {}".format(len(s_corr_list), threshold))
    for v, i, j in s_corr_list:
        cols = df[features].columns
        print("{} and {} = {:0.3f}".format(corr_df.index[i], corr_df.columns[j], v))
        
print_highly_correlated(df, df.columns)

There are 163 feature pairs with pairwise correlation above 0.75
cabi_stations_mcs and cabi_docks_mcs = 1.000
cabi_stations_tot and cabi_docks_tot = 1.000
cabi_stations_ffx and cabi_docks_ffx = 0.999
cabi_stations_wdc and cabi_docks_wdc = 0.999
cabi_stations_arl and cabi_docks_arl = 0.998
cabi_stations_mcn and cabi_docks_mcn = 0.997
day_of_year and month = 0.997
cabi_stations_wdc and cabi_docks_tot = 0.994
cabi_stations_wdc and cabi_stations_tot = 0.994
cabi_docks_wdc and cabi_docks_tot = 0.993
cabi_dur_empty_wdc and cabi_dur_empty_tot = 0.992
cabi_bikes_avail and cabi_stations_tot = 0.992
cabi_dur_full_wdc and cabi_dur_full_tot = 0.992
cabi_docks_wdc and cabi_stations_tot = 0.991
cabi_bikes_avail and cabi_docks_tot = 0.991
cabi_stations_alx and cabi_docks_alx = 0.990
dc_pop and cabi_docks_arl = 0.989
cabi_stations_mcn and cabi_stations_mcs = 0.988
dc_pop and cabi_docks_tot = 0.988
cabi_bikes_avail and cabi_stations_wdc = 0.988
dc_pop and cabi_stations_wdc = 0.988
dc_pop and cabi_docks

In [4]:
# Encode day_of_year as cyclical
df['sin_day_of_year'] = np.sin(2*np.pi*df.day_of_year/365)
df['cos_day_of_year'] = np.cos(2*np.pi*df.day_of_year/365)

In [5]:
df.sample(100).plot.scatter('sin_day_of_year','cos_day_of_year').set_aspect('equal')

* Split into Xtrain, Xtest, ytrain, ytest based on date
  * Training dates = 2013-01-01 to 2016-12-31
  * Test dates = 2017-01-01 to 2017-09-08
  * New data (coincides with beginning of dockless pilot) = 2017-09-09 to present

In [6]:
# Train test split
# This can be tweaked, but we use 5-fold cross-validation to pick the model so that shouldn't change

train = df.loc['2013-01-01':'2016-12-31']
test = df.loc['2017-01-01':'2017-09-08']
print(train.shape, test.shape)

tr = train.shape[0]
te = test.shape[0]
trpct = tr/(tr+te)
tepct = te/(tr+te)

print("{:0.3f} percent of the data is in the training set and {:0.3f} percent is in the test set".format(trpct, tepct))

(1461, 66) (251, 66)
0.853 percent of the data is in the training set and 0.147 percent is in the test set


In [7]:
# Specify columns to keep and drop for X and y
drop_cols = ['date']
y_cols = ['cabi_trips_wdc_to_wdc', 'cabi_trips_wdc_to_wdc_casual']

feature_cols = [col for col in df.columns if (col not in y_cols) & (col not in drop_cols)]

# X y split
Xtrain_raw = train[feature_cols]

# Our target variable here is all DC to DC trips
ytrain = train[y_cols[0]]
Xtest_raw = test[feature_cols]
ytest = test[y_cols[0]]
print(Xtrain_raw.shape, ytrain.shape, Xtest_raw.shape, ytest.shape)

(1461, 63) (1461,) (251, 63) (251,)


### 1. Preprocessing

Trying it without any preprocessing first.

In [8]:
from sklearn.preprocessing import PolynomialFeatures

# Use PolynomialFeatures to create quadratic and interaction terms
# Should ultimately be part of a Pipeline, but I had issues because 
# PF returns an array and Columns requires a df

pf = PolynomialFeatures(1, include_bias=False)

Xtrain_pf_array = pf.fit_transform(Xtrain_raw)
Xtest_pf_array = pf.transform(Xtest_raw)

# Get feature names 
Xtrain_cols = pf.get_feature_names(Xtrain_raw.columns)

# Convert arrays to dfs with the new pf column names
Xtrain = pd.DataFrame(Xtrain_pf_array, columns=Xtrain_cols)
Xtest = pd.DataFrame(Xtest_pf_array, columns=Xtrain_cols)

print(Xtrain.shape, Xtest.shape)

(1461, 63) (251, 63)


In [9]:
Xtrain.describe(percentiles=[.5]).round(3).transpose()

Unnamed: 0,count,mean,std,min,50%,max
day_of_year,1461.0,183.125,105.475,1.0,183.0,366.0
year,1461.0,2014.501,1.119,2013.0,2015.0,2016.0
quarter,1461.0,2.509,1.117,1.0,3.0,4.0
month,1461.0,6.523,3.45,1.0,7.0,12.0
day_of_week,1461.0,3.003,2.0,0.0,3.0,6.0
daylight_hours,1461.0,12.183,2.035,9.0,12.0,15.0
apparenttemperaturehigh,1461.0,64.597,20.97,2.24,66.66,107.32
apparenttemperaturehightime,1461.0,15.425,2.279,7.0,16.0,19.0
apparenttemperaturelow,1461.0,48.905,20.114,-10.84,51.9,86.07
apparenttemperaturelowtime,1461.0,6.75,4.435,0.0,6.0,23.0


In [10]:
# Appending train and test to get full dataset for cross-validation

Xfull = Xtrain.append(Xtest)
yfull = ytrain.append(ytest)
print(Xfull.shape, yfull.shape)
print("Final vars=", Xfull.shape[1])

(1712, 63) (1712,)
Final vars= 63


### 2. Model Fitting

In [11]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import median_absolute_error as medae
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [12]:
def score_model(model):
    """ 
    Fits a model using the training set, predicts using the test set, and then calculates 
    and reports goodness of fit metrics.
    """
    model.fit(Xtrain, ytrain)
    yhat = model.predict(Xtest)
    r2 = r2_score(ytest, yhat)
    me = mse(ytest, yhat)
    ae = mae(ytest, yhat)
    mede = medae(ytest, yhat)
    ev = evs(ytest, yhat)
    print("Results from {}: \nr2={:0.3f} \nMSE={:0.3f} \
          \nMAE={:0.3f} \nMEDAE={:0.3f} \nEVS={:0.3f}".format(model, r2, me, ae, mede, ev))

def cv_score(model, cv=5):
    """
    Evaluates a model by 5-fold cross-validation and prints mean and 2*stdev of scores.
    Shuffles before cross-validation but sets random_state=7 for reproducibility.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=7)
    scores = cross_val_score(model, Xfull, yfull, cv=kf)
    print(scores)
    print("Accuracy: {:0.3f} (+/- {:0.3f})".format(scores.mean(), scores.std() * 2))
    

In [13]:
from sklearn.ensemble import RandomForestRegressor

t = time.perf_counter()
rf = RandomForestRegressor(n_estimators=500) 
score_model(rf)
elapsed_time = (time.perf_counter() - t)/60
print("This cell took {:0.2f} minutes to run".format(elapsed_time))

Results from RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False): 
r2=0.824 
MSE=1680139.301           
MAE=1056.468 
MEDAE=917.658 
EVS=0.845
This cell took 0.21 minutes to run


In [14]:
t = time.perf_counter()
cv_score(rf)
elapsed_time = (time.perf_counter() - t)/60
print("This cell took {:0.2f} minutes to run".format(elapsed_time))

[0.91857589 0.92305609 0.88748569 0.93329556 0.93926031]
Accuracy: 0.920 (+/- 0.036)
This cell took 1.02 minutes to run


In [15]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = Xtrain.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances.head(20)

Unnamed: 0,importance
cabi_dur_empty_tot,0.649995
apparenttemperaturehigh,0.098798
cabi_dur_empty_arl,0.08127
cabi_dur_empty_wdc,0.020685
precipprobability,0.015374
day_of_year,0.013816
cabi_active_members_monthly,0.013805
precipintensitymax,0.010982
cabi_dur_empty_mcs,0.007215
visibility,0.007192


In [16]:
end_time = (time.perf_counter() - start_time)/60
print("This notebook took {:0.2f} minutes to run".format(end_time))

This notebook took 1.27 minutes to run


To do:
* No polynomials, 3 polynomials
* How to interpret the coefficients?
* Modify train/test split size