# CaBi ML fitting sandbox

5/27: Sandbox created from copying the Champion nb.
* At this point, I've found that dc_pop is more predictive than the dock/station variables and cabi_active_members_day_key and daylight_hours is more predictive than cabi_active_members_monthly
* Now we can try tweaking other things
* After changing the cross-validation to include shuffling, everything performs better, including Ridge
  * This is probably a good thing? It shows that the model is more generalizable, and that any issues we had in CV earlier were because the non-shuffled folds weren't each representative of the full sample

## 0. Data load, shaping, and split
* Read in data from AWS
  * Check for high pairwise correlation
* Encode time variable (day_of_year) as cyclical
* Split into Xtrain, Xtest, ytrain, ytest based on date
  * Specify feature and target columns

In [1]:
# Read in data from AWS

from util_functions import *
import numpy as np
import pandas as pd
import time
start_time = time.perf_counter()

set_env_path()
conn, cur = aws_connect()

# fullquery contains all of the variables within consideration

fullquery = """
SELECT 
EXTRACT(DOY FROM date) as day_of_year,
date,
daylight_hours,
apparenttemperaturehigh,
apparenttemperaturelow,
cloudcover,
dewpoint,
humidity,
precipaccumulation,
precipintensitymax,
precipprobability,
rain,
snow,
visibility,
windspeed,
us_holiday,
nats_single,
nats_double,
dc_bike_event,
dc_pop,
cabi_bikes_avail,
cabi_stations_alx,
cabi_stations_arl,
cabi_stations_ffx,
cabi_stations_mcn,
cabi_stations_mcs,
cabi_stations_wdc,
cabi_docks_alx,
cabi_docks_arl,
cabi_docks_ffx,
cabi_docks_mcn,
cabi_docks_mcs,
cabi_docks_wdc,
cabi_stations_tot,
cabi_docks_tot,
cabi_dur_empty_wdc,
cabi_dur_full_wdc,
cabi_dur_empty_arl,
cabi_dur_full_arl,
cabi_dur_full_alx,
cabi_dur_empty_alx,
cabi_dur_empty_mcs,
cabi_dur_full_mcs,
cabi_dur_full_mcn,
cabi_dur_empty_mcn,
cabi_dur_full_ffx,
cabi_dur_empty_ffx,
cabi_dur_empty_tot,
cabi_dur_full_tot,
cabi_active_members_day_key,
cabi_active_members_monthly,
cabi_active_members_annual,
cabi_trips_wdc_to_wdc,
cabi_trips_wdc_to_wdc_casual
from final_db"""

query = """
SELECT 
EXTRACT(DOY FROM date) as day_of_year,
date,
daylight_hours,
apparenttemperaturehigh,
cloudcover,
humidity,
precipaccumulation,
precipintensitymax,
precipprobability,
rain,
snow,
visibility,
windspeed,
us_holiday,
nats_single,
nats_double,
dc_bike_event,
dc_pop,
cabi_dur_empty_arl,
cabi_dur_full_arl,
cabi_dur_full_alx,
cabi_dur_empty_alx,
cabi_dur_empty_mcs,
cabi_dur_full_mcs,
cabi_dur_full_mcn,
cabi_dur_empty_mcn,
cabi_trips_wdc_to_wdc,
cabi_trips_wdc_to_wdc_casual
from final_db"""

pd.options.display.max_rows = None
pd.options.display.max_columns = None

df = pd.read_sql(query, con=conn)

# Setting date to index for easier splitting
df.set_index(df.date, drop=True, inplace=True)
df.index = pd.to_datetime(df.index)

print("We have {} instances and {} features".format(*df.shape))

We have 2780 instances and 54 features


In [2]:
# Summary statistics

df.describe(percentiles=[.5]).round(3).transpose()

Unnamed: 0,count,mean,std,min,50%,max
day_of_year,2780.0,182.697,107.702,1.0,182.0,366.0
daylight_hours,2780.0,12.077,2.021,9.0,12.0,15.0
apparenttemperaturehigh,2780.0,64.306,20.594,2.24,65.89,113.67
apparenttemperaturelow,2780.0,48.574,19.685,-10.84,50.885,90.06
cloudcover,2780.0,0.34,0.244,0.0,0.27,1.0
dewpoint,2780.0,45.143,18.577,-9.73,46.435,75.84
humidity,2780.0,0.669,0.137,0.21,0.67,0.97
precipaccumulation,2780.0,0.045,0.527,0.0,0.0,21.427
precipintensitymax,2780.0,0.029,0.072,0.0,0.001,1.185
precipprobability,2780.0,0.282,0.373,0.0,0.0,1.0


In [3]:
def print_highly_correlated(df, features, threshold=0.75):
    """ 
    Prints highly correlated feature pairs in df.
    """
    corr_df = df[features].corr()
    # Select pairs above threshold
    correlated_features = np.where(np.abs(corr_df) > threshold)
    # Avoid duplication
    correlated_features = [(corr_df.iloc[x,y], x, y) for x, y in zip(*correlated_features) if x != y and x < y]
    # Sort by abs(correlation)
    s_corr_list = sorted(correlated_features, key=lambda x: -abs(x[0]))
    print("There are {} feature pairs with pairwise correlation above {}".format(len(s_corr_list), threshold))
    for v, i, j in s_corr_list:
        cols = df[features].columns
        print("{} and {} = {:0.3f}".format(corr_df.index[i], corr_df.columns[j], v))
        
print_highly_correlated(df, df.columns)

There are 140 feature pairs with pairwise correlation above 0.75
cabi_stations_mcs and cabi_docks_mcs = 1.000
cabi_stations_tot and cabi_docks_tot = 1.000
cabi_stations_ffx and cabi_docks_ffx = 0.999
cabi_stations_wdc and cabi_docks_wdc = 0.999
cabi_stations_arl and cabi_docks_arl = 0.998
cabi_stations_mcn and cabi_docks_mcn = 0.997
cabi_stations_wdc and cabi_docks_tot = 0.994
cabi_stations_wdc and cabi_stations_tot = 0.994
cabi_docks_wdc and cabi_docks_tot = 0.993
cabi_dur_empty_wdc and cabi_dur_empty_tot = 0.992
cabi_bikes_avail and cabi_stations_tot = 0.992
cabi_dur_full_wdc and cabi_dur_full_tot = 0.992
cabi_docks_wdc and cabi_stations_tot = 0.991
cabi_bikes_avail and cabi_docks_tot = 0.991
cabi_stations_alx and cabi_docks_alx = 0.990
dc_pop and cabi_docks_arl = 0.989
cabi_stations_mcn and cabi_stations_mcs = 0.988
dc_pop and cabi_docks_tot = 0.988
cabi_bikes_avail and cabi_stations_wdc = 0.988
dc_pop and cabi_stations_wdc = 0.988
dc_pop and cabi_docks_wdc = 0.987
dc_pop and cabi_s

In [4]:
# Encode day_of_year as cyclical
df['sin_day_of_year'] = np.sin(2*np.pi*df.day_of_year/365)
df['cos_day_of_year'] = np.cos(2*np.pi*df.day_of_year/365)

In [5]:
df.sample(100).plot.scatter('sin_day_of_year','cos_day_of_year').set_aspect('equal')

* Split into Xtrain, Xtest, ytrain, ytest based on date
  * Training dates = 2013-01-01 to 2016-12-31
  * Test dates = 2017-01-01 to 2017-09-08
  * New data (coincides with beginning of dockless pilot) = 2017-09-09 to present

In [6]:
# Train test split
# This can be tweaked, but we use 5-fold cross-validation to pick the model so that shouldn't change

train = df.loc['2013-01-01':'2016-12-31']
test = df.loc['2017-01-01':'2017-09-08']
print(train.shape, test.shape)

tr = train.shape[0]
te = test.shape[0]
trpct = tr/(tr+te)
tepct = te/(tr+te)

print("{:0.3f} percent of the data is in the training set and {:0.3f} percent is in the test set".format(trpct, tepct))

(1461, 56) (251, 56)
0.853 percent of the data is in the training set and 0.147 percent is in the test set


In [7]:
# Specify columns to keep and drop for X and y
drop_cols = ['date', 'day_of_year']
y_cols = ['cabi_trips_wdc_to_wdc', 'cabi_trips_wdc_to_wdc_casual']

feature_cols = [col for col in df.columns if (col not in y_cols) & (col not in drop_cols)]

# X y split
Xtrain_raw = train[feature_cols]

# Our target variable here is all DC to DC trips
ytrain = train[y_cols[0]]
Xtest_raw = test[feature_cols]
ytest = test[y_cols[0]]
print(Xtrain_raw.shape, ytrain.shape, Xtest_raw.shape, ytest.shape)

(1461, 52) (1461,) (251, 52) (251,)


### 1. Preprocessing

We want to use PolynomialFeatures and StandardScaler in a Pipeline, but we only want to scale continuous features.

Here, I do the polynomial transformation first and then feed it through a pipeline because I wasn't able to get it all working in one pipeline.

* Use PolynomialFeatures to create quadratic and interaction terms
  * Convert back to DataFrame
  * Drop redundant variables
* Use Pipeline and FeatureUnion to selectively scale/ignore certain variables
* Fit and transform using pipeline to get final Xtrain and Xtest

In [8]:
# Imports and custom classes
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin

class Columns(BaseEstimator, TransformerMixin):
    """ 
    This is a custom transformer for splitting the data into subsets for FeatureUnion.
    """
    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X[self.names]
    

In [9]:
# Use PolynomialFeatures to create quadratic and interaction terms
# Should ultimately be part of a Pipeline, but I had issues because 
# PF returns an array and Columns requires a df

pf = PolynomialFeatures(1, include_bias=False)

Xtrain_pf_array = pf.fit_transform(Xtrain_raw)
Xtest_pf_array = pf.transform(Xtest_raw)

# Get feature names 
Xtrain_cols = pf.get_feature_names(Xtrain_raw.columns)

# Convert arrays to dfs with the new pf column names
Xtrain_pf = pd.DataFrame(Xtrain_pf_array, columns=Xtrain_cols)
Xtest_pf = pd.DataFrame(Xtest_pf_array, columns=Xtrain_cols)

print(Xtrain_pf.shape, Xtest_pf.shape)

(1461, 52) (251, 52)


In [10]:
# A lot of these variables are redundant, especially squared dummy variables
# All of these variables listed next are 'binary' but only some are meaningful

bin_vars = [col for col in Xtrain_pf.columns if Xtrain_pf[col].nunique() == 2]
bin_vars

['rain',
 'snow',
 'us_holiday',
 'nats_single',
 'nats_double',
 'dc_bike_event',
 'cabi_dur_empty_ffx']

In [11]:
# Dropping squared dummies and nonsensical interaction terms
# This part can be expanded. There's a lot of noise after PF

to_drop = [
    'rain^2', 'snow^2', 'us_holiday^2', 'nats_single^2', 'nats_double^2', 
    'dc_bike_event^2', 'sin_day_of_year^2', 'cos_day_of_year^2',
    'sin_day_of_year cos_day_of_year'
]
'''
Xtrain_pf2 = Xtrain_pf.drop(labels=to_drop, axis=1)
Xtest_pf2 = Xtest_pf.drop(labels=to_drop, axis=1)
'''
Xtrain_pf2 = Xtrain_pf.copy()
Xtest_pf2 = Xtest_pf.copy()

print(Xtrain_pf2.shape, Xtest_pf2.shape)

(1461, 52) (251, 52)


In [12]:
Xtrain_pf2.head()

Unnamed: 0,daylight_hours,apparenttemperaturehigh,apparenttemperaturelow,cloudcover,dewpoint,humidity,precipaccumulation,precipintensitymax,precipprobability,rain,snow,visibility,windspeed,us_holiday,nats_single,nats_double,dc_bike_event,dc_pop,cabi_bikes_avail,cabi_stations_alx,cabi_stations_arl,cabi_stations_ffx,cabi_stations_mcn,cabi_stations_mcs,cabi_stations_wdc,cabi_docks_alx,cabi_docks_arl,cabi_docks_ffx,cabi_docks_mcn,cabi_docks_mcs,cabi_docks_wdc,cabi_stations_tot,cabi_docks_tot,cabi_dur_empty_wdc,cabi_dur_full_wdc,cabi_dur_empty_arl,cabi_dur_full_arl,cabi_dur_full_alx,cabi_dur_empty_alx,cabi_dur_empty_mcs,cabi_dur_full_mcs,cabi_dur_full_mcn,cabi_dur_empty_mcn,cabi_dur_full_ffx,cabi_dur_empty_ffx,cabi_dur_empty_tot,cabi_dur_full_tot,cabi_active_members_day_key,cabi_active_members_monthly,cabi_active_members_annual,sin_day_of_year,cos_day_of_year
0,9.0,41.12,27.08,0.82,28.14,0.61,0.0,0.0,0.0,0.0,0.0,10.0,4.91,1.0,0.0,0.0,0.0,646400.0,1740.0,8.0,45.0,0.0,0.0,0.0,141.0,251.0,806.0,0.0,0.0,0.0,3142.0,194.0,4199.0,94125.0,98078.0,0.0,8635.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94125.0,106713.0,0.0,72.0,20777.0,0.017213,0.999852
1,9.0,30.18,23.83,0.49,20.08,0.54,0.0,0.0,0.0,0.0,0.0,10.0,6.75,0.0,0.0,0.0,0.0,646400.0,1740.0,8.0,45.0,0.0,0.0,0.0,141.0,251.0,806.0,0.0,0.0,0.0,3142.0,194.0,4199.0,265014.0,231465.0,39580.0,14297.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,304594.0,245762.0,0.0,72.0,20023.0,0.034422,0.999407
2,9.0,32.65,25.05,0.3,18.69,0.59,0.0,0.0,0.0,0.0,0.0,10.0,1.29,0.0,0.0,0.0,0.0,646400.0,1740.0,8.0,45.0,0.0,0.0,0.0,141.0,251.0,806.0,0.0,0.0,0.0,3142.0,194.0,4199.0,343719.0,309701.0,9426.0,12830.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,353145.0,322531.0,0.0,72.0,20023.0,0.05162,0.998667
3,10.0,37.6,25.99,0.16,20.54,0.55,0.0,0.0,0.0,0.0,0.0,9.99,6.83,0.0,0.0,0.0,0.0,646400.0,1740.0,8.0,45.0,0.0,0.0,0.0,141.0,251.0,806.0,0.0,0.0,0.0,3142.0,194.0,4199.0,306638.0,237864.0,27949.0,40820.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,334587.0,278684.0,0.0,72.0,20023.0,0.068802,0.99763
4,10.0,43.83,32.01,0.21,22.79,0.55,0.0,0.0,0.0,0.0,0.0,9.98,2.92,0.0,0.0,0.0,0.0,646400.0,1740.0,8.0,45.0,0.0,0.0,0.0,141.0,251.0,806.0,0.0,0.0,0.0,3142.0,194.0,4199.0,141722.0,151893.0,13432.0,22196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,155154.0,174089.0,0.0,72.0,20023.0,0.085965,0.996298


In [13]:
# Defining binary and continuous variables
# We have normal 0,1 binary variables, binary variables outside 0,1 that were created by PF, and continuous variables
# We want to ignore the 0,1s, MinMaxScale the non 0,1 binary variables, and StandardScale the continuous variables

binary = [col for col in Xtrain_pf2.columns if Xtrain_pf2[col].nunique() == 2]
cont = [col for col in Xtrain_pf2.columns if (col not in binary)]

# FeatureUnion in our pipeline shifts the ordering of the variables so we need to save the ordering here
cols = binary + cont

pipeline = Pipeline([
    ('features', FeatureUnion([
        ('binarypf', Pipeline([
            ('binpfcols', Columns(names=binary)),
            ('minmax', MinMaxScaler())
        ])),
        ('continuous', Pipeline([
            ('contcols', Columns(names=cont)),
            ('scaler', StandardScaler())
        ]))
    ]))   
])

In [14]:
# Fit and transform to create our final Xtrain and Xtest

pipeline.fit(Xtrain_pf2)
Xtrain_scaled = pipeline.transform(Xtrain_pf2)
Xtest_scaled = pipeline.transform(Xtest_pf2)

# Put everything back into dfs
Xtrain = pd.DataFrame(Xtrain_scaled, columns=cols)
Xtest = pd.DataFrame(Xtest_scaled, columns=cols)
print(Xtrain.shape, Xtest.shape)

(1461, 52) (251, 52)


In [15]:
Xtrain.describe(percentiles=[.5]).round(3).transpose()

Unnamed: 0,count,mean,std,min,50%,max
rain,1461.0,0.449,0.498,0.0,0.0,1.0
snow,1461.0,0.047,0.211,0.0,0.0,1.0
us_holiday,1461.0,0.029,0.167,0.0,0.0,1.0
nats_single,1461.0,0.211,0.408,0.0,0.0,1.0
nats_double,1461.0,0.005,0.074,0.0,0.0,1.0
dc_bike_event,1461.0,0.012,0.11,0.0,0.0,1.0
cabi_dur_empty_ffx,1461.0,0.001,0.026,0.0,0.0,1.0
daylight_hours,1461.0,0.0,1.0,-1.565,-0.09,1.385
apparenttemperaturehigh,1461.0,-0.0,1.0,-2.975,0.098,2.038
apparenttemperaturelow,1461.0,0.0,1.0,-2.971,0.149,1.848


In [16]:
# Appending train and test to get full dataset for cross-validation

Xfull = Xtrain.append(Xtest)
yfull = ytrain.append(ytest)
print(Xfull.shape, yfull.shape)

(1712, 52) (1712,)


### 2. Model Fitting

In [17]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import median_absolute_error as medae
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

In [18]:
from sklearn.model_selection import KFold

def score_model(model, alpha=False):
    """ 
    Fits a model using the training set, predicts using the test set, and then calculates 
    and reports goodness of fit metrics and alpha if specified and available.
    """
    model.fit(Xtrain, ytrain)
    yhat = model.predict(Xtest)
    r2 = r2_score(ytest, yhat)
    me = mse(ytest, yhat)
    ae = mae(ytest, yhat)
    mede = medae(ytest, yhat)
    ev = evs(ytest, yhat)
    
    if alpha == True:
        print("Results from {}: \nr2={:0.3f} \nMSE={:0.3f} \
              \nMAE={:0.3f} \nMEDAE={:0.3f} \nEVS={:0.3f} \nalpha={:0.3f}".format(model, r2, me, 
                                                                                  ae, mede, ev, model.alpha_))
    else:
        print("Results from {}: \nr2={:0.3f} \nMSE={:0.3f} \
              \nMAE={:0.3f} \nMEDAE={:0.3f} \nEVS={:0.3f}".format(model, r2, me, ae, mede, ev))

def cv_score(model, cv=5):
    """
    Evaluates a model by 5-fold cross-validation and prints mean and 2*stdev of scores.
    Shuffles before cross-validation but sets random_state=7 for reproducibility.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=7)
    scores = cross_val_score(model, Xfull, yfull, cv=kf)
    print(scores)
    print("Accuracy: {:0.3f} (+/- {:0.3f})".format(scores.mean(), scores.std() * 2))
    

In [19]:
'''Elastic Net'''
from sklearn.linear_model import ElasticNetCV

t = time.perf_counter()

# Alphas to search over
# Our alpha is usually in the low double digits
# This sets our search space to 250 steps between 10^0=1 and 10^2=100
alphas = np.logspace(-10, 0, 250)

# Suggested l1_ratio from docs
l1_ratio = [.1, .5, .7, .9, .95, .99, 1]

en = ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas, fit_intercept=True, normalize=False)

score_model(en, alpha=True)
print("L1 ratio=",en.l1_ratio_)

elapsed_time = (time.perf_counter() - t)/60
print("This cell took {:0.2f} minutes to run".format(elapsed_time))



Results from ElasticNetCV(alphas=array([1.00000e-10, 1.09688e-10, ..., 9.11674e-01, 1.00000e+00]),
       copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000,
       n_alphas=100, n_jobs=1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0): 
r2=0.353 
MSE=6166234.084               
MAE=1686.747 
MEDAE=1156.234 
EVS=0.363 
alpha=0.013
L1 ratio= 0.1
This cell took 0.52 minutes to run




In [20]:
'''Lasso'''
from sklearn.linear_model import LassoCV

t = time.perf_counter()

lasso = LassoCV(alphas=alphas, n_alphas=250, fit_intercept=True, normalize=False)
score_model(lasso, alpha=True)

elapsed_time = (time.perf_counter() - t)/60
print("This cell took {:0.2f} minutes to run".format(elapsed_time))



Results from LassoCV(alphas=array([1.00000e-10, 1.09688e-10, ..., 9.11674e-01, 1.00000e+00]),
    copy_X=True, cv=None, eps=0.001, fit_intercept=True, max_iter=1000,
    n_alphas=250, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False): 
r2=-7.175 
MSE=77854717.047               
MAE=3647.534 
MEDAE=1273.611 
EVS=-6.795 
alpha=0.912
This cell took 0.08 minutes to run




In [21]:
# Which variables were selected?

# Put coefficients and variable names in df
lassodf = pd.DataFrame(lasso.coef_, index=Xtrain.columns)

# Select nonzeros
results = lassodf[(lassodf.T != 0).any()]

# Sort by magnitude
results['sorted'] = results[0].abs()
results.sort_values(by='sorted', inplace=True, ascending=False)

print("Lasso chooses {} variables".format(len(results)))
print(results)

Lasso chooses 45 variables
                                       0       sorted
cabi_bikes_avail            -1040.418401  1040.418401
cabi_docks_ffx               1027.425243  1027.425243
cabi_stations_ffx            -790.212232   790.212232
us_holiday                   -709.296428   709.296428
apparenttemperaturehigh       699.753656   699.753656
cabi_dur_empty_wdc            599.791878   599.791878
cabi_active_members_annual    570.125585   570.125585
nats_double                   502.469816   502.469816
snow                         -484.546438   484.546438
cabi_dur_empty_arl            482.035829   482.035829
humidity                     -392.643268   392.643268
cabi_dur_empty_ffx           -390.447298   390.447298
cabi_stations_mcs             357.503378   357.503378
dewpoint                      331.317696   331.317696
nats_single                   328.849475   328.849475
precipprobability            -302.107584   302.107584
cos_day_of_year              -293.210414   293.210414
c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [22]:
'''Ridge'''
from sklearn.linear_model import RidgeCV

t = time.perf_counter()

rr = RidgeCV(alphas=alphas, fit_intercept=True, normalize=False)

score_model(rr, alpha=True)

cv_score(rr)

elapsed_time = (time.perf_counter() - t)/60
print("This cell took {:0.2f} minutes to run".format(elapsed_time))

Results from RidgeCV(alphas=array([1.00000e-10, 1.09688e-10, ..., 9.11674e-01, 1.00000e+00]),
    cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
    scoring=None, store_cv_values=False): 
r2=-46.132 
MSE=448844751.709               
MAE=7502.699 
MEDAE=1353.448 
EVS=-42.577 
alpha=0.912
[0.85617746 0.85064933 0.83448946 0.85937594 0.86316262]
Accuracy: 0.853 (+/- 0.020)
This cell took 0.02 minutes to run


In [23]:
'''RF'''
from sklearn.ensemble import RandomForestRegressor

t = time.perf_counter()

rf = RandomForestRegressor() 
score_model(rf)

cv_score(rf)

elapsed_time = (time.perf_counter() - t)/60
print("This cell took {:0.2f} minutes to run".format(elapsed_time))

Results from RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False): 
r2=0.822 
MSE=1694052.266               
MAE=1008.756 
MEDAE=832.300 
EVS=0.837
[0.90048116 0.90698067 0.87783071 0.91605326 0.91430531]
Accuracy: 0.903 (+/- 0.028)
This cell took 0.02 minutes to run


In [24]:
t = time.perf_counter()

cv_score(lasso)

elapsed_time = (time.perf_counter() - t)/60
print("This cell took {:0.2f} minutes to run".format(elapsed_time))



[0.85457201 0.84915847 0.83331284 0.86004123 0.86342485]
Accuracy: 0.852 (+/- 0.021)
This cell took 0.40 minutes to run




In [25]:
end_time = (time.perf_counter() - start_time)/60
print("This notebook took {:0.2f} minutes to run".format(end_time))

This notebook took 1.09 minutes to run


To do:
* No polynomials, 3 polynomials
* How to interpret the coefficients?
* Modify train/test split size