In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Types of models that will be used to test the methods
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
# Scoring metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
training_data = pd.read_csv('./datasets/Train_Data.csv', index_col=0)
testing_data = pd.read_csv('./datasets/Test_Data.csv', index_col=0)
training_data.head()

Unnamed: 0,avg_trip_distance_hour,avg_fare_amount_hour,avg_tolls_amount_hour,avg_tip_amount_hour,avg_Brooklyn_PU_hour,avg_Manhattan_PU_hour,avg_Queens_PU_hour,avg_Staten_Island_PU_hour,avg_Bronx_DO_hour,avg_Brooklyn_DO_hour,...,avg_Queens_DO_hour,avg_Staten_Island_DO_hour,avg_Mins_In_Ride_hour,Year,Month,Day,Hour,Minute,Second,avg_total_amount_hour
902,1.716836,9.837774,0.000649,1.71345,0.006721,0.976207,0.016222,0.0,0.001777,0.015682,...,0.017381,0.0,12.468015,2020.0,2.0,7.0,15.0,0.0,0.0,14.941477
1662,2.177231,9.241164,0.00057,1.524831,0.019652,0.93607,0.041542,0.0,0.003731,0.015423,...,0.030597,0.0,8.979179,2020.0,3.0,10.0,7.0,0.0,0.0,14.548219
4061,2.376608,9.451111,0.0,1.135965,0.02924,0.935673,0.035088,0.0,0.005848,0.035088,...,0.035088,0.0,7.618304,2020.0,6.0,18.0,6.0,0.0,0.0,13.509466
4276,4.196867,15.776386,0.0,1.254699,0.072289,0.759036,0.120482,0.0,0.084337,0.060241,...,0.156627,0.0,12.454337,2020.0,6.0,27.0,5.0,0.0,0.0,15.934359
1772,2.174408,9.309273,0.000672,1.625397,0.008639,0.959683,0.029998,0.0,0.007439,0.032157,...,0.035277,0.0,9.193669,2020.0,3.0,14.0,21.0,0.0,0.0,14.904066


In [3]:
scaled_training_data = pd.read_csv('./datasets/Scaled_Train_Data.csv', index_col=0)
scaled_testing_data = pd.read_csv('./datasets/Scaled_Test_Data.csv', index_col=0)
scaled_training_data.head()

Unnamed: 0,avg_total_amount_hour,avg_trip_distance_hour,avg_fare_amount_hour,avg_tolls_amount_hour,avg_tip_amount_hour,avg_Brooklyn_PU_hour,avg_Manhattan_PU_hour,avg_Queens_PU_hour,avg_Staten_Island_PU_hour,avg_Bronx_DO_hour,...,avg_Manhattan_DO_hour,avg_Queens_DO_hour,avg_Staten_Island_DO_hour,avg_Mins_In_Ride_hour,Year,Month,Day,Hour,Minute,Second
902,-1.049266,-0.393334,-0.179202,0.678041,-0.542879,0.717105,-0.743743,-0.090486,-0.764672,-0.851409,...,-0.864707,-0.085501,1.173908,0.0,-0.87539,-0.992506,0.509073,0.0,0.0,-0.216984
1662,-0.498839,-0.669347,-0.191281,0.184676,-0.225849,0.217055,0.134667,-0.090486,-0.71607,-0.857655,...,-0.543522,-0.085501,-0.72525,0.0,-0.292115,-0.651219,-0.652838,0.0,0.0,-0.431011
4061,-0.260473,-0.572218,-0.278109,-0.832468,0.009221,0.212107,-0.089259,-0.090486,-0.66344,-0.383127,...,-0.434385,-0.085501,-1.466046,0.0,1.457711,0.258879,-0.798077,0.0,0.0,-0.996343
4276,1.915742,2.354081,-0.278109,-0.521899,1.064664,-1.988511,2.873285,-0.090486,1.28823,0.223841,...,2.519381,-0.085501,1.166463,0.0,1.457711,1.282739,-0.943316,0.0,0.0,0.323382
1772,-0.502214,-0.637837,-0.175689,0.447724,-0.495842,0.511244,-0.265848,-0.090486,-0.623868,-0.453837,...,-0.42978,-0.085501,-0.608492,0.0,-0.292115,-0.19617,1.380506,0.0,0.0,-0.237345


In [4]:
training_data = training_data.fillna(0)
testing_data = testing_data.fillna(0)
scaled_training_data = scaled_training_data.fillna(0)
scaled_testing_data = scaled_testing_data.fillna(0)

In [5]:
print(training_data.shape)
testing_data.shape

(3056, 21)


(1310, 21)

In [6]:
print(scaled_training_data.shape)
scaled_testing_data.shape

(3056, 21)


(1310, 21)

In [7]:
# splitting the data into training and testing variables
X_train = training_data.drop(['avg_total_amount_hour'], axis=1)
X_test = testing_data.drop(['avg_total_amount_hour'], axis=1)
y_train = training_data[['avg_total_amount_hour']]
y_test = testing_data[['avg_total_amount_hour']]

In [8]:
# splitting the scaled data into training and testing variables
scaled_X_train = scaled_training_data.drop(['avg_total_amount_hour'], axis=1)
scaled_X_test = scaled_testing_data.drop(['avg_total_amount_hour'], axis=1)
scaled_y_train = scaled_training_data['avg_total_amount_hour']
scaled_y_test = scaled_testing_data['avg_total_amount_hour']

In [85]:
def get_metrics(model, X_train_param, X_test_param, y_train_param, y_test_param):
    model.fit(X_train_param, y_train_param)
    model_train_pred = model.predict(X_train_param)
    model_test_pred = model.predict(X_test_param)
    
    model_train_RMSE = mean_squared_error(y_train_param, model_train_pred)
    model_train_R2 = r2_score(y_train_param, model_train_pred)
    model_test_RMSE = mean_squared_error(y_test_param, model_test_pred)
    model_test_R2 = r2_score(y_test_param, model_test_pred)
    return model, model_train_RMSE, model_test_RMSE, model_train_R2, model_test_R2


In [68]:
linreg = LinearRegression()
lin_metrics = get_metrics(linreg, X_train, X_test, y_train, y_test)
lin_metrics[4]

0.594062114572092

In [69]:
linreg_parameters = {'RMSE_train': lin_metrics[1], 'RMSE_test': lin_metrics[2], 
                    'r2_train': lin_metrics[3], 'r2_test': lin_metrics[4]}

In [70]:
linreg_coef = {}
for i in range(len(X_train.columns)):
    key = '{}_coef'.format(X_train.columns[i])
    linreg_coef[key] = lin_metrics[0].coef_[0][i]
linreg_coef['Intercept'] = lin_metrics[0].intercept_[0]
linreg_coef

{'avg_trip_distance_hour_coef': 0.02384654212514587,
 'avg_fare_amount_hour_coef': 0.5021296434974725,
 'avg_tolls_amount_hour_coef': -13.201861677706658,
 'avg_tip_amount_hour_coef': 0.8110932593188867,
 'avg_Brooklyn_PU_hour_coef': 6.943171260955081,
 'avg_Manhattan_PU_hour_coef': 9.817486123465908,
 'avg_Queens_PU_hour_coef': 1.8222610628156666,
 'avg_Staten_Island_PU_hour_coef': -223.23728079827234,
 'avg_Bronx_DO_hour_coef': -15.922328132451629,
 'avg_Brooklyn_DO_hour_coef': -10.322684509656238,
 'avg_Manhattan_DO_hour_coef': -20.486324124754695,
 'avg_Queens_DO_hour_coef': -12.139197154780382,
 'avg_Staten_Island_DO_hour_coef': 209.09171271860154,
 'avg_Mins_In_Ride_hour_coef': 0.20491038391526423,
 'Year_coef': -0.00047627714550678484,
 'Month_coef': 0.02329995256293893,
 'Day_coef': 0.003791881407247746,
 'Hour_coef': 0.032330518738870884,
 'Minute_coef': 0.0,
 'Second_coef': 0.0,
 'Intercept': 17.17781414775334}

In [71]:
linreg_scaled = LinearRegression()
scaled_lin_metrics = get_metrics(linreg_scaled, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_lin_metrics[4]

0.9712573105995855

In [72]:
scaled_linreg_parameters = {'RMSE_train': scaled_lin_metrics[1], 'RMSE_test': scaled_lin_metrics[2], 
                    'r2_train': scaled_lin_metrics[3], 'r2_test': scaled_lin_metrics[4]}

In [73]:
scaled_linreg_coef = {}
for i in range(len(scaled_X_train.columns)):
    key = '{}_coef'.format(scaled_X_train.columns[i])
    scaled_linreg_coef[key] = scaled_lin_metrics[0].coef_[i]
scaled_linreg_coef['Intercept'] = scaled_lin_metrics[0].intercept_
scaled_linreg_coef

{'avg_trip_distance_hour_coef': 0.9368319108337128,
 'avg_fare_amount_hour_coef': 0.027811524573117235,
 'avg_tolls_amount_hour_coef': -0.012950816054801294,
 'avg_tip_amount_hour_coef': -0.10898108824293816,
 'avg_Brooklyn_PU_hour_coef': -0.118419902434686,
 'avg_Manhattan_PU_hour_coef': -0.020726934647550557,
 'avg_Queens_PU_hour_coef': 0.016974839603512308,
 'avg_Staten_Island_PU_hour_coef': -0.451189306345183,
 'avg_Bronx_DO_hour_coef': -0.410699656255767,
 'avg_Brooklyn_DO_hour_coef': -1.3578555549561142,
 'avg_Manhattan_DO_hour_coef': -0.5034333577304753,
 'avg_Queens_DO_hour_coef': -0.031162639108591312,
 'avg_Staten_Island_DO_hour_coef': -0.27108070404535906,
 'avg_Mins_In_Ride_hour_coef': -7.173211677424796e-14,
 'Year_coef': 0.010315415228993314,
 'Month_coef': -0.01605441910696027,
 'Day_coef': 0.017178707772479148,
 'Hour_coef': 0.0,
 'Minute_coef': 0.0,
 'Second_coef': 0.014458656282033014,
 'Intercept': 7.697814552103426e-16}

# Ridge Regression

In [74]:
ridge = Ridge()
params = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
ridge_cv = GridSearchCV(ridge, param_grid=params, cv=5)
ridge_cv.fit(X_train, y_train)
ridge_cv.best_params_

{'alpha': 0.0001}

In [75]:
ridgereg = Ridge(0.0001)
ridge_metrics = get_metrics(ridgereg, X_train, X_test, y_train, y_test)
ridge_metrics[4]

0.594020396090598

In [76]:
ridge_parameters = {'RMSE_train': ridge_metrics[1], 'RMSE_test': ridge_metrics[2], 
                    'r2_train': ridge_metrics[3], 'r2_test': ridge_metrics[4], 'alpha': 0.1}

In [77]:
ridgereg_coef = {}
for i in range(len(X_train.columns)):
    key = '{}_coef'.format(X_train.columns[i])
    ridgereg_coef[key] = ridge_metrics[0].coef_[0][i]
ridgereg_coef['Intercept'] = ridge_metrics[0].intercept_[0]
ridgereg_coef

{'avg_trip_distance_hour_coef': 0.019664663101736986,
 'avg_fare_amount_hour_coef': 0.5013097535432647,
 'avg_tolls_amount_hour_coef': -13.318141741407267,
 'avg_tip_amount_hour_coef': 0.8118761298032134,
 'avg_Brooklyn_PU_hour_coef': 7.074598563238224,
 'avg_Manhattan_PU_hour_coef': 9.969755841730134,
 'avg_Queens_PU_hour_coef': 1.9865204675116548,
 'avg_Staten_Island_PU_hour_coef': -103.09636147262144,
 'avg_Bronx_DO_hour_coef': -15.968981831891671,
 'avg_Brooklyn_DO_hour_coef': -10.426029596733583,
 'avg_Manhattan_DO_hour_coef': -20.63584651134562,
 'avg_Queens_DO_hour_coef': -12.239798495082068,
 'avg_Staten_Island_DO_hour_coef': 85.90488449116557,
 'avg_Mins_In_Ride_hour_coef': 0.205584575052608,
 'Year_coef': -0.00047676704535492165,
 'Month_coef': 0.023768205669930143,
 'Day_coef': 0.003826189586317743,
 'Hour_coef': 0.0322504277551859,
 'Minute_coef': 0.0,
 'Second_coef': 0.0,
 'Intercept': 17.18003029535437}

In [78]:
scaled_ridgereg = Ridge(0.0001)
scaled_ridge_metrics = get_metrics(scaled_ridgereg, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_ridge_metrics[4]

0.9712585330009592

In [79]:
scaled_ridge_parameters = {'RMSE_train': scaled_ridge_metrics[1], 'RMSE_test': scaled_ridge_metrics[2], 
                    'r2_train': scaled_ridge_metrics[3], 'r2_test': scaled_ridge_metrics[4], 'alpha': 0.1}

In [80]:
scaled_ridgereg_coef = {}
for i in range(len(scaled_X_train.columns)):
    key = '{}_coef'.format(scaled_X_train.columns[i])
    scaled_ridgereg_coef[key] = scaled_ridge_metrics[0].coef_[i]
scaled_ridgereg_coef['Intercept'] = scaled_ridge_metrics[0].intercept_
scaled_ridgereg_coef

{'avg_trip_distance_hour_coef': 0.9368283026992433,
 'avg_fare_amount_hour_coef': 0.027811068439086227,
 'avg_tolls_amount_hour_coef': -0.01295452780162604,
 'avg_tip_amount_hour_coef': -0.10898234319360971,
 'avg_Brooklyn_PU_hour_coef': -0.11842544739665839,
 'avg_Manhattan_PU_hour_coef': -0.02072954035370628,
 'avg_Queens_PU_hour_coef': 0.016974111925321406,
 'avg_Staten_Island_PU_hour_coef': -0.3556863286835019,
 'avg_Bronx_DO_hour_coef': -0.3122867799225532,
 'avg_Brooklyn_DO_hour_coef': -1.1074805633756415,
 'avg_Manhattan_DO_hour_coef': -0.4057166006529058,
 'avg_Queens_DO_hour_coef': -0.029018662146743788,
 'avg_Staten_Island_DO_hour_coef': -0.2710798832443245,
 'avg_Mins_In_Ride_hour_coef': 0.0,
 'Year_coef': 0.01031468588741437,
 'Month_coef': -0.01605281431629801,
 'Day_coef': 0.01717934035819822,
 'Hour_coef': 0.0,
 'Minute_coef': 0.0,
 'Second_coef': 0.014456461247907677,
 'Intercept': 6.289765582292031e-16}

# Lasso Regression

In [86]:
LassoReg = Lasso(alpha=0.001)
LassoReg.fit(X_train, y_train)
ls_y_pred = LassoReg.predict(X_test)
print(mean_squared_error(y_test, ls_y_pred))
r2_score(y_test, ls_y_pred)

1.125729883132359


0.5941233057975904

In [87]:
lasso = Lasso()
params = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
lasso_cv = GridSearchCV(lasso, param_grid=params, cv=5)
lasso_cv.fit(X_train, y_train)
lasso_cv.best_params_

  positive)
  positive)


{'alpha': 0.001}

In [88]:
lassoreg = Lasso(alpha=0.001)
lassoreg_metrics = get_metrics(lassoreg, X_train, X_test, y_train, y_test)
lassoreg_metrics

(Lasso(alpha=0.001),
 1.2723834506261653,
 1.125729883132359,
 0.631406605923204,
 0.5941233057975904)

In [89]:
lasso_parameters = {'RMSE_train': lassoreg_metrics[1], 'RMSE_test': lassoreg_metrics[2], 
                    'r2_train': lassoreg_metrics[3], 'r2_test': lassoreg_metrics[4], 'alpha': 0.001}

In [90]:
lassoreg_coef = {}
for i in range(len(X_train.columns)):
    key = '{}_coef'.format(X_train.columns[i])
    lassoreg_coef[key] = lassoreg_metrics[0].coef_[i]
lassoreg_coef['Intercept'] = lassoreg_metrics[0].intercept_[0]
lassoreg_coef

{'avg_trip_distance_hour_coef': -0.07917711812953268,
 'avg_fare_amount_hour_coef': 0.46613376752928226,
 'avg_tolls_amount_hour_coef': -0.0,
 'avg_tip_amount_hour_coef': 0.9944909738721761,
 'avg_Brooklyn_PU_hour_coef': -0.0,
 'avg_Manhattan_PU_hour_coef': 3.236460432398636,
 'avg_Queens_PU_hour_coef': -2.3099802092195074,
 'avg_Staten_Island_PU_hour_coef': -0.0,
 'avg_Bronx_DO_hour_coef': -3.8126303747568366,
 'avg_Brooklyn_DO_hour_coef': 0.0,
 'avg_Manhattan_DO_hour_coef': -9.05097787598441,
 'avg_Queens_DO_hour_coef': -0.0,
 'avg_Staten_Island_DO_hour_coef': -0.0,
 'avg_Mins_In_Ride_hour_coef': 0.20482519164723043,
 'Year_coef': -0.0004106936855137426,
 'Month_coef': 0.018504445220802038,
 'Day_coef': 0.003958213367615454,
 'Hour_coef': 0.02949980834715494,
 'Minute_coef': 0.0,
 'Second_coef': 0.0,
 'Intercept': 12.417772586370225}

In [91]:
scaled_lassoreg = Lasso(0.001)
scaled_lassoreg_metrics = get_metrics(scaled_lassoreg, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_lassoreg_metrics[4]

0.9713568925262316

In [92]:
scaled_lasso_parameters = {'RMSE_train': scaled_lassoreg_metrics[1], 'RMSE_test': scaled_lassoreg_metrics[2], 
                    'r2_train': scaled_lassoreg_metrics[3], 'r2_test': scaled_lassoreg_metrics[4], 'alpha': 0.001}

In [93]:
scaled_lassoreg_coef = {}
for i in range(len(scaled_X_train.columns)):
    key = '{}_coef'.format(scaled_X_train.columns[i])
    scaled_lassoreg_coef[key] = scaled_lassoreg_metrics[0].coef_[i]
scaled_lassoreg_coef['Intercept'] = scaled_lassoreg_metrics[0].intercept_
scaled_lassoreg_coef

{'avg_trip_distance_hour_coef': 0.9256104556267888,
 'avg_fare_amount_hour_coef': 0.02721629893142827,
 'avg_tolls_amount_hour_coef': -0.01227456349316627,
 'avg_tip_amount_hour_coef': -0.06549482374968447,
 'avg_Brooklyn_PU_hour_coef': -0.06461865638872859,
 'avg_Manhattan_PU_hour_coef': 0.0,
 'avg_Queens_PU_hour_coef': 0.0,
 'avg_Staten_Island_PU_hour_coef': 0.051561225717579026,
 'avg_Bronx_DO_hour_coef': 0.09366839986010177,
 'avg_Brooklyn_DO_hour_coef': -0.06737748101726299,
 'avg_Manhattan_DO_hour_coef': 0.0,
 'avg_Queens_DO_hour_coef': -0.0021848443165270646,
 'avg_Staten_Island_DO_hour_coef': -0.2642899494456789,
 'avg_Mins_In_Ride_hour_coef': 0.0,
 'Year_coef': 0.01033008838638731,
 'Month_coef': -0.014866623271837399,
 'Day_coef': 0.015088698104426184,
 'Hour_coef': 0.0,
 'Minute_coef': 0.0,
 'Second_coef': 0.011356311264934934,
 'Intercept': 1.754161723581888e-17}

# KNearest Neighbors

In [94]:
knn = KNeighborsRegressor()
params = {'n_neighbors': np.arange(1, 10)}
knn_cv = GridSearchCV(knn, param_grid=params, cv=5)
knn_cv.fit(X_train, y_train)
knn_cv.best_params_

{'n_neighbors': 6}

In [95]:
k_nn = KNeighborsRegressor(n_neighbors=6)
k_nn_metrics = get_metrics(k_nn, X_train, X_test, y_train, y_test)
k_nn_metrics[4]

0.6928458213484202

In [96]:
k_nn_parameters = {'RMSE_train': k_nn_metrics[1], 'RMSE_test': k_nn_metrics[2], 
                    'r2_train': k_nn_metrics[3], 'r2_test': k_nn_metrics[4], 'n_neighbors': 8}

In [97]:
scaled_knn_metrics = get_metrics(k_nn, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_knn_metrics[4]

0.9576602053640365

In [98]:
scaled_k_nn_parameters = {'RMSE_train': scaled_knn_metrics[1], 'RMSE_test': scaled_knn_metrics[2], 
                    'r2_train': scaled_knn_metrics[3], 'r2_test': scaled_knn_metrics[4], 'n_neighbors': 8}

# Decision Tree

In [99]:
decisiontree = DecisionTreeRegressor()
params = {'max_depth': np.arange(1, 10)}
decisiontree = GridSearchCV(decisiontree, param_grid=params, cv=5)
decisiontree.fit(X_train, y_train)
decisiontree.best_params_

{'max_depth': 8}

In [100]:
tree = DecisionTreeRegressor(max_depth=8)
tree_metrics = get_metrics(tree, X_train, X_test, y_train, y_test)
tree_metrics[4]

0.7621444237823523

In [101]:
tree_parameters = {'RMSE_train': tree_metrics[1], 'RMSE_test': tree_metrics[2], 
                    'r2_train': tree_metrics[3], 'r2_test': tree_metrics[4], 'max_depth':8}

In [102]:
scaled_tree_metrics = get_metrics(tree, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_tree_metrics[4]

0.9725197295610699

In [103]:
scaled_tree_parameters = {'RMSE_train': scaled_tree_metrics[1], 'RMSE_test': scaled_tree_metrics[2], 
                    'r2_train': scaled_tree_metrics[3], 'r2_test': scaled_tree_metrics[4], 'max_depth':8}

# Random Forest Regressor

In [104]:
rfr = RandomForestRegressor()
params = {'n_estimators': np.arange(1, 10),
         'max_depth': np.arange(1, 10)}
rfr_cv = GridSearchCV(rfr, param_grid=params, cv=5)
rfr_cv.fit(X_train, y_train)
rfr_cv.best_params_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


{'max_depth': 9, 'n_estimators': 9}

In [105]:
rfrreg = RandomForestRegressor(max_depth=9, n_estimators=8)
rfrreg_metrics = get_metrics(rfrreg, X_train, X_test, y_train, y_test)
rfrreg_metrics[4]

  


0.8198388617742726

In [106]:
rfrtrees_parameters = {'RMSE_train': rfrreg_metrics[1], 'RMSE_test': rfrreg_metrics[2], 
                    'r2_train': rfrreg_metrics[3], 'r2_test': rfrreg_metrics[4], 'n_estimators': 5, 'max_depth': 9}

In [107]:
scaled_rfrreg_metrics = get_metrics(rfrreg, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_rfrreg_metrics[4]

0.9842312646104325

In [108]:
scaled_rfrtrees_parameters = {'RMSE_train': scaled_rfrreg_metrics[1], 'RMSE_test': scaled_rfrreg_metrics[2], 
                              'r2_train': scaled_rfrreg_metrics[3], 'r2_test': scaled_rfrreg_metrics[4],
                              'n_estimators': 5, 'max_depth': 9}

# Gradient Boosting

In [None]:
gradboost = GradientBoostingRegressor()
params = {'n_estimators': np.arange(1, 10),
         'max_depth': np.arange(1, 10)}
gradboost_cv = GridSearchCV(gradboost, param_grid=params, cv=5)
gradboost_cv.fit(X_train, y_train)
gradboost_cv.best_params_

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [None]:
gradientboost = GradientBoostingRegressor(max_depth=9, n_estimators=9)
gradient_metrics = get_metrics(gradientboost, X_train, X_test, y_train, y_test)
gradient_metrics[4]

In [None]:
gradientboosting_parameters = {'RMSE_train': gradient_metrics[1], 'RMSE_test': gradient_metrics[2], 
                               'r2_train': gradient_metrics[3], 'r2_test': gradient_metrics[4],
                              'n_estimators': 9, 'max_depth': 9}

In [None]:
scaled_gradient_metrics = get_metrics(gradientboost, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_gradient_metrics[4]

In [None]:
scaled_gradientboosting_parameters = {'RMSE_train': scaled_gradient_metrics[1], 'RMSE_test': scaled_gradient_metrics[2], 
                               'r2_train': scaled_gradient_metrics[3], 'r2_test': scaled_gradient_metrics[4],
                              'n_estimators': 9, 'max_depth': 9}

# Ada Boost

In [None]:
adaBoost = AdaBoostRegressor()
params = {'n_estimators': np.arange(1, 10)}
adaBoost_cv = GridSearchCV(adaBoost, param_grid=params, cv=5)
adaBoost_cv.fit(X_train, y_train)
adaBoost_cv.best_params_

In [None]:
adaboosting = AdaBoostRegressor(n_estimators=8)
adaboosting_metrics = get_metrics(adaboosting, X_train, X_test, y_train, y_test)
adaboosting_metrics[4]

In [None]:
adaboosting_parameters = {'RMSE_train': adaboosting_metrics[1], 'RMSE_test': adaboosting_metrics[2], 
                          'r2_train': adaboosting_metrics[3], 'r2_test': adaboosting_metrics[4], 'n_estimators': 6}

In [None]:
scaled_adaboosting_metrics = get_metrics(adaboosting, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_adaboosting_metrics[4]

In [None]:
scaled_adaboosting_parameters = {'RMSE_train': scaled_adaboosting_metrics[1], 'RMSE_test': scaled_adaboosting_metrics[2], 
                          'r2_train': scaled_adaboosting_metrics[3], 'r2_test': scaled_adaboosting_metrics[4],
                          'n_estimators': 6}

In [None]:
df = pd.DataFrame([linreg_parameters, ridge_parameters, lasso_parameters, k_nn_parameters, tree_parameters,
                  rfrtrees_parameters, gradientboosting_parameters, adaboosting_parameters],
                 index=['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'KNearestNeighbors',
                       'Single Decison Tree', 'Random Forest', 'Gradient Boosting', 'AdaBoosting'])

df_scaled = pd.DataFrame([scaled_linreg_parameters, scaled_ridge_parameters, scaled_lasso_parameters,
                          scaled_k_nn_parameters, scaled_tree_parameters, scaled_rfrtrees_parameters,
                          scaled_gradientboosting_parameters, scaled_adaboosting_parameters],
                 index=['Scaled Linear Regression', 'Scaled Ridge Regression', 'Scaled Lasso Regression',
                        'Scaled KNearestNeighbors', 'Scaled Single Decison Tree', 'Scaled Random Forest',
                        'Scaled Gradient Boosting', 'Scaled AdaBoosting'])

The best performing model is the RandomForest Regressor with max_depth=9 and n_estimators=5.

In [None]:
df_coef = pd.DataFrame([linreg_coef, ridgereg_coef, lassoreg_coef], index=['Linear Regression Coefficients',
                                                                          'Ridge Regression Coefficients',
                                                                          'Lasso Regression Coefficients'])

df_coef_scaled = pd.DataFrame([scaled_linreg_coef, scaled_ridgereg_coef, scaled_lassoreg_coef],
                              index=['Scaled Linear Regression Coefficients', 'Scaled Ridge Regression Coefficients',
                                    'Scaled Lasso Regression Coefficients'])

In [None]:
df

In [None]:
df_scaled

In [None]:
df_coef

In [None]:
df_coef_scaled

In [None]:
df.to_csv('./datasets/Model_Metrics.csv')
df_scaled.to_csv('./datasets/Scaled_Model_Metrics')
df_coef.to_csv('./datasets/Coef_Metrics')
df_coef_scaled.to_csv('./datasets/Scaled_Coef_Metrics')