In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Types of models that will be used to test the methods
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
# Scoring metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
training_data = pd.read_csv('./datasets/Train_Data.csv', index_col=0)
testing_data = pd.read_csv('./datasets/Test_Data.csv', index_col=0)
training_data.head()

  mask |= (ar1 == a)


Unnamed: 0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,congestion_surcharge,VendorID_1,...,store_and_fwd_flag_NK,store_and_fwd_flag_Y,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,total_amount
6665106,6.0,2.04,10.5,0.0,0.5,1.0,0.0,0.3,2.5,0,...,0,0,1,0,0,0,0,0,0,14.8
3115270,3.0,1.19,7.5,1.0,0.5,1.5,0.0,0.3,2.5,0,...,0,0,0,1,0,0,0,0,0,13.3
16480546,2.0,7.99,27.0,0.5,0.5,0.0,0.0,0.3,2.5,0,...,0,0,0,0,0,1,0,0,0,30.8
13418577,1.0,1.84,11.0,1.0,0.5,3.06,0.0,0.3,2.5,0,...,0,0,0,0,1,0,0,0,0,18.36
4318540,2.0,1.5,8.0,3.5,0.5,2.46,0.0,0.3,2.5,1,...,0,0,0,0,0,0,1,0,0,14.76


In [3]:
scaled_training_data = pd.read_csv('./datasets/Scaled_Train_Data.csv', index_col=0)
scaled_testing_data = pd.read_csv('./datasets/Scaled_Test_Data.csv', index_col=0)
scaled_training_data.head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,...,store_and_fwd_flag_N,store_and_fwd_flag_NK,store_and_fwd_flag_Y,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
6665106,3.965663,-0.006628,-0.008853,-0.877712,0.098219,-0.435445,-0.193567,0.060195,0.314756,-0.701261,...,-0.130153,-0.098888,2.634261,-0.403813,-0.427468,-0.439033,-0.440066,-0.40169,-0.363246,-0.015509
3115270,1.341061,-0.012098,-0.021701,-0.081851,0.098219,-0.243078,-0.193567,0.060195,0.314756,-0.701261,...,-0.130153,-0.098888,-0.379613,2.476392,-0.427468,-0.439033,-0.440066,-0.40169,-0.363246,-0.02193
16480546,0.466193,0.031663,0.061814,-0.479782,0.098219,-0.820178,-0.193567,0.060195,0.314756,-0.701261,...,-0.130153,-0.098888,-0.379613,-0.403813,-0.427468,2.277734,-0.440066,-0.40169,-0.363246,0.052976
13418577,-0.408674,-0.007915,-0.006711,-0.081851,0.098219,0.357106,-0.193567,0.060195,0.314756,-0.701261,...,-0.130153,-0.098888,-0.379613,-0.403813,2.339355,-0.439033,-0.440066,-0.40169,-0.363246,-0.000271
4318540,0.466193,-0.010103,-0.01956,1.907801,0.098219,0.126266,-0.193567,0.060195,0.314756,1.426003,...,-0.130153,-0.098888,-0.379613,-0.403813,-0.427468,-0.439033,2.272386,-0.40169,-0.363246,-0.015681


In [4]:
training_data = training_data.dropna()
testing_data = testing_data.dropna()
scaled_training_data = scaled_training_data.dropna()
scaled_testing_data = scaled_testing_data.dropna()

In [5]:
print(training_data.shape)
testing_data.shape

(11671252, 71)


(5001966, 71)

In [6]:
print(scaled_training_data.shape)
scaled_testing_data.shape

(11671252, 71)


(5001966, 71)

In [7]:
# splitting the data into training and testing variables
X_train = training_data.drop(['total_amount'], axis=1)
X_test = testing_data.drop(['total_amount'], axis=1)
y_train = training_data[['total_amount']]
y_test = testing_data[['total_amount']]

In [8]:
# splitting the scaled data into training and testing variables
scaled_X_train = scaled_training_data.drop(['total_amount'], axis=1)
scaled_X_test = scaled_testing_data.drop(['total_amount'], axis=1)
scaled_y_train = scaled_training_data['total_amount']
scaled_y_test = scaled_testing_data['total_amount']

In [9]:
def get_metrics(model, X_train_param, X_test_param, y_train_param, y_test_param):
    model.fit(X_train_param, y_train_param)
    model_train_pred = model.predict(X_train_param)
    model_test_pred = model.predict(X_test_param)
    
    model_train_RMSE = mean_squared_error(y_train_param, model_train_pred, squared=False)
    model_train_R2 = r2_score(y_train_param, model_train_pred)
    model_test_RMSE = mean_squared_error(y_test_param, model_test_pred, squared=False)
    model_test_R2 = r2_score(y_test_param, model_test_pred)
    return model, model_train_RMSE, model_test_RMSE, model_train_R2, model_test_R2

In [10]:
linreg = LinearRegression()
lin_metrics = get_metrics(linreg, X_train, X_test, y_train, y_test)
lin_metrics[4]

0.8879021543847992

In [11]:
linreg_parameters = {'RMSE_train': lin_metrics[1], 'RMSE_test': lin_metrics[2], 
                    'r2_train': lin_metrics[3], 'r2_test': lin_metrics[4]}

In [12]:
linreg_coef = {}
for i in range(len(X_train.columns)):
    key = '{}_coef'.format(X_train.columns[i])
    linreg_coef[key] = lin_metrics[0].coef_[0][i]
linreg_coef['Intercept'] = lin_metrics[0].intercept_[0]
linreg_coef

{'passenger_count_coef': 0.0010140987583700898,
 'trip_distance_coef': -2.755065277554314e-06,
 'fare_amount_coef': 1.0000143246154896,
 'extra_coef': 0.6869719846780284,
 'mta_tax_coef': 0.6431153641701794,
 'tip_amount_coef': 0.99847231550015,
 'tolls_amount_coef': 1.0022625237294935,
 'improvement_surcharge_coef': 3.7649383131935177,
 'congestion_surcharge_coef': 0.7678819360907848,
 'VendorID_1_coef': -0.8073319175067712,
 'VendorID_2_coef': 0.7921318061864712,
 'RateCode_1_coef': -0.13233165390187895,
 'RateCode_2_coef': 0.0499497614681601,
 'RateCode_3_coef': -0.3224291379658477,
 'RateCode_4_coef': 0.025056968066723195,
 'RateCode_5_coef': -0.12601175741838272,
 'RateCode_6_coef': 0.372174582729602,
 'RateCode_99_coef': 0.11839112570082488,
 'payment_type_1_coef': -0.37854995029314276,
 'payment_type_2_coef': -0.40365375327865927,
 'payment_type_3_coef': -0.17567500434111238,
 'payment_type_4_coef': -0.26303143931464734,
 'payment_type_5_coef': 1.205710035908169,
 'Bronx_PU_coef

In [13]:
linreg_scaled = LinearRegression()
scaled_lin_metrics = get_metrics(linreg_scaled, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_lin_metrics[4]

-105176.60888984652

In [14]:
scaled_linreg_parameters = {'RMSE_train': scaled_lin_metrics[1], 'RMSE_test': scaled_lin_metrics[2], 
                    'r2_train': scaled_lin_metrics[3], 'r2_test': scaled_lin_metrics[4]}

In [15]:
scaled_linreg_coef = {}
for i in range(len(scaled_X_train.columns)):
    key = '{}_coef'.format(scaled_X_train.columns[i])
    scaled_linreg_coef[key] = scaled_lin_metrics[0].coef_[i]
scaled_linreg_coef['Intercept'] = scaled_lin_metrics[0].intercept_
scaled_linreg_coef

{'passenger_count_coef': 0.00020378634322531223,
 'trip_distance_coef': 0.0007117353807072763,
 'fare_amount_coef': -257.5202631108437,
 'extra_coef': -0.8118620294768008,
 'mta_tax_coef': -0.007657645184876938,
 'tip_amount_coef': -2.82837147238977,
 'tolls_amount_coef': -1.8001088370328453,
 'improvement_surcharge_coef': -0.07404733912002398,
 'congestion_surcharge_coef': 8623338795.783451,
 'VendorID_1_coef': 8728018946.786913,
 'VendorID_2_coef': 78182417435.04703,
 'RateCode_1_coef': 55699227643.58955,
 'RateCode_2_coef': 16376387581.421568,
 'RateCode_3_coef': 3900635210.3651886,
 'RateCode_4_coef': 22131607922.14804,
 'RateCode_5_coef': 775974109.7791394,
 'RateCode_6_coef': 2370599439.5047107,
 'RateCode_99_coef': 173139876477.19528,
 'payment_type_1_coef': 167810373100.50708,
 'payment_type_2_coef': 28622580504.26202,
 'payment_type_3_coef': 21327002860.13546,
 'payment_type_4_coef': 113738850.5897848,
 'payment_type_5_coef': -0.026105560408711703,
 'Bronx_PU_coef': -0.0577795

# Ridge Regression

In [18]:
ridge = Ridge()
params = {'alpha': [1, 0.1, 0.01, 0.001]}
ridge_cv = GridSearchCV(ridge, param_grid=params, cv=5)
ridge_cv.fit(X_train, y_train)
ridge_cv.best_params_

{'alpha': 1}

In [20]:
ridgereg = Ridge(alpha=ridge_cv.best_params_['alpha'])
ridge_metrics = get_metrics(ridgereg, X_train, X_test, y_train, y_test)
ridge_metrics[4]

0.8880707725776429

In [21]:
ridge_parameters = {'RMSE_train': ridge_metrics[1], 'RMSE_test': ridge_metrics[2], 
                    'r2_train': ridge_metrics[3], 'r2_test': ridge_metrics[4], 'alpha': 0.1}

In [22]:
ridgereg_coef = {}
for i in range(len(X_train.columns)):
    key = '{}_coef'.format(X_train.columns[i])
    ridgereg_coef[key] = ridge_metrics[0].coef_[0][i]
ridgereg_coef['Intercept'] = ridge_metrics[0].intercept_[0]
ridgereg_coef

{'passenger_count_coef': 0.0010142223469673899,
 'trip_distance_coef': -2.7554298464738226e-06,
 'fare_amount_coef': 1.0000143252238143,
 'extra_coef': 0.6869707964731465,
 'mta_tax_coef': 0.6436205873107322,
 'tip_amount_coef': 0.9984719949953605,
 'tolls_amount_coef': 1.0022622091174014,
 'improvement_surcharge_coef': 3.763698174686881,
 'congestion_surcharge_coef': 0.7678882111834207,
 'VendorID_1_coef': -0.8179436857074434,
 'VendorID_2_coef': 0.7815142962059208,
 'RateCode_1_coef': -0.1343051120342748,
 'RateCode_2_coef': 0.04797360754784556,
 'RateCode_3_coef': -0.32409210424543294,
 'RateCode_4_coef': 0.023076613599855536,
 'RateCode_5_coef': -0.12780748391913485,
 'RateCode_6_coef': 0.3626643002251231,
 'RateCode_99_coef': 0.11605948042204121,
 'payment_type_1_coef': -0.24925243257635085,
 'payment_type_2_coef': -0.27435705817707856,
 'payment_type_3_coef': -0.0464202173919956,
 'payment_type_4_coef': -0.13384142214870626,
 'payment_type_5_coef': 0.6674411853870007,
 'Bronx_PU_

In [24]:
scaled_ridgereg = Ridge(alpha=ridge_cv.best_params_['alpha'])
scaled_ridge_metrics = get_metrics(scaled_ridgereg, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_ridge_metrics[4]

-108886.95302625443

In [25]:
scaled_ridge_parameters = {'RMSE_train': scaled_ridge_metrics[1], 'RMSE_test': scaled_ridge_metrics[2], 
                    'r2_train': scaled_ridge_metrics[3], 'r2_test': scaled_ridge_metrics[4], 'alpha': 0.1}

In [26]:
scaled_ridgereg_coef = {}
for i in range(len(scaled_X_train.columns)):
    key = '{}_coef'.format(scaled_X_train.columns[i])
    scaled_ridgereg_coef[key] = scaled_ridge_metrics[0].coef_[i]
scaled_ridgereg_coef['Intercept'] = scaled_ridge_metrics[0].intercept_
scaled_ridgereg_coef

{'passenger_count_coef': 0.00036959347076250643,
 'trip_distance_coef': 0.0008693604921295292,
 'fare_amount_coef': -247.277286605363,
 'extra_coef': -0.7641097336080122,
 'mta_tax_coef': -0.0023280036103667757,
 'tip_amount_coef': -2.712070563652221,
 'tolls_amount_coef': -1.7249341244748437,
 'improvement_surcharge_coef': -0.06302780636845079,
 'congestion_surcharge_coef': 0.3400781401790996,
 'VendorID_1_coef': -0.32968673171190493,
 'VendorID_2_coef': 0.016657768005778555,
 'RateCode_1_coef': 0.0020136266825534843,
 'RateCode_2_coef': -0.009497829192261614,
 'RateCode_3_coef': -0.001853051797742237,
 'RateCode_4_coef': -0.004412801808336983,
 'RateCode_5_coef': -0.001955915943966158,
 'RateCode_6_coef': -0.00042980716809392527,
 'RateCode_99_coef': 0.002610950055111721,
 'payment_type_1_coef': 0.008229087891629495,
 'payment_type_2_coef': -0.015444375384061527,
 'payment_type_3_coef': -0.01054716510199002,
 'payment_type_4_coef': -0.00019693011141439155,
 'payment_type_5_coef': -0.

# Lasso Regression

In [27]:
lasso = Lasso()
params = {'alpha': [1, 0.1, 0.01, 0.001]}
lasso_cv = GridSearchCV(lasso, param_grid=params, cv=5)
lasso_cv.fit(X_train, y_train)
lasso_cv.best_params_

{'alpha': 0.001}

In [28]:
lassoreg = Lasso(alpha=lasso_cv.best_params_['alpha'])
lassoreg_metrics = get_metrics(lassoreg, X_train, X_test, y_train, y_test)
lassoreg_metrics

(Lasso(alpha=0.001),
 0.288453983994175,
 29.333617782659687,
 0.9999984755688047,
 0.9956997801324088)

In [29]:
lasso_parameters = {'RMSE_train': lassoreg_metrics[1], 'RMSE_test': lassoreg_metrics[2], 
                    'r2_train': lassoreg_metrics[3], 'r2_test': lassoreg_metrics[4], 'alpha': 0.001}

In [30]:
lassoreg_coef = {}
for i in range(len(X_train.columns)):
    key = '{}_coef'.format(X_train.columns[i])
    lassoreg_coef[key] = lassoreg_metrics[0].coef_[i]
lassoreg_coef['Intercept'] = lassoreg_metrics[0].intercept_[0]
lassoreg_coef

{'passenger_count_coef': 0.0008215327970705915,
 'trip_distance_coef': -2.9112501005916087e-06,
 'fare_amount_coef': 1.0000161001981938,
 'extra_coef': 0.6771094335135825,
 'mta_tax_coef': 1.1916867168594378,
 'tip_amount_coef': 0.9998425174014166,
 'tolls_amount_coef': 1.0016874362307344,
 'improvement_surcharge_coef': 1.694447098206617,
 'congestion_surcharge_coef': 0.7873860833933782,
 'VendorID_1_coef': -1.3965806661268128,
 'VendorID_2_coef': 0.16818407953427328,
 'RateCode_1_coef': -0.11070911958621127,
 'RateCode_2_coef': 0.0,
 'RateCode_3_coef': -0.0,
 'RateCode_4_coef': 0.0,
 'RateCode_5_coef': 0.0,
 'RateCode_6_coef': 0.0,
 'RateCode_99_coef': -0.0,
 'payment_type_1_coef': 0.016763605221925732,
 'payment_type_2_coef': -0.0,
 'payment_type_3_coef': 0.0,
 'payment_type_4_coef': -0.0,
 'payment_type_5_coef': 0.0,
 'Bronx_PU_coef': 0.0,
 'Brooklyn_PU_coef': 0.017875412441330406,
 'Manhattan_PU_coef': -0.0,
 'Queens_PU_coef': -0.0,
 'Staten_Island_PU_coef': -0.0,
 'Bronx_DO_coef':

In [31]:
scaled_lassoreg = Lasso(alpha=lasso_cv.best_params_['alpha'])
scaled_lassoreg_metrics = get_metrics(scaled_lassoreg, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_lassoreg_metrics[4]

-207139.56174785877

In [32]:
scaled_lasso_parameters = {'RMSE_train': scaled_lassoreg_metrics[1], 'RMSE_test': scaled_lassoreg_metrics[2], 
                    'r2_train': scaled_lassoreg_metrics[3], 'r2_test': scaled_lassoreg_metrics[4], 'alpha': 0.001}

In [33]:
scaled_lassoreg_coef = {}
for i in range(len(scaled_X_train.columns)):
    key = '{}_coef'.format(scaled_X_train.columns[i])
    scaled_lassoreg_coef[key] = scaled_lassoreg_metrics[0].coef_[i]
scaled_lassoreg_coef['Intercept'] = scaled_lassoreg_metrics[0].intercept_
scaled_lassoreg_coef

{'passenger_count_coef': 0.003522797446363571,
 'trip_distance_coef': 5.18992732493148e-05,
 'fare_amount_coef': -0.0013181043627843534,
 'extra_coef': 0.3822313100918155,
 'mta_tax_coef': 0.12296333501886231,
 'tip_amount_coef': 0.092847862559871,
 'tolls_amount_coef': 0.08690554401418935,
 'improvement_surcharge_coef': 0.19124105790654164,
 'congestion_surcharge_coef': -0.0,
 'VendorID_1_coef': 0.3355633314275071,
 'VendorID_2_coef': 0.0,
 'RateCode_1_coef': 0.04885343902947463,
 'RateCode_2_coef': -0.09069672365349764,
 'RateCode_3_coef': -0.003186924174351682,
 'RateCode_4_coef': -0.02243555063788705,
 'RateCode_5_coef': -0.001711041177154665,
 'RateCode_6_coef': 0.0007504133583742504,
 'RateCode_99_coef': 0.015644682494990885,
 'payment_type_1_coef': -0.0,
 'payment_type_2_coef': 0.001184239713315311,
 'payment_type_3_coef': -0.00829959556870274,
 'payment_type_4_coef': 0.0,
 'payment_type_5_coef': 0.0,
 'Bronx_PU_coef': -0.0,
 'Brooklyn_PU_coef': 0.38456959244029565,
 'Manhattan_

# Random Forest Regressor

In [35]:
rfrreg = RandomForestRegressor(max_depth=5,
                               n_estimators=5)
rfrreg_metrics = get_metrics(rfrreg, X_train, X_test, y_train, y_test)
rfrreg_metrics[4]

  


0.0008373738699121169

In [36]:
rfrtrees_parameters = {'RMSE_train': rfrreg_metrics[1], 'RMSE_test': rfrreg_metrics[2], 
                    'r2_train': rfrreg_metrics[3], 'r2_test': rfrreg_metrics[4], 'n_estimators': 5, 'max_depth': 9}

In [37]:
scaled_rfrreg_metrics = get_metrics(rfrreg, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_rfrreg_metrics[4]

0.6915049418013793

In [38]:
scaled_rfrtrees_parameters = {'RMSE_train': scaled_rfrreg_metrics[1], 'RMSE_test': scaled_rfrreg_metrics[2], 
                              'r2_train': scaled_rfrreg_metrics[3], 'r2_test': scaled_rfrreg_metrics[4],
                              'n_estimators': 5, 'max_depth': 9}

# Gradient Boosting

In [39]:
gradientboost = GradientBoostingRegressor(max_depth=5, n_estimators=5)
gradient_metrics = get_metrics(gradientboost, X_train, X_test, y_train, y_test)
gradient_metrics[4]

  return f(*args, **kwargs)


0.0005472288027202898

In [40]:
gradientboosting_parameters = {'RMSE_train': gradient_metrics[1], 'RMSE_test': gradient_metrics[2], 
                               'r2_train': gradient_metrics[3], 'r2_test': gradient_metrics[4],
                              'n_estimators': 9, 'max_depth': 9}

In [41]:
scaled_gradient_metrics = get_metrics(gradientboost, scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test)
scaled_gradient_metrics[4]

0.45926421152749974

In [42]:
scaled_gradientboosting_parameters = {'RMSE_train': scaled_gradient_metrics[1], 'RMSE_test': scaled_gradient_metrics[2], 
                               'r2_train': scaled_gradient_metrics[3], 'r2_test': scaled_gradient_metrics[4],
                              'n_estimators': 9, 'max_depth': 9}

In [43]:
n_nearest_params = {'RMSE_train': 'DNF', 'RMSE_test': 'DNF',
                    'r2_train': 'DNF', 'r2_test': 'DNF', 'n_neighbors': '1 - 10'}

scaled_n_nearest_params = {'RMSE_train': 'DNF', 'RMSE_test': 'DNF',
                    'r2_train': 'DNF', 'r2_test': 'DNF', 'n_neighbors': '1 - 10'}

adaboosting_params = {'RMSE_train': 'DNF', 'RMSE_test': 'DNF',
                    'r2_train': 'DNF', 'r2_test': 'DNF', 'n_estimators': '1 - 10'}

scaled_adaboosting_params = {'RMSE_train': 'DNF', 'RMSE_test': 'DNF',
                    'r2_train': 'DNF', 'r2_test': 'DNF', 'n_estimators': '1 - 10'}

In [44]:
df = pd.DataFrame([linreg_parameters, ridge_parameters, lasso_parameters, n_nearest_params,
                   rfrtrees_parameters, gradientboosting_parameters, adaboosting_params],
                 index=['Linear Regression', 'Ridge Regression', 'Lasso Regression',
                        'N_nearest_neighbors', 'Random Forest', 'Gradient Boosting', 'Ada Boosting'])

df_scaled = pd.DataFrame([scaled_linreg_parameters, scaled_ridge_parameters, scaled_lasso_parameters,
                          scaled_n_nearest_params, scaled_rfrtrees_parameters, 
                          scaled_gradientboosting_parameters, scaled_adaboosting_params],
                 index=['Scaled Linear Regression', 'Scaled Ridge Regression', 'Scaled Lasso Regression',
                        'Scaled N Nearest Neighbors','Scaled Random Forest',
                        'Scaled Gradient Boosting', 'Ada Boosting'])

In [45]:
df_coef = pd.DataFrame([linreg_coef, ridgereg_coef, lassoreg_coef], index=['Linear Regression Coefficients',
                                                                          'Ridge Regression Coefficients',
                                                                          'Lasso Regression Coefficients'])

df_coef_scaled = pd.DataFrame([scaled_linreg_coef, scaled_ridgereg_coef, scaled_lassoreg_coef],
                              index=['Scaled Linear Regression Coefficients', 'Scaled Ridge Regression Coefficients',
                                    'Scaled Lasso Regression Coefficients'])

In [46]:
df

Unnamed: 0,RMSE_train,RMSE_test,r2_train,r2_test,alpha,n_neighbors,n_estimators,max_depth
Linear Regression,0.285299,149.768,0.999999,0.887902,,,,
Ridge Regression,0.285299,149.655,0.999999,0.888071,0.1,,,
Lasso Regression,0.288454,29.3336,0.999998,0.9957,0.001,,,
N_nearest_neighbors,DNF,DNF,DNF,DNF,,1 - 10,,
Random Forest,58.7649,447.135,0.936731,0.000837374,,,5,9.0
Gradient Boosting,137.981,447.2,0.651187,0.000547229,,,9,9.0
Ada Boosting,DNF,DNF,DNF,DNF,,,1 - 10,


In [47]:
df_scaled

Unnamed: 0,RMSE_train,RMSE_test,r2_train,r2_test,alpha,n_neighbors,n_estimators,max_depth
Scaled Linear Regression,0.393475,324.557,0.845178,-105177,,,,
Scaled Ridge Regression,0.394026,330.232,0.844743,-108887,0.1,,,
Scaled Lasso Regression,0.655668,455.473,0.5701,-207140,0.001,,,
Scaled N Nearest Neighbors,DNF,DNF,DNF,DNF,,1 - 10,,
Scaled Random Forest,0.555123,0.555845,0.691839,0.691505,,,5,9.0
Scaled Gradient Boosting,0.735088,0.735906,0.459646,0.459264,,,9,9.0
Ada Boosting,DNF,DNF,DNF,DNF,,,1 - 10,


In [48]:
df_coef.T

Unnamed: 0,Linear Regression Coefficients,Ridge Regression Coefficients,Lasso Regression Coefficients
passenger_count_coef,0.001014,0.001014,0.000822
trip_distance_coef,-0.000003,-0.000003,-0.000003
fare_amount_coef,1.000014,1.000014,1.000016
extra_coef,0.686972,0.686971,0.677109
mta_tax_coef,0.643115,0.643621,1.191687
...,...,...,...
Thursday_coef,0.020451,0.020451,0.000218
Friday_coef,0.022978,0.022977,0.003021
Saturday_coef,-0.044973,-0.044973,-0.053320
Sunday_coef,-0.039753,-0.039754,-0.045891


In [49]:
df_coef_scaled.T

Unnamed: 0,Scaled Linear Regression Coefficients,Scaled Ridge Regression Coefficients,Scaled Lasso Regression Coefficients
passenger_count_coef,2.037863e-04,3.695935e-04,3.522797e-03
trip_distance_coef,7.117354e-04,8.693605e-04,5.189927e-05
fare_amount_coef,-2.575203e+02,-2.472773e+02,-1.318104e-03
extra_coef,-8.118620e-01,-7.641097e-01,3.822313e-01
mta_tax_coef,-7.657645e-03,-2.328004e-03,1.229633e-01
...,...,...,...
Thursday_coef,-3.757826e+10,-1.221381e-02,-4.586968e-03
Friday_coef,-3.525531e+10,2.366155e-02,2.025204e-02
Saturday_coef,-3.270946e+10,1.934718e-02,1.665431e-02
Sunday_coef,2.576682e+02,2.474191e+02,0.000000e+00


In [50]:
df.to_csv('./datasets/Model_Metrics.csv')
df_scaled.to_csv('./datasets/Scaled_Model_Metrics.csv')
df_coef.to_csv('./datasets/Coef_Metrics.csv')
df_coef_scaled.to_csv('./datasets/Scaled_Coef_Metrics.csv')