# Hyperparameter Tuning of Models Using Gridsearch

In [21]:
# utilities
from warnings import filterwarnings

filterwarnings('ignore')
import os
from tqdm.autonotebook import tqdm

# setup
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, r2_score, \
                            mean_absolute_error, mean_squared_error

# classifiers
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# regressors
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor


# data processing
import pandas as pd
import numpy as np

## Load Data

In [3]:
# Load the trip data. Both trip timestamps are parsed to datetimes so the
# `.dt` accessor can be used on them during feature engineering.
timestamp_cols = ['trip_end_timestamp', 'trip_start_timestamp']
df = pd.read_csv('./taxi_tnp_weather.csv.gz',
                 parse_dates=timestamp_cols,
                 low_memory=False)

## Feature Engineering

In [4]:
# Calendar parts (hour, weekday, month) of the trip start and end timestamps.
for edge in ('start', 'end'):
    stamp = df[f'trip_{edge}_timestamp'].dt
    df[f'{edge}_hour'] = stamp.hour
    df[f'{edge}_day'] = stamp.weekday
    df[f'{edge}_month'] = stamp.month

# Fare and tip normalised by trip duration and by trip distance.
# NOTE(review): rows with trip_seconds == 0 or trip_miles == 0 would produce
# inf/NaN here -- confirm the input file has already been cleaned of them.
for amount in ('fare', 'tip'):
    df[f'{amount}_per_sec'] = df[amount] / df['trip_seconds']
    df[f'{amount}_per_mile'] = df[amount] / df['trip_miles']

In [5]:
# Hour / weekday / month columns derived from the start and end timestamps:
# start_hour, start_day, start_month, end_hour, end_day, end_month.
calendar_cols = [f'{edge}_{part}'
                 for edge in ('start', 'end')
                 for part in ('hour', 'day', 'month')]

# Columns used as model inputs.
allowed_cols = [
    'dropoff_community_area',
    'fare',
    'payment_type',
    'pickup_community_area',
    'trip_miles',
    'trip_seconds',
    'additional_charges',
    *calendar_cols,
    'fare_per_sec',
    'fare_per_mile',
]

# Numeric-coded columns that are cast to strings before one-hot encoding.
to_convert_cols = [
    'dropoff_community_area',
    'pickup_community_area',
    *calendar_cols,
]

## Data Setup for Classifier Models

In [6]:
# Binary classification target: 1 when the trip has a positive tip, else 0.
# A vectorised comparison replaces the per-row Python lambda; a missing tip
# compares False and therefore maps to 0, exactly as the lambda did.
# `assign` returns a new frame, so `df` itself is left untouched.
has_tip_df = df.assign(has_tip=(df['tip'] > 0).astype('int64'))

In [7]:
# Restrict the classification data to taxi trips.
# The mask is built once and applied with .loc instead of chained indexing;
# .copy() makes clf_features an independent frame, so the column conversion
# that follows does not write into a slice of has_tip_df.
is_taxi = has_tip_df.TransportType == 'taxi'
clf_features = has_tip_df.loc[is_taxi, allowed_cols].copy()
clf_target = has_tip_df.loc[is_taxi, 'has_tip']

In [8]:
# Cast the area codes and calendar parts to strings so they are handled as
# categories (not magnitudes) when the frame is one-hot encoded.
clf_features = clf_features.astype({col: str for col in to_convert_cols})
clf_features.dtypes

dropoff_community_area     object
fare                      float64
payment_type               object
pickup_community_area      object
trip_miles                float64
trip_seconds              float64
additional_charges        float64
start_hour                 object
start_day                  object
start_month                object
end_hour                   object
end_day                    object
end_month                  object
fare_per_sec              float64
fare_per_mile             float64
dtype: object

In [9]:
# One-hot encode the string-typed columns, then show the resulting shapes of
# the feature matrix and the target.
clf_features = pd.get_dummies(data=clf_features)
(clf_features.shape, clf_target.shape)

((240304, 251), (240304,))

## Hyperparameter Tuning for Classifiers

There are 8 classifier models to be trained using gridsearch with 10-fold cross validation. The models are:
- GaussianNB
- LogisticRegression
- LGBMClassifier
- LinearSVC
- KNeighborsClassifier
- RandomForestClassifier
- DecisionTreeClassifier
- ExtraTreesClassifier

In [12]:
# Grid search (10-fold CV) over the classifier candidates for `has_tip`.
#
# Changes relative to the earlier version of this cell:
# - the LogisticRegression grid no longer has a duplicate 'C' key (the first
#   one was silently discarded by the dict literal);
# - penalty/solver combinations that scikit-learn rejects are no longer
#   generated (they failed at fit time, hidden by filterwarnings('ignore'));
# - stochastic estimators share the seed used for the train/test split;
# - ties on the mean CV score are broken in favour of the *faster* model.

# Seed shared by the split and by every estimator that accepts random_state.
SEED = 0

# 70/30 split, stratified so both parts keep the tip / no-tip ratio.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    clf_features, clf_target,
    test_size=0.3, random_state=SEED, stratify=clf_target)

# Regularisation strengths shared by the linear models.
c_values = [1e-4, 1e-2, 1, 10, 100]

# Each entry pairs an estimator with its parameter grid. A grid may be a
# list of dicts; GridSearchCV then explores every sub-grid in turn.
models = [
    {'model': GaussianNB(),
     'params': {}
    },
    {'model': LogisticRegression(tol=1e-6, random_state=SEED),
     'params': [
         # 'l1' and 'l2' are supported by both solvers.
         {'C': c_values,
          'penalty': ['l1', 'l2'],
          'solver': ['saga', 'liblinear']},
         # 'elasticnet' is only supported by saga and requires l1_ratio.
         {'C': c_values,
          'penalty': ['elasticnet'],
          'solver': ['saga'],
          'l1_ratio': [0.5]},
     ]
    },
    {'model': LGBMClassifier(random_state=SEED),
     'params': {
         'learning_rate': [0.07, 0.1, 0.12],
         'n_estimators': [100, 300],
         'max_depth': [5, 10, 15]
     }
    },
    {'model': LinearSVC(tol=1e-6, random_state=SEED),
     'params': [
         {'C': c_values, 'penalty': ['l2']},
         # penalty='l1' is only supported by the primal formulation.
         {'C': c_values, 'penalty': ['l1'], 'dual': [False]},
     ]
    },
    {'model': KNeighborsClassifier(n_jobs=-1),
     'params': {
         'n_neighbors': [5, 10, 15],
     }
    },
    {'model': RandomForestClassifier(random_state=SEED),
     'params': {
         'n_estimators': [300, 500],
         'max_depth': [5, 10, 15],
     }
    },
    {'model': DecisionTreeClassifier(random_state=SEED),
     'params': {
         'max_depth': [5, 10],
     }
    },
    {'model': ExtraTreesClassifier(n_jobs=-1, random_state=SEED),
     'params': {
         'n_estimators': [300, 500],
         'max_depth': [5, 10],
     }
    }
]

# Per-model cv_results_ frames; concatenated into `clf_results` afterwards.
clf_result_frames = []

print("starting Gridsearch")
for spec in tqdm(models):

    modelname = type(spec['model']).__name__
    gs = GridSearchCV(spec['model'], spec['params'],
                      verbose=2, n_jobs=-1, cv=10)
    gs = gs.fit(X_train_clf, y_train_clf)
    print(modelname, ': ', gs.best_score_)

    # Report the refitted best estimator on the training data and on the
    # held-out split, predicting once per split.
    for split_name, X_split, y_split in (
            ('train', X_train_clf, y_train_clf),
            ('test unseen data', X_test_clf, y_test_clf)):
        predicted = gs.predict(X_split)
        print(f'{split_name} score: {gs.score(X_split, y_split):.4f}')
        print(f'{split_name} accuracy_score: '
              f'{accuracy_score(y_split, predicted):.4f}')
        print(f'{split_name} f1_score: {f1_score(y_split, predicted):.4f}')

    # To persist per-model results, write this frame out here, e.g. with
    # .to_csv(..., index=False, compression='gzip').
    clf_result_frames.append(pd.DataFrame({
        'model': modelname,
        **gs.cv_results_
    }))
    print('==============================\n')
print("finished Gridsearch")

# One row per evaluated parameter combination, 'model' as the first column.
clf_results = pd.concat(clf_result_frames, ignore_index=True)

# Keep only the summary columns: drop the per-parameter, per-split, std and
# rank columns (the 'params' dict column is kept).
detail_cols = clf_results.columns.str.contains(
    r'param_|split|std|rank_test_score')
clf_results = clf_results.loc[:, ~detail_cols]

# Best mean CV score first; among equal scores, the quickest to score and
# then the quickest to fit come first.
clf_results.sort_values(by=['mean_test_score', 'mean_score_time',
                            'mean_fit_time'],
                        ascending=[False, True, True]).reset_index(drop=True)

## Pre-run Results of Top 3 Ranked Accuracies for Each Classifier Model

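Below is the processed output of a previously executed grid search, since the cell above is expected to take about a day to complete. Note that the XGBClassifier rows also come from that earlier run; XGBClassifier is not part of the grid defined in the cell above.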

|Model|Avg. Train Time (seconds)|Avg. Prediction Time (seconds)|Avg. Prediction Accuracy|Parameters|
|-|:-:|:-:|:-:|-|
|GaussianNB|11.28|1.04|96.10|{}|
|LGBMClassifier|20.36|1.15|97.62|{'learning_rate': 0.07, 'max_depth': 10, 'n_estimators': 300}|
|LGBMClassifier|18.18|1.01|97.62|{'learning_rate': 0.07, 'max_depth': 15, 'n_estimators': 300}|
|LGBMClassifier|9.14|0.52|97.62|{'learning_rate': 0.12, 'max_depth': 5, 'n_estimators': 100}|
|LogisticRegression|15.03|0.12|97.62|{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}|
|LogisticRegression|47.87|0.06|97.62|{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}|
|LogisticRegression|390.76|0.09|97.61|{'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}|
|LinearSVC|238.75|0.07|97.61|{'C': 0.0001, 'penalty': 'l2'}|
|LinearSVC|215.47|0.09|97.51|{'C': 0.01, 'penalty': 'l2'}|
|LinearSVC|239.71|0.09|97.39|{'C': 10, 'penalty': 'l2'}|
|RandomForestClassifier|183.72|2.44|97.62|{'max_depth': 10, 'n_estimators': 300}|
|RandomForestClassifier|302.48|3.96|97.62|{'max_depth': 10, 'n_estimators': 500}|
|RandomForestClassifier|232.04|2.70|97.62|{'max_depth': 15, 'n_estimators': 300}|
|DecisionTreeClassifier|6.64|0.09|97.60|{'max_depth': 5}|
|DecisionTreeClassifier|12.19|0.09|97.55|{'max_depth': 10}|
|ExtraTreesClassifier|66.40|1.14|97.62|{'max_depth': 5, 'n_estimators': 300}|
|ExtraTreesClassifier|110.45|1.80|97.62|{'max_depth': 5, 'n_estimators': 500}|
|ExtraTreesClassifier|136.74|1.89|97.62|{'max_depth': 10, 'n_estimators': 300}|
|KNeighborsClassifier|31.90|43.53|86.36|{'n_neighbors': 15}|
|KNeighborsClassifier|31.05|44.67|85.17|{'n_neighbors': 10}|
|KNeighborsClassifier|30.88|42.54|83.92|{'n_neighbors': 5}|
|XGBClassifier|1,188.64|0.87|97.61|{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300}|
|XGBClassifier|538.61|0.55|97.61|{'learning_rate': 0.07, 'max_depth': 5, 'n_estimators': 300}|
|XGBClassifier|705.49|0.61|97.61|{'learning_rate': 0.07, 'max_depth': 5, 'n_estimators': 500}|

## Data Setup for Regressor Models

In [14]:
# Regression data: taxi trips that actually received a tip.
# Both conditions are combined into one mask and applied with .loc. The
# previous chained form df[a][b] applied a full-length mask to an already
# filtered frame, relying on pandas silently re-aligning it.
# .copy() makes reg_features an independent frame so the column conversion
# below does not write into a slice of df.
tipped_taxi = (df['tip'] > 0) & (df.TransportType == 'taxi')
reg_features = df.loc[tipped_taxi, allowed_cols].copy()
reg_target = df.loc[tipped_taxi, 'tip']

# Cast area codes and calendar parts to strings so they are one-hot encoded
# as categories rather than treated as magnitudes.
reg_features[to_convert_cols] = reg_features[to_convert_cols].astype(str)
print(reg_features.dtypes)
reg_features = pd.get_dummies(reg_features)
reg_features.shape

dropoff_community_area     object
fare                      float64
payment_type               object
pickup_community_area      object
trip_miles                float64
trip_seconds              float64
additional_charges        float64
start_hour                 object
start_day                  object
start_month                object
end_hour                   object
end_day                    object
end_month                  object
fare_per_sec              float64
fare_per_mile             float64
dtype: object


(117735, 244)

## Hyperparameter Tuning for Regressors

There are 8 regressor models to be trained using gridsearch with 10-fold cross validation. The models are:
- LGBMRegressor
- LinearRegression
- Lasso
- Ridge
- RandomForestRegressor
- DecisionTreeRegressor
- KNeighborsRegressor
- ExtraTreesRegressor

In [25]:
# Grid search (10-fold CV) over the regressor candidates for the tip amount.
# Every candidate is scored with negative MSE and R^2; the best R^2 model is
# refitted on the whole training split and used for the reports below.
#
# Changes relative to the earlier version of this cell:
# - predictions are computed once per split instead of once per metric;
# - stochastic estimators share the seed used for the train/test split;
# - ties on the mean CV R^2 are broken in favour of the *faster* model.

# Seed shared by the split and by every estimator that accepts random_state.
SEED = 0

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    reg_features, reg_target, test_size=0.3, random_state=SEED)

# Each entry pairs an estimator with its parameter grid.
models = [
    {'model': LinearRegression(),
     'params': {}
    },
    {'model': Lasso(),
     'params': {
         'alpha': [1e-2, 0.1, 1],
         'max_iter': [300, 500]
     }
    },
    # random_state matters for the stochastic 'saga' solver.
    {'model': Ridge(random_state=SEED),
     'params': {
         'alpha': [1e-2, 0.1, 1],
         'max_iter': [300, 500],
         'solver': ['saga', 'cholesky']
     }
    },
    {'model': LGBMRegressor(random_state=SEED),
     'params': {
         'learning_rate': [0.07, 0.1, 0.12],
         'n_estimators': [100, 300],
         'max_depth': [5, 10, 15]
     }
    },
    {'model': RandomForestRegressor(n_jobs=-1, random_state=SEED),
     'params': {
         'n_estimators': [300, 500],
         'max_depth': [5, 10, 15],
     }
    },
    {'model': DecisionTreeRegressor(random_state=SEED),
     'params': {
         'max_depth': [5, 10],
     }
    },
    {'model': KNeighborsRegressor(n_jobs=-1),
     'params': {
         'n_neighbors': [5, 10],
     }
    },
    {'model': ExtraTreesRegressor(n_jobs=-1, random_state=SEED),
     'params': {
         'n_estimators': [300, 500],
         'max_depth': [5, 10],
     }
    },
]

# Per-model cv_results_ frames; concatenated into `reg_results` afterwards.
reg_result_frames = []

print("starting Gridsearch")
for spec in tqdm(models):

    modelname = type(spec['model']).__name__
    gs = GridSearchCV(spec['model'], spec['params'], verbose=2, cv=10,
                      n_jobs=-1,
                      scoring=['neg_mean_squared_error', 'r2'], refit='r2')
    gs = gs.fit(X_train_reg, y_train_reg)
    print(modelname, ': ', gs.best_score_)

    # Report the refitted best estimator on the training data and on the
    # held-out split; predict once per split and reuse it for every metric.
    for split_name, X_split, y_split in (
            ('train', X_train_reg, y_train_reg),
            ('test unseen data', X_test_reg, y_test_reg)):
        predicted = gs.predict(X_split)
        print(f'{split_name} score: {gs.score(X_split, y_split):.4f}')
        print(f'{split_name} r2 score: '
              f'{r2_score(y_split, predicted):.4f}')
        print(f'{split_name} root mean_squared_error: '
              f'{np.sqrt(mean_squared_error(y_split, predicted)):.4f}')
        print(f'{split_name} mean_absolute_error: '
              f'{mean_absolute_error(y_split, predicted):.4f}')

    # To persist per-model results, write this frame out here, e.g. with
    # .to_csv(..., index=False, compression='gzip').
    reg_result_frames.append(pd.DataFrame({
        'model': modelname,
        **gs.cv_results_
    }))
    print('==============================\n')

print("finished Gridsearch")

# One row per evaluated parameter combination, 'model' as the first column.
reg_results = pd.concat(reg_result_frames, ignore_index=True)

# Keep only the summary columns: drop the per-parameter, per-split, std and
# rank columns (the 'params' dict column is kept).
detail_cols = reg_results.columns.str.contains(
    r'param_|split|std|rank_test_')
reg_results = reg_results.loc[:, ~detail_cols]

# Best mean CV R^2 first; among equal scores, the quickest to score and then
# the quickest to fit come first.
reg_results.sort_values(by=['mean_test_r2', 'mean_score_time',
                            'mean_fit_time'],
                        ascending=[False, True, True]).reset_index(drop=True)

## Pre-run Results of Top 3 Ranked Accuracies for Each Regressor Model

Below is the processed output of a previously executed grid search, since the cell above is expected to take about a day to complete.

|Model|Avg. Train Time (seconds)|Avg. Prediction Time (seconds)|Avg. Prediction Accuracy|Avg. RMSE|Parameters|
|-|:-:|:-:|:-:|:-:|-|
|LinearRegression|1.86|0.02|77.02|1.51|{}|
|LGBMRegressor|1.56|0.13|77.13|1.51|{'learning_rate': 0.07, 'max_depth': 5, 'n_estimators': 100}|
|LGBMRegressor|12.89|1.22|77.09|1.51|{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}|
|LGBMRegressor|16.89|1.10|77.08|1.51|{'learning_rate': 0.07, 'max_depth': 15, 'n_estimators': 100}|
|Lasso|2.49|0.02|76.97|1.51|{'alpha': 0.01, 'max_iter': 300}|
|Lasso|2.67|0.02|76.97|1.51|{'alpha': 0.01, 'max_iter': 500}|
|Lasso|0.93|0.02|76.80|1.52|{'alpha': 0.1, 'max_iter': 300}|
|Ridge|1.16|0.02|77.07|1.51|{'alpha': 1, 'max_iter': 300, 'solver': 'cholesky'}|
|Ridge|0.99|0.03|77.07|1.51|{'alpha': 1, 'max_iter': 500, 'solver': 'cholesky'}|
|Ridge|5.32|0.10|77.03|1.51|{'alpha': 0.1, 'max_iter': 300, 'solver': 'cholesky'}|
|RandomForestRegressor|327.11|0.34|76.45|1.53|{'max_depth': 5, 'n_estimators': 300}|
|RandomForestRegressor|518.83|0.52|76.40|1.53|{'max_depth': 5, 'n_estimators': 500}|
|RandomForestRegressor|984.36|1.17|76.29|1.54|{'max_depth': 10, 'n_estimators': 500}|
|DecisionTreeRegressor|1.94|0.03|72.20|1.62|{'max_depth': 5}|
|DecisionTreeRegressor|3.40|0.03|71.92|1.70|{'max_depth': 10}|
|ExtraTreesRegressor|619.96|0.55|76.78|1.52|{'max_depth': 10, 'n_estimators': 300}|
|ExtraTreesRegressor|890.29|0.75|76.75|1.52|{'max_depth': 10, 'n_estimators': 500}|
|ExtraTreesRegressor|276.04|0.33|75.54|1.56|{'max_depth': 5, 'n_estimators': 300}|
|KNeighborsRegressor|11.07|10.35|74.14|1.60|{'n_neighbors': 10}|
|KNeighborsRegressor|11.20|9.06|72.25|1.66|{'n_neighbors': 5}|