In [30]:
import numpy as np
import pandas as pd
import pickle as pkl

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [31]:
df = pd.read_csv('Sample 14days_food_preparation_Cleaned.csv')

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 38 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   Merchant_lat                                                10000 non-null  float64
 1   Merchant_lon                                                10000 non-null  float64
 2   NationFoodCategory_International                            10000 non-null  int64  
 3   NationFoodCategory_Isram                                    10000 non-null  int64  
 4   NationFoodCategory_Japanese                                 10000 non-null  int64  
 5   NationFoodCategory_Korean                                   10000 non-null  int64  
 6   NationFoodCategory_Myanmar                                  10000 non-null  int64  
 7   NationFoodCategory_Thai                                     10000 non-null  int64  
 8

In [33]:
df.head()

Unnamed: 0,Merchant_lat,Merchant_lon,NationFoodCategory_International,NationFoodCategory_Isram,NationFoodCategory_Japanese,NationFoodCategory_Korean,NationFoodCategory_Myanmar,NationFoodCategory_Thai,NationFoodCategory_Vietnam,FoodCategories_FastFood,...,FoodCategories_เครื่องดื่ม,FoodCategories_ไก่ทอด,riderInitial_to_Merchant_EucDistance,riderInitial_to_Merchant_ShortestDistance,day_of_week_sin,day_of_week_cos,calledMerchantTime_to_arrivedAtMerchantTime_prediction (s),calledMerchantTime_to_arrivedAtMerchantTime_prediction (m),duration (s),duration (m)
0,13.825703,100.559309,0,0,0,0,0,1,0,0,...,0,0,793.928285,3393.191,-0.974928,-0.222521,401.262823,6.687714,507.0,8.45
1,13.831056,100.570731,0,0,0,0,0,1,0,0,...,0,0,1621.210851,2350.873,0.433884,-0.900969,633.470806,10.557847,1758.0,29.3
2,13.827205,100.536048,0,0,0,0,0,1,0,1,...,0,0,1969.07484,3193.337,0.433884,-0.900969,636.793739,10.613229,860.0,14.333333
3,13.84018,100.542326,0,0,1,0,0,0,0,0,...,0,0,2764.603123,3610.225,-0.974928,-0.222521,868.961712,14.482695,1542.0,25.7
4,13.827034,100.564534,0,0,0,0,0,1,0,0,...,0,0,317.267557,249.565,0.781831,0.62349,401.262823,6.687714,389.0,6.483333


# 3. select features and split data (test/train)

In [34]:
pred_s = df['calledMerchantTime_to_arrivedAtMerchantTime_prediction (s)'].copy()
pred_m = df['calledMerchantTime_to_arrivedAtMerchantTime_prediction (m)'].copy()
X = df.drop(['calledMerchantTime_to_arrivedAtMerchantTime_prediction (s)',
             'calledMerchantTime_to_arrivedAtMerchantTime_prediction (m)',
             'duration (s)',
             'duration (m)'], axis=1)
X.loc[X['riderInitial_to_Merchant_ShortestDistance'] <= X['riderInitial_to_Merchant_EucDistance'], 'riderInitial_to_Merchant_ShortestDistance'] \
  = X.loc[X['riderInitial_to_Merchant_ShortestDistance'] <= X['riderInitial_to_Merchant_EucDistance'], 'riderInitial_to_Merchant_EucDistance']
y_s = df['duration (s)']
y_m = df['duration (m)']

In [35]:
X.head()

Unnamed: 0,Merchant_lat,Merchant_lon,NationFoodCategory_International,NationFoodCategory_Isram,NationFoodCategory_Japanese,NationFoodCategory_Korean,NationFoodCategory_Myanmar,NationFoodCategory_Thai,NationFoodCategory_Vietnam,FoodCategories_FastFood,...,FoodCategories_อาหารอีสาน,FoodCategories_อาหารฮาลาล,FoodCategories_อาหารเหนือ,FoodCategories_อาหารใต้,FoodCategories_เครื่องดื่ม,FoodCategories_ไก่ทอด,riderInitial_to_Merchant_EucDistance,riderInitial_to_Merchant_ShortestDistance,day_of_week_sin,day_of_week_cos
0,13.825703,100.559309,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,793.928285,3393.191,-0.974928,-0.222521
1,13.831056,100.570731,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1621.210851,2350.873,0.433884,-0.900969
2,13.827205,100.536048,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1969.07484,3193.337,0.433884,-0.900969
3,13.84018,100.542326,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,2764.603123,3610.225,-0.974928,-0.222521
4,13.827034,100.564534,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,317.267557,317.267557,0.781831,0.62349


In [36]:
# Default
Xs = pd.concat([X, pred_s], axis=1)
Xm = pd.concat([X, pred_m], axis=1)
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, y_s, test_size=0.20, random_state=0)
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm, y_m, test_size=0.20, random_state=0)

# 4. train models

## 4.1 lr

In [37]:
lr_s = LinearRegression()
lr_s.fit(Xs_train, ys_train)
lr_predictions_s = lr_s.predict(Xs_test)

lr_m = LinearRegression()
lr_m.fit(Xm_train, ym_train)
lr_predictions_m = lr_m.predict(Xm_test)

## 4.2 rf

In [38]:
rf_s = RandomForestRegressor()
rf_s.fit(Xs_train, ys_train)
rf_predictions_s = rf_s.predict(Xs_test)

rf_m = RandomForestRegressor()
rf_m.fit(Xm_train, ym_train)
rf_predictions_m = rf_m.predict(Xm_test)

## 4.3 gbdt

In [39]:
gbdt_s = GradientBoostingRegressor()
gbdt_s.fit(Xs_train, ys_train)
gbdt_predictions_s = gbdt_s.predict(Xs_test)

gbdt_m = GradientBoostingRegressor()
gbdt_m.fit(Xm_train, ym_train)
gbdt_predictions_m = gbdt_m.predict(Xm_test)

# 5. Evaluate

## 5.1 create evaluate function

In [40]:
def evaluate(y_test, y_pred):
    MAE  = metrics.mean_absolute_error(y_test, y_pred)
    MSE  = metrics.mean_squared_error(y_test, y_pred)
    R2   = metrics.r2_score(y_test, y_pred)
    RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    return MAE, MSE, R2, RMSE

## 5.2 evaluate all model

In [41]:
lr_metrics_s = evaluate(ys_test, lr_predictions_s)
rf_metrics_s = evaluate(ys_test, rf_predictions_s)
gbdt_metrics_s = evaluate(ys_test, gbdt_predictions_s)

lr_metrics_m = evaluate(ym_test, lr_predictions_m)
rf_metrics_m = evaluate(ym_test, rf_predictions_m)
gbdt_metrics_m = evaluate(ym_test, gbdt_predictions_m)

## 5.3 benchmark on dataframe

In [42]:
data_dict_s = {
    'Model': ['Linear Regression', 'Random Forest', 'Gradient Boosting'],
    'MAE': [lr_metrics_s[0], rf_metrics_s[0], gbdt_metrics_s[0]],
    'MSE': [lr_metrics_s[1], rf_metrics_s[1], gbdt_metrics_s[1]],
    'R2': [lr_metrics_s[2], rf_metrics_s[2], gbdt_metrics_s[2]],
    'RMSE': [lr_metrics_s[3], rf_metrics_s[3], gbdt_metrics_s[3]]
}

data_dict_m = {
    'Model': ['Linear Regression', 'Random Forest', 'Gradient Boosting'],
    'MAE': [lr_metrics_m[0], rf_metrics_m[0], gbdt_metrics_m[0]],
    'MSE': [lr_metrics_m[1], rf_metrics_m[1], gbdt_metrics_m[1]],
    'R2': [lr_metrics_m[2], rf_metrics_m[2], gbdt_metrics_m[2]],
    'RMSE': [lr_metrics_m[3], rf_metrics_m[3], gbdt_metrics_m[3]]
}

In [43]:
benchmark_s = pd.DataFrame(data_dict_s)
benchmark_s

Unnamed: 0,Model,MAE,MSE,R2,RMSE
0,Linear Regression,259.340303,95877.468801,0.300911,309.640871
1,Random Forest,269.830165,106113.072779,0.226279,325.750016
2,Gradient Boosting,258.568869,95514.928175,0.303555,309.054895


In [44]:
benchmark_m = pd.DataFrame(data_dict_m)
benchmark_m

Unnamed: 0,Model,MAE,MSE,R2,RMSE
0,Linear Regression,4.322338,26.63263,0.300911,5.160681
1,Random Forest,4.497211,29.504501,0.225527,5.431805
2,Gradient Boosting,4.309479,26.531936,0.303554,5.150916


## 5.4 save all data

In [68]:
result_s = pd.DataFrame({
    'ys_test': ys_test,
    'lr_predictions_s': lr_predictions_s,
    'rf_predictions_s': rf_predictions_s,
    'gbdt_predictions_s': gbdt_predictions_s
})
result_m = pd.DataFrame({
    'ym_test': ym_test,
    'lr_predictions_m': lr_predictions_m,
    'rf_predictions_m': rf_predictions_m,
    'gbdt_predictions_m': gbdt_predictions_m
})

In [69]:
benchmark_s.to_csv('Food_preparation_model_metrics_s.csv', index=False)
benchmark_m.to_csv('Food_preparation_model_metrics_m.csv', index=False)
result_s.to_csv('Food_preparation_model_prediction_result_s.csv', index=False)
result_m.to_csv('Food_preparation_model_prediction_result_m.csv', index=False)

# 6. Hyperparameter tuning

## 6.1 get best model params and feats importance

In [47]:
def get_feature_importance(model, X):
    return pd.DataFrame([[round(float(x),6) for x in model.feature_importances_]], columns=X.columns.to_list())

In [48]:
params = pd.DataFrame(gbdt_s.get_params(), index=[0])
params.loc[len(params)] = gbdt_m.get_params()
params.index = ['gbdt_s', 'gbdt_m']
print('model params')
display(params)

feature_importances = get_feature_importance(gbdt_s, Xs_train)
feature_importances.loc[len(feature_importances)] = get_feature_importance(gbdt_m, Xm_train).loc[0].values
feature_importances.index = ['gbdt_s', 'gbdt_m']
print('model feature importances')
display(feature_importances)

model params


Unnamed: 0,alpha,ccp_alpha,criterion,init,learning_rate,loss,max_depth,max_features,max_leaf_nodes,min_impurity_decrease,...,min_samples_split,min_weight_fraction_leaf,n_estimators,n_iter_no_change,random_state,subsample,tol,validation_fraction,verbose,warm_start
gbdt_s,0.9,0.0,friedman_mse,,0.1,squared_error,3,,,0.0,...,2,0.0,100,,,1.0,0.0001,0.1,0,False
gbdt_m,0.9,0.0,friedman_mse,,0.1,squared_error,3,,,0.0,...,2,0.0,100,,,1.0,0.0001,0.1,0,False


model feature importances


Unnamed: 0,Merchant_lat,Merchant_lon,NationFoodCategory_International,NationFoodCategory_Isram,NationFoodCategory_Japanese,NationFoodCategory_Korean,NationFoodCategory_Myanmar,NationFoodCategory_Thai,NationFoodCategory_Vietnam,FoodCategories_FastFood,...,FoodCategories_อาหารฮาลาล,FoodCategories_อาหารเหนือ,FoodCategories_อาหารใต้,FoodCategories_เครื่องดื่ม,FoodCategories_ไก่ทอด,riderInitial_to_Merchant_EucDistance,riderInitial_to_Merchant_ShortestDistance,day_of_week_sin,day_of_week_cos,calledMerchantTime_to_arrivedAtMerchantTime_prediction (s)
gbdt_s,0.005971,0.00746,0.000628,0.0,0.000695,0.00028,0.000169,0.000373,0.000466,0.0,...,0.0,0.000482,0.000425,0.00132,4.2e-05,0.106273,0.02753,0.004529,0.002269,0.830189
gbdt_m,0.006016,0.007441,0.000628,0.0,0.000913,0.00028,0.000169,0.000373,0.000466,0.0,...,0.0,0.000482,0.000425,0.00132,4.2e-05,0.36535,0.027658,0.004529,0.002246,0.570979


## 6.2 tune params using gridsearchCV

In [49]:
gbdt = GradientBoostingRegressor(random_state=0)

param_grid = { 
 "n_estimators": [30, 35, 40, 45, 50, 55, 60, 65, 70],
 "max_features": [1.0, "sqrt", "log2"], # "auto" is deprecated, use 1.0 instead
 "max_depth": [2, 3, 4, 5],
}
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2', 'neg_root_mean_squared_error']
refit = "neg_mean_squared_error"

CV_gbdt_s = GridSearchCV(estimator=gbdt, param_grid=param_grid, scoring=scoring, refit=refit)
CV_gbdt_s.fit(Xs_train, ys_train)

In [50]:
CV_gbdt_s.best_params_

{'max_depth': 2, 'max_features': 1.0, 'n_estimators': 45}

In [51]:
gbdt = GradientBoostingRegressor(random_state=0)

param_grid = { 
 "n_estimators": [30, 35, 40, 45, 50, 55, 60, 65, 70],
 "max_features": [1.0, "sqrt", "log2"], # "auto" is deprecated, use 1.0 instead
 "max_depth": [2, 3, 4, 5],
}
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2', 'neg_root_mean_squared_error']
refit = "neg_mean_absolute_error"

CV_gbdt_m = GridSearchCV(estimator=gbdt, param_grid=param_grid, scoring=scoring, refit=refit)
CV_gbdt_m.fit(Xm_train, ym_train)

In [52]:
CV_gbdt_m.best_params_

{'max_depth': 2, 'max_features': 1.0, 'n_estimators': 45}

## 6.3 get result dataframe

In [54]:
CV_gbdt_s_results = pd.DataFrame(CV_gbdt_s.cv_results_)
CV_gbdt_s_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 40 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   mean_fit_time                            108 non-null    float64
 1   std_fit_time                             108 non-null    float64
 2   mean_score_time                          108 non-null    float64
 3   std_score_time                           108 non-null    float64
 4   param_max_depth                          108 non-null    object 
 5   param_max_features                       108 non-null    object 
 6   param_n_estimators                       108 non-null    object 
 7   params                                   108 non-null    object 
 8   split0_test_neg_mean_absolute_error      108 non-null    float64
 9   split1_test_neg_mean_absolute_error      108 non-null    float64
 10  split2_test_neg_mean_absolute_error      108 non-n

In [55]:
CV_gbdt_s_results[
    ['param_max_depth',
     'param_max_features',
     'param_n_estimators', 
     
     'mean_test_neg_mean_absolute_error',
     'mean_test_neg_mean_squared_error',
     'mean_test_r2',
     'mean_test_neg_root_mean_squared_error',
     
     'rank_test_neg_mean_absolute_error',
     'rank_test_neg_mean_squared_error',
     'rank_test_r2',
     'rank_test_neg_root_mean_squared_error'
     
     ]].sort_values(by='rank_test_neg_mean_squared_error', ascending=True)

Unnamed: 0,param_max_depth,param_max_features,param_n_estimators,mean_test_neg_mean_absolute_error,mean_test_neg_mean_squared_error,mean_test_r2,mean_test_neg_root_mean_squared_error,rank_test_neg_mean_absolute_error,rank_test_neg_mean_squared_error,rank_test_r2,rank_test_neg_root_mean_squared_error
3,2,1.0,45,-250.337280,-90555.989561,0.313427,-300.899055,1,1,2,1
1,2,1.0,35,-250.388492,-90556.181583,0.313435,-300.899189,6,2,1,2
2,2,1.0,40,-250.376575,-90559.519305,0.313407,-300.904786,5,3,3,3
0,2,1.0,30,-250.413363,-90568.858118,0.313347,-300.920076,8,4,4,4
4,2,1.0,50,-250.354023,-90579.473242,0.313242,-300.938357,2,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...
87,5,1.0,60,-252.676330,-92479.000985,0.298847,-304.074771,104,104,104,104
88,5,1.0,65,-252.822576,-92578.893177,0.298091,-304.238810,105,105,105,105
89,5,1.0,70,-252.929011,-92656.206872,0.297502,-304.366888,106,106,106,106
18,2,log2,30,-253.449389,-93156.940776,0.293836,-305.181944,107,107,107,107


In [56]:
CV_gbdt_m_results = pd.DataFrame(CV_gbdt_m.cv_results_)
CV_gbdt_m_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 40 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   mean_fit_time                            108 non-null    float64
 1   std_fit_time                             108 non-null    float64
 2   mean_score_time                          108 non-null    float64
 3   std_score_time                           108 non-null    float64
 4   param_max_depth                          108 non-null    object 
 5   param_max_features                       108 non-null    object 
 6   param_n_estimators                       108 non-null    object 
 7   params                                   108 non-null    object 
 8   split0_test_neg_mean_absolute_error      108 non-null    float64
 9   split1_test_neg_mean_absolute_error      108 non-null    float64
 10  split2_test_neg_mean_absolute_error      108 non-n

In [58]:
CV_gbdt_m_results[
    ['param_max_depth',
     'param_max_features',
     'param_n_estimators', 
     
     'mean_test_neg_mean_absolute_error',
     'mean_test_neg_mean_squared_error',
     'mean_test_r2',
     'mean_test_neg_root_mean_squared_error',
     
     'rank_test_neg_mean_absolute_error',
     'rank_test_neg_mean_squared_error',
     'rank_test_r2',
     'rank_test_neg_root_mean_squared_error'
     
     ]].sort_values(by='rank_test_neg_mean_squared_error', ascending=True)

Unnamed: 0,param_max_depth,param_max_features,param_n_estimators,mean_test_neg_mean_absolute_error,mean_test_neg_mean_squared_error,mean_test_r2,mean_test_neg_root_mean_squared_error,rank_test_neg_mean_absolute_error,rank_test_neg_mean_squared_error,rank_test_r2,rank_test_neg_root_mean_squared_error
3,2,1.0,45,-4.172280,-25.154398,0.313428,-5.014980,1,1,2,1
1,2,1.0,35,-4.173133,-25.154449,0.313437,-5.014982,6,2,1,2
2,2,1.0,40,-4.172935,-25.155377,0.313408,-5.015075,5,3,3,3
0,2,1.0,30,-4.173548,-25.157967,0.313348,-5.015330,8,4,4,4
4,2,1.0,50,-4.172559,-25.160922,0.313243,-5.015635,2,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...
87,5,1.0,60,-4.211225,-25.689494,0.298811,-5.068008,104,104,104,104
88,5,1.0,65,-4.213743,-25.718464,0.298020,-5.070865,105,105,105,105
89,5,1.0,70,-4.215465,-25.739614,0.297439,-5.072968,106,106,106,106
9,2,sqrt,30,-4.224156,-25.876928,0.293836,-5.086366,107,107,107,107


## 6.4 get best estimator, its predictions and metrics

In [59]:
gbdt_best_s = CV_gbdt_s.best_estimator_
gbdt_best_predictions_s = gbdt_best_s.predict(Xs_test)
gbdt_best_metrics_s = evaluate(ys_test, gbdt_best_predictions_s)

print(gbdt_best_metrics_s)

gbdt_best_m = CV_gbdt_m.best_estimator_
gbdt_best_predictions_m = gbdt_best_m.predict(Xm_test)
gbdt_best_metrics_m = evaluate(ym_test, gbdt_best_predictions_m)

print(gbdt_best_metrics_m)

(258.50524603215706, 95322.87554630966, 0.3049550873921877, 308.7440291670588)
(4.308420767202618, 26.47857654064157, 0.3049550873921876, 5.14573381945098)


## 6.5 Save all data

In [63]:
result_best_s = pd.DataFrame({
    'ys_test': ys_test,
    'gbdt_s': gbdt_predictions_s,
    'gbdt_s (tuned)': gbdt_best_predictions_s
})
result_best_m = pd.DataFrame({
    'ym_test': ym_test,
    'gbdt_m': gbdt_predictions_m,
    'gbdt_m (tuned)': gbdt_best_predictions_m
})
benchmark_best_s = pd.DataFrame({
    'Model': ['gbdt_s', 'gbdt_s (tuned)'],
    'MAE': [gbdt_metrics_s[0], gbdt_best_metrics_s[0]],
    'MSE': [gbdt_metrics_s[1], gbdt_best_metrics_s[1]],
    'R2': [gbdt_metrics_s[2], gbdt_best_metrics_s[2]],
    'RMSE': [gbdt_metrics_s[3], gbdt_best_metrics_s[3]]
})
benchmark_best_m = pd.DataFrame({
    'Model': ['gbdt_m', 'gbdt_m (tuned)'],
    'MAE': [gbdt_metrics_m[0], gbdt_best_metrics_m[0]],
    'MSE': [gbdt_metrics_m[1], gbdt_best_metrics_m[1]],
    'R2': [gbdt_metrics_m[2], gbdt_best_metrics_m[2]],
    'RMSE': [gbdt_metrics_m[3], gbdt_best_metrics_m[3]]
})


In [64]:
pkl.dump(CV_gbdt_s, open("food_preparation_CV_gbdt_s.pkl", "wb"))

pkl.dump(CV_gbdt_s.best_estimator_, open("food_preparation_best_gbdt_s.pkl", "wb"))
CV_gbdt_s_results[
    ['param_max_depth',
     'param_max_features',
     'param_n_estimators', 
     
     'mean_test_neg_mean_absolute_error',
     'mean_test_neg_mean_squared_error',
     'mean_test_r2',
     'mean_test_neg_root_mean_squared_error',
     
     'rank_test_neg_mean_absolute_error',
     'rank_test_neg_mean_squared_error',
     'rank_test_r2',
     'rank_test_neg_root_mean_squared_error'
     
     ]].sort_values(by='rank_test_neg_mean_squared_error', ascending=True).to_csv('Food_preparation_CV_metrics_s.csv', index=False)
result_best_s.to_csv('Food_preparation_best_model_prediction_result_s.csv', index=False)
benchmark_best_s.to_csv('Food_preparation_best_model_metrics_s.csv', index=False)

In [65]:
pkl.dump(CV_gbdt_m, open("food_preparation_CV_gbdt_m.pkl", "wb"))
pkl.dump(CV_gbdt_m.best_estimator_, open("food_preparation_best_gbdt_m.pkl", "wb"))
CV_gbdt_m_results[
    ['param_max_depth',
     'param_max_features',
     'param_n_estimators', 
     
     'mean_test_neg_mean_absolute_error',
     'mean_test_neg_mean_squared_error',
     'mean_test_r2',
     'mean_test_neg_root_mean_squared_error',
     
     'rank_test_neg_mean_absolute_error',
     'rank_test_neg_mean_squared_error',
     'rank_test_r2',
     'rank_test_neg_root_mean_squared_error'
     
     ]].sort_values(by='rank_test_neg_mean_squared_error', ascending=True).to_csv('Food_preparation_CV_metrics_m.csv', index=False)
result_best_m.to_csv('Food_preparation_best_model_prediction_result_m.csv', index=False)
benchmark_best_m.to_csv('Food_preparation_best_model_metrics_m.csv', index=False)