# 1. Import libraries

In [7]:
import numpy as np
import pandas as pd
import pickle as pkl
from time import time

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

from sklearn.model_selection import train_test_split
from sklearn import metrics

np.random.seed(42)

# 2. load dataset

In [8]:
df = pd.read_csv('Sample 14days_food_delivery_Cleaned.csv')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   u_lat            20000 non-null  float64
 1   u_lon            20000 non-null  float64
 2   v_lat            20000 non-null  float64
 3   v_lon            20000 non-null  float64
 4   euc_dist         20000 non-null  float64
 5   shortest_dist    20000 non-null  float64
 6   day_of_week_sin  20000 non-null  float64
 7   day_of_week_cos  20000 non-null  float64
 8   duration (s)     20000 non-null  float64
 9   duration (m)     20000 non-null  float64
dtypes: float64(10)
memory usage: 1.5 MB


# 3. select features and split data (test/train)

In [10]:
# X = df.drop(['duration (s)', 'duration (m)'], axis=1)
X = df.copy()
X.loc[X['shortest_dist'] <= X['euc_dist'], 'shortest_dist'] = X.loc[X['shortest_dist'] <= X['euc_dist'], 'euc_dist']
y_s = df['duration (s)']
y_m = df['duration (m)']

In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   u_lat            20000 non-null  float64
 1   u_lon            20000 non-null  float64
 2   v_lat            20000 non-null  float64
 3   v_lon            20000 non-null  float64
 4   euc_dist         20000 non-null  float64
 5   shortest_dist    20000 non-null  float64
 6   day_of_week_sin  20000 non-null  float64
 7   day_of_week_cos  20000 non-null  float64
 8   duration (s)     20000 non-null  float64
 9   duration (m)     20000 non-null  float64
dtypes: float64(10)
memory usage: 1.5 MB


In [12]:
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X, y_s, test_size=0.20)
Xm_train, Xm_test, ym_train, ym_test = train_test_split(X, y_m, test_size=0.20)

# 4. train models

## 4.1 lr

In [11]:
lr_s = LinearRegression()
lr_s.fit(Xs_train, ys_train)
lr_predictions_s = lr_s.predict(Xs_test)

lr_m = LinearRegression()
lr_m.fit(Xm_train, ym_train)
lr_predictions_m = lr_m.predict(Xm_test)

## 4.2 rf

In [8]:
rf_s = RandomForestRegressor()
rf_s.fit(Xs_train, ys_train)
rf_predictions_s = rf_s.predict(Xs_test)

rf_m = RandomForestRegressor()
rf_m.fit(Xm_train, ym_train)
rf_predictions_m = rf_m.predict(Xm_test)

## 4.3 gbdt

In [9]:
gbdt_s = GradientBoostingRegressor()
gbdt_s.fit(Xs_train, ys_train)
gbdt_predictions_s = gbdt_s.predict(Xs_test)

gbdt_m = GradientBoostingRegressor()
gbdt_m.fit(Xm_train, ym_train)
gbdt_predictions_m = gbdt_m.predict(Xm_test)

# 5. Evaluate

## 5.1 create evaluate function

In [10]:
def evaluate(y_test, y_pred):
    MAE  = metrics.mean_absolute_error(y_test, y_pred)
    MSE  = metrics.mean_squared_error(y_test, y_pred)
    R2   = metrics.r2_score(y_test, y_pred)
    RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    return MAE, MSE, R2, RMSE

## 5.2 evaluate all model

In [11]:
lr_metrics_s = evaluate(ys_test, lr_predictions_s)
rf_metrics_s = evaluate(ys_test, rf_predictions_s)
gbdt_metrics_s = evaluate(ys_test, gbdt_predictions_s)

lr_metrics_m = evaluate(ym_test, lr_predictions_m)
rf_metrics_m = evaluate(ym_test, rf_predictions_m)
gbdt_metrics_m = evaluate(ym_test, gbdt_predictions_m)

## 5.3 benchmark on dataframe

In [12]:
data_dict_s = {
    'Model': ['Linear Regression', 'Random Forest', 'Gradient Boosting'],
    'MAE': [lr_metrics_s[0], rf_metrics_s[0], gbdt_metrics_s[0]],
    'MSE': [lr_metrics_s[1], rf_metrics_s[1], gbdt_metrics_s[1]],
    'R2': [lr_metrics_s[2], rf_metrics_s[2], gbdt_metrics_s[2]],
    'RMSE': [lr_metrics_s[3], rf_metrics_s[3], gbdt_metrics_s[3]]
}

data_dict_m = {
    'Model': ['Linear Regression', 'Random Forest', 'Gradient Boosting'],
    'MAE': [lr_metrics_m[0], rf_metrics_m[0], gbdt_metrics_m[0]],
    'MSE': [lr_metrics_m[1], rf_metrics_m[1], gbdt_metrics_m[1]],
    'R2': [lr_metrics_m[2], rf_metrics_m[2], gbdt_metrics_m[2]],
    'RMSE': [lr_metrics_m[3], rf_metrics_m[3], gbdt_metrics_m[3]]
}

In [13]:
benchmark_s = pd.DataFrame(data_dict_s)
benchmark_s

Unnamed: 0,Model,MAE,MSE,R2,RMSE
0,Linear Regression,212.612121,75281.645784,0.891858,274.375009
1,Random Forest,216.584621,80773.37146,0.883969,284.206565
2,Gradient Boosting,206.970008,71898.833882,0.896718,268.139579


In [14]:
benchmark_m = pd.DataFrame(data_dict_m)
benchmark_m

Unnamed: 0,Model,MAE,MSE,R2,RMSE
0,Linear Regression,3.544697,21.015235,0.891485,4.584238
1,Random Forest,3.58962,22.272649,0.884992,4.719391
2,Gradient Boosting,3.446087,19.961956,0.896923,4.467881


## 5.4 save all data

In [15]:
result_s = pd.DataFrame({
    'ys_test': ys_test,
    'lr_predictions_s': lr_predictions_s,
    'rf_predictions_s': rf_predictions_s,
    'gbdt_predictions_s': gbdt_predictions_s
})
result_m = pd.DataFrame({
    'ym_test': ym_test,
    'lr_predictions_m': lr_predictions_m,
    'rf_predictions_m': rf_predictions_m,
    'gbdt_predictions_m': gbdt_predictions_m
})

In [16]:
benchmark_s.to_csv('Food_delivery_model_metrics_s.csv', index=False)
benchmark_m.to_csv('Food_delivery_model_metrics_m.csv', index=False)
result_s.to_csv('Food_delivery_model_prediction_result_s.csv', index=False)
result_m.to_csv('Food_delivery_model_prediction_result_m.csv', index=False)

# 6. Hyperparameter tuning

## 6.1 get best model params and feats importance

In [17]:
def get_feature_importance(model, X):
    return pd.DataFrame([[round(float(x),6) for x in model.feature_importances_]], columns=X.columns.to_list())

In [18]:
params = pd.DataFrame(gbdt_s.get_params(), index=[0])
params.loc[len(params)] = gbdt_m.get_params()
params.index = ['gbdt_s', 'gbdt_m']
print('model params')
display(params)

feature_importances = get_feature_importance(gbdt_s, Xs_train)
feature_importances.loc[len(feature_importances)] = get_feature_importance(gbdt_m, Xm_train).loc[0].values
feature_importances.index = ['gbdt_s', 'gbdt_m']
print('model feature importances')
display(feature_importances)

model params


Unnamed: 0,alpha,ccp_alpha,criterion,init,learning_rate,loss,max_depth,max_features,max_leaf_nodes,min_impurity_decrease,...,min_samples_split,min_weight_fraction_leaf,n_estimators,n_iter_no_change,random_state,subsample,tol,validation_fraction,verbose,warm_start
gbdt_s,0.9,0.0,friedman_mse,,0.1,squared_error,3,,,0.0,...,2,0.0,100,,,1.0,0.0001,0.1,0,False
gbdt_m,0.9,0.0,friedman_mse,,0.1,squared_error,3,,,0.0,...,2,0.0,100,,,1.0,0.0001,0.1,0,False


model feature importances


Unnamed: 0,u_lat,u_lon,v_lat,v_lon,euc_dist,shortest_dist,day_of_week_sin,day_of_week_cos
gbdt_s,0.000374,0.000419,0.001142,0.000792,0.9954,0.001514,0.000294,6.5e-05
gbdt_m,0.000482,0.00043,0.00104,0.000816,0.99546,0.001234,0.000383,0.000155


## 6.2 tune params using gridsearchCV

In [19]:
gbdt = GradientBoostingRegressor()

param_grid = { 
 "n_estimators": [30, 35, 40, 45, 50, 55, 60, 65, 70],
 "max_features": [1.0, "sqrt", "log2"], # "auto" is deprecated, use 1.0 instead
 "max_depth": [2, 3, 4, 5],
}
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2', 'neg_root_mean_squared_error']
refit = "neg_mean_squared_error"

CV_gbdt_s = GridSearchCV(estimator=gbdt, param_grid=param_grid, scoring=scoring, refit=refit)
CV_gbdt_s.fit(Xs_train, ys_train)

In [20]:
gbdt = GradientBoostingRegressor()

param_grid = { 
 "n_estimators": [30, 35, 40, 45, 50, 55, 60, 65, 70],
 "max_features": [1.0, "sqrt", "log2"], # "auto" is deprecated, use 1.0 instead
 "max_depth": [2, 3, 4, 5],
}
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2', 'neg_root_mean_squared_error']
refit = "neg_mean_squared_error"

CV_gbdt_m = GridSearchCV(estimator=gbdt, param_grid=param_grid, scoring=scoring, refit=refit)
CV_gbdt_m.fit(Xm_train, ym_train)

## 6.3 get result dataframe

In [21]:
CV_gbdt_s_results = pd.DataFrame(CV_gbdt_s.cv_results_)
CV_gbdt_s_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 40 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   mean_fit_time                            108 non-null    float64
 1   std_fit_time                             108 non-null    float64
 2   mean_score_time                          108 non-null    float64
 3   std_score_time                           108 non-null    float64
 4   param_max_depth                          108 non-null    object 
 5   param_max_features                       108 non-null    object 
 6   param_n_estimators                       108 non-null    object 
 7   params                                   108 non-null    object 
 8   split0_test_neg_mean_absolute_error      108 non-null    float64
 9   split1_test_neg_mean_absolute_error      108 non-null    float64
 10  split2_test_neg_mean_absolute_error      108 non-n

In [22]:
CV_gbdt_s_results[
    ['param_max_depth', 'param_max_features', 'param_n_estimators', 
     'mean_test_neg_mean_absolute_error', 'mean_test_neg_mean_squared_error', 'mean_test_r2', 'mean_test_neg_root_mean_squared_error',
     'rank_test_neg_mean_absolute_error', 'rank_test_neg_mean_squared_error', 'rank_test_r2', 'rank_test_neg_root_mean_squared_error'
     ]].sort_values(by='rank_test_neg_mean_squared_error', ascending=True)

Unnamed: 0,param_max_depth,param_max_features,param_n_estimators,mean_test_neg_mean_absolute_error,mean_test_neg_mean_squared_error,mean_test_r2,mean_test_neg_root_mean_squared_error,rank_test_neg_mean_absolute_error,rank_test_neg_mean_squared_error,rank_test_r2,rank_test_neg_root_mean_squared_error
5,2,1.0,55,-207.933227,-71051.779248,0.898539,-266.539635,2,1,1,1
6,2,1.0,60,-207.927307,-71062.637705,0.898524,-266.560051,1,2,2,2
4,2,1.0,50,-207.950603,-71065.634892,0.898520,-266.565406,4,3,3,3
8,2,1.0,70,-207.942609,-71098.196348,0.898473,-266.626957,3,4,4,4
7,2,1.0,65,-207.973809,-71113.489316,0.898451,-266.655072,5,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...
12,2,sqrt,45,-224.087546,-85344.200399,0.878145,-292.104701,104,104,104,104
18,2,log2,30,-225.485490,-86203.095937,0.876954,-293.504597,105,105,105,105
11,2,sqrt,40,-228.072570,-89043.056379,0.872872,-298.355078,106,106,106,106
10,2,sqrt,35,-232.489962,-91450.158498,0.869396,-302.348216,107,107,107,107


In [23]:
CV_gbdt_m_results = pd.DataFrame(CV_gbdt_m.cv_results_)
CV_gbdt_m_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 40 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   mean_fit_time                            108 non-null    float64
 1   std_fit_time                             108 non-null    float64
 2   mean_score_time                          108 non-null    float64
 3   std_score_time                           108 non-null    float64
 4   param_max_depth                          108 non-null    object 
 5   param_max_features                       108 non-null    object 
 6   param_n_estimators                       108 non-null    object 
 7   params                                   108 non-null    object 
 8   split0_test_neg_mean_absolute_error      108 non-null    float64
 9   split1_test_neg_mean_absolute_error      108 non-null    float64
 10  split2_test_neg_mean_absolute_error      108 non-n

In [24]:
CV_gbdt_m_results[
    ['param_max_depth', 'param_max_features', 'param_n_estimators', 
     'mean_test_neg_mean_absolute_error', 'mean_test_neg_mean_squared_error', 'mean_test_r2', 'mean_test_neg_root_mean_squared_error',
     'rank_test_neg_mean_absolute_error', 'rank_test_neg_mean_squared_error', 'rank_test_r2', 'rank_test_neg_root_mean_squared_error'
     ]].sort_values(by='rank_test_neg_mean_squared_error', ascending=True)

Unnamed: 0,param_max_depth,param_max_features,param_n_estimators,mean_test_neg_mean_absolute_error,mean_test_neg_mean_squared_error,mean_test_r2,mean_test_neg_root_mean_squared_error,rank_test_neg_mean_absolute_error,rank_test_neg_mean_squared_error,rank_test_r2,rank_test_neg_root_mean_squared_error
4,2,1.0,50,-3.468744,-19.725073,0.898505,-4.440756,1,1,1,1
5,2,1.0,55,-3.469274,-19.736580,0.898447,-4.442028,2,2,2,2
6,2,1.0,60,-3.469551,-19.745459,0.898402,-4.443019,4,3,3,3
3,2,1.0,45,-3.470251,-19.746586,0.898395,-4.443171,8,4,4,4
31,3,1.0,50,-3.469533,-19.754680,0.898355,-4.444071,3,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...
12,2,sqrt,45,-3.718931,-23.457695,0.879273,-4.842721,104,104,104,104
18,2,log2,30,-3.731314,-23.520644,0.878902,-4.848658,105,105,105,105
11,2,sqrt,40,-3.762249,-23.933032,0.876855,-4.891137,106,106,106,106
10,2,sqrt,35,-3.804460,-24.431149,0.874317,-4.940192,107,107,107,107


## 6.4 train model with best hyperparams, get their predictions and metrics

In [25]:
print(CV_gbdt_s.best_params_)
gbdt_best_s = GradientBoostingRegressor()
gbdt_best_s.set_params(**CV_gbdt_s.best_params_)
gbdt_best_s.fit(Xs_train, ys_train)

gbdt_best_predictions_s = gbdt_best_s.predict(Xs_test)
gbdt_best_metrics_s = evaluate(ys_test, gbdt_best_predictions_s)
print(gbdt_best_metrics_s)

{'max_depth': 2, 'max_features': 1.0, 'n_estimators': 55}
(206.90890479540232, 71485.19012468595, 0.8973117263210284, 267.3671448115605)


In [26]:
print(CV_gbdt_m.best_params_)
gbdt_best_m = GradientBoostingRegressor()
gbdt_best_m.set_params(**CV_gbdt_m.best_params_)
gbdt_best_m.fit(Xm_train, ym_train)

gbdt_best_predictions_m = gbdt_best_m.predict(Xm_test)
gbdt_best_metrics_m = evaluate(ym_test, gbdt_best_predictions_m)
print(gbdt_best_metrics_m)

{'max_depth': 2, 'max_features': 1.0, 'n_estimators': 50}
(3.437741311098818, 19.828949207197546, 0.8976102854346911, 4.452970829367462)


## 6.5 Save all data

In [27]:
result_best_s = pd.DataFrame({
    'ys_test': ys_test,
    'gbdt_s': gbdt_predictions_s,
    'gbdt_s (tuned)': gbdt_best_predictions_s
})
result_best_m = pd.DataFrame({
    'ym_test': ym_test,
    'gbdt_m': gbdt_predictions_m,
    'gbdt_m (tuned)': gbdt_best_predictions_m
})
benchmark_best_s = pd.DataFrame({
    'Model': ['gbdt_s', 'gbdt_s (tuned)'],
    'MAE': [gbdt_metrics_s[0], gbdt_best_metrics_s[0]],
    'MSE': [gbdt_metrics_s[1], gbdt_best_metrics_s[1]],
    'R2': [gbdt_metrics_s[2], gbdt_best_metrics_s[2]],
    'RMSE': [gbdt_metrics_s[3], gbdt_best_metrics_s[3]]
})

benchmark_best_m = pd.DataFrame({
    'Model': ['gbdt_m', 'gbdt_m (tuned)'],
    'MAE': [gbdt_metrics_m[0], gbdt_best_metrics_m[0]],
    'MSE': [gbdt_metrics_m[1], gbdt_best_metrics_m[1]],
    'R2': [gbdt_metrics_m[2], gbdt_best_metrics_m[2]],
    'RMSE': [gbdt_metrics_m[3], gbdt_best_metrics_m[3]]
})


In [28]:
pkl.dump(CV_gbdt_s, open("food_delivery_CV_gbdt_s.pkl", "wb"))
pkl.dump(CV_gbdt_s.best_estimator_, open("food_delivery_best_gbdt_s.pkl", "wb"))
CV_gbdt_s_results[
    ['param_max_depth',
     'param_max_features',
     'param_n_estimators', 
     
     'mean_test_neg_mean_absolute_error',
     'mean_test_neg_mean_squared_error',
     'mean_test_r2',
     'mean_test_neg_root_mean_squared_error',
     
     'rank_test_neg_mean_absolute_error',
     'rank_test_neg_mean_squared_error',
     'rank_test_r2',
     'rank_test_neg_root_mean_squared_error'
     
     ]].sort_values(by='rank_test_neg_mean_squared_error', ascending=True).to_csv('Food_delivery_CV_metrics_s.csv', index=False)
result_best_s.to_csv('Food_delivery_best_model_prediction_result_s.csv', index=False)
benchmark_best_s.to_csv('Food_delivery_best_model_metrics_s.csv', index=False)

In [29]:
pkl.dump(CV_gbdt_m, open("food_delivery_CV_gbdt_m.pkl", "wb"))
pkl.dump(CV_gbdt_m.best_estimator_, open("food_delivery_best_gbdt_m.pkl", "wb"))
CV_gbdt_m_results[
    ['param_max_depth',
     'param_max_features',
     'param_n_estimators', 
     
     'mean_test_neg_mean_absolute_error',
     'mean_test_neg_mean_squared_error',
     'mean_test_r2',
     'mean_test_neg_root_mean_squared_error',
     
     'rank_test_neg_mean_absolute_error',
     'rank_test_neg_mean_squared_error',
     'rank_test_r2',
     'rank_test_neg_root_mean_squared_error'
     
     ]].sort_values(by='rank_test_neg_mean_squared_error', ascending=True).to_csv('Food_delivery_CV_metrics_m.csv', index=False)
result_best_m.to_csv('Food_delivery_best_model_prediction_result_m.csv', index=False)
benchmark_best_m.to_csv('Food_delivery_best_model_metrics_m.csv', index=False)

In [30]:
CV_gbdt_m = pkl.load(open("food_delivery_CV_gbdt_m.pkl", "rb"))
gbdt_best_m = GradientBoostingRegressor()
gbdt_best_m.set_params(**CV_gbdt_m.best_params_)
gbdt_best_m.fit(Xm_train, ym_train)
pkl.dump(gbdt_best_m, open("food_delivery_best_gbdt_m.pkl", "wb"))

CV_gbdt_s = pkl.load(open("food_delivery_CV_gbdt_s.pkl", "rb"))
gbdt_best_s= GradientBoostingRegressor()
gbdt_best_s.set_params(**CV_gbdt_s.best_params_)
gbdt_best_s.fit(Xs_train, ys_train)
pkl.dump(gbdt_best_m, open("food_delivery_best_gbdt_s.pkl", "wb"))

In [31]:
X.shape

(20000, 8)

In [32]:
df = pd.read_csv('Sample 14days_food_delivery_Cleaned.csv')
X = df.drop(['duration (s)', 'duration (m)'], axis=1)
X.loc[X['shortest_dist'] <= X['euc_dist'], 'shortest_dist'] = X.loc[X['shortest_dist'] <= X['euc_dist'], 'euc_dist']
y_s = df['duration (s)']
y_m = df['duration (m)']
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X, y_s, test_size=0.20)
Xm_train, Xm_test, ym_train, ym_test = train_test_split(X, y_m, test_size=0.20)
gbdt_best_s = GradientBoostingRegressor()
gbdt_best_s.set_params(max_depth=2, max_features=1.0, n_estimators=55)
gbdt_best_s.fit(Xs_train, ys_train)
gbdt_best_m = GradientBoostingRegressor()
gbdt_best_m.set_params(max_depth=2, max_features=1.0, n_estimators=50)
gbdt_best_m.fit(Xm_train, ym_train)

## 6.6 Test model preformance (prediction time)

In [34]:
from module.distance_calculator import DistanceCalculator
from module.delivery_model import DeliveryModel
from sklearn.ensemble import GradientBoostingRegressor

In [38]:
df = pd.read_csv('Sample 14days_food_delivery_Cleaned.csv')
model = DeliveryModel()

In [93]:
def get_prediction_times(model, df, data_sizes=[1], size=5):
    periods = {}
    for data_size in data_sizes:
        period_data = []
        if data_size < df.shape[0]:
            loc = df[['u_lat', 'u_lon', 'v_lat', 'v_lon']].sample(data_size).values
            loc = [ ((l[0], l[1]), (l[2], l[3])) for l in loc]
        else:
            loc = df[['u_lat', 'u_lon', 'v_lat', 'v_lon']].values
            while len(loc) < data_size:
                loc = np.concatenate((loc, df[['u_lat', 'u_lon', 'v_lat', 'v_lon']].values))
            loc = loc[:data_size]
            loc = [ ((l[0], l[1]), (l[2], l[3])) for l in loc]
        for _ in range(size):    
            st = time()
            model.batch_predict(locations=loc, approx=True)
            en = time()
            period = en - st
            period_data.append(period)
        periods[str(data_size) + ' data'] = period_data
    return pd.DataFrame(periods)

In [94]:
result_time = get_prediction_times(model, df, [1, 10, 100, 1000, 10000, 100000, 1000000], 30)

In [97]:
result_time.to_csv('Food_delivery_best_model_prediction_time.csv', index=False)