In [52]:
import json
import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import sklearn
from sklearn import ensemble
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, scale, LabelEncoder

try:
    from sklearn.externals import joblib
except ImportError:
    # Newer scikit-learn releases no longer ship sklearn.externals.joblib;
    # fall back to the standalone joblib package it used to re-export.
    import joblib

warnings.filterwarnings('ignore')

In [2]:
def create_target(df):
    """Add the regression target ``duration`` (delivery time in seconds).

    Rows without an ``actual_delivery_time`` are dropped because no target
    can be computed for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``created_at`` and ``actual_delivery_time`` columns whose
        values ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) restricted to delivered
        orders, with an extra float column ``duration`` holding
        ``actual_delivery_time - created_at`` in seconds. The original index
        labels of the kept rows are preserved.
    """
    # Work on an explicit copy: adding a column to a boolean-filtered slice
    # triggers SettingWithCopyWarning and is ambiguous about what is modified.
    delivered = df.loc[df['actual_delivery_time'].notna()].copy()

    delivered_at = pd.to_datetime(delivered['actual_delivery_time'])
    created_at = pd.to_datetime(delivered['created_at'])
    delivered['duration'] = (delivered_at - created_at).dt.total_seconds()
    return delivered

In [3]:
def create_time_feature(df):
    """Derive calendar features from ``created_at`` and add them to ``df``.

    The frame is modified in place and also returned. Columns added, in
    order: ``created_at_datetime``, ``created_at_year``, ``created_at_month``,
    ``created_at_day``, ``created_at_date``, ``created_at_dayOfWeek``,
    ``created_at_time``, ``created_at_hour``, ``created_at_minute``,
    ``created_at_second``, ``created_at_isWeekend`` and
    ``created_at_isHoliday``.

    ``created_at_isWeekend`` is 1 for day-of-week 5 or 6 and 0 for 0-4.
    ``created_at_isHoliday`` is 1 when the order date is in the US federal
    holiday calendar between the earliest and latest order date, else 0.
    """
    df['created_at_datetime'] = df['created_at'].astype("datetime64[s]")

    # (new column, datetime accessor attribute) pairs, in column order.
    stamp = df['created_at_datetime'].dt
    component_of = (
        ('created_at_year', 'year'),
        ('created_at_month', 'month'),
        ('created_at_day', 'day'),
        ('created_at_date', 'date'),
        ('created_at_dayOfWeek', 'dayofweek'),
        ('created_at_time', 'time'),
        ('created_at_hour', 'hour'),
        ('created_at_minute', 'minute'),
        ('created_at_second', 'second'),
    )
    for column, attribute in component_of:
        df[column] = getattr(stamp, attribute)

    # Saturday/Sunday are day-of-week 5 and 6.
    weekday = df['created_at_dayOfWeek']
    df.loc[weekday.isin([5, 6]), 'created_at_isWeekend'] = 1
    df.loc[weekday.isin([0, 1, 2, 3, 4]), 'created_at_isWeekend'] = 0

    # Only the holidays inside the observed date range are needed.
    first_day = df['created_at_date'].min()
    last_day = df['created_at_date'].max()
    holidays = calendar().holidays(start=first_day, end=last_day)
    on_holiday = df.created_at_datetime.dt.normalize().isin(holidays)
    df['created_at_isHoliday'] = np.where(on_holiday, 1, 0)

    return df

In [4]:
def process_continuous_features(df):
    """Clip, impute and rebin the numeric order/marketplace columns of ``df``.

    The frame is modified in place and also returned.

    * ``total_items``, ``subtotal`` and ``num_distinct_items`` are capped at
      20, 12000 and 16 respectively.
    * ``min_item_price`` and ``max_item_price`` are limited to ``[0, 5000]``.
    * ``total_onshift_dashers``, ``total_busy_dashers`` and
      ``total_outstanding_orders`` have negatives raised to 0, then missing
      values filled with the truncated mean of the clipped column.
    * ``estimated_order_place_duration_rebinned`` is added: ``'fast'`` for
      251, ``'slow'`` for 446, ``'other'`` otherwise.
    * ``estimated_store_to_consumer_driving_duration`` has missing values
      filled with its truncated mean.

    Raises
    ------
    ValueError
        If an imputed column is entirely missing (its mean is NaN and cannot
        be truncated to an int).
    """
    # (column, lower bound, upper bound); None leaves that side unbounded.
    clip_bounds = (
        ('total_items', None, 20),
        ('subtotal', None, 12000),
        ('num_distinct_items', None, 16),
        ('min_item_price', 0, 5000),
        ('max_item_price', 0, 5000),
    )
    # Whole-column reassignment instead of ``df[col][mask] = value``: chained
    # indexing may write to a temporary and leave ``df`` unchanged.
    for column, lower, upper in clip_bounds:
        df[column] = df[column].clip(lower=lower, upper=upper)

    marketplace_columns = (
        'total_onshift_dashers',
        'total_busy_dashers',
        'total_outstanding_orders',
    )
    for column in marketplace_columns:
        # Negative counts are floored at 0 *before* the mean is taken, so the
        # imputed value is based on the cleaned column.
        non_negative = df[column].clip(lower=0)
        df[column] = non_negative.fillna(int(non_negative.mean()))

    fast_duration, slow_duration = 251, 446
    place_duration = df['estimated_order_place_duration'].to_numpy()
    df['estimated_order_place_duration_rebinned'] = np.select(
        [place_duration == fast_duration, place_duration == slow_duration],
        ['fast', 'slow'],
        default='other',
    )

    driving = df['estimated_store_to_consumer_driving_duration']
    df['estimated_store_to_consumer_driving_duration'] = driving.fillna(int(driving.mean()))

    return df

In [5]:
# calculate number of orders for each store
def make_store_id_cont(df):
    """Attach per-store order counts and a volume-binned store identifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``store_id`` column. The input frame is not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * A merged copy of ``df`` with two extra columns: ``store_id_count``
          (number of rows sharing that ``store_id``) and ``store_id_rebinned``.
          Stores with at least 500 rows keep their own id; all others are
          replaced by the label of their count bucket (``'[400, 500)'``,
          ``'[200, 400)'``, ``'[50, 200)'`` or ``'[0, 50)'``).
        * The count table with columns ``store_id`` and ``store_id_count``,
          sorted by ``store_id``.
    """
    # rename_axis/reset_index(name=...) yields the same two column names on
    # every pandas version, unlike renaming the 'index'/0 defaults.
    store_counts_df = (
        df['store_id']
        .value_counts()
        .rename_axis('store_id')
        .reset_index(name='store_id_count')
        .sort_values(by='store_id', ascending=True)
    )
    df = pd.merge(df, store_counts_df, on='store_id', how='left')

    # Cast to object first so string bucket labels can sit next to the
    # original ids without an incompatible-dtype assignment.
    df['store_id_rebinned'] = df['store_id'].astype(object)

    order_count = df['store_id_count']
    volume_buckets = (
        ((order_count < 500) & (order_count >= 400), '[400, 500)'),
        ((order_count < 400) & (order_count >= 200), '[200, 400)'),
        ((order_count < 200) & (order_count >= 50), '[50, 200)'),
        (order_count < 50, '[0, 50)'),
    )
    # .loc with a (row mask, column) pair writes into df itself; the chained
    # form df[col][mask] = ... can silently write to a temporary instead.
    for in_bucket, label in volume_buckets:
        df.loc[in_bucket, 'store_id_rebinned'] = label

    return df, store_counts_df

In [6]:
def make_store_category_cont(df):
    """Attach per-category order counts and a volume-binned store category.

    Missing ``store_primary_category`` values are replaced by ``'Unknown'``
    on the input frame itself (as before); everything else is added to a
    merged copy.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * The merged frame with ``store_primary_category_count`` (rows per
          category) and ``store_primary_category_rebinned``. Categories with
          at least 3000 rows keep their name; the rest are replaced by their
          count bucket label (``'[2000, 3000)'``, ``'[1000, 2000)'``,
          ``'[200, 1000)'`` or ``'[0, 200)'``).
        * The count table with columns ``store_primary_category`` and
          ``store_primary_category_count``.
    """
    # Whole-column fill instead of df[col][mask] = ...: chained indexing may
    # write to a temporary and leave the nulls in place.
    df['store_primary_category'] = df['store_primary_category'].fillna('Unknown')

    store_primary_category_counts_df = (
        df['store_primary_category']
        .value_counts()
        .rename_axis('store_primary_category')
        .reset_index(name='store_primary_category_count')
    )
    df = pd.merge(df, store_primary_category_counts_df, on='store_primary_category', how='left')

    # Nulls were already filled above, so the rebinned copy needs no second fill.
    df['store_primary_category_rebinned'] = df['store_primary_category'].copy()

    order_count = df['store_primary_category_count']
    volume_buckets = (
        ((order_count < 3000) & (order_count >= 2000), '[2000, 3000)'),
        ((order_count < 2000) & (order_count >= 1000), '[1000, 2000)'),
        ((order_count < 1000) & (order_count >= 200), '[200, 1000)'),
        (order_count < 200, '[0, 200)'),
    )
    for in_bucket, label in volume_buckets:
        df.loc[in_bucket, 'store_primary_category_rebinned'] = label

    return df, store_primary_category_counts_df

In [7]:
def impute_market_id(df):
    """Replace missing ``market_id`` values with 0 and return ``df``.

    The column is reassigned on the passed frame, so the caller's frame is
    updated as well.
    """
    # fillna on the whole column instead of df[col][mask] = 0, which is
    # chained indexing and may not write back to df.
    df['market_id'] = df['market_id'].fillna(0)
    return df

In [8]:
def impute_order_protocol(df):
    """Map missing, 6 and 7 ``order_protocol`` values to 0 and return ``df``.

    The column is reassigned on the passed frame, so the caller's frame is
    updated as well.
    """
    # Build the cleaned column in one expression; the previous
    # df[col].loc[mask] = 0 form is chained indexing and may not write back.
    protocol = df['order_protocol'].fillna(0)
    df['order_protocol'] = protocol.mask(protocol.isin([6, 7]), 0)
    return df

In [9]:
def select_features(df, TrainOrScore):
    """Pick the modelling columns and cast the categorical ones to ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully preprocessed frame containing every column listed below.
    TrainOrScore : str
        ``'Train'`` keeps the ``duration`` target as the first column; any
        other value returns the predictor columns only.

    Returns
    -------
    (pandas.DataFrame, list of str, list of str)
        The selected frame (a new object; ``df`` is not modified), the names
        of the numeric features, and the names of the categorical features.
    """
    NumFeatures = ['total_items', 'subtotal', 'num_distinct_items', 'min_item_price', 'max_item_price',
                   'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders',
                   'estimated_store_to_consumer_driving_duration']
    CatFeatures = ['market_id', 'store_id_rebinned', 'store_primary_category_rebinned', 'order_protocol',
                   'created_at_month', 'created_at_dayOfWeek', 'created_at_hour', 'created_at_isWeekend',
                   'created_at_isHoliday', 'estimated_order_place_duration_rebinned']

    # Column order of the returned frame, shared by the train and score paths.
    model_columns = ['market_id', 'store_id_rebinned', 'store_primary_category_rebinned',
                     'order_protocol', 'total_items', 'subtotal', 'num_distinct_items', 'min_item_price',
                     'max_item_price', 'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders',
                     'estimated_store_to_consumer_driving_duration', 'created_at_month', 'created_at_dayOfWeek',
                     'created_at_hour', 'created_at_isWeekend', 'created_at_isHoliday',
                     'estimated_order_place_duration_rebinned']
    if TrainOrScore == 'Train':
        model_columns = ['duration'] + model_columns

    # astype returns a new frame, so nothing is written onto a slice of df.
    TrainFeatures = df[model_columns].astype({column: object for column in CatFeatures})
    return TrainFeatures, NumFeatures, CatFeatures

In [10]:
def scale_oneHot_X(input_x, features_num, features_cat):
    """Build the model matrix: scaled numeric columns, then one-hot columns.

    ``features_num`` columns of ``input_x`` are passed through sklearn's
    ``scale``; ``features_cat`` columns are label-encoded column by column
    and then one-hot encoded. The two parts are placed side by side and the
    columns are relabelled 0..k-1.

    NOTE(review): the scaler and encoders are fitted on whatever frame is
    passed in and are not returned, so a second call on other data produces
    its own encoding.
    """
    # Numeric part.
    numeric_block = pd.DataFrame(scale(input_x[features_num]))

    # Categorical part: integer codes first, then indicator columns.
    label_encoder = LabelEncoder()
    integer_coded = input_x[features_cat].apply(label_encoder.fit_transform)

    one_hot = OneHotEncoder()
    one_hot.fit(integer_coded)
    categorical_block = pd.DataFrame(one_hot.transform(integer_coded).toarray())

    output_x = pd.concat([numeric_block, categorical_block], axis=1)
    output_x.columns = list(range(output_x.shape[1]))

    return output_x

In [11]:
#Regular Linear Model
def lin_model(X_train, Y_train, X_test, Y_test):
    """Fit an ordinary least-squares model and print train/test RMSE.

    Parameters
    ----------
    X_train, Y_train : training features and target.
    X_test, Y_test : held-out features and target, used only for reporting.

    Returns
    -------
    The fitted ``LinearRegression`` estimator.
    """
    # Imported locally so the function does not depend on the global name
    # ``linear_model``, which this notebook later rebinds to a fitted model.
    from sklearn.linear_model import LinearRegression

    lm = LinearRegression()
    lm.fit(X_train, Y_train)

    rmse_train = math.sqrt(mean_squared_error(Y_train, lm.predict(X_train)))
    rmse_test = math.sqrt(mean_squared_error(Y_test, lm.predict(X_test)))
    print("Root mean squared error for train: %.2f" % rmse_train)
    print("Root mean squared error for test: %.2f" % rmse_test)
    return lm

In [44]:
# Lasso
def lin_model_lasso(X_train, Y_train, X_test, Y_test, alpha):
    """Fit a Lasso regression with the given ``alpha`` and print train/test RMSE.

    Parameters
    ----------
    X_train, Y_train : training features and target.
    X_test, Y_test : held-out features and target, used only for reporting.
    alpha : float
        L1 regularisation strength passed to ``Lasso``. (It used to be
        ignored in favour of a hard-coded 0.1.)

    Returns
    -------
    (estimator, float, float)
        The fitted model, the training RMSE and the test RMSE.
    """
    # Local import: the global name ``linear_model`` is rebound to a fitted
    # estimator later in this notebook.
    from sklearn.linear_model import Lasso

    lm_lasso = Lasso(alpha=alpha)
    lm_lasso.fit(X_train, Y_train)

    rmse_train = math.sqrt(mean_squared_error(Y_train, lm_lasso.predict(X_train)))
    rmse_test = math.sqrt(mean_squared_error(Y_test, lm_lasso.predict(X_test)))
    print("Root mean squared error for train: %.2f" % rmse_train)
    print("Root mean squared error for test: %.2f" % rmse_test)
    return lm_lasso, rmse_train, rmse_test

In [None]:
# Ridge
def lin_model_ridge(X_train, Y_train, X_test, Y_test, alpha):
    """Fit a Ridge regression with the given ``alpha`` and print train/test RMSE.

    Parameters
    ----------
    X_train, Y_train : training features and target.
    X_test, Y_test : held-out features and target, used only for reporting.
    alpha : float
        L2 regularisation strength passed to ``Ridge``.

    Returns
    -------
    (estimator, float, float)
        The fitted model, the training RMSE and the test RMSE.
    """
    # ``Ridge`` is imported at the top of the notebook; the previous body
    # called the misspelled name ``rideg`` and ignored ``alpha``.
    lm_ridge = Ridge(alpha=alpha)
    lm_ridge.fit(X_train, Y_train)

    rmse_train = math.sqrt(mean_squared_error(Y_train, lm_ridge.predict(X_train)))
    rmse_test = math.sqrt(mean_squared_error(Y_test, lm_ridge.predict(X_test)))
    print("Root mean squared error for train: %.2f" % rmse_train)
    print("Root mean squared error for test: %.2f" % rmse_test)
    return lm_ridge, rmse_train, rmse_test



In [12]:
def rf_model(X_train, Y_train, X_test, Y_test):
    """Fit a depth-3 random forest (seed 0) and print train/test RMSE.

    Returns the fitted ``RandomForestRegressor``.
    """
    forest = RandomForestRegressor(max_depth=3, random_state=0)
    forest.fit(X_train, Y_train)

    fitted_train = forest.predict(X_train)
    fitted_test = forest.predict(X_test)

    train_rmse = math.sqrt(mean_squared_error(Y_train, fitted_train))
    print("Root mean squared error for train: %.2f" % train_rmse)
    test_rmse = math.sqrt(mean_squared_error(Y_test, fitted_test))
    print("Root mean squared error for test: %.2f" % test_rmse)

    return forest

In [56]:
def rf_model_grid_search(X_train, Y_train):
    """Grid-search random-forest hyper-parameters on the training data.

    Prints the best cross-validated score and the winning parameter set, then
    returns that parameter set as a dict. No ``scoring`` is passed, so
    ``GridSearchCV`` uses the estimator's own ``score`` method.
    """
    search_space = {
        "n_estimators": [500, 1000],
        "max_depth": [3, 5],
        "max_features": [10, 20],
        "min_samples_split": [20, 50],
        "min_samples_leaf": [10, 20],
        "bootstrap": [True, False],
    }

    search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=0),
        param_grid=search_space,
        n_jobs=-1,  # use every available core
    )
    search.fit(X_train, Y_train)

    for summary in (search.best_score_, search.best_params_):
        print(summary)
    return search.best_params_

In [None]:
def rf_with_best_parameters(X_train, Y_train, X_test, Y_test, best_par_set):
    """Fit a random forest with a tuned parameter set and print train/test RMSE.

    Parameters
    ----------
    X_train, Y_train : training features and target.
    X_test, Y_test : held-out features and target, used only for reporting.
    best_par_set : dict
        Keyword arguments for ``RandomForestRegressor``, e.g. the dict
        returned by ``rf_model_grid_search``.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    # The previous body passed an undefined name (``best``) positionally;
    # the tuned parameters have to be unpacked as keyword arguments.
    forest_params = dict(best_par_set)
    forest_params.setdefault('random_state', 0)

    rf = RandomForestRegressor(**forest_params)
    rf.fit(X_train, Y_train)

    rmse_train = math.sqrt(mean_squared_error(Y_train, rf.predict(X_train)))
    rmse_test = math.sqrt(mean_squared_error(Y_test, rf.predict(X_test)))
    print("Root mean squared error for train: %.2f" % rmse_train)
    print("Root mean squared error for test: %.2f" % rmse_test)
    return rf

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Ad-hoc, wider random-forest grid search.
# NOTE(review): this cell needs X_train / Y_train, which are only created by
# the train_test_split cell further down the notebook.
param_grid = {"n_estimators": [200, 500],
    "max_depth": [3, None],
    "max_features": [1, 3, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False]}

model = RandomForestRegressor(random_state=0)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
# The split cell names the training target Y_train (capital Y); the lower-case
# y_train used here before was never defined.
grid.fit(X_train, Y_train)

print(grid.best_score_)
print(grid.best_params_)

In [13]:
def gbm_model(X_train, Y_train, X_test, Y_test):
    """Fit a gradient-boosting regressor and print train/test RMSE.

    Uses 500 trees of depth 4 with learning rate 0.01 and the estimator's
    default (least-squares) loss.

    Returns the fitted ``GradientBoostingRegressor``.
    """
    # 'loss' is left at its default: least squares is the default loss, and
    # the old 'ls' spelling is rejected by newer scikit-learn releases.
    # random_state pins the result, matching the random-forest helpers.
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'random_state': 0}
    gbm = ensemble.GradientBoostingRegressor(**params)
    gbm.fit(X_train, Y_train)

    rmse_train = math.sqrt(mean_squared_error(Y_train, gbm.predict(X_train)))
    rmse_test = math.sqrt(mean_squared_error(Y_test, gbm.predict(X_test)))
    print("Root mean squared error for train: %.2f" % rmse_train)
    print("Root mean squared error for test: %.2f" % rmse_test)
    return gbm

In [14]:
def make_prediction(model, data):
    """Predict with a saved or in-memory model.

    Parameters
    ----------
    model : str or estimator
        Either the path of a model file written with ``joblib.dump`` or an
        object that already exposes ``predict``.
    data : feature matrix handed to ``predict`` unchanged.

    Returns
    -------
    Whatever the model's ``predict`` returns for ``data``.
    """
    if hasattr(model, 'predict'):
        # Already an estimator: no need to round-trip through disk.
        model_loaded = model
    else:
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code -- only load model files you created yourself.
        model_loaded = joblib.load(model)
    return model_loaded.predict(data)

## Read in and preprocess the training data

In [15]:
# Load the raw historical orders and run the preprocessing steps in order.
# NOTE(review): the paths below are absolute and machine-specific.
Train = pd.read_csv(r'D:\Learn\DoorDash\historical_data.csv')

a0 = create_target(Train)
a1 = create_time_feature(a0)
a = process_continuous_features(a1)
b = impute_market_id(a)
c = impute_order_protocol(b)

# Each rebinning helper returns (frame, count_table); unpack one call instead
# of running the helper twice to fetch the two elements separately.
d, store_category_count_table = make_store_category_cont(c)
e, make_store_id_cont_table = make_store_id_cont(d)

# Save the count tables next to the data.
store_category_count_table.to_csv(r'D:\Learn\DoorDash\store_category_count_table.csv', index=False)
make_store_id_cont_table.to_csv(r'D:\Learn\DoorDash\make_store_id_cont_table.csv', index=False)

Train_processed = select_features(e, 'Train')[0]

In [16]:
# Cap the target at one hour (3600 s). clip() reassigns the whole column; the
# previous chained df[col][mask] = ... form may write to a temporary instead.
Train_processed['duration'] = Train_processed['duration'].clip(upper=3600)

In [17]:
# Split the target off the feature frame: pop() returns the column and
# removes it from Train_processed in one step.
target_in_train = Train_processed.pop('duration')

In [18]:
# Number of target values after preprocessing.
target_in_train.shape

(197421,)

In [19]:
# Peek at the first 20 targets (durations in seconds, capped at 3600 above).
target_in_train[:20]

0     3600.0
1     3600.0
2     1781.0
3     3075.0
4     2390.0
5     2300.0
6     1584.0
7     1965.0
8     1586.0
9     3192.0
10    2786.0
11    3600.0
12    2563.0
13    2282.0
14    2273.0
15    2988.0
16    3600.0
17    3600.0
18    3600.0
19    3019.0
Name: duration, dtype: float64

In [20]:
# Rows x predictor columns once the target has been split off.
Train_processed.shape

(197421, 19)

In [21]:
# select_features returns (frame, numeric names, categorical names); take
# both name lists from a single call instead of running it twice.
NumFeatures, CatFeatures = select_features(e, 'Train')[1:]

In [22]:
# Show which columns are treated as numeric and which as categorical.
print(NumFeatures, "+++",CatFeatures)

['total_items', 'subtotal', 'num_distinct_items', 'min_item_price', 'max_item_price', 'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders', 'estimated_store_to_consumer_driving_duration'] +++ ['market_id', 'store_id_rebinned', 'store_primary_category_rebinned', 'order_protocol', 'created_at_month', 'created_at_dayOfWeek', 'created_at_hour', 'created_at_isWeekend', 'created_at_isHoliday', 'estimated_order_place_duration_rebinned']


In [23]:
# Scale the numeric columns and one-hot encode the categorical ones.
Train_ready_encoded = scale_oneHot_X(Train_processed, NumFeatures, CatFeatures)

In [24]:
#Train_ready = pd.concat([target_in_train, Train_ready_encoded])

In [25]:
# The encoded matrix carries positional integer column labels.
Train_ready_encoded.columns

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
             91,  92,  93,  94,  95,  96,  97,  98,  99, 100],
           dtype='int64', length=101)

In [26]:
# Hold out 40% of the rows for testing; random_state=17 fixes the shuffle.
X_train, X_test, Y_train, Y_test= train_test_split(Train_ready_encoded, target_in_train, test_size=0.4, random_state=17)

In [27]:
# Preview the first encoded training rows.
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
15870,0.34713,2.559182,0.81581,0.124688,4.589361,0.370669,0.432625,0.237004,1.39068,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
581,-0.500001,-0.487392,-0.411608,-0.795906,0.668467,-0.234041,-0.054497,-0.417191,-0.4216,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
154028,-0.076435,-0.328271,-0.411608,-1.059487,-0.609927,-0.929456,-0.898842,-0.734376,1.550453,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
174814,3.312087,2.146135,3.884355,-0.941263,-0.845181,-1.262046,-1.256065,-1.051561,-0.932873,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
163203,0.770695,0.884814,1.429519,-0.214478,0.714059,0.793965,0.919748,0.75243,-1.412192,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [28]:
# Training feature matrix: rows x encoded columns.
X_train.shape

(118452, 101)

In [29]:
# Training target length; should match the training row count.
Y_train.shape

(118452,)

In [30]:
# Test feature matrix: rows x encoded columns.
X_test.shape

(78969, 101)

In [31]:
# Test target length; should match the test row count.
Y_test.shape

(78969,)

In [32]:
# Sanity check: the two partitions together must account for every encoded
# row. Computed from the data instead of retyping the counts by hand.
assert len(X_train) + len(X_test) == len(Train_ready_encoded)
len(X_train) + len(X_test)

197421

In [51]:
from sklearn import linear_model
# lin_model takes exactly four arguments; the stray fifth positional argument
# (9) raised a TypeError and is dropped.
# NOTE(review): this rebinds the name `linear_model` from the sklearn module
# to the fitted estimator -- presumably why the module is re-imported above.
linear_model = lin_model(X_train, Y_train, X_test, Y_test)

Root mean squared error for train: 591.51
Root mean squared error for test: 590.10


In [34]:
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): this rebinds the name `rf_model` from the helper function to
# the fitted estimator, so the cell cannot be re-run without redefining the
# function first.
rf_model = rf_model(X_train, Y_train, X_test, Y_test)

Root mean squared error for train: 665.47
Root mean squared error for test: 664.68


In [None]:
# Run the random-forest grid search on the training split and keep the best parameter set.
best_parameters_rf = rf_model_grid_search(X_train, Y_train)

In [35]:
from sklearn import ensemble
# NOTE(review): this rebinds the name `gbm_model` from the helper function to
# the fitted estimator, so the cell cannot be re-run without redefining the
# function first.
gbm_model = gbm_model(X_train, Y_train, X_test, Y_test)

Root mean squared error for train: 600.67
Root mean squared error for test: 602.05


In [36]:
# Persist the three fitted estimators to files in the working directory.
joblib.dump(linear_model, 'linear_model_saved.pkl')
joblib.dump(rf_model, 'rf_model_saved.pkl')
joblib.dump(gbm_model, 'gbm_model_saved.pkl')

['gbm_model_saved.pkl']

In [37]:
# Reload the saved linear model from disk and predict on the test split.
lm_pred = make_prediction('linear_model_saved.pkl', X_test)

In [38]:
# Reload the saved random forest from disk and predict on the test split.
rf_pred = make_prediction('rf_model_saved.pkl', X_test)

In [39]:
# Reload the saved gradient-boosting model from disk and predict on the test split.
gbm_pred = make_prediction('gbm_model_saved.pkl', X_test)

In [40]:
# Each model should produce one prediction per test row.
print(len(lm_pred),len(rf_pred),len(gbm_pred))

78969 78969 78969


In [41]:
# First 20 linear-model predictions (seconds).
lm_pred[:20]

array([ 2390.6640625,  2812.0859375,  2220.234375 ,  2308.9296875,
        2233.7421875,  2438.3828125,  2568.6015625,  2278.1015625,
        2989.9140625,  2408.015625 ,  3048.6171875,  2313.2578125,
        2969.6953125,  2340.4609375,  3136.6796875,  2961.515625 ,
        2439.8203125,  2775.2890625,  2542.3984375,  2284.4140625])

In [42]:
# First 20 random-forest predictions (seconds).
rf_pred[:20]

array([ 2650.3242264 ,  2954.29260843,  2268.87912797,  2575.72747166,
        2268.87912797,  2590.89116275,  2686.63909013,  2650.3242264 ,
        3202.19939432,  2386.86891341,  2650.3242264 ,  2679.92707392,
        2700.33799324,  2650.3242264 ,  2568.62871291,  2686.63909013,
        2575.72747166,  2650.3242264 ,  2994.35599239,  2679.92707392])

In [43]:
# First 20 gradient-boosting predictions (seconds).
gbm_pred[:20]

array([ 2567.69151735,  2840.9815715 ,  2370.89074489,  2328.98636186,
        2325.39751797,  2592.829483  ,  2622.05198529,  2373.54291763,
        2993.783579  ,  2333.9482885 ,  2768.28181379,  2442.92692151,
        2777.95251252,  2448.66384235,  2939.86520252,  3014.16728322,
        2428.23599071,  2641.08962181,  2713.25116329,  2636.0480359 ])