In [88]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder, scale, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import math 

from sklearn.ensemble import RandomForestRegressor
from sklearn import ensemble

from sklearn.externals import joblib

import json

import warnings
warnings.filterwarnings('ignore')

In [43]:
def create_target(df):
    # drop those records that have missing actual delivery time
    df = df[pd.notnull(df['actual_delivery_time'])]
    #df['created_at_datetime'] = df['created_at'].astype("datetime64[s]")
    #df['actual_delivery_time_datetime'] = df['actual_delivery_time'].astype("datetime64[s]")
    df['duration'] = df['actual_delivery_time'].astype("datetime64[s]") - df['created_at'].astype("datetime64[s]")
    df['duration'] = df['duration'] / np.timedelta64(1, 's')
    return df

In [44]:
def create_time_feature(df):
    
    #create created_at_year, created_at_month, created_at_day, created_at_date, created_at_dayOfWeek, 
    #created_at_time, created_at_hour, created_at_minute, created_at_second, created_at_isWeekend,
    #created_at_isHoliday
    
    df['created_at_datetime'] = df['created_at'].astype("datetime64[s]")
    #df['actual_delivery_time_datetime'] = df['actual_delivery_time'].astype("datetime64[s]")

    df['created_at_year'], df['created_at_month'], df['created_at_day'], df['created_at_date'], df['created_at_dayOfWeek'], df['created_at_time'], df['created_at_hour'], df['created_at_minute'], df['created_at_second'] = df['created_at_datetime'].dt.year, df['created_at_datetime'].dt.month, df['created_at_datetime'].dt.day, df['created_at_datetime'].dt.date, df['created_at_datetime'].dt.dayofweek, df['created_at_datetime'].dt.time, df['created_at_datetime'].dt.hour, df['created_at_datetime'].dt.minute, df['created_at_datetime'].dt.second

    df.loc[df['created_at_dayOfWeek'].isin([5, 6]), 'created_at_isWeekend'] = 1
    df.loc[df['created_at_dayOfWeek'].isin([0, 1, 2, 3, 4]), 'created_at_isWeekend'] = 0

    cal = calendar()
    holidays = cal.holidays(start=df['created_at_date'].min(), end=df['created_at_date'].max())
    df['created_at_isHoliday'] = np.where(df.created_at_datetime.dt.normalize().isin(holidays), 1, 0)

    return df

In [81]:
def process_continuous_features(df):
    
    def bin_num(x, a=251, b=446):
        if x == a:
             return 'fast'
        elif x == b:
             return 'slow'
        else:
            return 'other'
    
    #df = df.loc[(df['total_items'] < 20)]
    #df = df.loc[df['subtotal'] < 12000]
    #df = df.loc[df['num_distinct_items'] < 16]
    #df = df.loc[(df['min_item_price'] > 0) & (df['min_item_price'] <= 5000)]
    #df = df.loc[(df['max_item_price'] > 0) & (df['max_item_price'] <= 5000)]
    
    df['total_items'][(df['total_items'] > 20)] = 20
    df['subtotal'][df['subtotal'] > 12000] = 12000
    df['num_distinct_items'][df['num_distinct_items'] > 16] = 16
    df['min_item_price'][(df['min_item_price'] < 0)] = 0
    df['min_item_price'][(df['min_item_price'] > 5000)] = 5000

    df['max_item_price'][(df['max_item_price'] < 0)] = 0
    df['max_item_price'][(df['max_item_price'] > 5000)] = 5000

    
    #df = df.loc[df['total_onshift_dashers'] > 0]
    df['total_onshift_dashers'][df['total_onshift_dashers'] < 0] = 0
    df['total_onshift_dashers'] = df['total_onshift_dashers'].fillna(int(df['total_onshift_dashers'].mean()))
    
    #df = df.loc[df['total_busy_dashers'] > 0]
    df['total_busy_dashers'][df['total_busy_dashers'] < 0] = 0
    df['total_busy_dashers'] = df['total_busy_dashers'].fillna(int(df['total_busy_dashers'].mean()))
    
    #df = df.loc[df['total_outstanding_orders'] > 0]
    df['total_outstanding_orders'][df['total_outstanding_orders'] < 0] = 0
    df['total_outstanding_orders'] = df['total_outstanding_orders'].fillna(int(df['total_outstanding_orders'].mean()))
    
    df['estimated_order_place_duration_rebinned'] =  df['estimated_order_place_duration'].apply(bin_num)
    df['estimated_store_to_consumer_driving_duration'] = df['estimated_store_to_consumer_driving_duration'].fillna(int(df['estimated_store_to_consumer_driving_duration'].mean()))

    return df

In [46]:
# calculate number of orders for each store
def make_store_id_cont(df):
    store_counts_df = pd.DataFrame(df['store_id'].value_counts().reset_index().rename(columns={'index': 'store_id', 0: 'store_id_count'}))
    store_counts_df.columns = ['store_id', 'store_id_count']
    store_counts_df = store_counts_df.sort_values(by='store_id', ascending=True)
    df = pd.merge(df, store_counts_df, on='store_id', how='left')
    df['store_id_rebinned'] = df['store_id']
    df['store_id_rebinned'][(df['store_id_count'] <500) & (df['store_id_count'] >= 400)] = '[400, 500)'
    df['store_id_rebinned'][(df['store_id_count'] <400) & (df['store_id_count'] >= 200)] = '[200, 400)'
    df['store_id_rebinned'][(df['store_id_count'] <200) & (df['store_id_count'] >= 50)] = '[50, 200)'
    df['store_id_rebinned'][df['store_id_count'] <50] = '[0, 50)'

    return df

In [47]:
def bin_store_category(df):
    df['store_primary_category'][df['store_primary_category'].isnull()] = 'Unknown'
    
    store_primary_category_counts_df = pd.DataFrame(df['store_primary_category'].value_counts().reset_index().rename(columns={'index': 'store_primary_category', 0: 'store_primary_category_count'}))
    store_primary_category_counts_df.columns = ['store_primary_category', 'store_primary_category_count']
    df = pd.merge(df, store_primary_category_counts_df, on='store_primary_category', how='left')
    
    #lst_store_primary_category=df['store_primary_category'].tolist()
    #lst_store_primary_category_cnt = df['store_primary_category_count'].tolist()
    #lst_store_primary_category_bin = [lst_store_primary_category[i] if lst_store_primary_category_cnt[i] > 300 else "other2" for i in range(len(lst_store_primary_category)) ]   
    #df['store_primary_category_rebinned'] = lst_store_primary_category_bin
    
    df['store_primary_category_rebinned'] = df['store_primary_category']
    df['store_primary_category_rebinned'][df['store_primary_category_rebinned'].isnull()] = 'Unknown'
    df['store_primary_category_rebinned'][(df['store_primary_category_count'] <3000) & (df['store_primary_category_count'] >= 2000)] = '[2000, 3000)'
    df['store_primary_category_rebinned'][(df['store_primary_category_count'] <2000) & (df['store_primary_category_count'] >= 1000)] = '[1000, 2000)'
    df['store_primary_category_rebinned'][(df['store_primary_category_count'] <1000) & (df['store_primary_category_count'] >= 200)] = '[200, 1000)'
    df['store_primary_category_rebinned'][df['store_primary_category_count'] <200] = '[0, 200)'
    return df

In [48]:
def impute_market_id(df):
    df['market_id'][df['market_id'].isnull()] = 0
    return df

In [49]:
def bin_order_protocol(df):
    df['order_protocol'][df['order_protocol'].isnull()] = 0
    df['order_protocol'].loc[df['order_protocol'] == 6] = 0
    df['order_protocol'].loc[df['order_protocol'] == 7] = 0
    return df

In [50]:
def select_features(df,TrainOrScore):
    if TrainOrScore == 'Train':
        TrainFeatures = df[['duration', 'market_id', 'store_id_rebinned', 'store_primary_category_rebinned',
                            'order_protocol',  'total_items', 'subtotal', 'num_distinct_items', 'min_item_price',
                            'max_item_price','total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders',
                            'estimated_store_to_consumer_driving_duration', 'created_at_month', 'created_at_dayOfWeek',
                            'created_at_hour', 'created_at_isWeekend', 'created_at_isHoliday',
                            'estimated_order_place_duration_rebinned']]
    else:
        TrainFeatures = df[['market_id', 'store_id_rebinned', 'store_primary_category_rebinned',
                    'order_protocol',  'total_items', 'subtotal', 'num_distinct_items', 'min_item_price',
                    'max_item_price','total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders',
                    'estimated_store_to_consumer_driving_duration', 'created_at_month', 'created_at_dayOfWeek',
                    'created_at_hour', 'created_at_isWeekend', 'created_at_isHoliday',
                    'estimated_order_place_duration_rebinned']]
        
    TrainFeatures[['market_id', 'store_id_rebinned', 'store_primary_category_rebinned', 'order_protocol', 'created_at_month', 'created_at_dayOfWeek', 'created_at_hour',  'created_at_isWeekend', 'created_at_isHoliday', 'estimated_order_place_duration_rebinned']] = TrainFeatures[['market_id', 'store_id_rebinned', 'store_primary_category_rebinned', 'order_protocol', 'created_at_month', 'created_at_dayOfWeek', 'created_at_hour',  'created_at_isWeekend', 'created_at_isHoliday', 'estimated_order_place_duration_rebinned']].astype(object)
    NumFeatures = ['total_items', 'subtotal', 'num_distinct_items', 'min_item_price',  'max_item_price','total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders', 'estimated_store_to_consumer_driving_duration']
    CatFeatures = ['market_id', 'store_id_rebinned', 'store_primary_category_rebinned',  'order_protocol', 'created_at_month', 'created_at_dayOfWeek', 'created_at_hour', 'created_at_isWeekend', 'created_at_isHoliday', 'estimated_order_place_duration_rebinned']
    return TrainFeatures, NumFeatures, CatFeatures

In [51]:
def scale_oneHot_X(input_x, features_num, features_cat):
    # scale numerical features
    input_x_scale = scale(input_x[features_num])
    
    # OneHot cat features
    le=LabelEncoder()
    enc = OneHotEncoder()
    
    Cat_Train = input_x[features_cat].apply(le.fit_transform)
    enc.fit(Cat_Train)
    input_x_oneHot = enc.transform(Cat_Train).toarray()
    
    output_x = pd.concat([pd.DataFrame(input_x_scale), pd.DataFrame(input_x_oneHot)], axis=1)

    return output_x 

In [89]:
def lin_model(X_train, Y_train, X_test, Y_test):
    lm = linear_model.LinearRegression()
    lm.fit(X_train, Y_train)
    lm_predict_train = lm.predict(X_train)
    lm_predict_test = lm.predict(X_test)
    print("Root mean squared error for train: %.2f" % math.sqrt(mean_squared_error(Y_train, lm_predict_train)))
    #Root mean squared error for train 896.74
    print("Root mean squared error for test: %.2f" % math.sqrt(mean_squared_error(Y_test, lm_predict_test)))
    #Root mean squared error for test: 893.58
    return lm

In [90]:
def rf_model(X_train, Y_train, X_test, Y_test):
    rf = RandomForestRegressor(max_depth=3, random_state=0)
    rf.fit(X_train, Y_train)
    rf_predict_train = rf.predict(X_train)
    rf_predict_test = rf.predict(X_test)
    print("Root mean squared error for train: %.2f" % math.sqrt(mean_squared_error(Y_train, rf_predict_train)))
    #Root mean squared error for train 952.91
    print("Root mean squared error for test: %.2f" % math.sqrt(mean_squared_error(Y_test, rf_predict_test)))
    #Root mean squared error for test: 949.88
    return rf

In [91]:
def gbm_model(X_train, Y_train, X_test, Y_test):
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'loss': 'ls'}
    gbm = ensemble.GradientBoostingRegressor(**params)
    gbm.fit(X_train, Y_train)
    gbm_predict_train = gbm.predict(X_train)
    gbm_predict_test = gbm.predict(X_test)
    print("Root mean squared error for train: %.2f" % math.sqrt(mean_squared_error(Y_train, gbm_predict_train)))
    #Root mean squared error for train:  863.21
    print("Root mean squared error for test: %.2f" % math.sqrt(mean_squared_error(Y_test, gbm_predict_test)))
    #Root mean squared error for test: 867.61
    return gbm

In [92]:
def make_prediction(model, data):
    model_loaded = joblib.load(model)
    pred = model_loaded.predict(data)
    return pred

## Read in Train

In [82]:
Train = pd.read_csv(r'D:\Learn\DoorDash\historical_data.csv')

a0 = create_target(Train)
a1 = create_time_feature(a0)
a = process_continuous_features(a1)
b = impute_market_id(a)
c = bin_store_category(b)
d = bin_order_protocol(c)
e = make_store_id_cont(d)
Train_processed = select_features(e, 'Train')[0]

In [97]:
Train_processed.to_csv(r'D:\Learn\DoorDash\Train_processed.csv', index=False)

In [83]:
target_in_train = Train_processed['duration']
del Train_processed['duration']

In [84]:
NumFeatures = select_features(e, 'Train')[1]
CatFeatures = select_features(e, 'Train')[2]

In [85]:
Train_ready_encoded = scale_oneHot_X(Train_processed, NumFeatures, CatFeatures)

In [None]:
#Train_ready = pd.concat([target_in_train, Train_ready_encoded])

In [67]:
#Train_ready_encoded.columns

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,
            ...
            82, 83, 84, 85, 86, 87, 88, 89, 90, 91],
           dtype='int64', length=101)

In [86]:
X_train, X_test, Y_train, Y_test= train_test_split(Train_ready_encoded, target_in_train, test_size=0.4, random_state=2017)

In [93]:
linear_model = lin_model(X_train, Y_train, X_test, Y_test)

Root mean squared error for train: 1373.49
Root mean squared error for test: 67061827560.87


In [94]:
rf_model = rf_model(X_train, Y_train, X_test, Y_test)

Root mean squared error for train: 1437.61
Root mean squared error for test: 30348.94


In [95]:
gbm_model = gbm_model(X_train, Y_train, X_test, Y_test)

KeyboardInterrupt: 

In [None]:
joblib.dump(linear_model, 'linear_model_saved.pkl')
joblib.dump(rf_model, 'rf_model_saved.pkl')
joblib.dump(gbm_model, 'gbm_model_saved.pkl')

In [None]:
lm_pred = make_prediction('linear_model_saved.pkl', combined_test)

In [None]:
rf_pred = make_prediction('rf_model_saved.pkl', combined_test)

In [None]:
gbm_pred = make_prediction('gbm_model_saved.pkl', combined_test)

In [None]:
print(len(lm_pred),len(rf_pred),len(gbm_pred))

In [None]:
lm_pred[:20]

In [None]:
rf_pred[:20]

In [None]:
gbm_pred[:20]