In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder, scale, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import math 

from sklearn.ensemble import RandomForestRegressor
from sklearn import ensemble

from sklearn.externals import joblib

import json

import warnings
warnings.filterwarnings('ignore')

In [2]:
def create_time_feature(df):
    """Derive calendar features from the ``created_at`` column.

    The passed frame is modified in place and also returned. Columns added,
    in this order: ``created_at_datetime``, then ``created_at_`` + year,
    month, day, date, dayOfWeek, time, hour, minute, second, then the flags
    ``created_at_isWeekend`` and ``created_at_isHoliday``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``created_at`` column castable to ``datetime64[s]``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df['created_at_datetime'] = df['created_at'].astype("datetime64[s]")
    stamp = df['created_at_datetime'].dt

    # (column suffix, ``.dt`` attribute) pairs, listed in output column order.
    parts = (
        ('year', 'year'),
        ('month', 'month'),
        ('day', 'day'),
        ('date', 'date'),
        ('dayOfWeek', 'dayofweek'),
        ('time', 'time'),
        ('hour', 'hour'),
        ('minute', 'minute'),
        ('second', 'second'),
    )
    for suffix, attr in parts:
        df['created_at_' + suffix] = getattr(stamp, attr)

    # dayofweek is 0 (Monday) .. 6 (Sunday); rows matching neither list keep NaN.
    weekday = df['created_at_dayOfWeek']
    df.loc[weekday.isin([5, 6]), 'created_at_isWeekend'] = 1
    df.loc[weekday.isin([0, 1, 2, 3, 4]), 'created_at_isWeekend'] = 0

    # US federal holidays falling inside the observed date range.
    first_day = df['created_at_date'].min()
    last_day = df['created_at_date'].max()
    federal_holidays = calendar().holidays(start=first_day, end=last_day)
    on_holiday = df.created_at_datetime.dt.normalize().isin(federal_holidays)
    df['created_at_isHoliday'] = np.where(on_holiday, 1, 0)

    return df

In [3]:
def process_continuous_features(df):
    """Impute missing continuous features and bin the order-place duration.

    The passed frame is modified in place and also returned.

    * ``total_onshift_dashers``, ``total_busy_dashers``,
      ``total_outstanding_orders`` and
      ``estimated_store_to_consumer_driving_duration`` have their missing
      values replaced by the column mean truncated to an integer. A column
      with no observed values (mean is NaN) is left untouched instead of
      raising on ``int(nan)``.
    * ``estimated_order_place_duration_rebinned`` is added with the label
      ``'fast'`` (value 251), ``'slow'`` (value 446) or ``'other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the five columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """

    def bin_num(x, a=251, b=446):
        # Only the two exact values a and b get their own label.
        if x == a:
            return 'fast'
        if x == b:
            return 'slow'
        return 'other'

    mean_imputed = (
        'total_onshift_dashers',
        'total_busy_dashers',
        'total_outstanding_orders',
        'estimated_store_to_consumer_driving_duration',
    )
    for col in mean_imputed:
        col_mean = df[col].mean()
        # The mean is NaN for an empty or all-missing column; nothing to impute with.
        if pd.notnull(col_mean):
            df[col] = df[col].fillna(int(col_mean))

    df['estimated_order_place_duration_rebinned'] = df['estimated_order_place_duration'].apply(bin_num)

    return df

In [4]:
# calculate number of orders for each store
def make_store_id_cont(df):
    """Attach per-store row counts and a volume-binned version of ``store_id``.

    Adds ``store_id_count`` (number of rows in ``df`` sharing the row's
    ``store_id``) and ``store_id_rebinned``: the original ``store_id`` for
    stores with 500 or more rows, otherwise one of the labels
    ``'[400, 500)'``, ``'[200, 400)'``, ``'[50, 200)'``, ``'[0, 50)'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``store_id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of a left merge, so the index is reset);
        the input frame is not given the new columns.
    """
    # rename_axis/reset_index(name=...) yields the same two columns regardless
    # of how the installed pandas names the value_counts() result.
    store_counts_df = (
        df['store_id']
        .value_counts()
        .rename_axis('store_id')
        .reset_index(name='store_id_count')
    )
    df = pd.merge(df, store_counts_df, on='store_id', how='left')

    # Object dtype so string bin labels can sit next to raw store ids.
    df['store_id_rebinned'] = df['store_id'].astype(object)

    counts = df['store_id_count']
    volume_bins = [
        ((counts < 500) & (counts >= 400), '[400, 500)'),
        ((counts < 400) & (counts >= 200), '[200, 400)'),
        ((counts < 200) & (counts >= 50), '[50, 200)'),
        (counts < 50, '[0, 50)'),
    ]
    # Single-step .loc writes: chained `df[col][mask] = ...` is not guaranteed
    # to reach the frame and does nothing under pandas Copy-on-Write.
    for in_bin, label in volume_bins:
        df.loc[in_bin, 'store_id_rebinned'] = label

    return df

In [5]:
def bin_store_category(df):
    """Fill missing store categories and bin rare categories by frequency.

    Missing ``store_primary_category`` values are replaced by ``'Unknown'``
    on the passed frame. The returned frame additionally carries
    ``store_primary_category_count`` (rows in ``df`` sharing the category) and
    ``store_primary_category_rebinned``: the category itself when it has 3000
    or more rows, otherwise one of ``'[2000, 3000)'``, ``'[1000, 2000)'``,
    ``'[200, 1000)'``, ``'[0, 200)'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``store_primary_category`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame produced by a left merge (index is reset).
    """
    # Cast to object first so 'Unknown' can be written even into an all-NaN column.
    category = df['store_primary_category'].astype(object)
    df['store_primary_category'] = category.where(category.notnull(), 'Unknown')

    category_counts = (
        df['store_primary_category']
        .value_counts()
        .rename_axis('store_primary_category')
        .reset_index(name='store_primary_category_count')
    )
    df = pd.merge(df, category_counts, on='store_primary_category', how='left')

    df['store_primary_category_rebinned'] = df['store_primary_category']

    counts = df['store_primary_category_count']
    frequency_bins = [
        ((counts < 3000) & (counts >= 2000), '[2000, 3000)'),
        ((counts < 2000) & (counts >= 1000), '[1000, 2000)'),
        ((counts < 1000) & (counts >= 200), '[200, 1000)'),
        (counts < 200, '[0, 200)'),
    ]
    # Single-step .loc writes: chained `df[col][mask] = ...` is not guaranteed
    # to reach the frame and does nothing under pandas Copy-on-Write.
    for in_bin, label in frequency_bins:
        df.loc[in_bin, 'store_primary_category_rebinned'] = label

    return df

In [76]:
def impute_market_id(df):
    """Replace missing ``market_id`` values with 0.

    The passed frame is modified in place and also returned. The write is a
    single ``.loc`` step because chained ``df[col][mask] = ...`` assignment is
    not guaranteed to reach the frame (and does nothing under pandas
    Copy-on-Write).
    """
    df.loc[df['market_id'].isnull(), 'market_id'] = 0
    return df

In [77]:
def bin_order_protocol(df):
    """Collapse missing ``order_protocol`` values and protocols 6 and 7 into 0.

    The passed frame is modified in place and also returned. The write is a
    single ``.loc`` step because chained ``df[col].loc[mask] = ...`` assignment
    is not guaranteed to reach the frame (and does nothing under pandas
    Copy-on-Write).
    """
    protocol = df['order_protocol']
    to_zero = protocol.isnull() | protocol.isin([6, 7])
    df.loc[to_zero, 'order_protocol'] = 0
    return df

In [78]:
def select_features(df,TrainOrScore):
    """Pick the model columns and report which are numeric vs categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully engineered frame (must contain every column listed below).
    TrainOrScore : str
        ``'Train'`` additionally keeps the ``duration`` target as the first
        column; any other value selects the feature columns only.

    Returns
    -------
    tuple
        ``(features, numeric_names, categorical_names)`` where ``features`` is
        the selected sub-frame with the categorical columns cast to object.
    """
    NumFeatures = ['total_items', 'subtotal', 'num_distinct_items', 'min_item_price',
                   'max_item_price', 'total_onshift_dashers', 'total_busy_dashers',
                   'total_outstanding_orders', 'estimated_store_to_consumer_driving_duration']
    CatFeatures = ['market_id', 'store_id_rebinned', 'store_primary_category_rebinned',
                   'order_protocol', 'created_at_month', 'created_at_dayOfWeek',
                   'created_at_hour', 'created_at_isWeekend', 'created_at_isHoliday',
                   'estimated_order_place_duration_rebinned']

    # Output column order: four id-like categoricals, the numeric block, then
    # the time-derived and binned categoricals.
    ordered = CatFeatures[:4] + NumFeatures + CatFeatures[4:]
    if TrainOrScore == 'Train':
        ordered = ['duration'] + ordered

    TrainFeatures = df[ordered]
    TrainFeatures[CatFeatures] = TrainFeatures[CatFeatures].astype(object)
    return TrainFeatures, NumFeatures, CatFeatures

In [79]:
def scale_oneHot_X(input_x, features_num, features_cat):
    """Standardise numeric columns and one-hot encode categorical columns.

    Parameters
    ----------
    input_x : pandas.DataFrame
        Frame holding every column named in the two lists.
    features_num : list of str
        Columns passed through ``sklearn.preprocessing.scale``.
    features_cat : list of str
        Columns label-encoded column by column, then one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric block followed by the one-hot block, concatenated
        column-wise. Both blocks are built from bare arrays, so column labels
        are each block's own positional integers and the index is positional.

    Notes
    -----
    The scaler and both encoders are fitted on ``input_x`` itself on every
    call; nothing fitted is returned or stored.
    """
    # Numeric part: sklearn's scale() applied to the selected columns.
    numeric_block = pd.DataFrame(scale(input_x[features_num]))

    # Categorical part: integer codes per column, then one indicator per code.
    label_encoder = LabelEncoder()
    cat_codes = input_x[features_cat].apply(label_encoder.fit_transform)
    onehot_block = pd.DataFrame(OneHotEncoder().fit_transform(cat_codes).toarray())

    return pd.concat([numeric_block, onehot_block], axis=1)

In [80]:
def make_prediction(model, data):
    """Load a persisted model with ``joblib`` and return its predictions.

    Parameters
    ----------
    model : str
        Path handed to ``joblib.load``.
    data : array-like
        Feature matrix passed to the loaded object's ``predict``.

    Returns
    -------
    Whatever the loaded model's ``predict`` returns.

    Notes
    -----
    NOTE(review): ``joblib`` is imported from ``sklearn.externals`` at the top
    of this file; confirm the installed scikit-learn still ships that module.
    ``joblib.load`` unpickles the file, so only load model files you trust.
    """
    return joblib.load(model).predict(data)

In [81]:
def load_unlabeled_data(input_file):
    """Read a JSON-lines file of deliveries into a DataFrame.

    Each non-blank line of ``input_file`` is parsed as one JSON object. The
    result has one row per object and exactly the 17 columns listed in
    ``columns`` below; any other keys in a record are ignored.

    Parameters
    ----------
    input_file : str
        Path to a UTF-8 text file with one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        Values are kept as parsed from JSON (no type coercion is done here).

    Raises
    ------
    KeyError
        If a record lacks one of the expected keys.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    columns = (
        'created_at',
        'delivery_id',
        'estimated_order_place_duration',
        'estimated_store_to_consumer_driving_duration',
        'market_id',
        'max_item_price',
        'min_item_price',
        'num_distinct_items',
        'order_protocol',
        'platform',
        'store_id',
        'store_primary_category',
        'subtotal',
        'total_busy_dashers',
        'total_items',
        'total_onshift_dashers',
        'total_outstanding_orders',
    )

    loaded_data = []
    # Explicit UTF-8: the platform default encoding may differ (e.g. on Windows).
    with open(input_file, encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines would make json.loads raise.
            if line.strip():
                loaded_data.append(json.loads(line))

    unlabled_df = pd.DataFrame(
        {col: [record[col] for record in loaded_data] for col in columns}
    )
    return unlabled_df

In [82]:
def create_target(df):
    """Build the regression target ``duration`` in seconds.

    Rows whose ``actual_delivery_time`` is missing are dropped. For the rest,
    ``duration`` is ``actual_delivery_time - created_at`` (both cast to
    ``datetime64[s]``) expressed as a float number of seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``created_at`` and ``actual_delivery_time`` columns.

    Returns
    -------
    pandas.DataFrame
        The retained rows (original index labels kept) plus ``duration``;
        the input frame is not given the new column.
    """
    delivered = df[df['actual_delivery_time'].notnull()]
    elapsed = (
        delivered['actual_delivery_time'].astype("datetime64[s]")
        - delivered['created_at'].astype("datetime64[s]")
    )
    # Dividing by a one-second timedelta turns the timedelta into plain seconds.
    return delivered.assign(duration=elapsed / np.timedelta64(1, 's'))

## Load the unlabeled data

In [83]:
# Scoring set: a JSON-lines file, parsed one object per line by load_unlabeled_data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
unlabeled_json = r'D:/Learn/DoorDash/data_to_predict.json'
unlabeled_df = load_unlabeled_data(unlabeled_json)

In [84]:
# Sanity check: (rows, columns) of the freshly loaded scoring set.
unlabeled_df.shape

(54778, 17)

In [85]:
# Columns that must be numeric for the feature pipeline. Values that cannot be
# parsed become NaN (errors='coerce') and are imputed by later steps.
# Each column is listed once: duplicate labels in a column-list assignment are
# redundant and fragile.
NUMERIC_COLUMNS = [
    'market_id', 'estimated_order_place_duration', 'estimated_store_to_consumer_driving_duration',
    'max_item_price', 'min_item_price', 'num_distinct_items', 'order_protocol',
    'subtotal', 'total_onshift_dashers', 'total_busy_dashers', 'total_items',
    'total_outstanding_orders',
]
unlabeled_df[NUMERIC_COLUMNS] = unlabeled_df[NUMERIC_COLUMNS].apply(pd.to_numeric, errors='coerce')


In [86]:
# Feature pipeline for the scoring set. The single-letter names are successive
# stages; several helpers modify the frame they receive, so `unlabeled_df`
# itself gains columns here. These names are rebound by the training pipeline
# in a later cell.
a1 = create_time_feature(unlabeled_df)
a = process_continuous_features(a1)
b = impute_market_id(a)
c = bin_store_category(b)
d = bin_order_protocol(c)
e = make_store_id_cont(d)
# Any value other than 'Train' selects the feature columns without `duration`.
unlabeled_ready = select_features(e, 'Test')[0]

In [87]:
# Sanity check: row count should match the loaded scoring set.
unlabeled_ready.shape

(54778, 19)

In [88]:
# Rebuild the same features for the labeled historical data.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
Train = pd.read_csv(r'D:\Learn\DoorDash\historical_data.csv')

# Same stages as the scoring pipeline, preceded by create_target. This rebinds
# a1, a, b, c, d and e, so from here on `e` refers to the training frame.
a0 = create_target(Train)
a1 = create_time_feature(a0)
a = process_continuous_features(a1)
b = impute_market_id(a)
c = bin_store_category(b)
d = bin_order_protocol(c)
e = make_store_id_cont(d)
Train_ready = select_features(e, 'Train')[0]
# Drop the target so the columns line up with unlabeled_ready before stacking.
del Train_ready['duration'] 

In [89]:
# Stack training rows first and scoring rows last, so that scaling and one-hot
# encoding below are fitted on both sets together.
train_plus_unlabel = pd.concat([Train_ready, unlabeled_ready], axis=0)

In [90]:
# One call; elements 1 and 2 of the returned tuple are the numeric and
# categorical column-name lists.
NumFeatures, CatFeatures = select_features(e, 'Test')[1:]

In [91]:
# Scale numeric columns and one-hot encode categorical ones over the stacked frame.
train_plus_unlabel_encoded = scale_oneHot_X(train_plus_unlabel, NumFeatures,CatFeatures)

In [92]:
# The scoring rows were stacked last, so they are the final len(unlabeled_ready) rows.
unlabeled_ready2 = train_plus_unlabel_encoded.tail(unlabeled_ready.shape[0])

In [93]:
# Predictions from the persisted linear model (file path is relative to the working directory).
lm_pred = make_prediction('linear_model_saved.pkl', unlabeled_ready2)

In [94]:
# Predictions from the persisted model in 'rf_model_saved.pkl' (presumably the random forest — confirm).
rf_pred = make_prediction('rf_model_saved.pkl', unlabeled_ready2)

In [95]:
# Predictions from the persisted model in 'gbm_model_saved.pkl'; these are the ones exported below.
gbm_pred = make_prediction('gbm_model_saved.pkl', unlabeled_ready2)

In [96]:
# All three prediction vectors should have one entry per scoring row.
print(*map(len, (lm_pred, rf_pred, gbm_pred)))

54778 54778 54778


In [97]:
# Peek at the first 20 linear-model predictions (the target `duration` is in seconds).
lm_pred[:20]

array([ 3820.3125 ,  3248.09375,  3368.25   ,  3070.96875,  2790.34375,
        4142.     ,  3298.09375,  2678.125  ,  2294.34375,  2667.71875,
        1797.40625,  2229.09375,  2923.25   ,  2820.78125,  2658.90625,
        2527.625  ,  2370.90625,  2262.71875,  2708.59375,  2937.03125])

In [98]:
# Peek at the first 20 predictions from the 'rf' model, for comparison.
rf_pred[:20]

array([ 3566.84448195,  2625.11019812,  3202.67299455,  2879.63629676,
        2744.99607565,  3057.91951435,  2879.63629676,  2528.64391073,
        2434.83114527,  2960.06055137,  2434.83114527,  2625.11019812,
        3057.91951435,  3057.91951435,  3041.92910632,  2672.85212705,
        2434.83114527,  2625.11019812,  2960.06055137,  3122.01565049])

In [99]:
# Peek at the first 20 predictions from the 'gbm' model, for comparison.
gbm_pred[:20]

array([ 3554.78663092,  3164.58918312,  3418.20169046,  3185.3913607 ,
        2972.63129221,  3718.89304481,  3539.30491931,  2764.68586825,
        2333.9713622 ,  2935.8126484 ,  2057.11700156,  2433.50642321,
        2839.21028669,  2852.16433479,  2758.2507588 ,  2804.78376313,
        2356.6763389 ,  2436.38133445,  2773.508449  ,  2918.31572916])

In [100]:
id = unlabeled_df['delivery_id'].tolist()

In [101]:
len(id)

54778

In [102]:
# Pair each delivery id with its 'gbm' prediction. `gbm_pred` is positional, so
# this relies on unlabeled_df still being in the row order used for prediction.
output_df = pd.DataFrame({'delivery_id': unlabeled_df['delivery_id'], 'predicted_delivery_seconds': gbm_pred})

In [103]:
# Write the submission file without the index column.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
output_df.to_csv(r'D:\Learn\DoorDash\predictions.csv', index=False)