### Import libraries

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import Utilities

warnings.filterwarnings('ignore')

### Read in data

In [2]:
# Location of the historical orders CSV. Set the DOORDASH_DATA_PATH environment
# variable to point at your own copy of the file; when it is unset, the
# original local path is used, so existing setups keep working unchanged.
DATA_PATH = os.environ.get('DOORDASH_DATA_PATH', r'D:\Learn\DoorDash\historical_data.csv')
Train = pd.read_csv(DATA_PATH)

In [3]:
# Inspect the dtype pandas inferred for each column. The timestamp columns
# (created_at, actual_delivery_time) are loaded as plain `object`.
Train.dtypes

market_id                                       float64
created_at                                       object
actual_delivery_time                             object
store_id                                         object
store_primary_category                           object
order_protocol                                  float64
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
dtype: object

In [4]:
# (rows, columns) of the raw frame.
Train.shape

(197428, 16)

In [5]:
# Preview the first five rows of the raw data.
Train.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,df263d996281d984952c07998dc54358,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,f0ade77b43923b38237db569b016ba25,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,f0ade77b43923b38237db569b016ba25,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,f0ade77b43923b38237db569b016ba25,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,f0ade77b43923b38237db569b016ba25,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


### Descriptive statistics

#### Summary Table for continuous features

In [7]:
# First element of the helper's result: the summary of the numeric columns
# (describe-style statistics plus missing / pct_missing / unique / DataType).
Utilities.create_summary_table(Train)[0]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing,pct_missing,unique,DataType
market_id,196441.0,2.978706,1.524867,1.0,2.0,3.0,4.0,6.0,987,0.005024,7,float64
order_protocol,196433.0,2.882352,1.503771,1.0,1.0,3.0,4.0,7.0,995,0.005065,8,float64
total_items,197428.0,3.196391,2.666546,1.0,2.0,3.0,4.0,411.0,0,0.0,57,int64
subtotal,197428.0,2682.331402,1823.093688,0.0,1400.0,2200.0,3395.0,27100.0,0,0.0,8368,int64
num_distinct_items,197428.0,2.670791,1.630255,1.0,1.0,2.0,3.0,20.0,0,0.0,20,int64
min_item_price,197428.0,686.21847,522.038648,-86.0,299.0,595.0,949.0,14700.0,0,0.0,2312,int64
max_item_price,197428.0,1159.58863,558.411377,0.0,800.0,1095.0,1395.0,14700.0,0,0.0,2652,int64
total_onshift_dashers,181166.0,44.808093,34.526783,-4.0,17.0,37.0,65.0,171.0,16262,0.089763,173,float64
total_busy_dashers,181166.0,41.739747,32.145733,-5.0,15.0,34.0,62.0,154.0,16262,0.089763,160,float64
total_outstanding_orders,181166.0,58.050065,52.66183,-6.0,17.0,41.0,85.0,285.0,16262,0.089763,282,float64


#### Summary Table for categorical features

In [8]:
# Second element of the helper's result: missing / unique / DataType for the
# object-typed columns.
Utilities.create_summary_table(Train)[1]

Unnamed: 0,missing,unique,DataType
created_at,0,180985,object
actual_delivery_time,7,178111,object
store_id,0,6743,object
store_primary_category,4760,75,object


### Process Data

In [None]:
# Build the labelled frame from the raw data.
# NOTE(review): create_target presumably adds the `duration` column that the
# cells below rely on — confirm in Utilities.
Train_with_label = Utilities.create_target(Train)

In [None]:
# Preview the frame returned by create_target.
Train_with_label.head()

In [None]:
# NOTE(review): create_time_feature presumably adds the created_at_* columns
# (hour, dayOfWeek, isWeekend, isHoliday, month) that the EDA cells group on —
# confirm in Utilities.
Train_with_label_time = Utilities.create_time_feature(Train_with_label)

In [None]:
# Preview the frame returned by create_time_feature.
Train_with_label_time.head()

In [None]:
# Outlier check on the target: how many orders have a `duration` (seconds)
# longer than two hours?
dur = Train_with_label_time['duration'].tolist()
dur2 = [seconds / 3600 for seconds in dur]  # same values expressed in hours
sum(1 for hours in dur2 if hours > 2)

### EDA plots

In [None]:
# Keep only orders whose duration is at most 7200 seconds (two hours); the
# original author noted that this removes 1090 records.
within_two_hours = Train_with_label_time['duration'] <= 7200
Train_with_label_time = Train_with_label_time.loc[within_two_hours]


def _mean_duration_by(column):
    """Return a frame with the mean `duration` per group of `column`."""
    grouped = Train_with_label_time.groupby([column], as_index=False)
    return grouped['duration'].mean()


# Mean duration by the created_at_* time features.
duration_by_hour = _mean_duration_by('created_at_hour')
duration_by_dayOfWeek = _mean_duration_by('created_at_dayOfWeek')
duration_by_IsWeekend = _mean_duration_by('created_at_isWeekend')
duration_by_IsHoliday = _mean_duration_by('created_at_isHoliday')
duration_by_month = _mean_duration_by('created_at_month')

# Mean duration by order attributes.
duration_by_orderProcotal = _mean_duration_by('order_protocol')
duration_by_totalItems = _mean_duration_by('total_items')
duration_by_distinctItems = _mean_duration_by('num_distinct_items')


In [None]:
# Mean duration per created_at_hour group, handed to the plotting helper.
Utilities.create_plot(duration_by_hour, 'created_at_hour', 'duration')

In [None]:
# Mean duration per created_at_dayOfWeek group, handed to the plotting helper.
Utilities.create_plot(duration_by_dayOfWeek, 'created_at_dayOfWeek', 'duration')

In [None]:
# Mean duration per created_at_isWeekend group, handed to the plotting helper.
Utilities.create_plot(duration_by_IsWeekend, 'created_at_isWeekend', 'duration')

In [None]:
# Mean duration per created_at_isHoliday group, handed to the plotting helper.
Utilities.create_plot(duration_by_IsHoliday, 'created_at_isHoliday', 'duration')

In [None]:
# Mean duration per created_at_month group, handed to the plotting helper.
Utilities.create_plot(duration_by_month, 'created_at_month', 'duration')

In [None]:
# Mean duration per order_protocol group, handed to the plotting helper.
Utilities.create_plot(duration_by_orderProcotal, 'order_protocol', 'duration')

In [None]:
# total_items == 411 (the maximum of that column) is treated as an outlier,
# so only the groups below it are passed to the plotting helper.
items_below_outlier = duration_by_totalItems.loc[duration_by_totalItems['total_items'] < 411]
Utilities.create_plot(items_below_outlier, 'total_items', 'duration')

In [None]:
# Mean duration per num_distinct_items group, handed to the plotting helper.
Utilities.create_plot(duration_by_distinctItems, 'num_distinct_items', 'duration')