## Sendy Logistics Challenge

#### Predict the estimated time of arrival (ETA) for motorbike deliveries in Nairobi

**Task**: Predict the time, in seconds, from pickup to arrival for motorbike deliveries in Nairobi

**Approach**:

    - Linear Regression (quantitative response)
    - transit_time ~ orders + age + ?

In [115]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import missingno as msno

from lib.preprocessing import *

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [103]:
# Raw competition inputs: rider metadata, the submission template and the test set.
riders, sampsub, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/riders.csv",
        "data/sample_submission.csv",
        "data/test.csv",
    )
)

# Training sample, read from the database file passed as `sql_db` by the
# project helper `load_sample` (imported from lib.preprocessing).
clean = load_sample(sql_db="data/cleaned_alex_emily_nelson.db")

In [104]:
# Remove "Vehicle Type" before modelling. Rebinding `clean` (instead of mutating
# it with inplace=True) and tolerating an already-missing column keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
clean = clean.drop(columns=["Vehicle Type"], errors="ignore")

In [105]:
def get_seconds_from_dt_series(series: pd.Series) -> pd.Series:
    """Convert a series of time-like values to seconds elapsed since midnight.

    Parameters
    ----------
    series : pd.Series
        Values that ``pd.to_datetime`` can parse.

    Returns
    -------
    pd.Series
        ``hour * 3600 + minute * 60 + second`` for every element, on the same
        index as ``series``. Only the time-of-day fields are used; any date
        component of the parsed value is ignored.
    """
    # Parse once and reuse the datetime accessor: converting the series is the
    # expensive step, so it should not be repeated for each field.
    parts = pd.to_datetime(series).dt
    return parts.hour * 3600 + parts.minute * 60 + parts.second
    
# Re-express each order-stage timestamp column as seconds since midnight.
for time_col in (
    "Placement - Time",
    "Confirmation - Time",
    "Arrival at Pickup - Time",
    "Pickup - Time",
):
    clean[time_col] = get_seconds_from_dt_series(clean[time_col])

# One indicator column per distinct value of "Personal or Business".
clean_one = pd.get_dummies(clean["Personal or Business"])

In [106]:
# Swap the categorical "Personal or Business" column for its indicator columns,
# keep only the "Business" indicator, and give it the original column name.
clean = (
    pd.concat([clean_one, clean], axis=1)
    .drop(columns=["Personal or Business", "Personal"])
    .rename(columns={"Business": "Personal or Business"})
)
clean

Unnamed: 0,Personal or Business,index,order_no,user_id,Platform Type,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Time,Arrival at Pickup - Time,...,Destination Lat,Destination Long,rider_id,Time from Pickup to Arrival,Fulfillment - Weekday (Su = 0),Fulfillment - Day of Month,orders,age,average_rating,number_rating
0,1,5613,24316,346,3,15,1,56928,57012,57542,...,-1.288780,36.816831,709,1188,1,15,1023,294,14.2,145
1,1,1757,13424,299,3,14,5,60564,60598,63282,...,-1.346562,36.767784,139,7,5,14,676,153,14.4,66
2,1,886,27662,1500,3,3,3,42824,43002,44419,...,-1.263818,36.793006,155,4843,3,3,1023,242,12.5,114
3,1,3493,7052,1459,3,4,2,56785,56831,57126,...,-1.342955,36.726256,848,2124,2,4,1043,771,14.1,76
4,1,18572,14825,909,3,9,2,39683,39892,41007,...,-1.288013,36.830810,660,2675,2,9,1062,573,13.2,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2115,1,6307,8619,1162,2,9,2,32652,32842,33694,...,-1.266126,36.803036,329,1523,2,9,4890,1103,14.1,1011
2116,1,9582,17604,2801,3,25,1,50577,50600,51180,...,-1.259542,36.787118,466,1355,1,25,2147,1007,14.1,311
2117,1,20157,18170,203,3,30,3,35849,35940,37585,...,-1.296310,36.768822,774,1296,3,30,3178,830,13.5,759
2118,1,18643,19619,3283,3,5,6,57043,57083,57503,...,-1.262847,36.781805,178,470,6,5,2684,788,14.2,343


In [111]:
# Standardise every predictor, i.e. all columns except the response
# 'Time from Pickup to Arrival'.
# NOTE(review): the scaler is fitted on the full frame, so the cross-validated
# feature selection further down sees statistics from its held-out folds;
# consider moving the scaler into a Pipeline -- confirm before relying on CV scores.
scaler = StandardScaler()
X = clean.drop(columns=['Time from Pickup to Arrival'])
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


Unnamed: 0,Personal or Business,index,order_no,user_id,Platform Type,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Time,Arrival at Pickup - Time,...,Pickup Long,Destination Lat,Destination Long,rider_id,Fulfillment - Weekday (Su = 0),Fulfillment - Day of Month,orders,age,average_rating,number_rating
0,0.468821,-0.797197,1.205166,-1.279197,0.396333,-0.066159,-1.394862,1.157223,1.113755,1.086354,...,1.822436,-0.184145,0.119552,0.855206,-1.371691,-0.066271,-0.463323,-1.070681,0.338083,-0.504150
1,0.468821,-1.419976,-0.118262,-1.320244,0.396333,-0.181409,1.146700,1.558086,1.507570,1.716118,...,-0.098823,-1.829990,-1.003806,-1.181994,1.188155,-0.181525,-0.671806,-1.283101,0.544695,-0.691269
2,0.468821,-1.560651,1.611720,-0.271354,0.396333,-1.449148,-0.124081,-0.397722,-0.424822,-0.353437,...,0.905367,0.526848,-0.426136,-1.124809,-0.091768,-1.449322,-0.463323,-1.149020,-1.418118,-0.577577
3,0.468821,-1.139597,-0.892489,-0.307161,0.396333,-1.333899,-0.759471,1.141457,1.093878,1.040712,...,-0.692459,-1.727253,-1.954942,1.351996,-0.731729,-1.334068,-0.451306,-0.352067,0.234777,-0.667583
4,0.468821,1.295799,0.051966,-0.787502,0.396333,-0.757654,-0.759471,-0.744013,-0.766362,-0.727785,...,-0.052929,-0.162295,0.439709,0.680078,-0.731729,-0.757796,-0.439891,-0.650359,-0.694976,-0.610737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2115,0.468821,-0.685110,-0.702092,-0.566545,-1.198025,-0.757654,-0.759471,-1.519170,-1.540592,-1.530131,...,-0.425095,0.461111,-0.196409,-0.502927,-0.731729,-0.757796,1.860029,0.148100,0.234777,1.547054
2116,0.468821,-0.156168,0.389627,0.864871,0.396333,1.086331,-1.394862,0.457034,0.409590,0.388346,...,1.559117,0.648652,-0.560989,-0.013285,-1.371691,1.086271,0.211993,0.003474,0.234777,-0.110963
2117,0.468821,1.551791,0.458399,-1.404086,0.396333,1.662576,-0.124081,-1.166705,-1.200370,-1.103230,...,-1.466217,-0.398617,-0.980026,1.087518,-0.091768,1.662542,0.831434,-0.263182,-0.385058,0.950168
2118,0.468821,1.307266,0.634459,1.285825,0.396333,-1.218650,1.782090,1.169901,1.121553,1.082075,...,0.162945,0.554511,-0.682671,-1.042606,1.828116,-1.218813,0.534631,-0.326456,0.338083,-0.035168


In [109]:
# Ordinary least squares on the standardised predictors; `fit` returns the
# estimator itself, so construction and fitting can be chained.
olsmod = LinearRegression().fit(X, clean["Time from Pickup to Arrival"])

# Coefficients labelled by feature name, smallest (most negative) first.
pd.Series(data=olsmod.coef_, index=X.columns).sort_values(ascending=True)

Placement - Day of Month         -1652.185083
Pickup - Time                    -1646.124052
Placement - Time                  -495.059782
Confirmation - Time               -336.687968
orders                            -121.794855
Placement - Weekday (Mo = 1)       -78.666580
Personal or Business               -77.525707
Pickup Long                        -30.111285
average_rating                     -28.220765
rider_id                           -26.862387
Destination Long                   -24.435464
Destination Lat                     -9.080020
index                                3.653610
Pickup Lat                           6.551985
order_no                             9.394909
user_id                             13.093145
age                                 40.725740
Platform Type                       56.355962
Temperature                         57.206720
number_rating                       73.590982
Fulfillment - Weekday (Su = 0)      85.685715
Distance (KM)                     

In [116]:
# Recursive feature elimination with 5-fold cross-validation, scored by
# negative mean squared error, using the linear model as the estimator.
rfe = RFECV(estimator=olsmod, cv=5, scoring="neg_mean_squared_error").fit(
    X, clean["Time from Pickup to Arrival"]
)
print('Selected', rfe.n_features_, 'features:')
# Names of the columns the selector kept.
X.columns[rfe.get_support()]

Selected 20 features:


Index(['Personal or Business', 'user_id', 'Platform Type',
       'Placement - Day of Month', 'Placement - Weekday (Mo = 1)',
       'Placement - Time', 'Confirmation - Time', 'Arrival at Pickup - Time',
       'Pickup - Time', 'Distance (KM)', 'Temperature', 'Pickup Long',
       'Destination Long', 'rider_id', 'Fulfillment - Weekday (Su = 0)',
       'Fulfillment - Day of Month', 'orders', 'age', 'average_rating',
       'number_rating'],
      dtype='object')