In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
import pandas_profiling as pp
import statsmodels.api as sm
from scipy import stats
from math import *
import datetime,os

In [2]:
def write_submission_file(prediction, filename,
                          path_to_sample=os.path.join('SampleSubmission.csv')):
    submission = pd.read_csv(path_to_sample, index_col='Order_No')
    
    submission['Time from Pickup to Arrival'] = prediction
    submission.to_csv(filename)

In [3]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [4]:
Train = pd.read_csv("Train.csv")
Test = pd.read_csv("Test.csv")
Rider = pd.read_csv("Riders.csv")
ors_train = pd.read_csv("ors_train.csv")
ors_test = pd.read_csv("ors_test.csv")

In [5]:
Train = pd.merge(Train, Rider, how='left', left_on='Rider Id', right_on='Rider Id', left_index=True)
Test = pd.merge(Test, Rider, how='left', left_on='Rider Id', right_on='Rider Id', left_index=True)

In [6]:
ors_test.dtypes

distance    float64
duration    float64
dtype: object

In [7]:
pp.ProfileReport(Train)



In [6]:
Train.shape

(21201, 33)

In [7]:
Train.columns

Index(['Order No', 'User Id', 'Vehicle Type', 'Platform Type',
       'Personal or Business', 'Placement - Day of Month',
       'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
       'Confirmation - Time', 'Arrival at Pickup - Day of Month',
       'Arrival at Pickup - Weekday (Mo = 1)', 'Arrival at Pickup - Time',
       'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)', 'Pickup - Time',
       'Arrival at Destination - Day of Month',
       'Arrival at Destination - Weekday (Mo = 1)',
       'Arrival at Destination - Time', 'Distance (KM)', 'Temperature',
       'Precipitation in millimeters', 'Pickup Lat', 'Pickup Long',
       'Destination Lat', 'Destination Long', 'Rider Id',
       'Time from Pickup to Arrival', 'No_Of_Orders', 'Age', 'Average_Rating',
       'No_of_Ratings'],
      dtype='object')

In [8]:
Train.dtypes

Order No                                      object
User Id                                       object
Vehicle Type                                  object
Platform Type                                  int64
Personal or Business                          object
Placement - Day of Month                       int64
Placement - Weekday (Mo = 1)                   int64
Placement - Time                              object
Confirmation - Day of Month                    int64
Confirmation - Weekday (Mo = 1)                int64
Confirmation - Time                           object
Arrival at Pickup - Day of Month               int64
Arrival at Pickup - Weekday (Mo = 1)           int64
Arrival at Pickup - Time                      object
Pickup - Day of Month                          int64
Pickup - Weekday (Mo = 1)                      int64
Pickup - Time                                 object
Arrival at Destination - Day of Month          int64
Arrival at Destination - Weekday (Mo = 1)     

In [9]:
ors_train.columns

Index(['distance', 'duration '], dtype='object')

In [11]:
Train["distance_ors"]= ors_train["distance"]
Train["duration_ors"]= ors_train["duration "]

Test["distance_ors"]= ors_test["distance"]
Test["duration_ors"]= ors_test["distance"]

In [12]:
set(Train.columns) - set(Test.columns)

{'Arrival at Destination - Day of Month',
 'Arrival at Destination - Time',
 'Arrival at Destination - Weekday (Mo = 1)',
 'Time from Pickup to Arrival'}

In [14]:
Train.head(1)

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings,distance_ors,duration_ors
27,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,-1.300406,36.829741,Rider_Id_432,745,1637,1309,13.8,549,18359.0,3781.2


In [15]:
col_x = ["Distance (KM)","Temperature","Placement - Time","Confirmation - Time","Pickup - Time","Arrival at Pickup - Day of Month","Arrival at Pickup - Weekday (Mo = 1)","No_Of_Orders","Age","Average_Rating","No_of_Ratings","distance_ors","duration_ors"]
col_x = sorted(col_x)

col_y = ["Time from Pickup to Arrival"]

Train = Train[col_x+col_y]
Test = Test[col_x]

In [16]:
Test.dtypes

Age                                       int64
Arrival at Pickup - Day of Month          int64
Arrival at Pickup - Weekday (Mo = 1)      int64
Average_Rating                          float64
Confirmation - Time                      object
Distance (KM)                             int64
No_Of_Orders                              int64
No_of_Ratings                             int64
Pickup - Time                            object
Placement - Time                         object
Temperature                             float64
distance_ors                            float64
duration_ors                            float64
dtype: object

In [17]:
def convert_time24(nom,col,data=Train):
  tab = []
  for i in col:
    s = i.split()
    terminaison = s[1]
    if terminaison == "PM":
      time = s[0].split(":")
      time = (int(time[0]) + 12) * 3600 + int(time[1]) * 60 + int(time[2])
      tab.append(time)
    else:
      time = s[0].split(":")
      time = (int(time[0]) + 12) * 3600 + int(time[1]) * 60 + int(time[2])
      tab.append(time)
  data[nom] = tab
  return True

In [18]:
# Train
convert_time24("Placement - Time",Train["Placement - Time"])
convert_time24("Confirmation - Time",Train["Confirmation - Time"])
convert_time24("Pickup - Time",Train["Pickup - Time"])

#Test
convert_time24("Placement - Time",Test["Placement - Time"],Test)
convert_time24("Confirmation - Time",Test["Confirmation - Time"],Test)
convert_time24("Pickup - Time",Test["Pickup - Time"],Test)

True

In [19]:
Train['Temperature'] = Train['Temperature'].fillna(Train['Temperature'].mean())
Test['Temperature'] = Test['Temperature'].fillna(Test['Temperature'].mean())

In [20]:
Train.dtypes

Age                                       int64
Arrival at Pickup - Day of Month          int64
Arrival at Pickup - Weekday (Mo = 1)      int64
Average_Rating                          float64
Confirmation - Time                       int64
Distance (KM)                             int64
No_Of_Orders                              int64
No_of_Ratings                             int64
Pickup - Time                             int64
Placement - Time                          int64
Temperature                             float64
distance_ors                            float64
duration_ors                            float64
Time from Pickup to Arrival               int64
dtype: object

In [21]:
Train.columns[Train.dtypes == 'object']

Index([], dtype='object')

In [22]:
#Train = pd.get_dummies(Train,dummies_col[0],prefix_sep="_")
#Test = pd.get_dummies(Test,dummies_col[0],prefix_sep="_")

NameError: name 'dummies_col' is not defined

In [21]:
Train.shape

(21201, 17)

In [24]:
dummies_col = ["Arrival at Pickup - Day of Month","Arrival at Pickup - Weekday (Mo = 1)"]

In [25]:
#def dummies_fct(Train,Test):
for c in dummies_col:
  dummy = pd.get_dummies(Train[c],prefix=c,prefix_sep="_")
  Train = pd.concat([Train,dummy], axis=1)
  del Train[c]

for c in dummies_col:
  dummy = pd.get_dummies(Test[c],prefix=c,prefix_sep="_")
  Test = pd.concat([Test,dummy], axis=1)
  del Test[c]
#return Train,Test

In [0]:
#dummies_fct(Train,Test)

In [26]:
Train.shape

(21201, 50)

In [27]:
Test.dtypes

Age                                         int64
Average_Rating                            float64
Confirmation - Time                         int64
Distance (KM)                               int64
No_Of_Orders                                int64
No_of_Ratings                               int64
Pickup - Time                               int64
Placement - Time                            int64
Temperature                               float64
distance_ors                              float64
duration_ors                              float64
Arrival at Pickup - Day of Month_1          uint8
Arrival at Pickup - Day of Month_2          uint8
Arrival at Pickup - Day of Month_3          uint8
Arrival at Pickup - Day of Month_4          uint8
Arrival at Pickup - Day of Month_5          uint8
Arrival at Pickup - Day of Month_6          uint8
Arrival at Pickup - Day of Month_7          uint8
Arrival at Pickup - Day of Month_8          uint8
Arrival at Pickup - Day of Month_9          uint8


In [28]:
for c in Train.columns[Train.dtypes == 'uint8']:
  Train[c] = Train[c].astype("int")
  Test[c] = Test[c].astype("int")

In [32]:
Train.dtypes

Age                                         int64
Average_Rating                            float64
Confirmation - Time                         int64
Distance (KM)                               int64
No_Of_Orders                                int64
No_of_Ratings                               int64
Pickup - Time                               int64
Placement - Time                            int64
Temperature                               float64
distance_ors                              float64
duration_ors                              float64
Time from Pickup to Arrival                 int64
Arrival at Pickup - Day of Month_1          int64
Arrival at Pickup - Day of Month_2          int64
Arrival at Pickup - Day of Month_3          int64
Arrival at Pickup - Day of Month_4          int64
Arrival at Pickup - Day of Month_5          int64
Arrival at Pickup - Day of Month_6          int64
Arrival at Pickup - Day of Month_7          int64
Arrival at Pickup - Day of Month_8          int64


In [30]:
Test.dtypes

Age                                         int64
Average_Rating                            float64
Confirmation - Time                         int64
Distance (KM)                               int64
No_Of_Orders                                int64
No_of_Ratings                               int64
Pickup - Time                               int64
Placement - Time                            int64
Temperature                               float64
distance_ors                              float64
duration_ors                              float64
Arrival at Pickup - Day of Month_1          int64
Arrival at Pickup - Day of Month_2          int64
Arrival at Pickup - Day of Month_3          int64
Arrival at Pickup - Day of Month_4          int64
Arrival at Pickup - Day of Month_5          int64
Arrival at Pickup - Day of Month_6          int64
Arrival at Pickup - Day of Month_7          int64
Arrival at Pickup - Day of Month_8          int64
Arrival at Pickup - Day of Month_9          int64


In [79]:
y_train = Train['Time from Pickup to Arrival']
c = list(set(Train.columns) - {"Time from Pickup to Arrival"})
X_train = Train[sorted(c)]
X_test = Test[sorted(c)]

In [80]:
len(Test.columns) == len(X_train.columns)

True

# Model quality check ridge

In [43]:
ridge = Ridge(alpha=0.01, normalize = True, random_state=42)

In [81]:
X_train_,X_test_,y_train_,y_test_ = train_test_split(X_train,y_train,test_size=0.25, random_state=42)

In [47]:
ridge.fit(X_train_, y_train_)
ridge_valid_pred = ridge.predict(X_test_)
#In case delivery time is forecasted negative
ridge_valid_pred[ridge_valid_pred < 0] = 0

In [50]:
#On the leaderboard 785.648
rmse(y_test_, ridge_valid_pred)

807.7403674711878

In [59]:
ridge.fit(X_train, y_train)
print("coefficient: ",ridge.coef_)
print("intercept: ",ridge.intercept_)
print("score:",ridge.score(X_train,y_train))

X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())
ridge_prediction = ridge.predict(X_test)

coefficient:  [ 4.43643119e-03 -4.38608122e+00  1.51794417e+01 -2.94061715e+01
  1.52678697e+01  1.87280078e+01  1.89556596e+01  6.89657521e+00
 -4.46841433e+01 -5.95144855e+01  4.14436330e+01  2.15271196e+01
 -1.63306607e+01  1.47565241e+01  1.73147801e+01 -6.75377595e+01
 -2.07559098e+01 -2.10642501e+01 -1.43538035e+01  2.80092025e+01
 -8.31684043e+01 -1.13843478e+01 -4.26781959e+01  6.97531913e-01
  4.71569148e+01 -3.93758810e+01  2.70554718e+01 -2.62556925e+01
 -3.84575854e+00  1.71042702e+01  6.48844622e+01  6.65446842e+01
 -7.07468012e+01  4.02625848e+01  1.88040997e+01  1.71692047e+01
  4.39683790e+01 -1.36912190e+02 -6.12484530e+01 -3.97441448e+01
 -2.45265341e-03  1.00276282e+02 -9.75125757e-02  2.67829217e-01
 -5.48114171e-04  1.97042977e-03  9.67140556e-01  1.66481481e-03
 -1.91124205e-02]
intercept:  1292.9637560761482
score: 0.3509055152581405
                                 OLS Regression Results                                
Dep. Variable:     Time from Pickup to Arri

In [52]:
len(ridge_prediction)

7068

In [53]:
write_submission_file(prediction=ridge_prediction, filename='ridge.csv')

# xgboost

In [58]:
!pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/c1/24/5fe7237b2eca13ee0cfb100bec8c23f4e69ce9df852a64b0493d49dae4e0/xgboost-0.90-py2.py3-none-manylinux1_x86_64.whl (142.8MB)
[K     |████████████████████████████████| 142.8MB 94kB/s eta 0:00:013
Installing collected packages: xgboost
Successfully installed xgboost-0.90


In [67]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [68]:
model_bost = xgb.XGBRegressor()
model_bost.fit(X_train_,y_train_)




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [72]:
model_bost.fit(X_train, y_train)
#print("coefficient: ",model_bost.coef_)
#print("intercept: ",model_bost.intercept_)
print("score:",model_bost.score(X_train,y_train))

X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())
y_pred_ = model_bost.predict(X_test)

#print(np.sqrt(mean_squared_error(y_test_,y_pred_)))

score: 0.4025838913799211
                                 OLS Regression Results                                
Dep. Variable:     Time from Pickup to Arrival   R-squared:                       0.351
Model:                                     OLS   Adj. R-squared:                  0.350
Method:                          Least Squares   F-statistic:                     243.4
Date:                         Tue, 12 Nov 2019   Prob (F-statistic):               0.00
Time:                                 22:39:09   Log-Likelihood:            -1.7168e+05
No. Observations:                        21201   AIC:                         3.435e+05
Df Residuals:                            21153   BIC:                         3.438e+05
Df Model:                                   47                                         
Covariance Type:                     nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025

In [82]:
len(y_pred_)

7068

In [34]:
Test.shape

(7068, 53)

In [0]:
y_pred = model.predict(X_test)

In [53]:
y_pred = pd.DataFrame({"y_predict":y_pred})
y_pred.head()

Unnamed: 0,y_predict
0,1442.286743
1,1340.821411
2,1241.733765
3,1256.041382
4,1206.420532


In [0]:
y_pred.shape

(7068, 1)

In [0]:
sample = pd.read_csv("SampleSubmission.csv")

In [0]:
sample["Time from Pickup to Arrival"] = y_pred

In [0]:
sample.head()

Unnamed: 0,Order_No,Time from Pickup to Arrival
0,Order_No_19248,1497.625854
1,Order_No_12736,1314.455566
2,Order_No_768,1237.3396
3,Order_No_15332,1227.804443
4,Order_No_21373,1209.96814


In [0]:
sample.to_csv('zindi_10112019_1518.csv',index=False)

In [0]:
# sumettre les deux derniers

# XGBRegressor

In [0]:
#max_depth=6, n_estimators=500,objective="reg:squarederror"

In [74]:
model = xgb.XGBRegressor(n_estimators=300,objective="reg:squarederror")
model.fit(X_train_,y_train_)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [75]:
print("score: ",model.score(X_train_,y_train_))
y_pred = model.predict(X_test_)

score:  0.46401744833574055


In [83]:
len(y_test_) == len(y_pred_)

False

In [85]:
print(len(y_test_))
print(len(y_pred_))

5301
7068


In [76]:
print(np.sqrt(mean_squared_error(y_test_,y_pred_)))

ValueError: Found input variables with inconsistent numbers of samples: [5301, 7068]

In [57]:
y_pred = pd.DataFrame({"y_predict":y_pred})
y_pred.head()

Unnamed: 0,y_predict
0,1819.744995
1,1446.433594
2,1647.509644
3,1195.289551
4,1023.753357


In [0]:
from lightgbm import LGBMRegressor

In [59]:
time_0 = datetime.datetime.now()

lgbm = LGBMRegressor(boosting_type="dart")

lgbm.fit(X_train, y_train)

time_1  = datetime.datetime.now()

print('{} seconds. Best iteration is {}'.format((time_1 - time_0).seconds, lgbm.best_iteration_))

1 seconds. Best iteration is None


In [60]:
lgbm.score(X_train,y_train)

0.42724904374879624

In [0]:
# y_pred pour Test

In [0]:
y_pred = lgbm.predict(X_test)

In [62]:
y_pred = pd.DataFrame({"y_predict":y_pred})
y_pred.head(10)

Unnamed: 0,y_predict
0,1389.017755
1,1086.153784
2,1099.355053
3,1098.723965
4,1047.351737
5,1862.626877
6,2061.411636
7,1417.724939
8,1403.761255
9,1910.315798


In [0]:
y_pred.shape

(7068, 1)

In [0]:
sample["Time from Pickup to Arrival"] = y_pred

In [0]:
sample.to_csv('zindi_11112019_0402.csv',index=False) #762.385182754135

In [0]:
from sklearn.linear_model import LinearRegression

In [0]:
lrm = LinearRegression().fit(X_train,y_train)
lrm.score(X_train,y_train)

0.35313414077039396

In [0]:
y_pred = lrm.predict(X_test)

In [0]:
y_pred = pd.DataFrame({"y_predict":y_pred})
y_pred.head(10)

Unnamed: 0,y_predict
0,1370.483258
1,1099.488429
2,1016.168812
3,1026.421259
4,1044.588263
5,2208.82218
6,2369.882163
7,1427.055335
8,1488.209432
9,2079.503834


In [0]:
from sklearn.ensemble import RandomForestRegressor

In [65]:
regressor = RandomForestRegressor(n_jobs=-1)
regressor.fit(X_train,y_train)
print(regressor.score(X_train,y_train))



0.8811550671061106


In [0]:
y_pred = regressor.predict(X_test)

In [0]:
y_pred = pd.DataFrame({"y_predict":y_pred})

Unnamed: 0,y_predict
0,2011.6
1,1961.2
2,2331.5
3,1818.4
4,1138.6


In [0]:
y_pred.shape

(7068, 1)

In [0]:
sample["Time from Pickup to Arrival"] = y_pred

In [0]:
sample.to_csv('zindi_10112019_1318.csv',index=False)

In [0]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVC
svc=SVC(probability=True, kernel='linear')

In [0]:
y_pred

array([2120.06845238, 2116.79590338, 1837.95959936, ..., 2213.63018681,
       2696.83738411, 2120.06845238])

In [0]:
#@title GradientBoostingRegressor { form-width: "250px" }
from sklearn.ensemble import GradientBoostingRegressor

In [0]:
regressor = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0
)
regressor.fit(X_train, y_train)

In [0]:
y_pred = regressor.predict(X_test)

In [0]:
"""**Imputations**"""

#del data["Precipitation in millimeters"]
#del data["Vehicle Type"]
data['Temperature'] = data['Temperature'].fillna(data['Temperature'].mean())


data2['Temperature'] = data2['Temperature'].fillna(data2['Temperature'].mean())

data.shape

(21201, 71)

In [0]:
def convert_time24(nom,col):
  tab = []
  for i in col:
    s = i.split()
    terminaison = s[1]
    if terminaison == "PM":
      time = s[0].split(":")
      time = (int(time[0]) + 12) * 3600 + int(time[1]) * 60 + int(time[2])
      tab.append(time)
    else:
      time = s[0].split(":")
      time = (int(time[0]) + 12) * 3600 + int(time[1]) * 60 + int(time[2])
      tab.append(time)
  data[nom] = tab
  return True
#exemple
#### convert_time24("Placement - Time",data["Placement - Time"])

In [0]:
convert_time24("Placement - Time",data["Placement - Time"])
convert_time24("Confirmation - Time",data["Confirmation - Time"])
convert_time24("Arrival at Pickup - Time",data["Arrival at Pickup - Time"])
convert_time24("Pickup - Time",data["Pickup - Time"])
convert_time24("Arrival at Destination - Time",data["Arrival at Destination - Time"])

True

In [0]:
data.dtypes

Order No                                      object
User Id                                       object
Platform Type                                  int64
Personal or Business                          object
Placement - Day of Month                       int64
Placement - Weekday (Mo = 1)                   int64
Placement - Time                               int64
Confirmation - Day of Month                    int64
Confirmation - Weekday (Mo = 1)                int64
Confirmation - Time                            int64
Arrival at Pickup - Day of Month               int64
Arrival at Pickup - Weekday (Mo = 1)           int64
Arrival at Pickup - Time                       int64
Pickup - Day of Month                          int64
Pickup - Weekday (Mo = 1)                      int64
Pickup - Time                                  int64
Arrival at Destination - Day of Month          int64
Arrival at Destination - Weekday (Mo = 1)      int64
Arrival at Destination - Time                 

In [0]:
data.describe()

Unnamed: 0,Platform Type,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),Confirmation - Time,Arrival at Pickup - Day of Month,Arrival at Pickup - Weekday (Mo = 1),Arrival at Pickup - Time,Pickup - Day of Month,Pickup - Weekday (Mo = 1),Pickup - Time,Arrival at Destination - Day of Month,Arrival at Destination - Weekday (Mo = 1),Arrival at Destination - Time,Distance (KM),Temperature,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Time from Pickup to Arrival
count,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0,21201.0
mean,2.752182,15.653696,3.240083,69275.2348,15.653837,3.240225,69023.818971,15.653837,3.240225,68628.697561,15.653837,3.240225,68268.483232,15.653837,3.240225,67335.408424,9.506533,23.258889,-1.28147,36.811264,-1.282581,36.81122,1556.920947
std,0.625178,8.798916,1.567295,14710.524863,8.798886,1.567228,14703.064869,8.798886,1.567228,14725.837143,8.798886,1.567228,14729.363769,8.798886,1.567228,14596.742398,5.668963,3.222006,0.030507,0.037473,0.034824,0.044721,987.270788
min,1.0,1.0,1.0,46801.0,1.0,1.0,46800.0,1.0,1.0,46803.0,1.0,1.0,46800.0,1.0,1.0,46800.0,1.0,11.2,-1.438302,36.653621,-1.430298,36.606594,1.0
25%,3.0,8.0,2.0,54367.0,8.0,2.0,54427.0,8.0,2.0,54427.0,8.0,2.0,54453.0,8.0,2.0,54351.0,5.0,21.4,-1.300921,36.784605,-1.301201,36.785661,882.0
50%,3.0,15.0,3.0,74742.0,15.0,3.0,74039.0,15.0,3.0,68536.0,15.0,3.0,63952.0,15.0,3.0,61995.0,8.0,23.258889,-1.279395,36.80704,-1.284382,36.808002,1369.0
75%,3.0,23.0,5.0,82968.0,23.0,5.0,82918.0,23.0,5.0,82890.0,23.0,5.0,82944.0,23.0,5.0,82691.0,13.0,25.3,-1.257147,36.829741,-1.261177,36.829477,2040.0
max,4.0,31.0,7.0,89999.0,31.0,7.0,89998.0,31.0,7.0,89998.0,31.0,7.0,89999.0,31.0,7.0,89999.0,49.0,32.1,-1.14717,36.991046,-1.030225,37.016779,7883.0


In [0]:
def histo(param,tit):
  plt.grid()
  plt.hist(param)
  plt.title("Hist of " + tit)

In [0]:
plt.subplot(1,2,1)
histo(data["Temperature"],"Temperature")
plt.subplot(1,2,2)
histo(data["Distance (KM)"],"Distance (KM)")

plt.subplot(1,2,1)
histo(data["Destination Long"],"Destination Long")
plt.subplot(1,2,2)
histo(data["Destination Lat"],"Destination Lat")

plt.subplot(1,2,1)
histo(data["Pickup Long"],"Pickup Long")
plt.subplot(1,2,2)
histo(data["Pickup Lat"],"Pickup Lat")

histo(data["Time from Pickup to Arrival"],"Time from Pickup to Arrival")

histo(data["Placement - Time"],"Placement - Time")
histo(data["Placement - Time"],"Placement - Time")
histo(data["Confirmation - Time"],"Confirmation - Time")
histo(data["Arrival at Pickup - Time"],"Arrival at Pickup - Time")

plt.subplot(1,3,1)
conf = data.groupby('Confirmation - Weekday (Mo = 1)')['Confirmation - Weekday (Mo = 1)'].count()
plt.bar(conf.index,conf.values)
plt.title("Hist of Confirmation")

plt.subplot(1,3,2)
conf = data.groupby('Placement - Weekday (Mo = 1)')['Placement - Weekday (Mo = 1)'].count()
plt.bar(conf.index,conf.values)
plt.title("Hist of Placement")

plt.subplot(1,3,3)
conf = data.groupby('Arrival at Pickup - Weekday (Mo = 1)')['Arrival at Pickup - Weekday (Mo = 1)'].count()
plt.bar(conf.index,conf.values)
plt.title("Hist of Arrival at Pickup")

plt.subplot(1,3,1)
conf1 = data.groupby('Confirmation - Day of Month')['Confirmation - Day of Month'].count()
plt.bar(conf1.index,conf1.values)
plt.title("Hist of Confirmation")

plt.subplot(1,3,2)
conf1 = data.groupby('Placement - Day of Month')['Placement - Day of Month'].count()
plt.bar(conf1.index,conf1.values)
plt.title("Hist of Placement")

plt.subplot(1,3,3)
conf1 = data.groupby('Arrival at Pickup - Day of Month')['Arrival at Pickup - Day of Month'].count()
plt.bar(conf1.index,conf1.values)
plt.title("Hist of Arrival at Pickup")

In [0]:
"""**stat desc bivariée**"""

# data["Distance (KM)"].corr(data["Time from Pickup to Arrival"])
# data[["Distance (KM)","Time from Pickup to Arrival"]].rcorr(stars=False)
# pg.corr(x=data["Distance (KM)"], y=data["Time from Pickup to Arrival"])

data[["Platform Type","Distance (KM)","Temperature","Pickup Lat","Pickup Long","Destination Lat","Destination Long","Placement - Time","Confirmation - Time","Arrival at Pickup - Time","Pickup - Time","Arrival at Destination - Time","Time from Pickup to Arrival"]].corr()

Unnamed: 0,Platform Type,Distance (KM),Temperature,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Placement - Time,Confirmation - Time,Arrival at Pickup - Time,Pickup - Time,Arrival at Destination - Time,Time from Pickup to Arrival
Platform Type,1.0,0.005528,0.211494,-0.002742,0.021327,0.006319,0.043924,-0.003403,-0.006287,-0.008741,-0.015789,-0.023077,-0.003827
Distance (KM),0.005528,1.0,-0.00209,-0.124338,0.178537,-0.079366,0.135397,0.026674,0.023085,0.021874,0.018861,-0.009344,0.580608
Temperature,0.211494,-0.00209,1.0,0.01595,0.007824,0.009236,0.023421,-0.27047,-0.281677,-0.289268,-0.291132,-0.296367,0.005772
Pickup Lat,-0.002742,-0.124338,0.01595,1.0,-0.147629,0.114975,0.051399,-0.022462,-0.023947,-0.02557,-0.02409,-0.021798,-0.053823
Pickup Long,0.021327,0.178537,0.007824,-0.147629,1.0,0.014992,0.082803,0.009371,0.00377,0.001545,-0.003455,-0.008282,0.060285
Destination Lat,0.006319,-0.079366,0.009236,0.114975,0.014992,1.0,0.046767,-2.2e-05,-0.004107,-0.00923,-0.012777,-0.007566,-0.061872
Destination Long,0.043924,0.135397,0.023421,0.051399,0.082803,0.046767,1.0,0.00382,0.003877,-0.000725,-0.001366,-0.015781,0.070425
Placement - Time,-0.003403,0.026674,-0.27047,-0.022462,0.009371,-2.2e-05,0.00382,1.0,0.937901,0.830836,0.734769,0.537607,0.00189
Confirmation - Time,-0.006287,0.023085,-0.281677,-0.023947,0.00377,-0.004107,0.003877,0.937901,1.0,0.888654,0.789126,0.583695,-0.005739
Arrival at Pickup - Time,-0.008741,0.021874,-0.289268,-0.02557,0.001545,-0.00923,-0.000725,0.830836,0.888654,1.0,0.895,0.676203,-0.010883


In [0]:
def npt(x,y,lx="",ly="Time from Pickup to Arrival",tit=""):
  plt.scatter(x,y)
  plt.xlabel(lx)
  plt.ylabel(ly)
  plt.title("Scatter of " + tit + " - Time from Pickup to Arrival")

In [0]:
npt(data["Platform Type"],data["Time from Pickup to Arrival"],"Platform Type","","platform type")

npt(data["Distance (KM)"],data["Time from Pickup to Arrival"],"Distance (KM)","","Distance (KM)")

npt(data["Temperature"],data["Time from Pickup to Arrival"],"Temperature","","Temperature")

npt(data["Destination Long"],data["Time from Pickup to Arrival"],"Destination Long","","Destination Long")

npt(data["Destination Lat"],data["Time from Pickup to Arrival"],"Destination Lat","","Destination Lat")

npt(data["Pickup Long"],data["Time from Pickup to Arrival"],"Pickup Long","","Pickup Long")

npt(data["Pickup Lat"],data["Time from Pickup to Arrival"],"Pickup Lat","","Pickup Lat")

In [0]:
"""**Decoupage de la dataset**"""

X = data[["Platform Type","Distance (KM)","Temperature","Pickup Lat","Pickup Long","Destination Lat","Destination Long","Placement - Time","Confirmation - Time","Arrival at Pickup - Time","Pickup - Time","Arrival at Destination - Time"]]
y = data["Time from Pickup to Arrival"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train.head()

Unnamed: 0,Platform Type,Distance (KM),Temperature,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Placement - Time,Confirmation - Time,Arrival at Pickup - Time,Pickup - Time,Arrival at Destination - Time
15568,3,13,18.2,-1.330929,36.835151,-1.257147,36.795063,75913,75939,77032,77739,79535
11862,1,6,26.6,-1.285397,36.818312,-1.281278,36.781033,49220,49722,50947,51908,52890
4488,3,12,18.0,-1.247525,36.881344,-1.272932,36.811562,75116,75142,75932,76708,77948
5144,3,5,24.4,-1.255189,36.782203,-1.271198,36.82402,55882,55899,55922,56112,57258
20311,2,8,19.9,-1.251921,36.828379,-1.251359,36.794767,79960,79996,80768,81606,82724


In [0]:
"""**Regression Lineaire Multiple**"""

def rl(X_train,y_train,X_test,y_test):
  regression = LinearRegression()
  regression.fit(X_train,y_train)

  y_pred = regression.predict(X_test)
  score_test = regression.score(X_test, y_test)

  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  
  return "score de prediction: " + str(score_test), "rmse: " + str(rmse)

In [0]:
#score = regression.score(X_test, y_test)
rl(X_train,y_train,X_test,y_test)

('score de prediction: 0.3349131991758846', 'rmse: 810.1809365103786')

In [0]:
dummy_platform_type = pd.get_dummies(data["Platform Type"])
dummy_platform_type.columns = ["Platform Type1","Platform Type2","Platform Type3","Platform Type4"]

data = pd.concat([data,dummy_platform_type], axis=1)
del data["Platform Type"]
data.shape

(21201, 30)

In [0]:
X = data[["Distance (KM)","Temperature","Pickup Lat","Pickup Long","Destination Lat","Destination Long","Platform Type1","Platform Type2","Platform Type3","Platform Type4","Placement - Time","Confirmation - Time","Arrival at Pickup - Time","Pickup - Time","Arrival at Destination - Time"]]
y = data["Time from Pickup to Arrival"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

rl(X_train,y_train,X_test,y_test)

('score de prediction: 0.33489716690639093', 'rmse: 810.1907013717627')

In [0]:
dummy_personal_business = pd.get_dummies(data["Personal or Business"])
data = pd.concat([data,dummy_personal_business], axis=1)
del data["Personal or Business"]
data.shape

(21201, 31)

In [0]:
X = data[["Distance (KM)","Temperature","Pickup Lat","Pickup Long","Destination Lat","Destination Long","Platform Type1","Platform Type2","Platform Type3","Platform Type4","Personal","Business","Placement - Time","Confirmation - Time","Arrival at Pickup - Time","Pickup - Time","Arrival at Destination - Time"]]
y = data["Time from Pickup to Arrival"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
rl(X_train,y_train,X_test,y_test)

('score de prediction: 0.3356067638216804', 'rmse: 809.7583905936345')

In [0]:
X = data[["Distance (KM)","Temperature","Platform Type1","Platform Type2","Platform Type3","Platform Type4","Personal","Business","Placement - Time","Confirmation - Time","Arrival at Pickup - Time","Pickup - Time","Arrival at Destination - Time"]]
y = data["Time from Pickup to Arrival"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
rl(X_train,y_train,X_test,y_test)

('score de prediction: 0.3337478217601695', 'rmse: 810.890432872586')

In [0]:
# nb: nombre colonne; nom: nom des variables; var: variable à renommer
def rename_col(nb,nom,var):
  tab = []
  for i in range(1,nb+1):
    tab.append(nom + str(i))
  var.columns = tab
  return tab

In [0]:
dummy_placement_month = pd.get_dummies(data["Placement - Day of Month"])
dummy_placement_week = pd.get_dummies(data["Placement - Weekday (Mo = 1)"])

In [0]:
rename_col(7,"placement_weekday",dummy_placement_week)
rename_col(31,"placement_month",dummy_placement_month)

In [0]:
del data["Placement - Weekday (Mo = 1)"]
del data["Placement - Day of Month"]

In [0]:
data = pd.concat([data,dummy_placement_week], axis=1)
data = pd.concat([data,dummy_placement_month], axis=1)

In [0]:
data.shape

(21201, 67)

In [0]:
col = ["Distance (KM)","Temperature","Platform Type1","Platform Type2","Platform Type3","Platform Type4","Personal","Business","Placement - Time","Confirmation - Time","Arrival at Pickup - Time","Pickup - Time","Arrival at Destination - Time"] + rename_col(7,"placement_weekday",dummy_placement_week) + rename_col(31,"placement_month",dummy_placement_month)
X = data[col]
y = data["Time from Pickup to Arrival"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
rl(X_train,y_train,X_test,y_test)

('score de prediction: 0.3353030528492873', 'rmse: 809.943449979965')

In [0]:
# avec lat et long

col = ["Distance (KM)","Pickup Lat","Pickup Long","Destination Lat","Destination Long","Temperature","Platform Type1","Platform Type2","Platform Type3","Platform Type4","Personal","Business","Placement - Time","Confirmation - Time","Arrival at Pickup - Time","Pickup - Time","Arrival at Destination - Time"] + rename_col(7,"placement_weekday",dummy_placement_week) + rename_col(31,"placement_month",dummy_placement_month)
X = data[col]
y = data["Time from Pickup to Arrival"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(X_train.shape)

rl(X_train,y_train,X_test,y_test)

(15900, 55)


('score de prediction: 0.3371245426185563', 'rmse: 808.8329320164398')

In [0]:
data = pd.merge(data, data3, how='left', left_on='Rider Id', right_on='Rider Id', left_index=True)
data2 = pd.merge(data2, data3, how='left', left_on='Rider Id', right_on='Rider Id', left_index=True)

In [0]:
train_.head()

Unnamed: 0,Order No,User Id,Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),Confirmation - Time,Arrival at Pickup - Day of Month,Arrival at Pickup - Weekday (Mo = 1),Arrival at Pickup - Time,Pickup - Day of Month,Pickup - Weekday (Mo = 1),Pickup - Time,Arrival at Destination - Day of Month,Arrival at Destination - Weekday (Mo = 1),Arrival at Destination - Time,Distance (KM),Temperature,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival,Platform Type1,Platform Type2,Platform Type3,Platform Type4,Business,Personal,placement_weekday1,placement_weekday2,placement_weekday3,placement_weekday4,placement_weekday5,placement_weekday6,placement_weekday7,placement_month1,placement_month2,placement_month3,placement_month4,placement_month5,placement_month6,placement_month7,placement_month8,placement_month9,placement_month10,placement_month11,placement_month12,placement_month13,placement_month14,placement_month15,placement_month16,placement_month17,placement_month18,placement_month19,placement_month20,placement_month21,placement_month22,placement_month23,placement_month24,placement_month25,placement_month26,placement_month27,placement_month28,placement_month29,placement_month30,placement_month31,No_Of_Orders,Age,Average_Rating,No_of_Ratings
27,Order_No_4211,User_Id_633,77746,9,5,78010,9,5,79487,9,5,80850,9,5,81595,4,20.4,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1637,1309,13.8,549
739,Order_No_25375,User_Id_2285,83776,12,5,84201,12,5,85222,12,5,85449,12,5,87442,16,26.4,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,396,339,13.6,69
851,Order_No_1899,User_Id_265,88765,30,2,88964,30,2,89374,30,2,89583,30,2,46838,3,23.258889,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1023,242,12.5,114
806,Order_No_9336,User_Id_1402,77134,15,5,77165,15,5,77876,15,5,78186,15,5,79527,9,19.2,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,886,283,14.5,113
159,Order_No_27883,User_Id_1737,78918,13,1,78978,13,1,79433,13,1,79523,13,1,80737,9,15.4,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2311,872,14.1,533


In [0]:
data2.shape

(7068, 29)

In [0]:
data.shape

(21201, 71)

In [0]:
c = ["No_Of_Orders","Age","Average_Rating","No_of_Ratings"]

# fusion train & riders

col = ["Distance (KM)","Temperature"] + c
X = data[col]
y = data["Time from Pickup to Arrival"]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(X.shape)

#rl(X_train,y_train,X_test,y_test)

(21201, 6)


In [0]:
c = ["No_Of_Orders","Age","Average_Rating","No_of_Ratings"]

# fusion train & riders

col = ["Distance (KM)","Pickup Lat","Pickup Long","Destination Lat","Destination Long","Temperature","Platform Type1","Platform Type2","Platform Type3","Platform Type4","Personal","Business","Placement - Time","Confirmation - Time","Arrival at Pickup - Time","Pickup - Time","Arrival at Destination - Time"] + rename_col(7,"placement_weekday",dummy_placement_week) + rename_col(31,"placement_month",dummy_placement_month) + c
X = data[col]
y = data["Time from Pickup to Arrival"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(X.shape)

rl(X_train,y_train,X_test,y_test)

(21201, 59)


('score de prediction: 0.34786370607037626', 'rmse: 801.1659035373094')

In [0]:
col = ['Distance (KM)', 'Temperature', 'No_Of_Orders', 'Age', 'Average_Rating','No_of_Ratings']
X_2 = data2[col]

In [0]:
print(X.shape)
print(X_2.shape)


(21201, 6)
(7068, 6)


**KNN**

In [0]:
# 2
for i in range(1,10):
  knn = KNeighborsClassifier(n_neighbors=i)
  model = knn.fit(X_train,y_train)
  
  y_pred = model.predict(X_test)
  score = model.score(X_test,y_test)
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  print("n = ",i," score = ",score," rmse = ", rmse)

n =  1  score =  0.0022637238256932655  rmse =  1079.685842802123
n =  2  score =  0.0035842293906810036  rmse =  978.3144641263015
n =  3  score =  0.003961516694963215  rmse =  1031.5762441254058
n =  4  score =  0.004338803999245426  rmse =  1085.9046901263864
n =  5  score =  0.0047160913035276366  rmse =  1125.7345892361814
n =  6  score =  0.004527447651386531  rmse =  1160.966099867071
n =  7  score =  0.005470665912092058  rmse =  1196.4155155433507
n =  8  score =  0.006036596868515374  rmse =  1232.9864025119969
n =  9  score =  0.005659309564233163  rmse =  1263.8759325774427


Pour n = 9

In [0]:
data.isnull().sum().T

Order No                           0
User Id                            0
Placement - Time                   0
Confirmation - Day of Month        0
Confirmation - Weekday (Mo = 1)    0
                                  ..
placement_month31                  0
No_Of_Orders                       0
Age                                0
Average_Rating                     0
No_of_Ratings                      0
Length: 71, dtype: int64

In [0]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X,y)
y_pred = model.predict(X_2)

In [0]:
y_pred.shape

(7068,)

In [0]:
y_pred = pd.DataFrame(y_pred)

In [0]:
y_pred.shape

(7068, 1)

In [0]:
y_pred.tail()

Unnamed: 0,0
7063,369
7064,180
7065,799
7066,1167
7067,847


In [0]:
sample = pd.read_csv("SampleSubmission.csv")

In [0]:
sample.isnull().sum()

Order_No                          0
Time from Pickup to Arrival    7065
dtype: int64

In [0]:
sample["Time from Pickup to Arrival"] = y_pred

In [0]:
sample.tail()

Unnamed: 0,Order_No,Time from Pickup to Arrival
7063,Order_No_3612,369
7064,Order_No_7657,180
7065,Order_No_1969,799
7066,Order_No_10591,1167
7067,Order_No_1603,847


In [0]:
sample.tail()

Unnamed: 0,Order_No,Time from Pickup to Arrival
7063,Order_No_3612,1829.789474
7064,Order_No_7657,1683.263158
7065,Order_No_1969,1174.947368
7066,Order_No_10591,1522.184211
7067,Order_No_1603,1663.578947


In [0]:
sample.to_csv("001_sample_01.csv")

*KNeighborsRegressor*

In [0]:
# 7
t = []
for i in range(2,40,3):
  knn = KNeighborsRegressor(n_neighbors=i)
  model = knn.fit(X_train,y_train)
  
  y_pred = model.predict(X_test)
  score = model.score(X_test,y_test)

  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  t.append(rmse)
  print("n = ",i," score = ",score," rmse = ", rmse)

n =  2  score =  0.07661001332795325  rmse =  954.6307860883153
n =  5  score =  0.22347001078634157  rmse =  875.4311017216236
n =  8  score =  0.250888179076195  rmse =  859.8371013588471
n =  11  score =  0.245003826976299  rmse =  863.207550905604
n =  14  score =  0.23665391921148907  rmse =  867.967763386024
n =  17  score =  0.23040480778341368  rmse =  871.5133187383449
n =  20  score =  0.22049466013659189  rmse =  877.1066482217224
n =  23  score =  0.20923209730062975  rmse =  883.4202947230488
n =  26  score =  0.20349530176014308  rmse =  886.6189849168949
n =  29  score =  0.1911304789084154  rmse =  893.4743538930035
n =  32  score =  0.18187229513987535  rmse =  898.5730837933725
n =  35  score =  0.1723998416612924  rmse =  903.7600465084274
n =  38  score =  0.16539743973723497  rmse =  907.5753919209567


In [0]:
min(t)

859.8371013588471

In [0]:
knn = KNeighborsRegressor(n_neighbors=8)
knn.fit(X,y)
y_pred = model.predict(X_2)

In [0]:
from xgboost import XGBRegressor

In [0]:
def xbc(X_train,y_train,X_test,y_test):
    model = XGBregressor()
    model.fit(X, y)
    
    y_pred = model.predict(X_2)

    #score_test = model.score(X_test, y_test)
    #rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    #return "score de prediction: " + str(score_test), "rmse: " + str(rmse)

In [0]:
model = XGBRegressor()
model.fit(X, y)
y_pred = model.predict(X_2)

rmse = np.sqrt(mean_squared_error(y_pred, ))
print(rmse)

In [0]:
y_pred = pd.DataFrame(y_pred)

In [0]:
XGBRegressor(max_depth=10)
model.fit(X, y)
y_pred = model.predict(X_2)

Unnamed: 0,0
7063,1478.43335
7064,2651.462402
7065,1662.851929
7066,2469.016602
7067,1436.290161


In [0]:
data2.columns

Index(['Order No', 'User Id', 'Vehicle Type', 'Platform Type',
       'Personal or Business', 'Placement - Day of Month',
       'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
       'Confirmation - Time', 'Arrival at Pickup - Day of Month',
       'Arrival at Pickup - Weekday (Mo = 1)', 'Arrival at Pickup - Time',
       'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)', 'Pickup - Time',
       'Distance (KM)', 'Temperature', 'Precipitation in millimeters',
       'Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long',
       'Rider Id', 'No_Of_Orders', 'Age', 'Average_Rating', 'No_of_Ratings'],
      dtype='object')

In [0]:
X = [['Distance (KM)', 'Temperature', 'No_Of_Orders', 'Age', 'Average_Rating',
       'No_of_Ratings']]

In [0]:
y_test.head()

406    2088
238    1730
438      23
717    3358
24      961
Name: Time from Pickup to Arrival, dtype: int64