# Problem Statement

**Food delivery services like Zomato and Swiggy need to show an accurate estimate of how long an order will take to arrive in order to stay transparent with their customers. These companies use machine learning algorithms to predict the delivery time based on how long delivery partners took to cover the same distance in the past.**

**To predict the food delivery time in real time, we need to calculate the distance between the point where the food is prepared (the restaurant) and the point where it is consumed (the delivery location). Once we have that distance, we need to find the relationship between it and the time delivery partners took to deliver food over the same distance in the past.**



In [1]:
import os
import warnings
from math import radians,sin,cos,sqrt,atan2

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")

from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:

# Location of the training CSV. Set the FINAL_TRAIN_CSV environment variable to
# point at a local copy of finalTrain.csv; when it is unset, the original
# author's machine-specific path is used as the fallback.
DATA_PATH = os.environ.get(
    "FINAL_TRAIN_CSV",
    r"C:\Users\mukesk2\OneDrive - kochind.com\Tutorials\MLOPS\Engg_Bhaiya\finalTrain.csv",
)

# Raw training data; every later cell works on this frame.
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(45584, 20)

In [5]:
# List the column labels available in the dataset.
df.columns

Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'Time_taken (min)'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes; columns with fewer non-null entries
# than the total row count contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45584 non-null  object 
 1   Delivery_person_ID           45584 non-null  object 
 2   Delivery_person_Age          43730 non-null  float64
 3   Delivery_person_Ratings      43676 non-null  float64
 4   Restaurant_latitude          45584 non-null  float64
 5   Restaurant_longitude         45584 non-null  float64
 6   Delivery_location_latitude   45584 non-null  float64
 7   Delivery_location_longitude  45584 non-null  float64
 8   Order_Date                   45584 non-null  object 
 9   Time_Orderd                  43853 non-null  object 
 10  Time_Order_picked            45584 non-null  object 
 11  Weather_conditions           44968 non-null  object 
 12  Road_traffic_density         44983 non-null  object 
 13  Vehicle_conditio

In [7]:
# Check what kind of object DataFrame.shape returns.
type(df.shape)


tuple

In [8]:
# Number of missing values in each column.
df.isna().sum()

ID                                0
Delivery_person_ID                0
Delivery_person_Age            1854
Delivery_person_Ratings        1908
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Order_Date                        0
Time_Orderd                    1731
Time_Order_picked                 0
Weather_conditions              616
Road_traffic_density            601
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries             993
Festival                        228
City                           1200
Time_taken (min)                  0
dtype: int64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Vehicle_condition,multiple_deliveries,Time_taken (min)
count,43730.0,43676.0,45584.0,45584.0,45584.0,45584.0,45584.0,44591.0,45584.0
mean,29.566911,4.633774,17.017948,70.229684,17.46548,70.844161,1.023385,0.744635,26.293963
std,5.815064,0.334744,8.185674,22.885575,7.335562,21.120578,0.839055,0.57251,9.384298
min,15.0,1.0,-30.905562,-88.366217,0.01,0.01,0.0,0.0,10.0
25%,25.0,4.5,12.933284,73.17,12.988453,73.28,0.0,0.0,19.0
50%,30.0,4.7,18.55144,75.897963,18.633934,76.002574,1.0,1.0,26.0
75%,35.0,4.9,22.728163,78.044095,22.785049,78.107044,2.0,1.0,32.0
max,50.0,6.0,30.914057,88.433452,31.054057,88.563452,3.0,3.0,54.0


In [10]:
# Report how many distinct values each column holds, and list the actual
# values for low-cardinality columns (fewer than 8 distinct values).
for col_name in df.columns:
    n_distinct = df[col_name].nunique()
    print(f"no of unique value in {col_name} is : {n_distinct}")
    print("###########################################")
    if n_distinct < 8:
        print(f"    unique values are : {df[col_name].unique()}")
        print("*************************************")

no of unique value in ID is : 45584
###########################################
no of unique value in Delivery_person_ID is : 1320
###########################################
no of unique value in Delivery_person_Age is : 22
###########################################
no of unique value in Delivery_person_Ratings is : 28
###########################################
no of unique value in Restaurant_latitude is : 657
###########################################
no of unique value in Restaurant_longitude is : 518
###########################################
no of unique value in Delivery_location_latitude is : 4373
###########################################
no of unique value in Delivery_location_longitude is : 4373
###########################################
no of unique value in Order_Date is : 44
###########################################
no of unique value in Time_Orderd is : 176
###########################################
no of unique value in Time_Order_picked is : 193
##############

In [11]:
# Build a per-column data-quality summary of `df`: row count, number of
# missing values, number of distinct values, percentage missing and dtype.
feature = list(df.columns)
count = [len(df[name]) for name in feature]
missing = [df[name].isnull().sum() for name in feature]
unique = [df[name].nunique() for name in feature]
# Same arithmetic as counting the nulls again: (missing / total rows) * 100.
missing_percentage = [(n_missing / df.shape[0]) * 100 for n_missing in missing]
dtypes = [df[name].dtype for name in feature]

dataframe = pd.DataFrame(
    {
        'feature': feature,
        'count': count,
        'missing': missing,
        'nunique': unique,
        'missing_percentage': missing_percentage,
        'dtypes': dtypes,
    }
)

# Displayed with the column name as the index; `dataframe` itself keeps
# 'feature' as a regular column because the result is not assigned back.
dataframe.set_index('feature')

Unnamed: 0_level_0,count,missing,nunique,missing_percentage,dtypes
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ID,45584,0,45584,0.0,object
Delivery_person_ID,45584,0,1320,0.0,object
Delivery_person_Age,45584,1854,22,4.067217,float64
Delivery_person_Ratings,45584,1908,28,4.185679,float64
Restaurant_latitude,45584,0,657,0.0,float64
Restaurant_longitude,45584,0,518,0.0,float64
Delivery_location_latitude,45584,0,4373,0.0,float64
Delivery_location_longitude,45584,0,4373,0.0,float64
Order_Date,45584,0,44,0.0,object
Time_Orderd,45584,1731,176,3.797385,object


In [12]:
# Drop the delivery person's age and the per-order ID.
# NOTE(review): age is removed on the assumption that it is not predictive of
# delivery time -- nothing in this notebook verifies that; confirm before modelling.
# ID has one distinct value per row (see the unique-value report), so it is
# only an identifier.
# Reassigning (rather than inplace=True) and using errors='ignore' keeps this
# cell safe to re-run after the columns are already gone.
df = df.drop(columns=['Delivery_person_Age', 'ID'], errors='ignore')

In [13]:
# Convert Order_Date from DD-MM-YYYY strings (e.g. "12-02-2022") to datetime64.
# An explicit format is used instead of dayfirst=True: dayfirst is only a
# parsing preference, so an ambiguous or malformed value could be silently
# read as month-first. With a fixed format such values raise instead.
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format="%d-%m-%Y")


In [14]:
# Inspect the converted column to confirm it now has a datetime dtype.
df['Order_Date']

0       2022-02-12
1       2022-02-13
2       2022-03-04
3       2022-02-13
4       2022-02-14
           ...    
45579   2022-03-24
45580   2022-02-16
45581   2022-03-11
45582   2022-03-07
45583   2022-03-02
Name: Order_Date, Length: 45584, dtype: datetime64[ns]

In [15]:
# Split the order date into separate calendar-component columns.
order_date_parts = df['Order_Date'].dt
df['Year'] = order_date_parts.year
df['Month'] = order_date_parts.month
df['Day'] = order_date_parts.day

In [17]:
# Order_Date has been split into Year / Month / Day, so the original column
# is no longer needed.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this
# cell safe to re-run once the column has already been removed.
df = df.drop(columns=['Order_Date'], errors='ignore')

In [None]:
#