In [1]:
import pandas as pd
import numpy as np
import geopy.distance
from sklearn.impute import SimpleImputer                # Handling Missing Values
from sklearn.preprocessing import StandardScaler, MinMaxScaler       # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder       # Ordinal Encoding
from sklearn.pipeline import Pipeline                   # Pipelining
from sklearn.compose import ColumnTransformer           # Transformers to Columns
from sklearn.model_selection import train_test_split    # Train & Test Data
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet     # Algorithm to perform on dataset
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error   # performance metrics

In [2]:
df = pd.read_csv("data/finalTrain.csv")
df.head(2)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23


In [38]:
# dff = pd.read_csv("data/newdData.csv")

In [None]:
# dff.head(2)

In [3]:
# renaming target feature
df.rename(columns={"Time_taken (min)": "delivery_time_taken"}, inplace=True)

In [4]:
# replacing invalid values with valid data  
df['Time_Orderd'] = df['Time_Orderd'].apply(lambda x : "00:00" if "." in str(x) else x)
df['Time_Orderd'] = df['Time_Orderd'].apply(lambda x : "00:00" if x == "1" else x)
df['Time_Orderd'] = df['Time_Orderd'].apply(lambda x : "00:00" if x in ['24:05:00','24:10:00','24:15:00'] else x)
df['Time_Order_picked'] = df['Time_Order_picked'].apply(lambda x : "00:00" if (("." in str(x)) or (x == "1")) else x)
df['Time_Order_picked'] = df['Time_Order_picked'].apply(lambda x : "00:00" if x in ['24:05:00','24:10:00','24:15:00'] else x)

In [5]:
numerical_feature_with_nan = ['Delivery_person_Age', 'Delivery_person_Ratings']
categorical_feature_with_nan = ['Weather_conditions', 'Road_traffic_density','Festival', 'City','multiple_deliveries']

In [6]:
####  imputer object
for feature in numerical_feature_with_nan:
    df[feature] = df[feature].fillna(df[feature].mean())
for feature in categorical_feature_with_nan:
    df[feature] = df[feature].replace(np.nan, df[feature].value_counts().keys()[0])

df["Time_Orderd"] = df["Time_Orderd"].fillna(df["Time_Order_picked"])

In [7]:
coordinate_features = ['Restaurant_latitude','Restaurant_longitude','Delivery_location_latitude','Delivery_location_longitude']

def calculate_distance(res_lat, res_long, del_lat,del_long):
    """
    calculates the distance between resturant to delivery location using co-ordinates
    """
    return geopy.distance.geodesic((abs(res_lat),abs(res_long)),(abs(del_lat),abs(del_long))).km

df['distance'] = df[coordinate_features].apply(lambda x: calculate_distance(*x), axis=1)

In [8]:
####  Converting order time and order_picked times to datetime objects and extracting time information

df["order_datetime"] = pd.to_datetime(df["Order_Date"] + ' ' + df["Time_Orderd"],format = "%d-%m-%Y %H:%M", dayfirst=True)
df["picked_datetime"] = pd.to_datetime(df["Order_Date"] + ' ' + df["Time_Order_picked"],format = "%d-%m-%Y %H:%M", dayfirst=True)

df["ordered_hour"] = df["order_datetime"].apply(lambda x: x.hour)
df["ordered_min"] = df["order_datetime"].apply(lambda x: x.minute)

df["picked_hour"] = df["picked_datetime"].apply(lambda x: x.hour)
df["picked_min"] = df["picked_datetime"].apply(lambda x: x.minute)


def calc_prep_time(ord_hour, ord_min, pc_hour, pc_min):
                
        if ord_hour == pc_hour:
            if pc_min > ord_min:
                return pc_min - ord_min
            else:
                return 0

        else:
            return 60 - ord_min + pc_min

#### calculate preparation time

df["prep_time"] = df.iloc[:,-4:].apply(lambda x: calc_prep_time(*x), axis=1)

In [9]:
df.head(2)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,...,City,delivery_time_taken,distance,order_datetime,picked_datetime,ordered_hour,ordered_min,picked_hour,picked_min,prep_time
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,...,Metropolitian,46,10.271464,2022-02-12 21:55:00,2022-02-12 22:10:00,21,55,22,10,15
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,...,Metropolitian,23,6.229376,2022-02-13 14:55:00,2022-02-13 15:05:00,14,55,15,5,10


In [10]:
df1 = df.drop(labels=['order_datetime', 'picked_datetime','picked_hour', 'picked_min','ordered_hour', 'ordered_min'],axis=1)

In [None]:
df1.head(2)
df1.isnull().sum()

In [None]:
df1.rename(columns={'prep_time' : 'prep_time_min'})

In [13]:
####### storing processed data to new file
df1.to_csv(path_or_buf=r"D:\DeliveryTimePredictor\notebooks\data\newdata.csv",index=False)

In [None]:
df1.columns

In [5]:
import pandas as pd
df2 = pd.read_csv('data/newdata.csv')
df2.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,...,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,delivery_time_taken,distance,prep_time
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,...,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.271464,15
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,...,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.229376,10
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,...,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.764306,10
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,...,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.927795,10
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,...,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.373484,15


In [7]:
df2.rename(columns={'prep_time' : 'prep_time_min'},inplace=True)
df2.columns

Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'delivery_time_taken',
       'distance', 'prep_time_min'],
      dtype='object')

In [8]:
df2.to_csv(path_or_buf=r"D:\DeliveryTimePredictor\notebooks\data\newdata.csv",index=False)

In [3]:
df.columns

Index(['ID', 'Delivery_person_ID', 'Delivery_person_Age',
       'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weather_conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'delivery_time_taken',
       'distance', 'prep_time'],
      dtype='object')

In [4]:
# dropping unnecessary features
df.drop(labels=['ID', 'Delivery_person_ID','Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude','Order_Date', 'Time_Orderd',
       'Time_Order_picked'], axis=1,inplace=True)

In [None]:
# 'ID','Delivery_person_ID','Restaurant_latitude','Restaurant_longitude', 'Delivery_location_latitude','Delivery_location_longitude','Order_Date', 'Time_Orderd','Time_Order_picked'

In [5]:
df.columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
       'delivery_time_taken', 'distance', 'prep_time'],
      dtype='object')

In [6]:
df.head(1)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,delivery_time_taken,distance,prep_time
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.271464,15


In [7]:
X = df.drop(labels=['delivery_time_taken'], axis=1)
y = df[['delivery_time_taken']]
display(X)
display(y)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,distance,prep_time
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,10.271464,15
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,6.229376,10
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,13.764306,10
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,2.927795,10
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,19.373484,15
...,...,...,...,...,...,...,...,...,...,...,...,...
45579,30.0,4.8,Windy,High,1,Meal,motorcycle,0.0,No,Metropolitian,1.488112,10
45580,21.0,4.6,Windy,Jam,0,Buffet,motorcycle,1.0,No,Metropolitian,10.983242,15
45581,30.0,4.9,Cloudy,Low,1,Drinks,scooter,0.0,No,Metropolitian,4.648024,10
45582,20.0,4.7,Cloudy,High,0,Snack,motorcycle,1.0,No,Metropolitian,6.219668,5


Unnamed: 0,delivery_time_taken
0,46
1,23
2,21
3,20
4,41
...,...
45579,32
45580,36
45581,16
45582,26


##### **Define which column should be ordinal encoded and which should be scaled**

In [9]:
categorical_feature = X.select_dtypes(include='object').columns
numerical_feature = X.select_dtypes(exclude='object').columns

In [10]:
display(numerical_feature)
display(categorical_feature)

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition',
       'multiple_deliveries', 'distance', 'prep_time'],
      dtype='object')

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_order',
       'Type_of_vehicle', 'Festival', 'City'],
      dtype='object')

##### **Numerical & Categorical Pipeline**

In [11]:
numerical_pipeline = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ]
) 

# Categorical Pipeline
categorical_pipeline = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder_ohe', OneHotEncoder(sparse=False)),
    ('scaler', StandardScaler())
    ]
)

##### **Column Transformer on above pipelines**

In [12]:
preprocessor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, numerical_feature),
    ('categorical_pipeline', categorical_pipeline, categorical_feature)
])

##### **Train Test Split data**

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state= 30)

X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns= preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())




In [14]:
X_train.head()

Unnamed: 0,numerical_pipeline__Delivery_person_Age,numerical_pipeline__Delivery_person_Ratings,numerical_pipeline__Vehicle_condition,numerical_pipeline__multiple_deliveries,numerical_pipeline__distance,numerical_pipeline__prep_time,categorical_pipeline__Weather_conditions_Cloudy,categorical_pipeline__Weather_conditions_Fog,categorical_pipeline__Weather_conditions_Sandstorms,categorical_pipeline__Weather_conditions_Stormy,...,categorical_pipeline__Type_of_order_Snack,categorical_pipeline__Type_of_vehicle_bicycle,categorical_pipeline__Type_of_vehicle_electric_scooter,categorical_pipeline__Type_of_vehicle_motorcycle,categorical_pipeline__Type_of_vehicle_scooter,categorical_pipeline__Festival_No,categorical_pipeline__Festival_Yes,categorical_pipeline__City_Metropolitian,categorical_pipeline__City_Semi-Urban,categorical_pipeline__City_Urban
0,1.127187,-1.616514,-1.223528,-1.318236,0.155639,-0.250741,-0.446305,-0.470836,2.265788,-0.444638,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
1,-0.103466,-3.137444,-1.223528,3.943714,-0.11421,-0.542981,-0.446305,-0.470836,-0.441348,-0.444638,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,-6.995304,6.995304,-1.859481,-0.05935,1.878437
2,0.248149,0.512787,1.160323,0.435747,-1.20154,-0.250741,-0.446305,-0.470836,-0.441348,-0.444638,...,-0.582851,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,0.537784,-0.05935,-0.532357
3,0.599764,-1.312329,1.160323,0.435747,1.844238,0.041499,-0.446305,-0.470836,-0.441348,-0.444638,...,-0.582851,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,0.537784,-0.05935,-0.532357
4,0.775572,-1.616514,-1.223528,0.435747,-1.188474,0.041499,-0.446305,-0.470836,-0.441348,-0.444638,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,-1.859481,-0.05935,1.878437


In [15]:
X_test.head()

Unnamed: 0,numerical_pipeline__Delivery_person_Age,numerical_pipeline__Delivery_person_Ratings,numerical_pipeline__Vehicle_condition,numerical_pipeline__multiple_deliveries,numerical_pipeline__distance,numerical_pipeline__prep_time,categorical_pipeline__Weather_conditions_Cloudy,categorical_pipeline__Weather_conditions_Fog,categorical_pipeline__Weather_conditions_Sandstorms,categorical_pipeline__Weather_conditions_Stormy,...,categorical_pipeline__Type_of_order_Snack,categorical_pipeline__Type_of_vehicle_bicycle,categorical_pipeline__Type_of_vehicle_electric_scooter,categorical_pipeline__Type_of_vehicle_motorcycle,categorical_pipeline__Type_of_vehicle_scooter,categorical_pipeline__Festival_No,categorical_pipeline__Festival_Yes,categorical_pipeline__City_Metropolitian,categorical_pipeline__City_Semi-Urban,categorical_pipeline__City_Urban
0,0.072341,0.512787,-0.031602,0.435747,1.860594,3.256138,-0.446305,-0.470836,2.265788,-0.444638,...,1.715704,-0.03716,-0.302932,-1.172902,1.40736,0.142953,-0.142953,0.537784,-0.05935,-0.532357
1,-0.982505,1.121159,-1.223528,0.435747,-0.140455,-0.542981,2.24062,-0.470836,-0.441348,-0.444638,...,-0.582851,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
2,-0.279274,-0.095585,1.160323,-1.318236,1.76936,-0.250741,-0.446305,-0.470836,-0.441348,-0.444638,...,1.715704,-0.03716,3.301074,-1.172902,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
3,-0.630889,-0.399771,-0.031602,-1.318236,-0.371017,0.041499,-0.446305,-0.470836,-0.441348,-0.444638,...,1.715704,-0.03716,-0.302932,0.852586,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357
4,-0.003799,0.00715,1.160323,0.435747,0.152922,-0.835221,-0.446305,2.123883,-0.441348,-0.444638,...,-0.582851,-0.03716,3.301074,-1.172902,-0.71055,0.142953,-0.142953,0.537784,-0.05935,-0.532357


In [16]:
## regression object 
regression = LinearRegression()
regression.fit(X_train, y_train)

In [17]:
# regression coefficient
regression.coef_

array([[ 2.29873320e+00, -2.26788405e+00, -1.74159001e+00,
         1.71119978e+00,  1.86318584e+00, -4.56783483e-02,
         2.94252518e+12,  3.04709067e+12,  2.92055731e+12,
         2.93516863e+12,  2.90189212e+12,  2.92664625e+12,
         2.92595649e+13,  4.59084961e+13,  4.73738152e+13,
         4.23291127e+13, -2.94396440e+10, -2.95508744e+10,
        -2.96015621e+10, -2.97176809e+10, -1.45741074e+12,
        -1.08973163e+13, -1.93898918e+13, -1.85437461e+13,
        -1.04329123e+12, -1.04329123e+12, -8.26872339e+12,
        -1.17233106e+12, -8.22231982e+12]])

In [18]:
# regression intercept
regression.intercept_

array([26.32133519])

In [19]:
# function to evaluate model error

def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [20]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make Prediction
    y_pred= model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Performance")
    print("RMSE : ",rmse)
    print("MAE : ",mae)
    print("R2 SCORE : ",r2_square*100)

    r2_list.append(r2_score)

    print("="*40)
    print("\n")


LinearRegression
Model Training Performance
RMSE :  6.0514029956854465
MAE :  4.816673060021903
R2 SCORE :  57.6628547308426


Lasso
Model Training Performance
RMSE :  6.7243525055696365
MAE :  5.327292581089238
R2 SCORE :  47.72303536858217


Ridge
Model Training Performance
RMSE :  6.0512897510575545
MAE :  4.815577069097179
R2 SCORE :  57.6644392921102


ElasticNet
Model Training Performance
RMSE :  6.630287478846626
MAE :  5.288375913169895
R2 SCORE :  49.175380345408705




In [21]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']