# Data Preprocessing

In [30]:
import pandas as pd

In [31]:
df = pd.read_csv("./data/finalcleandata.csv")

In [32]:
df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.28
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.24
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.79
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,2.93
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.4


In [33]:
## Independent and dependent features
X = df.drop(labels=["Time_taken (min)"], axis=1)
y = df["Time_taken (min)"]

In [34]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43853 entries, 0 to 43852
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Delivery_person_Age      43853 non-null  float64
 1   Delivery_person_Ratings  43853 non-null  float64
 2   Weather_conditions       43853 non-null  object 
 3   Road_traffic_density     43853 non-null  object 
 4   Vehicle_condition        43853 non-null  int64  
 5   Type_of_order            43853 non-null  object 
 6   Type_of_vehicle          43853 non-null  object 
 7   multiple_deliveries      43853 non-null  float64
 8   Festival                 43853 non-null  object 
 9   City                     43853 non-null  object 
 10  distance                 43853 non-null  float64
dtypes: float64(4), int64(1), object(6)
memory usage: 3.7+ MB


In [35]:
# Converting Vehicle_condition to categorical
#X["Vehicle_condition"] = X["Vehicle_condition"].astype("object")

In [36]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43853 entries, 0 to 43852
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Delivery_person_Age      43853 non-null  float64
 1   Delivery_person_Ratings  43853 non-null  float64
 2   Weather_conditions       43853 non-null  object 
 3   Road_traffic_density     43853 non-null  object 
 4   Vehicle_condition        43853 non-null  int64  
 5   Type_of_order            43853 non-null  object 
 6   Type_of_vehicle          43853 non-null  object 
 7   multiple_deliveries      43853 non-null  float64
 8   Festival                 43853 non-null  object 
 9   City                     43853 non-null  object 
 10  distance                 43853 non-null  float64
dtypes: float64(4), int64(1), object(6)
memory usage: 3.7+ MB


In [37]:
## Define which columns should be ordinal-encoded, onehot encoded and be scaled

numerical_cols = X.select_dtypes(exclude="object").columns
ordinal_cat_cols = ["Weather_conditions", "Road_traffic_density"]
categorical_cols = [col for col in X.columns[X.dtypes == "O"] if col not in ordinal_cat_cols]

In [38]:
print("numerical_cols : ", numerical_cols)
print("ordinal_cat_cols : ", ordinal_cat_cols)
print("categorical_cols : ", categorical_cols)

numerical_cols :  Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition',
       'multiple_deliveries', 'distance'],
      dtype='object')
ordinal_cat_cols :  ['Weather_conditions', 'Road_traffic_density']
categorical_cols :  ['Type_of_order', 'Type_of_vehicle', 'Festival', 'City']


In [39]:
# Define Custom Ranking for Ordinal features
Weather_conditions_categories = ["Stormy","Sandstorms","Windy","Fog","Cloudy","Sunny"]
Road_traffic_density_categories = ["Jam","High", "Medium", "Low"]
#Vehicle_condition_categories = ["0","1","2"]

In [40]:
## Feature Engineering

from sklearn.impute import SimpleImputer                         ## Handling missing values
from sklearn.preprocessing import StandardScaler                 ## Handling feature scaling
from sklearn.preprocessing import OrdinalEncoder                 ## Handling oridinal encoding
from sklearn.preprocessing import OneHotEncoder                  ## Handling OneHot encoding
## Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [64]:
## Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler",StandardScaler())

    
    ]
)


## Ordinal Pipeline
ord_pipeline = Pipeline(
    steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("ordinalencoder",OrdinalEncoder(categories=[Weather_conditions_categories,Road_traffic_density_categories])),
    ("scaler", StandardScaler())


    ]
)

## Categorical Pipeline
cat_pipeline = Pipeline(
    steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("onehotencoder",OneHotEncoder(sparse_output=False)),
    ("scaler", StandardScaler())
    

    ]
)



## Combine all 
preprocessor = ColumnTransformer([
("num_pipeline", num_pipeline,numerical_cols),
("ord_pipeline", ord_pipeline, ordinal_cat_cols),
("cat_pipeline", cat_pipeline, categorical_cols)
])

In [65]:
preprocessor

In [66]:
## Train-test split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [67]:
preprocessor.fit_transform(X_train)

array([[ 9.42714578e-01,  2.09499773e-01,  1.22298117e+00, ...,
         5.40585991e-01, -5.88648573e-02, -5.35250038e-01],
       [-6.23031726e-01, -1.07206925e-01, -1.22577318e+00, ...,
         5.40585991e-01, -5.88648573e-02, -5.35250038e-01],
       [ 1.11668639e+00, -1.07206925e-01, -1.22577318e+00, ...,
         5.40585991e-01, -5.88648573e-02, -5.35250038e-01],
       ...,
       [ 9.42714578e-01,  1.15961987e+00,  1.22298117e+00, ...,
        -1.84984446e+00, -5.88648573e-02,  1.86828572e+00],
       [-4.49059914e-01, -1.37403372e+00,  1.22298117e+00, ...,
         5.40585991e-01, -5.88648573e-02, -5.35250038e-01],
       [ 1.11668639e+00,  5.26206471e-01, -1.39600616e-03, ...,
        -1.84984446e+00, -5.88648573e-02,  1.86828572e+00]])

In [68]:
pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__distance,ord_pipeline__Weather_conditions,ord_pipeline__Road_traffic_density,cat_pipeline__Type_of_order_Buffet,cat_pipeline__Type_of_order_Drinks,cat_pipeline__Type_of_order_Meal,cat_pipeline__Type_of_order_Snack,cat_pipeline__Type_of_vehicle_electric_scooter,cat_pipeline__Type_of_vehicle_motorcycle,cat_pipeline__Type_of_vehicle_scooter,cat_pipeline__Festival_No,cat_pipeline__Festival_Yes,cat_pipeline__City_Metropolitian,cat_pipeline__City_Semi-Urban,cat_pipeline__City_Urban
0,0.942715,0.209500,1.222981,0.438110,-0.045033,-1.457987,-1.300067,1.731186,-0.569814,-0.578491,-0.583457,-0.296008,0.842302,-0.708904,0.141073,-0.141073,0.540586,-0.058865,-0.535250
1,-0.623032,-0.107207,-1.225773,-1.327733,-0.024377,1.476744,1.111253,1.731186,-0.569814,-0.578491,-0.583457,-0.296008,0.842302,-0.708904,0.141073,-0.141073,0.540586,-0.058865,-0.535250
2,1.116686,-0.107207,-1.225773,2.203953,-0.054531,0.889798,-1.300067,-0.577639,-0.569814,1.728634,-0.583457,-0.296008,0.842302,-0.708904,0.141073,-0.141073,0.540586,-0.058865,-0.535250
3,-0.970975,0.209500,-0.001396,0.438110,-0.060126,-0.871041,0.307480,-0.577639,1.754958,-0.578491,-0.583457,-0.296008,-1.187223,1.410628,0.141073,-0.141073,0.540586,-0.058865,-0.535250
4,0.420799,0.209500,-0.001396,0.438110,-0.084979,-0.871041,1.111253,-0.577639,-0.569814,1.728634,-0.583457,-0.296008,-1.187223,1.410628,0.141073,-0.141073,0.540586,-0.058865,-0.535250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30692,-0.449060,-1.374034,-1.225773,0.438110,-0.079904,-0.284094,-0.496294,-0.577639,-0.569814,-0.578491,1.713923,-0.296008,0.842302,-0.708904,0.141073,-0.141073,-1.849844,-0.058865,1.868286
30693,0.420799,-1.057327,-1.225773,0.438110,-0.079937,-1.457987,1.111253,-0.577639,-0.569814,-0.578491,1.713923,-0.296008,0.842302,-0.708904,0.141073,-0.141073,-1.849844,-0.058865,1.868286
30694,0.942715,1.159620,1.222981,-1.327733,-0.085109,1.476744,1.111253,-0.577639,1.754958,-0.578491,-0.583457,3.378292,-1.187223,-0.708904,0.141073,-0.141073,-1.849844,-0.058865,1.868286
30695,-0.449060,-1.374034,1.222981,0.438110,-0.055019,0.889798,-1.300067,-0.577639,-0.569814,-0.578491,1.713923,-0.296008,-1.187223,1.410628,0.141073,-0.141073,0.540586,-0.058865,-0.535250


In [69]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [70]:
X_train.head(2)

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__distance,ord_pipeline__Weather_conditions,ord_pipeline__Road_traffic_density,cat_pipeline__Type_of_order_Buffet,cat_pipeline__Type_of_order_Drinks,cat_pipeline__Type_of_order_Meal,cat_pipeline__Type_of_order_Snack,cat_pipeline__Type_of_vehicle_electric_scooter,cat_pipeline__Type_of_vehicle_motorcycle,cat_pipeline__Type_of_vehicle_scooter,cat_pipeline__Festival_No,cat_pipeline__Festival_Yes,cat_pipeline__City_Metropolitian,cat_pipeline__City_Semi-Urban,cat_pipeline__City_Urban
0,0.942715,0.2095,1.222981,0.43811,-0.045033,-1.457987,-1.300067,1.731186,-0.569814,-0.578491,-0.583457,-0.296008,0.842302,-0.708904,0.141073,-0.141073,0.540586,-0.058865,-0.53525
1,-0.623032,-0.107207,-1.225773,-1.327733,-0.024377,1.476744,1.111253,1.731186,-0.569814,-0.578491,-0.583457,-0.296008,0.842302,-0.708904,0.141073,-0.141073,0.540586,-0.058865,-0.53525


## Model Training


In [72]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [73]:
regression = LinearRegression()
regression.fit(X_train,y_train)

In [74]:
regression.coef_

array([ 2.14218002e+00, -2.39986639e+00, -1.68285710e+00,  1.99937811e+00,
       -5.49403828e-02, -2.53898199e-01, -3.16099617e+00,  1.54525071e+13,
        1.53465058e+13,  1.54638862e+13,  1.55294874e+13, -1.95722025e+13,
       -3.54339825e+13, -3.39292551e+13,  2.82232080e+13,  2.82232080e+13,
       -3.04466950e+13, -4.26943204e+12, -3.02806841e+13])

In [75]:
regression.intercept_

26.37447081675003

In [79]:
# Model Evaluation

import numpy as np
def evaluate_model(true,predicted):
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    adj_r2=1 - (1-r2_square)*(len(y)-1)/(len(y)-X.shape[1]-1)  
    return mse,mae,rmse,r2_square,adj_r2


In [80]:
## Train multiple models

models={
    "LinearRegression" : LinearRegression(),
    "Ridge" : Ridge(),
    "Lasso" : Lasso(),
    "ElasticNet" : ElasticNet()
}

model_lst=[]
r2_list=[]



for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    # Make prediction
    y_pred = model.predict(X_test)


    mse,mae,rmse,r2_square,adj_r2 = evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_lst.append(list(models.keys())[i])

    print('Model Training Performance')
    print('RMSE : ',rmse)
    print('MAE : ',mae)
    print('MSE : ',mse)
    print('R2 SCORE : ', r2_square*100)
    print('ADJ R2 SCORE : ', adj_r2*100)

    r2_list.append(r2_square)

    print("="*35)
    print("\n")

LinearRegression
Model Training Performance
RMSE :  6.596357902788077
MAE :  5.236794113605152
MSE :  43.51193758167472
R2 SCORE :  50.61483503591225
ADJ R2 SCORE :  50.60244396785711


Ridge
Model Training Performance
RMSE :  6.595639412804984
MAE :  5.235131376587307
MSE :  43.50245926374648
R2 SCORE :  50.62559273415517
ADJ R2 SCORE :  50.61320436527845


Lasso
Model Training Performance
RMSE :  7.009018289458117
MAE :  5.605477498540742
MSE :  49.12633738195839
R2 SCORE :  44.242605350876964
ADJ R2 SCORE :  44.22861544779217


ElasticNet
Model Training Performance
RMSE :  7.059684634123995
MAE :  5.673003699550273
MSE :  49.83914713328644
R2 SCORE :  43.43358076788133
ADJ R2 SCORE :  43.41938787511991


