In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.pandas.set_option("display.max_columns", None)

In [2]:
df = pd.read_csv("finalTrain.csv")
df.head(1)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46


In [3]:
deleted_columns = ["ID", "Delivery_person_ID", "Restaurant_latitude", "Restaurant_longitude"]

df.drop(deleted_columns, axis=1, inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_Age          43730 non-null  float64
 1   Delivery_person_Ratings      43676 non-null  float64
 2   Delivery_location_latitude   45584 non-null  float64
 3   Delivery_location_longitude  45584 non-null  float64
 4   Order_Date                   45584 non-null  object 
 5   Time_Orderd                  43853 non-null  object 
 6   Time_Order_picked            45584 non-null  object 
 7   Weather_conditions           44968 non-null  object 
 8   Road_traffic_density         44983 non-null  object 
 9   Vehicle_condition            45584 non-null  int64  
 10  Type_of_order                45584 non-null  object 
 11  Type_of_vehicle              45584 non-null  object 
 12  multiple_deliveries          44591 non-null  float64
 13  Festival        

In [5]:
df["Order month"] = df["Order_Date"].str.split("-").str[0].astype("float")
df["Order day"] = df["Order_Date"].str.split("-").str[1].astype("float")
df["Order year"] = df["Order_Date"].str.split("-").str[2].astype("float")

del df["Order_Date"]

df.head(1)

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Delivery_location_latitude,Delivery_location_longitude,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Order month,Order day,Order year
0,36.0,4.2,30.397968,78.116106,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,12.0,2.0,2022.0


In [6]:
del df["Order year"]

In [7]:
df["Time_ordered_hr"] = df["Time_Orderd"].str.split(":").str[0].astype("float")
df["Time_ordered_min"] = df["Time_Orderd"].str.split(":").str[1].astype("float")

df["Time_Order_picked_hr"] = df["Time_Order_picked"].str.split(":").str[0].astype("float")
df["Time_Order_picked_min"] = df["Time_Order_picked"].str.split(":").str[1].astype("float")

df.drop(["Time_Orderd", "Time_Order_picked"], axis=1, inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_Age          43730 non-null  float64
 1   Delivery_person_Ratings      43676 non-null  float64
 2   Delivery_location_latitude   45584 non-null  float64
 3   Delivery_location_longitude  45584 non-null  float64
 4   Weather_conditions           44968 non-null  object 
 5   Road_traffic_density         44983 non-null  object 
 6   Vehicle_condition            45584 non-null  int64  
 7   Type_of_order                45584 non-null  object 
 8   Type_of_vehicle              45584 non-null  object 
 9   multiple_deliveries          44591 non-null  float64
 10  Festival                     45356 non-null  object 
 11  City                         44384 non-null  object 
 12  Time_taken (min)             45584 non-null  int64  
 13  Order month     

In [9]:
Weather_conditions = ['Fog', 'Stormy', 'Sandstorms', 'Windy', 'Cloudy', 'Sunny']
Road_traffic = ['Jam', 'High', 'Medium', 'Low']
Type_of_order = ['Snack', 'Meal', 'Drinks', 'Buffet']
Type_of_vehicle = ['motorcycle', 'scooter', 'electric_scooter', 'bicycle']
Festival = ['No', 'Yes']
City = ['Metropolitian', 'Urban', 'Semi-Urban']

In [28]:
# Check the first row of the dataframe again

df.shape

(45584, 19)

In [12]:
X = df.drop("Time_taken (min)", axis=1)
y = df["Time_taken (min)"]

In [13]:
categorical_features = X.select_dtypes(include="object").columns
numerical_features = X.select_dtypes(exclude="object").columns

In [14]:
from sklearn.impute import SimpleImputer # for handling missing values
from sklearn.preprocessing import StandardScaler # for scaling data
from sklearn.preprocessing import OrdinalEncoder # for handling categorical features

from sklearn.compose import ColumnTransformer # for creating transforming the columns
from sklearn.pipeline import Pipeline

In [15]:
numerical_pipeline = Pipeline(
    steps= [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="median")),
        ("scaler", StandardScaler())
    ]
)

categorical_pipeline = Pipeline(
    steps= [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal_encoder", OrdinalEncoder(categories=[Weather_conditions,Road_traffic, Type_of_order, Type_of_vehicle, Festival, City])),
        ("scaler", StandardScaler())
    ]
)

pipeline = ColumnTransformer(
    [
        ("numerical_pipeline", numerical_pipeline, numerical_features),
        ("categorical_pipeline", categorical_pipeline, categorical_features)
    ]
)

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [18]:
X_train = pd.DataFrame(pipeline.fit_transform(X_train), columns=pipeline.get_feature_names_out())

In [19]:
X_test = pd.DataFrame(pipeline.transform(X_test), columns=pipeline.get_feature_names_out())

In [20]:
X_test.shape

(11396, 18)

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
lr = LinearRegression()
lr.fit(X_train, y_train)

lr.score(X_test, y_test)

0.5011929569189113

In [23]:
y_pred = lr.predict(X_test)

In [26]:
result_df = pd.DataFrame({"Actual Value": y_test, "Predicted Value":np.round(y_pred, 2)})


result_df["Differnce"]= result_df["Predicted Value"] - result_df["Actual Value"]

result_df

Unnamed: 0,Actual Value,Predicted Value,Differnce
27811,33,33.68,0.68
43475,28,21.75,-6.25
11902,22,27.15,5.15
36731,25,25.79,0.79
18188,35,30.85,-4.15
...,...,...,...
5559,46,35.59,-10.41
15355,22,23.50,1.50
1500,35,33.71,-1.29
38160,30,29.60,-0.40


In [30]:
df.to_csv("Final_Train2.csv")