In [16]:
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [17]:
data = pd.read_csv("train_cleaned_imputed.csv")

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40353 entries, 0 to 40352
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  40353 non-null  int64  
 1   Ratings              40353 non-null  float64
 2   RestaurantLat        40353 non-null  float64
 3   RestaurantLon        40353 non-null  float64
 4   DeliveryLocationLat  40353 non-null  float64
 5   DeliveryLocationLon  40353 non-null  float64
 6   TimeOrderPickedUp    40353 non-null  object 
 7   WeatherConditions    40353 non-null  object 
 8   RoadTrafficDensity   40353 non-null  object 
 9   VehicleCondition     40353 non-null  int64  
 10  TypeOfOrder          40353 non-null  object 
 11  TypeOfVehicle        40353 non-null  object 
 12  MultipleDeliveries   40353 non-null  int64  
 13  Festival             40353 non-null  object 
 14  City                 40353 non-null  object 
 15  TimeTaken            40353 non-null 

### Encoding of categorical variables

#### Label Encoding

In [19]:
def label_encoding(datale):
    categorical_columns = datale.select_dtypes(include='object').columns
    label_encoder = LabelEncoder()
    datale[categorical_columns] = datale[categorical_columns].apply(lambda col: label_encoder.fit_transform(col))

datale = data
label_encoding(datale)
datale.head()

Unnamed: 0,Age,Ratings,RestaurantLat,RestaurantLon,DeliveryLocationLat,DeliveryLocationLon,TimeOrderPickedUp,WeatherConditions,RoadTrafficDensity,VehicleCondition,TypeOfOrder,TypeOfVehicle,MultipleDeliveries,Festival,City,TimeTaken,Distance,Day,Hour,OrderPeriod
0,37,4.9,22.745049,75.892471,22.765049,75.912471,46,4,0,2,3,1,0,0,2,24,3.025149,2,11,2
1,34,4.5,12.913041,77.683237,13.043041,77.813237,143,3,1,2,3,2,1,0,0,33,20.18353,0,19,1
2,23,4.4,12.914264,77.6784,12.924264,77.6884,10,2,2,0,1,1,1,0,2,26,1.552758,2,8,2
3,38,4.7,11.003669,76.976494,11.053669,77.026494,123,4,3,0,0,1,1,0,0,21,7.790401,5,18,1
4,32,4.6,12.972793,80.249982,13.012793,80.289982,70,0,0,1,3,2,1,0,0,30,6.210138,2,13,0


#### One Hot Encoding

In [20]:
ohe = OneHotEncoder()

dataoh = data[['RoadTrafficDensity','MultipleDeliveries','Festival', 'City', 'OrderPeriod', 
               'Day', 'WeatherConditions']]
ohe.fit(dataoh)
dataoh = pd.DataFrame(ohe.transform(dataoh).toarray(), 
                                  columns=ohe.get_feature_names_out(dataoh.columns))

dataoh = pd.concat([data[['Age', 'Ratings','Hour','TimeTaken', 'Distance']].reset_index(drop=True),
                    dataoh.reset_index(drop=True)], axis = 1)

dataoh.head()

Unnamed: 0,Age,Ratings,Hour,TimeTaken,Distance,RoadTrafficDensity_0,RoadTrafficDensity_1,RoadTrafficDensity_2,RoadTrafficDensity_3,MultipleDeliveries_0,...,Day_3,Day_4,Day_5,Day_6,WeatherConditions_0,WeatherConditions_1,WeatherConditions_2,WeatherConditions_3,WeatherConditions_4,WeatherConditions_5
0,37,4.9,11,24,3.025149,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,34,4.5,19,33,20.18353,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,23,4.4,8,26,1.552758,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,38,4.7,18,21,7.790401,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,32,4.6,13,30,6.210138,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Forming a Train-test Split for Machine Learning on Label-encoded Data

In [21]:
X = datale.drop('TimeTaken', axis=1)  # Features
y = datale['TimeTaken']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(30264, 19)
(30264,)
(10089, 19)
(10089,)


### Standardization

In [22]:
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Perform standardization on the training data
X_train = scaler.transform(X_train)

# Perform standardization on the testing data
X_test = scaler.transform(X_test)

In [23]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Find the best model
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    xgb.XGBRegressor(),
    
]

param_grid = [
    {},  
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [100, 200, 300]},
    {'n_estimators': [20, 25, 30], 'max_depth': [7, 10, 13]},
]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.5162838240007435

DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.7232654627491695

RandomForestRegressor:
Best parameters: {'n_estimators': 300}
Best R2 score: 0.8290596667168344

XGBRegressor:
Best parameters: {'max_depth': 7, 'n_estimators': 20}
Best R2 score: 0.8326236905688807



In [24]:
# Create a XGB regressor model
model = xgb.XGBRegressor(n_estimators=20,max_depth=7)

# Fit the model on the training data
model.fit(X_train, y_train)

In [25]:
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", round(mae,2))
print("Mean Squared Error (MSE):", round(mse,2))
print("Root Mean Squared Error (RMSE):", round(rmse,2))
print("R-squared (R2) Score:", round(r2,2))

Mean Absolute Error (MAE): 3.1
Mean Squared Error (MSE): 14.74
Root Mean Squared Error (RMSE): 3.84
R-squared (R2) Score: 0.83


### Forming a Train-test Split for Machine Learning on One-hot-encoded Data

In [26]:
X = dataoh.drop('TimeTaken', axis=1)  # Features
y = dataoh['TimeTaken']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(30264, 34)
(30264,)
(10089, 34)
(10089,)


In [27]:
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Perform standardization on the training data
X_train = scaler.transform(X_train)

# Perform standardization on the testing data
X_test = scaler.transform(X_test)

In [28]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Find the best model
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    xgb.XGBRegressor(),
]

param_grid = [
    {},  
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [100, 200, 300], 'max_features': ['sqrt', 'log2', None]},
    {'n_estimators': [20, 25, 30], 'max_depth': [7, 10, 13]},
]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.5687732891145172

DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.7115900027453183

RandomForestRegressor:
Best parameters: {'max_features': 'log2', 'n_estimators': 300}
Best R2 score: 0.7474643930074252

XGBRegressor:
Best parameters: {'max_depth': 7, 'n_estimators': 20}
Best R2 score: 0.7650455224092649



In [29]:
# Create a XGB regressor model
model = xgb.XGBRegressor(n_estimators=20, max_depth=7)

# Fit the model on the training data
model.fit(X_train, y_train)

In [30]:
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", round(mae, 2))
print("Mean Squared Error (MSE):", round(mse, 2))
print("Root Mean Squared Error (RMSE):", round(rmse, 2))
print("R-squared (R2) Score:", round(r2, 2))

Mean Absolute Error (MAE): 3.65
Mean Squared Error (MSE): 21.34
Root Mean Squared Error (RMSE): 4.62
R-squared (R2) Score: 0.76
