In [26]:
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [27]:
data = pd.read_csv("train_cleaned_imputed_no_outlier_selcol.csv")

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39045 entries, 0 to 39044
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 39045 non-null  int64  
 1   Ratings             39045 non-null  float64
 2   MultipleDeliveries  39045 non-null  int64  
 3   RoadTrafficDensity  39045 non-null  object 
 4   TimeTaken           39045 non-null  int64  
 5   Distance            39045 non-null  float64
dtypes: float64(2), int64(3), object(1)
memory usage: 1.8+ MB


### Encoding of categorical variables

#### Label Encoding

In [29]:
def label_encoding(datale):
    categorical_columns = datale.select_dtypes(include='object').columns
    label_encoder = LabelEncoder()
    datale[categorical_columns] = datale[categorical_columns].apply(lambda col: label_encoder.fit_transform(col))

datale = data
label_encoding(datale)
datale.head()

Unnamed: 0,Age,Ratings,MultipleDeliveries,RoadTrafficDensity,TimeTaken,Distance
0,24,4.7,0,3,19,14.023232
1,36,4.8,0,3,15,13.407093
2,37,4.8,0,0,18,6.232139
3,34,4.7,0,0,23,6.118712
4,36,4.9,0,3,17,9.327285


#### One Hot Encoding

In [30]:
ohe = OneHotEncoder()

dataoh = data[['RoadTrafficDensity','MultipleDeliveries']]
ohe.fit(dataoh)
dataoh = pd.DataFrame(ohe.transform(dataoh).toarray(), 
                                  columns=ohe.get_feature_names_out(dataoh.columns))

dataoh = pd.concat([data[['Age', 'Ratings', 'TimeTaken', 'Distance']].reset_index(drop=True),
                    dataoh.reset_index(drop=True)], axis = 1)

dataoh.head()

Unnamed: 0,Age,Ratings,TimeTaken,Distance,RoadTrafficDensity_0,RoadTrafficDensity_1,RoadTrafficDensity_2,RoadTrafficDensity_3,MultipleDeliveries_0,MultipleDeliveries_1,MultipleDeliveries_2,MultipleDeliveries_3
0,24,4.7,19,14.023232,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,36,4.8,15,13.407093,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,37,4.8,18,6.232139,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,34,4.7,23,6.118712,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,36,4.9,17,9.327285,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


### Forming a Train-test Split for Machine Learning on Label-encoded Data

In [31]:
X = datale.drop('TimeTaken', axis=1)  # Features
y = datale['TimeTaken']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(29283, 5)
(29283,)
(9762, 5)
(9762,)


### Standardization

In [32]:
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Perform standardization on the training data
X_train = scaler.transform(X_train)

# Perform standardization on the testing data
X_test = scaler.transform(X_test)

In [33]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Find the best model
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    xgb.XGBRegressor(),
    
]

param_grid = [
    {},  
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [100, 200, 300]},
    {'n_estimators': [20, 25, 30], 'max_depth': [7, 10, 13]},
]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.3843149116615908



DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.563004565012716

RandomForestRegressor:
Best parameters: {'n_estimators': 300}
Best R2 score: 0.48041580807718853

XGBRegressor:
Best parameters: {'max_depth': 7, 'n_estimators': 20}
Best R2 score: 0.5575714460706077



In [34]:
# Create a XGB regressor model
model = xgb.XGBRegressor(n_estimators=20,max_depth=7)

# Fit the model on the training data
model.fit(X_train, y_train)

In [35]:
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", round(mae,2))
print("Mean Squared Error (MSE):", round(mse,2))
print("Root Mean Squared Error (RMSE):", round(rmse,2))
print("R-squared (R2) Score:", round(r2,2))

Mean Absolute Error (MAE): 4.77
Mean Squared Error (MSE): 36.97
Root Mean Squared Error (RMSE): 6.08
R-squared (R2) Score: 0.57


### Forming a Train-test Split for Machine Learning on One-hot-encoded Data

In [36]:
X = dataoh.drop('TimeTaken', axis=1)  # Features
y = dataoh['TimeTaken']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(29283, 11)
(29283,)
(9762, 11)
(9762,)


In [37]:
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Perform standardization on the training data
X_train = scaler.transform(X_train)

# Perform standardization on the testing data
X_test = scaler.transform(X_test)

In [38]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Find the best model
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    xgb.XGBRegressor(),
]

param_grid = [
    {},  
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [100, 200, 300], 'max_features': ['sqrt', 'log2', None]},
    {'n_estimators': [20, 25, 30], 'max_depth': [7, 10, 13]},
]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.47285097465248727

DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.5591386015472966

RandomForestRegressor:
Best parameters: {'max_features': None, 'n_estimators': 300}
Best R2 score: 0.4821741296004243

XGBRegressor:
Best parameters: {'max_depth': 7, 'n_estimators': 20}
Best R2 score: 0.5563854153931108



In [39]:
# Create a XGB regressor model
model = xgb.XGBRegressor(n_estimators=20, max_depth=7)

# Fit the model on the training data
model.fit(X_train, y_train)

In [40]:
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", round(mae, 2))
print("Mean Squared Error (MSE):", round(mse, 2))
print("Root Mean Squared Error (RMSE):", round(rmse, 2))
print("R-squared (R2) Score:", round(r2, 2))

Mean Absolute Error (MAE): 4.77
Mean Squared Error (MSE): 37.04
Root Mean Squared Error (RMSE): 6.09
R-squared (R2) Score: 0.57
