## Load Libraries and Dataset

In [45]:

# we begin by importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statistics

from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [46]:
# load data into notebook
data = pd.read_csv("/content/train.csv")

## Data exploration

In [47]:
# to see the number of rows and columns
data.shape

(45593, 20)

In [48]:
data.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26
3,0x7a6a,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1,No,Metropolitian,(min) 21
4,0x70a2,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,13:45:00,conditions Cloudy,High,1,Snack,scooter,1,No,Metropolitian,(min) 30


In [49]:
# view datatypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45593 non-null  object 
 1   Delivery_person_ID           45593 non-null  object 
 2   Delivery_person_Age          45593 non-null  object 
 3   Delivery_person_Ratings      45593 non-null  object 
 4   Restaurant_latitude          45593 non-null  float64
 5   Restaurant_longitude         45593 non-null  float64
 6   Delivery_location_latitude   45593 non-null  float64
 7   Delivery_location_longitude  45593 non-null  float64
 8   Order_Date                   45593 non-null  object 
 9   Time_Orderd                  45593 non-null  object 
 10  Time_Order_picked            45593 non-null  object 
 11  Weatherconditions            45593 non-null  object 
 12  Road_traffic_density         45593 non-null  object 
 13  Vehicle_conditio

In [50]:
# first check summary stats for columns with numerical data types
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Restaurant_latitude,45593.0,17.017729,8.185109,-30.905562,12.933284,18.546947,22.728163,30.914057
Restaurant_longitude,45593.0,70.231332,22.883647,-88.366217,73.17,75.898497,78.044095,88.433452
Delivery_location_latitude,45593.0,17.465186,7.335122,0.01,12.988453,18.633934,22.785049,31.054057
Delivery_location_longitude,45593.0,70.845702,21.118812,0.01,73.28,76.002574,78.107044,88.563452
Vehicle_condition,45593.0,1.023359,0.839065,0.0,0.0,1.0,2.0,3.0


In [51]:
# now check summary stats for columns with non-numerical datatype
data.describe(exclude=np.number).T

Unnamed: 0,count,unique,top,freq
ID,45593,45593,0x4607,1
Delivery_person_ID,45593,1320,PUNERES01DEL01,67
Delivery_person_Age,45593,23,35,2262
Delivery_person_Ratings,45593,29,4.8,7148
Order_Date,45593,44,15-03-2022,1192
Time_Orderd,45593,177,,1731
Time_Order_picked,45593,193,21:30:00,496
Weatherconditions,45593,7,conditions Fog,7654
Road_traffic_density,45593,5,Low,15477
Type_of_order,45593,4,Snack,11533


In [52]:
#Explore each column
for column in data.columns:
    print(column)
    print(data[column].value_counts())
    print("*********************************************")

ID
ID
0x4607     1
0x1f3e     1
0xe251     1
0x3f31     1
0x4a78     1
          ..
0xc3f1     1
0x5db7     1
0x1985     1
0xceda     1
0x5fb2     1
Name: count, Length: 45593, dtype: int64
*********************************************
Delivery_person_ID
Delivery_person_ID
PUNERES01DEL01     67
JAPRES11DEL02      67
HYDRES04DEL02      66
JAPRES03DEL01      66
VADRES11DEL02      66
                   ..
DEHRES18DEL03       7
AURGRES11DEL03      7
KOLRES09DEL03       6
KOCRES16DEL03       6
BHPRES010DEL03      5
Name: count, Length: 1320, dtype: int64
*********************************************
Delivery_person_Age
Delivery_person_Age
35      2262
36      2260
37      2227
30      2226
38      2219
24      2210
32      2202
22      2196
29      2191
33      2187
28      2179
25      2174
34      2166
26      2159
21      2153
27      2150
39      2144
20      2136
31      2120
23      2087
NaN     1854
50        53
15        38
Name: count, dtype: int64
*********************************


#### Observations:
1. We have null values present in several columns.
2. Data Formatting will be required for Weatherconditions & Time_taken(min) column.
3. We have both numerical & categorical features present.

## Data Cleaning

In [53]:
# drop redundant columns
data.drop(['ID'],axis=1,inplace=True)

data.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26
3,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1,No,Metropolitian,(min) 21
4,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,13:45:00,conditions Cloudy,High,1,Snack,scooter,1,No,Metropolitian,(min) 30


In [54]:
#Convert String 'NaN' to np.na
data.replace('NaN', float(np.nan), regex=True, inplace=True)

In [55]:
#Check null values
data.isnull().sum().sort_values(ascending=False)

Delivery_person_Ratings        1908
Delivery_person_Age            1854
Time_Orderd                    1731
City                           1200
multiple_deliveries             993
Weatherconditions               616
Road_traffic_density            601
Festival                        228
Delivery_person_ID                0
Vehicle_condition                 0
Type_of_vehicle                   0
Type_of_order                     0
Time_Order_picked                 0
Order_Date                        0
Delivery_location_longitude       0
Delivery_location_latitude        0
Restaurant_longitude              0
Restaurant_latitude               0
Time_taken(min)                   0
dtype: int64

In [56]:
# we impute the null values with respect tho each column's nature

def impute_nulls(df):
    df['Delivery_person_Age'].fillna(np.random.choice(df['Delivery_person_Age']), inplace=True)
    df['Weatherconditions'].fillna(np.random.choice(df['Weatherconditions']), inplace=True)
    df['City'].fillna(df['City'].mode()[0], inplace=True)
    df['Festival'].fillna(df['Festival'].mode()[0], inplace=True)
    df['multiple_deliveries'].fillna(df['multiple_deliveries'].mode()[0], inplace=True)
    df['Road_traffic_density'].fillna(df['Road_traffic_density'].mode()[0], inplace=True)
    df['Delivery_person_Ratings'].fillna(df['Delivery_person_Ratings'].median(), inplace=True)

impute_nulls(data)
data.isnull().sum()

Delivery_person_ID                0
Delivery_person_Age               0
Delivery_person_Ratings           0
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Order_Date                        0
Time_Orderd                    1731
Time_Order_picked                 0
Weatherconditions                 0
Road_traffic_density              0
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries               0
Festival                          0
City                              0
Time_taken(min)                   0
dtype: int64

In [57]:
# check for duplicates
if (len(data[data.duplicated()])>0):
    print("There are Duplicate values present")
else:
    print("There is no duplicate value present")

There is no duplicate value present


In [58]:
# change column to similar naming format as others
data.rename(columns={'Weatherconditions': 'Weather_conditions'},inplace=True)
data.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26
3,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1,No,Metropolitian,(min) 21
4,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,13:45:00,conditions Cloudy,High,1,Snack,scooter,1,No,Metropolitian,(min) 30


In [59]:
# update data types

data['Delivery_person_Age'] = data['Delivery_person_Age'].astype('float64')
data['Delivery_person_Ratings'] = data['Delivery_person_Ratings'].astype('float64')
data['multiple_deliveries'] = data['multiple_deliveries'].astype('float64')
data['Order_Date']=pd.to_datetime(data['Order_Date'],format="%d-%m-%Y")

In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Delivery_person_ID           45593 non-null  object        
 1   Delivery_person_Age          45593 non-null  float64       
 2   Delivery_person_Ratings      45593 non-null  float64       
 3   Restaurant_latitude          45593 non-null  float64       
 4   Restaurant_longitude         45593 non-null  float64       
 5   Delivery_location_latitude   45593 non-null  float64       
 6   Delivery_location_longitude  45593 non-null  float64       
 7   Order_Date                   45593 non-null  datetime64[ns]
 8   Time_Orderd                  43862 non-null  object        
 9   Time_Order_picked            45593 non-null  object        
 10  Weather_conditions           45593 non-null  object        
 11  Road_traffic_density         45593 non-nu

## Data Engineering

In [61]:
# remove redundant info from columns

# Extract time and convert to int
data['Time_taken(min)'] = data['Time_taken(min)'].apply(lambda x: int(x.split(' ')[1].strip()))

# Extract Weather conditions
data['Weather_conditions'] = data['Weather_conditions'].apply(lambda x: x.split(' ')[1].strip())

# Extract city code from Delivery person
data['City_code']=data['Delivery_person_ID'].str.split("RES", expand=True)[0]

In [62]:
data["day"] = data.Order_Date.dt.day
data["month"] = data.Order_Date.dt.month
data["quarter"] = data.Order_Date.dt.quarter
data["year"] = data.Order_Date.dt.year
data['day_of_week'] = data.Order_Date.dt.day_of_week.astype(int)
data["is_month_start"] = data.Order_Date.dt.is_month_start.astype(int)
data["is_month_end"] = data["Order_Date"].dt.is_month_end.astype(int)
data["is_quarter_start"] = data["Order_Date"].dt.is_quarter_start.astype(int)
data["is_quarter_end"] = data["Order_Date"].dt.is_quarter_end.astype(int)
data["is_year_start"] = data["Order_Date"].dt.is_year_start.astype(int)
data["is_year_end"] = data["Order_Date"].dt

data.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,...,month,quarter,year,day_of_week,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end
0,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,...,3,1,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...
1,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,...,3,1,2022,4,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...
2,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,...,3,1,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...
3,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,...,4,2,2022,1,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...
4,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,...,3,1,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...


In [63]:
print(data.columns)

Index(['Delivery_person_ID', 'Delivery_person_Age', 'Delivery_person_Ratings',
       'Restaurant_latitude', 'Restaurant_longitude',
       'Delivery_location_latitude', 'Delivery_location_longitude',
       'Order_Date', 'Time_Orderd', 'Time_Order_picked', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
       'Time_taken(min)', 'City_code', 'day', 'month', 'quarter', 'year',
       'day_of_week', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end'],
      dtype='object')


In [64]:
# calculate time difference
# find the difference between ordered time & picked time  # Ensure 'Order_Date' is in datetime format
data['Order_Date'] = pd.to_datetime(data['Order_Date'])

# Ensure 'Time_Orderd' and 'Time_Order_picked' are in timedelta format
data['Time_Orderd'] = pd.to_timedelta(data['Time_Orderd'])
data['Time_Order_picked'] = pd.to_timedelta(data['Time_Order_picked'])

# Calculate 'Time_Ordered_formatted' as a datetime
data['Time_Ordered_formatted'] = data['Order_Date'] + data['Time_Orderd']

# Calculate 'Time_Order_picked_formatted' with day adjustment
data['Time_Order_picked_formatted'] = data['Order_Date'] + data['Time_Order_picked']
data.loc[data['Time_Order_picked'] < data['Time_Orderd'], 'Time_Order_picked_formatted'] += pd.DateOffset(days=1)

# Calculate the order preparation time in minutes
data['order_prepare_time'] = (data['Time_Order_picked_formatted'] - data['Time_Ordered_formatted']).dt.total_seconds() / 60

# Fill nulls with the median
data['order_prepare_time'].fillna(data['order_prepare_time'].median(), inplace=True)

# Drop the unnecessary columns
data.drop(['Time_Orderd', 'Time_Order_picked', 'Time_Ordered_formatted', 'Time_Order_picked_formatted', 'Order_Date'], axis=1, inplace=True)

data.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,...,quarter,year,day_of_week,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,order_prepare_time
0,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,Sunny,High,2,...,1,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,15.0
1,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,Stormy,Jam,2,...,1,2022,4,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,5.0
2,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,Sandstorms,Low,0,...,1,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,15.0
3,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,Sunny,Medium,0,...,2,2022,1,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,10.0
4,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,Cloudy,High,1,...,1,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,15.0


In [65]:
from geopy.distance import geodesic

#Calculate distance between restaurant location & delivery location
def calculate_distance(df):
    df['distance']=np.zeros(len(df))
    restaurant_coordinates=df[['Restaurant_latitude','Restaurant_longitude']].to_numpy()
    delivery_location_coordinates=df[['Delivery_location_latitude','Delivery_location_longitude']].to_numpy()
    df['distance'] = np.array([geodesic(restaurant, delivery) for restaurant, delivery in zip(restaurant_coordinates, delivery_location_coordinates)])
    df['distance']= df['distance'].astype("str").str.extract('(\d+)').astype("int64")

calculate_distance(data)

data.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,...,year,day_of_week,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,order_prepare_time,distance
0,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,Sunny,High,2,...,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,15.0,3
1,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,Stormy,Jam,2,...,2022,4,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,5.0,20
2,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,Sandstorms,Low,0,...,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,15.0,1
3,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,Sunny,Medium,0,...,2022,1,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,10.0,7
4,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,Cloudy,High,1,...,2022,5,0,0,0,0,0,<pandas.core.indexes.accessors.DatetimePropert...,15.0,6


## Data Preprocessing

In [66]:

#label encoding

def label_encoding(df):
    categorical_columns = df.select_dtypes(include='object').columns
    label_encoder = LabelEncoder()
    df[categorical_columns] = df[categorical_columns].apply(lambda col: label_encoder.fit_transform(col))

label_encoding(data)

data.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Weather_conditions,Road_traffic_density,Vehicle_condition,...,year,day_of_week,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,order_prepare_time,distance
0,637,37.0,4.9,22.745049,75.892471,22.765049,75.912471,4,0,2,...,2022,5,0,0,0,0,0,0,15.0,3
1,232,34.0,4.5,12.913041,77.683237,13.043041,77.813237,3,1,2,...,2022,4,0,0,0,0,0,0,5.0,20
2,234,23.0,4.4,12.914264,77.6784,12.924264,77.6884,2,2,0,...,2022,5,0,0,0,0,0,0,15.0,1
3,397,38.0,4.7,11.003669,76.976494,11.053669,77.026494,4,3,0,...,2022,1,0,0,0,0,0,0,10.0,7
4,333,32.0,4.6,12.972793,80.249982,13.012793,80.289982,0,0,1,...,2022,5,0,0,0,0,0,0,15.0,6


In [67]:
# split into features and target variables
X = data.drop('Time_taken(min)', axis=1)  # Features
y = data['Time_taken(min)']  # Target variable

In [68]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(36474, 29)
(36474,)
(9119, 29)
(9119,)


In [81]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from time import time

# Find the best model
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor()]

param_grid = [
    {},
    {'max_depth': [3, 5, 7]},
    {'n_estimators': [50, 200, 300]}
    ]

for i, model in enumerate(models):
    grid_search = GridSearchCV(model, param_grid[i], cv=5, scoring='r2')
    grid_search.fit(X_train_scaled, y_train)

    t0 = time()
    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    t1 = time()
    print(f"Time taken = {(t1-t0):.4f}s")
    print()

LinearRegression:
Best parameters: {}
Best R2 score: 0.4216756195676802
Time taken = 0.0016s

DecisionTreeRegressor:
Best parameters: {'max_depth': 7}
Best R2 score: 0.7170473415768908
Time taken = 0.0026s

RandomForestRegressor:
Best parameters: {'n_estimators': 200}
Best R2 score: 0.8107365786244045
Time taken = 0.0034s



In [89]:
# training model
n_estimators = 200
model = RandomForestRegressor(n_estimators)
model.fit(X_train_scaled,y_train)

In [90]:
# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", round(mae,2))
print("Mean Squared Error (MSE):", round(mse,2))
print("Root Mean Squared Error (RMSE):", round(rmse,2))
print("R-squared (R2) Score:", round(r2,2))

Mean Absolute Error (MAE): 3.21
Mean Squared Error (MSE): 16.52
Root Mean Squared Error (RMSE): 4.06
R-squared (R2) Score: 0.81


## Conclusion

1. Mean Absolute Error (MAE): The MAE score of 3.21 means that, on average, the model's predictions are 3.21 units away from the actual values. We ideally look for a lower MAE score, indicating more accurate predictions.

2. Mean Squared Error (MSE): The MSE score of 16.52 means that the average squared difference between the predicted and actual values is 16.52. We ideally look for a lower MSE score, suggesting fewer large errors.

3. Root Mean Squared Error (RMSE): The RMSE score of 4.06 means that the model's predictions typically deviate from the actual values by approximately 4.06 units. We ideally look for a lower RMSE score, indicating higher prediction accuracy.

4. R-squared (R2) Score: The R2 score of 0.81 means that the model explains 81% of the variance in the target variable. We ideally look for an R2 score closer to 1, indicating a better fit and higher explanatory power of the model.

## Recommendation

To improve the model further, consider exploring additional features, tuning hyperparameters more extensively, or trying different modeling techniques. Regular validation with fresh datasets will help ensure the model's continued performance and adaptability.