## Importing and Installing necessary libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
!pip install catboost



In [3]:
!pip install lightgbm



In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import LinearSVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,mean_absolute_percentage_error


## Data Loading

In [4]:
df=pd.read_csv("train.csv")

In [6]:
df.head(2)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


## No duplicate Records

In [6]:
df.duplicated().sum()

0

## Handling null values

In [8]:
df.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [9]:
##these are the features with nan value
features_with_na=[features for features in df.columns if df[features].isnull().sum()>=1]
for feature in features_with_na:
    print(feature,np.round(df[feature].isnull().mean()*100,5), '% missing values')

Product_Category_2 31.56664 % missing values
Product_Category_3 69.67266 % missing values


- Since `Product_Category_3` has missing percentage of 69.7%. Hence the column is dropped<br>
- In the features `User_ID` is unique to each and every customer and hence it has very high cardinality and it is not much helpful, hence dropping this feature as well.

In [10]:
df.drop(columns='User_ID',inplace=True)
df.drop(columns='Product_Category_3',inplace=True)

As `Product_Category_2` feature is a categorical variable so replacing the null values with the mode

In [11]:
df['Product_Category_2']=df['Product_Category_2'].fillna(df['Product_Category_2'].mode()[0])

In [12]:
df.isna().sum()

Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Purchase                      0
dtype: int64

## Feature Encoding

In [14]:
df['Gender']=df['Gender'].map({'M':0, 'F':1})
df['City_Category']=df['City_Category'].map({'A':0,"B":1,"C":2})

In [15]:
age_categories = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
stay_categories = ['0', '1', '2', '3', '4+']

In [16]:
ord_encoder=OrdinalEncoder(categories=[age_categories,stay_categories])

In [17]:
df[['Age','Stay_In_Current_City_Years']]=ord_encoder.fit_transform(df[['Age','Stay_In_Current_City_Years']])

In [18]:
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,P00069042,1,0.0,10,0,2.0,0,3,8.0,8370
1,P00248942,1,0.0,10,0,2.0,0,1,6.0,15200
2,P00087842,1,0.0,10,0,2.0,0,12,8.0,1422
3,P00085442,1,0.0,10,0,2.0,0,12,14.0,1057
4,P00285442,0,6.0,16,2,4.0,0,8,8.0,7969


In [19]:
one_hot_columns=['Occupation',
'City_Category',
'Product_Category_1',
'Product_Category_2'
]

In [20]:
df_new=pd.get_dummies(df,columns=one_hot_columns)

In [21]:
df_new.head(2)

Unnamed: 0,Product_ID,Gender,Age,Stay_In_Current_City_Years,Marital_Status,Purchase,Occupation_0,Occupation_1,Occupation_2,Occupation_3,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,P00069042,1,0.0,2.0,0,8370,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,P00248942,1,0.0,2.0,0,15200,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
counts=df_new['Product_ID'].value_counts()

In [23]:
df_new['Product_ID']=df_new['Product_ID'].map(counts)

In [24]:
df_new.head()

Unnamed: 0,Product_ID,Gender,Age,Stay_In_Current_City_Years,Marital_Status,Purchase,Occupation_0,Occupation_1,Occupation_2,Occupation_3,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,227,1,0.0,2.0,0,8370,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,581,1,0.0,2.0,0,15200,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,102,1,0.0,2.0,0,1422,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,341,1,0.0,2.0,0,1057,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,203,0,6.0,4.0,0,7969,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df_new.head(1)

Unnamed: 0,Product_ID,Gender,Age,Stay_In_Current_City_Years,Marital_Status,Purchase,Occupation_0,Occupation_1,Occupation_2,Occupation_3,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,227,1,0.0,2.0,0,8370,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Spliting Dependent and Independent variables

In [26]:
x=df_new.drop(columns='Purchase')
y=df_new['Purchase']

In [27]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 66 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  550068 non-null  int64  
 1   Gender                      550068 non-null  int64  
 2   Age                         550068 non-null  float64
 3   Stay_In_Current_City_Years  550068 non-null  float64
 4   Marital_Status              550068 non-null  int64  
 5   Occupation_0                550068 non-null  uint8  
 6   Occupation_1                550068 non-null  uint8  
 7   Occupation_2                550068 non-null  uint8  
 8   Occupation_3                550068 non-null  uint8  
 9   Occupation_4                550068 non-null  uint8  
 10  Occupation_5                550068 non-null  uint8  
 11  Occupation_6                550068 non-null  uint8  
 12  Occupation_7                550068 non-null  uint8  
 13  Occupation_8  

## Train Test Split

In [28]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=123)

## Model Training

In [29]:
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'XGBoost Regression': XGBRegressor(),
    'KNN Regression':KNeighborsRegressor(),
    'Random Forest Regression':RandomForestRegressor(),
    'GBM Regression':GradientBoostingRegressor(),
    'AdaBoost Regression':AdaBoostRegressor(),
    'CatBoost Regression':CatBoostRegressor(),
    'Light GBM Regression':LGBMRegressor(),
    'SVR Regression':LinearSVR()
}

In [30]:
# Initialize metrics
metrics = ['MSE', 'MAE', 'R2', 'Adjusted R2', 'RMSE','MAPE']

# Initialize DataFrame to store results
results_df = pd.DataFrame(index=metrics, columns=models.keys())

In [31]:
# Fit models, predict on test data, and evaluate
for model_name, model in models.items():
    print(f'fitting {model_name}')
    # Fit the model
    model.fit(x_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(x_test)
    
    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    adjusted_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - x_test.shape[1] - 1)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mape=mean_absolute_percentage_error(y_test,y_pred)
    # Store results in DataFrame
    results_df.at['MSE', model_name] = mse
    results_df.at['MAE', model_name] = mae
    results_df.at['R2', model_name] = r2
    results_df.at['Adjusted R2', model_name] = adjusted_r2
    results_df.at['RMSE', model_name] = rmse
    results_df.at['MAPE', model_name] = mape
    print(f'done with {model_name}')

fitting Linear Regression
done with Linear Regression
fitting Ridge Regression
done with Ridge Regression
fitting Lasso Regression
done with Lasso Regression
fitting Decision Tree Regression
done with Decision Tree Regression
fitting XGBoost Regression
done with XGBoost Regression
fitting KNN Regression
done with KNN Regression
fitting Random Forest Regression
done with Random Forest Regression
fitting GBM Regression
done with GBM Regression
fitting AdaBoost Regression
done with AdaBoost Regression
fitting CatBoost Regression
Learning rate set to 0.10488
0:	learn: 4757.0605755	total: 240ms	remaining: 3m 59s
1:	learn: 4531.8296465	total: 300ms	remaining: 2m 29s
2:	learn: 4341.0617339	total: 357ms	remaining: 1m 58s
3:	learn: 4180.3767269	total: 419ms	remaining: 1m 44s
4:	learn: 4039.0224435	total: 480ms	remaining: 1m 35s
5:	learn: 3919.4147732	total: 539ms	remaining: 1m 29s
6:	learn: 3812.0753119	total: 597ms	remaining: 1m 24s
7:	learn: 3724.0569366	total: 654ms	remaining: 1m 21s
8:	lear

143:	learn: 2803.4409368	total: 8.11s	remaining: 48.2s
144:	learn: 2803.1513179	total: 8.17s	remaining: 48.2s
145:	learn: 2802.8251229	total: 8.22s	remaining: 48.1s
146:	learn: 2802.3985055	total: 8.27s	remaining: 48s
147:	learn: 2801.8701197	total: 8.34s	remaining: 48s
148:	learn: 2801.1748859	total: 8.39s	remaining: 47.9s
149:	learn: 2800.8704329	total: 8.45s	remaining: 47.9s
150:	learn: 2800.3668884	total: 8.51s	remaining: 47.9s
151:	learn: 2800.0449789	total: 8.56s	remaining: 47.8s
152:	learn: 2799.6216317	total: 8.62s	remaining: 47.7s
153:	learn: 2799.0240621	total: 8.67s	remaining: 47.6s
154:	learn: 2798.7519334	total: 8.73s	remaining: 47.6s
155:	learn: 2798.2082990	total: 8.79s	remaining: 47.5s
156:	learn: 2797.7008658	total: 8.84s	remaining: 47.5s
157:	learn: 2797.4923343	total: 8.9s	remaining: 47.4s
158:	learn: 2797.0986347	total: 8.95s	remaining: 47.4s
159:	learn: 2796.9391416	total: 9.01s	remaining: 47.3s
160:	learn: 2796.5173491	total: 9.06s	remaining: 47.2s
161:	learn: 279

295:	learn: 2749.9787836	total: 16.6s	remaining: 39.4s
296:	learn: 2749.8117164	total: 16.6s	remaining: 39.4s
297:	learn: 2749.6983879	total: 16.7s	remaining: 39.3s
298:	learn: 2749.3855348	total: 16.7s	remaining: 39.3s
299:	learn: 2749.2094513	total: 16.8s	remaining: 39.2s
300:	learn: 2749.0273390	total: 16.9s	remaining: 39.2s
301:	learn: 2748.7737018	total: 16.9s	remaining: 39.1s
302:	learn: 2748.4898463	total: 17s	remaining: 39s
303:	learn: 2748.2609181	total: 17s	remaining: 38.9s
304:	learn: 2747.9607763	total: 17.1s	remaining: 38.9s
305:	learn: 2747.7193142	total: 17.1s	remaining: 38.8s
306:	learn: 2747.5812331	total: 17.2s	remaining: 38.8s
307:	learn: 2747.3272430	total: 17.2s	remaining: 38.7s
308:	learn: 2747.0472397	total: 17.3s	remaining: 38.7s
309:	learn: 2746.8037515	total: 17.3s	remaining: 38.6s
310:	learn: 2746.4565366	total: 17.4s	remaining: 38.5s
311:	learn: 2746.3204985	total: 17.5s	remaining: 38.5s
312:	learn: 2746.0103018	total: 17.5s	remaining: 38.4s
313:	learn: 2745

449:	learn: 2716.1032464	total: 25.3s	remaining: 30.9s
450:	learn: 2715.9224896	total: 25.3s	remaining: 30.9s
451:	learn: 2715.7892364	total: 25.4s	remaining: 30.8s
452:	learn: 2715.6227137	total: 25.5s	remaining: 30.7s
453:	learn: 2715.4927509	total: 25.5s	remaining: 30.7s
454:	learn: 2715.2907816	total: 25.6s	remaining: 30.6s
455:	learn: 2715.1930293	total: 25.6s	remaining: 30.6s
456:	learn: 2714.9668053	total: 25.7s	remaining: 30.5s
457:	learn: 2714.8512423	total: 25.7s	remaining: 30.4s
458:	learn: 2714.6860540	total: 25.8s	remaining: 30.4s
459:	learn: 2714.5301411	total: 25.8s	remaining: 30.3s
460:	learn: 2714.4275099	total: 25.9s	remaining: 30.3s
461:	learn: 2714.3062939	total: 26s	remaining: 30.2s
462:	learn: 2713.9599934	total: 26s	remaining: 30.2s
463:	learn: 2713.8859949	total: 26.1s	remaining: 30.1s
464:	learn: 2713.6779157	total: 26.1s	remaining: 30.1s
465:	learn: 2713.2154453	total: 26.2s	remaining: 30s
466:	learn: 2713.0677888	total: 26.3s	remaining: 30s
467:	learn: 2712.8

602:	learn: 2690.8233584	total: 33.7s	remaining: 22.2s
603:	learn: 2690.7684058	total: 33.7s	remaining: 22.1s
604:	learn: 2690.6907414	total: 33.8s	remaining: 22.1s
605:	learn: 2690.5510106	total: 33.8s	remaining: 22s
606:	learn: 2690.4581284	total: 33.9s	remaining: 21.9s
607:	learn: 2690.3623650	total: 33.9s	remaining: 21.9s
608:	learn: 2690.1789356	total: 34s	remaining: 21.8s
609:	learn: 2689.9523851	total: 34.1s	remaining: 21.8s
610:	learn: 2689.8215380	total: 34.1s	remaining: 21.7s
611:	learn: 2689.7201848	total: 34.2s	remaining: 21.7s
612:	learn: 2689.6435215	total: 34.2s	remaining: 21.6s
613:	learn: 2689.5173855	total: 34.3s	remaining: 21.6s
614:	learn: 2689.4327419	total: 34.3s	remaining: 21.5s
615:	learn: 2689.3422204	total: 34.4s	remaining: 21.4s
616:	learn: 2689.2370423	total: 34.4s	remaining: 21.4s
617:	learn: 2689.0595305	total: 34.5s	remaining: 21.3s
618:	learn: 2688.9089700	total: 34.5s	remaining: 21.3s
619:	learn: 2688.8020614	total: 34.6s	remaining: 21.2s
620:	learn: 26

756:	learn: 2671.6494336	total: 42.2s	remaining: 13.5s
757:	learn: 2671.5149843	total: 42.3s	remaining: 13.5s
758:	learn: 2671.4259914	total: 42.3s	remaining: 13.4s
759:	learn: 2671.3177750	total: 42.3s	remaining: 13.4s
760:	learn: 2671.2321365	total: 42.4s	remaining: 13.3s
761:	learn: 2671.0648516	total: 42.5s	remaining: 13.3s
762:	learn: 2670.9150565	total: 42.5s	remaining: 13.2s
763:	learn: 2670.8461132	total: 42.6s	remaining: 13.2s
764:	learn: 2670.7653621	total: 42.6s	remaining: 13.1s
765:	learn: 2670.5928029	total: 42.7s	remaining: 13s
766:	learn: 2670.5275768	total: 42.7s	remaining: 13s
767:	learn: 2670.3670192	total: 42.8s	remaining: 12.9s
768:	learn: 2670.2947004	total: 42.8s	remaining: 12.9s
769:	learn: 2670.1150052	total: 42.9s	remaining: 12.8s
770:	learn: 2670.0118760	total: 43s	remaining: 12.8s
771:	learn: 2669.9188146	total: 43s	remaining: 12.7s
772:	learn: 2669.8430315	total: 43.1s	remaining: 12.6s
773:	learn: 2669.7309418	total: 43.1s	remaining: 12.6s
774:	learn: 2669.6

909:	learn: 2656.3976003	total: 50.7s	remaining: 5.01s
910:	learn: 2656.3335767	total: 50.8s	remaining: 4.96s
911:	learn: 2656.1768094	total: 50.8s	remaining: 4.9s
912:	learn: 2656.0190895	total: 50.9s	remaining: 4.85s
913:	learn: 2655.9619196	total: 51s	remaining: 4.79s
914:	learn: 2655.8794674	total: 51s	remaining: 4.74s
915:	learn: 2655.7477776	total: 51.1s	remaining: 4.68s
916:	learn: 2655.6279324	total: 51.1s	remaining: 4.63s
917:	learn: 2655.5452167	total: 51.2s	remaining: 4.57s
918:	learn: 2655.4903641	total: 51.3s	remaining: 4.52s
919:	learn: 2655.4240597	total: 51.3s	remaining: 4.46s
920:	learn: 2655.3603768	total: 51.4s	remaining: 4.41s
921:	learn: 2655.2613626	total: 51.4s	remaining: 4.35s
922:	learn: 2654.8135745	total: 51.5s	remaining: 4.3s
923:	learn: 2654.7113082	total: 51.6s	remaining: 4.24s
924:	learn: 2654.6441273	total: 51.6s	remaining: 4.18s
925:	learn: 2654.4998838	total: 51.7s	remaining: 4.13s
926:	learn: 2654.4383706	total: 51.7s	remaining: 4.07s
927:	learn: 2654



In [32]:
results_df

Unnamed: 0,Linear Regression,Ridge Regression,Lasso Regression,Decision Tree Regression,XGBoost Regression,KNN Regression,Random Forest Regression,GBM Regression,AdaBoost Regression,CatBoost Regression,Light GBM Regression,SVR Regression
MSE,8290016.378512,8290042.237272,8297516.065143,13558500.622103,7280845.163347,9008359.921806,8165457.383167,8281645.791594,13471897.820605,7145395.742548,7538730.400332,10619578.955432
MAE,2165.206276,2165.215514,2167.365933,2597.716831,2012.751197,2215.470049,2096.993913,2165.123088,2807.106705,1994.132476,2050.300713,2440.081509
R2,0.671438,0.671437,0.671141,0.46263,0.711435,0.642968,0.676375,0.67177,0.466062,0.716803,0.701214,0.57911
Adjusted R2,0.671307,0.671306,0.671009,0.462415,0.71132,0.642825,0.676245,0.671639,0.465849,0.71669,0.701095,0.578941
RMSE,2879.238854,2879.243345,2880.540933,3682.186935,2698.304127,3001.392997,2857.526445,2877.784876,3670.4084,2673.087305,2745.674853,3258.769546
MAPE,0.35671,0.356795,0.37781,0.373851,0.325238,0.37017,0.313854,0.565208,1.575721,0.328694,0.345942,1.486114


From above dataframe following are the better performing models based on R2, RMSE and MAPE
- XGBoost
- Random Forest
- CatBoost 
- Light GBM

And on these models hyperparameter tuning is done to select the best model

# Hyperparameter Tuning

In [43]:
def evaluate_clf(true, predicted):
    mse = mean_squared_error(true, predicted)
    mae = mean_absolute_error(true, predicted)
    r2 = r2_score(true, predicted)
    rmse = mean_squared_error(true, predicted, squared=False)
    mape=mean_absolute_percentage_error(true, predicted)
    
    return mse, mae , r2, rmse, mape

In [50]:
# Create a function which can evaluate models and return a report 
def evaluate_models(X, y, models):
    '''
    This function takes in X and y and models dictionary as input
    It splits the data into Train Test split
    Iterates through the given model dictionary and evaluates the metrics
    Returns: Dataframe which contains report of all models metrics with cost
    '''
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
    
    models_list = []
    mse = []
    mae= []
    r2=[]
    rmse=[]
    mape=[]
    
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        model_train_mse, model_train_mae,model_train_r2,\
        model_train_rmse,model_train_mape=evaluate_clf(y_train ,y_train_pred)


        # Test set performance       
        model_test_mse, model_test_mae,model_test_r2,\
        model_test_rmse,model_test_mape=evaluate_clf(y_test ,y_test_pred)
        
        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- MSE: {:.4f}".format(model_train_mse))
        print('- MAE: {:.4f}'.format(model_train_mae)) 
        print('- R2: {:.4f}'.format(model_train_r2))
        print('- RMSE: {:.4f}'.format(model_train_rmse))
        print('- MAPE: {:.4f}'.format(model_train_mape))

        print('----------------------------------')

        print('Model performance for Test set')
        print('- MSE: {:.4f}'.format(model_test_mse))
        mse.append(float(model_test_mse))
        mae.append(float(model_test_mae))
        r2.append(float(model_test_r2))
        rmse.append(float(model_test_rmse))
        mape.append(float(model_test_mape))
        print('- MAE: {:.4f}'.format(model_test_mae))
        print('- R2: {:.4f}'.format(model_test_r2))
        print('- RMSE: {:.4f}'.format(model_test_rmse))
        print('- MAPE: {:.4f}'.format(model_test_mape))
        print('='*35)
        print('\n')
        
    report=pd.DataFrame(list(zip(models_list,mse,mae,r2,rmse,mape)), columns=['Model Name', 'MSE','MAE','R2','RMSE','MAPE']).sort_values(by=['R2'], ascending=False)
        
    return report

In [36]:
#Initialize few parameter for Hyperparamter tuning
xgboost_params = {
    'max_depth':range(3,7,2),
    'min_child_weight':range(1,6,2)
}

rf_params = {
    "max_depth": [10, None, 15],
    "max_features": ['sqrt', 'log2', None],
    "n_estimators": [10, 100, 200]
}


catboost_params={
                  'depth'         : [6,8,10],
                  'learning_rate' : [0.01, 0.05, 0.1],
                  'iterations'    : [30, 50, 100]
}

lightgbm_params={
        "max_depth": [10,None, 15],
        'learning_rate' : [0.01, 0.05, 0.1],
        'num_leaves': [20,25,32],
}

In [37]:
# Models list for Hyperparameter tuning
randomcv_models = [
    ('XGBoost', XGBRegressor(), xgboost_params),
    ("RF", RandomForestRegressor(), rf_params),
    ("CatBoost", CatBoostRegressor(), catboost_params),
    ("LGBM", LGBMRegressor(), lightgbm_params)

]

In [38]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2, 
                                   n_jobs=-1)
    random.fit(x, y)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])



Fitting 3 folds for each of 6 candidates, totalling 18 fits




Fitting 3 folds for each of 27 candidates, totalling 81 fits




Fitting 3 folds for each of 27 candidates, totalling 81 fits
0:	learn: 4722.0610936	total: 108ms	remaining: 10.7s
1:	learn: 4463.1888307	total: 160ms	remaining: 7.82s
2:	learn: 4241.0082608	total: 227ms	remaining: 7.32s
3:	learn: 4052.3430360	total: 275ms	remaining: 6.6s
4:	learn: 3888.0006902	total: 327ms	remaining: 6.21s
5:	learn: 3750.6089552	total: 381ms	remaining: 5.96s
6:	learn: 3634.6045225	total: 430ms	remaining: 5.71s
7:	learn: 3533.9335324	total: 478ms	remaining: 5.5s
8:	learn: 3447.6493543	total: 529ms	remaining: 5.35s
9:	learn: 3370.7548844	total: 580ms	remaining: 5.22s
10:	learn: 3305.7952592	total: 628ms	remaining: 5.08s
11:	learn: 3251.8068175	total: 688ms	remaining: 5.05s
12:	learn: 3205.2562344	total: 735ms	remaining: 4.92s
13:	learn: 3162.7939691	total: 797ms	remaining: 4.89s
14:	learn: 3126.2682931	total: 849ms	remaining: 4.81s
15:	learn: 3087.1247089	total: 897ms	remaining: 4.71s
16:	learn: 3057.7890600	total: 944ms	remaining: 4.61s
17:	learn: 3030.5916974	total: 99

In [39]:
model_param

{'XGBoost': {'min_child_weight': 1, 'max_depth': 5},
 'RF': {'n_estimators': 200, 'max_features': None, 'max_depth': 15},
 'CatBoost': {'learning_rate': 0.1, 'iterations': 100, 'depth': 10},
 'LGBM': {'num_leaves': 32, 'max_depth': None, 'learning_rate': 0.1}}

In [51]:
best_models = {
    "Random Forest Regressor": RandomForestRegressor(**model_param['RF']),
    "CatBoostRegressor": CatBoostRegressor(**model_param['CatBoost']),
    "LGBMRegressor": LGBMRegressor(**model_param['LGBM']),
    "XGBRegressor": XGBRegressor(**model_param['XGBoost'],n_jobs=-1),
}
tuned_report =evaluate_models(X=x, y=y, models=best_models)

Random Forest Regressor
Model performance for Training set
- MSE: 6234167.8384
- MAE: 1916.1921
- R2: 0.7530
- RMSE: 2496.8316
- MAPE: 0.4349
----------------------------------
Model performance for Test set
- MSE: 7476939.1605
- MAE: 2058.5834
- R2: 0.7033
- RMSE: 2734.3992
- MAPE: 0.4584


0:	learn: 4722.2951514	total: 135ms	remaining: 13.3s
1:	learn: 4462.4785854	total: 261ms	remaining: 12.8s
2:	learn: 4240.4234173	total: 375ms	remaining: 12.1s
3:	learn: 4051.3929264	total: 500ms	remaining: 12s
4:	learn: 3887.8844584	total: 620ms	remaining: 11.8s
5:	learn: 3748.0547665	total: 747ms	remaining: 11.7s
6:	learn: 3632.2033002	total: 873ms	remaining: 11.6s
7:	learn: 3533.1310924	total: 1s	remaining: 11.5s
8:	learn: 3445.5260330	total: 1.12s	remaining: 11.3s
9:	learn: 3371.9078984	total: 1.24s	remaining: 11.2s
10:	learn: 3306.7888683	total: 1.38s	remaining: 11.1s
11:	learn: 3252.4333035	total: 1.5s	remaining: 11s
12:	learn: 3202.4673759	total: 1.63s	remaining: 10.9s
13:	learn: 3159.5868748

In [52]:
tuned_report

Unnamed: 0,Model Name,MSE,MAE,R2,RMSE,MAPE
0,Random Forest Regressor,7476939.0,2058.583438,0.703323,2734.399232,0.458427
3,XGBRegressor,7532268.0,2047.06226,0.701127,2744.497754,0.32639
2,LGBMRegressor,7621658.0,2059.496441,0.697581,2760.735093,0.343362
1,CatBoostRegressor,7850334.0,2089.836501,0.688507,2801.844682,0.382086


So after hyperparameter tuning and evaluating the models on test data set two models `XGBoost` and `Light GBM` were performing better and out of these `XGBoost` can be selected as the Final Model.