#### Importing necessary Libraries

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV 


#### Read the dataset

In [3]:
df = pd.read_csv('./Data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# drop the column id
df.drop('id',axis=1, inplace=True)

#### Getting X and Y variables

In [5]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

#### Data Transformation Pipeline

In [6]:
# defining numerical & categorical columns 
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

#creating pipelnes 
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler 

#defining the categorical variables with their values for ordinal encoding 

cut_feature = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_feature = ['F','J', 'G', 'E', 'D', 'H', 'I']
clarity_feature = ['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'] 

#creating pipeline 

num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder',OrdinalEncoder(categories=[cut_feature,color_feature,clarity_feature])),
        ('scaler',StandardScaler())
    ]
) 

#column transformation  

from sklearn.compose import ColumnTransformer 

preprocessor = ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,num_features),
        ('cat_pipeline',cat_pipeline,cat_features)
    ]
) 

#### Transformation of data through created Pipeline 

In [7]:
X = preprocessor.fit_transform(X)

In [8]:
X_transform = pd.DataFrame(X,columns=preprocessor.get_feature_names_out())

X_transform.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,1.576255,0.350768,0.402496,1.401353,1.460456,1.474414,-0.132929,-1.539563,-1.203285
1,2.678513,0.165874,0.402496,2.113437,2.17712,2.200187,-1.138809,-1.006501,-0.641585
2,-0.196003,-0.573702,-0.118652,-0.022816,0.008986,-0.04971,0.872951,-0.473439,-0.079885
3,-1.017293,-0.203914,-0.639801,-1.203614,-1.188477,-1.196432,0.872951,-0.473439,-0.079885
4,1.965288,0.720555,0.923644,1.743874,1.714463,1.793754,-0.132929,-0.473439,-1.203285


#### Split the Train & Test Data 

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((154858, 9), (38715, 9))

#### Evaluate Function to give metrics after model Training

In [10]:
def evaluate_model(true,predicted):
     mse =  mean_squared_error(true,predicted)
     mae = mean_absolute_error(true,predicted)
     r2_sqr = r2_score(true,predicted)
     rmse = np.sqrt(mean_squared_error(true, predicted))
  
     return r2_sqr,rmse, mae 

#### Defining all the machine learning models 

In [11]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

#### Training all models 

In [12]:
model_list = []
r2_list = []  

for i in range(len(list(models))):
    model = list(models.values())[i] 
    
    #train the model 
    model.fit(X_train,y_train.values.flatten())

    #make prediction 
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test) 

    # evalute the model  
    model_train_r2 , model_train_rmse, model_train_mae  = evaluate_model(y_train,y_train_pred)

    model_test_r2 , model_test_rmse, model_test_mae  = evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i]) 
    model_list.append(list(models.keys())[i]) 

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('-'*30)
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*30)
    print('\n')


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1195.8217
- Mean Absolute Error: 725.3900
- R2 Score: 0.9123
------------------------------
Model performance for Test set
- Root Mean Squared Error: 1184.7408
- Mean Absolute Error: 720.2102
- R2 Score: 0.9131




Lasso
Model performance for Training set
- Root Mean Squared Error: 1195.8918
- Mean Absolute Error: 726.8193
- R2 Score: 0.9123
------------------------------
Model performance for Test set
- Root Mean Squared Error: 1184.9134
- Mean Absolute Error: 721.7100
- R2 Score: 0.9131


Ridge
Model performance for Training set
- Root Mean Squared Error: 1195.8218
- Mean Absolute Error: 725.4152
- R2 Score: 0.9123
------------------------------
Model performance for Test set
- Root Mean Squared Error: 1184.7472
- Mean Absolute Error: 720.2344
- R2 Score: 0.9131


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 598.8191
- Mean Absolute Error: 305.4807
- R2 Score: 0.9780
------------------------------
Model performance for Test set
- Root Mean Squared Error: 750.5335
- Mean Absolute Error: 378.8495
- R2 Score: 0.9651


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 12.8816
- Mean Absolute Error: 0.8012
- R2 Score: 1.0000
----------

#### Model Results 

In [13]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model_Name','r2_Score']).sort_values(by='r2_Score',ascending=False)

Unnamed: 0,Model_Name,r2_Score
7,CatBoosting Regressor,0.978725
6,XGBRegressor,0.978096
5,Random Forest Regressor,0.976659
3,K-Neighbors Regressor,0.965141
4,Decision Tree,0.957233
0,Linear Regression,0.913141
2,Ridge,0.91314
1,Lasso,0.913115
8,AdaBoost Regressor,0.902327


#### Hyperparameter Tunning 

In [14]:
# this function prints the tunned model's results
def evaluated_results(model,X_train,y_train,X_test,y_test):
    ytrain_pred = model.predict(X_train)
    ytest_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_r2 , model_train_rmse, model_train_mae  = evaluate_model(y_train,y_train_pred)

    model_test_r2 , model_test_rmse, model_test_mae  = evaluate_model(y_test,y_test_pred)

    # Printing results
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

In [15]:
#Tunning Catboost 

# Initializing catboost
cat_r = CatBoostRegressor(verbose=False)

# Creating the hyperparameter grid
params = {'depth': [6,8,10],
              'learning_rate': [0.01, 0.05, 0.1],
               'iterations': [30, 50, 100]}

#Instantiate RandomSearchCV object
rscv_cat = RandomizedSearchCV(cat_r , params, scoring='r2', cv =5, n_jobs=-1)

# Fit the model
rscv_cat.fit(X_train, y_train.values.flatten())

# Print the tuned parameters and score
print(rscv_cat.best_params_)
print(rscv_cat.best_score_) 

# Selecting best model
best_cat_r = rscv_cat.best_estimator_

# Evaluate Train and Test dataset
evaluated_results(best_cat_r,X_train,y_train,X_test,y_test)

{'learning_rate': 0.1, 'iterations': 100, 'depth': 10}
0.9788394371570991
Model performance for Training set
- Root Mean Squared Error: 1263.2889
- Mean Absolute Error: 853.7682
- R2 Score: 0.9021
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1256.3273
- Mean Absolute Error: 847.9956
- R2 Score: 0.9023


In [16]:
# Tunning XGBoost

# Initializing xgboost
xgb_r = XGBRegressor()

# Parameters
params = {
 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
 'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
 'min_child_weight' : [ 1, 3, 5, 7 ],
 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ],
 'n_estimators':[300,400,500,600]
}

rscv_xgb=RandomizedSearchCV(xgb_r,param_distributions=params,scoring='r2',n_jobs=-1,cv=5)
rscv_xgb.fit(X_train, y_train.values.flatten())

# Print the tuned parameters and score
print(rscv_xgb.best_params_)
print(rscv_xgb.best_score_)

# Selecting best model
best_xgb_r = rscv_xgb.best_estimator_

# Evaluate Train and Test dataset
evaluated_results(best_xgb_r,X_train,y_train,X_test,y_test)

{'n_estimators': 400, 'min_child_weight': 7, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.3, 'colsample_bytree': 0.7}
0.9783758467162823
Model performance for Training set
- Root Mean Squared Error: 1263.2889
- Mean Absolute Error: 853.7682
- R2 Score: 0.9021
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1256.3273
- Mean Absolute Error: 847.9956
- R2 Score: 0.9023


#### Final Result by Voting Regressor

In [17]:
from sklearn.ensemble import VotingRegressor

vr = VotingRegressor([('cbr_r',best_cat_r),('xgb',xgb_r)], weights=[3,1])
vr.fit(X_train, y_train.values.flatten())

evaluated_results(vr,X_train,y_train,X_test,y_test) 

Model performance for Training set
- Root Mean Squared Error: 1263.2889
- Mean Absolute Error: 853.7682
- R2 Score: 0.9021
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1256.3273
- Mean Absolute Error: 847.9956
- R2 Score: 0.9023
