In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Load the gemstone dataset from a CSV path relative to the working directory,
# then preview the first rows to check the column layout.
df = pd.read_csv('data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# Features: every column except the row identifier and the target.
# (`columns=` already names the axis, so no separate `axis` argument is needed.)
X = df.drop(columns=['id', 'price'])

# Target: the price column.
Y = df['price']

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_columns = X.select_dtypes(include='object').columns
num_columns = X.select_dtypes(exclude='object').columns

# Explicit level order for each categorical feature. OrdinalEncoder assigns
# integer codes 0..k-1 following the order given here.
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

# OrdinalEncoder matches its `categories` list to the input columns by
# position, so build that list from the actual order of `cat_columns` instead
# of assuming the frame always yields cut, color, clarity in that order.
category_orders = {
    'cut': cut_categories,
    'color': color_categories,
    'clarity': clarity_categories,
}
unmapped_columns = [col for col in cat_columns if col not in category_orders]
if unmapped_columns:
    raise ValueError(
        "No category order defined for categorical column(s): {}".format(unmapped_columns)
    )
ordinal_categories = [category_orders[col] for col in cat_columns]

# Numeric features: fill gaps with the median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

# Categorical features: fill gaps with the most frequent level, encode the
# levels as ordered integers, then standardise the resulting codes.
# NOTE(review): the step name 'ordina_encoder' looks like a typo for
# 'ordinal_encoder'; it is left unchanged because step names are part of the
# pipeline's parameter keys.
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordina_encoder',OrdinalEncoder(categories=ordinal_categories)),
        ('scaler',StandardScaler())
    ]
)

# Route each column group through its own pipeline; output columns are the
# numeric block followed by the categorical block.
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,num_columns),
    ('cat_pipeline',cat_pipeline,cat_columns)
])

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit the preprocessor on the training features only, then apply the fitted
# transformation to the test features so no test statistics leak into it.
xtrain_transformed = preprocessor.fit_transform(xtrain)
xtest_transformed = preprocessor.transform(xtest)

# Output column names are available once the preprocessor has been fitted.
feature_names = preprocessor.get_feature_names_out()

# Wrap the transformed values back into labelled DataFrames. No index is
# passed, so both frames get a fresh default index rather than the row labels
# carried by ytrain / ytest.
xtrain = pd.DataFrame(xtrain_transformed, columns=feature_names)
xtest = pd.DataFrame(xtest_transformed, columns=feature_names)

In [9]:
# Preview the first rows of the transformed training features.
xtrain.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.016395,-0.204317,0.402608,-1.202472,-1.187395,-1.194148,-0.132842,-0.936018,-0.64895
1,0.882396,0.720758,-0.118536,0.985177,0.941823,1.036109,-0.132842,-0.320002,0.017052
2,1.529711,0.350728,-1.160823,1.426308,1.394848,1.441611,0.872563,1.528047,0.017052
3,1.896523,0.073206,0.923751,1.741402,1.711965,1.70229,-0.132842,1.528047,-1.314953
4,0.450852,1.73834,1.444895,0.562052,0.52504,0.703019,-2.143651,0.912031,0.017052


In [10]:
# Preview the first rows of the transformed test features.
xtest.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.628006,0.258221,-0.118536,-0.599293,-0.580341,-0.571414,0.872563,-1.552034,-0.64895
1,2.608569,-2.146973,-0.118536,2.128516,2.201232,1.962969,-1.138246,0.296015,-1.314953
2,-1.124281,-1.221899,0.923751,-1.373523,-1.413907,-1.46931,-0.132842,-0.936018,2.015061
3,-1.016395,-0.574346,0.923751,-1.157458,-1.160213,-1.194148,-0.132842,1.528047,2.015061
4,0.860819,0.628251,-0.639679,0.949167,0.987125,1.007145,0.872563,0.912031,-0.64895


In [14]:
def evaluate_model(true, pred):
    """Score a set of predictions against the observed targets.

    Parameters
    ----------
    true
        Observed target values; passed unchanged to the metric functions.
    pred
        Predicted target values, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and the R2 score.
    """
    abs_error = mean_absolute_error(true, pred)
    root_sq_error = np.sqrt(mean_squared_error(true, pred))
    return abs_error, root_sq_error, r2_score(true, pred)

In [13]:
# Candidate regressors, keyed by the display name used in the results.
# The tree-based and boosting estimators draw random numbers while fitting,
# so their seed is fixed to make the reported scores reproducible across runs.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

In [16]:
# Collected in fit order: the display name of each model and its test-set R2.
model_list = []
r2_list = []

# Fit every candidate on the training data and report train/test metrics.
# Iterating over items() yields each name with its estimator directly, rather
# than rebuilding key/value lists and indexing into them on every pass.
for model_name, model in models.items():
    model.fit(xtrain, ytrain)

    # Predict on both splits so fit quality and generalisation can be compared.
    ytrain_pred = model.predict(xtrain)
    ytest_pred = model.predict(xtest)

    train_mae, train_rmse, train_r2 = evaluate_model(ytrain, ytrain_pred)

    test_mae, test_rmse, test_r2 = evaluate_model(ytest, ytest_pred)

    model_list.append(model_name)
    print(model_name)

    print("Model Performance for Training Set")
    print("- Mean Absolute Error: {:.4f}".format(train_mae))
    print("- Root Mean Squared Error: {:.4f}".format(train_rmse))
    print("- R2 Score: {:.4f}".format(train_r2))

    print('-'*35)

    print("Model Performance for Test Set")
    print("- Mean Absolute Error: {:.4f}".format(test_mae))
    print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
    print("- R2 Score: {:.4f}".format(test_r2))
    # Only the test-set R2 is kept for the final ranking.
    r2_list.append(test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model Performance for Training Set
- Mean Absolute Error: 677.1656
- Root Mean Squared Error: 1016.9490
- R2 Score: 0.9366
-----------------------------------
Model Performance for Test Set
- Mean Absolute Error: 671.5856
- Root Mean Squared Error: 1006.6010
- R2 Score: 0.9373


Lasso
Model Performance for Training Set
- Mean Absolute Error: 678.3145
- Root Mean Squared Error: 1017.0718
- R2 Score: 0.9366
-----------------------------------
Model Performance for Test Set
- Mean Absolute Error: 672.8635
- Root Mean Squared Error: 1006.8716
- R2 Score: 0.9373


Ridge
Model Performance for Training Set
- Mean Absolute Error: 677.1925
- Root Mean Squared Error: 1016.9491
- R2 Score: 0.9366
-----------------------------------
Model Performance for Test Set
- Mean Absolute Error: 671.6137
- Root Mean Squared Error: 1006.6062
- R2 Score: 0.9373


K-Neighbors Regressor
Model Performance for Training Set
- Mean Absolute Error: 285.8150
- Root Mean Squared Error: 545.6502
- R2 

In [17]:
# Pair each model name with its test-set R2 and rank the models best-first.
df_results = (
    pd.DataFrame(
        list(zip(model_list, r2_list)),
        columns=['Model Name', 'R2_Score'],
    )
    .sort_values(by='R2_Score', ascending=False)
)
df_results

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.977145
3,K-Neighbors Regressor,0.972114
4,Decision Tree,0.957232
0,Linear Regression,0.937298
2,Ridge,0.937297
1,Lasso,0.937264
6,AdaBoost Regressor,0.889355
