In [27]:
from pycaret import *
from pycaret.datasets import get_data
dataset = get_data('diamond')

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
0,1.1,Ideal,H,SI1,VG,EX,GIA,5169
1,0.83,Ideal,H,VS1,ID,ID,AGSL,3470
2,0.85,Ideal,H,SI1,EX,EX,GIA,3183
3,0.91,Ideal,E,SI1,VG,VG,GIA,4370
4,0.83,Ideal,G,SI1,EX,EX,GIA,3171


In [28]:
data = dataset.sample(frac=0.9, random_state=786)
data_unseen = dataset.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (5400, 8)
Unseen Data For Predictions: (600, 8)


In [29]:
from pycaret.regression import *
exp_reg102 = setup(data = data, target = 'Price', session_id=123,
                  normalize = True, transformation = True, transform_target = True, 
                  combine_rare_levels = True, rare_level_threshold = 0.05,
                  remove_multicollinearity = True, multicollinearity_threshold = 0.95, 
                  bin_numeric_features = ['Carat Weight'],
                  log_experiment = True, experiment_name = 'diamond1')

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Price
2,Original Data,"(5400, 8)"
3,Missing Values,False
4,Numeric Features,1
5,Categorical Features,6
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(3779, 39)"


In [31]:
best = compare_models(exclude = ['ransac'], n_select=3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,708.8055,2492075.9705,1539.2702,0.9754,0.0749,0.0545,2.068
xgboost,Extreme Gradient Boosting,775.3886,2808499.6375,1640.2406,0.9728,0.0807,0.0589,0.526
lightgbm,Light Gradient Boosting Machine,766.0853,3116466.5642,1704.0975,0.9704,0.0799,0.0576,0.139
rf,Random Forest Regressor,850.1617,3271185.4195,1771.2208,0.9686,0.0905,0.0657,0.711
huber,Huber Regressor,940.6188,3651900.3365,1891.7109,0.964,0.0972,0.0708,0.19
ridge,Ridge Regression,952.2516,3846252.5,1934.6255,0.9624,0.0971,0.0715,0.015
lr,Linear Regression,956.0035,3984907.7,1964.0966,0.961,0.0972,0.0716,0.792
br,Bayesian Ridge,956.6502,3999159.6452,1967.8153,0.9608,0.0972,0.0716,0.02
et,Extra Trees Regressor,964.4137,4406428.3608,2061.2127,0.9569,0.1055,0.0759,0.896
dt,Decision Tree Regressor,1000.25,4685152.6779,2136.9863,0.9539,0.1082,0.0778,0.03


In [32]:
print(best)

[PowerTransformedTargetRegressor(border_count=254, loss_function='RMSE',
                                power_transformer_method='box-cox',
                                power_transformer_standardize=True,
                                random_state=123,
                                regressor=<catboost.core.CatBoostRegressor object at 0x0000021CBFE6F910>,
                                task_type='CPU', verbose=False), PowerTransformedTargetRegressor(base_score=None, booster='gbtree',
                                colsample_bylevel=None, colsample_bynode=None,
                                colsample_bytree=None, gamma=None, gpu_id=None,
                                importance_type='gain',
                                interaction_constraints=None,
                                learning_rate=None, max_delta_step=None,
                                max_depth=None, min_child_weight=None,
                                missing=nan, monotone_constraints=None,
          

In [10]:
rf = create_model('rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,685.4484,1227464.4839,1107.91,0.9865,0.0793,0.0599
1,799.8848,4710626.7515,2170.3978,0.9639,0.0775,0.0568
2,733.2349,1701776.3793,1304.5215,0.9844,0.0754,0.0582
3,748.2089,1577962.3578,1256.1697,0.9826,0.0765,0.0589
4,741.7261,3810654.7125,1952.0898,0.9525,0.0884,0.0614
5,867.6636,5364843.1268,2316.2131,0.9542,0.0893,0.0628
6,719.1019,2587278.8952,1608.5021,0.9757,0.0751,0.0554
7,722.4955,1818603.9468,1348.5562,0.977,0.0787,0.0599
8,740.612,2304721.528,1518.1309,0.9774,0.0796,0.0582
9,847.9284,4192899.6781,2047.6571,0.9601,0.0977,0.0657


In [11]:
print(rf)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=123, verbose=0, warm_start=False)


In [12]:
ada=create_model("ada")

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,4101.8809,23013830.0177,4797.2732,0.7473,0.4758,0.547
1,4251.5693,29296751.6657,5412.6474,0.7755,0.494,0.5702
2,4047.8474,22291660.1785,4721.4045,0.7955,0.5068,0.5871
3,4298.3867,23482783.6839,4845.9038,0.7409,0.5089,0.596
4,3888.5584,24461807.7242,4945.888,0.6949,0.4764,0.5461
5,4566.4889,29733914.8752,5452.8813,0.7462,0.5462,0.6598
6,4628.7271,27841092.1974,5276.4659,0.7384,0.5549,0.6676
7,4316.4317,25979752.0083,5097.0336,0.6715,0.5034,0.5858
8,3931.2163,21097072.3513,4593.1549,0.7928,0.4858,0.5513
9,4291.1097,24815566.0009,4981.5225,0.7637,0.5495,0.6592


In [13]:
print(ada)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=123)


In [15]:
cat=create_model("catboost")

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,522.132,699905.1998,836.6034,0.9923,0.067,0.0479
1,731.0635,4447249.7686,2108.8503,0.9659,0.0652,0.0482
2,664.1178,1670242.7485,1292.3787,0.9847,0.0693,0.0521
3,548.9799,811831.1788,901.0167,0.991,0.0585,0.0453
4,585.8118,1412274.0085,1188.3914,0.9824,0.0679,0.048
5,689.6413,4922421.2121,2218.653,0.958,0.069,0.0491
6,647.4816,1790748.0065,1338.1883,0.9832,0.0679,0.0496
7,636.0041,1589003.0032,1260.5566,0.9799,0.067,0.0502
8,587.1486,1122174.7963,1059.3275,0.989,0.0619,0.0474
9,696.4141,2356856.6666,1535.2057,0.9776,0.0786,0.0542


## Model Tunning

In [17]:
rf_tuned=tune_model(rf)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,816.9083,1872560.7181,1368.4154,0.9794,0.097,0.0726
1,951.091,6584277.7731,2565.9848,0.9495,0.094,0.0706
2,909.3077,3013277.609,1735.8795,0.9724,0.0954,0.0737
3,907.8589,2464008.5102,1569.7161,0.9728,0.0971,0.0733
4,906.9244,3560151.8876,1886.8365,0.9556,0.1064,0.0777
5,1081.5682,7990185.0063,2826.6915,0.9318,0.1075,0.0787
6,930.3239,3946251.3422,1986.5174,0.9629,0.0969,0.0731
7,960.7591,3651063.5159,1910.7756,0.9538,0.105,0.0795
8,1003.5076,4133159.5882,2033.0174,0.9594,0.1078,0.0804
9,998.3897,4729012.0822,2174.6292,0.955,0.1138,0.0812


In [18]:
tuned_ada=tune_model(ada)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2759.6611,16380711.7542,4047.3092,0.8202,0.2808,0.2613
1,2901.3392,22572903.572,4751.095,0.827,0.3087,0.2942
2,2791.3101,15829783.4301,3978.666,0.8548,0.3135,0.3086
3,2622.3949,14647515.3862,3827.2073,0.8384,0.2717,0.2334
4,2457.5786,14562994.4838,3816.1492,0.8184,0.2671,0.2264
5,2686.6949,19471487.9107,4412.6509,0.8338,0.3001,0.2783
6,2780.2231,17357834.7679,4166.2735,0.8369,0.3054,0.2822
7,2898.4723,18031683.4258,4246.373,0.772,0.3076,0.2929
8,2593.3611,15134670.4021,3890.3304,0.8514,0.2964,0.2719
9,2533.0852,15820915.8817,3977.5515,0.8494,0.2943,0.2714


In [19]:
tuned_cat=tune_model(cat)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1044.4416,3287701.7162,1813.2021,0.9639,0.1335,0.1013
1,1218.6921,13161100.5046,3627.8231,0.8991,0.1396,0.1028
2,1139.1048,5615851.8773,2369.7789,0.9485,0.1429,0.1114
3,957.1337,2779588.4827,1667.2098,0.9693,0.1238,0.094
4,898.6778,1886506.1039,1373.5014,0.9765,0.1337,0.1015
5,1229.7487,9810488.4385,3132.1699,0.9163,0.1404,0.1083
6,1124.8123,5863060.3837,2421.3757,0.9449,0.15,0.1105
7,1064.9233,3910843.0377,1977.5852,0.9505,0.1446,0.1101
8,1109.8291,4544738.0486,2131.8391,0.9554,0.1349,0.1012
9,1149.4256,5715170.246,2390.6422,0.9456,0.1511,0.1144


## Predicting on Hold out set

In [22]:
predict_model(tuned_cat);

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,1150.8954,5759547.6268,2399.9058,0.9475,0.1428,0.1082


## Finalizing the Model

In [23]:
final_cat = finalize_model(tuned_cat)

In [24]:
predict_model(final_cat)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,1006.2845,3968594.5795,1992.1332,0.9638,0.1215,0.0932


Unnamed: 0,Carat Weight,Cut_Fair,Cut_Good,Cut_Ideal,Cut_Signature-Ideal,Cut_Very Good,Color_D,Color_E,Color_F,Color_G,...,Polish_G,Polish_ID,Polish_VG,Symmetry_EX,Symmetry_G,Symmetry_ID,Symmetry_VG,Report_GIA,Price,Label
0,1.16,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,7059.0,6965.272138
1,1.14,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5809.0,6378.553133
2,1.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,10641.0,11658.227146
3,0.79,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,2871.0,3487.188410
4,2.23,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,21706.0,23911.071628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1616,1.22,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,12906.0,10870.240339
1617,0.83,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3235.0,3779.732768
1618,1.51,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,9058.0,9351.319830
1619,1.16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5091.0,5411.838909


## Predicting on Unseen Data set

In [25]:
unseen_predictions = predict_model(final_cat, data=data_unseen)
unseen_predictions.head()

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price,Label
0,1.53,Ideal,E,SI1,ID,ID,AGSL,12791,12000.211911
1,1.5,Fair,F,SI1,VG,VG,GIA,10450,10971.40564
2,1.01,Good,E,SI1,G,G,GIA,5161,5541.502028
3,2.51,Very Good,G,VS2,VG,VG,GIA,34361,33846.270138
4,1.01,Good,I,SI1,VG,VG,GIA,4238,4472.159416


In [26]:
from pycaret.utils import check_metric
check_metric(unseen_predictions.Price, unseen_predictions.Label, 'R2')

0.9566

## Saving the model

In [None]:
#save_model(final_cat,'Final Cat')
#saved_final_cat = load_model('Final Cat')
#new_prediction = predict_model(saved_final_cat, data=data_unseen)
#new_prediction.head()