In [107]:
#https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning 
#https://www.kaggle.com/code/donkeys/exploring-hyperopt-parameter-tuning
#https://hyperopt.github.io/hyperopt/getting-started/search_spaces/
#some more cool stuff for this specific dataset https://www.kaggle.com/code/rifqihaikal/diamonds-lr-xgboost-eda

import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_percentage_error
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [108]:
xgb.set_config(verbosity=0)
config = xgb.get_config()
config

{'use_rmm': False, 'verbosity': 0}

In [109]:
df = pd.read_csv("diamonds.csv", sep=",", header=0)

X, y = df.drop('price', axis=1), df[['price']] #drop price from X, assign price to y

cats = X.select_dtypes(exclude=np.number).columns.tolist()
for col in cats:
   X[col] = X[col].astype('category')

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.33)

dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True) #train dataset
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True) #test dataset
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

In [186]:
def simulate_quniform(low, high, q, size=100):
    # Simulate the behavior of hp.quniform
    samples = np.random.uniform(low, high+q, size)
    rounded_samples = np.round(samples / q) * q
    return rounded_samples

def simulate_uniform(low, high, size=10):
    # Simulate the behavior of hp.uniform
    samples = np.random.uniform(low, high, size)
    return samples



simulated_q_values = simulate_quniform(3, 18, 1)
print(simulated_q_values)

print("------------------")
simulated_u_values = simulate_uniform(0.1, 1)
print(simulated_u_values)

[12.  7. 12. 15.  8. 12.  5.  5. 14.  4. 10. 14. 19. 15.  4. 13. 11.  9.
 11. 18.  5. 16.  5. 17.  7. 11. 15. 15. 16. 19.  5. 14.  4.  8.  8.  9.
 16. 16. 18. 13. 11.  9.  5.  4.  3. 12. 12. 17.  7. 13. 10. 17. 13. 16.
 10. 10. 12. 14.  5. 11.  8.  7.  5. 12.  4. 16.  6.  9.  6. 16.  5. 12.
  9.  4.  4.  8.  4. 13. 11.  8. 11.  9.  5. 17. 15. 15.  9. 13.  8. 15.
  4.  3. 15. 19. 14.  8. 18.  3. 19. 16.]
------------------
[0.1780184  0.32409246 0.17345999 0.35949232 0.73168595 0.57297747
 0.27417437 0.74726213 0.11491309 0.72179357]


In [188]:
#https://www.databricks.com/blog/2021/04/15/how-not-to-tune-your-model-with-hyperopt.html
#I used chatGPT and then used the functions above to validate

# hp.uniform:

# hp.uniform(label, low, high) samples a floating-point value uniformly between low and high.
# It is used for continuous hyperparameters.
# For example, hp.uniform('x', 0, 1) will generate a random floating-point number between 0 and 1.
# hp.quniform:

# hp.quniform(label, low, high, q) samples a floating-point value uniformly between low and high, and then rounds the result to the nearest 
# multiple of q.
# It is often used for discrete hyperparameters, especially when you want to sample integers.
# The rounding behavior means it can effectively generate integer values if q is set to 1.
# For example, hp.quniform('x', 0, 10, 1) will generate a value like 3.0, 4.0, etc., effectively giving integers between 0 and 10.



space={'max_depth': hp.quniform("max_depth", 1, 25, 1), 
        'colsample_bytree': hp.uniform('colsample_bytree', 0.0,1), #It is the fraction of features that can be selected during any given boosting rounds.
        'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1), 
        'reg_alpha': hp.quniform('reg_alpha', 1,10,1),
        'reg_lambda': hp.quniform('reg_lambda', 1,10,1),
        'gamma': hp.quniform ('gamma', 0,10,1),
        'eta': hp.uniform('eta', 0.1,1), #learning_rate,
        'n_estimators': hp.quniform('n_estimators', 100, 6000, 1),
        'seed': 42
    }

In [189]:
def objective(space):
    params = {"objective": "reg:squarederror", 
                "tree_method": "hist",
                "max_depth": int(space['max_depth']), #range 0, inf
                "colsample_bytree": space['colsample_bytree'], #range 0,1
                "min_child_weight": int(space['min_child_weight']), #range 0, inf
                "reg_alpha": int(space['reg_alpha']), #range 0, inf
                "reg_lambda": int(space['reg_lambda']), #range 0, inf
                "gamma": int(space['gamma']), #range 0, inf
                "eta": space['eta'], #range 0,1
                #"booster": "dart" #check this for dart https://xgboost.readthedocs.io/en/stable/parameter.html#additional-parameters-for-dart-booster-booster-dart
                } 
    n = int(space['n_estimators'])
  
    model = xgb.train(
        params=params,
        dtrain=dtrain_reg,
        num_boost_round=n,
        evals=evals,
        verbose_eval=100, # Every ten rounds
        early_stopping_rounds=50
    )

    preds = model.predict(dtest_reg)
    rmse = mean_squared_error(y_test, preds, squared=False)
    print(f"RMSE of the base model: {rmse:.3f}") #the lower the value the better

    mape = mean_absolute_percentage_error(y_test, preds)
    print(f"MAPE of the base model: {mape:.3f}") #the lower the value the better

    #predictions = [round(value) for value in preds]
    #accuracy = accuracy_score(y_test, predictions)
    #print("Accuracy: %.2f%%" % (accuracy * 100.0))

    
    accuracy = accuracy_score(y_test, preds>0.5)
    print ("SCORE:", accuracy)
    print({'loss': -accuracy, 'status': STATUS_OK })
    return {'loss': -accuracy, 'status': STATUS_OK }

In [190]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 50, #will be hundreds, maybe thousands, and determine sampling
                        trials = trials)

[0]	train-rmse:831.42557	validation-rmse:832.62637                                                                     
[51]	train-rmse:16.29540	validation-rmse:686.34204                                                                     
RMSE of the base model: 686.347                                                                                        
MAPE of the base model: 0.087                                                                                          
SCORE:                                                                                                                 
0.0                                                                                                                    
{'loss': -0.0, 'status': 'ok'}                                                                                         
[0]	train-rmse:4311.19847	validation-rmse:4230.90932                                                                   
[90]	train-rmse:185.95941	validation-rms

In [172]:
#just for funzies
for trial in trials.trials:
    print(trial['misc']['vals'])

{'colsample_bytree': [0.7000000000000001], 'eta': [0.2], 'gamma': [5.90107336873019], 'max_depth': [26.0], 'min_child_weight': [13.0], 'n_estimators': [5211.0], 'reg_alpha': [14.377958842857801], 'reg_lambda': [6.679091213160025]}
{'colsample_bytree': [0.2], 'eta': [0.1], 'gamma': [0.22559990053647727], 'max_depth': [17.0], 'min_child_weight': [18.0], 'n_estimators': [5815.0], 'reg_alpha': [10.141340650400668], 'reg_lambda': [8.419197287292292]}
{'colsample_bytree': [0.8], 'eta': [0.8], 'gamma': [2.4120929600679575], 'max_depth': [24.0], 'min_child_weight': [7.0], 'n_estimators': [3209.0], 'reg_alpha': [6.620833830082047], 'reg_lambda': [12.608307950074646]}
{'colsample_bytree': [0.8], 'eta': [0.9], 'gamma': [10.188225076717414], 'max_depth': [25.0], 'min_child_weight': [0.0], 'n_estimators': [4983.0], 'reg_alpha': [3.7938810879097544], 'reg_lambda': [8.456740860146223]}
{'colsample_bytree': [0.9], 'eta': [0.8], 'gamma': [19.413102582503846], 'max_depth': [20.0], 'min_child_weight': [2

In [191]:
#Get the best hyperparameters and run the predict again
print(best_hyperparams)
best_hypers = best_hyperparams

{'colsample_bytree': 0.9936978100324734, 'eta': 0.9501439118417301, 'gamma': 1.0, 'max_depth': 17.0, 'min_child_weight': 2.0, 'n_estimators': 2118.0, 'reg_alpha': 8.0, 'reg_lambda': 2.0}


In [198]:
params = {"objective": "reg:squarederror", 
        "tree_method": "hist",
        "max_depth": int(best_hypers["max_depth"]), 
        "colsample_bytree": best_hypers["colsample_bytree"], 
        "min_child_weight": best_hypers["min_child_weight"], 
        "reg_alpha": best_hypers["reg_alpha"], 
        "reg_lambda": best_hypers["reg_lambda"], 
        "gamma": best_hypers["gamma"], 
        "eta": best_hypers["eta"], 
        #"booster": "dart" #check this for dart https://xgboost.readthedocs.io/en/stable/parameter.html#additional-parameters-for-dart-booster-booster-dart
        }
n = int(best_hypers["n_estimators"])

model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=30, # Every ten rounds
    early_stopping_rounds=50
)

preds = model.predict(dtest_reg)
rmse = mean_squared_error(y_test, preds, squared=False)
print(f"RMSE of the base model: {rmse:.3f}") #the lower the value the better

mape = mean_absolute_percentage_error(y_test, preds)
print(f"MAPE of the base model: {mape:.3f}") #the lower the value the better

[0]	train-rmse:831.42557	validation-rmse:832.62637
[30]	train-rmse:23.42792	validation-rmse:686.24927
[51]	train-rmse:16.29540	validation-rmse:686.34204
RMSE of the base model: 686.347
MAPE of the base model: 0.087


In [199]:
model.get_score(importance_type='gain')

{'carat': 25028626.0,
 'cut': 169468.484375,
 'color': 1241895.0,
 'clarity': 2875942.25,
 'depth': 98386.359375,
 'table': 37845.75,
 'x': 85141.015625,
 'y': 7165253.5,
 'z': 290613.6875}