In [0]:
import pandas as pd
import xgboost as xgb
import mlflow
import pyspark.pandas as ps
from hyperopt.pyll import scope
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, SparkTrials

In [0]:
df = ps.read_csv("/dbfs/FileStore/dfprocessed.csv", header='infer')

In [0]:
df = df.to_pandas()

In [0]:
df.head()

Unnamed: 0,A2,A3,A8,A14,A15,A1_encoded,A4_encoded,A5_encoded,A6_encoded,A7_encoded,A9_encoded,A10_encoded,A11_encoded,A12_encoded,A13_encoded,class
0,30.83,0.0,1.25,202.0,0,0,0,0,2,0,0,1,1,0,0,1
1,58.67,4.46,3.04,43.0,560,1,0,0,1,1,0,1,4,0,0,1
2,24.5,0.5,1.5,280.0,824,1,0,0,1,1,0,0,0,0,0,1
3,27.83,1.54,3.75,100.0,3,0,0,0,2,0,0,1,6,1,0,1
4,20.17,5.625,1.71,120.0,0,0,0,0,2,0,0,0,0,0,1,1


In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A2           690 non-null    float64
 1   A3           690 non-null    float64
 2   A8           690 non-null    float64
 3   A14          690 non-null    float64
 4   A15          690 non-null    int32  
 5   A1_encoded   690 non-null    int32  
 6   A4_encoded   690 non-null    int32  
 7   A5_encoded   690 non-null    int32  
 8   A6_encoded   690 non-null    int32  
 9   A7_encoded   690 non-null    int32  
 10  A9_encoded   690 non-null    int32  
 11  A10_encoded  690 non-null    int32  
 12  A11_encoded  690 non-null    int32  
 13  A12_encoded  690 non-null    int32  
 14  A13_encoded  690 non-null    int32  
 15  class        690 non-null    int32  
dtypes: float64(4), int32(12)
memory usage: 59.3 KB


In [0]:
# Define train set size
train_size = int(0.7 * len(df))

In [0]:
# Split dataset 
train = df[:train_size]
test = df[train_size:]

In [0]:
# Convert data to the optimized data structure DMatrix
dtrain = xgb.DMatrix(train[['A2', 'A3', 'A8', 'A14', 'A15', 'A1_encoded', 'A4_encoded', 'A5_encoded', 'A6_encoded', 'A7_encoded', 'A9_encoded', 'A10_encoded', 'A11_encoded', 'A12_encoded', 'A13_encoded']], label=train["class"])
dtest = xgb.DMatrix(test[['A2', 'A3', 'A8', 'A14', 'A15', 'A1_encoded', 'A4_encoded', 'A5_encoded', 'A6_encoded', 'A7_encoded', 'A9_encoded', 'A10_encoded', 'A11_encoded', 'A12_encoded', 'A13_encoded']], label=test["class"])

In [0]:
# Define the objective function that Hyperopt will optimize.
def train_xgb(params):
    mlflow.xgboost.autolog(silent=True) 
    with mlflow.start_run(nested=True):
                             
                                 
                                   
            model = xgb.train(params=params, dtrain=dtrain, num_boost_round=500, evals=[(dtest, "test")],                                             verbose_eval=False)

            predictions = model.predict(dtest)
            
            auc_score = roc_auc_score(test['class'], predictions)
                      
            mlflow.log_metric("auc_score", auc_score)

 

    return {'status': STATUS_OK, 'loss': -auc_score}

In [0]:
# Hyperparameters search space
space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'max_depth': scope.int(hp.uniform('max_depth', 1, 50)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.loguniform('gamma', -10, 10),
    'alpha': hp.loguniform('alpha', -10, 10),
    'lambda': hp.loguniform('lambda', -10, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 42,
}

In [0]:
# Execute search using fmi
with mlflow.start_run(run_name='initial_search'):
    best_params = fmin(
      fn=train_xgb,
      space=space,
      algo=tpe.suggest,
      max_evals=20,
      #rstate=np.random.RandomState(123),
      # trials=spark_trials
    )

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]  5%|▌         | 1/20 [00:05<01:36,  5.07s/trial, best loss: -0.5] 10%|█         | 2/20 [00:09<01:29,  4.96s/trial, best loss: -0.9224105461393597] 15%|█▌        | 3/20 [00:14<01:19,  4.70s/trial, best loss: -0.9224105461393597] 20%|██        | 4/20 [00:19<01:16,  4.80s/trial, best loss: -0.9224105461393597] 25%|██▌       | 5/20 [00:23<01:09,  4.65s/trial, best loss: -0.9224105461393597] 30%|███       | 6/20 [00:27<01:02,  4.49s/trial, best loss: -0.9224105461393597] 35%|███▌      | 7/20 [00:32<00:57,  4.39s/trial, best loss: -0.9224105461393597] 40%|████      | 8/20 [00:36<00:54,  4.51s/trial, best loss: -0.9224105461393597] 45%|████▌     | 9/20 [00:42<00:52,  4.82s/trial, best loss: -0.9224105461393597] 50%|█████     | 10/20 [00:47<00:48,  4.83s/trial, best loss: -0.9224105461393597] 55%|█████▌    | 11/20 [00:51<00:43,  4.78s/trial, best loss: -0.9224105461393597] 60%|██████    | 12/20 [00:56<00:38,  4.79s/trial, best

In [0]:
best_params

Out[31]: {'alpha': 0.4197135997100693,
 'colsample_bytree': 0.5155667954438115,
 'gamma': 0.10617399074268867,
 'lambda': 0.024402513266066045,
 'learning_rate': 0.0023298729879080325,
 'max_depth': 33.155055684623356,
 'min_child_weight': 0.27770312245307754,
 'subsample': 0.522352483148095}

Parallel coordinates maxdepth, learning rate, etc
![1.PNG](attachment:1.PNG)

Regularization hyperparameters
![2.PNG](attachment:2.PNG)

Ten first runs
![3.PNG](attachment:3.PNG)

Learning rate - scatter chart
![5.PNG](attachment:5.PNG)