# Optuna - Hyperparameter Tuning

In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from xgboost import XGBClassifier 

In [4]:
import optuna
from optuna.storages import RDBStorage
# from optuna_dashboard import wsgi

storage = RDBStorage("sqlite:///db.sqlite3")
# application = wsgi(storage)

In [5]:
def performance(X, y, model, title=None):

    # Predict probabilities on the test set
    y_probs = model.predict_proba(X)[:, 1]
    y_preds = model.predict(X)

    # Calculate evaluation metrics
    roc_auc = metrics.roc_auc_score(y, y_probs)
    precision = metrics.precision_score(y, y_preds)
    recall = metrics.recall_score(y, y_preds)
    f1 = metrics.f1_score(y, y_preds)
    aupr = metrics.average_precision_score(y, y_probs)

    if title is None:
        title = 'Performance metrics:'
    else:
        title = f'Performance metrics: {title}'
    print(title)
    print("ROC AUC:".ljust(10) + f"{roc_auc:.2%}".rjust(8))
    print("Precision:".ljust(10) + f"{precision:.2%}".rjust(8))
    print("Recall:".ljust(10) + f"{recall:.2%}".rjust(8))
    print("F1:".ljust(10) + f"{f1:.2%}".rjust(8))
    print("AUPRC:".ljust(10) + f"{aupr:.2%}".rjust(8))
    print()

In [6]:
# load data
data = pd.read_csv('../kdd2004.csv')
data['target'] = data['target'].replace({-1:0, 1:1})

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
0,52.0,32.69,0.3,2.5,20.0,1256.8,-0.89,0.33,11.0,-55.0,...,1595.1,-1.64,2.83,-2.0,-50.0,445.2,-0.35,0.26,0.76,0
1,58.0,33.33,0.0,16.5,9.5,608.1,0.5,0.07,20.5,-52.5,...,762.9,0.29,0.82,-3.0,-35.0,140.3,1.16,0.39,0.73,0
2,77.0,27.27,-0.91,6.0,58.5,1623.6,-1.4,0.02,-6.5,-48.0,...,1491.8,0.32,-1.29,0.0,-34.0,658.2,-0.76,0.26,0.24,0
3,41.0,27.91,-0.35,3.0,46.0,1921.6,-1.36,-0.47,-32.0,-51.5,...,2047.7,-0.98,1.53,0.0,-49.0,554.2,-0.83,0.39,0.73,0
4,50.0,28.0,-1.32,-9.0,12.0,464.8,0.88,0.19,8.0,-51.5,...,479.5,0.68,-0.59,2.0,-36.0,-6.9,2.02,0.14,-0.23,0


In [7]:
# Test if dataset is imbalanced 
print(f"Number of records: {data.shape[0]:,.0f}")
data['target'].value_counts(normalize=True, dropna=False)

Number of records: 145,751


0    0.991108
1    0.008892
Name: target, dtype: float64

In [8]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),  # drop the target
    data['target'],  # just the target
    test_size=0.3,
    stratify=data['target'],
    random_state=0)

X_train.shape, X_test.shape

((102025, 74), (43726, 74))

## XGBoost Classifier

In [9]:
params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'scale_pos_weight': 10
}
xgb = XGBClassifier(**params, n_jobs=-1, random_state=42)

xgb.fit(X_train, y_train)

y_train_xgb = xgb.predict_proba(X_train)[:,1]
y_test_xgb = xgb.predict_proba(X_test)[:,1]

In [10]:
print(f"Precision XGBoost test: {metrics.precision_score(y_train, xgb.predict(X_train)):0.2%}") 
print(f"Precision XGBoost test: {metrics.precision_score(y_test, xgb.predict(X_test)):0.2%}") 

Precision XGBoost test: 98.48%
Precision XGBoost test: 92.26%


In [9]:
def objective(trial):

    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.11)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    scale_pos_weight = trial.suggest_float('scale_pos_weight', 1, 30)

    model = XGBClassifier(n_estimators=n_estimators,
                          learning_rate=learning_rate,
                          max_depth=max_depth,
                          scale_pos_weight=scale_pos_weight,
                          n_jobs=-1,
                          random_state=42)

    # Calculate F1 score using cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    f1_mean = scores.mean()  # Take the mean F1 score

    return f1_mean

In [16]:
# weight will be used as a hyperparameter to offset the class imbalance
weight = int(y_train[y_train==0].shape[0] / y_train[y_train==1].shape[0])
weight

111

In [17]:

def objective(trial):
    params = {
        'n_estimators' : trial.suggest_int('n_estimators', 100, 500),
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.11),
        'max_depth' : trial.suggest_int('max_depth', 2, 10),
        'scale_pos_weight' : trial.suggest_float('scale_pos_weight', 1, weight)
    }

    model = XGBClassifier(**params,n_jobs=-1, random_state=42)

    # Calculate F1 score using cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    f1_mean = scores.mean()  # Take the mean F1 score

    return f1_mean

In [18]:
study = optuna.create_study(
    storage="sqlite:///db.sqlite3",  # Specify the storage URL here.
    study_name="KDD_Cup_2004_5",
    direction='maximize',
    sampler=optuna.samplers.RandomSampler(seed=42),
    load_if_exists=True
)
study.optimize(objective, n_trials=200)
print(f"Best value: {study.best_value} (params: {study.best_params})")

[I 2024-03-06 18:23:30,473] Using an existing study with name 'KDD_Cup_2004_5' instead of creating a new one.


[I 2024-03-06 18:23:47,245] Trial 204 finished with value: 0.8635816962074104 and parameters: {'n_estimators': 250, 'learning_rate': 0.10507143064099161, 'max_depth': 8, 'scale_pos_weight': 66.85243326167402}. Best is trial 13 with value: 0.8718970271163409.
[I 2024-03-06 18:23:52,512] Trial 205 finished with value: 0.4073961519601294 and parameters: {'n_estimators': 162, 'learning_rate': 0.025599452033620268, 'max_depth': 2, 'scale_pos_weight': 96.27937603524288}. Best is trial 13 with value: 0.8718970271163409.
[I 2024-03-06 18:24:02,978] Trial 206 finished with value: 0.5727092105310435 and parameters: {'n_estimators': 341, 'learning_rate': 0.08080725777960454, 'max_depth': 2, 'scale_pos_weight': 107.69008373781938}. Best is trial 13 with value: 0.8718970271163409.
[I 2024-03-06 18:24:17,964] Trial 207 finished with value: 0.7787808963953322 and parameters: {'n_estimators': 433, 'learning_rate': 0.031233911067827615, 'max_depth': 3, 'scale_pos_weight': 21.17449608387772}. Best is tr

Best value: 0.8718970271163409 (params: {'n_estimators': 476, 'learning_rate': 0.09948273504276488, 'max_depth': 7, 'scale_pos_weight': 27.734352815670388})


In [20]:
study.best_params

{'n_estimators': 476,
 'learning_rate': 0.09948273504276488,
 'max_depth': 7,
 'scale_pos_weight': 27.734352815670388}

In [21]:
xgb_o = XGBClassifier(**study.best_params, n_jobs=-1, random_state=42)
xgb_o.fit(X_train, y_train)

In [22]:
# Load study from the SQLite database
study = optuna.load_study(
    study_name="KDD_Cup_2004_5",
    storage="sqlite:///db.sqlite3"  # Specify the same storage URL here.
)

# Access the best parameters
best_params = study.best_params

print(f"Best parameters: {best_params}")

Best parameters: {'n_estimators': 476, 'learning_rate': 0.09948273504276488, 'max_depth': 7, 'scale_pos_weight': 27.734352815670388}


In [23]:
xgb_o = XGBClassifier(**best_params, n_jobs=-1, random_state=42)
xgb_o.fit(X_train, y_train)

performance(X_train, y_train, xgb_o, title="X_train")
performance(X_test, y_test, xgb_o, title="X_test")

Performance metrics: X_train
ROC AUC:   100.00%
Precision: 100.00%
Recall:    100.00%
F1:        100.00%
AUPRC:     100.00%

Performance metrics: X_test
ROC AUC:    99.31%
Precision:  95.40%
Recall:     79.95%
F1:         86.99%
AUPRC:      90.40%



In [24]:
performance(X_train, y_train, xgb, title="X_train")
performance(X_test, y_test, xgb, title="X_test")

Performance metrics: X_train
ROC AUC:   100.00%
Precision:  98.48%
Recall:    100.00%
F1:         99.23%
AUPRC:      99.99%

Performance metrics: X_test
ROC AUC:    99.29%
Precision:  92.26%
Recall:     79.69%
F1:         85.52%
AUPRC:      89.39%



In [25]:
# F1 Score:
2 * 0.9543 * 0.8046 / (0.9543 + 0.8046)

0.8730795156063448