# Optuna - Hyperparameter Tuning

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from xgboost import XGBClassifier 

In [12]:
import optuna
from optuna.storages import RDBStorage
# from optuna_dashboard import wsgi

# Optuna relational storage backed by the SQLite file db.sqlite3.
# NOTE(review): the create_study call later in this notebook passes the same
# URL as a plain string rather than this object -- confirm whether `storage`
# is still needed once the dashboard lines stay disabled.
storage = RDBStorage("sqlite:///db.sqlite3")
# Disabled: would build an optuna-dashboard WSGI application on top of `storage`.
# application = wsgi(storage)

In [13]:
# Read the KDD Cup 2004 CSV and recode the target labels from {-1, 1} to {0, 1}.
data = (
    pd.read_csv('../kdd2004.csv')
    .assign(target=lambda frame: frame['target'].replace({-1: 0, 1: 1}))
)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
0,52.0,32.69,0.3,2.5,20.0,1256.8,-0.89,0.33,11.0,-55.0,...,1595.1,-1.64,2.83,-2.0,-50.0,445.2,-0.35,0.26,0.76,0
1,58.0,33.33,0.0,16.5,9.5,608.1,0.5,0.07,20.5,-52.5,...,762.9,0.29,0.82,-3.0,-35.0,140.3,1.16,0.39,0.73,0
2,77.0,27.27,-0.91,6.0,58.5,1623.6,-1.4,0.02,-6.5,-48.0,...,1491.8,0.32,-1.29,0.0,-34.0,658.2,-0.76,0.26,0.24,0
3,41.0,27.91,-0.35,3.0,46.0,1921.6,-1.36,-0.47,-32.0,-51.5,...,2047.7,-0.98,1.53,0.0,-49.0,554.2,-0.83,0.39,0.73,0
4,50.0,28.0,-1.32,-9.0,12.0,464.8,0.88,0.19,8.0,-51.5,...,479.5,0.68,-0.59,2.0,-36.0,-6.9,2.02,0.14,-0.23,0


In [14]:
# Hold out 30% of the rows for testing, stratified on the target so both
# splits keep the same class balance.
labels = data['target']
features = data.drop(columns='target')

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
    stratify=labels,
)

X_train.shape, X_test.shape

((102025, 74), (43726, 74))

## XGBoost Classifier

In [15]:
# Baseline model with hand-picked hyperparameters.
params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    scale_pos_weight=10,
)
xgb = XGBClassifier(n_jobs=-1, random_state=42, **params)
xgb.fit(X_train, y_train)

# Positive-class probabilities (column 1 of predict_proba) for each split.
y_train_xgb, y_test_xgb = (
    xgb.predict_proba(split)[:, 1] for split in (X_train, X_test)
)

In [16]:
# Precision of the baseline model on each split. The first line scores the
# training data, so it is labelled "train" (it previously read "test",
# producing two indistinguishable "test" lines in the output).
print(f"Precision XGBoost train: {metrics.precision_score(y_train, xgb.predict(X_train)):0.2%}")
print(f"Precision XGBoost test: {metrics.precision_score(y_test, xgb.predict(X_test)):0.2%}")

Precision XGBoost test: 98.48%
Precision XGBoost test: 92.26%


In [20]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated F1 of an XGBoost classifier.

    Samples n_estimators, learning_rate, max_depth and scale_pos_weight from
    the trial, then scores the resulting model on the training split
    (X_train, y_train) taken from the notebook scope.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        The mean F1 score across the five cross-validation folds.
    """
    # Dict entries are evaluated top to bottom, so the hyperparameters are
    # requested from the trial in a fixed order.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.11),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 30),
    }

    candidate = XGBClassifier(**sampled, n_jobs=-1, random_state=42)

    fold_f1 = cross_val_score(candidate, X_train, y_train, cv=5, scoring='f1')
    return fold_f1.mean()

In [27]:
# Total number of completed trials the study should contain.
N_TRIALS = 200

# The study is persisted in SQLite under a fixed name. Without
# load_if_exists=True, re-running this cell (or Restart & Run All) raises
# DuplicatedStudyError because the name already exists in the database.
study = optuna.create_study(
    storage="sqlite:///db.sqlite3",  # Specify the storage URL here.
    study_name="KDD_Cup_2004_5",
    direction='maximize',
    sampler=optuna.samplers.RandomSampler(seed=42),
    load_if_exists=True,
)

# Only run the trials that are still missing, so a re-run against an already
# populated study does not spend time on another full batch.
# NOTE(review): a freshly seeded RandomSampler presumably restarts its random
# sequence, so resuming a partially finished study may re-sample earlier
# parameter sets -- confirm if partial resumes matter.
completed = study.get_trials(
    deepcopy=False,
    states=(optuna.trial.TrialState.COMPLETE,),
)
remaining = max(0, N_TRIALS - len(completed))
if remaining:
    study.optimize(objective, n_trials=remaining)

print(f"Best value: {study.best_value} (params: {study.best_params})")

[I 2024-03-06 07:40:46,308] A new study created in RDB with name: KDD_Cup_2004_5
[I 2024-03-06 07:41:04,280] Trial 0 finished with value: 0.8660098279951818 and parameters: {'n_estimators': 250, 'learning_rate': 0.10507143064099161, 'max_depth': 8, 'scale_pos_weight': 18.361096041714063}. Best is trial 0 with value: 0.8660098279951818.
[I 2024-03-06 07:41:08,032] Trial 1 finished with value: 0.662125689077613 and parameters: {'n_estimators': 162, 'learning_rate': 0.025599452033620268, 'max_depth': 2, 'scale_pos_weight': 26.11910822747312}. Best is trial 0 with value: 0.8660098279951818.
[I 2024-03-06 07:41:18,958] Trial 2 finished with value: 0.7222483541203408 and parameters: {'n_estimators': 341, 'learning_rate': 0.08080725777960454, 'max_depth': 2, 'scale_pos_weight': 29.127385712697837}. Best is trial 0 with value: 0.8660098279951818.
[I 2024-03-06 07:41:34,680] Trial 3 finished with value: 0.8456759388078113 and parameters: {'n_estimators': 433, 'learning_rate': 0.0312339110678276

Best value: 0.8718970271163409 (params: {'n_estimators': 476, 'learning_rate': 0.09948273504276488, 'max_depth': 7, 'scale_pos_weight': 27.734352815670388})


In [28]:
# Display the hyperparameters of the study's best trial.
study.best_params

{'n_estimators': 476,
 'learning_rate': 0.09948273504276488,
 'max_depth': 7,
 'scale_pos_weight': 27.734352815670388}

In [29]:
# Refit on the full training split using the best hyperparameters found by the study.
xgb_o = XGBClassifier(
    random_state=42,
    n_jobs=-1,
    **study.best_params,
)
xgb_o.fit(X_train, y_train)

In [30]:
# Precision on train and test for the baseline model (xgb) followed by the
# tuned model (xgb_o), with a blank line separating the two reports.
for position, clf in enumerate((xgb, xgb_o)):
    if position:
        print()
    print(f"Precision XGBoost train: {metrics.precision_score(y_train, clf.predict(X_train)):0.2%}")
    print(f"Precision XGBoost test: {metrics.precision_score(y_test, clf.predict(X_test)):0.2%}")

Precision XGBoost train: 98.48%
Precision XGBoost test: 92.26%

Precision XGBoost train: 100.00%
Precision XGBoost test: 95.40%
