In [12]:
from functools import partial
from collections import defaultdict
from collections import namedtuple

import optuna
import joblib
import numpy as np
import pandas as pd
import lightgbm as lgb

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import log_loss


In [13]:
def define_model(trial, seed=None):
    """Build an unfitted ``LGBMClassifier`` from hyperparameters sampled by ``trial``.

    Search space:
      * ``max_depth``: integer in [1, 4]
      * ``n_estimators``: integer in [100, 500], in steps of 50
      * ``learning_rate``: float in [0.01, 0.30], in steps of 0.01

    Parameters
    ----------
    trial : optuna trial
        Object providing ``suggest_int`` / ``suggest_float``.
    seed : int or None, default None
        Value passed to the classifier as ``random_state``. ``None`` falls back
        to 42; every other value, including 0, is used as given.

    Returns
    -------
    LGBMClassifier
        A configured estimator that has not been fitted.
    """
    params = {
        # Explicit None check: `seed or 42` would silently turn seed 0 into 42.
        "random_state": 42 if seed is None else seed,
        "max_depth": trial.suggest_int("max_depth", 1, 4),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 3e-1, step=0.01),
    }
    return LGBMClassifier(**params)


In [14]:
def objective(trial, X, y, seed=None):
    """Score one trial by the log loss of its out-of-fold predictions.

    The model produced by ``define_model`` is refitted on each training part of
    a shuffled 4-fold ``StratifiedKFold`` split; column 1 of ``predict_proba``
    for the held-out rows is written into one out-of-fold vector, and the log
    loss of that vector against ``y`` is returned.

    Parameters
    ----------
    trial : optuna trial
        Forwarded to ``define_model`` to sample the hyperparameters.
    X, y : arrays indexable with integer index arrays
        Rows are selected with ``X[ids]`` / ``y[ids]``.
        NOTE(review): column 1 of ``predict_proba`` is used, so this presumably
        expects binary 0/1 labels — confirm for other datasets.
    seed : int or None, default None
        Seed for the fold shuffle and the model. ``None`` falls back to 42;
        every other value, including 0, is used as given.

    Returns
    -------
    float
        ``log_loss(y, y_oof_proba)`` over all rows (lower is better).
    """
    # Explicit None check: `seed or 42` would silently turn seed 0 into 42.
    seed = 42 if seed is None else seed
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=seed)
    y_oof_proba = np.zeros_like(y, dtype=np.float32)
    model = define_model(trial, seed)

    # NOTE: no intermediate value is reported to the trial, so every trial runs
    # all four folds and is never pruned. To enable per-fold pruning, report
    # log_loss(y_valid, y_oof_proba[valid_ids]) via trial.report(value, step)
    # after each fold and raise optuna.TrialPruned() when trial.should_prune().
    for train_ids, valid_ids in skf.split(X, y):
        model.fit(X[train_ids], y[train_ids])
        y_oof_proba[valid_ids] = model.predict_proba(X[valid_ids])[:, 1]

    return log_loss(y, y_oof_proba)


In [15]:
def logging_callback(study, frozen_trial):
    """Optuna callback that prints a line whenever the study's best value changes.

    The last reported best value is kept in the study's user attributes under
    ``"previous_best_value"`` so that only changes are printed.

    Parameters
    ----------
    study : optuna study
        Study being optimized; read for its best trial and user attributes.
    frozen_trial : optuna frozen trial
        The trial that just finished. Required by the callback signature but
        not read: the message is built from ``study.best_trial`` so that the
        printed number, value and params always belong to the actual best
        trial, even when trials finish concurrently (``n_jobs > 1``).
    """
    try:
        best_trial = study.best_trial
    except ValueError:
        # No trial has completed yet (e.g. the first ones failed or were
        # pruned); there is nothing to report and the study should carry on.
        return

    if study.user_attrs.get("previous_best_value", None) == best_trial.value:
        return

    study.set_user_attr("previous_best_value", best_trial.value)
    print(
        "Optuna Trial: {:03} - Best Value: {:.5f}\nParams: {}. ".format(
            best_trial.number,
            best_trial.value,
            best_trial.params,
        )
    )


In [16]:
def seed_study(seed, X, y, n_trials=100, n_jobs=1):
    """Run one seeded Optuna search and return its best configuration.

    A minimizing study is created with a ``TPESampler`` seeded by ``seed`` and
    a ``HyperbandPruner``, then optimized against ``objective`` on ``(X, y)``
    with ``logging_callback`` attached.

    Parameters
    ----------
    seed : int
        Seed for the sampler, the objective and the returned model.
    X, y : arrays
        Data forwarded unchanged to ``objective``.
    n_trials : int, default 100
        Number of trials passed to ``study.optimize``.
    n_jobs : int, default 1
        Parallelism passed to ``study.optimize``.

    Returns
    -------
    tuple
        ``(best_model, best_value, study_frame)``: an unfitted
        ``LGBMClassifier`` built from the best parameters, the best objective
        value rounded to 5 decimals, and the trials table (number, value,
        params, state) sorted by value.
    """
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=seed),
        pruner=optuna.pruners.HyperbandPruner(),
    )

    # Bind the data and seed so Optuna only has to supply the trial.
    trial_objective = partial(objective, X=X, y=y, seed=seed)
    study.optimize(
        trial_objective,  # type: ignore
        n_trials=n_trials,
        callbacks=[logging_callback],
        n_jobs=n_jobs,
    )

    tuned_model = LGBMClassifier(random_state=seed, **study.best_params)
    rounded_best = np.round(study.best_value, 5)

    trial_table = study.trials_dataframe(attrs=("number", "value", "params", "state"))
    ranked_trials = trial_table.sort_values(by="value")

    return tuned_model, rounded_best, ranked_trials


In [17]:
# Number of independent hyperparameter searches to run, one per seed.
n_seeds = 3

# Seed NumPy's global generator so the drawn study seeds are reproducible.
np.random.seed(42)
# The upper bound is exclusive, so each seed lies in [0, 999].
seeds = np.random.randint(low=0, high=1000, size=n_seeds)


In [18]:
# Silence Optuna's per-trial logging; progress is printed by logging_callback.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Hold out a stratified 20% test split; tuning only ever sees the training part.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# The typename matches the bound name so instances can be pickled
# (pickle looks the class up by its name in the defining module).
model_study = namedtuple("model_study", ["best_model", "best_value", "study_frame"])
# Plain dict: a defaultdict with the namedtuple as factory would raise a
# TypeError (missing constructor arguments) on any missing key.
models = {}

# One independent study per seed, stored under the zero-padded seed.
for seed in seeds:
    print("Seed:", seed)
    best_lgbm, best_value, study_frame = seed_study(seed, X_train, y_train)
    models[f"{seed:03}"] = model_study(best_lgbm, best_value, study_frame)
    print()


Seed: 102
Optuna Trial: 000 - Best Value: 0.21219
Params: {'max_depth': 3, 'n_estimators': 400, 'learning_rate': 0.09}. 
Optuna Trial: 002 - Best Value: 0.09462
Params: {'max_depth': 1, 'n_estimators': 200, 'learning_rate': 0.11}. 
Optuna Trial: 006 - Best Value: 0.09435
Params: {'max_depth': 1, 'n_estimators': 150, 'learning_rate': 0.17}. 

Seed: 435
Optuna Trial: 000 - Best Value: 0.18012
Params: {'max_depth': 4, 'n_estimators': 450, 'learning_rate': 0.09999999999999999}. 
Optuna Trial: 001 - Best Value: 0.11002
Params: {'max_depth': 1, 'n_estimators': 450, 'learning_rate': 0.15000000000000002}. 
Optuna Trial: 005 - Best Value: 0.09174
Params: {'max_depth': 1, 'n_estimators': 150, 'learning_rate': 0.19}. 
Optuna Trial: 021 - Best Value: 0.08991
Params: {'max_depth': 1, 'n_estimators': 150, 'learning_rate': 0.2}. 
Optuna Trial: 078 - Best Value: 0.08894
Params: {'max_depth': 1, 'n_estimators': 100, 'learning_rate': 0.26}. 

Seed: 860
Optuna Trial: 000 - Best Value: 0.11201
Params: {'m

In [19]:
# Seed ensemble: refit each tuned model on the full training split and sum
# column 1 of its predicted test probabilities; the sum is averaged over the
# number of models only when the log loss is computed.
y_test_proba = np.zeros_like(y_test, dtype=np.float32)
n_models = len(models)

for model in (entry.best_model for entry in models.values()):
    model.fit(X_train, y_train)
    y_test_proba += model.predict_proba(X_test)[:, 1]

test_log_loss = log_loss(y_test, y_test_proba / n_models)
print("Test Log Loss:", test_log_loss)


Test Log Loss: 0.11553627668851965
