In [69]:
from functools import partial
from collections import defaultdict
from collections import namedtuple

import optuna
import joblib
import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor
from lightgbm import early_stopping
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error


In [70]:
def logging_callback(study, frozen_trial):
    """Print one progress line each time the study's best value changes.

    Meant to be passed in the ``callbacks`` list of ``study.optimize``. The
    last best value that was logged is kept in the study's user attribute
    ``"previous_best_value"`` so that unchanged results stay silent.

    Args:
        study: The Optuna study being optimized.
        frozen_trial: The trial that has just finished. It is not used to build
            the message: with parallel workers the trial that triggers the
            callback is not necessarily the one holding the best value, so
            number, value and params are read from ``study.best_trial``.
    """
    try:
        best_trial = study.best_trial
    except ValueError:
        # Optuna raises ValueError while no trial has completed yet (for
        # example when the first trials were pruned or failed); nothing to log.
        return

    previous_best_value = study.user_attrs.get("previous_best_value", None)
    if previous_best_value != best_trial.value:
        study.set_user_attr("previous_best_value", best_trial.value)
        print(
            "Optuna Trial: {:03} - Best Value: {:.5f} Params: {}. ".format(
                best_trial.number,
                best_trial.value,
                best_trial.params,
            )
        )


In [71]:
def define_model(trial, seed=None):
    """Build an unfitted ``LGBMRegressor`` from hyperparameters sampled by ``trial``.

    Args:
        trial: Optuna trial used to sample ``min_child_samples``,
            ``n_estimators``, ``learning_rate`` and ``reg_lambda``.
        seed: Value for the model's ``random_state``. ``None`` falls back to
            42; any integer, including 0, is used as given.

    Returns:
        An ``LGBMRegressor`` configured with the sampled parameters.
    """
    params = {
        # An explicit None check: `seed or 42` would replace a valid seed of 0.
        "random_state": 42 if seed is None else seed,
        "min_child_samples": trial.suggest_int("min_child_samples", 16, 128, step=8),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 2e-2, 2e-1, step=0.02),
        # Regularization strength spans four orders of magnitude, hence log scale.
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1e2, log=True),
    }
    return LGBMRegressor(**params)


In [72]:
def objective(trial, X, y, seed=None):
    """Score one hyperparameter configuration by its out-of-fold RMSE.

    The rows are split into 5 shuffled folds. For every fold, 20% of the
    training part is set aside as the early-stopping set, the model is fit on
    the remaining 80%, and its predictions for the fold's validation part are
    stored. The score is the RMSE between ``y`` and all stored predictions.

    Args:
        trial: Optuna trial, forwarded to ``define_model``.
        X: Feature matrix; rows are selected with integer index arrays
            (``X[ids]``), so it must support that kind of indexing.
        y: Target vector aligned with the rows of ``X``.
        seed: Seed for the fold split, the early-stopping split and the model.
            ``None`` falls back to 42; any integer, including 0, is used as
            given.

    Returns:
        The out-of-fold root mean squared error as a float.
    """
    # An explicit None check: `seed or 42` would replace a valid seed of 0.
    seed = 42 if seed is None else seed
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
    # float64 buffer so large-magnitude predictions are stored without the
    # rounding a float32 buffer would introduce.
    y_oof = np.zeros_like(y, dtype=np.float64)
    model = define_model(trial, seed)

    for train_ids, valid_ids in kfold.split(X, y):
        # Carve the early-stopping set out of the training part only, so the
        # fold's validation rows never influence when boosting stops.
        X_fit, X_stop, y_fit, y_stop = train_test_split(
            X[train_ids], y[train_ids], test_size=0.2, random_state=seed
        )

        model.fit(
            X_fit,
            y_fit,
            eval_metric="rmse",
            eval_set=[(X_stop, y_stop)],
            callbacks=[early_stopping(stopping_rounds=20, verbose=False)],
        )

        y_oof[valid_ids] = model.predict(X[valid_ids])

    # `squared=False` was deprecated in scikit-learn 1.4 and later removed;
    # taking the square root of the MSE works on every version.
    return float(np.sqrt(mean_squared_error(y, y_oof)))


In [73]:
def seed_study(seed, X, y, n_trials=100, n_jobs=1):
    """Run one seeded Optuna study and return its best configuration.

    Args:
        seed: Seed given to the TPE sampler, to ``objective`` and to the
            returned model.
        X: Feature matrix forwarded to ``objective``.
        y: Target vector forwarded to ``objective``.
        n_trials: Number of trials to run.
        n_jobs: Number of parallel workers passed to ``study.optimize``.

    Returns:
        A 3-tuple of:
          * an unfitted ``LGBMRegressor`` built from the study's best params,
          * the best objective value rounded to 5 decimals,
          * a DataFrame of all trials (number, value, params, state) sorted by
            value in ascending order.
    """
    sampler = optuna.samplers.TPESampler(seed=seed)
    # NOTE(review): the objective defined in this notebook never calls
    # trial.report / trial.should_prune, so this pruner presumably never
    # prunes anything - confirm whether it is intended.
    pruner = optuna.pruners.HyperbandPruner()
    study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)
    study.optimize(
        partial(objective, X=X, y=y, seed=seed),  # type: ignore
        n_trials=n_trials,
        callbacks=[logging_callback],
        n_jobs=n_jobs,
    )

    # best_params only holds the sampled hyperparameters, so the seed has to be
    # passed explicitly; otherwise refitting the returned model would not be
    # reproducible. None falls back to 42, the same default the objective uses.
    best_model = LGBMRegressor(
        random_state=42 if seed is None else seed,
        **study.best_params,
    )
    trials_frame = study.trials_dataframe(
        attrs=("number", "value", "params", "state"),
    ).sort_values(by="value")

    return best_model, np.round(study.best_value, 5), trials_frame


In [74]:
# Number of independent studies to run, one per drawn seed.
n_seeds = 3

# Fix NumPy's global generator first so the drawn study seeds are the same on
# every run of the notebook.
np.random.seed(42)
seeds = np.random.randint(low=0, high=1000, size=n_seeds)


In [None]:
# Silence Optuna's own per-trial logging; progress lines come from the
# callback that `seed_study` registers.
optuna.logging.set_verbosity(optuna.logging.ERROR)

X, y = fetch_california_housing(return_X_y=True)
# NOTE(review): presumably rescales the target from units of 100,000 to plain
# currency units - confirm against the dataset description.
y = y * 100_000

model_study = namedtuple("Study", ["best_model", "best_value", "study_frame"])
# A plain dict on purpose: `defaultdict(model_study)` would call the namedtuple
# with no arguments on a missing key and fail with an opaque TypeError, whereas
# a dict raises a KeyError that names the seed that was not found.
models = {}

# One study per seed, stored under the zero-padded seed, e.g. "007".
for seed in seeds:
    print("Seed:", seed)
    best_lgbm, best_value, study_frame = seed_study(seed, X, y, n_trials=50)
    models[f"{seed:03}"] = model_study(best_lgbm, best_value, study_frame)
    print()


In [76]:
# Trials of the study stored under key "102" (keys are the zero-padded seeds),
# in the order produced by `seed_study`'s sort on "value".
models["102"].study_frame

Unnamed: 0,number,value,params_learning_rate,params_min_child_samples,params_n_estimators,params_reg_lambda,state
40,40,44976.643436,0.12,48,450,30.403357,COMPLETE
49,49,44982.665414,0.14,56,500,66.128835,COMPLETE
46,46,45067.873117,0.12,80,500,69.737543,COMPLETE
35,35,45101.915091,0.1,64,350,55.405284,COMPLETE
23,23,45104.173676,0.08,56,450,24.483835,COMPLETE
14,14,45110.132916,0.08,104,500,4.845278,COMPLETE
33,33,45112.319537,0.08,88,450,25.208213,COMPLETE
29,29,45112.72712,0.08,88,400,7.060431,COMPLETE
17,17,45113.82993,0.08,16,450,15.135891,COMPLETE
31,31,45118.84489,0.08,104,450,13.030826,COMPLETE
