In [1]:
import optuna
import numpy as np
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.datasets import fetch_covtype, fetch_california_housing
from sklearn.model_selection import train_test_split, cross_validate
from perpetual import PerpetualBooster

In [2]:
# Record the interpreter version this run was executed with.
!python --version

Python 3.10.13


In [3]:
from importlib.metadata import version

# Report the installed version of each library used by this notebook.
for pkg in ("numpy", "optuna", "lightgbm", "scikit-learn", "perpetual"):
    print(f"{pkg}: {version(pkg)}")

numpy: 1.26.4
optuna: 3.5.0
lightgbm: 4.1.0
scikit-learn: 1.3.0
perpetual: 0.1.57


In [4]:
task_is_cal_housing = False  # True -> California Housing (regression); False -> Cover Types (classification).

In [5]:
# Experiment settings read by the cells below.
seed = 0   # used for the train/test split, LightGBM and the Optuna sampler; average results are reported for 5 seeds -> [0, 1, 2, 3, 4]
n_estimators = 100  # passed to LightGBM as n_estimators; results are reported for 100, 200, 300, 400, 500 n_estimators.
n_trials = 100  # number of Optuna trials passed to study.optimize

In [6]:
# Select the dataset together with the matching sklearn scorer, metric,
# LightGBM estimator class and PerpetualBooster objective for the chosen task.
if not task_is_cal_housing:
    # Cover Types: classification, evaluated with log loss.
    data, target = fetch_covtype(return_X_y=True, as_frame=True)
    scoring, metric_function, metric_name = "neg_log_loss", log_loss, "log_loss"
    LGBMBooster, objective_type = LGBMClassifier, "LogLoss"
else:
    # California Housing: regression, evaluated with mean squared error.
    data, target = fetch_california_housing(return_X_y=True, as_frame=True)
    scoring, metric_function, metric_name = "neg_mean_squared_error", mean_squared_error, "mse"
    LGBMBooster, objective_type = LGBMRegressor, "SquaredLoss"

In [7]:
# Hold out a test set (test_size=0.2248 of the rows), seeded so the split is tied to `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2248,
    random_state=seed,
)

# Show how many rows ended up on each side of the split.
for split_name, split_frame in (("X_train", X_train), ("X_test", X_test)):
    print(f"len({split_name}): {len(split_frame)}")

len(X_train): 450400
len(X_test): 130612


In [8]:
best_cv_results = None  # cross_validate output belonging to the best trial seen so far
cv_results = None  # cross_validate output of the most recently evaluated trial


def save_best_cv_results(study, trial):
    """Optuna callback that keeps the CV results of the best trial.

    Optuna calls this after each trial. When the trial that just finished is
    the study's current best trial, the module-level ``cv_results`` (written by
    the objective function) is copied by reference into ``best_cv_results``.

    Args:
        study: The running study; only ``study.best_trial.number`` is read.
        trial: The trial that just finished; only ``trial.number`` is read.
    """
    global best_cv_results
    try:
        best_number = study.best_trial.number
    except ValueError:
        # Optuna raises ValueError while no trial has completed yet (e.g. the
        # first trial was pruned or failed); there is nothing to record then.
        return
    if best_number == trial.number:
        best_cv_results = cv_results

In [9]:
def objective_function(trial):
    """Optuna objective: mean 5-fold cross-validated loss for one trial.

    Draws a LightGBM hyperparameter set from ``trial``, cross-validates
    ``LGBMBooster`` on the training split with the ``scoring`` scorer, and
    returns the mean test-fold score with its sign flipped. The complete
    ``cross_validate`` result (train scores and fitted estimators included)
    is stored in the module-level ``cv_results``.
    """
    global cv_results

    # Settings that are fixed for every trial.
    hyperparams = {'seed': seed, 'verbosity': -1, 'n_estimators': n_estimators}

    # Log-scaled floats. The suggest calls stay in their original order.
    hyperparams['learning_rate'] = trial.suggest_float('learning_rate', 0.001, 0.5, log=True)
    for name in ('min_split_gain', 'reg_alpha', 'reg_lambda'):
        hyperparams[name] = trial.suggest_float(name, 1e-6, 1.0, log=True)

    # Linearly-scaled sampling fractions.
    for name in ('colsample_bytree', 'subsample'):
        hyperparams[name] = trial.suggest_float(name, 0.2, 1.0)

    # Integer-valued tree/sampling settings as (name, low, high).
    int_ranges = (
        ('subsample_freq', 1, 10),
        ('max_depth', 3, 33),
        ('num_leaves', 2, 1024),
        ('min_child_samples', 1, 100),
    )
    for name, low, high in int_ranges:
        hyperparams[name] = trial.suggest_int(name, low, high)

    estimator = LGBMBooster(**hyperparams)
    cv_results = cross_validate(
        estimator,
        X_train,
        y_train,
        cv=5,
        scoring=scoring,
        return_train_score=True,
        return_estimator=True,
    )
    # The scorer is a negated loss, so flip the sign to get a value to minimize.
    return -np.mean(cv_results['test_score'])

In [10]:
# TPE sampler seeded with the run's `seed`.
sampler = optuna.samplers.TPESampler(seed=seed)

# The objective returns a loss, so the study minimizes it.
study = optuna.create_study(
    sampler=sampler,
    direction='minimize',
)

[I 2024-03-16 08:25:08,915] A new study created in memory with name: no-name-8d93836c-7983-4085-903b-786201844dba


In [11]:
%%time
study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])

[I 2024-03-16 08:43:17,963] Trial 0 finished with value: 0.22175740241526296 and parameters: {'learning_rate': 0.03028525153605885, 'min_split_gain': 0.019549524484259877, 'reg_alpha': 0.00413599739383989, 'reg_lambda': 0.0018590843630169633, 'colsample_bytree': 0.5389238394711238, 'subsample': 0.7167152904533249, 'subsample_freq': 5, 'max_depth': 30, 'num_leaves': 987, 'min_child_samples': 39}. Best is trial 0 with value: 0.22175740241526296.
[I 2024-03-16 08:57:24,209] Trial 1 finished with value: 0.2732769181539135 and parameters: {'learning_rate': 0.13703835270362635, 'min_split_gain': 0.0014906288366101645, 'reg_alpha': 0.002560161525002871, 'reg_lambda': 0.35775015430826956, 'colsample_bytree': 0.25682884655830956, 'subsample': 0.26970343976123257, 'subsample_freq': 1, 'max_depth': 28, 'num_leaves': 798, 'min_child_samples': 88}. Best is trial 0 with value: 0.22175740241526296.
[I 2024-03-16 09:06:31,544] Trial 2 finished with value: 0.20403118094430436 and parameters: {'learning

In [None]:
# Summarize the study: trial count, then the best trial's number, value and parameters.
best_trial = study.best_trial  # looked up once and reused below
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
print(f"  Number: {best_trial.number}")
print(f"  Value: {best_trial.value}")
print("  Params: ")
for param_name, param_value in best_trial.params.items():
    print(f"    {param_name}: {param_value}")

In [None]:
# The stored scores come from a negated-loss scorer; flip the sign before reporting.
train_losses = -1 * best_cv_results['train_score']
test_losses = -1 * best_cv_results['test_score']

print(f"CV train scores: {train_losses}")
print(f"CV train scores average : {round(np.mean(train_losses), 6)}")
print(f"CV test scores: {test_losses}")
print(f"CV test scores average : {round(np.mean(test_losses), 6)}")

In [None]:
# Fitted per-fold estimators of the best trial (kept because cross_validate ran with return_estimator=True).
models = best_cv_results["estimator"]

In [None]:
# Score each fold model on the full training split.
# log_loss needs class probabilities; the regression metric needs plain predictions.
wants_proba = metric_name == "log_loss"
for fold_idx, fold_model in enumerate(models):
    if wants_proba:
        y_pred = fold_model.predict_proba(X_train)
    else:
        y_pred = fold_model.predict(X_train)
    fold_score = round(metric_function(y_train, y_pred), 6)
    print(f"Model {fold_idx}, train {metric_name}: {fold_score}")

In [None]:
# Score each fold model on the held-out test split.
# log_loss needs class probabilities; the regression metric needs plain predictions.
wants_proba = metric_name == "log_loss"
for fold_idx, fold_model in enumerate(models):
    if wants_proba:
        y_pred = fold_model.predict_proba(X_test)
    else:
        y_pred = fold_model.predict(X_test)
    fold_score = round(metric_function(y_test, y_pred), 6)
    print(f"Model {fold_idx}, test {metric_name}: {fold_score}")

In [None]:
# Average the fold models' outputs on the training split, then score the average.
wants_proba = metric_name == "log_loss"
fold_preds = [
    m.predict_proba(X_train) if wants_proba else m.predict(X_train)
    for m in models
]
y_pred = np.mean(fold_preds, axis=0)
print(f"Train {metric_name}: {round(metric_function(y_train, y_pred), 6)}")

In [None]:
# Average the fold models' outputs on the test split, then score the average.
wants_proba = metric_name == "log_loss"
fold_preds = [
    m.predict_proba(X_test) if wants_proba else m.predict(X_test)
    for m in models
]
y_pred = np.mean(fold_preds, axis=0)
print(f"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")

In [None]:
# PerpetualBooster built with the objective selected for the task ("SquaredLoss" or "LogLoss").
model = PerpetualBooster(objective=objective_type)

In [None]:
%%time
# Single timed fit on the training split, with no hyperparameter search.
# NOTE(review): `budget=0.45` is a PerpetualBooster fit setting; confirm its exact meaning in the perpetual docs.
model.fit(X_train, y_train, budget=0.45)

In [None]:
# Score the PerpetualBooster model on the test split, using probabilities for log loss.
y_pred = model.predict_proba(X_test) if metric_name == "log_loss" else model.predict(X_test)
test_score = round(metric_function(y_test, y_pred), 6)
print(f"Test {metric_name}: {test_score}")