In [1]:
import pandas as pd
import numpy as np
import random
import optuna
import sys, os
from datetime import datetime, timezone
from catboost import Pool, cv

# helpers
sys.path.append("..")
from helpers.loss_functions import *

# Reproducibility: seed both Python's and NumPy's global RNGs with the same
# value. SEED is also handed to CatBoost's cv() inside the objective.
SEED = 108
random.seed(SEED)
np.random.seed(SEED)
# Training budget
N_FOLDS = 3  # intended number of cross-validation folds
early_stop = 50  # intended early-stopping patience, in boosting rounds
iterations = 10000  # upper bound on boosting iterations per model
# data
train_path = "../../data/mushrooms/train.csv"
test_path = "../../data/mushrooms/test.csv"

In [2]:
def objective(trial):
    """Optuna objective: cross-validated Logloss of a CatBoost model.

    Loads the training CSV from ``train_path``, separates the ``class`` target
    from the features (dropping ``id``), samples a set of CatBoost
    hyper-parameters from ``trial`` and evaluates them with ``catboost.cv``.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The last value of the ``test-Logloss-mean`` column of the
        cross-validation result (lower is better).
    """
    import gc
    from sklearn.preprocessing import LabelBinarizer

    target = "class"

    # Only the training data is needed here; the test set plays no role in
    # the hyper-parameter search, so it is not read.
    train = pd.read_csv(train_path)
    X_train = train.drop(columns=[target, "id"])

    # Binarize the target labels
    y_train = LabelBinarizer().fit_transform(train[target])

    # Category handling. The helper comes from helpers.loss_functions;
    # presumably it returns the prepared frame plus the categorical column
    # names/indices -- verify against the helper.
    X_train, categorical_training_cols = handle_categorical_columns(X_train)

    # Hyper params
    param = {
        "loss_function": "Logloss",
        "iterations": iterations,
        "task_type": "GPU",
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 15),
        "depth": trial.suggest_int("depth", 1, 15),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.001, 10.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0.1, 1.0
        ),
        "random_strength": trial.suggest_float("random_strength", 0.1, 1.0),
    }

    train_pool = Pool(
        X_train,
        label=y_train,
        cat_features=categorical_training_cols,
        data_can_be_none=True,
    )
    # Fold count and early-stopping patience come from the notebook-level
    # configuration (N_FOLDS, early_stop) instead of being repeated here as
    # literals, so changing the config actually changes the search.
    scores = cv(
        train_pool,
        param,
        fold_count=N_FOLDS,
        early_stopping_rounds=early_stop,
        shuffle=True,
        stratified=True,
        seed=SEED,
        as_pandas=True,
        plot=True,
        metric_period=100,
    )
    # Release the per-trial data before the next trial starts.
    gc.collect()
    # Report the final recorded mean test Logloss as the trial value.
    return float(scores["test-Logloss-mean"].iloc[-1])

In [3]:
# Create or load a study
# Read the clock once so the date tag and the epoch timestamp describe the
# same instant (neither value is used further in this cell).
now_utc = datetime.now(timezone.utc)
today = now_utc.strftime("%Y_%m_%d")
curr_timestamp = int(now_utc.timestamp())

study_name = "catboost_study"
study = optuna.create_study(
    study_name=study_name,
    storage=f"sqlite:///{study_name}.db",  # persisted so the search can be resumed
    direction="minimize",  # the objective returns a Logloss value
    load_if_exists=True,  # reuse the stored study instead of failing on a name clash
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-08-18 23:21:16,396] Using an existing study with name 'catboost_study' instead of creating a new one.


  0%|          | 0/100 [00:00<?, ?it/s]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



Training on fold [0/3]
0:	learn: 0.6870588	test: 0.6870588	best: 0.6870588 (0)	total: 42.7ms	remaining: 7m 7s
100:	learn: 0.5057239	test: 0.5054158	best: 0.5054158 (100)	total: 3.95s	remaining: 6m 27s
200:	learn: 0.4460436	test: 0.4454964	best: 0.4454964 (200)	total: 7.86s	remaining: 6m 23s
300:	learn: 0.4140322	test: 0.4133512	best: 0.4133512 (300)	total: 11.8s	remaining: 6m 18s
400:	learn: 0.3982075	test: 0.3975056	best: 0.3975056 (400)	total: 15.7s	remaining: 6m 15s
500:	learn: 0.3879140	test: 0.3871779	best: 0.3871779 (500)	total: 19.6s	remaining: 6m 11s
600:	learn: 0.3807174	test: 0.3799344	best: 0.3799344 (600)	total: 23.5s	remaining: 6m 7s
700:	learn: 0.3762383	test: 0.3754420	best: 0.3754420 (700)	total: 27.4s	remaining: 6m 3s
800:	learn: 0.3719836	test: 0.3711595	best: 0.3711595 (800)	total: 31.3s	remaining: 5m 59s
900:	learn: 0.3679780	test: 0.3671419	best: 0.3671419 (900)	total: 35.2s	remaining: 5m 55s
1000:	learn: 0.3625746	test: 0.3617119	best: 0.3617119 (1000)	total: 39.1

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



Training on fold [0/3]
0:	learn: 0.6726521	test: 0.6726005	best: 0.6726005 (0)	total: 164ms	remaining: 27m 17s
100:	learn: 0.0826281	test: 0.0818972	best: 0.0818972 (100)	total: 17s	remaining: 27m 41s
200:	learn: 0.0514177	test: 0.0507654	best: 0.0507654 (200)	total: 33.5s	remaining: 27m 11s
300:	learn: 0.0441307	test: 0.0435914	best: 0.0435914 (300)	total: 50.1s	remaining: 26m 54s
400:	learn: 0.0422216	test: 0.0417835	best: 0.0417835 (400)	total: 1m 5s	remaining: 26m 17s
500:	learn: 0.0412391	test: 0.0408603	best: 0.0408603 (500)	total: 1m 21s	remaining: 25m 43s
600:	learn: 0.0406629	test: 0.0403341	best: 0.0403341 (600)	total: 1m 36s	remaining: 25m 11s
700:	learn: 0.0402494	test: 0.0399580	best: 0.0399580 (700)	total: 1m 51s	remaining: 24m 39s
800:	learn: 0.0398285	test: 0.0395756	best: 0.0395756 (800)	total: 2m 7s	remaining: 24m 22s
900:	learn: 0.0394934	test: 0.0392883	best: 0.0392883 (900)	total: 2m 22s	remaining: 24m 1s
1000:	learn: 0.0392669	test: 0.0390915	best: 0.0390915 (1000

KeyboardInterrupt: 