In [1]:
import pandas as pd
import numpy as np
import random
import optuna
import sys, os
from datetime import datetime, timezone
from catboost import Pool, cv
from sklearn.preprocessing import LabelBinarizer

# data
train_path = "../../data/train.csv"
test_path = "../../data/train.csv"
# helpers
sys.path.append("..")
from helpers.loss_functions import *

SEED = 108
random.seed(SEED)
N_FOLDS = 5
early_stop = 50
iterations = 1000

In [2]:
# Prepare data
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
X_test = pd.read_csv(test_path)
print(f"test size: {X_test.shape}")
X_test.drop(columns=["id"], inplace=True)

# prepare columns
target = "class"

X_train = train.drop(columns=[target, "id"], axis=1)
y_train = train[target]
# Binarize the target labels
lb = LabelBinarizer()

y_train = lb.fit_transform(y_train)

# X_train, X_val, y_train, y_val = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=42
# )


# Category handling
X_train, categorical_training_cols = handle_categorical_columns(X_train)
X_test, categorical_test_cols = handle_categorical_columns(X_test)
# X_val, categorical_val_cols = handle_categorical_columns(X_val)
# test_pool = Pool(
#     X_test,
#     cat_features=categorical_test_cols,
# )
gc.collect()

train size: (3116945, 22)
test size: (3116945, 22)


0

In [3]:
def objective(trial):
    import gc
    from tqdm import tqdm
    from sklearn.model_selection import StratifiedKFold
    from sklearn.preprocessing import LabelBinarizer
    from sklearn.model_selection import train_test_split

    # Hyper params
    param = {
        "loss_function": "Logloss",
        "iterations": iterations,
        "task_type": "GPU",
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 15),
        "depth": trial.suggest_int("depth", 1, 15),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.001, 10.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0.1, 1.0
        ),  # 1
        "random_strength": trial.suggest_float("random_strength", 0.1, 1.0),
    }

    gc.collect()
    skf = StratifiedKFold(n_splits=N_FOLDS)

    y_preds = []
    y_trues = []
    X = X_train.to_numpy()
    for train_index, test_index in tqdm(skf.split(X, y_train)):
        X_train_splitted, X_test_splitted = (
            X_train.loc[train_index],
            X_train.loc[test_index],
        )
        y_train_splitted, y_test_splitted = y_train[train_index], y_train[test_index]

        train_pool = Pool(
            X_train_splitted,
            label=y_train_splitted,
            cat_features=categorical_training_cols,
        )
        val_pool = Pool(
            X_test_splitted,
            label=y_test_splitted,
            cat_features=categorical_training_cols,
        )

        model = CatBoostClassifier(**param)
        # train the model
        model.fit(
            train_pool,
            use_best_model=True,
            eval_set=val_pool,
            metric_period=100,
            early_stopping_rounds=50,
        )
        y_pred = model.predict(val_pool)
        y_preds.append(y_pred)
        y_trues.append(y_test_splitted)
    # Concatenate the predictions and true labels
    y_preds_concat = np.concatenate(y_preds)
    y_trues_concat = np.concatenate(y_trues)
    mcc = matthews_corrcoef(y_trues_concat, y_preds_concat)
    return mcc

In [4]:
# Create or load a study
today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
curr_timestamp = int(datetime.now(timezone.utc).timestamp())
study_name = f"catboost_study"
study = optuna.create_study(
    study_name=study_name,
    storage=f"sqlite:///{study_name}.db",
    direction="maximize",
    load_if_exists=True,
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-08-19 15:52:21,324] Using an existing study with name 'catboost_study' instead of creating a new one.


  0%|          | 0/100 [00:00<?, ?it/s]



0:	learn: 0.6785258	test: 0.6784735	best: 0.6784735 (0)	total: 553ms	remaining: 9m 12s
100:	learn: 0.1124766	test: 0.1121549	best: 0.1121549 (100)	total: 1m 8s	remaining: 10m 5s
200:	learn: 0.0599099	test: 0.0599432	best: 0.0599432 (200)	total: 2m 14s	remaining: 8m 53s
300:	learn: 0.0479607	test: 0.0481908	best: 0.0481908 (300)	total: 3m 22s	remaining: 7m 49s
400:	learn: 0.0442769	test: 0.0445742	best: 0.0445742 (400)	total: 4m 27s	remaining: 6m 39s
500:	learn: 0.0426825	test: 0.0430229	best: 0.0430229 (500)	total: 5m 32s	remaining: 5m 30s
600:	learn: 0.0417971	test: 0.0421755	best: 0.0421755 (600)	total: 6m 36s	remaining: 4m 23s
700:	learn: 0.0411706	test: 0.0415663	best: 0.0415663 (700)	total: 7m 39s	remaining: 3m 16s
800:	learn: 0.0408096	test: 0.0412302	best: 0.0412302 (800)	total: 8m 39s	remaining: 2m 8s
900:	learn: 0.0405293	test: 0.0409675	best: 0.0409675 (900)	total: 9m 35s	remaining: 1m 3s
999:	learn: 0.0403050	test: 0.0407674	best: 0.0407674 (998)	total: 10m 26s	remaining: 0u



0:	learn: 0.6784737	test: 0.6785487	best: 0.6785487 (0)	total: 551ms	remaining: 9m 10s
100:	learn: 0.1148892	test: 0.1147686	best: 0.1147686 (100)	total: 1m 7s	remaining: 10m 4s
200:	learn: 0.0620380	test: 0.0620527	best: 0.0620527 (200)	total: 2m 11s	remaining: 8m 42s
300:	learn: 0.0492553	test: 0.0493512	best: 0.0493512 (300)	total: 3m 17s	remaining: 7m 37s
400:	learn: 0.0446202	test: 0.0447768	best: 0.0447768 (400)	total: 4m 24s	remaining: 6m 35s
500:	learn: 0.0426983	test: 0.0428869	best: 0.0428869 (500)	total: 5m 29s	remaining: 5m 28s
600:	learn: 0.0415437	test: 0.0417684	best: 0.0417684 (600)	total: 6m 33s	remaining: 4m 21s
700:	learn: 0.0410618	test: 0.0413011	best: 0.0413011 (700)	total: 7m 33s	remaining: 3m 13s
800:	learn: 0.0406427	test: 0.0409108	best: 0.0409108 (800)	total: 8m 35s	remaining: 2m 8s
900:	learn: 0.0402613	test: 0.0405476	best: 0.0405476 (899)	total: 9m 40s	remaining: 1m 3s
999:	learn: 0.0399647	test: 0.0402701	best: 0.0402701 (999)	total: 10m 40s	remaining: 0u



0:	learn: 0.6786203	test: 0.6785998	best: 0.6785998 (0)	total: 548ms	remaining: 9m 7s
100:	learn: 0.1098603	test: 0.1095368	best: 0.1095368 (100)	total: 1m 8s	remaining: 10m 12s
200:	learn: 0.0610737	test: 0.0607945	best: 0.0607945 (200)	total: 2m 13s	remaining: 8m 49s
300:	learn: 0.0482550	test: 0.0480239	best: 0.0480239 (300)	total: 3m 20s	remaining: 7m 46s
400:	learn: 0.0443782	test: 0.0442557	best: 0.0442557 (400)	total: 4m 29s	remaining: 6m 42s
500:	learn: 0.0428271	test: 0.0427465	best: 0.0427465 (500)	total: 5m 30s	remaining: 5m 29s
