In [None]:
import gc
import os
import random
import sys
from datetime import datetime, timezone

import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
# data
train_path = "../../data/train.csv"
# The test frame must come from its own file: reading train.csv here would
# leave the target column inside X_test, since only "id" is dropped from it.
test_path = "../../data/test.csv"
# helpers
sys.path.append("..")
# NOTE(review): the star-import is presumably where handle_categorical_columns
# comes from (it is not defined in this notebook) -- prefer explicit names.
from helpers.loss_functions import *

# Reproducibility: seed both the stdlib and NumPy global generators.
SEED = 108
random.seed(SEED)
np.random.seed(SEED)

# Training configuration shared by the cells below.
N_FOLDS = 5
early_stop = 50  # early-stopping patience, in boosting rounds
iterations = 1000  # upper bound on boosting iterations per model

In [None]:
# Prepare data: load the CSVs, binarize the target, hold out a validation
# split, and run the project's categorical-column handling on every frame.
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
X_test = pd.read_csv(test_path)
print(f"test size: {X_test.shape}")
X_test = X_test.drop(columns=["id"])

# prepare columns
target = "class"

X_train = train.drop(columns=[target, "id"])
y_train = train[target]

# Binarize the target labels once, on the full label column, BEFORE splitting.
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
# y_val is already binarized: it was split off the transformed array above.
# It must not be passed through lb.transform() a second time -- the binarizer
# was fitted on the original class labels, so feeding it 0/1 values would
# corrupt the validation target and make the reported metric meaningless.


# Category handling
# NOTE(review): each frame is processed independently; confirm that
# handle_categorical_columns yields consistent encodings across the three.
X_train, categorical_training_cols = handle_categorical_columns(X_train)
X_test, categorical_test_cols = handle_categorical_columns(X_test)
X_val, categorical_val_cols = handle_categorical_columns(X_val)
# test_pool = Pool(
#     X_test,
#     cat_features=categorical_test_cols,
# )
gc.collect()

In [None]:
def objective(trial):
    """Optuna objective: train one CatBoost classifier and score it on the hold-out.

    Samples a hyper-parameter set from ``trial``, fits a ``CatBoostClassifier``
    on the module-level training split with early stopping against the
    validation split, and returns the Matthews correlation coefficient of the
    validation predictions.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_val``, ``y_val``,
    ``categorical_training_cols``, ``categorical_val_cols``, ``iterations``,
    ``early_stop`` and ``SEED``.

    Args:
        trial: the ``optuna.trial.Trial`` used to sample hyper-parameters.

    Returns:
        The validation MCC (higher is better).
    """
    # Hyper params
    param = {
        "loss_function": "Logloss",
        "iterations": iterations,
        "task_type": "GPU",
        # Pin CatBoost's own seed to the notebook-wide SEED so trials differ
        # only by their sampled hyper-parameters.
        "random_seed": SEED,
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 15),
        "depth": trial.suggest_int("depth", 1, 15),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.001, 10.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0.1, 1.0
        ),  # 1
        "random_strength": trial.suggest_float("random_strength", 0.1, 1.0),
    }

    train_pool = Pool(
        X_train,
        label=y_train,
        cat_features=categorical_training_cols,
    )
    val_pool = Pool(
        X_val,
        label=y_val,
        cat_features=categorical_val_cols,
    )

    model = CatBoostClassifier(**param)
    # train the model; keep the iteration that scored best on the eval set
    model.fit(
        train_pool,
        use_best_model=True,
        eval_set=val_pool,
        metric_period=100,
        # Use the shared config constant instead of a duplicated literal.
        early_stopping_rounds=early_stop,
    )
    # Thresholding at 0.5 turns the predictions into a 0/1 integer array
    # before scoring.
    y_preds = (model.predict(val_pool) > 0.5).astype(int)
    mcc = matthews_corrcoef(y_val, y_preds)
    return mcc

In [None]:
# Create or load a study
# NOTE: `today` and `curr_timestamp` are not used in this cell; they are kept
# in case later cells rely on them (e.g. for naming artifacts).
today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
curr_timestamp = int(datetime.now(timezone.utc).timestamp())
study_name = "catboost_study"
study = optuna.create_study(
    study_name=study_name,
    # Persist trials to a local SQLite file so the study can be resumed.
    storage=f"sqlite:///{study_name}.db",
    # random.seed() does not reach Optuna's sampler; seed it explicitly.
    # TPESampler is the default single-objective sampler, so the search
    # algorithm itself is unchanged.
    sampler=optuna.samplers.TPESampler(seed=SEED),
    direction="maximize",  # the objective returns MCC, higher is better
    load_if_exists=True,
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))