In [1]:
import random
import sys, os
from datetime import datetime, timezone

import numpy as np
import optuna
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier
from catboost import Pool, cv
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# data
train_path = "../../data/train.csv"
test_path = "../../data/test.csv"
# helpers
# Guarded so that re-running this cell does not keep growing sys.path.
if ".." not in sys.path:
    sys.path.append("..")
from helpers.loss_functions import *

SEED = 108
random.seed(SEED)
N_FOLDS = 5
early_stop = 50
iterations = 500

# Prepare data
# Lazily scan the fold-annotated training set; nothing is read until .collect().
train = pl.scan_parquet("../train_fold.parquet")
# Every string-typed column is treated as a categorical feature. The row id and
# the target are excluded by name rather than by position (previously
# `pop(0)`), so a change in the parquet column order cannot silently remove a
# real feature or leave a non-feature column in the list.
categorical_cols = [
    col
    for col in train.select(pl.col(pl.Utf8)).columns
    if col not in ("id", "class")
]
categorical_cols
# X_test = pd.read_csv(test_path)
# print(f"test size: {X_test.shape}")
# X_test.drop(columns=["id"], inplace=True)

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [2]:


# # prepare columns
# target = "class"

# X_train = train.drop(columns=[target, "id"], axis=1)
# y_train = train[target]
# # Binarize the target labels
# lb = LabelBinarizer()

# y_train = lb.fit_transform(y_train)

# X_train, X_val, y_train, y_val = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=SEED, stratify=y_train
# )
# y_val = lb.transform(y_val)


# # Category handling
# X_train, categorical_training_cols = handle_categorical_columns(X_train)
# # X_test, categorical_test_cols = handle_categorical_columns(X_test)
# X_val, categorical_val_cols = handle_categorical_columns(X_val)
# train_pool = Pool(
#     X_train,
#     label=y_train,
#     cat_features=categorical_training_cols,
# )
# val_pool = Pool(
#     X_val,
#     label=y_val,
#     cat_features=categorical_val_cols,
# )
# # test_pool = Pool(
# #     X_test,
# #     cat_features=categorical_test_cols,
# # )
# gc.collect()

In [3]:
# skf = StratifiedKFold(n_splits=N_FOLDS)

# y_preds = []
# y_trues = []
# # X = X_train.to_numpy()
# for i, (train_index, test_index) in enumerate(
#     skf.split(train, train["class"], groups=train["id"])
# ):
#     train.loc[test_index, ["fold"]] = i
# train.head()

In [4]:
# train.to_parquet("train_fold.parquet")

In [5]:
# import gc
# from tqdm import tqdm
# from sklearn.model_selection import StratifiedKFold
# from sklearn.preprocessing import LabelBinarizer
# from sklearn.model_selection import train_test_split

# # Hyper params
# param = {
#     "loss_function": "Logloss",
#     "iterations": iterations,
#     "task_type": "GPU",
#     # "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 15),
#     # "depth": trial.suggest_int("depth", 1, 15),
#     # "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.001, 10.0),
#     # "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
#     # "bagging_temperature": trial.suggest_float(
#     #     "bagging_temperature", 0.1, 1.0
#     # ),  # 1
#     # "random_strength": trial.suggest_float("random_strength", 0.1, 1.0),
# }

# model = CatBoostClassifier(**param)
# # train the model
# model.fit(
#     train_pool,
#     use_best_model=True,
#     eval_set=val_pool,
#     # metric_period=100,
#     early_stopping_rounds=early_stop,
# )
# model.get_evals_result()["validation"]["Logloss"][-1]

In [6]:
# X_train_m4n = (
#     train.filter(pl.col("fold") != 1)
#     .drop(["id","class"])
#     .fill_null("other")
#     .collect()
#     .to_pandas()
# )
# y_train_m4n = (
#     train.filter(pl.col("fold") != 1).select(pl.col("class")).collect().to_numpy()
# )

# X_test_m4n = (
#     train.filter(pl.col("fold") == 1)
#     .drop(["id","class"])
#     .fill_null("other")
#     .collect()
#     .to_pandas()
# )
# y_test_m4n = (
#     train.filter(pl.col("fold") == 1).select(pl.col("class")).collect().to_numpy()
# )


In [7]:
# train_pool = Pool(
#     X_train_m4n,
#     label=y_train_m4n,
#     cat_features=categorical_cols,
# )

In [8]:
def objective(trial):
    """Optuna objective: cross-validated MCC of a CatBoost classifier.

    For each of the ``N_FOLDS`` folds stored in the ``fold`` column of the lazy
    ``train`` frame, a ``CatBoostClassifier`` is fitted on the remaining folds
    (with early stopping on the held-out fold) and used to predict the held-out
    fold. The predictions of all folds are concatenated and scored once.

    Args:
        trial: Optuna trial used to sample the CatBoost hyper-parameters.

    Returns:
        The value of ``matthews_corrcoef`` computed on the concatenated
        held-out labels and predictions (the study maximizes it).
    """
    # Hyper params
    param = {
        "loss_function": "Logloss",
        "iterations": iterations,
        "task_type": "GPU",
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 15),
        "depth": trial.suggest_int("depth", 1, 15),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.001, 10.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0.1, 1.0
        ),  # 1
        "random_strength": trial.suggest_float("random_strength", 0.1, 1.0),
    }

    # Columns that must not reach the model: the row id, the target, and the
    # fold assignment itself. `fold` used to be left in the feature matrix,
    # where it is a bookkeeping artifact that does not exist at inference time.
    # NOTE(review): trials already stored in the study were scored with `fold`
    # as a feature, so their values are not strictly comparable to new ones.
    non_feature_cols = ["id", "class", "fold"]

    def collect_split(fold_mask):
        """Materialize the rows selected by `fold_mask` as (features, labels)."""
        # One collect per split; features and labels are cut from the same frame.
        part = train.filter(fold_mask).collect()
        features = part.drop(non_feature_cols).fill_null("other").to_pandas()
        # 1-D label vector taken before any null filling, as before.
        labels = part["class"].to_numpy()
        return features, labels

    y_preds = []
    y_trues = []
    for idx in range(N_FOLDS):
        X_train, y_train = collect_split(pl.col("fold") != idx)
        X_test, y_test = collect_split(pl.col("fold") == idx)

        train_pool = Pool(
            X_train,
            label=y_train,
            cat_features=categorical_cols,
        )
        val_pool = Pool(
            X_test,
            label=y_test,
            cat_features=categorical_cols,
        )

        model = CatBoostClassifier(**param)
        # train the model; the held-out fold doubles as the early-stopping set
        model.fit(
            train_pool,
            use_best_model=True,
            eval_set=val_pool,
            metric_period=50,
            early_stopping_rounds=early_stop,
        )
        y_preds.append(model.predict(val_pool))
        y_trues.append(y_test)

    # Concatenate the predictions and true labels and score them in one go
    y_preds_concat = np.concatenate(y_preds)
    y_trues_concat = np.concatenate(y_trues)
    mcc = matthews_corrcoef(y_trues_concat, y_preds_concat)
    return mcc

In [9]:
# Open the persisted Optuna study (it is created on first use) and run the search.
today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
curr_timestamp = int(datetime.now(timezone.utc).timestamp())
study_name = "catboost_study"
storage_url = f"sqlite:///{study_name}.db"
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_url,
    load_if_exists=True,
)
study.optimize(objective, show_progress_bar=True, n_trials=100)

print(f"Number of finished trials: {len(study.trials)}")

# Summarize the best trial found so far: its score and its hyper-parameters.
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-08-20 06:33:47,180] Using an existing study with name 'catboost_study' instead of creating a new one.


  0%|          | 0/100 [00:00<?, ?it/s]

0:	learn: 0.5829293	test: 0.5821039	best: 0.5821039 (0)	total: 1.7s	remaining: 14m 7s
50:	learn: 0.0425985	test: 0.0434707	best: 0.0434707 (50)	total: 1m 29s	remaining: 13m 7s
100:	learn: 0.0374033	test: 0.0388704	best: 0.0388704 (100)	total: 2m 50s	remaining: 11m 14s
150:	learn: 0.0362581	test: 0.0380688	best: 0.0380688 (150)	total: 4m 4s	remaining: 9m 25s
200:	learn: 0.0356491	test: 0.0376938	best: 0.0376938 (200)	total: 5m 19s	remaining: 7m 55s
250:	learn: 0.0351605	test: 0.0374733	best: 0.0374733 (250)	total: 6m 34s	remaining: 6m 31s
300:	learn: 0.0347419	test: 0.0373064	best: 0.0373064 (300)	total: 7m 55s	remaining: 5m 14s
350:	learn: 0.0344226	test: 0.0371893	best: 0.0371882 (347)	total: 9m 12s	remaining: 3m 54s
400:	learn: 0.0341250	test: 0.0371008	best: 0.0371008 (400)	total: 10m 33s	remaining: 2m 36s
