In [1]:
# !pip install --user optuna xgboost

## Constants

In [2]:
import sys, os
import pandas as pd
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
import warnings
import xgboost as xgb
import joblib as jl
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import matthews_corrcoef
from mlflow.models import infer_signature
import mlflow

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
warnings.filterwarnings("ignore")

# data
is_dshub = os.getenv("NB_USER", False)
if is_dshub:
    train_path = "../data/train.csv"
    test_path = "../data/test.csv"
    cache_path = "../data/cache"
else:
    train_path = "../data/mushrooms/train.csv"
    test_path = "../data/mushrooms/test.csv"
    cache_path = "../data/mushrooms/cache"
# model
is_tunning = False
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
    print(f"device: {device}")
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")
    device = "cpu"
goal = "binary:logistic"


def mcc_metric_v2(preds, dtrain):
    labels = dtrain.get_label()
    preds = (preds > 0.5).astype(int)
    return "MCC", matthews_corrcoef(labels, preds)


# custom metric
objective_dict = {
    "binary:logistic": {
        "metric": {
            "is_custom": True,
            "name": "MCC",
            "fval": mcc_metric_v2,
        },
        "direction": "maximize",
    }
}

# objective_dict = {
#     "binary:logistic": {
#         "metric": {
#             "is_custom": False,
#             "name": "logloss",
#             "fval": None,
#         },
#         "direction": "minimize",
#     }
# }

metric = objective_dict[goal]["metric"]["name"]
is_custom_metric = objective_dict[goal]["metric"]["is_custom"]
fval = objective_dict[goal]["metric"]["fval"]
direction = objective_dict[goal]["direction"]
best_params = {
    "device": device,
    "verbosity": 0,
}

device: cuda


In [3]:
best_params

{'device': 'cuda', 'verbosity': 0}

## Prepare data

In [4]:
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
test = pd.read_csv(test_path)
print(f"test size: {test.shape}")

train size: (3116945, 22)
test size: (2077964, 21)


In [5]:
target = "class"

categorical_cols = (
    train.drop(columns=target).select_dtypes(include="object").columns.to_list()
)
for c in categorical_cols:
    train[c] = train[c].astype("category")
    test[c] = test[c].astype("category")
numerical_cols = (
    train.drop(columns="id").select_dtypes(include="number").columns.to_list()
)

In [6]:
# X_train, X_val, y_train, y_val = train_test_split(
#     train.drop(columns=target),
#     train[target],
#     test_size=0.2,
#     random_state=42,
#     stratify=train[target],
# )
# X_test = test

X_train = train.drop(columns=target)
y_train = train[target]
gc.collect()

64

## Data preprocessing

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Create the numerical and categorical pipelines
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        # ("scaler", StandardScaler()),
        # ("minmax", MinMaxScaler()),
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Create the full pipeline with the XGBoost model
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Preprocess the data
X_train_transformed = preprocessor.fit_transform(X_train)
# X_test_transformed = preprocessor.transform(X_test)
# X_val_transformed = preprocessor.transform(X_val)
# Binarize the target labels
lb = LabelBinarizer()

y_train_binarized = lb.fit_transform(y_train)
# y_val_binarized = lb.transform(y_val)

# prepare data for training
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
# dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)
# dtest = xgb.DMatrix(X_test_transformed)

gc.collect()

22

## Train best model

In [11]:
# Use tunned params
tunned_params = {
    "tree_method": "hist",
    "eta": 0.0696294726051571,
    "max_depth": 0,
    "min_child_weight": 1,
    "gamma": 0.044230646284796976,
    "subsample": 0.9405269471473167,
    "colsample_bytree": 0.2999355523666192,
    "lambda": 0.9746051811186938,
    "alpha": 4.210861941737071,
}

best_params.update(tunned_params)
best_params

{'device': 'cuda',
 'verbosity': 0,
 'tree_method': 'hist',
 'eta': 0.0696294726051571,
 'max_depth': 0,
 'min_child_weight': 0,
 'gamma': 0.044230646284796976,
 'subsample': 0.9405269471473167,
 'colsample_bytree': 0.2999355523666192,
 'lambda': 0.9746051811186938,
 'alpha': 4.210861941737071}

In [12]:
from sklearn.model_selection import KFold

print("Training best model...")
evals_best_result = {}

cv_results = xgb.cv(
    best_params,
    dtrain,
    num_boost_round=1000,
    nfold=5,
    shuffle=True,
    seed=42,
    feval=fval,  # Custom evaluation metric
    maximize=True,
    as_pandas=True,
    callbacks=[
        xgb.callback.EvaluationMonitor(show_stdv=True),
        xgb.callback.EarlyStopping(50),
    ],
)
gc.collect()
# evals_result["eval"][metric][-1]
# Use the best score for the final iteration
score = cv_results["test-MCC-mean"].values[-1]

Training best model...
[0]	train-rmse:0.48603+0.00341	train-MCC:0.00000+0.00000	test-rmse:0.48604+0.00337	test-MCC:0.00000+0.00000
[1]	train-rmse:0.46426+0.00324	train-MCC:0.47831+0.07017	test-rmse:0.46428+0.00318	test-MCC:0.47787+0.06987
[2]	train-rmse:0.43547+0.00303	train-MCC:0.83182+0.01426	test-rmse:0.43559+0.00296	test-MCC:0.83050+0.01318
[3]	train-rmse:0.41315+0.00303	train-MCC:0.92542+0.00318	test-rmse:0.41331+0.00295	test-MCC:0.92397+0.00284
[4]	train-rmse:0.39522+0.00287	train-MCC:0.93983+0.00180	test-rmse:0.39540+0.00279	test-MCC:0.93854+0.00163
[5]	train-rmse:0.37714+0.00268	train-MCC:0.95236+0.00084	test-rmse:0.37736+0.00260	test-MCC:0.95114+0.00066
[6]	train-rmse:0.35853+0.00265	train-MCC:0.96409+0.00049	test-rmse:0.35879+0.00256	test-MCC:0.96302+0.00049
[7]	train-rmse:0.33876+0.00249	train-MCC:0.96944+0.00022	test-rmse:0.33907+0.00240	test-MCC:0.96842+0.00021
[8]	train-rmse:0.31907+0.00231	train-MCC:0.97525+0.00025	test-rmse:0.31946+0.00221	test-MCC:0.97415+0.00025
[9]	t

In [13]:
score

0.0