## Constants

In [1]:
import sys, os
import pandas as pd
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
import warnings
import xgboost as xgb
import joblib as jl
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import matthews_corrcoef
from mlflow.models import infer_signature
import mlflow

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
warnings.filterwarnings("ignore")

# data
train_path = "../data/mushrooms/train.csv"
test_path = "../data/mushrooms/test.csv"
cache_path = "../data/mushrooms/cache"
# model
is_tunning = False
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
    print(f"device: {device}")
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")
    device = "cpu"
goal = "binary:logistic"


def mcc_metric_v2(preds, dtrain):
    labels = dtrain.get_label()
    preds = (preds > 0.5).astype(int)
    return "MCC", matthews_corrcoef(labels, preds)


# custom metric
objective_dict = {
    "binary:logistic": {
        "metric": {
            "is_custom": True,
            "name": "MCC",
            "fval": mcc_metric_v2,
        },
        "direction": "maximize",
    }
}

# objective_dict = {
#     "binary:logistic": {
#         "metric": {
#             "is_custom": False,
#             "name": "logloss",
#             "fval": None,
#         },
#         "direction": "minimize",
#     }
# }
metric = objective_dict[goal]["metric"]["name"]
is_custom_metric = objective_dict[goal]["metric"]["is_custom"]
fval = objective_dict[goal]["metric"]["fval"]
direction = objective_dict[goal]["direction"]
best_params = {
    "device": device,
    "verbosity": 0,
}

No Nvidia GPU in system!


## Prepare data

In [2]:
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
test = pd.read_csv(test_path)
print(f"test size: {test.shape}")

train size: (3116945, 22)
test size: (2077964, 21)


In [3]:
target = "class"

categorical_cols = (
    train.drop(columns=target).select_dtypes(include="object").columns.to_list()
)
for c in categorical_cols:
    train[c] = train[c].astype("category")
    test[c] = test[c].astype("category")
numerical_cols = (
    train.drop(columns="id").select_dtypes(include="number").columns.to_list()
)

In [4]:
# X_train, X_val, y_train, y_val = train_test_split(
#     train.drop(columns=target),
#     train[target],
#     test_size=0.2,
#     random_state=42,
#     stratify=train[target],
# )
# X_test = test

X_train = train.drop(columns=target)
y_train = train[target]
gc.collect()

103

## Data preprocessing

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Create the numerical and categorical pipelines
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        # ("scaler", StandardScaler()),
        # ("minmax", MinMaxScaler()),
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Create the full pipeline with the XGBoost model
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Preprocess the data
X_train_transformed = preprocessor.fit_transform(X_train)
# X_test_transformed = preprocessor.transform(X_test)
# X_val_transformed = preprocessor.transform(X_val)
# Binarize the target labels
lb = LabelBinarizer()

y_train_binarized = lb.fit_transform(y_train)
# y_val_binarized = lb.transform(y_val)

# prepare data for training
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
# dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)
# dtest = xgb.DMatrix(X_test_transformed)

gc.collect()

22

## Train best model

In [6]:
from sklearn.model_selection import KFold

# Use tunned params
tunned_params = {
    "tree_method": "hist",
    "eta": 0.07,
    "max_depth": 20,
    "min_child_weight": 1,
    "gamma": 0.044230646284796976,
    "subsample": 0.9405269471473167,
    "colsample_bytree": 0.2999355523666192,
    "lambda": 0.9746051811186938,
    "alpha": 4.210861941737071,
}

best_params.update(tunned_params)
print(f"best_params: {best_params}")
print("Training best model...")
evals_best_result = {}

cv_results = xgb.cv(
    best_params,
    dtrain,
    num_boost_round=4000,
    nfold=5,
    shuffle=True,
    seed=42,
    feval=fval,  # Custom evaluation metric
    maximize=True,
    as_pandas=True,
    callbacks=[
        xgb.callback.EvaluationMonitor(show_stdv=True),
        xgb.callback.EarlyStopping(200),
    ],
)
gc.collect()
# evals_result["eval"][metric][-1]
# Use the best score for the final iteration
best_score = cv_results["test-MCC-mean"].max()

best_params: {'device': 'cpu', 'verbosity': 0, 'tree_method': 'hist', 'eta': 0.07, 'max_depth': 20, 'min_child_weight': 1, 'gamma': 0.044230646284796976, 'subsample': 0.9405269471473167, 'colsample_bytree': 0.2999355523666192, 'lambda': 0.9746051811186938, 'alpha': 4.210861941737071}
Training best model...
[0]	train-rmse:0.47097+0.00004	train-MCC:0.00000+0.00000	test-rmse:0.47100+0.00004	test-MCC:0.00000+0.00000
[1]	train-rmse:0.44678+0.00008	train-MCC:0.77305+0.00157	test-rmse:0.44688+0.00008	test-MCC:0.77152+0.00208
[2]	train-rmse:0.42769+0.00008	train-MCC:0.87275+0.00246	test-rmse:0.42781+0.00010	test-MCC:0.87138+0.00288
[3]	train-rmse:0.41279+0.00007	train-MCC:0.90794+0.00166	test-rmse:0.41292+0.00010	test-MCC:0.90704+0.00202
[4]	train-rmse:0.38863+0.00012	train-MCC:0.95058+0.00035	test-rmse:0.38880+0.00008	test-MCC:0.94955+0.00055
[5]	train-rmse:0.37799+0.00014	train-MCC:0.95558+0.00029	test-rmse:0.37817+0.00010	test-MCC:0.95465+0.00055
[6]	train-rmse:0.35598+0.00013	train-MCC:0.9

In [None]:
best_score

## Feature important

In [None]:
from matplotlib import pyplot

In [None]:
# # plot
# feature_important = model.get_score(importance_type="gain")
# keys = list(feature_important.keys())
# values = list(feature_important.values())

# data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(
#     by="score", ascending=False
# )
# data.nlargest(40, columns="score").plot(
#     kind="barh", figsize=(20, 10)
# )  ## plot top 40 features