## Constants

In [None]:
import sys, os
import pandas as pd
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
import warnings
import xgboost as xgb
import joblib as jl
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import matthews_corrcoef
from mlflow.models import infer_signature
import mlflow

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# UTC date stamp (YYYY_MM_DD); reused below when naming the Optuna study and
# the MLflow evaluation table.
today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# helpers
# Put the parent directory on the import path so the project-local `helpers`
# package can be imported.
sys.path.append("..")
# NOTE(review): names used later but not defined in this notebook
# (mcc_metric_v2, matthews_corrcoef_score, Mlflow) presumably come from these
# star imports -- confirm against the helper modules.
from helpers.loss_functions import *
from helpers.mlflow import *

# data: locations of the competition files, relative to this notebook
train_path = "../data/mushrooms/train.csv"
test_path = "../data/mushrooms/test.csv"
cache_path = "../data/mushrooms/cache"
# model
# Switch for the Optuna search further down.
is_tunning = True
# Probe for an NVIDIA GPU by running `nvidia-smi`; any failure to run it
# (missing binary, driver error, ...) falls back to the CPU.
try:
    rs = subprocess.check_output("nvidia-smi")
except Exception:
    # Deliberately broad: the failure mode of a missing/broken `nvidia-smi`
    # depends on the machine configuration.
    print("No Nvidia GPU in system!")
    device = "cpu"
else:
    device = "cuda" if rs is not None else "cpu"
    print(f"device: {device}")
# Learning objective handed to XGBoost; also the lookup key into
# `objective_dict`.
goal = "binary:logistic"

# custom metric
# Per-objective evaluation settings: the metric's display name, whether it is
# a custom callable (and which one), and the direction that counts as better.
objective_dict = {
    "binary:logistic": {
        "metric": {"is_custom": True, "name": "MCC", "fval": mcc_metric_v2},
        "direction": "maximize",
    }
}

# Alternative configuration using a built-in metric instead of the custom one:
# objective_dict = {
#     "binary:logistic": {
#         "metric": {
#             "is_custom": False,
#             "name": "logloss",
#             "fval": None,
#         },
#         "direction": "minimize",
#     }
# }

# Flatten the settings of the selected objective into module-level names.
_goal_settings = objective_dict[goal]
_metric_settings = _goal_settings["metric"]
metric = _metric_settings["name"]
is_custom_metric = _metric_settings["is_custom"]
fval = _metric_settings["fval"]
direction = _goal_settings["direction"]

# Base XGBoost parameters; the tuned values are merged into this dict later.
best_params = dict(device=device, verbosity=0)

## Prepare data

In [None]:
# Load the training CSV (it carries the label column used below) and print
# its (rows, columns) shape.
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
# Load the test CSV that the final submission is predicted on.
test = pd.read_csv(test_path)
print(f"test size: {test.shape}")

In [None]:
# Name of the label column in the training frame.
target = "class"

# Every object-typed predictor is treated as a categorical feature.
predictor_frame = train.drop(columns=target)
categorical_cols = predictor_frame.select_dtypes(include="object").columns.to_list()

# Cast those columns to pandas' category dtype in both frames.
for frame in (train, test):
    for col in categorical_cols:
        frame[col] = frame[col].astype("category")

# Numeric columns, with the row identifier excluded, are the numerical features.
non_id_frame = train.drop(columns="id")
numerical_cols = non_id_frame.select_dtypes(include="number").columns.to_list()

In [None]:
# Separate predictors from the label, then hold out 20% of the rows for
# validation. The split is stratified on the label and seeded so it is
# reproducible.
features = train.drop(columns=target)
labels = train[target]
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)
# The test frame is used as-is.
X_test = test

## Data preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric features: fill missing values with the median, then standardize.
numerical_steps = [
    ("num_imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    # ("minmax", MinMaxScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical features: fill missing values with the most frequent level,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_steps = [
    ("cat_imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline.
column_transformers = [
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Pipeline wrapper around the preprocessor; it holds no model step.
data_pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the validation and test features.
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)
X_test_transformed = preprocessor.transform(X_test)

# Encode the class labels; the binarizer is fitted on the training labels.
lb = LabelBinarizer()
y_train_binarized = lb.fit_transform(y_train)
y_val_binarized = lb.transform(y_val)

# Wrap everything in XGBoost's DMatrix container (the test set has no labels).
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)
dtest = xgb.DMatrix(X_test_transformed)

gc.collect()

## Hyperparameter tuning

In [9]:
# Re-apply the blanket warning filter for this cell.
warnings.filterwarnings("ignore")
# KFold is referenced only by the commented-out cross-validation sketch
# inside objective() below.
from sklearn.model_selection import KFold

# Boolean form of the optimisation direction; passed to xgb.train as `maximize`.
maximize = direction == "maximize"


def objective(trial):
    """Train one XGBoost model for an Optuna trial and return its validation score.

    Samples a hyper-parameter set from ``trial``, trains on ``dtrain`` with
    early stopping monitored on ``dval``, and reports the best value the
    early-stopping metric reached.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        float: Best validation score of the metric that drove early stopping
        (the custom metric when ``is_custom_metric`` is true, otherwise the
        metric XGBoost evaluates by default for ``goal``).
    """
    hyper_parameters = {
        **best_params,
        "objective": goal,
        # "hist" runs on whatever best_params["device"] selects, so the trial
        # also works on the CPU fallback (the former hard-coded "gpu_hist"
        # needs a GPU).
        "tree_method": "hist",
        "eta": trial.suggest_float("eta", 0.001, 0.2),
        # suggest_int needs integer bounds; the previous fractional bounds
        # (0.001, 0.3) left no usable integer range for these two parameters.
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.001, 0.2),
        # NOTE(review): the upper bounds below cap row/column sampling at
        # 20-30% per tree -- confirm these narrow ranges are intended.
        "subsample": trial.suggest_float("subsample", 0.001, 0.2),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.001, 0.3),
        "lambda": trial.suggest_float("lambda", 0.001, 0.2),
        "alpha": trial.suggest_float("alpha", 0.001, 0.2),
    }
    evals_result = {}
    print(f"maximize: {maximize}")

    # The two modes differ only in their round budget, their patience and
    # whether the custom metric callable is supplied.
    if is_custom_metric:
        train_kwargs = {
            "num_boost_round": 10000,
            "early_stopping_rounds": 100,
            "feval": fval,
        }
    else:
        train_kwargs = {
            "num_boost_round": 4000,
            "early_stopping_rounds": 200,
        }
    model = xgb.train(
        params=hyper_parameters,
        dtrain=dtrain,
        maximize=maximize,
        evals=[(dval, "eval")],
        evals_result=evals_result,
        **train_kwargs,
    )

    # KFold cross-validation
    # kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # cv_results = xgb.cv(
    #     hyper_parameters,
    #     dtrain,
    #     num_boost_round=10000,
    #     folds=kf,
    #     early_stopping_rounds=100,
    #     stratified=True,
    #     as_pandas=True,
    #     seed=42,
    #     custom_metric=fval,
    #     maximize=maximize,
    # )

    # # Use the best score for the final iteration
    # best_score = cv_results[f"test-MCC-mean"].min()
    # return best_score

    # y_pred_prob = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    # y_pred_binary = (y_pred_prob > 0.5).astype(int)
    # mcc = matthews_corrcoef(y_val_binarized, y_pred_binary)

    # Report the best score recorded by early stopping. Indexing
    # evals_result["eval"][metric][-1] depended on the display name in
    # `metric` matching the key the metric is logged under, and it read the
    # last boosting round, which trails the best one by the patience window.
    return float(model.best_score)


if is_tunning:
    # Create or load a study. Its name combines the UTC date and the epoch
    # second, and the same name is used for the SQLite storage file.
    today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
    curr_timestamp = int(datetime.now(timezone.utc).timestamp())
    study_name = f"study_{today}_{curr_timestamp}"
    study = optuna.create_study(
        direction=direction,
        study_name=study_name,
        storage=f"sqlite:///{study_name}.db",
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)

    # Report the winning trial and the parameters it sampled.
    best_trial = study.best_trial
    print("Best trial:")
    print(f" {metric}:", best_trial.value)
    print("  Params: ")
    for param_name, param_value in best_trial.params.items():
        print("    {}: {}".format(param_name, param_value))

    # Merge the tuned values into the shared parameter dict and persist it.
    study_best_params = study.best_params
    best_params.update(study_best_params)
    jl.dump(best_params, "best_params.pkl")
    # 0.03734 anh Tu

[I 2024-08-06 00:43:51,359] A new study created in RDB with name: study_2024_08_05_1722879831


  0%|          | 0/100 [00:00<?, ?it/s]

maximize: True
[0]	eval-logloss:0.68425	eval-mcc:0.00000
[1]	eval-logloss:0.68099	eval-mcc:0.00000
[2]	eval-logloss:0.67631	eval-mcc:0.00000
[3]	eval-logloss:0.67560	eval-mcc:0.00000
[4]	eval-logloss:0.67257	eval-mcc:0.00000
[5]	eval-logloss:0.66947	eval-mcc:0.01542
[6]	eval-logloss:0.66680	eval-mcc:0.01555
[7]	eval-logloss:0.66576	eval-mcc:0.03928
[8]	eval-logloss:0.66541	eval-mcc:0.04084
[9]	eval-logloss:0.65683	eval-mcc:0.05109
[10]	eval-logloss:0.65346	eval-mcc:0.05146
[11]	eval-logloss:0.65208	eval-mcc:0.05681
[12]	eval-logloss:0.64905	eval-mcc:0.05832
[13]	eval-logloss:0.64791	eval-mcc:0.06464
[14]	eval-logloss:0.64661	eval-mcc:0.07649
[15]	eval-logloss:0.64433	eval-mcc:0.11319
[16]	eval-logloss:0.63422	eval-mcc:0.16086
[17]	eval-logloss:0.63290	eval-mcc:0.18021
[18]	eval-logloss:0.62928	eval-mcc:0.20314
[19]	eval-logloss:0.62522	eval-mcc:0.24660
[20]	eval-logloss:0.62375	eval-mcc:0.25305
[21]	eval-logloss:0.62174	eval-mcc:0.26939
[22]	eval-logloss:0.61828	eval-mcc:0.27992
[23]	e

## Train best model

In [None]:
# Mlflow
# Train the final model with the tuned parameters and record the run
# (study database, evaluation table, metrics, model, registry version).
mlflow.xgboost.autolog()
model_name = "poisonous-mushroom-classifier"
project_name = "Binary Prediction of Poisonous Mushrooms"
exp_name = "train-mushroom-classifier"
exp_desc = "Training model to submit to Binary Prediction of Poisonous Mushrooms."
mlf_client = Mlflow(model_name=model_name)
mlf_client.get_or_create_exp(
    project_name=project_name,
    experiment_name=exp_name,
    experiment_description=exp_desc,
)
curr_timestamp = int(datetime.now(timezone.utc).timestamp())

artifact_path = "model"
# NOTE(review): the "metric" tag is hard-coded to "logloss" although the
# configured metric lives in `metric` -- confirm which value is wanted.
with mlflow.start_run(
    run_name=f"mean_strategy_{curr_timestamp}",
    tags={"metric": "logloss"},
    description=None,
    log_system_metrics=True,
) as run:
    # Best effort: the study database only exists when tuning ran in this
    # session (`study_name` is undefined otherwise), so a failure here must
    # not abort the run -- but it is reported instead of silently dropped.
    try:
        mlflow.log_artifact(local_path=f"{study_name}.db", artifact_path=artifact_path)
    except Exception as exc:
        print(f"Skipping study artifact upload: {exc!r}")
    # Use tunned params
    # tunned_params = {
    #     "eta": 0.1899871885683955,
    #     "max_depth": 10,
    #     "min_child_weight": 2,
    #     "gamma": 0.42570860420610934,
    #     "subsample": 0.74464089552046,
    #     "colsample_bytree": 0.6449797444444113,
    #     "lambda": 0.4134501484785982,
    #     "alpha": 6.520908679019516,
    # }

    # {
    #     "eta": 0.018087779785882732,
    #     "max_depth": 10,
    #     "min_child_weight": 2,
    #     "gamma": 0.42570860420610934,
    #     "subsample": 0.74464089552046,
    #     "colsample_bytree": 0.6449797444444113,
    #     "lambda": 0.4134501484785982,
    #     "alpha": 6.520908679019516,
    # }

    # best_params.update(tunned_params)
    print(f"best_params: {best_params}")
    print("Training best model...")
    evals_best_result = {}

    # best_params holds device/verbosity plus the tuned values but never the
    # objective, so without this the model would train with XGBoost's default
    # objective and the "logloss" lookup below would have no matching metric.
    # Anything already present in best_params takes precedence.
    train_params = {"objective": goal, **best_params}

    # The two modes differ only in the custom metric callable and the
    # early-stopping patience.
    if is_custom_metric:
        print("Training with custom metric")
        extra_train_args = {"feval": fval, "early_stopping_rounds": 100}
    else:
        print("Training with original metric")
        extra_train_args = {"early_stopping_rounds": 400}
    model = xgb.train(
        params=train_params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=[(dval, "eval")],
        maximize=maximize,
        evals_result=evals_best_result,
        **extra_train_args,
    )

    mcc, _, validate_df = matthews_corrcoef_score(model, dval, y_val_binarized, lb)
    # logs evaluation tables
    mlflow.log_table(data=validate_df, artifact_file=f"eval_results_{today}.json")

    # logs metrics
    metrics = {}
    metrics["MCC"] = mcc
    metrics["logloss"] = evals_best_result["eval"]["logloss"][-1]
    mlflow.log_metrics(metrics)
    _, classes, _ = matthews_corrcoef_score(model, dtest, None, lb)
    signature = infer_signature(X_test_transformed, classes)
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path=artifact_path,
        signature=signature,
    )
    # Register model name in the model registry
    try:
        mlf_client.register_model()
    except Exception as exc:
        # Registration is expected to fail when the name already exists;
        # the reason is printed so other failures are not hidden.
        print(f"model {model_name} already registered")
        print(f"  register_model reported: {exc!r}")
    # Model versioning
    mv = mlf_client.version_model(run.info.run_id)
    print(f"Name: {mv.name}")
    print(f"Version: {mv.version}")
    print(f"Description: {mv.description}")
    print(f"Status: {mv.status}")
    print(f"Stage: {mv.current_stage}")

## Update Submission

In [None]:
# Predict class labels for the test set with the trained model.
dtest = xgb.DMatrix(X_test_transformed)
_, classes, _ = matthews_corrcoef_score(model, dtest, None, lb)

# Two-column submission file: the test row ids and their predicted class.
submit_df = test[["id"]].copy()
submit_df["class"] = classes
submit_df.to_csv("submission.csv", index=False)

## Feature importance

In [None]:
from matplotlib import pyplot

In [None]:
# plot
# Gain-based importance score per feature, as reported by the trained booster.
feature_important = model.get_score(importance_type="gain")
keys, values = list(feature_important), list(feature_important.values())

# One row per feature, highest score first.
data = pd.DataFrame({"score": values}, index=keys).sort_values(
    "score", ascending=False
)

## plot top 40 features
top_features = data.nlargest(40, columns="score")
top_features.plot(kind="barh", figsize=(20, 10))