## Constants

In [1]:
import sys, os
import pandas as pd
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
import warnings
import xgboost as xgb
import joblib as jl
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import matthews_corrcoef
from mlflow.models import infer_signature
import mlflow

mlflow.xgboost.autolog()
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
warnings.filterwarnings("ignore")

# helpers
sys.path.append("..")
from helpers.loss_functions import *
from helpers.mlflow import *

# data
train_path = "../data/mushrooms/train.csv"
test_path = "../data/mushrooms/test.csv"
cache_path = "../data/mushrooms/cache"
# model
is_tunning = True
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
    print(f"device: {device}")
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")
    device = "cpu"
goal = "binary:logistic"

# custom metric
# objective_dict = {
#     "binary:logistic": {
#         "metric": {"is_custom": True, "name": "MCC", "fval": mcc_metric}
#     }
# }

objective_dict = {
    "binary:logistic": {"metric": {"is_custom": False, "name": "logloss", "fval": None}}
}
metric = objective_dict[goal]["metric"]["name"]
is_custom_metric = objective_dict[goal]["metric"]["is_custom"]
fval = objective_dict[goal]["metric"]["fval"]
best_params = {
    "objective": goal,
    "device": device,
    "verbosity": 0,
}

device: cuda


## Prepare data

In [2]:
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
test = pd.read_csv(test_path)
print(f"test size: {test.shape}")

train size: (3116945, 22)
test size: (2077964, 21)


In [3]:
target = "class"

categorical_cols = (
    train.drop(columns=target).select_dtypes(include="object").columns.to_list()
)
for c in categorical_cols:
    train[c] = train[c].astype("category")
    test[c] = test[c].astype("category")
numerical_cols = (
    train.drop(columns="id").select_dtypes(include="number").columns.to_list()
)

In [4]:
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=target),
    train[target],
    test_size=0.2,
    random_state=42,
    stratify=train[target],
)
X_test = test

## Data preprocessing

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Create the numerical and categorical pipelines
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        # ("minmax", MinMaxScaler()),
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Create the full pipeline with the XGBoost model
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Preprocess the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_val_transformed = preprocessor.transform(X_val)
# Binarize the target labels
lb = LabelBinarizer()

y_train_binarized = lb.fit_transform(y_train)
y_val_binarized = lb.transform(y_val)

# prepare data for training
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)
dtest = xgb.DMatrix(X_test_transformed)

## Hyperparamters tuning

In [6]:
# warnings.filterwarnings("ignore")


# def objective(trial):
#     hyper_parameters = {
#         **best_params,
#         **{
#             "eta": trial.suggest_float("eta", 0.01, 0.2),
#             "max_depth": trial.suggest_int("max_depth", 0.01, 0.03),
#             "min_child_weight": trial.suggest_int("min_child_weight", 0.01, 0.3),
#             "gamma": trial.suggest_float("gamma", 0.01, 0.2),
#             "subsample": trial.suggest_float("subsample", 0.01, 0.2),
#             "colsample_bytree": trial.suggest_float("colsample_bytree", 0, 1),
#             "lambda": trial.suggest_float("lambda", 0.01, 0.2),
#             "alpha": trial.suggest_float("alpha", 0.01, 0.2),
#         },
#     }
#     evals_result = {}
#     if is_custom_metric:
#         xgb.train(
#             params=hyper_parameters,
#             dtrain=dtrain,
#             num_boost_round=10000,
#             evals=[(dval, "eval")],
#             feval=fval,
#             evals_result=evals_result,
#             early_stopping_rounds=200,
#         )
#     else:
#         xgb.train(
#             params=hyper_parameters,
#             dtrain=dtrain,
#             num_boost_round=10000,
#             evals=[(dval, "eval")],
#             evals_result=evals_result,
#             early_stopping_rounds=200,
#         )
#     return evals_result["eval"][metric][-1]


# if is_tunning:
#     # Create or load a study
#     today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
#     curr_timestamp = int(datetime.now(timezone.utc).timestamp())
#     study_name = f"study_{today}"
#     study = optuna.create_study(
#         study_name=study_name,
#         storage=f"sqlite:///{study_name}.db",
#         direction="minimize",
#         load_if_exists=True,
#     )
#     study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)
#     # Print best trial
#     best_trial = study.best_trial
#     print("Best trial:")
#     print(f" {metric}:", best_trial.value)
#     print("  Params: ")
#     for key, value in best_trial.params.items():
#         print("    {}: {}".format(key, value))
#     study_best_params = study.best_params
#     best_params.update(study_best_params)
#     jl.dump(best_params, "best_params.pkl")
#     # 0.03734 anh Tu

## Train best model

In [7]:
# Mlflow
model_name = "poisonous-mushroom-classifier"
project_name = "Binary Prediction of Poisonous Mushrooms"
exp_name = "train-mushroom-classifier"
exp_desc = "Training model to submit to Binary Prediction of Poisonous Mushrooms."
mlf_client = Mlflow(model_name=model_name)
mlf_client.get_or_create_exp(
    project_name=project_name,
    experiment_name=exp_name,
    experiment_description=exp_desc,
)
curr_timestamp = int(datetime.now(timezone.utc).timestamp())

artifact_path = "model"
with mlflow.start_run(
    run_name=f"mean_strategy_{curr_timestamp}",
    tags={"metric": "logloss"},
    description=None,
    log_system_metrics=True,
) as run:
    try:
        mlflow.log_artifact(local_path=f"{study_name}.db", artifact_path=artifact_path)
    except:
        pass
    # Use tunned params
    tunned_params = {
        "eta": 0.1899871885683955,
        "max_depth": 10,
        "min_child_weight": 2,
        "gamma": 0.42570860420610934,
        "subsample": 0.74464089552046,
        "colsample_bytree": 0.6449797444444113,
        "lambda": 0.4134501484785982,
        "alpha": 6.520908679019516,
    }

    best_params.update(tunned_params)

    print("Training best model...")
    evals_best_result = {}
    # Create the full pipeline with the XGBoost model
    if is_custom_metric:
        print("Training with custom metric")
        model = xgb.train(
            params=best_params,
            dtrain=dtrain,
            num_boost_round=10000,
            evals=[(dval, "eval")],
            feval=fval,
            evals_result=evals_best_result,
            early_stopping_rounds=100,
        )
    else:
        print("Training with original metric")
        model = xgb.train(
            params=best_params,
            dtrain=dtrain,
            num_boost_round=10000,
            evals=[(dval, "eval")],
            evals_result=evals_best_result,
            early_stopping_rounds=200,
        )

    mcc, _, validate_df = matthews_corrcoef_score(model, dval, y_val_binarized, lb)
    # logs evaluation tables
    mlflow.log_table(data=validate_df, artifact_file=f"eval_results_{today}.json")

    # logs metrics
    metrics = {}
    metrics["MCC"] = mcc
    metrics[metric] = evals_best_result["eval"][metric][-1]
    mlflow.log_metrics(metrics)
    _, classes, _ = matthews_corrcoef_score(model, dtest, None, lb)
    signature = infer_signature(X_test_transformed, classes)
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path=artifact_path,
        signature=signature,
    )
    # Register model name in the model registry
    try:
        mlf_client.register_model()
    except:
        print(f"model {model_name} already registered")
    # Model versioning
    mv = mlf_client.version_model(run.info.run_id)
    print(f"Name: {mv.name}")
    print(f"Version: {mv.version}")
    print(f"Description: {mv.description}")
    print(f"Status: {mv.status}")
    print(f"Stage: {mv.current_stage}")

2024/08/05 00:01:15 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Setting connection to Mlflow...
Checking whether experiment existed
Using existed experiment: train-mushroom-classifier
Training best model...
Training with original metric
[0]	eval-logloss:0.56379
[1]	eval-logloss:0.47158
[2]	eval-logloss:0.40652
[3]	eval-logloss:0.35440
[4]	eval-logloss:0.30978
[5]	eval-logloss:0.27201
[6]	eval-logloss:0.24125
[7]	eval-logloss:0.21564
[8]	eval-logloss:0.19675
[9]	eval-logloss:0.17898
[10]	eval-logloss:0.15863
[11]	eval-logloss:0.14343
[12]	eval-logloss:0.12918
[13]	eval-logloss:0.11914
[14]	eval-logloss:0.10876
[15]	eval-logloss:0.10419
[16]	eval-logloss:0.09789
[17]	eval-logloss:0.09253
[18]	eval-logloss:0.08659
[19]	eval-logloss:0.08382
[20]	eval-logloss:0.08045
[21]	eval-logloss:0.07785
[22]	eval-logloss:0.07498
[23]	eval-logloss:0.07316
[24]	eval-logloss:0.07093
[25]	eval-logloss:0.06827
[26]	eval-logloss:0.06719
[27]	eval-logloss:0.06378
[28]	eval-logloss:0.06160
[29]	eval-logloss:0.05979
[30]	eval-logloss:0.05820
[31]	eval-logloss:0.05666
[32]	

2024/08/05 00:01:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: poisonous-mushroom-classifier, version 4
2024/08/05 00:01:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run mean_strategy_1722790875 at: http://localhost:8080/#/experiments/901265626273758176/runs/3094f62eda8d4bda8e0e465077a4363c.
2024/08/05 00:01:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/901265626273758176.
2024/08/05 00:01:38 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/08/05 00:01:38 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


poisonous-mushroom-classifier
poisonous-mushroom-classifier has been existed
Name: poisonous-mushroom-classifier
Version: 4
Description: 
Status: READY
Stage: None


In [8]:
validate_df

Unnamed: 0,actual_value,predicted_value
0,e,e
1,e,e
2,e,e
3,e,e
4,e,e
...,...,...
623384,e,e
623385,e,e
623386,e,e
623387,e,e


## Model eval

In [9]:
mcc, _, validate_df = matthews_corrcoef_score(model, dval, y_val_binarized, lb)
print(f"correlation coefficient: {mcc}")

correlation coefficient: 0.9833906875450024


## Update Submission

In [10]:
dtest = xgb.DMatrix(X_test_transformed)
_, classes = matthews_corrcoef_score(model, dtest, None, lb)

submit_df = pd.DataFrame()
submit_df["id"] = test["id"]
submit_df["class"] = classes
submit_df.to_csv("submission.csv", index=False)

ValueError: too many values to unpack (expected 2)

## Feature important

In [None]:
from matplotlib import pyplot

In [None]:
# plot
feature_important = model.get_score(importance_type="gain")
keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(
    by="score", ascending=False
)
data.nlargest(40, columns="score").plot(
    kind="barh", figsize=(20, 10)
)  ## plot top 40 features