## Constants

In [None]:
import sys, os
import pandas as pd
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
import warnings
import xgboost as xgb
import joblib as jl
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import matthews_corrcoef
from mlflow.models import infer_signature
import mlflow
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold

today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
warnings.filterwarnings("ignore")

from hyper_params import (
    mushroom_tuning_2024_08_06_1722934727_params,
)

# helpers
from helpers.loss_functions import *
from helpers.mlflow import *


SEED = 108
random.seed(SEED)
N_FOLDS = 20
# data
train_path = "../../data/mushrooms/train.csv"
test_path = "../../data/mushrooms/test.csv"
cache_path = "../../data/mushrooms/cache"
# model
is_tunning = True
SEED = 108
random.seed(SEED)
N_FOLDS = 3
# data
train_path = "../data/mushrooms/train.csv"
test_path = "../data/mushrooms/test.csv"
cache_path = "../data/mushrooms/cache"
# model
is_tunning = True
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")
    device = "cpu"
goal = "binary:logistic"

# custom metric
objective_dict = {
    "binary:logistic": {
        "metric": {
            "is_custom": False,
            "name": "logloss",
            "fval": None,
        },
        "direction": "minimize",
    }
}
# objective_dict = {
#     "binary:logistic": {
#         "metric": {
#             "is_custom": True,
#             "name": "MCC",
#             "fval": mcc_metric_v2,
#         },
#         "direction": "maximize",
#     }
# }
metric = objective_dict[goal]["metric"]["name"]
is_custom_metric = objective_dict[goal]["metric"]["is_custom"]
fval = objective_dict[goal]["metric"]["fval"]
direction = objective_dict[goal]["direction"]

best_params = {
    "device": device,
    "verbosity": 0,
    "objective": goal,
}
# best_params.update(mushroom_tuning_2024_08_06_1722934727_params)
best_params

## Prepare data

In [None]:
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
test = pd.read_csv(test_path)
print(f"test size: {test.shape}")

In [None]:
target = "class"

categorical_cols = (
    train.drop(columns=target).select_dtypes(include="object").columns.to_list()
)
for c in categorical_cols:
    train[c] = train[c].astype("category")
    test[c] = test[c].astype("category")
numerical_cols = (
    train.drop(columns="id").select_dtypes(include="number").columns.to_list()
)

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(
#     train.drop(columns=target),
#     train[target],
#     test_size=0.2,
#     random_state=42,
#     stratify=train[target],
# )
X_test = test
X_train = train.drop(columns=target)
y_train = train[target]
gc.collect()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(font_scale=1.5)
X_train['cap-shape'].value_counts().sort_values().plot(kind='barh',figsize=(10, 8))
plt.xlabel("Count", labelpad=14)
plt.ylabel("Label", labelpad=14)
plt.title("Orginal Label for product column", y=1.02)

## Data preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler

weird_columns = [
    "cap-shape",
    "cap-surface",
    "cap-color",
    "gill-attachment",
    "gill-spacing",
    "gill-color",
    "veil-type",
    "veil-color",
    "has-ring",
    "ring-type",
    "spore-print-color",
    "habitat",
    "does-bruise-or-bleed",
    "stem-root",
    "stem-surface",
    "stem-color",
]

for col in weird_columns:
    allowed_vals = test[col].unique()
    train.loc[~train[col].isin(allowed_vals), col] = np.nan
    test.loc[~test[col].isin(allowed_vals), col] = np.nan


# Create the numerical and categorical pipelines
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("minmax", MinMaxScaler()),
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Create the full pipeline with the XGBoost model
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Preprocess the data
X_train_transformed = data_pipeline.fit_transform(X_train)
X_test_transformed = data_pipeline.transform(X_test)
# X_val_transformed = preprocessor.transform(X_val)
# Binarize the target labels
lb = LabelBinarizer()

y_train_binarized = lb.fit_transform(y_train)
# y_val_binarized = lb.transform(y_val)

# prepare data for training
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
# dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)
dtest = xgb.DMatrix(X_test_transformed)

gc.collect()

## CV

In [18]:
from tqdm import tqdm


def objective(trial):
    skf = StratifiedKFold(n_splits=N_FOLDS)

    # kf = KFold(n_splits=N_FOLDS)
    y_preds = []
    y_trues = []
    hyper_parameters = {
        **best_params,
        **{
            "objective": goal,
            "tree_method": "gpu_hist",
            "eta": trial.suggest_float("eta", 0.001, 0.1),  # 4
            "max_depth": trial.suggest_int("max_depth", 0, 15),  # 3
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0.001, 0.1),  # 2
            "subsample": trial.suggest_float("subsample", 0.1, 1),  # 1
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
            "lambda": trial.suggest_float("lambda", 0.001, 10),
            "alpha": trial.suggest_float("alpha", 0.001, 10),
        },
    }
    for train_index, test_index in tqdm(
        skf.split(X_train_transformed, y_train_binarized)
    ):
        X_train, X_test = (
            X_train_transformed[train_index],
            X_train_transformed[test_index],
        )
        y_train, y_test = y_train_binarized[train_index], y_train_binarized[test_index]

        # prepare data for training
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_test, label=y_test)
        clf = xgb.train(
            params=hyper_parameters,
            dtrain=dtrain,
            num_boost_round=2000,
            evals=[(dval, "eval")],
            early_stopping_rounds=100,
        )
        y_pred = clf.predict(dval)
        y_pred = (y_pred > 0.51).astype(int)
        y_preds.append(y_pred)
        y_trues.append(y_test)
    # Concatenate the predictions and true labels
    y_preds_concat = np.concatenate(y_preds)
    y_trues_concat = np.concatenate(y_trues)
    mcc = matthews_corrcoef(y_trues_concat, y_preds_concat)
    return mcc


# Create or load a study
today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
curr_timestamp = int(datetime.now(timezone.utc).timestamp())
study_name = f"study_{today}_{curr_timestamp}"
study = optuna.create_study(
    study_name=study_name,
    storage=f"sqlite:///{study_name}.db",
    direction="maximize",
    load_if_exists=True,
)
study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)
# Print best trial
best_trial = study.best_trial
print("Best trial:")
print(f" {metric}:", best_trial.value)
print("  Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))
study_best_params = study.best_params
best_params.update(study_best_params)
jl.dump(best_params, "best_params.pkl")

[I 2024-08-09 01:03:11,440] A new study created in RDB with name: study_2024_08_08_1723140191


  0%|          | 0/100 [00:00<?, ?it/s]



[0]	eval-logloss:0.64381
[1]	eval-logloss:0.60729
[2]	eval-logloss:0.56756
[3]	eval-logloss:0.54082
[4]	eval-logloss:0.51093
[5]	eval-logloss:0.48768
[6]	eval-logloss:0.45452
[7]	eval-logloss:0.43055
[8]	eval-logloss:0.40829
[9]	eval-logloss:0.38581
[10]	eval-logloss:0.36420
[11]	eval-logloss:0.34223
[12]	eval-logloss:0.32337
[13]	eval-logloss:0.30806
[14]	eval-logloss:0.29402
[15]	eval-logloss:0.28161
[16]	eval-logloss:0.27234
[17]	eval-logloss:0.26417
[18]	eval-logloss:0.25369
[19]	eval-logloss:0.24702
[20]	eval-logloss:0.23955
[21]	eval-logloss:0.23182
[22]	eval-logloss:0.22641
[23]	eval-logloss:0.22027
[24]	eval-logloss:0.21480
[25]	eval-logloss:0.20722
[26]	eval-logloss:0.20321
[27]	eval-logloss:0.19913
[28]	eval-logloss:0.19428
[29]	eval-logloss:0.18627
[30]	eval-logloss:0.18284
[31]	eval-logloss:0.17936
[32]	eval-logloss:0.17620
[33]	eval-logloss:0.16844
[34]	eval-logloss:0.16526
[35]	eval-logloss:0.15888
[36]	eval-logloss:0.15570
[37]	eval-logloss:0.15025
[38]	eval-logloss:0.14



[0]	eval-logloss:0.64064
[1]	eval-logloss:0.60391
[2]	eval-logloss:0.56422
[3]	eval-logloss:0.53437
[4]	eval-logloss:0.50180
[5]	eval-logloss:0.47399
[6]	eval-logloss:0.44095
[7]	eval-logloss:0.41459
[8]	eval-logloss:0.39428
[9]	eval-logloss:0.37746
[10]	eval-logloss:0.35645
[11]	eval-logloss:0.33976
[12]	eval-logloss:0.32167
[13]	eval-logloss:0.31032
[14]	eval-logloss:0.29489
[15]	eval-logloss:0.28499
[16]	eval-logloss:0.27376
[17]	eval-logloss:0.26422
[18]	eval-logloss:0.25665
[19]	eval-logloss:0.24750
[20]	eval-logloss:0.24103
[21]	eval-logloss:0.23355
[22]	eval-logloss:0.22691
[23]	eval-logloss:0.22174
[24]	eval-logloss:0.21587
[25]	eval-logloss:0.21119
[26]	eval-logloss:0.20624
[27]	eval-logloss:0.20254
[28]	eval-logloss:0.19363
[29]	eval-logloss:0.19066
[30]	eval-logloss:0.18543
[31]	eval-logloss:0.18055
[32]	eval-logloss:0.17790
[33]	eval-logloss:0.17486
[34]	eval-logloss:0.17027
[35]	eval-logloss:0.16554
[36]	eval-logloss:0.16296
[37]	eval-logloss:0.16124
[38]	eval-logloss:0.15



[0]	eval-logloss:0.64399
[1]	eval-logloss:0.60757
[2]	eval-logloss:0.56788
[3]	eval-logloss:0.53677
[4]	eval-logloss:0.51068
[5]	eval-logloss:0.48269
[6]	eval-logloss:0.45218
[7]	eval-logloss:0.42999
[8]	eval-logloss:0.40900
[9]	eval-logloss:0.38680
[10]	eval-logloss:0.36215
[11]	eval-logloss:0.34021
[12]	eval-logloss:0.32127
[13]	eval-logloss:0.31005
[14]	eval-logloss:0.29486
[15]	eval-logloss:0.28485
[16]	eval-logloss:0.26989
[17]	eval-logloss:0.26208
[18]	eval-logloss:0.25412
[19]	eval-logloss:0.24612
[20]	eval-logloss:0.23896
[21]	eval-logloss:0.23035
[22]	eval-logloss:0.22489
[23]	eval-logloss:0.21921
[24]	eval-logloss:0.21430
[25]	eval-logloss:0.20865
[26]	eval-logloss:0.20409
[27]	eval-logloss:0.19862
[28]	eval-logloss:0.19154
[29]	eval-logloss:0.18745
[30]	eval-logloss:0.18398
[31]	eval-logloss:0.18051
[32]	eval-logloss:0.17726
[33]	eval-logloss:0.17353
[34]	eval-logloss:0.16979
[35]	eval-logloss:0.16659
[36]	eval-logloss:0.16210
[37]	eval-logloss:0.15946
[38]	eval-logloss:0.15

In [None]:
submit_df = pd.DataFrame()
submit_df["id"] = test["id"]
y_preds = clf.predict(X_test_transformed)
pred_classes = lb.inverse_transform(y_preds)
submit_df["class"] = pred_classes
submit_df.to_csv("submission.csv", index=False)