## Constants

In [1]:
import sys, os
import pandas as pd
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
import warnings
import xgboost as xgb
import joblib as jl
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import matthews_corrcoef
from mlflow.models import infer_signature
import mlflow
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
warnings.filterwarnings("ignore")
from hyper_params import (
    mushroom_tuning_2024_08_06_1722934727_params,
)

# helpers
sys.path.append("..")
from helpers.loss_functions import *
from helpers.mlflow import *


SEED = 108
random.seed(SEED)
N_FOLDS = 5
# data
train_path = "../data/mushrooms/train.csv"
test_path = "../data/mushrooms/test.csv"
cache_path = "../data/mushrooms/cache"
# model
is_tunning = True
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")
    device = "cpu"
goal = "binary:logistic"

# custom metric
objective_dict = {
    "binary:logistic": {
        "metric": {
            "is_custom": False,
            "name": "logloss",
            "fval": None,
        },
        "direction": "minimize",
    }
}
# objective_dict = {
#     "binary:logistic": {
#         "metric": {
#             "is_custom": True,
#             "name": "MCC",
#             "fval": mcc_metric_v2,
#         },
#         "direction": "maximize",
#     }
# }
metric = objective_dict[goal]["metric"]["name"]
is_custom_metric = objective_dict[goal]["metric"]["is_custom"]
fval = objective_dict[goal]["metric"]["fval"]
direction = objective_dict[goal]["direction"]
best_params = {
    "device": device,
    "verbosity": 0,
    "objective": goal,
    # "eval_metric": fval,
}
best_params.update(mushroom_tuning_2024_08_06_1722934727_params)
best_params

{'device': 'cuda',
 'verbosity': 0,
 'objective': 'binary:logistic',
 'tree_method': 'hist',
 'eta': 0.0696294726051571,
 'max_depth': 0,
 'min_child_weight': 1,
 'gamma': 0.044230646284796976,
 'subsample': 0.9405269471473167,
 'colsample_bytree': 0.2999355523666192,
 'lambda': 0.9746051811186938,
 'alpha': 4.210861941737071}

## Prepare data

In [2]:
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
test = pd.read_csv(test_path)
print(f"test size: {test.shape}")
weird_columns = [
    "cap-shape",
    "cap-surface",
    "cap-color",
    "gill-attachment",
    "gill-spacing",
    "gill-color",
    "veil-type",
    "veil-color",
    "has-ring",
    "ring-type",
    "spore-print-color",
    "habitat",
    "does-bruise-or-bleed",
    "stem-root",
    "stem-surface",
    "stem-color",
]

for col in weird_columns:
    allowed_vals = test[col].unique()
    train.loc[~train[col].isin(allowed_vals), col] = "other"
    test.loc[~test[col].isin(allowed_vals), col] = "other"

train size: (3116945, 22)
test size: (2077964, 21)


In [3]:
target = "class"

categorical_cols = (
    train.drop(columns=target).select_dtypes(include="object").columns.to_list()
)
for c in categorical_cols:
    train[c] = train[c].astype("category")
    test[c] = test[c].astype("category")
numerical_cols = (
    train.drop(columns="id").select_dtypes(include="number").columns.to_list()
)

In [4]:
# X_train, X_val, y_train, y_val = train_test_split(
#     train.drop(columns=target),
#     train[target],
#     test_size=0.2,
#     random_state=42,
#     stratify=train[target],
# )
X_test = test
X_train = train.drop(columns=target)
y_train = train[target]
gc.collect()

132

In [5]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# sns.set_theme(font_scale=1.5)
# X_train['cap-shape'].value_counts().sort_values().plot(kind='barh',figsize=(10, 8))
# plt.xlabel("Count", labelpad=14)
# plt.ylabel("Label", labelpad=14)
# plt.title("Orginal Label for product column", y=1.02)

## Data preprocessing

In [6]:
from sklearn.preprocessing import MinMaxScaler




# Create the numerical and categorical pipelines
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("minmax", MinMaxScaler()),
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Create the full pipeline with the XGBoost model
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Preprocess the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
# X_val_transformed = preprocessor.transform(X_val)
# Binarize the target labels
lb = LabelBinarizer()

y_train_binarized = lb.fit_transform(y_train)
# y_val_binarized = lb.transform(y_val)

# prepare data for training
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
# dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)
dtest = xgb.DMatrix(X_test_transformed)

gc.collect()

62

## CV

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

clf: xgb.XGBClassifier = xgb.XGBClassifier(
    **best_params,
    # callbacks=[xgb.callback.EarlyStopping(50)],
    n_estimators=300,
)

In [8]:
from tqdm import tqdm
kf = KFold(n_splits=N_FOLDS)
y_preds = []
y_trues = []
for train_index, test_index in tqdm(kf.split(X_train_transformed, y_train_binarized)):
    X_train, X_test = X_train_transformed[train_index], X_train_transformed[test_index]
    y_train, y_test = y_train_binarized[train_index], y_train_binarized[test_index]
    clf.fit(
        X=X_train,
        y=y_train,
        eval_set=[(X_test, y_test)],
    )

    y_pred = clf.predict(X_test)
    y_preds.append(y_pred)
    y_trues.append(y_test)
# Concatenate the predictions and true labels
y_preds_concat = np.concatenate(y_preds)
y_trues_concat = np.concatenate(y_trues)
mcc = matthews_corrcoef(y_trues_concat, y_preds_concat)
print(f"Validation mcc score: {mcc}")

0it [00:00, ?it/s]

[0]	validation_0-logloss:0.63812
[1]	validation_0-logloss:0.58476
[2]	validation_0-logloss:0.54235
[3]	validation_0-logloss:0.51615
[4]	validation_0-logloss:0.48078
[5]	validation_0-logloss:0.45419
[6]	validation_0-logloss:0.43975
[7]	validation_0-logloss:0.40820
[8]	validation_0-logloss:0.38201
[9]	validation_0-logloss:0.35452
[10]	validation_0-logloss:0.33968
[11]	validation_0-logloss:0.31763
[12]	validation_0-logloss:0.29763
[13]	validation_0-logloss:0.27966
[14]	validation_0-logloss:0.27007
[15]	validation_0-logloss:0.25598
[16]	validation_0-logloss:0.24617
[17]	validation_0-logloss:0.23652
[18]	validation_0-logloss:0.22713
[19]	validation_0-logloss:0.21877
[20]	validation_0-logloss:0.21019
[21]	validation_0-logloss:0.19890
[22]	validation_0-logloss:0.18782
[23]	validation_0-logloss:0.18144
[24]	validation_0-logloss:0.17155
[25]	validation_0-logloss:0.16448
[26]	validation_0-logloss:0.16039
[27]	validation_0-logloss:0.15160
[28]	validation_0-logloss:0.14568
[29]	validation_0-loglos

1it [00:18, 18.46s/it]

[0]	validation_0-logloss:0.63840
[1]	validation_0-logloss:0.58499
[2]	validation_0-logloss:0.54259
[3]	validation_0-logloss:0.51637
[4]	validation_0-logloss:0.48095
[5]	validation_0-logloss:0.45432
[6]	validation_0-logloss:0.43990
[7]	validation_0-logloss:0.40838
[8]	validation_0-logloss:0.38214
[9]	validation_0-logloss:0.35462
[10]	validation_0-logloss:0.33977
[11]	validation_0-logloss:0.31769
[12]	validation_0-logloss:0.29769
[13]	validation_0-logloss:0.27976
[14]	validation_0-logloss:0.27015
[15]	validation_0-logloss:0.25604
[16]	validation_0-logloss:0.24621
[17]	validation_0-logloss:0.23654
[18]	validation_0-logloss:0.22713
[19]	validation_0-logloss:0.21879
[20]	validation_0-logloss:0.21017
[21]	validation_0-logloss:0.19885
[22]	validation_0-logloss:0.18776
[23]	validation_0-logloss:0.18137
[24]	validation_0-logloss:0.17148
[25]	validation_0-logloss:0.16440
[26]	validation_0-logloss:0.16031
[27]	validation_0-logloss:0.15153
[28]	validation_0-logloss:0.14562
[29]	validation_0-loglos

2it [00:36, 18.43s/it]

[0]	validation_0-logloss:0.63830
[1]	validation_0-logloss:0.58490
[2]	validation_0-logloss:0.54244
[3]	validation_0-logloss:0.51622
[4]	validation_0-logloss:0.48086
[5]	validation_0-logloss:0.45429
[6]	validation_0-logloss:0.43988
[7]	validation_0-logloss:0.40836
[8]	validation_0-logloss:0.38211
[9]	validation_0-logloss:0.35462
[10]	validation_0-logloss:0.33979
[11]	validation_0-logloss:0.31773
[12]	validation_0-logloss:0.29769
[13]	validation_0-logloss:0.27970
[14]	validation_0-logloss:0.27012
[15]	validation_0-logloss:0.25602
[16]	validation_0-logloss:0.24621
[17]	validation_0-logloss:0.23657
[18]	validation_0-logloss:0.22715
[19]	validation_0-logloss:0.21880
[20]	validation_0-logloss:0.21017
[21]	validation_0-logloss:0.19884
[22]	validation_0-logloss:0.18776
[23]	validation_0-logloss:0.18138
[24]	validation_0-logloss:0.17146
[25]	validation_0-logloss:0.16439
[26]	validation_0-logloss:0.16031
[27]	validation_0-logloss:0.15152
[28]	validation_0-logloss:0.14559
[29]	validation_0-loglos

3it [00:55, 18.41s/it]

[0]	validation_0-logloss:0.63834
[1]	validation_0-logloss:0.58497
[2]	validation_0-logloss:0.54256
[3]	validation_0-logloss:0.51640
[4]	validation_0-logloss:0.48106
[5]	validation_0-logloss:0.45443
[6]	validation_0-logloss:0.43999
[7]	validation_0-logloss:0.40850
[8]	validation_0-logloss:0.38228
[9]	validation_0-logloss:0.35480
[10]	validation_0-logloss:0.33994
[11]	validation_0-logloss:0.31787
[12]	validation_0-logloss:0.29786
[13]	validation_0-logloss:0.27990
[14]	validation_0-logloss:0.27030
[15]	validation_0-logloss:0.25620
[16]	validation_0-logloss:0.24639
[17]	validation_0-logloss:0.23671
[18]	validation_0-logloss:0.22730
[19]	validation_0-logloss:0.21894
[20]	validation_0-logloss:0.21033
[21]	validation_0-logloss:0.19901
[22]	validation_0-logloss:0.18793
[23]	validation_0-logloss:0.18155
[24]	validation_0-logloss:0.17166
[25]	validation_0-logloss:0.16458
[26]	validation_0-logloss:0.16050
[27]	validation_0-logloss:0.15171
[28]	validation_0-logloss:0.14578
[29]	validation_0-loglos

4it [01:13, 18.34s/it]

[0]	validation_0-logloss:0.63823
[1]	validation_0-logloss:0.58486
[2]	validation_0-logloss:0.54247
[3]	validation_0-logloss:0.51626
[4]	validation_0-logloss:0.48221
[5]	validation_0-logloss:0.45552
[6]	validation_0-logloss:0.44099
[7]	validation_0-logloss:0.40938
[8]	validation_0-logloss:0.38307
[9]	validation_0-logloss:0.35550
[10]	validation_0-logloss:0.34060
[11]	validation_0-logloss:0.31849
[12]	validation_0-logloss:0.29837
[13]	validation_0-logloss:0.28034
[14]	validation_0-logloss:0.27070
[15]	validation_0-logloss:0.25655
[16]	validation_0-logloss:0.24670
[17]	validation_0-logloss:0.23699
[18]	validation_0-logloss:0.22755
[19]	validation_0-logloss:0.21917
[20]	validation_0-logloss:0.21054
[21]	validation_0-logloss:0.19916
[22]	validation_0-logloss:0.18804
[23]	validation_0-logloss:0.18163
[24]	validation_0-logloss:0.17169
[25]	validation_0-logloss:0.16459
[26]	validation_0-logloss:0.16048
[27]	validation_0-logloss:0.15166
[28]	validation_0-logloss:0.14572
[29]	validation_0-loglos

5it [01:31, 18.38s/it]


Validation mcc score: 0.9835346920598655


In [9]:
submit_df = pd.DataFrame()
submit_df["id"] = test["id"]
y_preds = clf.predict(X_test_transformed)
pred_classes = lb.inverse_transform(y_preds)
submit_df["class"] = pred_classes
submit_df.to_csv("submission.csv", index=False)