## Constants

In [None]:
import sys, os
import pandas as pd
import polars as pl
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
import warnings
import xgboost as xgb
import joblib as jl
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import matthews_corrcoef
from mlflow.models import infer_signature
import mlflow
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")
# Current UTC date rendered as YYYY_MM_DD.
today = f"{datetime.now(tz=timezone.utc):%Y_%m_%d}"

from hyper_params import (
    mushroom_tuning_2024_08_06_1722934727_params,
)

# helpers


# Reproducibility and cross-validation settings.
SEED = 108
random.seed(SEED)
N_FOLDS = 7

# Data locations, relative to the working directory.
train_path = "../../data/mushrooms/train.csv"
test_path = "../../data/mushrooms/test.csv"
cache_path = "../../data/mushrooms/cache"

# Model settings.
is_tunning = True

# Choose the compute device by probing for a working `nvidia-smi`.
# `check_output` either returns the tool's output or raises, so reaching the
# `else` branch is the success signal; the returned bytes are never None.
try:
    rs = subprocess.check_output("nvidia-smi")
except (OSError, subprocess.SubprocessError):
    # OSError: binary missing or not executable on this machine.
    # SubprocessError: the tool ran but exited non-zero (e.g. no usable driver).
    print("No Nvidia GPU in system!")
    device = "cpu"
else:
    device = "cuda"
goal = "binary:logistic"

# Metric registry keyed by objective name. Each entry records the metric's
# name, whether it is a custom callable (`is_custom` / `fval`) and the
# direction in which it should be optimised.
objective_dict = {
    "binary:logistic": dict(
        metric=dict(is_custom=False, name="logloss", fval=None),
        direction="minimize",
    ),
}
# Alternative entry that scores with a custom MCC callable instead of logloss.
# `mcc_metric_v2` has to be defined or imported before enabling it.
# objective_dict = {
#     "binary:logistic": {
#         "metric": {
#             "is_custom": True,
#             "name": "MCC",
#             "fval": mcc_metric_v2,
#         },
#         "direction": "maximize",
#     }
# }

# Unpack the entry selected by `goal` into flat module-level settings.
_goal_cfg = objective_dict[goal]
_metric_cfg = _goal_cfg["metric"]
metric, is_custom_metric, fval = (
    _metric_cfg[key] for key in ("name", "is_custom", "fval")
)
direction = _goal_cfg["direction"]

# Start from the fixed settings, then layer the imported tuned
# hyper-parameters on top; on a key clash the tuned value replaces the fixed one.
best_params = dict(device=device, verbosity=0, objective=goal)
best_params.update(mushroom_tuning_2024_08_06_1722934727_params)
# Bare expression so the notebook displays the merged parameters.
best_params

## Prepare data

In [None]:
# Read the raw train and test tables and report their shapes.
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
X_test = pd.read_csv(test_path)
print(f"test size: {X_test.shape}")

# The submission frame starts with just the test ids; predictions are added later.
submit_df = X_test[["id"]].copy()

# Separate the label from the features and remove the id column from both
# feature frames. `train` itself is left untouched.
y_train = train["class"]
X_train = train.drop(columns=["id", "class"])
X_test = X_test.drop(columns=["id"])

In [None]:
target = "class"

# Derive both column lists from the same frame with the row id AND the label
# removed, so neither can end up listed as a feature regardless of its dtype.
# (Previously the categorical list kept `id` and the numerical list kept the
# target, relying on their dtypes to filter them out.)
_feature_frame = train.drop(columns=["id", target])
categorical_cols = _feature_frame.select_dtypes(include="object").columns.to_list()
# Disabled alternative: cast to pandas "category" dtype instead of encoding.
# for c in categorical_cols:
#     train[c] = train[c].astype("category")
#     X_test[c] = X_test[c].astype("category")
numerical_cols = _feature_frame.select_dtypes(include="number").columns.to_list()
# Drop the temporary frame before collecting so its memory can be reclaimed.
del _feature_frame
gc.collect()

## Data preprocessing

In [4]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

# Keep only the `n` most frequent levels of each categorical feature and fold
# every other value into a single "other" level.
#
# The kept levels come from the TRAINING data only and are applied to both
# frames. Computing a separate top-n for the test set would let the same raw
# level be kept in one frame and collapsed in the other, so the encoded
# features would not mean the same thing at fit and predict time.
#
# Missing values are never among the kept levels (`value_counts` skips them),
# so they are folded into "other" as well.
n = 10
for c in categorical_cols:
    train_mode_values = X_train[c].value_counts()[:n].index.tolist()
    for frame in (X_train, X_test):
        frame.loc[~frame[c].isin(train_mode_values), c] = "other"


# Numerical features: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        # ("minmax", MinMaxScaler()),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode. Levels unseen at fit time are encoded as all zeros
# (`handle_unknown="ignore"`).
# `sparse_output=False` is required: this pipeline is switched to DataFrame
# output via `set_output`, and OneHotEncoder raises a ValueError when asked
# for DataFrame output while still producing a sparse matrix.
categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        # ("ordinal", OrdinalEncoder()),
    ]
)

# Route each column group through its own pipeline. Columns in neither list
# are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Preprocessing-only pipeline; no estimator is attached here.
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)
# Have every step of the pipeline return polars DataFrames.
data_pipeline.set_output(transform="polars")

# Fit the preprocessing on train and test rows stacked together, then
# transform each frame separately.
# NOTE(review): imputation values, scaling statistics and one-hot levels are
# therefore learned from test rows too - confirm this is intended.
X_all = pd.concat([X_train, X_test], axis=0)
_combined_polars = pl.from_pandas(X_all)
data_pipeline.fit(_combined_polars)
X_train_transformed = data_pipeline.transform(X_train)
X_test_transformed = data_pipeline.transform(X_test)

# Encode the target labels; `lb` is kept so predictions can be mapped back to
# the original class labels.
lb = LabelBinarizer().fit(y_train)
y_train_binarized = lb.transform(y_train)

# Persist the preprocessed data and the fitted label encoder to the current
# working directory with joblib.
print("Exporting to pickle...")
for _artifact, _filename in (
    (X_train_transformed, "X_train.pkl"),
    (y_train_binarized, "y_train.pkl"),
    (X_test_transformed, "X_test.pkl"),
    (submit_df, "submit_df.pkl"),
    (lb, "lb.pkl"),
):
    jl.dump(_artifact, _filename)

gc.collect()

## CV

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


# Settings fixed for the cross-validation run, passed alongside the tuned
# hyper-parameters. A key present in both raises a TypeError (duplicate
# keyword argument) rather than being silently overridden.
#   n_estimators          - number of boosting rounds requested
#   early_stopping_rounds - early-stopping patience, in rounds
#   enable_categorical    - NOTE(review): features are one-hot encoded
#                           upstream, so this likely has no effect - confirm
_cv_fit_settings = dict(
    n_estimators=4000,
    early_stopping_rounds=50,
    enable_categorical=True,
)
clf: xgb.XGBClassifier = xgb.XGBClassifier(**best_params, **_cv_fit_settings)

In [None]:
from tqdm import tqdm

# Stratified K-fold cross-validation of `clf`, scored with the Matthews
# correlation coefficient over the pooled out-of-fold predictions.
skf = StratifiedKFold(n_splits=N_FOLDS)

# kf = KFold(n_splits=N_FOLDS)
y_preds = []
y_trues = []
fold_splits = skf.split(X_train_transformed, y_train_binarized)
# `split` returns a generator with no length, so tqdm is given the fold count
# explicitly in order to show real progress.
for train_index, test_index in tqdm(fold_splits, total=skf.get_n_splits()):
    # Fold-local names: the notebook-level X_train / X_test / y_train must not
    # be overwritten with fold slices.
    X_fold_train = X_train_transformed[train_index]
    X_fold_valid = X_train_transformed[test_index]
    y_fold_train = y_train_binarized[train_index]
    y_fold_valid = y_train_binarized[test_index]

    # The held-out fold is passed as `eval_set` and is also the data scored
    # below.
    # NOTE(review): if early stopping is active on `clf`, the held-out fold
    # influences the stopping point, so the score is mildly optimistic.
    clf.fit(
        X=X_fold_train,
        y=y_fold_train,
        eval_set=[(X_fold_valid, y_fold_valid)],
    )

    y_preds.append(clf.predict(X_fold_valid))
    y_trues.append(y_fold_valid)

# Pool the out-of-fold predictions and labels (flattened to 1-D) and score
# them once. After the loop, `clf` holds the model fitted on the last fold.
y_preds_concat = np.concatenate(y_preds).ravel()
y_trues_concat = np.concatenate(y_trues).ravel()
mcc = matthews_corrcoef(y_trues_concat, y_preds_concat)
print(f"Validation mcc score: {mcc}")

In [None]:
# Predict the test set, map the predictions back to the original class labels
# and write the submission file.
# NOTE(review): `clf` is whichever model was fitted last. After the CV cell
# that is the final fold's model, trained on only part of the training data -
# consider refitting on all rows or combining the fold models.
y_preds = clf.predict(X_test_transformed)
# Undo the label binarization so the file contains the original class labels.
pred_classes = lb.inverse_transform(y_preds)
submit_df["class"] = pred_classes
# index=False: write only the columns held in `submit_df`, not the row index.
submit_df.to_csv("submission.csv", index=False)