## Constants

In [1]:
import sys, os
import pandas as pd
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
from sklearn.metrics import mean_absolute_error as mae
import warnings
from sklearn.metrics import r2_score
import xgboost as xgb
import joblib as jl
today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
warnings.filterwarnings("ignore")

# helpers
sys.path.append("..")
from helpers.loss_functions import *

# data
train_path = "../data/train.csv"
test_path = "../data/test.csv"

# model
is_tunning = True
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
    print(f"device: {device}")
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")
    device = "cpu"
goal = "binary:logistic"

# custom metric
# objective_dict = {
#     "binary:logistic": {
#         "metric": {"is_custom": True, "name": "MCC", "fval": mcc_metric}
#     }
# }

objective_dict = {
    "binary:logistic": {"metric": {"is_custom": False, "name": "logloss", "fval": None}}
}
metric = objective_dict[goal]["metric"]["name"]
is_custom_metric = objective_dict[goal]["metric"]["is_custom"]
fval = objective_dict[goal]["metric"]["fval"]
best_params = {
    "objective": goal,
    "device": device,
    "verbosity": 0,
}

device: cuda


## Prepare data

In [2]:
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
test = pd.read_csv(test_path)
print(f"test size: {test.shape}")

In [None]:
target = "class"

categorical_cols = train.drop(columns=target).select_dtypes(include="object").columns.to_list()
for c in categorical_cols:
    train[c] = train[c].astype('category')
    test[c] = test[c].astype('category')
numerical_cols = train.drop(columns="id").select_dtypes(include="number").columns.to_list()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=target),
    train[target],
    test_size=0.2,
    random_state=42,
    stratify=train[target],
)
X_test = test

## Data preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

# Create the numerical and categorical pipelines
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Create the full pipeline with the XGBoost model
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Preprocess the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_val_transformed = preprocessor.transform(X_val)

# Binarize the target labels
lb = LabelBinarizer()
y_train_binarized = lb.fit_transform(y_train)
y_val_binarized = lb.transform(y_val)

# prepare data for training
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)

## Hyperparamters tuning

In [None]:
# import gc
# import optuna
# from datetime import datetime, timezone
# from sklearn.metrics import mean_absolute_error as mae
# import warnings
# from sklearn.metrics import r2_score
# import xgboost as xgb
# import joblib as jl

# warnings.filterwarnings("ignore")


# def objective(trial):
#     hyper_parameters = {
#         **best_params,
#         **{
#             "eta": trial.suggest_float("eta", 0.01, 0.3),
#             "max_depth": trial.suggest_int("max_depth", 3, 10),
#             "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#             "gamma": trial.suggest_float("gamma", 0, 5),
#             "subsample": trial.suggest_float("subsample", 0.5, 1),
#             "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1),
#             "lambda": trial.suggest_float("lambda", 0, 10),
#             "alpha": trial.suggest_float("alpha", 0, 10),
#         },
#     }
#     evals_result = {}
#     if is_custom_metric:
#         xgb.train(
#             params=hyper_parameters,
#             dtrain=dtrain,
#             num_boost_round=10000,
#             evals=[(dval, "eval")],
#             feval=fval,
#             evals_result=evals_result,
#             early_stopping_rounds=200,
#         )
#     else:
#         xgb.train(
#             params=hyper_parameters,
#             dtrain=dtrain,
#             num_boost_round=10000,
#             evals=[(dval, "eval")],
#             evals_result=evals_result,
#             early_stopping_rounds=200,
#         )
#     return evals_result["eval"][metric][-1]


# if is_tunning:
#     # Create or load a study
#     today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
#     curr_timestamp = int(datetime.now(timezone.utc).timestamp())
#     study_name = f"study_{today}"
#     study = optuna.create_study(
#         study_name=study_name,
#         storage=f"sqlite:///{study_name}.db",
#         direction="minimize",
#         load_if_exists=True,
#     )
#     study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)
#     # Print best trial
#     best_trial = study.best_trial
#     print("Best trial:")
#     print(f" {metric}:", best_trial.value)
#     print("  Params: ")
#     for key, value in best_trial.params.items():
#         print("    {}: {}".format(key, value))
#     study_best_params = study.best_params
#     best_params.update(study_best_params)
#     jl.dump(best_params, "best_params.pkl")
#     # 0.03734

## Train best model

In [None]:
from sklearn.metrics import matthews_corrcoef
import xgboost as xgb
from typing import Tuple

tunned_params = {
    "eta": 0.018087779785882732,
    "max_depth": 10,
    "min_child_weight": 2,
    "gamma": 0.42570860420610934,
    "subsample": 0.74464089552046,
    "colsample_bytree": 0.6449797444444113,
    "lambda": 0.4134501484785982,
    "alpha": 6.520908679019516,
}
best_params.update(tunned_params)
print("Training best model...")
evals_best_result = {}
jl.dump(best_params, f"param_{today}")
# Create the full pipeline with the XGBoost model
if is_custom_metric:
    print("Training with custom metric")
    model = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=10000,
        evals=[(dval, "eval")],
        feval=fval,
        evals_result=evals_best_result,
        early_stopping_rounds=100,
    )
else:
    print("Training with original metric")
    model = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=10000,
        evals=[(dval, "eval")],
        evals_result=evals_best_result,
        early_stopping_rounds=200,
    )

Training best model...
Training with original metric
[0]	eval-logloss:0.57929
[1]	eval-logloss:0.48450
[2]	eval-logloss:0.42824
[3]	eval-logloss:0.39800
[4]	eval-logloss:0.33541
[5]	eval-logloss:0.31740
[6]	eval-logloss:0.29994
[7]	eval-logloss:0.26858
[8]	eval-logloss:0.25104
[9]	eval-logloss:0.23430
[10]	eval-logloss:0.22278
[11]	eval-logloss:0.21312
[12]	eval-logloss:0.20076
[13]	eval-logloss:0.19400
[14]	eval-logloss:0.18587
[15]	eval-logloss:0.17372
[16]	eval-logloss:0.16710
[17]	eval-logloss:0.16016
[18]	eval-logloss:0.15068
[19]	eval-logloss:0.14747
[20]	eval-logloss:0.14397
[21]	eval-logloss:0.13580
[22]	eval-logloss:0.13306
[23]	eval-logloss:0.12756
[24]	eval-logloss:0.12139
[25]	eval-logloss:0.11810
[26]	eval-logloss:0.11419
[27]	eval-logloss:0.11107
[28]	eval-logloss:0.10820
[29]	eval-logloss:0.10569
[30]	eval-logloss:0.10158
[31]	eval-logloss:0.09559
[32]	eval-logloss:0.09306
[33]	eval-logloss:0.08957
[34]	eval-logloss:0.08814
[35]	eval-logloss:0.08655
[36]	eval-logloss:0.0

## Model eval

In [None]:
mcc = min(evals_best_result["eval"][metric])
print(f"correlation coefficient: {mcc}")

correlation coefficient: 0.03844491021200418


## Update Submission

In [None]:
dtest = xgb.DMatrix(X_test_transformed)
_, classes = matthews_corrcoef_score(model, dtest, None, lb)

submit_df = pd.DataFrame()
submit_df["id"] = test["id"]
submit_df["class"] = classes
submit_df.to_csv("submission.csv", index=False)

Failed to calculate matthews_corrcoef. Detail: The 'y_true' parameter of matthews_corrcoef must be an array-like. Got None instead.
