## Constants

In [1]:
import sys, os
import pandas as pd
import numpy as np
import subprocess

sys.path.append("..")
from helpers.loss_functions import *

train_path = "../data/train.csv"
test_path = "../data/test.csv"
is_tunning = False
device = "cpu"
goal = "binary"
objective_mapping = {
    "regression": {"metric": "rmse"},
    "binary": {"metric": "binary_logloss"},
}
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
    print(f"device: {device}")
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")

device: cuda


## Prepare data

In [2]:
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
test = pd.read_csv(test_path)
print(f"test size: {test.shape}")

train size: (3116945, 22)
test size: (2077964, 21)


In [3]:
target = "class"

categorical_cols = train.drop(columns=target).select_dtypes(include="object").columns.to_list()
for c in categorical_cols:
    train[c] = train[c].astype('category')
    test[c] = test[c].astype('category')
numerical_cols = train.drop(columns="id").select_dtypes(include="number").columns.to_list()

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=target),
    train[target],
    test_size=0.2,
    random_state=42,
    stratify=train[target],
)
X_test = test

## Data preprocessing

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

# Create the numerical and categorical pipelines
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Create the full pipeline with the XGBoost model
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Preprocess the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_val_transformed = preprocessor.transform(X_val)

# Binarize the target labels
lb = LabelBinarizer()
y_train_binarized = lb.fit_transform(y_train)
y_val_binarized = lb.transform(y_val)

## Hyperparamters tuning

In [6]:
# import gc
# import optuna
# from datetime import datetime, timezone
# from sklearn.metrics import mean_absolute_error as mae
# import warnings
# from sklearn.metrics import r2_score
# import xgboost as xgb
# import joblib as jl

# warnings.filterwarnings("ignore")

# metric = objective_mapping[goal]["metric"]
# best_params = {
#     "objective": goal,
#     "metric": metric,
#     "boosting_type": "gbdt",
#     "device": device,
# }

# dtrain = xgb.DMatrix(X_train, label=y_train)
# dvalid = xgb.DMatrix(X_val, label=y_val)


# def mcc_eval(y_pred, y_true):
#     from sklearn.metrics import matthews_corrcoef

#     y_pred_binary = (y_pred > 0.5).astype(int)
#     mcc = matthews_corrcoef(y_true, y_pred_binary)
#     return "mcc", mcc, True  # True indicates higher values are betters


# def objective(trial):
#     param = {
#         "verbosity": 0,
#         "objective": "binary:logistic",
#         # use exact for small dataset.
#         "tree_method": "exact",
#         # defines booster, gblinear for linear functions.
#         "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
#         # L2 regularization weight.
#         "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
#         # L1 regularization weight.
#         "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
#         # sampling ratio for training data.
#         "subsample": trial.suggest_float("subsample", 0.2, 1.0),
#         # sampling according to each tree.
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
#     }

#     if param["booster"] in ["gbtree", "dart"]:
#         # maximum depth of the tree, signifies complexity of the tree.
#         param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
#         # minimum child weight, larger the term more conservative the tree.
#         param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
#         param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
#         # defines how selective algorithm is.
#         param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
#         param["grow_policy"] = trial.suggest_categorical(
#             "grow_policy", ["depthwise", "lossguide"]
#         )

#     if param["booster"] == "dart":
#         param["sample_type"] = trial.suggest_categorical(
#             "sample_type", ["uniform", "weighted"]
#         )
#         param["normalize_type"] = trial.suggest_categorical(
#             "normalize_type", ["tree", "forest"]
#         )
#         param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
#         param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

#     bst = xgb.train(param, dtrain)
#     # Fit the model with evaluation set
#     model_pipeline.named_steps['classifier'].fit(
#         model_pipeline.named_steps['preprocessor'].fit_transform(X_train), y_train_binarized,
#         eval_set=[(model_pipeline.named_steps['preprocessor'].transform(X_val), y_val_binarized)],
#     )
#     preds = bst.predict(dvalid)
#     pred_labels = np.rint(preds)
#     accuracy = mcc_eval(pred_labels, y_val)
#     return accuracy


# if is_tunning:
#     # Create or load a study
#     today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
#     curr_timestamp = int(datetime.now(timezone.utc).timestamp())
#     study_name = f"study"
#     study = optuna.create_study(
#         study_name=study_name,
#         storage=f"sqlite:///{study_name}.db",
#         direction="maximize",
#         load_if_exists=True,
#     )
#     study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)
#     # Print best trial
#     best_trial = study.best_trial
#     print("Best trial:")
#     print(f"  {metric}_error:", best_trial.value)
#     print("  Params: ")
#     for key, value in best_trial.params.items():
#         print("    {}: {}".format(key, value))
#     study_best_params = study.best_params
#     best_params.update(study_best_params)
#     jl.dump(best_params, "best_params.pkl")

## Train best model

In [7]:
from sklearn.metrics import matthews_corrcoef
import xgboost as xgb
from typing import Tuple


# Custom MCC metric function for XGBoost
def mcc_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Custom Matthews correlation coefficient metric for XGBoost."""
    y_true = dtrain.get_label()
    y_pred_binary = np.round(predt).astype(int)
    mcc = matthews_corrcoef(y_true, y_pred_binary)
    return "MCC", mcc


best_params = {
    "objective": "binary:logistic",
    "device": device,
    # "boosting_type": "gbdt",
    # "lambda_l1": 0.11019458627539475,
    # "lambda_l2": 0.1554907572643459,
    # "num_leaves": 256,
    # "learning_rate": 0.010305818596596942,
    # "feature_fraction": 0.6278911649411257,
    # "bagging_fraction": 0.6910499222495768,
    # "bagging_freq": 9,
    # "min_child_samples": 37,
    # "reg_alpha": 0.0037881112342830815,
    # "reg_lambda": 0.9245246490277238,
    # "max_depth": 25,
}
print("Training best model...")
# final_model = lgb.train(
#     params=best_params,
#     train_set=train_data,
#     valid_sets=[valid_data],
#     feval=mcc_eval,
#     num_boost_round=4000,
#     callbacks=[
#         lgb.early_stopping(stopping_rounds=100, verbose=True),
#         lgb.log_evaluation(period=50, show_stdv=True),
#     ],
# )
# feature_importances = final_model.feature_importance()
# y_pred = final_model.predict(X_test)
# Fit the model
# Preprocess the data

# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)
# Create the full pipeline with the XGBoost model
model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=10000,
    evals=[(dval, "eval")],
    feval=mcc_metric,
    early_stopping_rounds=200,
    # evals_result=[(dval, "eval"), (dtrain, "train")],
)
# params=best_params,
# dtrain=dtrain,
# num
# n_estimators=10000,
# enable_categorical=True,
# eval_metric=matthews_corrcoef,
# device="cuda",

Training best model...




[0]	eval-logloss:0.57929	eval-MCC:0.60803
[1]	eval-logloss:0.48450	eval-MCC:0.36222
[2]	eval-logloss:0.42824	eval-MCC:0.35739
[3]	eval-logloss:0.39800	eval-MCC:0.24317
[4]	eval-logloss:0.33541	eval-MCC:0.16723
[5]	eval-logloss:0.31740	eval-MCC:0.12601
[6]	eval-logloss:0.29994	eval-MCC:0.11413
[7]	eval-logloss:0.26858	eval-MCC:0.11890
[8]	eval-logloss:0.25104	eval-MCC:0.11321
[9]	eval-logloss:0.23430	eval-MCC:0.09814
[10]	eval-logloss:0.22278	eval-MCC:0.09234
[11]	eval-logloss:0.21312	eval-MCC:0.06666
[12]	eval-logloss:0.20076	eval-MCC:0.06126
[13]	eval-logloss:0.19400	eval-MCC:0.05688
[14]	eval-logloss:0.18587	eval-MCC:0.05340
[15]	eval-logloss:0.17372	eval-MCC:0.05193
[16]	eval-logloss:0.16710	eval-MCC:0.04900
[17]	eval-logloss:0.16016	eval-MCC:0.05081
[18]	eval-logloss:0.15068	eval-MCC:0.05046
[19]	eval-logloss:0.14747	eval-MCC:0.04573
[20]	eval-logloss:0.14397	eval-MCC:0.04197
[21]	eval-logloss:0.13580	eval-MCC:0.03596
[22]	eval-logloss:0.13306	eval-MCC:0.03368
[23]	eval-logloss:0.1

## Model eval

In [8]:
# def matthews_corrcoef_score(model, data, y_true):
#     from sklearn.metrics import matthews_corrcoef

#     y_pred_prob = model.predict(data)
#     y_pred_binary = np.round(y_pred_prob).astype(int)
#     try:
#         classes = lb.inverse_transform(y_pred_binary)
#     except Exception as e:
#         classes = None
#         print(f"Failed to convert y_pred to original form. Detail: {e}")
#     try:
#         mcc = matthews_corrcoef(y_true, y_pred_binary)
#     except Exception as e:
#         mcc = None
#         print(f"Failed to calculate matthews_corrcoef. Detail: {e}")
#     return mcc, classes

In [10]:
from helpers.loss_functions import *

mcc, _ = matthews_corrcoef_score(model, dval, y_val_binarized, lb)
print(f"correlation coefficient: {mcc}")

correlation coefficient: 0.9831513004841934


## Update Submission

In [12]:
dtest = xgb.DMatrix(X_test_transformed)
_, classes = matthews_corrcoef_score(model, dtest, None, lb)

submit_df = pd.DataFrame()
submit_df["id"] = test["id"]
submit_df["class"] = classes
submit_df.to_csv("submission.csv", index=False)

Failed to calculate matthews_corrcoef. Detail: The 'y_true' parameter of matthews_corrcoef must be an array-like. Got None instead.
