## Constants

In [None]:
import sys, os
import pandas as pd
import numpy as np
import subprocess

# helpers
sys.path.append("..")
from helpers.loss_functions import *

# data
train_path = "../data/train.csv"
test_path = "../data/test.csv"

# model
is_tunning = True
goal = "binary:logistic"
# Maps each supported objective to the name of the metric reported for it.
objective_dict = {"binary:logistic": {"metric": "MCC"}}

metric = objective_dict[goal]["metric"]

# Probe for an Nvidia GPU *before* building `best_params`, so that the device
# detected here is the one actually stored in the parameter dict.
device = "cpu"
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
    print(f"device: {device}")
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")

# Base XGBoost parameters shared by tuning and final training.
best_params = {
    "objective": goal,
    "device": device,
    # Keep "exact" on CPU; switch to "hist" on GPU, since the XGBoost docs list
    # only the histogram-based tree methods as GPU-capable.
    "tree_method": "exact" if device == "cpu" else "hist",
    "verbosity": 0,
}

## Prepare data

In [None]:
# Load the training split and report its shape.
train = pd.read_csv(filepath_or_buffer=train_path)
print(f"train size: {train.shape}")

# Load the test split and report its shape.
test = pd.read_csv(filepath_or_buffer=test_path)
print(f"test size: {test.shape}")

In [None]:
target = "class"

# Columns that are never model features: the row identifier and the label.
# Both are excluded from *both* column lists so that neither a numeric target
# nor a string-typed id can end up among the features.
non_feature_cols = ["id", target]

# Object-typed feature columns are treated as categorical.
categorical_cols = (
    train.drop(columns=non_feature_cols)
    .select_dtypes(include="object")
    .columns.to_list()
)
for c in categorical_cols:
    train[c] = train[c].astype('category')
    test[c] = test[c].astype('category')

# Numeric-typed feature columns.
numerical_cols = (
    train.drop(columns=non_feature_cols)
    .select_dtypes(include="number")
    .columns.to_list()
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation, stratified on the target
# and seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=[target]),
    train[target],
    stratify=train[target],
    random_state=42,
    test_size=0.2,
)

# The test frame is used as-is.
X_test = test

## Data preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

# `xgb` is required for the DMatrix construction at the end of this cell.
# It is imported here so the cell runs top-to-bottom without depending on a
# later cell having been executed first.
import xgboost as xgb

# Numerical features: fill missing values with the median, then standardise.
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the pipelines into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Pipeline wrapping only the preprocessing step (no model is attached here).
# It is not used below; the transforms call `preprocessor` directly.
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Fit the preprocessing on the training split only, then apply the fitted
# transform to the test and validation splits.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_val_transformed = preprocessor.transform(X_val)

# Binarize the target labels (fitted on the training labels)
lb = LabelBinarizer()
y_train_binarized = lb.fit_transform(y_train)
y_val_binarized = lb.transform(y_val)

# prepare data for training
dtrain = xgb.DMatrix(X_train_transformed, label=y_train_binarized)
dval = xgb.DMatrix(X_val_transformed, label=y_val_binarized)
dtest = xgb.DMatrix(X_test_transformed)

## Hyperparameter tuning

In [None]:
import gc
import optuna
from datetime import datetime, timezone
from sklearn.metrics import mean_absolute_error as mae
import warnings
from sklearn.metrics import r2_score
import xgboost as xgb
import joblib as jl

warnings.filterwarnings("ignore")


def objective(trial):
    """Train one XGBoost model with trial-sampled hyperparameters and score it.

    The sampled values are layered on top of the module-level ``best_params``
    and the model is trained on ``dtrain`` with early stopping against
    ``dval``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The value of ``metric`` on the "eval" set at the last boosting round
        that was run.
    """
    param = {
        **best_params,
        # use exact for small dataset.
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical(
            "booster", ["gbtree", "gblinear", "dart"]
        ),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # Tree-specific hyperparameters are only sampled for tree boosters.
    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical(
            "grow_policy", ["depthwise", "lossguide"]
        )

    # Dropout-specific hyperparameters are only sampled for the dart booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical(
            "sample_type", ["uniform", "weighted"]
        )
        param["normalize_type"] = trial.suggest_categorical(
            "normalize_type", ["tree", "forest"]
        )
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    evals_result = {}
    xgb.train(
        # Train with the trial's sampled parameters; passing `best_params`
        # here would make every trial fit the same model.
        params=param,
        dtrain=dtrain,
        num_boost_round=10000,
        evals=[(dval, "eval")],
        feval=mcc_metric,
        # The study maximises this metric, so early stopping must do the same;
        # XGBoost does not infer "higher is better" for a custom metric name.
        maximize=True,
        evals_result=evals_result,
        early_stopping_rounds=200,
    )
    return evals_result["eval"][metric][-1]


if is_tunning:
    # UTC date stamp and epoch seconds taken at the start of the run
    # (neither is used further in this block).
    today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
    curr_timestamp = int(datetime.now(timezone.utc).timestamp())

    # Open the SQLite-backed study, reusing it when one already exists.
    study_name = f"study"
    study = optuna.create_study(
        direction="maximize",
        load_if_exists=True,
        storage=f"sqlite:///{study_name}.db",
        study_name=study_name,
    )
    study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)

    # Report the best trial and the parameters it sampled.
    best_trial = study.best_trial
    print("Best trial:")
    print(f" {metric}:", best_trial.value)
    print("  Params: ")
    for param_name, param_value in best_trial.params.items():
        print("    {}: {}".format(param_name, param_value))

    # Merge the tuned values into the shared parameter dict and save it to disk.
    study_best_params = study.best_params
    best_params.update(study_best_params)
    jl.dump(best_params, "best_params.pkl")

## Train best model

In [None]:
from sklearn.metrics import matthews_corrcoef
import xgboost as xgb
from typing import Tuple

print("Training best model...")
# Filled in by xgb.train with the per-round metric values on the "eval" set.
evals_best_result = {}
# Train the final booster with `best_params` (including any values merged in
# by the tuning step), using early stopping against the validation set.
model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=10000,
    evals=[(dval, "eval")],
    feval=mcc_metric,
    # The metric is treated as higher-is-better (the tuning study maximises
    # it), so early stopping must maximise it too; XGBoost does not infer this
    # for a custom metric name.
    maximize=True,
    evals_result=evals_best_result,
    early_stopping_rounds=200,
)

## Model eval

In [None]:
# Metric values recorded on the "eval" set, one entry per boosting round.
metric_history = evals_best_result["eval"][metric]
# Report the value from the last round that was run.
mcc = metric_history[-1]
print(f"correlation coefficient: {mcc}")

## Update Submission

In [None]:
# NOTE(review): `matthews_corrcoef_score` comes from the helpers package;
# presumably it returns (score, predicted labels) and, with no ground truth
# passed (None), only the labels are meaningful - confirm against the helper.
_, classes = matthews_corrcoef_score(model, dtest, None, lb)

# Build the submission frame from the test ids, then attach the predictions.
submit_df = test[["id"]].copy()
submit_df["class"] = classes
submit_df.to_csv("submission.csv", index=False)