## Constants

In [5]:
import sys, os
import pandas as pd
import polars as pl
import numpy as np
import subprocess
import gc
import optuna
from datetime import datetime, timezone
import warnings
import xgboost as xgb
import joblib as jl
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import matthews_corrcoef
from mlflow.models import infer_signature
import mlflow
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold

today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
warnings.filterwarnings("ignore")

from hyper_params import (
    mushroom_tuning_2024_08_06_1722934727_params,
)

SEED = 108
random.seed(SEED)
N_FOLDS = 10
# model
is_tunning = True
try:
    rs = subprocess.check_output("nvidia-smi")
    device = "cuda" if rs is not None else "cpu"
except (
    Exception
):  # this command not being found can raise quite a few different errors depending on the configuration
    print("No Nvidia GPU in system!")
    device = "cpu"

best_params = {
    "device": device,
    "verbosity": 0,
    "objective": "binary:logistic",
}
best_params.update(mushroom_tuning_2024_08_06_1722934727_params)
best_params

{'device': 'cuda',
 'verbosity': 0,
 'objective': 'binary:logistic',
 'tree_method': 'hist',
 'eta': 0.0696294726051571,
 'max_depth': 0,
 'min_child_weight': 1,
 'gamma': 0.044230646284796976,
 'subsample': 0.9405269471473167,
 'colsample_bytree': 0.2999355523666192,
 'lambda': 0.9746051811186938,
 'alpha': 4.210861941737071}

## Prepare data

In [6]:
y_train_pkl = jl.load("../y_train.pkl")
X_train_pkl = jl.load("../X_train.pkl")

print(f"train size: {X_train_pkl.shape}")

train size: (3116945, 294)


## CV

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


clf: xgb.XGBClassifier = xgb.XGBClassifier(
    **best_params,
    n_estimators=4000,
    early_stopping_rounds=50,
    enable_categorical=True,
)

In [8]:
from tqdm import tqdm

gc.collect()
skf = StratifiedKFold(n_splits=N_FOLDS)

y_preds = []
y_trues = []
for train_index, test_index in tqdm(skf.split(X_train_pkl, y_train_pkl)):
    X_train, X_test = X_train_pkl[train_index], X_train_pkl[test_index]
    y_train, y_test = y_train_pkl[train_index], y_train_pkl[test_index]

    clf.fit(X=X_train, y=y_train, eval_set=[(X_test, y_test)])

    y_pred = clf.predict(X_test)
    y_preds.append(y_pred)
    y_trues.append(y_test)

    del X_train, X_test, y_train, y_test, y_pred
    gc.collect()
# Concatenate the predictions and true labels
y_preds_concat = np.concatenate(y_preds)
y_trues_concat = np.concatenate(y_trues)
mcc = matthews_corrcoef(y_trues_concat, y_preds_concat)
print(f"Validation mcc score: {mcc}")

0it [00:00, ?it/s]

[0]	validation_0-logloss:0.62811
[1]	validation_0-logloss:0.57452
[2]	validation_0-logloss:0.52929
[3]	validation_0-logloss:0.49793
[4]	validation_0-logloss:0.45847
[5]	validation_0-logloss:0.42435
[6]	validation_0-logloss:0.40119
[7]	validation_0-logloss:0.37332
[8]	validation_0-logloss:0.34690
[9]	validation_0-logloss:0.32693
[10]	validation_0-logloss:0.31008
[11]	validation_0-logloss:0.28907
[12]	validation_0-logloss:0.27194
[13]	validation_0-logloss:0.25579
[14]	validation_0-logloss:0.24169
[15]	validation_0-logloss:0.22852
[16]	validation_0-logloss:0.21696
[17]	validation_0-logloss:0.20686
[18]	validation_0-logloss:0.19673
[19]	validation_0-logloss:0.18649
[20]	validation_0-logloss:0.17683
[21]	validation_0-logloss:0.16652
[22]	validation_0-logloss:0.15795
[23]	validation_0-logloss:0.15191
[24]	validation_0-logloss:0.14348
[25]	validation_0-logloss:0.13709
[26]	validation_0-logloss:0.13271
[27]	validation_0-logloss:0.12603
[28]	validation_0-logloss:0.11981
[29]	validation_0-loglos

1it [01:03, 63.76s/it]

[0]	validation_0-logloss:0.62978
[1]	validation_0-logloss:0.57598
[2]	validation_0-logloss:0.53064
[3]	validation_0-logloss:0.49916


In [None]:
submit_df = jl.load("../submit_df.pkl")
X_test_pkl = jl.load("../X_test.pkl")
lb = jl.load("../lb.pkl")

In [None]:
y_preds = clf.predict(X_test_pkl)
pred_classes = lb.inverse_transform(y_preds)
submit_df["class"] = pred_classes
submit_df.to_csv("submission.csv", index=False)