In [None]:
!conda install -c conda-forge py-xgboost-gpu

In [1]:
import pandas as pd
import numpy as np
import subprocess

# Run-wide switches.
is_tunning = True  # when True, the Optuna search block further down is executed
goal = "binary"
device = "cpu"  # default; switched to "gpu" below when nvidia-smi runs successfully

# Evaluation metric associated with each supported goal.
objective_mapping = dict(
    regression={"metric": "rmse"},
    binary={"metric": "binary_logloss"},
)

# Probe for an NVIDIA GPU by invoking nvidia-smi.
try:
    rs = subprocess.check_output("nvidia-smi")
except Exception:
    # Deliberately broad: a missing or failing nvidia-smi can surface as
    # several different error types depending on the configuration.
    print("No Nvidia GPU in system!")
else:
    device = "gpu"
    print(f"device: {device}")

No Nvidia GPU in system!


In [2]:
# Read the training set and the test set from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
from sklearn.model_selection import train_test_split

# Name of the label column in the training frame.
target = "class"

# The test frame is used as-is for prediction features.
X_test = test

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=[target]),
    train[target],
    test_size=0.2,
    random_state=42,
)

In [4]:
# Object-dtype feature columns of the training frame (target excluded).
categorical_columns = list(
    train.drop(columns=[target]).select_dtypes(include=["object"]).columns
)
# Numeric columns of the training frame other than "id".
# NOTE(review): no later cell shown here reads `numerical_cols`.
numerical_cols = list(
    train.drop(columns=["id"]).select_dtypes(include=["number"]).columns
)

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. `handle_unknown="ignore"` lets categories that were not
# seen while fitting pass through transform() instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine the preprocessing steps into a ColumnTransformer.
# NOTE(review): only the categorical columns are routed to a transformer;
# assuming ColumnTransformer's default `remainder` setting, every other column
# (including the numeric features) is dropped -- confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, categorical_columns)]
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor)])


def _to_dense(encoded):
    """Return ``encoded`` as a dense array, whether it arrived sparse or dense."""
    return encoded.toarray() if hasattr(encoded, "toarray") else encoded


# Fit on the training split only and reuse that single pass as the transformed
# training matrix; the validation and test splits are transformed with the
# already-fitted pipeline.
X_train_transformed = _to_dense(pipeline.fit_transform(X_train))
X_test_transformed = _to_dense(pipeline.transform(X_test))
X_val_transformed = _to_dense(pipeline.transform(X_val))

# One-hot output column names, looked up once and shared by all three frames.
feature_names = (
    pipeline.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .named_steps["onehot"]
    .get_feature_names_out()
)

# Convert the transformed features to DataFrames for easier handling.
# NOTE: this rebinds X_train / X_test / X_val to the encoded frames, so the
# split cell has to be re-run before this cell can be executed again.
X_train = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test = pd.DataFrame(X_test_transformed, columns=feature_names)
X_val = pd.DataFrame(X_val_transformed, columns=feature_names)

In [6]:
from sklearn.preprocessing import LabelBinarizer

# Encode the target variable
# Learn the label mapping from the training labels only, then apply that same
# mapping to the training and validation labels.
label_encoder = LabelBinarizer().fit(y_train)
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)

In [7]:
# Inspect the encoded validation labels produced by the label binarizer.
y_val

array([[1],
       [1],
       [0],
       ...,
       [1],
       [0],
       [1]])

In [8]:
import gc
import optuna
import lightgbm as lgb
from datetime import datetime, timezone
from sklearn.metrics import mean_absolute_error as mae
import warnings
from sklearn.metrics import r2_score


# Silence every Python warning from here on.
warnings.filterwarnings("ignore")

# Metric that goes with the configured goal.
metric = objective_mapping[goal]["metric"]

# Fixed parameters shared by every training run; tuned values are merged into
# this dict after the Optuna search.
best_params = dict(
    objective=goal,
    metric=metric,
    boosting_type="gbdt",
    device=device,
)


def _make_dataset(features, labels):
    """Wrap features and labels in a LightGBM Dataset with verbosity -1 that keeps its raw data."""
    return lgb.Dataset(
        data=features,
        label=labels,
        free_raw_data=False,
        params={"verbose": -1},
    )


# LightGBM datasets for the training and validation splits.
train_data = _make_dataset(X_train, y_train)
valid_data = _make_dataset(X_val, y_val)


def objective(trial):
    """Optuna objective: train LightGBM with sampled hyper-parameters.

    The sampled values are layered on top of the module-level ``best_params``
    and the model is trained on ``train_data`` with early stopping against
    ``valid_data``.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The best validation score recorded for ``metric`` (to be minimized).
    """
    # `reg_alpha` / `reg_lambda` are LightGBM aliases of `lambda_l1` /
    # `lambda_l2`. Sampling both spellings only adds search dimensions that
    # training ignores, so each regularizer is sampled once under its primary name.
    sampled = {
        "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 0.6),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 0.6),
        "num_leaves": trial.suggest_int("num_leaves", 100, 256),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 0.9),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 0.9),
        "bagging_freq": trial.suggest_int("bagging_freq", 8, 16),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "max_depth": trial.suggest_int("max_depth", 20, 64),
    }
    hyper_parameters = {**best_params, **sampled}

    model = lgb.train(
        params=hyper_parameters,
        train_set=train_data,
        valid_sets=[valid_data],
        num_boost_round=4000,
        callbacks=[
            # Stop once the validation score has not improved for 100 rounds.
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(period=50, show_stdv=True),
        ],
    )
    # Try to minimize the loss metric.
    return model.best_score["valid_0"][metric]


if is_tunning:
    # Date stamp and epoch seconds of this run (UTC).
    today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
    curr_timestamp = int(datetime.now(timezone.utc).timestamp())

    # Open the SQLite-backed study, loading it when it already exists.
    study_name = "study"
    study = optuna.create_study(
        direction="minimize",
        study_name=study_name,
        storage=f"sqlite:///{study_name}.db",
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)

    # Report the best trial.
    best_trial = study.best_trial
    print("Best trial:")
    print(f"  {metric}_error:", best_trial.value)
    print("  Params: ")
    for param_name, param_value in best_trial.params.items():
        print(f"    {param_name}: {param_value}")

    # Merge the tuned values into the shared parameter dict.
    study_best_params = study.best_params
    best_params.update(study_best_params)

In [9]:
# Define custom MCC evaluation function
def mcc_eval(y_pred, train_data):
    """Custom evaluation function reporting the Matthews correlation coefficient.

    Args:
        y_pred: Predicted scores; values above 0.5 are counted as class 1.
        train_data: Dataset being evaluated; its labels are read via ``get_label()``.

    Returns:
        The tuple ``("mcc", value, True)``; the trailing ``True`` indicates
        that higher values are better.
    """
    from sklearn.metrics import matthews_corrcoef

    hard_labels = (y_pred > 0.5).astype(int)
    score = matthews_corrcoef(train_data.get_label(), hard_labels)
    return "mcc", score, True


# Hard-coded parameters for the final model. This rebinds `best_params`, so any
# values merged in by the tuning cell are replaced by the literals below.
# `reg_alpha` / `reg_lambda` are LightGBM aliases of `lambda_l1` / `lambda_l2`;
# the previous alias entries were ignored in favour of the primary names, so
# only the primary names are listed.
best_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "lambda_l1": 0.11019458627539475,
    "lambda_l2": 0.1554907572643459,
    "num_leaves": 256,
    "learning_rate": 0.010305818596596942,
    "feature_fraction": 0.6278911649411257,
    "bagging_fraction": 0.6910499222495768,
    "bagging_freq": 9,
    "min_child_samples": 37,
    "max_depth": 25,
}
print("Training best model...")
final_model = lgb.train(
    params=best_params,
    train_set=train_data,
    valid_sets=[valid_data],
    feval=mcc_eval,  # report MCC alongside the built-in binary_logloss metric
    num_boost_round=4000,
    callbacks=[
        # Stop once the validation scores have not improved for 100 rounds.
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(period=50, show_stdv=True),
    ],
)
feature_importances = final_model.feature_importance()
# Predicted scores for the test features.
y_pred = final_model.predict(X_test)

Training best model...
[LightGBM] [Info] Number of positive: 1364404, number of negative: 1129152
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.121257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 316
[LightGBM] [Info] Number of data points in the train set: 2493556, number of used features: 158
Training until validation scores don't improve for 100 rounds
[50]	valid_0's binary_logloss: 0.0517747	valid_0's mcc: 0.974335
[100]	valid_0's binary_logloss: 0.0482139	valid_0's mcc: 0.976287
[150]	valid_0's binary_logloss: 0.0475338	valid_0's mcc: 0.976787
[200]	valid_0's binary_logloss: 0.0472676	valid_0's mcc: 0.977021
[250]	valid_0's binary_logloss: 0.0471333	valid_0's mcc: 0.977147
[300]	valid_0's binary_logloss: 0.0470655	valid_0's mcc: 0.977282
[350]	valid_0's binary_logloss: 0.0470469	valid_0's mcc: 0.977298
[400]	valid_0's binary_l

In [15]:
# `study` is created only inside the `if is_tunning:` branch of the tuning
# cell, so this lookup needs that branch to have run in the current kernel.
study.best_params

NameError: name 'study' is not defined

In [12]:
# Score the test features and threshold at 0.5 to obtain 0/1 predictions.
y_pred_prob = final_model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)
# Map the 0/1 predictions back to the original class labels in one vectorized
# call rather than one `inverse_transform` call per row.
classes = label_encoder.inverse_transform(y_pred).tolist()

array([0, 1, 1, ..., 1, 0, 0])

In [14]:
# Assemble the submission (test ids next to the predicted class labels) and
# write it to disk without the index column.
submit_df = pd.DataFrame({"id": test["id"], "class": classes})
submit_df.to_csv("submission.csv", index=False)