## Constants

In [1]:
import pandas as pd
import numpy as np
import subprocess

# Locations of the input CSV files, relative to this notebook.
train_path = "../data/train.csv"
test_path = "../data/test.csv"

# Run switches.
is_tunning = False  # True -> the Optuna search cell runs; False -> it is skipped
device = "cpu"  # passed to LightGBM as its "device" parameter
goal = "binary"  # LightGBM objective; also the key used to look up the metric below

# Evaluation metric reported for each supported goal.
objective_mapping = dict(
    regression=dict(metric="rmse"),
    binary=dict(metric="binary_logloss"),
)

# Optional GPU auto-detection, currently disabled: it would switch `device`
# to "gpu" whenever the `nvidia-smi` command can be executed.
# try:
#     rs = subprocess.check_output("nvidia-smi")
#     device = "gpu" if rs is not None else "cpu"
#     print(f"device: {device}")
# except (
#     Exception
# ):  # this command not being found can raise quite a few different errors depending on the configuration
#     print("No Nvidia GPU in system!")

## Prepare data

In [2]:
def _load_and_report(split_name, csv_path):
    """Read one CSV file into a DataFrame and print its shape."""
    frame = pd.read_csv(csv_path)
    print(f"{split_name} size: {frame.shape}")
    return frame


# Each file is read and reported before the next one is touched.
train = _load_and_report("train", train_path)
test = _load_and_report("test", test_path)

In [4]:
# Name of the label column in the training frame.
target = "class"

# Feature columns holding text values. "category" is matched as well as
# "object" so that re-running this cell (after the loop below has already
# converted the columns in place) still finds them instead of silently
# producing an empty list.
categorical_cols = (
    train.drop(columns=target)
    .select_dtypes(include=["object", "category"])
    .columns.to_list()
)
for c in categorical_cols:
    train[c] = train[c].astype("category")
    test[c] = test[c].astype("category")

# Numeric feature columns. Both the row identifier and the target are
# excluded so the list can never contain a column that is absent from the
# feature frames, even if the target were numeric.
numerical_cols = (
    train.drop(columns=["id", target])
    .select_dtypes(include="number")
    .columns.to_list()
)

In [5]:
from sklearn.model_selection import train_test_split

# Separate features from the label, then hold out 20% of the rows for
# validation. The fixed seed keeps the split reproducible.
all_features = train.drop(columns=target)
all_labels = train[target]
X_train, X_val, y_train, y_val = train_test_split(
    all_features,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# The test frame is used as-is for inference.
X_test = test

## Data preprocessing

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

# Numeric features are only standardized.
numerical_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline.
column_routes = [
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Encode the target labels; the encoder is fitted on the training labels only
# and then reused for the validation labels.
lb = LabelBinarizer().fit(y_train)
y_train_binarized = lb.transform(y_train)
y_val_binarized = lb.transform(y_val)

## Hyperparameter tuning

In [8]:
import gc
# import optuna
import lightgbm as lgb
from datetime import datetime, timezone
from sklearn.metrics import mean_absolute_error as mae
import warnings
from sklearn.metrics import r2_score


# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Metric that matches the configured goal.
metric = objective_mapping[goal]["metric"]

# Fixed LightGBM settings shared by every tuning trial; the tuned values are
# merged into this dict once a study has finished.
best_params = {
    "objective": goal,
    "metric": metric,
    "boosting_type": "gbdt",
    "device": device,
}

# Create LightGBM datasets.
# LightGBM needs numeric labels, so the binarized 0/1 targets are used instead
# of the raw class strings; ravel() flattens LabelBinarizer's column vector
# into the 1-D array LightGBM expects.
# NOTE(review): X_train / X_val still contain the "id" column, so it is offered
# to LightGBM as a feature — confirm whether that is intended.
train_data = lgb.Dataset(
    data=X_train,
    label=y_train_binarized.ravel(),
    free_raw_data=False,
    params={"verbose": -1},
)
valid_data = lgb.Dataset(
    data=X_val,
    label=y_val_binarized.ravel(),
    free_raw_data=False,
    params={"verbose": -1},
)


def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation loss.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters for this run.

    Returns
    -------
    float
        Best value of the configured ``metric`` on the validation set
        (lower is better).
    """
    # LightGBM documents ``reg_alpha`` / ``reg_lambda`` as aliases of
    # ``lambda_l1`` / ``lambda_l2``. Sampling both names gave the same model
    # parameter two conflicting values and added search dimensions that cannot
    # act independently, so only the canonical names are tuned.
    hyper_parameters = {
        **best_params,
        "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 0.6),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 0.6),
        "num_leaves": trial.suggest_int("num_leaves", 100, 256),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 0.9),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 0.9),
        "bagging_freq": trial.suggest_int("bagging_freq", 8, 16),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "max_depth": trial.suggest_int("max_depth", 20, 64),
    }

    # Train with a generous round budget; early stopping ends the run once the
    # validation metric has not improved for 100 rounds.
    model = lgb.train(
        params=hyper_parameters,
        train_set=train_data,
        valid_sets=[valid_data],
        num_boost_round=4000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(period=50, show_stdv=True),
        ],
    )
    # Try to minimize the loss metric
    return model.best_score["valid_0"][metric]


if is_tunning:
    # Imported here because the module-level import is commented out; without
    # it this branch fails with NameError. Keeping the import local also lets
    # the notebook run without Optuna installed while tuning is switched off.
    import optuna

    # Create the study, or resume it from the SQLite file if it already exists.
    study_name = "study"
    study = optuna.create_study(
        study_name=study_name,
        storage=f"sqlite:///{study_name}.db",
        direction="minimize",
        load_if_exists=True,
    )
    study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)

    # Print best trial
    best_trial = study.best_trial
    print("Best trial:")
    print(f"  {metric}_error:", best_trial.value)
    print("  Params: ")
    for key, value in best_trial.params.items():
        print("    {}: {}".format(key, value))

    # Merge the tuned values into the shared parameter dict and show the result
    # (a bare expression inside an `if` block is not displayed by the notebook).
    study_best_params = study.best_params
    best_params.update(study_best_params)
    print(best_params)

## Train best model

In [14]:
# Custom evaluation function reporting the Matthews correlation coefficient
def mcc_eval(y_pred, train_data):
    """Score predictions with the Matthews correlation coefficient (MCC).

    Parameters
    ----------
    y_pred : array-like
        Predicted scores; values above 0.5 are counted as class 1.
    train_data : object
        Dataset-like object whose ``get_label()`` returns the true labels.

    Returns
    -------
    tuple
        ``("mcc", score, True)``; the final ``True`` marks the metric as
        higher-is-better.
    """
    from sklearn.metrics import matthews_corrcoef

    hard_labels = (y_pred > 0.5).astype(int)
    score = matthews_corrcoef(train_data.get_label(), hard_labels)
    return "mcc", score, True


# Tuned LightGBM parameters, hardcoded here. They are consumed only by the
# commented-out LightGBM training below; the XGBoost model that is actually
# trained does not read them.
best_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "lambda_l1": 0.11019458627539475,
    "lambda_l2": 0.1554907572643459,
    "num_leaves": 256,
    "learning_rate": 0.010305818596596942,
    "feature_fraction": 0.6278911649411257,
    "bagging_fraction": 0.6910499222495768,
    "bagging_freq": 9,
    "min_child_samples": 37,
    "reg_alpha": 0.0037881112342830815,
    "reg_lambda": 0.9245246490277238,
    "max_depth": 25,
}
print("Training best model...")
# final_model = lgb.train(
#     params=best_params,
#     train_set=train_data,
#     valid_sets=[valid_data],
#     feval=mcc_eval,
#     num_boost_round=4000,
#     callbacks=[
#         lgb.early_stopping(stopping_rounds=100, verbose=True),
#         lgb.log_evaluation(period=50, show_stdv=True),
#     ],
# )
# feature_importances = final_model.feature_importance()
# y_pred = final_model.predict(X_test)
# Fit the model

# XGBoost names its GPU backend "cuda", whereas the notebook-level `device`
# setting uses "cpu"/"gpu". Deriving the value from that setting keeps the
# notebook runnable on machines without a GPU.
xgb_device = "cuda" if device == "gpu" else "cpu"

# Create the full pipeline with the XGBoost model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(enable_categorical=True, eval_metric='logloss', device=xgb_device))
])

# The preprocessor must be fitted before it can transform anything. It is
# fitted on the training split only, then reused unchanged for validation.
X_train_prepared = model_pipeline.named_steps['preprocessor'].fit_transform(X_train)
X_val_prepared = model_pipeline.named_steps['preprocessor'].transform(X_val)

# Fit the classifier directly so the already-transformed validation split can
# be passed as the evaluation set.
model_pipeline.named_steps['classifier'].fit(
    X_train_prepared, y_train_binarized,
    eval_set=[(X_val_prepared, y_val_binarized)],
)

Training best model...
[0]	validation_0-logloss:0.67288
[1]	validation_0-logloss:0.65735
[2]	validation_0-logloss:0.644049
[3]	validation_0-logloss:0.627222
[4]	validation_0-logloss:0.615775
[5]	validation_0-logloss:0.604451
[6]	validation_0-logloss:0.592499
[7]	validation_0-logloss:0.584449
[8]	validation_0-logloss:0.574154
[9]	validation_0-logloss:0.568383
[10]	validation_0-logloss:0.560177
[11]	validation_0-logloss:0.550903
[12]	validation_0-logloss:0.545425
[13]	validation_0-logloss:0.538238
[14]	validation_0-logloss:0.532794
[15]	validation_0-logloss:0.523156
[16]	validation_0-logloss:0.519006
[17]	validation_0-logloss:0.513043
[18]	validation_0-logloss:0.509354
[19]	validation_0-logloss:0.502461
[20]	validation_0-logloss:0.4969
[21]	validation_0-logloss:0.492384
[22]	validation_0-logloss:0.489513
[23]	validation_0-logloss:0.484482
[24]	validation_0-logloss:0.48029
[25]	validation_0-logloss:0.476729
[26]	validation_0-logloss:0.471995
[27]	validation_0-logloss:0.466739
[28]	validat

## Model eval

In [17]:
# Probability of the positive class (column 1 of predict_proba). `predict`
# returns hard class labels, which made the 0.5 threshold below meaningless.
y_pred_prob = model_pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_pred_prob > 0.5).astype(int)

# Map the 0/1 predictions back to the original class labels.
classes = lb.inverse_transform(y_pred)
classes

array(['e', 'p', 'p', ..., 'p', 'e', 'e'], dtype='<U1')

## Update Submission

In [18]:
# Assemble the submission: one row per test id with its predicted class,
# written to submission.csv without the index column.
submit_df = pd.DataFrame({"id": test["id"], "class": classes})
submit_df.to_csv("submission.csv", index=False)