A shout out to H-Z-Ning and his notebook
<a href="https://www.kaggle.com/code/hzning/top-10-solution-0-97525-esay-is-all-you#%F0%9F%8F%81-LightGBM-Training">🔥 Top 10% Solution|🏆0.97525,esay is all you!🚀</a>
for feature engineering ideas

## Load libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import TargetEncoder, FunctionTransformer, Binarizer, OrdinalEncoder
import imblearn.pipeline as imb
from sklearn.model_selection import train_test_split
from pathlib import Path
import os
from sklearn import set_config
from imblearn.over_sampling import SMOTE
import sys
import xgboost as xgb
path = Path.cwd().parent

# Absolute path to your package
sys.path.append(os.path.join(path))

from utils import convert_months_to_categorical, calculate_score, ClipValues, SinePreprocess, CategoryCounter

# Enable pandas output globally
set_config(transform_output="pandas")

## Load data

In [2]:

train_file_path = os.path.join(path, "raw", "train.csv")
test_file_path = os.path.join(path, "raw", "test.csv")

In [3]:
original_train_df = pd.read_csv(train_file_path, index_col=0)
test_df = pd.read_csv(test_file_path, index_col=0)

In [4]:
y_var = "y"
categorical_variables = original_train_df.select_dtypes(include="object").columns.to_list()
numerical_variables = (
    original_train_df
    .select_dtypes(exclude="object")
    .columns
    .drop(
        labels=["y"]
    )
    .to_list()
)

## Train val split

In [5]:
train_df, val_df = train_test_split(original_train_df, test_size=0.2)

## Preprocessing and train Pipeline

In [6]:
import numpy as np

In [20]:
def force_category(df):
    # cat_cols = ["default", "housing", "loan", "poutcome", "education", "contact", "job", "marital"]
    for col in df.select_dtypes(include="object").columns:
        df[col] = pd.Categorical(df[col])
    return df

def add_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["balance_negative_times_default_yes"] = (df["balance"] < 0) * (df['default'] == "yes")
    df["balance_negative_times_loan_yes"] = (df["balance"] < 0) * (df['loan'] == "yes")
    df["balance_negative"] = (df["balance"] < 0).astype(int)
    df["balance_log"] = np.log1p(df['balance'].clip(lower=0))
    df["default_yes_times_loan_yes"] = (df['default'] == "yes") * (df['loan'] == "yes")
    df["contact_unknown_times_default_yes"] = (df['contact'] == "unknown") * (df['default'] == "yes")
    df["duration_times_loan_yes"] = (df["duration"]) * (df['loan'] == "yes")
    df["duration_times_housing_yes"] = (df["duration"]) * (df['housing'] == "yes")
    df["duration_times_contact_unknown"] = (df["duration"]) * (df['contact'] == "unknown")
    df["duration_times_age"] = (df["duration"] < 53) * (df["age"])
    df["balance_negative_times_poutcome_unknown"] = (df["balance"] < 0) * (df['poutcome'] == "unknown")
    df["duration_times_default_yes"] = (df["duration"]) * (df['default'] == "yes")
    df["duration_times_education_primary"] = (df["duration"]) * (df['education'] == "primary")
    df["pdays_by_campaign"] = (df["pdays"]) / (df['campaign'])
    df["previous_by_campaign"] = (df["previous"]) / (df['campaign'] + 1e-8)
    df["pdays_times_previous"] = (df["pdays"]) * (df['previous'])
    df["day_of_year"] = df["month"].cat.codes * 30 + df["day"]
    df["duration_sq"] = df["duration"] ** 2
    df['job_edu'] = df['job'].astype(str) + "_" + df['education'].astype(str)
    return df


power_pipeline = Pipeline(
    [
        ("passthrough", "passthrough"),
        # ("clip", ClipValues(quantile_range=(0.25, 0.75))),
        # ("power_transform", PowerTransformer("yeo-johnson", standardize=True)),
        # ("splines", SplineTransformer(n_knots=10, knots="quantile"))
    ],
)

cat_encoder = Pipeline(
    [
        ("encoder", CategoryCounter()),
        
    ]
)

target_encoder = TargetEncoder()
sin_preprocess = SinePreprocess(
    {
        "day": 31,
        "month": 12,
        "duration": 400
    }
)

preprocessing_pipeline_0 = ColumnTransformer(
    [
        ("numerical", power_pipeline, [
            "balance", "duration", "campaign", "pdays", "age",
        ]),
        ("binarizer", Binarizer(threshold=0.01), ["balance", "pdays"]),
        ("encode", "passthrough", ["default", "housing", "loan", "poutcome", "education", "contact", "job", "marital"]),
        ("target_encoder", target_encoder, ["day", "month", "default", "housing", "loan", "poutcome", "education", "contact", "job", "marital", "job_edu"]),
        ("sine_preproces", sin_preprocess, ["day", "month", "duration"]),
        ("drop", "drop", ["previous"])
    ],
    remainder="passthrough"
    # verbose_feature_names_out=False
)

def create_pipeline(params = {}):

    pipeline = Pipeline(
        [
            ("force_cats", FunctionTransformer(force_category, validate=False)),
            ("month_transform_to_categorical", FunctionTransformer(
                convert_months_to_categorical, 
                validate=False,
            )),
            ("add_columns", FunctionTransformer(
                add_columns, 
                validate=False,
            )),
            ("preprocess_0", preprocessing_pipeline_0),
            ("drop_not_needed", 
            ColumnTransformer(transformers=[
                ("drop", "drop", []),
            ], remainder="passthrough", verbose_feature_names_out=False)
            ),
            # ("smote", SMOTE(random_state=42)),
            ("classifier", xgb.XGBClassifier(
                tree_method="hist", 
                enable_categorical=True,
                objective="binary:logistic",
                eval_metric="logloss",
                **params
            ))
        ]
    )
    return pipeline

In [8]:
pipeline = create_pipeline({
    "n_estimators": 1000,
    "eta": 0.03
})
fitted_pipeline = pipeline.fit(train_df.drop("y", axis=1), train_df[y_var])

In [9]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
train_pred_probs = fitted_pipeline.predict_proba(train_df)[:, 1]
calculate_score(train_df["y"], train_pred_probs)

{'auc': np.float64(0.9740406960275618),
 'accuracy': 0.9429133333333334,
 'f1': 0.7495026913175755}

In [10]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
val_pred_probs = fitted_pipeline.predict_proba(val_df)[:, 1]
calculate_score(val_df["y"], val_pred_probs)

{'auc': np.float64(0.9697476957775893),
 'accuracy': 0.9393666666666667,
 'f1': 0.7334954727928034}

## Optuna

In [11]:
import optuna

In [28]:
def objective(trial):
    # 2. Suggest values of the hyperparameters using a trial object.
    param = {
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-8, 10, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 50, log=True),
        "max_leaves": trial.suggest_int("max_leaves", 1, 250, log=True),
        "eta": trial.suggest_float("eta", 1e-8, 1.0, log=False),
        "gamma": trial.suggest_float("gamma", 1e-8, 10.0, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.1, 10.0),
        "n_estimators": trial.suggest_int("n_estimators", 50, 200, log=True)
    }
    pipeline = create_pipeline(param)
    # sample = train_df.sample(100000)
    fitted_pipeline = pipeline.fit(train_df.drop("y", axis=1).copy(), train_df[y_var].copy())
    val_pred_probs = fitted_pipeline.predict_proba(val_df)[:, 1]
    scores = calculate_score(val_df["y"], val_pred_probs)
    return scores["auc"]

# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize', study_name="Bank", sampler=optuna.samplers.NSGAIISampler())
study.optimize(objective, n_trials=100)

[I 2025-08-12 09:40:46,832] A new study created in memory with name: Bank
[I 2025-08-12 09:44:50,184] Trial 0 finished with value: 0.9597799331783436 and parameters: {'booster': 'dart', 'min_child_weight': 1.0566129870444458e-08, 'max_depth': 15, 'max_leaves': 3, 'eta': 0.5797875044895596, 'gamma': 0.0005432905030227457, 'lambda': 0.031583785543398225, 'alpha': 0.022350531407423064, 'subsample': 0.4143404618790661, 'colsample_bytree': 0.47847658174885305, 'scale_pos_weight': 2.7105424687944515, 'n_estimators': 87}. Best is trial 0 with value: 0.9597799331783436.
[I 2025-08-12 09:45:01,172] Trial 1 finished with value: 0.9630581966284519 and parameters: {'booster': 'gbtree', 'min_child_weight': 0.001402656553561887, 'max_depth': 7, 'max_leaves': 5, 'eta': 0.49393401723438174, 'gamma': 0.00022502887935703242, 'lambda': 1.6857715338075006e-08, 'alpha': 9.156586010334946e-08, 'subsample': 0.571549593483784, 'colsample_bytree': 0.9677199349103318, 'scale_pos_weight': 9.123087450050388, 'n_e

KeyboardInterrupt: 

## Evaluate

In [25]:
param = study.best_trial.params
pipeline = create_pipeline(param)
fitted_pipeline = pipeline.fit(train_df.drop("y", axis=1), train_df[y_var])
# test = fitted_pipeline.transform(sample.drop("y", axis=1))
# fitted_pipeline = pipeline.fit(train_df.drop("y", axis=1), train_df[y_var])

In [26]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
train_pred_probs = fitted_pipeline.predict_proba(train_df)[:, 1]
calculate_score(train_df["y"], train_pred_probs)

{'auc': np.float64(0.9664443952657242),
 'accuracy': 0.8844966666666667,
 'f1': 0.6627410139863543}

In [27]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
val_pred_probs = fitted_pipeline.predict_proba(val_df)[:, 1]
calculate_score(val_df["y"], val_pred_probs)

{'auc': np.float64(0.9650362958919594),
 'accuracy': 0.88396,
 'f1': 0.6599593654762835}

## Review

In [None]:
fitted_pipeline[-1].get_booster().get_score(importance_type='gain')

## Submission

In [None]:
param = study.best_trial_params
pipeline = create_pipeline(param)
fitted_pipeline = pipeline.fit(original_train_df.drop("y", axis=1), original_train_df[y_var])

In [None]:
test_pred_probs = fitted_pipeline.predict_proba(test_df)
test_pred_probs

In [None]:
submission = test_df[[]]
submission = submission.copy()
submission.loc[:, "y"] = test_pred_probs[:, 1]
submission.to_csv("submission_xgb.csv")