In [3]:
import sys
from pathlib import Path

# Get the current working directory and go up one level
parent_dir = Path.cwd().parent
sys.path.append(str(parent_dir))


In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import mlflow
mlflow.set_experiment("kaggle-playground-5-8")

<Experiment: artifact_location='file:///c:/Users/olive/desktop/projects/kaggle/kaggle_playground_5-8/new_fortransformers/mlruns/435223158587620822', creation_time=1755273647363, experiment_id='435223158587620822', last_update_time=1755273647363, lifecycle_stage='active', name='kaggle-playground-5-8', tags={}>

In [5]:
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")
original = pd.read_csv("../bank-full.csv", sep=";")

In [6]:
categorical_features = train.select_dtypes(include=["object"]).columns
numerical_features = list(set(train.select_dtypes(include=["int64", "float64"]).columns) - {"y", "id"})

In [None]:
categorical_features = train.select_dtypes(include=["object"]).columns
numerical_features = list(set(train.select_dtypes(include=["int64", "float64"]).columns) - {"y", "id"})

preprocessor = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features)
    ],
    remainder = "passthrough")

preprocessor.set_output(transform="pandas")

In [8]:
preprocessor.fit(train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [11]:
train_transformed = preprocessor.fit_transform(train)

In [21]:
train_transformed

Unnamed: 0,num__balance,num__duration,num__pdays,num__day,num__campaign,num__age,num__previous,cat__job_admin.,cat__job_blue-collar,cat__job_entrepreneur,...,cat__month_may,cat__month_nov,cat__month_oct,cat__month_sep,cat__poutcome_failure,cat__poutcome_other,cat__poutcome_success,cat__poutcome_unknown,remainder__id,remainder__y
0,-0.422083,-0.510829,-0.302803,1.076594,0.155597,0.106310,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0
1,-0.243316,-0.261338,-0.302803,0.228194,-0.580100,-0.289776,-0.223475,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0
2,-0.212287,-0.532843,-0.302803,-0.256606,-0.212251,-0.487819,-0.223475,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,0
3,-0.412563,-0.903409,-0.302803,1.440194,-0.212251,-1.379012,-0.223475,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,0
4,-0.111092,2.369319,-0.302803,-1.589805,-0.580100,-1.478033,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,0.027479,2.750893,-0.302803,-1.468605,-0.212251,-1.180969,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,749995,1
749996,-0.202062,-0.620898,-0.302803,0.349394,-0.580100,2.779889,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,749996,0
749997,-0.348038,-0.525505,-0.302803,0.106994,-0.580100,0.898482,-0.223475,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,749997,0
749998,-0.521163,-0.543849,-0.302803,1.197794,1.259142,-0.883905,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,749998,0


In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import numpy as np

x_train = train_transformed.drop(columns=["remainder__y", "remainder__id"])
y_train = train_transformed["remainder__y"]

def objective(trial):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "verbose": 0,
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.01, 1.0),
        "gamma": trial.suggest_float("gamma", 0.01, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1.0),
        "random_state": 42,
        "seed": 42,
    }

    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in cv.split(x_train, y_train):
        X_train_cv, X_valid_cv = x_train.iloc[train_idx], x_train.iloc[valid_idx]
        y_train_cv, y_valid_cv = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        model = XGBClassifier(**params)
        model.fit(X_train_cv, y_train_cv)

        preds = model.predict_proba(X_valid_cv)[:, 1]
        auc = roc_auc_score(y_valid_cv, preds)
        aucs.append(auc)

    return np.mean(aucs)


In [25]:
import optuna

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("Best AUC:", study.best_value)
print("Best Params:", study.best_params)


[I 2025-08-15 17:11:58,194] A new study created in memory with name: no-name-1f074abf-65b1-4001-baf4-7e82dc1e8df6
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-15 17:12:21,983] Trial 0 finished with value: 0.9654995855759178 and parameters: {'max_depth': 4, 'learning_rate': 0.4498187423864299, 'n_estimators': 367, 'subsample': 0.6406957290284031, 'colsample_bytree': 0.7030899175707727, 'gamma': 0.7239179637729526, 'reg_alpha': 0.38750256376714304, 'reg_lambda': 0.8354983584606867}. Best is trial 0 with value: 0.9654995855759178.
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-15 17:12:35,031] Trial 1 finished with value: 0.960290791346285 and parameters: {'max_depth': 8, 'learning_rate': 0.32771996325265085, 'n_estimators'

Best AUC: 0.9656115308446573
Best Params: {'max_depth': 3, 'learning_rate': 0.25957912726452353, 'n_estimators': 547, 'subsample': 0.6493638869718359, 'colsample_bytree': 0.4116155930248932, 'gamma': 0.38425002665423963, 'reg_alpha': 0.6123024086183246, 'reg_lambda': 0.28632367478528276}


In [26]:
study.best_params

{'max_depth': 3,
 'learning_rate': 0.25957912726452353,
 'n_estimators': 547,
 'subsample': 0.6493638869718359,
 'colsample_bytree': 0.4116155930248932,
 'gamma': 0.38425002665423963,
 'reg_alpha': 0.6123024086183246,
 'reg_lambda': 0.28632367478528276}

In [28]:
X

Unnamed: 0,num__balance,num__duration,num__pdays,num__day,num__campaign,num__age,num__previous,cat__job_admin.,cat__job_blue-collar,cat__job_entrepreneur,...,cat__month_jun,cat__month_mar,cat__month_may,cat__month_nov,cat__month_oct,cat__month_sep,cat__poutcome_failure,cat__poutcome_other,cat__poutcome_success,cat__poutcome_unknown
0,-0.422083,-0.510829,-0.302803,1.076594,0.155597,0.106310,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.243316,-0.261338,-0.302803,0.228194,-0.580100,-0.289776,-0.223475,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.212287,-0.532843,-0.302803,-0.256606,-0.212251,-0.487819,-0.223475,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.412563,-0.903409,-0.302803,1.440194,-0.212251,-1.379012,-0.223475,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.111092,2.369319,-0.302803,-1.589805,-0.580100,-1.478033,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,0.027479,2.750893,-0.302803,-1.468605,-0.212251,-1.180969,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
749996,-0.202062,-0.620898,-0.302803,0.349394,-0.580100,2.779889,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
749997,-0.348038,-0.525505,-0.302803,0.106994,-0.580100,0.898482,-0.223475,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
749998,-0.521163,-0.543849,-0.302803,1.197794,1.259142,-0.883905,-0.223475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score  # use the right metric for your task

# Example: binary classification
X = train_transformed.drop(columns=["remainder__y", "remainder__id"]).values
y = train_transformed["remainder__y"].values

model = XGBClassifier(**study.best_params)

n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))  # to store out-of-fold predictions
scores = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    model.fit(X_train, y_train)
    
    preds = model.predict_proba(X_valid)[:, 1]  # probability for positive class
    oof_preds[valid_idx] = preds
    
    score = roc_auc_score(y_valid, preds)
    scores.append(score)
    
    print(f"Fold {fold} AUC: {score:.4f}")

mean_score = np.mean(scores)
print(f"Mean AUC: {mean_score:.4f}")

# OOF predictions now in oof_preds

Fold 1 AUC: 0.9665
Fold 2 AUC: 0.9652
Fold 3 AUC: 0.9653
Fold 4 AUC: 0.9662
Fold 5 AUC: 0.9659
Mean AUC: 0.9658


In [32]:
test['y'] = 0

In [33]:
test_transformed = preprocessor.transform(test)

In [35]:
test_id = test["id"]
test_ready = test_transformed.drop(columns=["remainder__y", "remainder__id"])

probas = model.predict_proba(test_ready)[:, 1]

submission = pd.DataFrame({"id": test_id, "y": probas})
submission.to_csv("submission.csv", index=False)

In [36]:
with mlflow.start_run(run_name="first_practice"):
    mlflow.log_params({
        "model": "xgboost",
        "max_depth": 3,
        "learning_rate": 0.25957912726452353,
        "n_estimators": 547,
        "subsample": 0.6493638869718359,
        "colsample_bytree": 0.4116155930248932,
        "gamma": 0.38425002665423963,
        "reg_alpha": 0.6123024086183246,
        "reg_lambda": 0.28632367478528276
    })

    mlflow.set_tag(
    "mlflow.note.content",
    "Simple xgb model, 10 runs on optuna."
    "Pipeline has standard scalar on numerical features and one-hot encoder on categorical features.")

    for fold, score in enumerate(scores, start=1):
        mlflow.log_metric(f"fold{fold}_auc", score)

    mlflow.log_metric("mean_auc", np.mean(scores))

    # Replace these with your actual CV results
    mlflow.log_metric("public_lb", 0.96684)

    