In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, auc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

original = pd.read_csv("bank-full.csv", sep=";")
original['y'] = original['y'].apply(lambda x: 1 if x=="yes" else 0)

---

## Findings

- "day" can get encoded or transformed in some way since its not ratio data.  
- "pdays" has "-1" for missing data, otherwise is useful ratio data. Often people get called around quarter, half, full year

---

In [4]:
## convert day column to str so we will get target encoding as below

train['day'] = train['day'].astype(str)
original['day'] = original['day'].astype(str)

In [5]:
## convert 

---

In [6]:
# Step 1: Concatenate original + synthetic
train = pd.concat([train, original], ignore_index=True)

# Step 2: Categorical columns
cat_cols = original.select_dtypes(include=['object']).columns

# Step 3: Compute target means from original, and map to full train
for col in cat_cols:
    te_map = original.groupby(col)['y'].mean().to_dict()  # mean target per category
    train[col + "_mean"] = train[col].map(te_map)         # apply to full train

In [11]:
# create a column for pdays = -1

train['pdays_none'] = train['pdays'] == -1


train['pdays'] = train['pdays'].apply(lambda x: np.nan if x == -1 else x)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(train.drop(columns=['y','id']), train['y'], test_size=0.2, random_state=42)

---

In [13]:
# Assuming x_train is your DataFrame of features
categorical_features_names = x_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Make sure 'remainder__MSZoning' is in this list. It should be if its dtype is object.
print("Detected categorical features:", categorical_features_names)

Detected categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'day', 'month', 'poutcome']


In [14]:
import json
import optuna

# Load the best parameters from the saved file
try:
    with open("models/optuna_cat1.json", "r") as f:
        best_params = json.load(f)
    print("Successfully loaded best parameters:")
    print(best_params)
except FileNotFoundError:
    print("Error: The file 'best_params.json' was not found.")

Successfully loaded best parameters:
{'depth': 7, 'learning_rate': 0.16242304317665213, 'l2_leaf_reg': 9.784139225596583, 'iterations': 991, 'subsample': 0.6576101811374836}


In [15]:
cat_days_enc = CatBoostClassifier(**best_params, cat_features=categorical_features_names)

cat_days_enc.fit(x_train, y_train)

0:	learn: 0.4431566	total: 786ms	remaining: 12m 58s
1:	learn: 0.3304450	total: 1.53s	remaining: 12m 38s
2:	learn: 0.2721577	total: 2.18s	remaining: 11m 59s
3:	learn: 0.2375044	total: 2.83s	remaining: 11m 39s
4:	learn: 0.2197653	total: 3.47s	remaining: 11m 23s
5:	learn: 0.2091893	total: 4.07s	remaining: 11m 7s
6:	learn: 0.2012592	total: 4.71s	remaining: 11m 1s
7:	learn: 0.1953461	total: 5.36s	remaining: 10m 58s
8:	learn: 0.1906391	total: 6.01s	remaining: 10m 56s
9:	learn: 0.1877195	total: 6.67s	remaining: 10m 54s
10:	learn: 0.1855219	total: 7.32s	remaining: 10m 52s
11:	learn: 0.1828321	total: 8s	remaining: 10m 52s
12:	learn: 0.1783854	total: 8.59s	remaining: 10m 46s
13:	learn: 0.1772225	total: 9.26s	remaining: 10m 46s
14:	learn: 0.1762571	total: 9.9s	remaining: 10m 44s
15:	learn: 0.1749419	total: 10.5s	remaining: 10m 39s
16:	learn: 0.1723260	total: 11.1s	remaining: 10m 34s
17:	learn: 0.1715459	total: 11.7s	remaining: 10m 29s
18:	learn: 0.1708356	total: 12.2s	remaining: 10m 25s
19:	learn

<catboost.core.CatBoostClassifier at 0x23da20ec920>

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import numpy as np

def objective(trial):
    params = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "verbose": 0,
        "depth": trial.suggest_int("depth", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 10.0),
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
    }

    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in cv.split(x_train, y_train):
        X_train_cv, X_valid_cv = x_train.iloc[train_idx], x_train.iloc[valid_idx]
        y_train_cv, y_valid_cv = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        model = CatBoostClassifier(**params, cat_features=categorical_features_names)
        model.fit(X_train_cv, y_train_cv)

        preds = model.predict_proba(X_valid_cv)[:, 1]
        auc = roc_auc_score(y_valid_cv, preds)
        aucs.append(auc)

    return np.mean(aucs)


In [17]:
import optuna

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)

print("Best AUC:", study.best_value)
print("Best Params:", study.best_params)


[I 2025-08-07 13:00:55,740] A new study created in memory with name: no-name-e9225a92-3533-46ec-b2ed-4ba0758678b9
[I 2025-08-07 13:08:45,803] Trial 0 finished with value: 0.9643599492789072 and parameters: {'depth': 5, 'learning_rate': 0.3416027149777706, 'l2_leaf_reg': 3.8428687859685704, 'iterations': 820, 'subsample': 0.6102492740395965}. Best is trial 0 with value: 0.9643599492789072.
[I 2025-08-07 13:21:24,313] Trial 1 finished with value: 0.9647696536219931 and parameters: {'depth': 7, 'learning_rate': 0.18500813401248692, 'l2_leaf_reg': 6.109561641503863, 'iterations': 894, 'subsample': 0.8921725091559454}. Best is trial 1 with value: 0.9647696536219931.
[I 2025-08-07 13:32:29,375] Trial 2 finished with value: 0.9645268762061188 and parameters: {'depth': 6, 'learning_rate': 0.24793828703984758, 'l2_leaf_reg': 1.3273785944319036, 'iterations': 908, 'subsample': 0.9112405823410208}. Best is trial 1 with value: 0.9647696536219931.
[I 2025-08-07 13:47:02,650] Trial 3 finished with v

KeyboardInterrupt: 

In [None]:
train.isna

In [None]:
optuna_best_model = CatBoostClassifier(**study.best_params, cat_features=categorical_features_names)

In [None]:
import json

# Your existing Optuna code
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=40)

print("Best Params:", study.best_params)

# Save the best parameters to a JSON file
with open("models/optuna_cat2_targetenc.json", "w") as f:
    json.dump(study.best_params, f, indent=4)

print("Best parameters saved to 'best_params.json'")