In [1]:
import catboost
import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from optuna.integration import CatBoostPruningCallback  # type: ignore
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [2]:
df = pd.read_csv("./data.csv")
test = pd.read_csv("./test.csv")
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30_59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60_89DaysPastDueNotWorse,NumberOfDependents
0,0.0,0.957151,40.0,0.0,0.115002,7.863651,4.0,0.0,0.0,0.0,1.0
1,0.0,0.65818,38.0,1.0,0.081684,8.020599,2.0,1.0,0.0,0.0,0.0
2,0.0,0.907239,49.0,1.0,0.02462,11.060196,7.0,0.0,1.0,0.0,0.0
3,0.0,0.213179,74.0,0.0,0.318895,8.160804,3.0,0.0,1.0,0.0,1.0
4,0.0,0.754464,39.0,0.0,0.190571,8.160804,8.0,0.0,0.0,0.0,0.0


In [3]:
df.dropna(subset=["SeriousDlqin2yrs"], inplace=True)

In [4]:
X = df.drop(['SeriousDlqin2yrs'], axis=1)
y = df['SeriousDlqin2yrs']
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2)

In [5]:
train_x.shape, valid_x.shape

((42429, 10), (10608, 10))

In [6]:
model = CatBoostClassifier(
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
)

model.fit(
    train_x, train_y,
    eval_set=(valid_x, valid_y),
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7843588	best: 0.7843588 (0)	total: 160ms	remaining: 2m 40s
1:	test: 0.8065719	best: 0.8065719 (1)	total: 178ms	remaining: 1m 28s
2:	test: 0.8302905	best: 0.8302905 (2)	total: 196ms	remaining: 1m 5s
3:	test: 0.8291250	best: 0.8302905 (2)	total: 213ms	remaining: 53s
4:	test: 0.8286385	best: 0.8302905 (2)	total: 231ms	remaining: 46s
5:	test: 0.8426320	best: 0.8426320 (5)	total: 250ms	remaining: 41.5s
6:	test: 0.8465426	best: 0.8465426 (6)	total: 269ms	remaining: 38.2s
7:	test: 0.8470391	best: 0.8470391 (7)	total: 287ms	remaining: 35.6s
8:	test: 0.8509005	best: 0.8509005 (8)	total: 304ms	remaining: 33.5s
9:	test: 0.8516087	best: 0.8516087 (9)	total: 321ms	remaining: 31.8s
10:	test: 0.8510506	best: 0.8516087 (9)	total: 341ms	remaining: 30.6s
11:	test: 0.8520528	best: 0.8520528 (11)	total: 359ms	remaining: 29.5s
12:	test: 0.8523827	best: 0.8523827 (12)	total: 377ms	remaining: 28.6s
13:	test: 0.8527764	best: 0.8527764 (13)	total: 396ms	remaining: 27.9s
14:	test: 0.8525945	best: 0.8

<catboost.core.CatBoostClassifier at 0x224fd13b7c0>

In [8]:
y_pred = model.predict(valid_x)
print('Accuracy:', accuracy_score(valid_y, y_pred))

Accuracy: 0.9377828054298643


In [10]:
def objective(trial: optuna.Trial) -> float:
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "26gb",
        "eval_metric": "Accuracy",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(valid_y, pred_labels)

    return accuracy

In [11]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
)
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-18 21:48:14,618][0m A new study created in memory with name: no-name-fb369ffc-5360-4a5d-9300-e33e0d113092[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-11-18 21:48:17,536][0m Trial 0 finished with value: 0.938631221719457 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08493199419103004, 'depth': 1, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7799768835188183}. Best is trial 0 with value: 0.938631221719457.[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-11-18 21:48:18,156][0m Trial 1 finished with value: 0.9353318250377074 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.06930879626636127, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.2388670300199855}. Best is trial 0 with value: 0.938631221719457.[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-11-18 21:48:18,

Number of finished trials: 100
Best trial:
  Value: 0.9392911010558069
  Params: 
    objective: Logloss
    colsample_bylevel: 0.07519183402207409
    depth: 5
    boosting_type: Plain
    bootstrap_type: Bernoulli
    subsample: 0.47621609474171295


In [19]:
full_model = CatBoostClassifier(**trial.params)
full_model.fit(X, y, verbose=0)

y_pred = full_model.predict(test)

In [24]:
test["SeriousDlqin2yrs"] = y_pred
test.to_csv("CatboostOptuna.csv", columns=["SeriousDlqin2yrs"], index=False)