In [1]:
import catboost
import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from optuna.integration import CatBoostPruningCallback  # type: ignore
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("./data.csv")
test = pd.read_csv("./test.csv")
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30_59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60_89DaysPastDueNotWorse,NumberOfDependents
0,0.0,0.957151,40.0,0.0,0.115002,7.863651,4.0,0.0,0.0,0.0,1.0
1,0.0,0.65818,38.0,1.0,0.081684,8.020599,2.0,1.0,0.0,0.0,0.0
2,0.0,0.907239,49.0,1.0,0.02462,11.060196,7.0,0.0,1.0,0.0,0.0
3,0.0,0.213179,74.0,0.0,0.318895,8.160804,3.0,0.0,1.0,0.0,1.0
4,0.0,0.754464,39.0,0.0,0.190571,8.160804,8.0,0.0,0.0,0.0,0.0


In [3]:
df.dropna(subset=["SeriousDlqin2yrs"], inplace=True)

In [4]:
X = df.drop(['SeriousDlqin2yrs'], axis=1)
y = df['SeriousDlqin2yrs']
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2)

In [5]:
train_x.shape, valid_x.shape

((42429, 10), (10608, 10))

In [6]:
model = CatBoostClassifier(
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
)

model.fit(
    train_x, train_y,
    eval_set=(valid_x, valid_y),
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.7813256	best: 0.7813256 (0)	total: 165ms	remaining: 2m 44s
1:	test: 0.7935624	best: 0.7935624 (1)	total: 176ms	remaining: 1m 27s
2:	test: 0.8198499	best: 0.8198499 (2)	total: 188ms	remaining: 1m 2s
3:	test: 0.8292165	best: 0.8292165 (3)	total: 199ms	remaining: 49.6s
4:	test: 0.8309019	best: 0.8309019 (4)	total: 211ms	remaining: 41.9s
5:	test: 0.8388652	best: 0.8388652 (5)	total: 222ms	remaining: 36.8s
6:	test: 0.8380468	best: 0.8388652 (5)	total: 234ms	remaining: 33.2s
7:	test: 0.8392872	best: 0.8392872 (7)	total: 245ms	remaining: 30.4s
8:	test: 0.8428033	best: 0.8428033 (8)	total: 256ms	remaining: 28.2s
9:	test: 0.8440683	best: 0.8440683 (9)	total: 268ms	remaining: 26.5s
10:	test: 0.8447683	best: 0.8447683 (10)	total: 279ms	remaining: 25.1s
11:	test: 0.8452175	best: 0.8452175 (11)	total: 290ms	remaining: 23.9s
12:	test: 0.8467641	best: 0.8467641 (12)	total: 302ms	remaining: 22.9s
13:	test: 0.8476069	best: 0.8476069 (13)	total: 313ms	remaining: 22s
14:	test: 0.8479443	best: 

<catboost.core.CatBoostClassifier at 0x11931e42080>

In [7]:
y_pred = model.predict(valid_x)
print('Accuracy:', accuracy_score(valid_y, y_pred))

Accuracy: 0.9326923076923077


In [8]:
def objective(trial: optuna.Trial) -> float:
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "26gb",
        "eval_metric": "Accuracy",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(valid_y, pred_labels)

    return accuracy

In [9]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
)
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-19 12:45:24,543][0m A new study created in memory with name: no-name-41137d3e-35ee-4cb2-8adf-85a734a3d60c[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-11-19 12:45:25,140][0m Trial 0 finished with value: 0.9288273001508296 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.022926944570934155, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.265314356250851}. Best is trial 0 with value: 0.9288273001508296.[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-11-19 12:45:27,278][0m Trial 1 finished with value: 0.9288273001508296 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.027473695821987564, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9288273001508296.[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-11-19 12:45:27,888][0m Trial 2 finished with val

Number of finished trials: 100
Best trial:
  Value: 0.9356146304675717
  Params: 
    objective: Logloss
    colsample_bylevel: 0.0930925604868408
    depth: 5
    boosting_type: Plain
    bootstrap_type: Bayesian
    bagging_temperature: 3.0579447298423603


In [10]:
full_model = CatBoostClassifier(**trial.params)
full_model.fit(X, y, verbose=0)

y_pred = full_model.predict(test)

In [13]:
test["SeriousDlqin2yrs"] = y_pred
# Create id column as required by the competition
test
test.to_csv("CatboostOptuna.csv", columns=["id", "SeriousDlqin2yrs"], index=False)