## <b>1 <span style='color:#4682B4'>|</span> Загрузка данных </b>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from catboost import CatBoostClassifier
import optuna

In [2]:
train_df = pd.read_parquet("/kaggle/input/alfa-x-finu-hack/train_data.pqt")
test_df = pd.read_parquet("/kaggle/input/alfa-x-finu-hack/test_data.pqt")
train_df.head()

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}
3,1,month_1,-0.081586,-0.09186,-0.11404,-0.08089,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
4,1,month_2,-0.094962,-0.100504,-0.119302,-0.094307,channel_code_2,city_14,city_type_0,,...,0.946066,0.43075,0.067275,0.559928,0.696576,-0.183854,0.255545,0.495419,{other},{other}


## <b>2 <span style='color:#4682B4'>|</span> Предобработка данных </b>

In [3]:
# Список категориальных признаков
cat_features = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year"
]

# Преобразование категориальных признаков в тип 'category'
for feature in cat_features:
    train_df[feature] = train_df[feature].astype("category")
    test_df[feature] = test_df[feature].astype("category")

# Подготовка данных для обучения модели
X_train = train_df.drop(["id", "date", "end_cluster"], axis=1)
y_train = train_df["end_cluster"]

# Разделение данных на обучающий и валидационный наборы
x_train, x_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Определение числовых столбцов после заполнения пропущенных значений
numerical_cols = x_train.select_dtypes(include='number').columns
    
# Используем SimpleImputer для заполнения пропущенных значений в числовых признаках средним
mean_imputer = SimpleImputer(strategy='mean') 
x_train[numerical_cols] = mean_imputer.fit_transform(x_train[numerical_cols])
x_val[numerical_cols] = mean_imputer.transform(x_val[numerical_cols])
test_df[numerical_cols] = mean_imputer.transform(test_df[numerical_cols])

# Используем SimpleImputer для заполнения пропущенных значений в категориальных признаках
simple_imputer = SimpleImputer(strategy='most_frequent') 
x_train[cat_features] = simple_imputer.fit_transform(x_train[cat_features])
x_val[cat_features] = simple_imputer.transform(x_val[cat_features])
test_df[cat_features] = simple_imputer.transform(test_df[cat_features])

# Инициализация RobustScaler
scaler = RobustScaler()

# Применение RobustScaler к числовым переменным
x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
x_val[numerical_cols] = scaler.transform(x_val[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

In [4]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

cluster_weights = pd.read_excel("/kaggle/input/alfa-x-finu-hack/cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

## <b>3 <span style='color:#4682B4'>|</span> Подбор гиперпараметров при помощи Optuna </b>

In [5]:
def objective(trial):
    params = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 0.0, 100.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 20),
    }

    model = CatBoostClassifier(iterations=5_000,
                               loss_function='MultiClass',
                               task_type='GPU',
                               **params)

    model.fit(x_train,
              y_train,
              cat_features=cat_features,
              eval_set=(x_val, y_val),
              verbose=5000,
              early_stopping_rounds=100)

    y_pred_proba = model.predict_proba(x_val)
    score = weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40) 

[I 2024-04-21 22:12:46,355] A new study created in memory with name: no-name-685c0ce5-1717-47ba-a65d-1aad31b08bbd


0:	learn: 1.5229880	test: 1.5256852	best: 1.5256852 (0)	total: 320ms	remaining: 26m 40s
bestTest = 0.643606901
bestIteration = 4225
Shrink model to first 4226 iterations.


[I 2024-04-21 22:20:18,394] Trial 0 finished with value: 0.9589545086001503 and parameters: {'depth': 8, 'learning_rate': 0.19012957765207258, 'l2_leaf_reg': 0.3009774766532844, 'bagging_temperature': 0.07862959138237768, 'border_count': 160, 'random_strength': 61.88799479804526, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.9589545086001503.


0:	learn: 1.2010112	test: 1.2069814	best: 1.2069814 (0)	total: 74.7ms	remaining: 6m 13s
bestTest = 0.8981358073
bestIteration = 16
Shrink model to first 17 iterations.


[I 2024-04-21 22:20:28,751] Trial 1 finished with value: 0.8744578808129274 and parameters: {'depth': 6, 'learning_rate': 0.40519344070742164, 'l2_leaf_reg': 0.08990483952245525, 'bagging_temperature': 0.3583561220076431, 'border_count': 89, 'random_strength': 55.92917647100746, 'min_data_in_leaf': 15}. Best is trial 0 with value: 0.9589545086001503.


0:	learn: 1.6490647	test: 1.6514234	best: 1.6514234 (0)	total: 68.1ms	remaining: 5m 40s
bestTest = 0.7759804688
bestIteration = 4071
Shrink model to first 4072 iterations.


[I 2024-04-21 22:23:17,284] Trial 2 finished with value: 0.9372249316570921 and parameters: {'depth': 4, 'learning_rate': 0.17577528160569228, 'l2_leaf_reg': 0.010107571534041985, 'bagging_temperature': 0.9921945930488872, 'border_count': 174, 'random_strength': 39.963842695100546, 'min_data_in_leaf': 6}. Best is trial 0 with value: 0.9589545086001503.


0:	learn: 1.2041809	test: 1.2097065	best: 1.2097065 (0)	total: 318ms	remaining: 26m 31s
bestTest = 0.8798060547
bestIteration = 49
Shrink model to first 50 iterations.


[I 2024-04-21 22:23:46,645] Trial 3 finished with value: 0.8856474633309064 and parameters: {'depth': 10, 'learning_rate': 0.2914438935904745, 'l2_leaf_reg': 0.004776755698174928, 'bagging_temperature': 0.9013789120144303, 'border_count': 79, 'random_strength': 82.59577855306857, 'min_data_in_leaf': 16}. Best is trial 0 with value: 0.9589545086001503.


0:	learn: 1.2264307	test: 1.2306445	best: 1.2306445 (0)	total: 75.8ms	remaining: 6m 18s
4999:	learn: 0.4649438	test: 0.7444234	best: 0.7444178 (4998)	total: 3m 7s	remaining: 0us
bestTest = 0.7444177734
bestIteration = 4998
Shrink model to first 4999 iterations.


[I 2024-04-21 22:27:01,080] Trial 4 finished with value: 0.9488308080727161 and parameters: {'depth': 5, 'learning_rate': 0.32858215379609046, 'l2_leaf_reg': 1.3496543574757078, 'bagging_temperature': 0.3018160398319255, 'border_count': 52, 'random_strength': 20.71129072886009, 'min_data_in_leaf': 12}. Best is trial 0 with value: 0.9589545086001503.


0:	learn: 1.7224059	test: 1.7246000	best: 1.7246000 (0)	total: 127ms	remaining: 10m 34s
bestTest = 0.6385609375
bestIteration = 4159
Shrink model to first 4160 iterations.


[I 2024-04-21 22:32:53,909] Trial 5 finished with value: 0.9597533547209722 and parameters: {'depth': 8, 'learning_rate': 0.1514557311605127, 'l2_leaf_reg': 0.1189871489100411, 'bagging_temperature': 0.3384476692865779, 'border_count': 33, 'random_strength': 32.520563232929604, 'min_data_in_leaf': 7}. Best is trial 5 with value: 0.9597533547209722.


0:	learn: 2.4888047	test: 2.4892469	best: 2.4892469 (0)	total: 76.7ms	remaining: 6m 23s
4999:	learn: 0.6879492	test: 0.7745021	best: 0.7745021 (4999)	total: 3m 46s	remaining: 0us
bestTest = 0.7745020833
bestIteration = 4999


[I 2024-04-21 22:36:47,908] Trial 6 finished with value: 0.9399209273329573 and parameters: {'depth': 5, 'learning_rate': 0.043498687619744035, 'l2_leaf_reg': 0.002099598502562045, 'bagging_temperature': 0.44246184255011023, 'border_count': 213, 'random_strength': 21.55719973709055, 'min_data_in_leaf': 13}. Best is trial 5 with value: 0.9597533547209722.


0:	learn: 1.2356604	test: 1.2441161	best: 1.2441161 (0)	total: 333ms	remaining: 27m 45s
bestTest = 1.244116146
bestIteration = 0
Shrink model to first 1 iterations.


[I 2024-04-21 22:37:20,235] Trial 7 finished with value: 0.7514508460236511 and parameters: {'depth': 10, 'learning_rate': 0.4610232938609242, 'l2_leaf_reg': 0.06091777929282363, 'bagging_temperature': 0.3040085665951042, 'border_count': 233, 'random_strength': 65.1330659080642, 'min_data_in_leaf': 17}. Best is trial 5 with value: 0.9597533547209722.


0:	learn: 1.1862717	test: 1.1911260	best: 1.1911260 (0)	total: 167ms	remaining: 13m 57s
4999:	learn: 0.1727938	test: 0.6268732	best: 0.6266387 (4956)	total: 8m 57s	remaining: 0us
bestTest = 0.626638737
bestIteration = 4956
Shrink model to first 4957 iterations.


[I 2024-04-21 22:46:27,534] Trial 8 finished with value: 0.9610071116607684 and parameters: {'depth': 8, 'learning_rate': 0.3378743083379042, 'l2_leaf_reg': 4.7758223043970265, 'bagging_temperature': 0.3553014932370997, 'border_count': 221, 'random_strength': 4.34468099503702, 'min_data_in_leaf': 4}. Best is trial 8 with value: 0.9610071116607684.


0:	learn: 1.1710042	test: 1.1764589	best: 1.1764589 (0)	total: 211ms	remaining: 17m 33s
bestTest = 0.8947449219
bestIteration = 27
Shrink model to first 28 iterations.


[I 2024-04-21 22:46:49,914] Trial 9 finished with value: 0.8968342911616267 and parameters: {'depth': 9, 'learning_rate': 0.34396757877739276, 'l2_leaf_reg': 0.03692064580462473, 'bagging_temperature': 0.26424565423194735, 'border_count': 116, 'random_strength': 18.752421092616633, 'min_data_in_leaf': 19}. Best is trial 8 with value: 0.9610071116607684.


0:	learn: 1.2361003	test: 1.2433182	best: 1.2433182 (0)	total: 120ms	remaining: 10m
bestTest = 1.243318229
bestIteration = 0
Shrink model to first 1 iterations.


[I 2024-04-21 22:47:02,700] Trial 10 finished with value: 0.7392548535960164 and parameters: {'depth': 7, 'learning_rate': 0.45508026876398056, 'l2_leaf_reg': 8.99142708393269, 'bagging_temperature': 0.6764139488298095, 'border_count': 254, 'random_strength': 2.6030594547121453, 'min_data_in_leaf': 2}. Best is trial 8 with value: 0.9610071116607684.


0:	learn: 2.1631305	test: 2.1642854	best: 2.1642854 (0)	total: 161ms	remaining: 13m 23s
4999:	learn: 0.4366574	test: 0.6705720	best: 0.6705720 (4999)	total: 8m 18s	remaining: 0us
bestTest = 0.6705720052
bestIteration = 4999


[I 2024-04-21 22:55:30,901] Trial 11 finished with value: 0.9583026218796311 and parameters: {'depth': 8, 'learning_rate': 0.08551636633541718, 'l2_leaf_reg': 7.910796254398781, 'bagging_temperature': 0.6745870425624277, 'border_count': 192, 'random_strength': 0.9523476436716578, 'min_data_in_leaf': 7}. Best is trial 8 with value: 0.9610071116607684.


0:	learn: 1.4103065	test: 1.4134783	best: 1.4134783 (0)	total: 126ms	remaining: 10m 28s
bestTest = 0.6351647786
bestIteration = 4178
Shrink model to first 4179 iterations.


[I 2024-04-21 23:01:16,750] Trial 12 finished with value: 0.9607720599732871 and parameters: {'depth': 8, 'learning_rate': 0.2178480339963667, 'l2_leaf_reg': 0.6401534235227949, 'bagging_temperature': 0.05959068201550266, 'border_count': 34, 'random_strength': 35.2394598548177, 'min_data_in_leaf': 7}. Best is trial 8 with value: 0.9610071116607684.


0:	learn: 1.3459340	test: 1.3492138	best: 1.3492138 (0)	total: 118ms	remaining: 9m 49s
4999:	learn: 0.2763339	test: 0.6551999	best: 0.6551999 (4999)	total: 5m 55s	remaining: 0us
bestTest = 0.6551998698
bestIteration = 4999


[I 2024-04-21 23:07:21,088] Trial 13 finished with value: 0.9576006534597816 and parameters: {'depth': 7, 'learning_rate': 0.2403546905663004, 'l2_leaf_reg': 1.2463898949127246, 'bagging_temperature': 0.025451140096602716, 'border_count': 133, 'random_strength': 93.64968891115447, 'min_data_in_leaf': 4}. Best is trial 8 with value: 0.9610071116607684.


0:	learn: 1.3204945	test: 1.3239951	best: 1.3239951 (0)	total: 209ms	remaining: 17m 26s
bestTest = 0.6125214193
bestIteration = 3578
Shrink model to first 3579 iterations.


[I 2024-04-21 23:15:21,982] Trial 14 finished with value: 0.9630019754600523 and parameters: {'depth': 9, 'learning_rate': 0.24317497601509644, 'l2_leaf_reg': 1.7949146060543817, 'bagging_temperature': 0.1252812756393336, 'border_count': 111, 'random_strength': 42.01528213631604, 'min_data_in_leaf': 9}. Best is trial 14 with value: 0.9630019754600523.


0:	learn: 1.1741909	test: 1.1796146	best: 1.1796146 (0)	total: 208ms	remaining: 17m 20s
bestTest = 0.6440710286
bestIteration = 2416
Shrink model to first 2417 iterations.


[I 2024-04-21 23:20:54,415] Trial 15 finished with value: 0.9592577877930176 and parameters: {'depth': 9, 'learning_rate': 0.37416118520556485, 'l2_leaf_reg': 2.8648570092899512, 'bagging_temperature': 0.21255889070451978, 'border_count': 118, 'random_strength': 74.7080235112025, 'min_data_in_leaf': 10}. Best is trial 14 with value: 0.9630019754600523.


0:	learn: 1.2025818	test: 1.2072891	best: 1.2072891 (0)	total: 236ms	remaining: 19m 38s
bestTest = 0.6115973958
bestIteration = 2867
Shrink model to first 2868 iterations.


[I 2024-04-21 23:29:06,330] Trial 16 finished with value: 0.962126032208716 and parameters: {'depth': 9, 'learning_rate': 0.306591971400504, 'l2_leaf_reg': 3.362999700456141, 'bagging_temperature': 0.5717321680904301, 'border_count': 196, 'random_strength': 45.04907867667126, 'min_data_in_leaf': 9}. Best is trial 14 with value: 0.9630019754600523.


0:	learn: 1.2569980	test: 1.2616751	best: 1.2616751 (0)	total: 244ms	remaining: 20m 18s
bestTest = 0.6378572917
bestIteration = 2229
Shrink model to first 2230 iterations.


[I 2024-04-21 23:35:07,526] Trial 17 finished with value: 0.9602776567967327 and parameters: {'depth': 9, 'learning_rate': 0.2677936864674162, 'l2_leaf_reg': 0.31880556913473274, 'bagging_temperature': 0.6033918513759353, 'border_count': 157, 'random_strength': 46.80343881633115, 'min_data_in_leaf': 10}. Best is trial 14 with value: 0.9630019754600523.


0:	learn: 1.8875062	test: 1.8892826	best: 1.8892826 (0)	total: 341ms	remaining: 28m 24s
bestTest = 0.5771657552
bestIteration = 4814
Shrink model to first 4815 iterations.


[I 2024-04-21 23:55:15,213] Trial 18 finished with value: 0.9658258602697232 and parameters: {'depth': 10, 'learning_rate': 0.12334169037050063, 'l2_leaf_reg': 2.384840368076266, 'bagging_temperature': 0.5176746587054086, 'border_count': 190, 'random_strength': 47.98410024336251, 'min_data_in_leaf': 10}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 2.0094674	test: 2.0111355	best: 2.0111355 (0)	total: 287ms	remaining: 23m 56s
bestTest = 0.5763470703
bestIteration = 4184
Shrink model to first 4185 iterations.


[I 2024-04-22 00:10:20,505] Trial 19 finished with value: 0.9651179129098844 and parameters: {'depth': 10, 'learning_rate': 0.10491776466873673, 'l2_leaf_reg': 1.5382148603664072, 'bagging_temperature': 0.773447156850416, 'border_count': 89, 'random_strength': 72.24274155468883, 'min_data_in_leaf': 13}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 1.9181969	test: 1.9202042	best: 1.9202042 (0)	total: 305ms	remaining: 25m 23s
bestTest = 0.5895061198
bestIteration = 3413
Shrink model to first 3414 iterations.


[I 2024-04-22 00:22:48,222] Trial 20 finished with value: 0.9640570869533495 and parameters: {'depth': 10, 'learning_rate': 0.11780314464711608, 'l2_leaf_reg': 0.509671589714356, 'bagging_temperature': 0.7841840352577605, 'border_count': 80, 'random_strength': 74.03332306107262, 'min_data_in_leaf': 14}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 1.9779092	test: 1.9797827	best: 1.9797827 (0)	total: 296ms	remaining: 24m 39s
bestTest = 0.5913780599
bestIteration = 3074
Shrink model to first 3075 iterations.


[I 2024-04-22 00:34:00,595] Trial 21 finished with value: 0.9637118399960483 and parameters: {'depth': 10, 'learning_rate': 0.10895976345347438, 'l2_leaf_reg': 0.40388267198014594, 'bagging_temperature': 0.8310122302020916, 'border_count': 78, 'random_strength': 78.7447242162462, 'min_data_in_leaf': 13}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 2.4742062	test: 2.4749297	best: 2.4749297 (0)	total: 310ms	remaining: 25m 50s
4999:	learn: 0.2581517	test: 0.5976437	best: 0.5976390 (4998)	total: 16m 24s	remaining: 0us
bestTest = 0.5976389974
bestIteration = 4998
Shrink model to first 4999 iterations.


[I 2024-04-22 00:50:40,139] Trial 22 finished with value: 0.964458596559796 and parameters: {'depth': 10, 'learning_rate': 0.043244175919371214, 'l2_leaf_reg': 0.9072776176963915, 'bagging_temperature': 0.7903796954553841, 'border_count': 65, 'random_strength': 70.2372030928223, 'min_data_in_leaf': 12}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 2.7363031	test: 2.7364911	best: 2.7364911 (0)	total: 308ms	remaining: 25m 38s
4999:	learn: 0.5751035	test: 0.7251503	best: 0.7251503 (4999)	total: 12m 44s	remaining: 0us
bestTest = 0.7251502604
bestIteration = 4999


[I 2024-04-22 01:03:37,491] Trial 23 finished with value: 0.9534668050269677 and parameters: {'depth': 10, 'learning_rate': 0.011456423291004464, 'l2_leaf_reg': 1.0763141543479195, 'bagging_temperature': 0.7626403698125876, 'border_count': 56, 'random_strength': 93.37601268781602, 'min_data_in_leaf': 12}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 2.3421771	test: 2.3432302	best: 2.3432302 (0)	total: 310ms	remaining: 25m 50s
4999:	learn: 0.1715543	test: 0.5947919	best: 0.5947919 (4999)	total: 16m 15s	remaining: 0us
bestTest = 0.5947919271
bestIteration = 4999


[I 2024-04-22 01:20:08,618] Trial 24 finished with value: 0.963685241820699 and parameters: {'depth': 10, 'learning_rate': 0.05967245514244131, 'l2_leaf_reg': 0.17837179399830513, 'bagging_temperature': 0.5142026083197858, 'border_count': 57, 'random_strength': 66.91397824019069, 'min_data_in_leaf': 18}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 1.7887286	test: 1.7908682	best: 1.7908682 (0)	total: 214ms	remaining: 17m 49s
4999:	learn: 0.1900861	test: 0.5883559	best: 0.5883559 (4999)	total: 11m 6s	remaining: 0us
bestTest = 0.5883559245
bestIteration = 4999


[I 2024-04-22 01:31:28,032] Trial 25 finished with value: 0.9641197812501987 and parameters: {'depth': 9, 'learning_rate': 0.14022370155999214, 'l2_leaf_reg': 2.6197664845546402, 'bagging_temperature': 0.9336511598591178, 'border_count': 99, 'random_strength': 55.17461694249137, 'min_data_in_leaf': 11}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 2.6308552	test: 2.6312484	best: 2.6312484 (0)	total: 339ms	remaining: 28m 14s
4999:	learn: 0.3916877	test: 0.6489944	best: 0.6489944 (4999)	total: 17m 29s	remaining: 0us
bestTest = 0.648994401
bestIteration = 4999


[I 2024-04-22 01:49:12,379] Trial 26 finished with value: 0.9607933241728185 and parameters: {'depth': 10, 'learning_rate': 0.0240657806184746, 'l2_leaf_reg': 0.7838738684741539, 'bagging_temperature': 0.7225033705651422, 'border_count': 135, 'random_strength': 87.72589068440453, 'min_data_in_leaf': 15}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 2.2286341	test: 2.2297924	best: 2.2297924 (0)	total: 247ms	remaining: 20m 35s
4999:	learn: 0.3280967	test: 0.6245602	best: 0.6245602 (4999)	total: 12m 12s	remaining: 0us
bestTest = 0.6245601563
bestIteration = 4999


[I 2024-04-22 02:01:37,216] Trial 27 finished with value: 0.9617738532006062 and parameters: {'depth': 9, 'learning_rate': 0.07598441332964345, 'l2_leaf_reg': 4.309614517248738, 'bagging_temperature': 0.8520306508122387, 'border_count': 175, 'random_strength': 69.4569247755366, 'min_data_in_leaf': 20}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 2.0577484	test: 2.0595013	best: 2.0595013 (0)	total: 320ms	remaining: 26m 38s
bestTest = 0.6213195312
bestIteration = 2769
Shrink model to first 2770 iterations.


[I 2024-04-22 02:11:06,223] Trial 28 finished with value: 0.9609047304313721 and parameters: {'depth': 10, 'learning_rate': 0.09697837726172306, 'l2_leaf_reg': 0.034247481496592264, 'bagging_temperature': 0.45638344450096874, 'border_count': 64, 'random_strength': 99.69858566380547, 'min_data_in_leaf': 9}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 1.5124227	test: 1.5150178	best: 1.5150178 (0)	total: 115ms	remaining: 9m 34s
4999:	learn: 0.2635605	test: 0.6599921	best: 0.6599921 (4999)	total: 6m 6s	remaining: 0us
bestTest = 0.6599920573
bestIteration = 4999


[I 2024-04-22 02:17:21,647] Trial 29 finished with value: 0.9558284548062441 and parameters: {'depth': 7, 'learning_rate': 0.19470771828921574, 'l2_leaf_reg': 0.21830605060613703, 'bagging_temperature': 0.6127613151495941, 'border_count': 157, 'random_strength': 51.53601270026301, 'min_data_in_leaf': 11}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 1.7504471	test: 1.7524316	best: 1.7524316 (0)	total: 139ms	remaining: 11m 35s
4999:	learn: 0.2518029	test: 0.6177489	best: 0.6177383 (4997)	total: 7m 21s	remaining: 0us
bestTest = 0.6177383464
bestIteration = 4997
Shrink model to first 4998 iterations.


[I 2024-04-22 02:24:52,946] Trial 30 finished with value: 0.9611418748079466 and parameters: {'depth': 8, 'learning_rate': 0.14714063816463285, 'l2_leaf_reg': 1.8035990318159234, 'bagging_temperature': 0.6766090497001312, 'border_count': 101, 'random_strength': 61.132002673910584, 'min_data_in_leaf': 14}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 1.8549281	test: 1.8569014	best: 1.8569014 (0)	total: 215ms	remaining: 17m 52s
4999:	learn: 0.2003018	test: 0.5900720	best: 0.5900423 (4997)	total: 11m 7s	remaining: 0us
bestTest = 0.5900422526
bestIteration = 4997
Shrink model to first 4998 iterations.


[I 2024-04-22 02:36:13,060] Trial 31 finished with value: 0.963770068447011 and parameters: {'depth': 9, 'learning_rate': 0.12948012928641062, 'l2_leaf_reg': 2.5135370393299086, 'bagging_temperature': 0.9661730183888366, 'border_count': 105, 'random_strength': 57.84749568753564, 'min_data_in_leaf': 12}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 1.6363424	test: 1.6388565	best: 1.6388565 (0)	total: 290ms	remaining: 24m 10s
bestTest = 0.570239974
bestIteration = 4064
Shrink model to first 4065 iterations.


[I 2024-04-22 02:50:56,421] Trial 32 finished with value: 0.9653161028360506 and parameters: {'depth': 10, 'learning_rate': 0.16729569591461654, 'l2_leaf_reg': 5.519268489151185, 'bagging_temperature': 0.8990579971823365, 'border_count': 88, 'random_strength': 56.817866427997316, 'min_data_in_leaf': 11}. Best is trial 18 with value: 0.9658258602697232.


0:	learn: 1.6174049	test: 1.6199918	best: 1.6199918 (0)	total: 325ms	remaining: 27m 3s
bestTest = 0.5675098958
bestIteration = 4068
Shrink model to first 4069 iterations.


[I 2024-04-22 03:05:22,635] Trial 33 finished with value: 0.9663749572145994 and parameters: {'depth': 10, 'learning_rate': 0.17202049904649439, 'l2_leaf_reg': 7.606154289229104, 'bagging_temperature': 0.8673588076824617, 'border_count': 68, 'random_strength': 61.14863444620232, 'min_data_in_leaf': 14}. Best is trial 33 with value: 0.9663749572145994.


0:	learn: 1.5977263	test: 1.5997953	best: 1.5997953 (0)	total: 83.6ms	remaining: 6m 57s
4999:	learn: 0.5099333	test: 0.7094854	best: 0.7094824 (4998)	total: 4m	remaining: 0us
bestTest = 0.7094824219
bestIteration = 4998
Shrink model to first 4999 iterations.


[I 2024-04-22 03:09:30,759] Trial 34 finished with value: 0.9524349477428404 and parameters: {'depth': 6, 'learning_rate': 0.18018449758429622, 'l2_leaf_reg': 9.477218402931038, 'bagging_temperature': 0.8591721621417886, 'border_count': 92, 'random_strength': 52.44875442765034, 'min_data_in_leaf': 16}. Best is trial 33 with value: 0.9663749572145994.


0:	learn: 1.4456892	test: 1.4488216	best: 1.4488216 (0)	total: 332ms	remaining: 27m 40s
bestTest = 0.5756539063
bestIteration = 3723
Shrink model to first 3724 iterations.


[I 2024-04-22 03:24:29,161] Trial 35 finished with value: 0.9651768110074157 and parameters: {'depth': 10, 'learning_rate': 0.20833748698188395, 'l2_leaf_reg': 5.377472468205059, 'bagging_temperature': 0.91981035127124, 'border_count': 128, 'random_strength': 63.075777835420524, 'min_data_in_leaf': 15}. Best is trial 33 with value: 0.9663749572145994.


0:	learn: 1.4788945	test: 1.4818815	best: 1.4818815 (0)	total: 67.7ms	remaining: 5m 38s
4999:	learn: 0.6749012	test: 0.7747540	best: 0.7747540 (4999)	total: 3m 8s	remaining: 0us
bestTest = 0.7747540365
bestIteration = 4999


[I 2024-04-22 03:27:45,191] Trial 36 finished with value: 0.9394800802543154 and parameters: {'depth': 4, 'learning_rate': 0.21627241779775672, 'l2_leaf_reg': 6.8166358570705095, 'bagging_temperature': 0.9764155643142192, 'border_count': 130, 'random_strength': 35.33955948363166, 'min_data_in_leaf': 15}. Best is trial 33 with value: 0.9663749572145994.


0:	learn: 1.6174578	test: 1.6200875	best: 1.6200875 (0)	total: 310ms	remaining: 25m 52s
bestTest = 0.5761363932
bestIteration = 4083
Shrink model to first 4084 iterations.


[I 2024-04-22 03:44:52,985] Trial 37 finished with value: 0.9655730230225981 and parameters: {'depth': 10, 'learning_rate': 0.17105062837714524, 'l2_leaf_reg': 5.770445248824186, 'bagging_temperature': 0.907832755354328, 'border_count': 174, 'random_strength': 60.219846868632494, 'min_data_in_leaf': 17}. Best is trial 33 with value: 0.9663749572145994.


0:	learn: 1.6442775	test: 1.6468396	best: 1.6468396 (0)	total: 259ms	remaining: 21m 34s
4999:	learn: 0.1946618	test: 0.5888861	best: 0.5888787 (4997)	total: 13m 21s	remaining: 0us
bestTest = 0.5888787109
bestIteration = 4997
Shrink model to first 4998 iterations.


[I 2024-04-22 03:58:27,076] Trial 38 finished with value: 0.9641045412588201 and parameters: {'depth': 9, 'learning_rate': 0.16675840964908029, 'l2_leaf_reg': 4.9193893138446905, 'bagging_temperature': 0.8701950569735756, 'border_count': 176, 'random_strength': 29.17881757032693, 'min_data_in_leaf': 17}. Best is trial 33 with value: 0.9663749572145994.


0:	learn: 1.2735668	test: 1.2776404	best: 1.2776404 (0)	total: 355ms	remaining: 29m 33s
bestTest = 0.5878391927
bestIteration = 2601
Shrink model to first 2602 iterations.


[I 2024-04-22 04:10:19,781] Trial 39 finished with value: 0.9638356220773353 and parameters: {'depth': 10, 'learning_rate': 0.2617675879399603, 'l2_leaf_reg': 3.9112810480603915, 'bagging_temperature': 0.8915867007503682, 'border_count': 194, 'random_strength': 58.862655482076725, 'min_data_in_leaf': 17}. Best is trial 33 with value: 0.9663749572145994.


In [6]:
print("Количество завершенных испытаний: {}".format(len(study.trials)))

trial = study.best_trial
print(f"Лучшее значение: {trial.value}\n")

print("Гиперпараметры:")
for key, value in trial.params.items():
    print("   {}: {}".format(key, value))

Количество завершенных испытаний: 40
Лучшее значение: 0.9663749572145994

Гиперпараметры:
   depth: 10
   learning_rate: 0.17202049904649439
   l2_leaf_reg: 7.606154289229104
   bagging_temperature: 0.8673588076824617
   border_count: 68
   random_strength: 61.14863444620232
   min_data_in_leaf: 14


## <b>4 <span style='color:#4682B4'>|</span> Обучение модели </b>

In [7]:
best_params = study.best_params  

model = CatBoostClassifier(iterations=10_000,
                           loss_function='MultiClass',
                           task_type='GPU',
                           **best_params)


model.fit(x_train,
          y_train,
          cat_features=cat_features,
          eval_set=(x_val, y_val),
          verbose=1000,
          early_stopping_rounds=100)

0:	learn: 1.6174049	test: 1.6199918	best: 1.6199918 (0)	total: 303ms	remaining: 50m 33s
1000:	learn: 0.4475923	test: 0.6746224	best: 0.6746224 (1000)	total: 3m 5s	remaining: 27m 48s
2000:	learn: 0.2888432	test: 0.6080876	best: 0.6080876 (2000)	total: 6m 32s	remaining: 26m 9s
3000:	learn: 0.2125650	test: 0.5809010	best: 0.5809010 (3000)	total: 10m 4s	remaining: 23m 30s
4000:	learn: 0.1714029	test: 0.5723939	best: 0.5723563 (3997)	total: 13m 36s	remaining: 20m 24s
bestTest = 0.5717523437
bestIteration = 4392
Shrink model to first 4393 iterations.


<catboost.core.CatBoostClassifier at 0x7bbaca295840>

In [8]:
y_pred_proba = model.predict_proba(x_val)
weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

0.9662047594042886

## <b>5 <span style='color:#4682B4'>|</span> Submission </b>

In [9]:
sample_submission_df = pd.read_csv('/kaggle/input/alfa-x-finu-hack/sample_submission.csv')

last_m_test_df = test_df[test_df["date"] == "month_6"]
last_m_test_df = last_m_test_df.drop(['id', 'date'], axis=1)

test_pred_proba = model.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=model.classes_)
test_pred_proba_df.head()

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.000215,0.001067,0.0009286726,0.004192,0.001342531,1.180768e-06,2.264495e-05,4.163266e-06,0.000424,0.000283,0.0003308486,6.741641e-06,1.1e-05,2.068987e-07,9.8e-05,0.991073,2.028584e-07
1,2.8e-05,0.412001,7.896603e-07,1.1e-05,1.030863e-06,1.14284e-07,8.608535e-08,3.005169e-08,4e-06,0.000182,6.242754e-07,6.902805e-07,2e-06,4.016607e-09,2.1e-05,0.587745,1.743911e-06
2,0.001142,0.000687,0.000313124,0.00542,0.0004973045,4.888271e-06,7.468658e-06,1.098751e-05,0.000136,0.000218,0.0001414612,5.835006e-05,8e-06,3.026572e-07,0.000301,0.991054,4.667873e-07
3,0.011543,0.435397,1.997292e-05,0.000297,5.612789e-07,6.373468e-05,5.668058e-06,4.851931e-07,5e-06,0.226342,0.001223628,3.75797e-07,0.000148,7.759173e-08,3.1e-05,0.324922,2.805962e-07
4,0.003851,0.010537,0.005868169,0.008129,6.173457e-05,3.011136e-06,2.713406e-06,1.136141e-06,0.000122,0.043845,0.001337683,3.104938e-06,0.000118,5.638335e-07,2e-06,0.926116,6.051511e-07


In [10]:
sample_submission_df[model.classes_] = test_pred_proba_df
sample_submission_df.to_csv("sub.csv", index=False)