# Импорт библиотек

In [69]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
    recall_score, f1_score, log_loss, auc, classification_report, confusion_matrix, \
    precision_recall_curve, roc_curve

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from catboost import Pool

import warnings
import optuna

warnings.filterwarnings("ignore")
RAND=10
N_FOLDS=5
percent_of_negative_class = 0.958

# Метод для подсчёта метрик

In [3]:
def get_metrics(y_test, y_pred, y_score, name):
    df_metrics = pd.DataFrame()
    
    df_metrics['model'] = [name]
    
    df_metrics['Accuracy'] = [accuracy_score(y_test, y_pred)]
    df_metrics['ROC_AUC'] = [roc_auc_score(y_test, y_score[:,1])]
    df_metrics['Precision'] = [precision_score(y_test, y_pred)]
    df_metrics['Recall'] = [recall_score(y_test, y_pred)]
    df_metrics['f1'] = [f1_score(y_test, y_pred)]
    df_metrics['Logloss'] = [log_loss(y_test, y_score)]
    
    return df_metrics

# Подготовка данных к обучению

Выгрузим данные

In [5]:
df = pd.read_pickle('data.pickle')
df.head()

Unnamed: 0,SEMESTER,DISC_ID,TYPE_NAME,DEBT,GENDER,CITIZENSHIP,EXAM_TYPE,EXAM_SUBJECT_1,EXAM_SUBJECT_2,EXAM_SUBJECT_3,ADMITTED_EXAM_1,ADMITTED_EXAM_2,ADMITTED_EXAM_3,ADMITTED_SUBJECT_PRIZE_LEVEL,REGION_ID,mean_score
0,1,10502311854018326223,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667
1,1,1601392918367593206,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667
2,1,9559803959325174929,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667
3,1,8955667882044263414,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667
4,1,17741967398854095262,Экзамен,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667


Проведём разбиение на train и test

In [8]:
feature_cols = df.drop(columns = ['DEBT'])

X_train, X_test, y_train, y_test = train_test_split(feature_cols,
                                                    df['DEBT'],
                                                    test_size=0.33,
                                                    random_state=10)

Разобьём train на train_ и val для чтобы сформировать eval_set (для ранней остановки в бустингах)

In [17]:
X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.16,
                                                    shuffle=True,
                                                    random_state=RAND)
eval_set = [(X_val, y_val)]

# LightGBM

Найдём параметры при помощи библиотеки optuna. Сначала подберём learning_rate и n_estimators

## learning_rate и n_esimators

- n_estimators - кол-во базовых алгоритмов
- learning rate - скорость обучения

In [35]:
def objective_lgb(trial, X, y, N_FOLDS, random_state=RAND):
    lgb_params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 15000),
        "learning_rate": trial.suggest_float("Learning_rate", 0.001, 0.3),
        "random_state": trial.suggest_categorical("random_state:", [RAND]),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [percent_of_negative_class])
    }
    
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
        model = LGBMClassifier (**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric="auc",
                  early_stopping_rounds=100,
                  callbacks=[pruning_callback],
                  verbose=0)
        
        preds = model.predict_proba(X_test)
        cv_predicts[idx] = roc_auc_score(y_test, preds[:,1])

    return np.mean(cv_predicts)


study = optuna.create_study(direction="maximize", study_name="LightGBM")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND)

study.optimize(func, n_trials=20, show_progress_bar=True)

lgb_grid = LGBMClassifier(**study.best_params)
lgb_grid.fit(X_train_,
             y_train_,
             eval_metric="auc",
             eval_set=eval_set,
             verbose=2,
             early_stopping_rounds=100)

y_pred = lgb_grid.predict(X_test)
y_pred_prob = lgb_grid.predict_proba(X_test)
metrics = get_metrics(y_test, y_pred, y_pred_prob, name='LightGBM_fitted')

y_pred = lgb_grid.predict(X_train_)
y_pred_prob = lgb_grid.predict_proba(X_train_)
metrics = metrics.append(
    get_metrics(y_train_, y_pred, y_pred_prob, name='LightGBM_fitted_train'))

[32m[I 2022-09-15 14:43:12,550][0m A new study created in memory with name: LightGBM[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-09-15 14:43:20,394][0m Trial 0 finished with value: 0.8554715479113805 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17203026361943966, 'random_state:': 10, 'scale_pos_weight': 0.9528}. Best is trial 0 with value: 0.8554715479113805.[0m
[32m[I 2022-09-15 14:43:25,254][0m Trial 1 finished with value: 0.848333986959973 and parameters: {'n_estimators': 11841, 'Learning_rate': 0.2557376697839917, 'random_state:': 10, 'scale_pos_weight': 0.9528}. Best is trial 0 with value: 0.8554715479113805.[0m
[32m[I 2022-09-15 14:43:31,153][0m Trial 2 finished with value: 0.8450412192006047 and parameters: {'n_estimators': 14780, 'Learning_rate': 0.27917781444103074, 'random_state:': 10, 'scale_pos_weight': 0.9528}. Best is trial 0 with value: 0.8554715479113805.[0m
[32m[I 2022-09-15 14:43:37,292][0m Trial 3 finished with value: 0.8519560531729595 and parameters: {'n_estimators': 1146, 'Learning_rate': 0.18765711830815424, 'random_state:': 10, 'scale_pos_weight': 0.952

[162]	valid_0's auc: 0.842607	valid_0's binary_logloss: 0.149174
[164]	valid_0's auc: 0.843428	valid_0's binary_logloss: 0.148922
[166]	valid_0's auc: 0.843786	valid_0's binary_logloss: 0.148767
[168]	valid_0's auc: 0.84446	valid_0's binary_logloss: 0.148575
[170]	valid_0's auc: 0.844585	valid_0's binary_logloss: 0.148505
[172]	valid_0's auc: 0.844768	valid_0's binary_logloss: 0.148419
[174]	valid_0's auc: 0.845445	valid_0's binary_logloss: 0.148168
[176]	valid_0's auc: 0.845811	valid_0's binary_logloss: 0.148028
[178]	valid_0's auc: 0.846308	valid_0's binary_logloss: 0.147876
[180]	valid_0's auc: 0.846634	valid_0's binary_logloss: 0.147762
[182]	valid_0's auc: 0.846543	valid_0's binary_logloss: 0.147806
[184]	valid_0's auc: 0.846527	valid_0's binary_logloss: 0.147777
[186]	valid_0's auc: 0.846825	valid_0's binary_logloss: 0.147626
[188]	valid_0's auc: 0.847285	valid_0's binary_logloss: 0.147395
[190]	valid_0's auc: 0.847811	valid_0's binary_logloss: 0.147236
[192]	valid_0's auc: 0.847

[424]	valid_0's auc: 0.865505	valid_0's binary_logloss: 0.142126
[426]	valid_0's auc: 0.865799	valid_0's binary_logloss: 0.142074
[428]	valid_0's auc: 0.865927	valid_0's binary_logloss: 0.142017
[430]	valid_0's auc: 0.865969	valid_0's binary_logloss: 0.142044
[432]	valid_0's auc: 0.866087	valid_0's binary_logloss: 0.142022
[434]	valid_0's auc: 0.866287	valid_0's binary_logloss: 0.141933
[436]	valid_0's auc: 0.866196	valid_0's binary_logloss: 0.141993
[438]	valid_0's auc: 0.866151	valid_0's binary_logloss: 0.142046
[440]	valid_0's auc: 0.866579	valid_0's binary_logloss: 0.14191
[442]	valid_0's auc: 0.866442	valid_0's binary_logloss: 0.141969
[444]	valid_0's auc: 0.866486	valid_0's binary_logloss: 0.141972
[446]	valid_0's auc: 0.866847	valid_0's binary_logloss: 0.141895
[448]	valid_0's auc: 0.866879	valid_0's binary_logloss: 0.141878
[450]	valid_0's auc: 0.867231	valid_0's binary_logloss: 0.141769
[452]	valid_0's auc: 0.86721	valid_0's binary_logloss: 0.141787
[454]	valid_0's auc: 0.8675

In [37]:
study.best_params

{'n_estimators': 7329,
 'Learning_rate': 0.17203026361943966,
 'random_state:': 10,
 'scale_pos_weight': 0.9528}

In [36]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightGBM_fitted,0.957349,0.869912,0.676152,0.179626,0.283845,0.138004
0,LightGBM_fitted_train,0.98249,0.992391,0.988532,0.635827,0.773887,0.054545


Видим на этом этапе переобучение, но это не критично, т.к. мы ещё не задействовали ни одного регуляризатора. Подберём другие гиперпараметры

## num_leaves, max_depth, min_data_in_leaf, lambda_l1, lambda_l2, bagging_fraction

- max_depth - максимальная глубина базовых деревьев
- num_leaves - количество листьев в одном дереве
- min_data_in_leaf - минимальное количество объектов, которые должны попасть в узел дерева для его добавления
- lambda_l1 – коэффициент для L1 регулиризации
- lambda_l2 – коэффициент для L2 регулиризации
- bagging_fraction - указывает процент train выборок, которые будут использоваться для обучения каждого дерева

In [55]:
def objective_lgb(trial, X, y, N_FOLDS, random_state=RAND):
    lgb_params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [7329]),
        "learning_rate": trial.suggest_categorical("Learning_rate", [0.17]),
        "random_state": trial.suggest_categorical("random_state:", [RAND]),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [percent_of_negative_class]),
        "num_leaves": trial.suggest_int("num_leaves", 20, 4096),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 100000),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        
    }
    
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
        model = LGBMClassifier (**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric="auc",
                  early_stopping_rounds=100,
                  callbacks=[pruning_callback],
                  verbose=0)
        
        preds = model.predict_proba(X_test)
        cv_predicts[idx] = roc_auc_score(y_test, preds[:,1])

    return np.mean(cv_predicts)


study = optuna.create_study(direction="maximize", study_name="LightGBM")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND)

study.optimize(func, n_trials=20, show_progress_bar=True)

lgb_grid = LGBMClassifier(**study.best_params)
lgb_grid.fit(X_train_,
             y_train_,
             eval_metric="auc",
             eval_set=eval_set,
             verbose=2,
             early_stopping_rounds=70)

y_pred = lgb_grid.predict(X_test)
y_pred_prob = lgb_grid.predict_proba(X_test)
metrics = get_metrics(y_test, y_pred, y_pred_prob, name='LightGBM_fitted')

y_pred = lgb_grid.predict(X_train_)
y_pred_prob = lgb_grid.predict_proba(X_train_)
metrics = metrics.append(
    get_metrics(y_train_, y_pred, y_pred_prob, name='LightGBM_fitted_train'))

[32m[I 2022-09-15 15:16:40,686][0m A new study created in memory with name: LightGBM[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-09-15 15:16:41,630][0m Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 75, 'max_depth': 8, 'min_data_in_leaf': 68035, 'lambda_l1': 85, 'lambda_l2': 95, 'bagging_fraction': 0.4645074558216473}. Best is trial 0 with value: 0.5.[0m
[32m[I 2022-09-15 15:16:44,201][0m Trial 1 finished with value: 0.6201707025320428 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 3947, 'max_depth': 9, 'min_data_in_leaf': 35395, 'lambda_l1': 45, 'lambda_l2': 64, 'bagging_fraction': 0.4941054926528827}. Best is trial 1 with value: 0.6201707025320428.[0m
[32m[I 2022-09-15 15:16:45,269][0m Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 310, 'max_depth': 4, 'min_data_in_leaf': 73090, 'lambda_l1': 6, '

[32m[I 2022-09-15 15:16:46,206][0m Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 982, 'max_depth': 8, 'min_data_in_leaf': 99398, 'lambda_l1': 9, 'lambda_l2': 74, 'bagging_fraction': 0.3375993049329612}. Best is trial 1 with value: 0.6201707025320428.[0m
[32m[I 2022-09-15 15:17:32,656][0m Trial 4 finished with value: 0.7047859357105766 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 2961, 'max_depth': 5, 'min_data_in_leaf': 23966, 'lambda_l1': 2, 'lambda_l2': 16, 'bagging_fraction': 0.5198070005752842}. Best is trial 4 with value: 0.7047859357105766.[0m
[32m[I 2022-09-15 15:17:33,650][0m Trial 5 finished with value: 0.5 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 411, 'max_depth': 9, 'min_data_in_leaf': 55456, 'la

[32m[I 2022-09-15 15:17:34,618][0m Trial 6 finished with value: 0.5 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 1669, 'max_depth': 9, 'min_data_in_leaf': 57674, 'lambda_l1': 84, 'lambda_l2': 72, 'bagging_fraction': 0.43162928697065517}. Best is trial 4 with value: 0.7047859357105766.[0m
[32m[I 2022-09-15 15:17:35,585][0m Trial 7 finished with value: 0.5 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 2019, 'max_depth': 9, 'min_data_in_leaf': 91296, 'lambda_l1': 3, 'lambda_l2': 15, 'bagging_fraction': 0.5366149427145894}. Best is trial 4 with value: 0.7047859357105766.[0m
[32m[I 2022-09-15 15:17:36,528][0m Trial 8 finished with value: 0.5 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 182, 'max_depth': 5, 'min_data_in_leaf': 55539, 'lambda_l1': 92

[32m[I 2022-09-15 15:20:26,045][0m Trial 9 finished with value: 0.8268206845673447 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 1073, 'max_depth': 9, 'min_data_in_leaf': 15347, 'lambda_l1': 3, 'lambda_l2': 73, 'bagging_fraction': 0.673041997429236}. Best is trial 9 with value: 0.8268206845673447.[0m
[32m[I 2022-09-15 15:20:30,310][0m Trial 10 finished with value: 0.8056622382853279 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 1260, 'max_depth': 12, 'min_data_in_leaf': 1073, 'lambda_l1': 27, 'lambda_l2': 39, 'bagging_fraction': 0.9650558888513634}. Best is trial 9 with value: 0.8268206845673447.[0m
[32m[I 2022-09-15 15:20:34,924][0m Trial 11 finished with value: 0.8059604435612092 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 1220, 'max_depth':

[32m[I 2022-09-15 15:20:39,757][0m Trial 12 finished with value: 0.8125108662830133 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 'random_state:': 10, 'scale_pos_weight': 0.9528, 'num_leaves': 1101, 'max_depth': 12, 'min_data_in_leaf': 3780, 'lambda_l1': 25, 'lambda_l2': 49, 'bagging_fraction': 0.8186263937009877}. Best is trial 9 with value: 0.8268206845673447.[0m
[32m[I 2022-09-15 15:20:40,150][0m Trial 13 pruned. Trial was pruned at iteration 101.[0m
[32m[I 2022-09-15 15:20:40,541][0m Trial 14 pruned. Trial was pruned at iteration 101.[0m
[32m[I 2022-09-15 15:20:40,870][0m Trial 15 pruned. Trial was pruned at iteration 101.[0m
[32m[I 2022-09-15 15:20:41,309][0m Trial 16 pruned. Trial was pruned at iteration 101.[0m
[32m[I 2022-09-15 15:20:41,607][0m Trial 17 pruned. Trial was pruned at iteration 101.[0m
[32m[I 2022-09-15 15:20:52,791][0m Trial 18 finished with value: 0.8328108289869839 and parameters: {'n_estimators': 7329, 'Learning_rate': 0.17, 

[84]	valid_0's auc: 0.782021	valid_0's binary_logloss: 0.170807
[86]	valid_0's auc: 0.782322	valid_0's binary_logloss: 0.170701
[88]	valid_0's auc: 0.783204	valid_0's binary_logloss: 0.170468
[90]	valid_0's auc: 0.783546	valid_0's binary_logloss: 0.170328
[92]	valid_0's auc: 0.7839	valid_0's binary_logloss: 0.170208
[94]	valid_0's auc: 0.784011	valid_0's binary_logloss: 0.170137
[96]	valid_0's auc: 0.784849	valid_0's binary_logloss: 0.169945
[98]	valid_0's auc: 0.785375	valid_0's binary_logloss: 0.169809
[100]	valid_0's auc: 0.786362	valid_0's binary_logloss: 0.16958
[102]	valid_0's auc: 0.787282	valid_0's binary_logloss: 0.169376
[104]	valid_0's auc: 0.787701	valid_0's binary_logloss: 0.169267
[106]	valid_0's auc: 0.787971	valid_0's binary_logloss: 0.169187
[108]	valid_0's auc: 0.788555	valid_0's binary_logloss: 0.169035
[110]	valid_0's auc: 0.789263	valid_0's binary_logloss: 0.16888
[112]	valid_0's auc: 0.789585	valid_0's binary_logloss: 0.168792
[114]	valid_0's auc: 0.790093	valid_0

[364]	valid_0's auc: 0.819897	valid_0's binary_logloss: 0.160597
[366]	valid_0's auc: 0.820041	valid_0's binary_logloss: 0.16055
[368]	valid_0's auc: 0.820068	valid_0's binary_logloss: 0.160516
[370]	valid_0's auc: 0.820148	valid_0's binary_logloss: 0.160479
[372]	valid_0's auc: 0.820449	valid_0's binary_logloss: 0.160413
[374]	valid_0's auc: 0.820445	valid_0's binary_logloss: 0.160384
[376]	valid_0's auc: 0.820815	valid_0's binary_logloss: 0.160293
[378]	valid_0's auc: 0.820903	valid_0's binary_logloss: 0.160266
[380]	valid_0's auc: 0.820984	valid_0's binary_logloss: 0.160222
[382]	valid_0's auc: 0.821056	valid_0's binary_logloss: 0.160184
[384]	valid_0's auc: 0.821195	valid_0's binary_logloss: 0.160141
[386]	valid_0's auc: 0.821354	valid_0's binary_logloss: 0.160111
[388]	valid_0's auc: 0.82162	valid_0's binary_logloss: 0.160052
[390]	valid_0's auc: 0.821996	valid_0's binary_logloss: 0.159972
[392]	valid_0's auc: 0.822245	valid_0's binary_logloss: 0.159909
[394]	valid_0's auc: 0.8223

[618]	valid_0's auc: 0.833922	valid_0's binary_logloss: 0.155885
[620]	valid_0's auc: 0.834016	valid_0's binary_logloss: 0.155857
[622]	valid_0's auc: 0.834152	valid_0's binary_logloss: 0.155836
[624]	valid_0's auc: 0.834234	valid_0's binary_logloss: 0.155813
[626]	valid_0's auc: 0.834249	valid_0's binary_logloss: 0.155797
[628]	valid_0's auc: 0.834244	valid_0's binary_logloss: 0.155779
[630]	valid_0's auc: 0.834315	valid_0's binary_logloss: 0.155766
[632]	valid_0's auc: 0.834386	valid_0's binary_logloss: 0.155739
[634]	valid_0's auc: 0.834402	valid_0's binary_logloss: 0.15572
[636]	valid_0's auc: 0.834477	valid_0's binary_logloss: 0.155696
[638]	valid_0's auc: 0.834517	valid_0's binary_logloss: 0.155684
[640]	valid_0's auc: 0.834621	valid_0's binary_logloss: 0.155651
[642]	valid_0's auc: 0.834707	valid_0's binary_logloss: 0.155616
[644]	valid_0's auc: 0.834753	valid_0's binary_logloss: 0.155595
[646]	valid_0's auc: 0.834848	valid_0's binary_logloss: 0.15557
[648]	valid_0's auc: 0.8348

[890]	valid_0's auc: 0.842316	valid_0's binary_logloss: 0.152841
[892]	valid_0's auc: 0.84237	valid_0's binary_logloss: 0.152817
[894]	valid_0's auc: 0.842471	valid_0's binary_logloss: 0.152795
[896]	valid_0's auc: 0.84253	valid_0's binary_logloss: 0.15278
[898]	valid_0's auc: 0.842674	valid_0's binary_logloss: 0.152739
[900]	valid_0's auc: 0.842697	valid_0's binary_logloss: 0.152732
[902]	valid_0's auc: 0.842743	valid_0's binary_logloss: 0.152711
[904]	valid_0's auc: 0.842759	valid_0's binary_logloss: 0.152703
[906]	valid_0's auc: 0.842757	valid_0's binary_logloss: 0.152689
[908]	valid_0's auc: 0.842882	valid_0's binary_logloss: 0.152643
[910]	valid_0's auc: 0.842897	valid_0's binary_logloss: 0.15263
[912]	valid_0's auc: 0.842929	valid_0's binary_logloss: 0.152616
[914]	valid_0's auc: 0.84297	valid_0's binary_logloss: 0.152596
[916]	valid_0's auc: 0.84296	valid_0's binary_logloss: 0.152599
[918]	valid_0's auc: 0.843153	valid_0's binary_logloss: 0.15257
[920]	valid_0's auc: 0.843231	va

In [60]:
study.best_params

{'n_estimators': 7329,
 'Learning_rate': 0.17,
 'random_state:': 10,
 'scale_pos_weight': 0.9528,
 'num_leaves': 1480,
 'max_depth': 10,
 'min_data_in_leaf': 6735,
 'lambda_l1': 16,
 'lambda_l2': 97,
 'bagging_fraction': 0.6322273209124994}

In [61]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightGBM_fitted,0.954097,0.838937,0.684783,0.045356,0.085078,0.148219
0,LightGBM_fitted_train,0.955345,0.879878,0.837398,0.065121,0.120845,0.137544


Итак, видим, что удалось улучшить roc_auc на тестовой выборке по-сравнению с бейзлайном, а также полностью убрать эффект переобучения. Как итог, мы получили очень хороший алгоритм

## Cross_val_score

Проведём в качестве финальной проверки 5-кратную кросс-валидацию и посмотрим метрику roc_auc на каждом фолде

In [64]:
scores = []

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

cv_predicts = np.empty(N_FOLDS)

for idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_test_ = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_test_ = y_train.iloc[train_idx], y_train.iloc[test_idx]
        
    model = LGBMClassifier(**study.best_params)
    model.fit(X_train_,
            y_train_,
            eval_set=[(X_test_, y_test_)],
            eval_metric="auc",
            early_stopping_rounds=100,
            verbose=0)
        
    preds = model.predict_proba(X_test_)
    cv_predicts[idx] = roc_auc_score(y_test_, preds[:,1])

print(cv_predicts)

[0.82567778 0.83110929 0.82437418 0.83766784 0.84186028]


Видим, что на всех фолдах roc_auc примерно одинаковый

# Catboost

Проделаем те же шаги c optuna для нашего зверя Catboost

In [66]:
cat_features = ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 
            'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 
            'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID']

for col in cat_features:
    df[col] = df[col].astype('string')

## learning_rate и n_estimators

- n_estimators - кол-во базовых алгоритмов
- learning rate - скорость обучения

In [73]:
def objective_lgb(trial, X, y, N_FOLDS, random_state, cat_feat):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("Learning_rate", 0.001, 0.3),
        "cat_features":
        trial.suggest_categorical("cat_features", [cat_features]),
        "loss_function":
        trial.suggest_categorical("loss_function", ["Logloss"]),
        "use_best_model":
        trial.suggest_categorical("use_best_model", [True]),
        "eval_metric":
        trial.suggest_categorical("eval_metric", ["Logloss"]),
        "random_state":
        RAND,
        "scale_pos_weight": 
        trial.suggest_categorical("scale_pos_weight", [percent_of_negative_class])
    }
       
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_feat)
        eval_data = Pool(data=X_test, label=y_test, cat_features=cat_feat)

        model = CatBoostClassifier(**params)
        model.fit(train_data,
                  eval_set=eval_data,
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict_proba(X_test)
        cv_predicts[idx] = roc_auc_score(y_test, preds[:,1])

    return np.mean(cv_predicts)

study_cat = optuna.create_study(direction="maximize", study_name="Catboost")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND, cat_feat=cat_features)

study_cat.optimize(func, n_trials=20, show_progress_bar=True)

[32m[I 2022-09-15 19:04:47,395][0m A new study created in memory with name: Catboost[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-09-15 19:12:19,676][0m Trial 0 finished with value: 0.8651664105966101 and parameters: {'n_estimators': 579, 'Learning_rate': 0.10950343750810351, 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True, 'eval_metric': 'Logloss', 'scale_pos_weight': 0.9528}. Best is trial 0 with value: 0.8651664105966101.[0m
[32m[I 2022-09-15 19:28:25,938][0m Trial 1 finished with value: 0.89405954328465 and parameters: {'n_estimators': 633, 'Learning_rate': 0.2969979204816421, 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True, 'eval_metric': 'Logloss', 'scale_pos_weight': 0.9528}. Best is trial 1 with value: 0.89405954328465.[0m
[32m[I

[32m[I 2022-09-15 21:52:19,555][0m Trial 17 finished with value: 0.8979053297404322 and parameters: {'n_estimators': 836, 'Learning_rate': 0.24708012644731442, 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True, 'eval_metric': 'Logloss', 'scale_pos_weight': 0.9528}. Best is trial 12 with value: 0.9036729307658569.[0m
[32m[I 2022-09-15 22:04:57,898][0m Trial 18 finished with value: 0.8996076348068829 and parameters: {'n_estimators': 980, 'Learning_rate': 0.19685776323122645, 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True, 'eval_metric': 'Logloss', 'scale_pos_weight': 0.9528}. Best is trial 12 with value: 0.9036729307658569.[0

Не знаю, почему в learning_rate первая буква всегда меняется на заглавную, но Catboost это не очень одобряет, поэтому придётся поправить:)

In [97]:
best_params = dict(study_cat.best_params)
best_params['learning_rate'] = study_cat.best_params['Learning_rate']
del best_params['Learning_rate']
best_params

{'n_estimators': 950,
 'cat_features': ['DISC_ID',
  'TYPE_NAME',
  'GENDER',
  'CITIZENSHIP',
  'EXAM_TYPE',
  'EXAM_SUBJECT_1',
  'EXAM_SUBJECT_2',
  'EXAM_SUBJECT_3',
  'ADMITTED_SUBJECT_PRIZE_LEVEL',
  'REGION_ID'],
 'loss_function': 'Logloss',
 'use_best_model': True,
 'eval_metric': 'Logloss',
 'scale_pos_weight': 0.9528,
 'learning_rate': 0.28330409101534126}

In [99]:
model = CatBoostClassifier(**best_params)
model.fit(X_train,
         y_train,
         eval_set=eval_set,
         early_stopping_rounds=100,
         verbose=0)

<catboost.core.CatBoostClassifier at 0x20fb72c7b80>

In [100]:
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)
get_metrics(y_test, y_pred, y_pred_prob, name='Catboost_fitted')

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_fitted,0.963887,0.914199,0.769616,0.331893,0.463783,0.111745


In [106]:
y_pred = model.predict(X_train_)
y_pred_prob = model.predict_proba(X_train_)
get_metrics(y_train_, y_pred, y_pred_prob, name='Catboost_fitted_train')

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_fitted_train,0.97007,0.952349,0.893458,0.419851,0.571258,0.09151


Как видим, результаты просто шикарные, и этот алгоритм уже сильно бьёт затюненный LightGBM, но по roc_auc до своего бейзлайна совсем чуть-чуть недотягивает. Попробуем исправить ситуацию подбором других гиперпараметров

## max_depth, l2_leaf_reg, bootstrap_type, border_count, grow_policy, auto_class_weights, 

- max_depth - глубина дерева
- l2_leaf_reg - коэффициент при L2 регуляризации
- bootstrap_type - способ формирования бутстрэп-выборки
- grow_policy - способ построения дерева (симметричное, по глубине и т.д.)
- auto_class_weights - множитель весов объектов
- border_count - количество разбиений для числовых признаков (при выборе критерия разбиения)

In [110]:
def objective_lgb(trial, X, y, N_FOLDS, random_state, cat_feat):
    params = {
        "n_estimators":
        trial.suggest_categorical("n_estimators", [950]),
        "learning_rate":
        trial.suggest_categorical("learning_rate", [0.28]),
        "max_depth":
        trial.suggest_int("max_depth", 3, 12),
        "l2_leaf_reg":
        trial.suggest_uniform("l2_leaf_reg", 1e-5, 1e2),
        #"random_strength":
        #trial.suggest_uniform('random_strength', 10, 50),
        "bootstrap_type":
        trial.suggest_categorical("bootstrap_type",
                                  ["Bayesian", "Bernoulli", "MVS", "No"]),
        "border_count":
        trial.suggest_categorical('border_count', [128, 254]),
        "grow_policy":
        trial.suggest_categorical('grow_policy',
                                  ["SymmetricTree", "Depthwise", "Lossguide"]),
        "auto_class_weights":
        trial.suggest_categorical("auto_class_weights",
                                  ["None", "Balanced", "SqrtBalanced"]),
        #"od_wait":
        #trial.suggest_int('od_wait', 500, 2000),
        #"leaf_estimation_iterations":
        #trial.suggest_int('leaf_estimation_iterations', 1, 15),
        "cat_features":
        trial.suggest_categorical("cat_features", [cat_features]),
        "loss_function":
        trial.suggest_categorical("loss_function", ["Logloss"]),
        "use_best_model":
        trial.suggest_categorical("use_best_model", [True]),
        "eval_metric":
        trial.suggest_categorical("eval_metric", ["Logloss"]),
        "random_state":
        random_state
    }

    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample",
                                                  0.1,
                                                  1,
                                                  log=True)

    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_feat)
        eval_data = Pool(data=X_test, label=y_test, cat_features=cat_feat)

        model = CatBoostClassifier(**params)
        model.fit(train_data,
                  eval_set=eval_data,
                  early_stopping_rounds=100,
                  verbose=0)
        
        preds = model.predict_proba(X_test)
        cv_predicts[idx] = roc_auc_score(y_test, preds[:,1])

    return np.mean(cv_predicts)

In [111]:
study_cat = optuna.create_study(direction="maximize", study_name="Catboost")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND, cat_feat=cat_features)

study_cat.optimize(func, n_trials=20, show_progress_bar=True)

[32m[I 2022-09-15 23:19:43,494][0m A new study created in memory with name: Catboost[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-09-15 23:28:07,798][0m Trial 0 finished with value: 0.9201138011927812 and parameters: {'n_estimators': 950, 'learning_rate': 0.28, 'max_depth': 8, 'l2_leaf_reg': 73.83322775347744, 'bootstrap_type': 'No', 'border_count': 254, 'grow_policy': 'Depthwise', 'auto_class_weights': 'Balanced', 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True, 'eval_metric': 'Logloss'}. Best is trial 0 with value: 0.9201138011927812.[0m
[32m[I 2022-09-15 23:32:56,767][0m Trial 1 finished with value: 0.89244243183782 and parameters: {'n_estimators': 950, 'learning_rate': 0.28, 'max_depth': 10, 'l2_leaf_reg': 38.272629250341815, 'bootstrap_type': 'Bernoulli', 'border_count': 254, 'grow_policy': 'SymmetricTree', 'auto_class_weights': 'Balanced', 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_T

[32m[I 2022-09-16 03:08:54,535][0m Trial 14 finished with value: 0.9163696469999263 and parameters: {'n_estimators': 950, 'learning_rate': 0.28, 'max_depth': 9, 'l2_leaf_reg': 99.9124253335511, 'bootstrap_type': 'No', 'border_count': 254, 'grow_policy': 'Lossguide', 'auto_class_weights': 'SqrtBalanced', 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True, 'eval_metric': 'Logloss'}. Best is trial 8 with value: 0.9249237064399862.[0m
[32m[I 2022-09-16 03:18:30,650][0m Trial 15 finished with value: 0.8775600650131435 and parameters: {'n_estimators': 950, 'learning_rate': 0.28, 'max_depth': 6, 'l2_leaf_reg': 45.75059338247975, 'bootstrap_type': 'No', 'border_count': 128, 'grow_policy': 'SymmetricTree', 'auto_class_weights': 'None', 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 

In [122]:
study_cat.best_params

{'n_estimators': 950,
 'learning_rate': 0.28,
 'max_depth': 9,
 'l2_leaf_reg': 38.55993812132333,
 'bootstrap_type': 'No',
 'border_count': 254,
 'grow_policy': 'SymmetricTree',
 'auto_class_weights': 'Balanced',
 'cat_features': ['DISC_ID',
  'TYPE_NAME',
  'GENDER',
  'CITIZENSHIP',
  'EXAM_TYPE',
  'EXAM_SUBJECT_1',
  'EXAM_SUBJECT_2',
  'EXAM_SUBJECT_3',
  'ADMITTED_SUBJECT_PRIZE_LEVEL',
  'REGION_ID'],
 'loss_function': 'Logloss',
 'use_best_model': True,
 'eval_metric': 'Logloss'}

In [None]:
best_params = dict(study_cat.best_params)

In [123]:
train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)
eval_data = Pool(data=X_test, label=y_test, cat_features=cat_features)

model = CatBoostClassifier(**study_cat.best_params)
model.fit(train_data,
         eval_set=eval_data,
         early_stopping_rounds=100,
         verbose=0)

<catboost.core.CatBoostClassifier at 0x20fb725e4c0>

In [133]:
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)
metrics = metrics.append(
    get_metrics(y_test, y_pred, y_pred_prob, name='CatBoost_fitted'))

y_pred = model.predict(X_train)
y_pred_prob = model.predict_proba(X_train)
metrics = metrics.append(
    get_metrics(y_train, y_pred, y_pred_prob, name='Catboost_train'))

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightGBM_fitted,0.954097,0.838937,0.684783,0.045356,0.085078,0.148219
0,LightGBM_fitted_train,0.955345,0.879878,0.837398,0.065121,0.120845,0.137544
0,CatBoost_fitted,0.915663,0.937393,0.334437,0.800216,0.471724,0.238022
0,Catboost_train,0.926082,0.976876,0.384154,0.922361,0.542403,0.217075


Видим улучшение roc_auc на целых 2% относительно бейзлайна

## Cross_val_score

Аналогично LightGBM, сделаем проверку нашего алгоритма на кросс-валидации

In [135]:
scores = []

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

cv_predicts = np.empty(N_FOLDS)

for idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_test_ = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_test_ = y_train.iloc[train_idx], y_train.iloc[test_idx]
    
    train_data = Pool(data=X_train_, label=y_train_, cat_features=cat_features)
    eval_data = Pool(data=X_test_, label=y_test_, cat_features=cat_features)

    model = CatBoostClassifier(**study_cat.best_params)
    model.fit(train_data,
              eval_set=eval_data,
              early_stopping_rounds=100,
              verbose=0)
        
    preds = model.predict_proba(X_test_)
    cv_predicts[idx] = roc_auc_score(y_test_, preds[:,1])

print(cv_predicts)

[0.91724988 0.91887531 0.92316864 0.92941474 0.93187569]


Видим, что всё стабильно