In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Импорт библиотек

In [None]:
!pip install xgboost
!pip install lightgbm
!pip install catboost
!pip install optuna


import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
    recall_score, f1_score, log_loss, auc, classification_report, confusion_matrix, \
    precision_recall_curve, roc_curve

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from catboost import Pool

import warnings
import optuna

warnings.filterwarnings("ignore")
RAND=10
N_FOLDS=5
percent_of_negative_class = 0.97

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.2-py3-none-any.whl (348 kB)
[K     |████████████████████████████████| 348 kB 4.0 MB/s 
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 7.0 MB/s 
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 62.7 MB/s 
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting Mako
  Downloading Mako-1.2.2-py3-

# Метод для подсчёта метрик

In [None]:
def get_metrics(y_test, y_pred, y_score, name):
    df_metrics = pd.DataFrame()
    
    df_metrics['model'] = [name]
    
    df_metrics['Accuracy'] = [accuracy_score(y_test, y_pred)]
    df_metrics['ROC_AUC'] = [roc_auc_score(y_test, y_score[:,1])]
    df_metrics['Precision'] = [precision_score(y_test, y_pred)]
    df_metrics['Recall'] = [recall_score(y_test, y_pred)]
    df_metrics['f1'] = [f1_score(y_test, y_pred)]
    df_metrics['Logloss'] = [log_loss(y_test, y_score)]
    
    return df_metrics

# Подготовка данных к обучению

Выгрузим данные

In [None]:
df = pd.read_parquet('/content/drive/MyDrive/data_test3.parquet.gzip')
df.head()

Unnamed: 0,SEMESTER,DISC_ID,TYPE_NAME,DEBT,GENDER,CITIZENSHIP,EXAM_TYPE,EXAM_SUBJECT_1,EXAM_SUBJECT_2,EXAM_SUBJECT_3,...,DISC_DEP_12779834774062657273,DISC_DEP_12795149246808839444,DISC_DEP_12866670834530293829,DISC_DEP_12896073176567118977,DISC_DEP_13705271043836613455,DISC_DEP_16131140458546037814,DISC_DEP_16828277449727897492,DISC_DEP_17522523368314118110,DISC_DEP_18446744073709551615,mean_score
0,1,10502311854018326223,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,...,0,0,0,0,0,0,0,0,0,82.666667
1,1,1601392918367593206,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,...,0,0,0,0,0,0,0,0,0,82.666667
2,1,9559803959325174929,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,...,0,0,0,0,0,0,0,0,0,82.666667
3,1,8955667882044263414,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,...,0,0,0,0,0,0,0,0,0,82.666667
4,1,17741967398854095262,Экзамен,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,...,0,0,0,0,0,0,0,0,0,82.666667


Проведём разбиение на train и test

In [None]:
feature_cols = df.drop(columns = ['DEBT'])

X_train, X_test, y_train, y_test = train_test_split(feature_cols,
                                                    df['DEBT'],
                                                    test_size=0.16,
                                                    random_state=10)

In [None]:
X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.16,
                                                    shuffle=True,
                                                    random_state=RAND)
eval_set = [(X_val, y_val)]

# LightGBM

Найдём параметры при помощи библиотеки optuna. Сначала подберём learning_rate и n_estimators

## learning_rate и n_esimators

- n_estimators - кол-во базовых алгоритмов
- learning rate - скорость обучения

In [None]:
def f1_metric(labels, scores):
    pred = np.round(scores)
    return 'f1', f1_score(labels, pred), True

In [None]:
def objective_lgb(trial, X, y, N_FOLDS, random_state=RAND):
    lgb_params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 15000),
        "learning_rate": trial.suggest_float("Learning_rate", 0.001, 0.3),
        "random_state": trial.suggest_categorical("random_state:", [RAND]),
        "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", [percent_of_negative_class])
    }
    
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        model = LGBMClassifier (**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric=f1_metric,
                  early_stopping_rounds=100,
                  verbose=0)
        
        preds = model.predict(X_test)
        cv_predicts[idx] = f1_score(y_test, preds)

    return np.mean(cv_predicts)


study = optuna.create_study(direction="maximize", study_name="LightGBM")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND)

study.optimize(func, n_trials=20, show_progress_bar=True)

lgb_grid = LGBMClassifier(**study.best_params)
lgb_grid.fit(X_train_,
             y_train_,
             eval_metric=f1_metric,
             eval_set=eval_set,
             verbose=2,
             early_stopping_rounds=100)

y_pred = lgb_grid.predict(X_test)
y_pred_prob = lgb_grid.predict_proba(X_test)
metrics = get_metrics(y_test, y_pred, y_pred_prob, name='LightGBM_fitted')

y_pred = lgb_grid.predict(X_train_)
y_pred_prob = lgb_grid.predict_proba(X_train_)
metrics = metrics.append(
    get_metrics(y_train_, y_pred, y_pred_prob, name='LightGBM_fitted_train'))

[32m[I 2022-09-19 17:18:38,687][0m A new study created in memory with name: LightGBM[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-09-19 17:21:29,264][0m Trial 0 finished with value: 0.5870221115000639 and parameters: {'n_estimators': 12199, 'Learning_rate': 0.14423120579262796, 'random_state:': 10, 'scale_pos_weight': 0.958}. Best is trial 0 with value: 0.5870221115000639.[0m
[32m[I 2022-09-19 17:23:19,255][0m Trial 1 finished with value: 0.43172754283279857 and parameters: {'n_estimators': 629, 'Learning_rate': 0.06157457380318693, 'random_state:': 10, 'scale_pos_weight': 0.958}. Best is trial 0 with value: 0.5870221115000639.[0m
[32m[I 2022-09-19 17:24:47,535][0m Trial 2 finished with value: 0.5176173895975555 and parameters: {'n_estimators': 4425, 'Learning_rate': 0.2130317687453528, 'random_state:': 10, 'scale_pos_weight': 0.958}. Best is trial 0 with value: 0.5870221115000639.[0m
[32m[I 2022-09-19 17:30:47,367][0m Trial 3 finished with value: 0.5281394818570204 and parameters: {'n_estimators': 4332, 'Learning_rate': 0.03968547086562551, 'random_state:': 10, 'scale_pos_weight': 0.958}. 

In [None]:
study.best_params

{'n_estimators': 12199,
 'Learning_rate': 0.14423120579262796,
 'random_state:': 10,
 'scale_pos_weight': 0.958}

In [None]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightGBM_fitted,0.982482,0.972816,0.845805,0.475765,0.60898,0.051982
0,LightGBM_fitted_train,0.995422,0.999678,0.995511,0.85232,0.918367,0.018796


Видим на этом этапе переобучение, но это не критично, т.к. мы ещё не задействовали ни одного регуляризатора. Подберём другие гиперпараметры

## num_leaves, max_bin, bagging_fraction, feature fraction

- max_bin - целое число, представляющее максимальное количество сегментов. Чем он больше, тем медленнее идёт обучение, но тем большей точности можно достичь
- num_leaves - количество листьев в одном дереве
- bagging_fraction - указывает процент train выборок, которые будут использоваться для обучения каждого дерева
- feature fraction - процент признаков для выборки при обучении каждого дерева

In [None]:
def objective_lgb(trial, X, y, N_FOLDS, random_state=RAND):
    lgb_params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [12199]),
        "learning_rate": trial.suggest_categorical("Learning_rate", [0.144]),
        "random_state": trial.suggest_categorical("random_state:", [RAND]),
         "is_unbalance": [True],
        "num_leaves": trial.suggest_int("num_leaves", 20, 1000, step=20),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.99),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.99),
    }
    
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        model = LGBMClassifier (**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric=f1_metric,
                  early_stopping_rounds=100,
                  verbose=0)
        
        preds = model.predict(X_test)
        cv_predicts[idx] = f1_score(y_test, preds)

    return np.mean(cv_predicts)


study = optuna.create_study(direction="maximize", study_name="LightGBM")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND)

study.optimize(func, n_trials=20, show_progress_bar=True)

lgb_grid = LGBMClassifier(**study.best_params)
lgb_grid.fit(X_train_,
             y_train_,
             eval_metric=f1_metric,
             eval_set=eval_set,
             verbose=2,
             early_stopping_rounds=100)

y_pred = lgb_grid.predict(X_test)
y_pred_prob = lgb_grid.predict_proba(X_test)
metrics = get_metrics(y_test, y_pred, y_pred_prob, name='LightGBM_fitted')

y_pred = lgb_grid.predict(X_train_)
y_pred_prob = lgb_grid.predict_proba(X_train_)
metrics = metrics.append(
    get_metrics(y_train_, y_pred, y_pred_prob, name='LightGBM_fitted_train'))

[32m[I 2022-09-19 22:01:59,401][0m A new study created in memory with name: LightGBM[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-09-19 22:03:57,225][0m Trial 0 finished with value: 0.6465295717014294 and parameters: {'n_estimators': 12199, 'Learning_rate': 0.144, 'random_state:': 10, 'num_leaves': 380, 'max_bin': 250, 'bagging_fraction': 0.6502877834167342, 'feature_fraction': 0.3013412241038426}. Best is trial 0 with value: 0.6465295717014294.[0m
[32m[I 2022-09-19 22:06:04,935][0m Trial 1 finished with value: 0.649423814850288 and parameters: {'n_estimators': 12199, 'Learning_rate': 0.144, 'random_state:': 10, 'num_leaves': 740, 'max_bin': 272, 'bagging_fraction': 0.8002853904493774, 'feature_fraction': 0.574694714651313}. Best is trial 1 with value: 0.649423814850288.[0m
[32m[I 2022-09-19 22:08:05,961][0m Trial 2 finished with value: 0.6342546634722706 and parameters: {'n_estimators': 12199, 'Learning_rate': 0.144, 'random_state:': 10, 'num_leaves': 440, 'max_bin': 229, 'bagging_fraction': 0.26080065650548656, 'feature_fraction': 0.27057412709476675}. Best is trial 1 with value: 0.649423814

In [None]:
study.best_params

{'n_estimators': 12199,
 'Learning_rate': 0.144,
 'random_state:': 10,
 'num_leaves': 140,
 'max_bin': 267,
 'bagging_fraction': 0.6859821983680845,
 'feature_fraction': 0.6530213206077305}

In [None]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightGBM_fitted,0.982116,0.975209,0.855422,0.452806,0.59216,0.050419
0,LightGBM_fitted_train,0.997893,0.999899,0.997067,0.933022,0.963982,0.01423


Итак, видим, что всё ещё присутствует переобучение. Попробуем уменьшить early_stopping_rounds в 50 раз (выявлено методом подбора)

In [None]:
lgb_grid = LGBMClassifier(**study.best_params)
lgb_grid.fit(X_train_,
             y_train_,
             eval_metric=f1_metric,
             eval_set=eval_set,
             verbose=2,
             early_stopping_rounds=2)

Training until validation scores don't improve for 2 rounds.
[2]	valid_0's binary_logloss: 0.111848	valid_0's f1: 0.0620384
[4]	valid_0's binary_logloss: 0.105528	valid_0's f1: 0.128205
[6]	valid_0's binary_logloss: 0.10069	valid_0's f1: 0.146273
[8]	valid_0's binary_logloss: 0.0969998	valid_0's f1: 0.158996
[10]	valid_0's binary_logloss: 0.0940353	valid_0's f1: 0.173315
[12]	valid_0's binary_logloss: 0.091444	valid_0's f1: 0.197564
[14]	valid_0's binary_logloss: 0.0891858	valid_0's f1: 0.223702
[16]	valid_0's binary_logloss: 0.0876418	valid_0's f1: 0.229855
[18]	valid_0's binary_logloss: 0.0859439	valid_0's f1: 0.24147
[20]	valid_0's binary_logloss: 0.0847735	valid_0's f1: 0.247396
[22]	valid_0's binary_logloss: 0.0835359	valid_0's f1: 0.268886
[24]	valid_0's binary_logloss: 0.0819851	valid_0's f1: 0.277707
[26]	valid_0's binary_logloss: 0.0809893	valid_0's f1: 0.284264
[28]	valid_0's binary_logloss: 0.0798609	valid_0's f1: 0.29256
[30]	valid_0's binary_logloss: 0.078694	valid_0's f1:

LGBMClassifier(Learning_rate=0.144, bagging_fraction=0.6859821983680845,
               feature_fraction=0.6530213206077305, max_bin=267,
               n_estimators=12199, num_leaves=140, random_state:=10)

In [None]:
y_pred = lgb_grid.predict(X_test)
y_pred_prob = lgb_grid.predict_proba(X_test)
metrics = get_metrics(y_test, y_pred, y_pred_prob, name='LightGBM_fitted')

y_pred = lgb_grid.predict(X_train_)
y_pred_prob = lgb_grid.predict_proba(X_train_)
metrics = metrics.append(
    get_metrics(y_train_, y_pred, y_pred_prob, name='LightGBM_fitted_train'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightGBM_fitted,0.975899,0.942701,0.898089,0.179847,0.299681,0.077287
0,LightGBM_fitted_train,0.97682,0.972435,0.957883,0.243481,0.388269,0.06728


Видим, что ситуация стала значительно лучше. По-сравнению с бейзлайном все метрики поднялись, в том числе целевая метрика f1. При этом удалось невелировать эффект переобучения при помощи подбора early_stopping. Таким образом, мы получили вполне боевой алгоритм, который можно использовать

## Cross_val_score

Проведём в качестве финальной проверки 5-кратную кросс-валидацию и посмотрим метрику f1 на каждом фолде

In [None]:
scores = []

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

cv_predicts = np.empty(N_FOLDS)

for idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_test_ = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_test_ = y_train.iloc[train_idx], y_train.iloc[test_idx]
        
    model = LGBMClassifier(**study.best_params)
    model.fit(X_train_,
            y_train_,
            eval_set=[(X_test_, y_test_)],
            eval_metric=f1_metric,
            early_stopping_rounds=2,
            verbose=0)
        
    preds = model.predict(X_test_)
    cv_predicts[idx] = f1_score(y_test_, preds)

print(cv_predicts)

[0.37098255 0.35576037 0.33146067 0.37408759 0.42040457]


Есть небольшой разброс метрик, но всё в рамках разумного. Видим, что нет провала на каком-нибудь одном конкретном фолде, и стабильно мы достигаем 30+% по метрике f1

# Catboost

Проделаем те же шаги c optuna для нашего зверя Catboost

In [None]:
cat_features = ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 
            'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 
            'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID']

## learning_rate и n_estimators

- n_estimators - кол-во базовых алгоритмов
- learning rate - скорость обучения

In [None]:
def objective_lgb(trial, X, y, N_FOLDS, random_state, cat_feat):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("Learning_rate", 0.001, 0.3),
        "cat_features":
        trial.suggest_categorical("cat_features", [cat_features]),
        "loss_function":
        trial.suggest_categorical("loss_function", ["Logloss"]),
        "use_best_model":
        trial.suggest_categorical("use_best_model", [True]),
        "random_state":
        RAND,
        "scale_pos_weight": 
        trial.suggest_categorical("scale_pos_weight", [percent_of_negative_class])
    }
       
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_feat)
        eval_data = Pool(data=X_test, label=y_test, cat_features=cat_feat)

        model = CatBoostClassifier(**params)
        model.fit(train_data,
                  eval_set=eval_data,
                  early_stopping_rounds=100,
                  verbose=0)

        preds = model.predict(X_test)
        cv_predicts[idx] = f1_score(y_test, preds)

    return np.mean(cv_predicts)

study_cat = optuna.create_study(direction="maximize", study_name="Catboost")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND, cat_feat=cat_features)

study_cat.optimize(func, n_trials=20, show_progress_bar=True)

[32m[I 2022-09-20 01:08:07,555][0m A new study created in memory with name: Catboost[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-09-20 01:21:50,110][0m Trial 0 finished with value: 0.4057519886566909 and parameters: {'n_estimators': 913, 'Learning_rate': 0.11536329790508669, 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True, 'scale_pos_weight': 0.958}. Best is trial 0 with value: 0.4057519886566909.[0m
[32m[I 2022-09-20 01:34:08,357][0m Trial 1 finished with value: 0.390078730097345 and parameters: {'n_estimators': 831, 'Learning_rate': 0.10134131011779682, 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True, 'scale_pos_weight': 0.958}. Best is trial 0 with value: 0.4057519886566909.[0m
[32m[I 2022-09-20 01:40:26,348][0m Trial 2 finished wit

Не знаю, почему в learning_rate первая буква всегда меняется на заглавную, но Catboost это не очень одобряет, поэтому придётся поправить:)

In [None]:
best_params = dict(study_cat.best_params)
best_params['learning_rate'] = study_cat.best_params['Learning_rate']
del best_params['Learning_rate']
best_params

{'n_estimators': 826,
 'cat_features': ['DISC_ID',
  'TYPE_NAME',
  'GENDER',
  'CITIZENSHIP',
  'EXAM_TYPE',
  'EXAM_SUBJECT_1',
  'EXAM_SUBJECT_2',
  'EXAM_SUBJECT_3',
  'ADMITTED_SUBJECT_PRIZE_LEVEL',
  'REGION_ID'],
 'loss_function': 'Logloss',
 'use_best_model': True,
 'scale_pos_weight': 0.958,
 'learning_rate': 0.29917203660796443}

In [None]:
model = CatBoostClassifier(**best_params)
model.fit(X_train_,
         y_train_,
         eval_set=eval_set,
         early_stopping_rounds=100,
         verbose=0)

<catboost.core.CatBoostClassifier at 0x7fb13d5a6c10>

In [None]:
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)
get_metrics(y_test, y_pred, y_pred_prob, name='Catboost_fitted')

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_fitted,0.979593,0.941725,0.844512,0.353316,0.498201,0.0691


In [None]:
y_pred = model.predict(X_train_)
y_pred_prob = model.predict_proba(X_train_)
get_metrics(y_train_, y_pred, y_pred_prob, name='Catboost_fitted_train')

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_fitted_train,0.983082,0.975378,0.945031,0.467197,0.625276,0.053517


Как видим, результаты очень хорошие. Я бы сказал, что этот алгоритм уже бьёт бейзлайн, так как он лучше по всем целевым метрикам

## max_depth, l2_leaf_reg, bootstrap_type, border_count, grow_policy, auto_class_weights, 

- max_depth - глубина дерева
- l2_leaf_reg - коэффициент при L2 регуляризации
- bootstrap_type - способ формирования бутстрэп-выборки
- grow_policy - способ построения дерева (симметричное, по глубине и т.д.)
- auto_class_weights - множитель весов объектов
- border_count - количество разбиений для числовых признаков (при выборе критерия разбиения)

In [None]:
def objective_lgb(trial, X, y, N_FOLDS, random_state, cat_feat):
    params = {
        "n_estimators":
        trial.suggest_categorical("n_estimators", [826]),
        "learning_rate":
        trial.suggest_categorical("learning_rate", [0.299]),
        "max_depth":
        trial.suggest_int("max_depth", 3, 12),
        "l2_leaf_reg":
        trial.suggest_uniform("l2_leaf_reg", 1e-5, 1e2),
        "bootstrap_type":
        trial.suggest_categorical("bootstrap_type",
                                  ["Bayesian", "Bernoulli", "MVS", "No"]),
        "border_count":
        trial.suggest_categorical('border_count', [128, 254]),
        "grow_policy":
        trial.suggest_categorical('grow_policy',
                                  ["SymmetricTree", "Depthwise", "Lossguide"]),
        "auto_class_weights":
        trial.suggest_categorical("auto_class_weights",
                                  ["None", "Balanced", "SqrtBalanced"]),
        
        "cat_features":
        trial.suggest_categorical("cat_features", [cat_features]),
        "loss_function":
        trial.suggest_categorical("loss_function", ["Logloss"]),
        "use_best_model":
        trial.suggest_categorical("use_best_model", [True]),
        "random_state":
        RAND
    }

    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample",
                                                  0.1,
                                                  1,
                                                  log=True)

    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_feat)
        eval_data = Pool(data=X_test, label=y_test, cat_features=cat_feat)

        model = CatBoostClassifier(**params)
        model.fit(train_data,
                  eval_set=eval_data,
                  early_stopping_rounds=100,
                  verbose=0)
        
        preds = model.predict(X_test)
        cv_predicts[idx] = f1_score(y_test, preds)

    return np.mean(cv_predicts)

In [None]:
study_cat = optuna.create_study(direction="maximize", study_name="Catboost")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLDS, random_state=RAND, cat_feat=cat_features)

study_cat.optimize(func, n_trials=20, show_progress_bar=True)

[32m[I 2022-09-20 04:56:01,061][0m A new study created in memory with name: Catboost[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-09-20 05:04:06,694][0m Trial 0 finished with value: 0.4884553399396626 and parameters: {'n_estimators': 826, 'learning_rate': 0.299, 'max_depth': 5, 'l2_leaf_reg': 35.46225640264227, 'bootstrap_type': 'MVS', 'border_count': 128, 'grow_policy': 'Lossguide', 'auto_class_weights': 'SqrtBalanced', 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2', 'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID'], 'loss_function': 'Logloss', 'use_best_model': True}. Best is trial 0 with value: 0.4884553399396626.[0m
[32m[I 2022-09-20 05:08:54,608][0m Trial 1 finished with value: 0.36831278076912677 and parameters: {'n_estimators': 826, 'learning_rate': 0.299, 'max_depth': 11, 'l2_leaf_reg': 19.25411378677244, 'bootstrap_type': 'No', 'border_count': 128, 'grow_policy': 'Depthwise', 'auto_class_weights': 'Balanced', 'cat_features': ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP', 'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXA

In [None]:
study_cat.best_params

{'n_estimators': 826,
 'learning_rate': 0.299,
 'max_depth': 9,
 'l2_leaf_reg': 41.82118127520995,
 'bootstrap_type': 'Bernoulli',
 'border_count': 254,
 'grow_policy': 'SymmetricTree',
 'auto_class_weights': 'SqrtBalanced',
 'cat_features': ['DISC_ID',
  'TYPE_NAME',
  'GENDER',
  'CITIZENSHIP',
  'EXAM_TYPE',
  'EXAM_SUBJECT_1',
  'EXAM_SUBJECT_2',
  'EXAM_SUBJECT_3',
  'ADMITTED_SUBJECT_PRIZE_LEVEL',
  'REGION_ID'],
 'loss_function': 'Logloss',
 'use_best_model': True,
 'subsample': 0.6150031490281568}

In [None]:
best_params = dict(study_cat.best_params)

In [None]:
train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)
eval_data = Pool(data=X_test, label=y_test, cat_features=cat_features)

model = CatBoostClassifier(**study_cat.best_params)
model.fit(train_data,
         eval_set=eval_data,
         early_stopping_rounds=100,
         verbose=0)

<catboost.core.CatBoostClassifier at 0x7fb13aead9d0>

In [None]:
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)
metrics = get_metrics(y_test, y_pred, y_pred_prob, name='CatBoost_fitted')

y_pred = model.predict(X_train_)
y_pred_prob = model.predict_proba(X_train_)
metrics = metrics.append(
    get_metrics(y_train_, y_pred, y_pred_prob, name='Catboost_train'))

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,CatBoost_fitted,0.98047,0.97331,0.660668,0.655612,0.658131,0.067243
0,Catboost_train,0.989401,0.995939,0.794667,0.875377,0.833072,0.050101


Видим улучшение метрик, но f1 на train почти на 20% больше, чем на test. Как и для LightGBM, попробуем при обучении уменьшить early_stopping

In [None]:
train_data = Pool(data=X_train, label=y_train, cat_features=cat_features)
eval_data = Pool(data=X_test, label=y_test, cat_features=cat_features)

model = CatBoostClassifier(**study_cat.best_params)
model.fit(train_data,
         eval_set=eval_data,
         early_stopping_rounds=5,
         verbose=0)

<catboost.core.CatBoostClassifier at 0x7fb13b164450>

In [None]:
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)
metrics = get_metrics(y_test, y_pred, y_pred_prob, name='CatBoost_fitted')

y_pred = model.predict(X_train_)
y_pred_prob = model.predict_proba(X_train_)
metrics = metrics.append(
    get_metrics(y_train_, y_pred, y_pred_prob, name='Catboost_train'))

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,CatBoost_fitted,0.976484,0.956557,0.590734,0.585459,0.588085,0.096605
0,Catboost_train,0.980751,0.981317,0.673309,0.704914,0.688749,0.086873


Видим, что стало получше. Мы немного просели по метрикам, но смогли невелировать эффект переобучения. Как итог, этот алгоритм однозначно бьёт свой бейзлайн, а также затюненный LightGBM. 

## Cross_val_score

Аналогично LightGBM, сделаем проверку нашего алгоритма на кросс-валидации

In [None]:
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
cv_predicts = np.empty(N_FOLDS)

for idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_test_ = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_test_ = y_train.iloc[train_idx], y_train.iloc[test_idx]
    
    train_data = Pool(data=X_train_, label=y_train_, cat_features=cat_features)
    eval_data = Pool(data=X_test_, label=y_test_, cat_features=cat_features)

    model = CatBoostClassifier(**study_cat.best_params)
    model.fit(train_data,
              eval_set=eval_data,
              early_stopping_rounds=5,
              verbose=0)
        
    preds = model.predict(X_test_)
    cv_predicts[idx] = f1_score(y_test_, preds)
    print(f1_score(y_test_, preds))

print(cv_predicts)

0.5306633291614518
0.5466666666666667
0.5727699530516432
0.5226438188494492
0.5555555555555556
[0.53066333 0.54666667 0.57276995 0.52264382 0.55555556]


Видим, что всё стабильно. То, что f1 недотягивает до заявленного 0.58 объясняется тем, что мы просто каждый раз обучаемся на меньшей выборке, так как один фолд нужно оставить под предикт