# Импорт библиотек

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
    recall_score, f1_score, log_loss, auc, classification_report, confusion_matrix, \
    precision_recall_curve, roc_curve

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings

# Silence every warning for the rest of the session; note that this also
# hides convergence and deprecation warnings raised by the libraries below.
warnings.filterwarnings("ignore")
# Seed passed as `random_state` to the models so that runs are reproducible.
RAND = 10
# NOTE(review): presumably the share of the negative class (DEBT == 0)
# measured during EDA - confirm. It is a share in [0, 1], not a
# negative/positive count ratio.
percent_of_negative_class = 0.958

# Метод для подсчёта метрик

In [60]:
def get_metrics(y_test, y_pred, y_score, name):
    """Collect quality metrics of a binary classifier into a one-row table.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels.
        y_score: Class-probability matrix as returned by ``predict_proba``;
            column 1 is used as the score for ROC AUC, the whole matrix is
            passed to log loss.
        name: Label written to the ``model`` column.

    Returns:
        A single-row DataFrame with the columns ``model``, ``Accuracy``,
        ``ROC_AUC``, ``Precision``, ``Recall``, ``f1`` and ``Logloss``.
    """
    positive_score = y_score[:, 1]

    # Key order defines the column order of the resulting table.
    row = {
        'model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC_AUC': roc_auc_score(y_test, positive_score),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'Logloss': log_loss(y_test, y_score),
    }

    return pd.DataFrame([row])

# Подготовка данных к обучению

Выгрузим данные, сохранённые на этапе EDA

In [61]:
# Load the dataset saved at the EDA stage and preview its first rows.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df = pd.read_pickle('data.pickle')
df.head()

Unnamed: 0,SEMESTER,DISC_ID,TYPE_NAME,DEBT,GENDER,CITIZENSHIP,EXAM_TYPE,EXAM_SUBJECT_1,EXAM_SUBJECT_2,EXAM_SUBJECT_3,ADMITTED_EXAM_1,ADMITTED_EXAM_2,ADMITTED_EXAM_3,ADMITTED_SUBJECT_PRIZE_LEVEL,REGION_ID,mean_score
0,1,10502311854018326223,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667
1,1,1601392918367593206,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667
2,1,9559803959325174929,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667
3,1,8955667882044263414,Зачет,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667
4,1,17741967398854095262,Экзамен,0,М,15601729049989747827,ЕГЭ,70786669040476600,5533732657842394915,8388269026169219461,78.0,79.0,91.0,ЕГЭ,7805492244297918082,82.666667


## Бинаризация

Проведём бинаризацию датафрейма

In [63]:
# One-hot encode the object/categorical columns. drop_first=True drops the
# first level of every encoded column, leaving k-1 dummies per k levels.
df_bin = pd.get_dummies(df, drop_first=True)
df_bin.head()

Unnamed: 0,SEMESTER,DEBT,ADMITTED_EXAM_1,ADMITTED_EXAM_2,ADMITTED_EXAM_3,mean_score,DISC_ID_57659945070201404,DISC_ID_81203412138540191,DISC_ID_119954623639665579,DISC_ID_150957394596913708,...,REGION_ID_15874707964024214396,REGION_ID_15990212999969728230,REGION_ID_16228771858762793341,REGION_ID_16416166836247151845,REGION_ID_16856079058758621152,REGION_ID_16925236604731044042,REGION_ID_17119224547502736839,REGION_ID_17341978997214072207,REGION_ID_17696900483302278054,REGION_ID_17759417206326758158
0,1,0,78.0,79.0,91.0,82.666667,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,78.0,79.0,91.0,82.666667,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,78.0,79.0,91.0,82.666667,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,78.0,79.0,91.0,82.666667,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,78.0,79.0,91.0,82.666667,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Разбиение на train/test

Разобьём датафрейм на обучающую и тестовую выборки для обоих датафреймов (бинаризованного и небинаризованного, чтобы в зависимости от модели подавать на вход тот или иной вариант)

In [64]:
# Hold out 33% of the raw (non-binarized) rows as the test set.
# The seed is taken from RAND instead of a literal: later cells pair the
# labels of one split with the features of the other, which is only valid
# while both splits use the same seed and test size.
feature_cols = df.drop(columns=['DEBT'])

X_train, X_test, y_train, y_test = train_test_split(
    feature_cols,
    df['DEBT'],
    test_size=0.33,
    random_state=RAND,
)

In [65]:
# Hold out 33% of the one-hot encoded rows as the test set.
# The seed is taken from RAND instead of a literal so that this split keeps
# selecting the same rows as the split of the raw frame.
feature_cols_bin = df_bin.drop(columns=['DEBT'])

X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    feature_cols_bin,
    df_bin['DEBT'],
    test_size=0.33,
    random_state=RAND,
)

## Нормализация

Приведём все признаки к одной шкале с помощью MinMaxScaler для бинаризованных данных - это будет третья и последняя вариация формата наших выборок

In [66]:
# Learn the per-feature min/max on the training features only and reuse them
# for the test features, so no test information leaks into the scaling.
st = MinMaxScaler()
X_train_bin_scaled = st.fit_transform(X_train_bin)
X_test_bin_scaled = st.transform(X_test_bin)

# Logistic regression

Приступим к обучению наших бейзлайн моделей

In [67]:
# Baseline logistic regression on the scaled one-hot features;
# class_weight='balanced' reweights the classes inversely to their frequency.
lr = LogisticRegression(class_weight='balanced', random_state=RAND)
# Take the labels from the same split that produced the features rather than
# from the separately computed split of the raw frame.
lr.fit(X_train_bin_scaled, y_train_bin)

LogisticRegression(class_weight='balanced', random_state=10)

In [68]:
# Test-set metrics of the logistic regression start the comparison table.
y_pred = lr.predict(X_test_bin_scaled)
y_pred_prob = lr.predict_proba(X_test_bin_scaled)
metrics = get_metrics(y_test_bin, y_pred, y_pred_prob, name='LogisticRegression')

# Train-set metrics, kept next to the test row to expose overfitting.
y_pred_train = lr.predict(X_train_bin_scaled)
y_pred_prob_train = lr.predict_proba(X_train_bin_scaled)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_train_bin, y_pred_train, y_pred_prob_train,
                name='LogisticRegression_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596


# Decision tree

In [69]:
# Decision tree with default hyperparameters apart from balanced class
# weights, trained on the unscaled one-hot features.
dt = DecisionTreeClassifier(class_weight='balanced', random_state=RAND)
dt.fit(X_train_bin, y_train_bin)

DecisionTreeClassifier(class_weight='balanced', random_state=10)

In [70]:
# Test-set metrics of the decision tree.
y_pred = dt.predict(X_test_bin)
y_pred_prob = dt.predict_proba(X_test_bin)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_test_bin, y_pred, y_pred_prob, name='Decision_tree'),
])

# Train-set metrics, scored against the labels of the split the tree was
# fitted on.
y_pred_train = dt.predict(X_train_bin)
y_pred_prob_train = dt.predict_proba(X_train_bin)
metrics = pd.concat([
    metrics,
    get_metrics(y_train_bin, y_pred_train, y_pred_prob_train,
                name='Decision_tree_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596
0,Decision_tree,0.946593,0.788852,0.450146,0.609431,0.517816,1.536115
0,Decision_tree_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017985


# Random forest

In [71]:
# Random forest with balanced class weights on the unscaled one-hot features.
# bootstrap=False makes every tree use the whole training set instead of a
# bootstrap sample.
rf = RandomForestClassifier(class_weight='balanced', bootstrap=False, random_state=RAND)
rf.fit(X_train_bin, y_train_bin)

RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       random_state=10)

In [72]:
# Test-set metrics of the random forest.
y_pred = rf.predict(X_test_bin)
y_pred_prob = rf.predict_proba(X_test_bin)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_test_bin, y_pred, y_pred_prob, name='Random_forest'),
])

# Train-set metrics, kept next to the test row to expose overfitting.
y_pred_train = rf.predict(X_train_bin)
y_pred_prob_train = rf.predict_proba(X_train_bin)
metrics = pd.concat([
    metrics,
    get_metrics(y_train_bin, y_pred_train, y_pred_prob_train,
                name='Random_forest_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596
0,Decision_tree,0.946593,0.788852,0.450146,0.609431,0.517816,1.536115
0,Decision_tree_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017985
0,Random_forest,0.951912,0.884939,0.481091,0.279338,0.35345,0.351344
0,Random_forest_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017991


# Bagging classifier (logistic regression)

In [73]:
# Bagging ensemble of logistic regressions on the scaled one-hot features.
# The base model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works in both.
bg = BaggingClassifier(LogisticRegression(),
                       random_state=RAND)
# Fit on the scaled array directly - the same kind of object that is later
# passed to predict / predict_proba.
bg.fit(X_train_bin_scaled, y_train_bin)

BaggingClassifier(base_estimator=LogisticRegression(), random_state=10)

In [74]:
# Test-set metrics of the bagging ensemble, scored against the labels of the
# binarized split that produced its features.
y_pred = bg.predict(X_test_bin_scaled)
y_pred_prob = bg.predict_proba(X_test_bin_scaled)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_test_bin, y_pred, y_pred_prob, name='Bagging_classifier'),
])

# Train-set metrics, kept next to the test row to expose overfitting.
y_pred_train = bg.predict(X_train_bin_scaled)
y_pred_prob_train = bg.predict_proba(X_train_bin_scaled)
metrics = pd.concat([
    metrics,
    get_metrics(y_train_bin, y_pred_train, y_pred_prob_train,
                name='Bagging_classifier_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596
0,Decision_tree,0.946593,0.788852,0.450146,0.609431,0.517816,1.536115
0,Decision_tree_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017985
0,Random_forest,0.951912,0.884939,0.481091,0.279338,0.35345,0.351344
0,Random_forest_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017991
0,Bagging_classifier,0.953013,0.741952,0.512048,0.030598,0.057745,0.168947
0,Bagging_classifier_train,0.952821,0.772599,0.555233,0.03355,0.063276,0.164812


# XGBoost

In [75]:
# Validation set carved out of the training data; it is scored after each
# boosting iteration and drives early stopping.
X_train_, X_val, y_train_, y_val = train_test_split(X_train_bin,
                                                    y_train_bin,
                                                    test_size=0.16,
                                                    shuffle=True,
                                                    random_state=RAND)
eval_set = [(X_val, y_val)]

# scale_pos_weight is meant to be count(negative) / count(positive), not the
# share of the negative class: a value below 1 would down-weight the rare
# positive class instead of boosting it. Assumes DEBT is encoded as 0/1.
neg_pos_ratio = float((y_train_ == 0).sum() / (y_train_ == 1).sum())

clf = XGBClassifier(random_state=RAND, scale_pos_weight=neg_pos_ratio)

clf.fit(X_train_,
        y_train_,
        eval_metric="auc",
        eval_set=eval_set,
        early_stopping_rounds=100,
        verbose=2)

[0]	validation_0-auc:0.69734
[2]	validation_0-auc:0.72105
[4]	validation_0-auc:0.74048
[6]	validation_0-auc:0.74580
[8]	validation_0-auc:0.75601
[10]	validation_0-auc:0.75828
[12]	validation_0-auc:0.76750
[14]	validation_0-auc:0.76876
[16]	validation_0-auc:0.77103
[18]	validation_0-auc:0.77443
[20]	validation_0-auc:0.77507
[22]	validation_0-auc:0.77632
[24]	validation_0-auc:0.77696
[26]	validation_0-auc:0.78338
[28]	validation_0-auc:0.78232
[30]	validation_0-auc:0.78480
[32]	validation_0-auc:0.78465
[34]	validation_0-auc:0.78492
[36]	validation_0-auc:0.78507
[38]	validation_0-auc:0.78566
[40]	validation_0-auc:0.78953
[42]	validation_0-auc:0.79314
[44]	validation_0-auc:0.79386
[46]	validation_0-auc:0.79243
[48]	validation_0-auc:0.79742
[50]	validation_0-auc:0.79807
[52]	validation_0-auc:0.79788
[54]	validation_0-auc:0.79828
[56]	validation_0-auc:0.79781
[58]	validation_0-auc:0.79959
[60]	validation_0-auc:0.79861
[62]	validation_0-auc:0.79902
[64]	validation_0-auc:0.80379
[66]	validation

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=20.27659574468085,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [76]:
# Test-set metrics of XGBoost.
y_pred = clf.predict(X_test_bin)
y_pred_prob = clf.predict_proba(X_test_bin)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_test_bin, y_pred, y_pred_prob, name='XGBoost'),
])

# Train-set metrics on the part of the training data the model was fitted on
# (the validation rows are excluded).
y_pred_train = clf.predict(X_train_)
y_pred_prob_train = clf.predict_proba(X_train_)
metrics = pd.concat([
    metrics,
    get_metrics(y_train_, y_pred_train, y_pred_prob_train,
                name='XGBoost_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596
0,Decision_tree,0.946593,0.788852,0.450146,0.609431,0.517816,1.536115
0,Decision_tree_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017985
0,Random_forest,0.951912,0.884939,0.481091,0.279338,0.35345,0.351344
0,Random_forest_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017991
0,Bagging_classifier,0.953013,0.741952,0.512048,0.030598,0.057745,0.168947
0,Bagging_classifier_train,0.952821,0.772599,0.555233,0.03355,0.063276,0.164812
0,XGBoost,0.768298,0.806016,0.126959,0.667747,0.213353,0.48187
0,XGBoost_train,0.784206,0.886426,0.155538,0.808008,0.260861,0.466063


# LightGBM

In [77]:
# Validation set carved out of the raw (non-binarized) training data; it is
# scored during training and drives early stopping.
X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.16,
                                                    shuffle=True,
                                                    random_state=RAND)
eval_set = [(X_val, y_val)]

# scale_pos_weight is the weight of the positive class, so it should be
# count(negative) / count(positive) rather than the share of negatives: a
# value below 1 would down-weight the rare positive class.
# Assumes DEBT is encoded as 0/1.
neg_pos_ratio = float((y_train_ == 0).sum() / (y_train_ == 1).sum())

clf = LGBMClassifier(random_state=RAND, scale_pos_weight=neg_pos_ratio)

clf.fit(X_train_,
        y_train_,
        eval_metric="auc",
        eval_set=eval_set,
        early_stopping_rounds=100,
        verbose=2)

Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.724668	valid_0's binary_logloss: 0.226594
[4]	valid_0's auc: 0.742846	valid_0's binary_logloss: 0.264321
[6]	valid_0's auc: 0.750452	valid_0's binary_logloss: 0.29881
[8]	valid_0's auc: 0.754218	valid_0's binary_logloss: 0.328605
[10]	valid_0's auc: 0.757157	valid_0's binary_logloss: 0.355085
[12]	valid_0's auc: 0.761666	valid_0's binary_logloss: 0.37593
[14]	valid_0's auc: 0.762603	valid_0's binary_logloss: 0.394402
[16]	valid_0's auc: 0.76587	valid_0's binary_logloss: 0.40863
[18]	valid_0's auc: 0.768676	valid_0's binary_logloss: 0.420258
[20]	valid_0's auc: 0.771517	valid_0's binary_logloss: 0.428942
[22]	valid_0's auc: 0.77362	valid_0's binary_logloss: 0.43493
[24]	valid_0's auc: 0.775752	valid_0's binary_logloss: 0.439685
[26]	valid_0's auc: 0.776909	valid_0's binary_logloss: 0.443347
[28]	valid_0's auc: 0.777914	valid_0's binary_logloss: 0.445802
[30]	valid_0's auc: 0.779189	valid_0's binary_loglos

LGBMClassifier(random_state=10, scale_pos_weight=20.27659574468085)

In [78]:
# Test-set metrics of LightGBM (raw, non-binarized features).
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_test, y_pred, y_pred_prob, name='LightGBM'),
])

# Train-set metrics on the part of the training data the model was fitted on
# (the validation rows are excluded).
y_pred_train = clf.predict(X_train_)
y_pred_prob_train = clf.predict_proba(X_train_)
metrics = pd.concat([
    metrics,
    get_metrics(y_train_, y_pred_train, y_pred_prob_train,
                name='LightGBM_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596
0,Decision_tree,0.946593,0.788852,0.450146,0.609431,0.517816,1.536115
0,Decision_tree_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017985
0,Random_forest,0.951912,0.884939,0.481091,0.279338,0.35345,0.351344
0,Random_forest_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017991
0,Bagging_classifier,0.953013,0.741952,0.512048,0.030598,0.057745,0.168947
0,Bagging_classifier_train,0.952821,0.772599,0.555233,0.03355,0.063276,0.164812
0,XGBoost,0.768298,0.806016,0.126959,0.667747,0.213353,0.48187
0,XGBoost_train,0.784206,0.886426,0.155538,0.808008,0.260861,0.466063
0,LightGBM,0.82931,0.803692,0.155154,0.591073,0.24579,0.368354


# Catboost 

In [82]:
# Columns that CatBoost should treat as categorical.
cat_features = ['DISC_ID', 'TYPE_NAME', 'GENDER', 'CITIZENSHIP',
                'EXAM_TYPE', 'EXAM_SUBJECT_1', 'EXAM_SUBJECT_2',
                'EXAM_SUBJECT_3', 'ADMITTED_SUBJECT_PRIZE_LEVEL', 'REGION_ID']

# Re-derive the train/validation split of the raw features inside this cell.
# The names X_train_/X_val are also rebound to binarized data by the XGBoost
# cell, so relying on leftover state would break when cells run out of order.
# The arguments match the LightGBM cell, hence the split is identical.
X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.16,
                                                    shuffle=True,
                                                    random_state=RAND)
eval_set = [(X_val, y_val)]

# scale_pos_weight multiplies the weight of class 1, so it should be
# count(negative) / count(positive) rather than the share of negatives.
# Assumes DEBT is encoded as 0/1.
neg_pos_ratio = float((y_train_ == 0).sum() / (y_train_ == 1).sum())

clf = CatBoostClassifier(random_state=RAND,
                         eval_metric="AUC",
                         scale_pos_weight=neg_pos_ratio,
                         cat_features=cat_features)

clf.fit(X_train_,
        y_train_,
        eval_set=eval_set,
        early_stopping_rounds=100,
        verbose=2)

Learning rate set to 0.099001
0:	test: 0.7194341	best: 0.7194341 (0)	total: 303ms	remaining: 5m 2s
2:	test: 0.7327368	best: 0.7327368 (2)	total: 498ms	remaining: 2m 45s
4:	test: 0.7375244	best: 0.7375244 (4)	total: 715ms	remaining: 2m 22s
6:	test: 0.7402752	best: 0.7402752 (6)	total: 896ms	remaining: 2m 7s
8:	test: 0.7433100	best: 0.7433100 (8)	total: 1.14s	remaining: 2m 5s
10:	test: 0.7483543	best: 0.7483543 (10)	total: 1.35s	remaining: 2m 1s
12:	test: 0.7515538	best: 0.7515538 (12)	total: 1.57s	remaining: 1m 58s
14:	test: 0.7536830	best: 0.7536830 (14)	total: 1.76s	remaining: 1m 55s
16:	test: 0.7600306	best: 0.7603297 (15)	total: 1.99s	remaining: 1m 54s
18:	test: 0.7611093	best: 0.7611129 (17)	total: 2.23s	remaining: 1m 55s
20:	test: 0.7692986	best: 0.7692986 (20)	total: 2.46s	remaining: 1m 54s
22:	test: 0.7733354	best: 0.7733354 (22)	total: 2.67s	remaining: 1m 53s
24:	test: 0.7837858	best: 0.7837858 (24)	total: 2.89s	remaining: 1m 52s
26:	test: 0.7846967	best: 0.7846967 (26)	total: 

226:	test: 0.8725733	best: 0.8725733 (226)	total: 32.4s	remaining: 1m 50s
228:	test: 0.8735607	best: 0.8735607 (228)	total: 32.7s	remaining: 1m 49s
230:	test: 0.8742116	best: 0.8742116 (230)	total: 33.2s	remaining: 1m 50s
232:	test: 0.8743765	best: 0.8743765 (232)	total: 33.4s	remaining: 1m 49s
234:	test: 0.8746411	best: 0.8746411 (234)	total: 33.6s	remaining: 1m 49s
236:	test: 0.8748546	best: 0.8748546 (236)	total: 33.9s	remaining: 1m 48s
238:	test: 0.8756059	best: 0.8756059 (238)	total: 34.1s	remaining: 1m 48s
240:	test: 0.8759964	best: 0.8759964 (240)	total: 34.3s	remaining: 1m 48s
242:	test: 0.8764322	best: 0.8764322 (242)	total: 34.6s	remaining: 1m 47s
244:	test: 0.8772824	best: 0.8772824 (244)	total: 34.7s	remaining: 1m 47s
246:	test: 0.8778447	best: 0.8778447 (246)	total: 35s	remaining: 1m 46s
248:	test: 0.8780222	best: 0.8780441 (247)	total: 35.2s	remaining: 1m 46s
250:	test: 0.8784609	best: 0.8784609 (250)	total: 35.4s	remaining: 1m 45s
252:	test: 0.8786872	best: 0.8786872 (25

450:	test: 0.9005820	best: 0.9005820 (450)	total: 1m 2s	remaining: 1m 15s
452:	test: 0.9005295	best: 0.9005861 (451)	total: 1m 2s	remaining: 1m 15s
454:	test: 0.9008411	best: 0.9008411 (454)	total: 1m 3s	remaining: 1m 15s
456:	test: 0.9010247	best: 0.9010564 (455)	total: 1m 3s	remaining: 1m 15s
458:	test: 0.9010857	best: 0.9011155 (457)	total: 1m 4s	remaining: 1m 15s
460:	test: 0.9012557	best: 0.9012557 (460)	total: 1m 4s	remaining: 1m 15s
462:	test: 0.9017694	best: 0.9018277 (461)	total: 1m 4s	remaining: 1m 15s
464:	test: 0.9018493	best: 0.9018493 (464)	total: 1m 5s	remaining: 1m 14s
466:	test: 0.9018427	best: 0.9018493 (464)	total: 1m 5s	remaining: 1m 14s
468:	test: 0.9020458	best: 0.9020458 (468)	total: 1m 5s	remaining: 1m 14s
470:	test: 0.9020608	best: 0.9020608 (470)	total: 1m 5s	remaining: 1m 13s
472:	test: 0.9022814	best: 0.9022814 (472)	total: 1m 6s	remaining: 1m 13s
474:	test: 0.9024866	best: 0.9024866 (474)	total: 1m 6s	remaining: 1m 13s
476:	test: 0.9023313	best: 0.9024866 (

674:	test: 0.9108927	best: 0.9108927 (674)	total: 1m 35s	remaining: 46.1s
676:	test: 0.9110576	best: 0.9110576 (676)	total: 1m 36s	remaining: 45.8s
678:	test: 0.9111361	best: 0.9111363 (677)	total: 1m 36s	remaining: 45.5s
680:	test: 0.9111003	best: 0.9111492 (679)	total: 1m 36s	remaining: 45.3s
682:	test: 0.9112466	best: 0.9112466 (682)	total: 1m 37s	remaining: 45.1s
684:	test: 0.9112769	best: 0.9112940 (683)	total: 1m 37s	remaining: 44.9s
686:	test: 0.9112731	best: 0.9112940 (683)	total: 1m 37s	remaining: 44.6s
688:	test: 0.9111816	best: 0.9112940 (683)	total: 1m 38s	remaining: 44.4s
690:	test: 0.9111109	best: 0.9112940 (683)	total: 1m 38s	remaining: 44.2s
692:	test: 0.9111445	best: 0.9112940 (683)	total: 1m 39s	remaining: 43.9s
694:	test: 0.9111579	best: 0.9112940 (683)	total: 1m 39s	remaining: 43.8s
696:	test: 0.9112235	best: 0.9112940 (683)	total: 1m 40s	remaining: 43.5s
698:	test: 0.9112607	best: 0.9112940 (683)	total: 1m 40s	remaining: 43.3s
700:	test: 0.9113479	best: 0.9113514 (

898:	test: 0.9157759	best: 0.9163960 (876)	total: 2m 10s	remaining: 14.7s
900:	test: 0.9158757	best: 0.9163960 (876)	total: 2m 10s	remaining: 14.4s
902:	test: 0.9158886	best: 0.9163960 (876)	total: 2m 10s	remaining: 14.1s
904:	test: 0.9159275	best: 0.9163960 (876)	total: 2m 11s	remaining: 13.8s
906:	test: 0.9160515	best: 0.9163960 (876)	total: 2m 11s	remaining: 13.5s
908:	test: 0.9160275	best: 0.9163960 (876)	total: 2m 11s	remaining: 13.2s
910:	test: 0.9162328	best: 0.9163960 (876)	total: 2m 11s	remaining: 12.9s
912:	test: 0.9162643	best: 0.9163960 (876)	total: 2m 11s	remaining: 12.6s
914:	test: 0.9162898	best: 0.9163960 (876)	total: 2m 12s	remaining: 12.3s
916:	test: 0.9164878	best: 0.9164878 (916)	total: 2m 12s	remaining: 12s
918:	test: 0.9165342	best: 0.9165342 (918)	total: 2m 12s	remaining: 11.7s
920:	test: 0.9166540	best: 0.9166540 (920)	total: 2m 12s	remaining: 11.4s
922:	test: 0.9167465	best: 0.9167551 (921)	total: 2m 13s	remaining: 11.1s
924:	test: 0.9168673	best: 0.9168673 (92

<catboost.core.CatBoostClassifier at 0x23f099b2fa0>

In [91]:
# Test-set metrics of CatBoost (raw, non-binarized features).
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_test, y_pred, y_pred_prob, name='Catboost'),
])

# Train-set metrics on the part of the training data the model was fitted on
# (the validation rows are excluded).
y_pred_train = clf.predict(X_train_)
y_pred_prob_train = clf.predict_proba(X_train_)
metrics = pd.concat([
    metrics,
    get_metrics(y_train_, y_pred_train, y_pred_prob_train,
                name='Catboost_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596
0,Decision_tree,0.946593,0.788852,0.450146,0.609431,0.517816,1.536115
0,Decision_tree_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017985
0,Random_forest,0.951912,0.884939,0.481091,0.279338,0.35345,0.351344
0,Random_forest_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017991
0,Bagging_classifier,0.953013,0.741952,0.512048,0.030598,0.057745,0.168947
0,Bagging_classifier_train,0.952821,0.772599,0.555233,0.03355,0.063276,0.164812
0,XGBoost,0.768298,0.806016,0.126959,0.667747,0.213353,0.48187
0,XGBoost_train,0.784206,0.886426,0.155538,0.808008,0.260861,0.466063
0,LightGBM,0.82931,0.803692,0.155154,0.591073,0.24579,0.368354


# Gradient boosting

In [95]:
# Gradient boosting with default hyperparameters on the unscaled one-hot
# features; no class weighting is passed here.
# `clf` is rebound, replacing the previously fitted model held in that name.
clf = GradientBoostingClassifier(random_state=RAND)
clf.fit(X_train_bin, y_train_bin)

GradientBoostingClassifier(random_state=10)

In [99]:
# Test-set metrics of gradient boosting, scored against the labels of the
# binarized split that produced its features.
y_pred = clf.predict(X_test_bin)
y_pred_prob = clf.predict_proba(X_test_bin)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_test_bin, y_pred, y_pred_prob, name='Gradient_Boosting'),
])

# Train-set metrics, kept next to the test row to expose overfitting.
y_pred_train = clf.predict(X_train_bin)
y_pred_prob_train = clf.predict_proba(X_train_bin)
metrics = pd.concat([
    metrics,
    get_metrics(y_train_bin, y_pred_train, y_pred_prob_train,
                name='Gradient_Boosting_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596
0,Decision_tree,0.946593,0.788852,0.450146,0.609431,0.517816,1.536115
0,Decision_tree_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017985
0,Random_forest,0.951912,0.884939,0.481091,0.279338,0.35345,0.351344
0,Random_forest_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017991
0,Bagging_classifier,0.953013,0.741952,0.512048,0.030598,0.057745,0.168947
0,Bagging_classifier_train,0.952821,0.772599,0.555233,0.03355,0.063276,0.164812
0,XGBoost,0.768298,0.806016,0.126959,0.667747,0.213353,0.48187
0,XGBoost_train,0.784206,0.886426,0.155538,0.808008,0.260861,0.466063
0,LightGBM,0.82931,0.803692,0.155154,0.591073,0.24579,0.368354


# kNN

In [103]:
# k-nearest neighbours with default settings, trained on the min-max scaled
# one-hot features.
# `clf` is rebound, replacing the previously fitted model held in that name.
clf = KNeighborsClassifier()
clf.fit(X_train_bin_scaled, y_train_bin)

KNeighborsClassifier()

In [104]:
# Test-set metrics of kNN.
y_pred = clf.predict(X_test_bin_scaled)
y_pred_prob = clf.predict_proba(X_test_bin_scaled)
# DataFrame.append was removed in pandas 2.0; pd.concat adds the row and,
# like append did, keeps the original row labels.
metrics = pd.concat([
    metrics,
    get_metrics(y_test_bin, y_pred, y_pred_prob, name='kNN'),
])

# Train-set metrics, kept next to the test row to expose overfitting.
y_pred_train = clf.predict(X_train_bin_scaled)
y_pred_prob_train = clf.predict_proba(X_train_bin_scaled)
metrics = pd.concat([
    metrics,
    get_metrics(y_train_bin, y_pred_train, y_pred_prob_train,
                name='kNN_train'),
])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LogisticRegression,0.697817,0.742184,0.095195,0.637509,0.165653,0.566596
0,Decision_tree,0.946593,0.788852,0.450146,0.609431,0.517816,1.536115
0,Decision_tree_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017985
0,Random_forest,0.951912,0.884939,0.481091,0.279338,0.35345,0.351344
0,Random_forest_train,0.991081,0.999807,0.84262,0.99877,0.914074,0.017991
0,Bagging_classifier,0.953013,0.741952,0.512048,0.030598,0.057745,0.168947
0,Bagging_classifier_train,0.952821,0.772599,0.555233,0.03355,0.063276,0.164812
0,XGBoost,0.768298,0.806016,0.126959,0.667747,0.213353,0.48187
0,XGBoost_train,0.784206,0.886426,0.155538,0.808008,0.260861,0.466063
0,LightGBM,0.82931,0.803692,0.155154,0.591073,0.24579,0.368354


# Выводы

Итак, качество базового алгоритма будем определять по двум критериям:

1) значение метрики roc_auc на тестовой выборке

2) разрыв между значениями метрик roc_auc и log_loss на тренировочной и тестовой выборках (чем он меньше, тем слабее переобучение)

На другие метрики особо смотреть не будем, но их полезно выводить, чтобы лучше контролировать адекватность происходящего. Так, recall, precision и f1 занулятся, если все объекты выборки будут отнесены к одному (отрицательному) классу. Это довольно редкая ситуация, поэтому обратить на неё внимание было бы неплохо. Единственной полностью бесполезной метрикой, конечно, является accuracy в силу дисбаланса классов

Исходя из этих соображений, по таблице можно понять, что лучшей бейзлайн моделью является CatBoost. Она дала лучший roc_auc=0.92 на тестовой выборке, при этом различия в метриках на train и на test минимальны

XGBoost и LightGBM также дали хорошие результаты. Для LightGBM подберём гиперпараметры: вполне возможно, что после настройки он побьёт Catboost, который известен хорошими параметрами по умолчанию. XGBoost, пожалуй, не будем брать в финал в силу того, что он не может работать с категориальными признаками, которых у нас подавляющее большинство. Нам приходится их бинаризовывать, а это часто не очень хорошо влияет на работу бустингов

Итак, мы остановились на двух моделях - Catboost и LightGBM