In [1]:
import time
import numpy as np
import pandas as pd
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import optuna

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import gmean
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

Useful Functions

In [2]:
class Bayesian_optimization():
    def __init__(self, classifier, params, params_search,
                 x_train, y_train, x_valid, y_valid,
                 categorical=None,
                 metric=roc_auc_score, evals=True, 
                 method='maximize', random_state=42,
                 init_points=3, n_iter=5, acq='ei',
                 early_stopping_rounds=50, verbose=200,
                 int_params=[]):
        self.classifier = classifier
        self.params = params
        self.params_search = params_search
        self.x_train = x_train
        self.y_train = y_train
        self.x_valid = x_valid
        self.y_valid = y_valid
        self.categorical = categorical
        self.metric = metric
        self.evals = [(x_train, y_train), (x_valid, y_valid)] \
                        if evals else None
        self.method = method
        self.random_state = random_state
        self.init_points = init_points
        self.n_iter = n_iter
        self.acq = acq
        self.early_stopping_rounds = early_stopping_rounds
        self.verbose = verbose
        self.int_params = int_params
        self.model = None
    
    def _normalize_params(self, params: dict):
        new_params = params.copy()
        for int_param in self.int_params:
            if int_param in params.keys():
                new_params[int_param] = int(params[int_param])
        return new_params
    
    def _optimize_params(self, **kwargs):
        params = {**self.params,**kwargs}
        params = self._normalize_params(params)
        
        model = self.classifier(**params)
        model.fit(
            self.x_train, self.y_train, 
#             categorical=self.categorical,
            eval_set=self.evals,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose=self.verbose
        )
        return self.metric(self.y_valid, model.predict(self.x_valid))
    
    def _optimization(self):
        optimizer = BayesianOptimization(
            self._optimize_params,
            pbounds=self.params_search,
            random_state=self.random_state
        )
    #     if method == 'maximize':
        optimizer.maximize(init_points=self.init_points, 
                           n_iter=self.n_iter, 
                           acq=self.acq)
        return optimizer.max
    
    def optimize_it(self):
        optimal_params = self._optimization()['params']
        optimal_params = self._normalize_params(optimal_params)
        
        self.model = self.classifier(**optimal_params)
        self.model.fit(self.x_train, self.y_train, 
                       eval_set=self.evals,
                       early_stopping_rounds=self.early_stopping_rounds,
                       verbose=self.verbose)

        return optimal_params

In [3]:
def hold_out(X, y, split_params=[0.7, 0.2, 0.1]):
    x_train, x_valid = train_test_split(X, train_size=split_params[0], 
                                        random_state=42)
    y_train, y_valid = train_test_split(y, train_size=split_params[0],
                                        random_state=42)
    if len(split_params) == 3:
        test_size = int(split_params[2] * X.shape[0])
        
        x_valid, x_test = train_test_split(x_valid, test_size=test_size,
                                          random_state=42)
        y_valid, y_test = train_test_split(y_valid, test_size=test_size,
                                          random_state=42)
        return x_train, y_train, x_valid, y_valid, x_test, y_test
    
    elif len(split_params) == 2:
        return x_train, y_train, x_valid, y_valid
    
def adversarial_validation(train, test, target, treshold=0.3, tests=True):
    X_test  = test.select_dtypes(include=['number']).copy()
    X_train = train.select_dtypes(include=['number']).copy()
    features = X_train.columns
    X_train["AV_label"] = 0
    X_test["AV_label"]  = 1
    all_data = pd.concat([X_train, X_test], axis=0, ignore_index=True)
    all_data_shuffled = all_data.sample(frac=1)
    X = all_data_shuffled.drop(['AV_label'], axis=1)
    y = all_data_shuffled['AV_label']
    
    model = xgb.XGBClassifier(n_estimators=50)
    model.fit(X, y)
    
    y_pred_adv = model.predict_proba(X)
    score = roc_auc_score(y, y_pred_adv[:, 1])
    print(round(score, 4))
    
    y_pred = model.predict_proba(X_train[features])
    print(pd.cut(y_pred[:, 1], bins=np.arange(0, 1.01, 0.1))\
          .value_counts().sort_index())
    X_train['AV_proba'] = np.zeros(X_train.shape[0])
    X_train['y'] = np.zeros(X_train.shape[0])
    X_train['AV_proba'] = y_pred
    X_train['y'] = target
    
    valid_count = int(X_train.shape[0] * treshold)

    treshold_proba = None
    for i in range(10000):
        if X_train.loc[X_train['AV_proba'] < i/10000].shape[0] >= valid_count:
            treshold_proba = i / 10000
            break
    print(f'\nValid sample: {1-treshold_proba:.4f} - 1.0000')

    y_valid = X_train.loc[X_train['AV_proba'] < treshold_proba, 'y']
    x_valid = X_train.loc[X_train['AV_proba'] < treshold_proba][features]
    y_train = X_train.loc[X_train['AV_proba'] >= treshold_proba, 'y']
    x_train = X_train.loc[X_train['AV_proba'] >= treshold_proba][features]

    if tests:
        test_size = treshold
        
        x_valid, x_test = train_test_split(x_valid, test_size=test_size,
                                          random_state=42)
        y_valid, y_test = train_test_split(y_valid, test_size=test_size,
                                          random_state=42)
        return x_train, y_train, x_valid, y_valid, x_test, y_test
    
    return x_train, y_train, x_valid, y_valid

def catboost_CV(params, X, y, cv, categorical=None, sampling=False):
    estimators, fold_scores = [], []
    oof_preds = np.zeros(X.shape[0])
    
    print(f'{time.ctime()}, cross-validation, {X.shape[0]} rows, {X.shape[1]} cols')
    if categorical:
        categorical = list(set(categorical) & set(X.columns))
        X[categorical] = X[categorical].astype(str)
    models = []
    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]
        
        if sampling is True:
            smote = SMOTE(random_state=42, n_jobs=-1)
            x_train, y_train = smote.fit_resample(x_train, y_train)
        
        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f'Fold {fold+1}, valid score = {round(score, 5)}')
        fold_scores.append(round(score, 5))
        estimators.append(model)
        models.append(model)
    
    print(f'Score by each fold: {fold_scores}')
    print('='*65)
    return estimators, oof_preds, models

In [4]:
data = pd.read_pickle('./prepared_data.pkl')
# data = pd.read_pickle('./prepared_data_without_outliers.pkl')
target_name = 'target'

In [5]:
data.head(3)

Unnamed: 0,application_number,target,name_contract_type,gender,childrens,total_salary_x,amount_credit_x,amount_annuity,education_level_x,family_status,...,isna_prev_app_days_termination_min_approved_apps,isna_prev_app_delay_first_payment_min_approved_apps,isna_prev_app_delay_last_payment_min_approved_apps,isna_prev_app_payment_in_advance_min_approved_apps,isna_prev_app_amount_goods_payment_mean_approved_apps,isna_prev_app_amount_goods_payment_max_approved_apps,isna_expected_total_loss_3,isna_external_scoring_rating_3,isna_expected_monthly_loss_3,isna_sum
0,123687442,0.0,1,2,1.0,157500.0,855000.0,25128.0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,123597908,1.0,1,3,,,,,3,2,...,0,0,0,0,0,0,1,1,1,15
2,123526683,0.0,1,1,0.0,135000.0,1006920.0,42660.0,2,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
data = data.replace(np.inf, np.nan)

In [7]:
mask = data['target'].isnull()
features_to_drop = ['application_number', 'target']

train, test = data.loc[~mask], data.loc[mask]

target, test_id = train['target'], test['application_number']
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)
train = train.fillna(-9999)
test = test.fillna(-9999)

In [8]:
categorical = [
    'name_contract_type',
    'gender',
    'education_level_x',
    'family_status',
    'education_level_family_status',
    'education_level_y'
]
numerical = list(set(train.columns) - set(categorical))

In [9]:
%%time
# x_train, y_train, x_valid, y_valid, x_test, y_test = adversarial_validation(
#     train, test, target, treshold=0.3, tests=True
# )
x_train, y_train, x_valid, y_valid = adversarial_validation(
    train, test, target, treshold=0.25, tests=False
)
# x_train, y_train, x_valid, y_valid, x_test, y_test = hold_out(train, target)
# x_train, y_train, x_valid, y_valid = hold_out(train, target, [0.8, 0.2])

0.6081
(0.0, 0.1]        0
(0.1, 0.2]        9
(0.2, 0.3]       84
(0.3, 0.4]      512
(0.4, 0.5]     3276
(0.5, 0.6]    59765
(0.6, 0.7]    46351
(0.7, 0.8]       96
(0.8, 0.9]        0
(0.9, 1.0]        0
dtype: int64

Valid sample: 0.6052 - 1.0000
Wall time: 48.7 s


In [10]:
# smote = SMOTE(random_state=42, n_jobs=-1)
# x_train, y_train = smote.fit_resample(x_train, y_train)

In [11]:
features_to_drop = [
    'region_population',
    'family_size',
    'amt_req_credit_bureau_week',
    'amt_req_credit_bureau_year',
    'flag_phone',
    'flag_email',
    'ratio_salary_to_region_population',
    'childrens',
    'amt_req_credit_bureau_day',
    'amt_req_credit_bureau_hour',
    'bki_request_count',
    'amt_req_credit_bureau_mon'
]

In [12]:
model_search = {
#     'xgb': {
#         'classifier': xgb.XGBClassifier,
#         'params': {
#             "booster": "gblinear",
#             "objective": "binary:logistic",
#             "eval_metric": "auc",
#             "n_estimators": 10000,
#             "seed": 42,
#         },
#         'params_search': {
#             "max_depth": (3, 8),
#             "subsample": (0.3, 0.9),
#             "colsample_bytree": (0.3, 0.9),
#             "min_child_weights": (300, 700),
#             "alpha": (1, 20),
#             "lambda": (10, 100), 
#             "gamma": (1, 20), 
#             "eta": (1e-2, 0.1)
#         }
#     },
#     'lgb': {
#         'classifier': lgb.LGBMClassifier,
#         'params': {
#             'metric':"auc", 
#             'boosting_type': 'gbdt',
#             'n_estimators': 10000,
#             'random_state': 42,
#         },
#         'params_search': {
#             "max_depth": (3, 8),
#             "bagging_fraction": (0.3, 0.9),
#             "feature_fraction": (0.3, 0.9),
#             "min_data_in_leaf": (300, 700),
#             "lambda_l1": (10, 100),
#             "labmda_l2": (10, 100), 
#             "learning_rate": (1e-3, 0.1)
#         }
#     },
    'cb': {
        'classifier': cb.CatBoostClassifier,
        'params': {
            'n_estimators': 10000,
            'loss_function': 'Logloss',
            'eval_metric': 'AUC',
            'task_type': 'CPU',
            'random_seed': 42,
        },
        'params_search': {
            "max_depth": (3, 7),
            "subsample": (0.4, 0.8),
            "colsample_bylevel": (0.4, 0.8),
            "min_data_in_leaf": (400, 600),
            "reg_lambda": (10, 100),
            "learning_rate": (1e-2, 0.1)
        }
    },
}

In [13]:
%%time
models = dict()
LB_preds = pd.DataFrame()

for key, value in model_search.items():
    model_name = key
    classifier = value['classifier']
    params = value['params']
    params_search = value['params_search']
    categorical = categorical
    
    print(f'{model_name}\n{"*"*70}\n{"*"*70}')

    optimizator = Bayesian_optimization(
                    classifier, params, params_search,
                    x_train.drop(features_to_drop, axis=1), 
                    y_train, 
                    x_valid.drop(features_to_drop, axis=1), 
                    y_valid,
#                     categorical=categorical,
                    metric=roc_auc_score, evals=True, 
                    method='maximize', random_state=42,
                    init_points=3, n_iter=3, acq='ei',
                    early_stopping_rounds=50, verbose=200,
                    int_params=['max_depth', 
                                'min_child_weights', 
                                'min_data_in_leaf']
                  )
    optimal_params = optimizator.optimize_it()
    
    model = optimizator.model
    model.fit(x_train, y_train)
#     print(f'\n{model_name}\ntest metric: \
#         {roc_auc_score(y_test, model.predict(x_test))}\n')
    
    models[model_name] = {
        'model': model,
        'y_valid_pred': model.predict(x_valid),
#         'y_test_pred': model.predict(x_test),
        'optimal_params': optimal_params,
    }
    
    LB_preds[model_name] = model.predict_proba(test)[:, 1]

cb
**********************************************************************
**********************************************************************
|   iter    |  target   | colsam... | learni... | max_depth | min_da... | reg_la... | subsample |
-------------------------------------------------------------------------------------------------
0:	test: 0.5681575	test1: 0.5621222	best: 0.5621222 (0)	total: 217ms	remaining: 36m 6s
200:	test: 0.7505528	test1: 0.7211577	best: 0.7212490 (164)	total: 6.65s	remaining: 5m 24s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7225593153
bestIteration = 276

Shrink model to first 277 iterations.
| [0m 1       [0m | [0m 0.5031  [0m | [0m 0.5498  [0m | [0m 0.09556 [0m | [0m 5.928   [0m | [0m 519.7   [0m | [0m 24.04   [0m | [0m 0.4624  [0m |
0:	test: 0.6317614	test1: 0.6175959	best: 0.6175959 (0)	total: 29.9ms	remaining: 4m 59s
200:	test: 0.7522551	test1: 0.7221441	best: 0.7221829 (193)	total: 6.96s	remaining: 5m 39s
St

85:	learn: 0.2558971	total: 2.37s	remaining: 25.2s
86:	learn: 0.2558051	total: 2.4s	remaining: 25.1s
87:	learn: 0.2557468	total: 2.42s	remaining: 25.1s
88:	learn: 0.2556962	total: 2.45s	remaining: 25.1s
89:	learn: 0.2556454	total: 2.47s	remaining: 25s
90:	learn: 0.2555146	total: 2.5s	remaining: 25s
91:	learn: 0.2554675	total: 2.52s	remaining: 24.9s
92:	learn: 0.2554119	total: 2.55s	remaining: 24.9s
93:	learn: 0.2553486	total: 2.58s	remaining: 24.9s
94:	learn: 0.2553205	total: 2.61s	remaining: 24.8s
95:	learn: 0.2552526	total: 2.63s	remaining: 24.8s
96:	learn: 0.2552232	total: 2.67s	remaining: 24.9s
97:	learn: 0.2551408	total: 2.7s	remaining: 24.8s
98:	learn: 0.2550813	total: 2.73s	remaining: 24.8s
99:	learn: 0.2550232	total: 2.75s	remaining: 24.8s
100:	learn: 0.2549766	total: 2.78s	remaining: 24.8s
101:	learn: 0.2549374	total: 2.81s	remaining: 24.7s
102:	learn: 0.2548577	total: 2.83s	remaining: 24.7s
103:	learn: 0.2548173	total: 2.86s	remaining: 24.6s
104:	learn: 0.2547134	total: 2.89s

249:	learn: 0.2477848	total: 6.9s	remaining: 20.7s
250:	learn: 0.2477537	total: 6.92s	remaining: 20.7s
251:	learn: 0.2477289	total: 6.95s	remaining: 20.6s
252:	learn: 0.2476578	total: 6.98s	remaining: 20.6s
253:	learn: 0.2475877	total: 7s	remaining: 20.6s
254:	learn: 0.2475579	total: 7.03s	remaining: 20.5s
255:	learn: 0.2475168	total: 7.05s	remaining: 20.5s
256:	learn: 0.2474763	total: 7.08s	remaining: 20.5s
257:	learn: 0.2474449	total: 7.11s	remaining: 20.4s
258:	learn: 0.2473936	total: 7.14s	remaining: 20.4s
259:	learn: 0.2473505	total: 7.16s	remaining: 20.4s
260:	learn: 0.2472975	total: 7.19s	remaining: 20.3s
261:	learn: 0.2472464	total: 7.23s	remaining: 20.4s
262:	learn: 0.2472042	total: 7.25s	remaining: 20.3s
263:	learn: 0.2471700	total: 7.28s	remaining: 20.3s
264:	learn: 0.2471522	total: 7.31s	remaining: 20.3s
265:	learn: 0.2470993	total: 7.33s	remaining: 20.2s
266:	learn: 0.2470387	total: 7.36s	remaining: 20.2s
267:	learn: 0.2470183	total: 7.38s	remaining: 20.2s
268:	learn: 0.24

408:	learn: 0.2411537	total: 11.2s	remaining: 16.2s
409:	learn: 0.2411043	total: 11.2s	remaining: 16.2s
410:	learn: 0.2410714	total: 11.3s	remaining: 16.1s
411:	learn: 0.2410341	total: 11.3s	remaining: 16.1s
412:	learn: 0.2409940	total: 11.3s	remaining: 16.1s
413:	learn: 0.2409935	total: 11.3s	remaining: 16s
414:	learn: 0.2409931	total: 11.3s	remaining: 16s
415:	learn: 0.2409589	total: 11.4s	remaining: 16s
416:	learn: 0.2409235	total: 11.4s	remaining: 15.9s
417:	learn: 0.2408750	total: 11.4s	remaining: 15.9s
418:	learn: 0.2408507	total: 11.5s	remaining: 15.9s
419:	learn: 0.2407825	total: 11.5s	remaining: 15.8s
420:	learn: 0.2407380	total: 11.5s	remaining: 15.8s
421:	learn: 0.2406709	total: 11.5s	remaining: 15.8s
422:	learn: 0.2406306	total: 11.6s	remaining: 15.8s
423:	learn: 0.2405838	total: 11.6s	remaining: 15.7s
424:	learn: 0.2405330	total: 11.6s	remaining: 15.7s
425:	learn: 0.2404985	total: 11.6s	remaining: 15.7s
426:	learn: 0.2404672	total: 11.7s	remaining: 15.7s
427:	learn: 0.2404

567:	learn: 0.2350031	total: 15.6s	remaining: 11.8s
568:	learn: 0.2349895	total: 15.6s	remaining: 11.8s
569:	learn: 0.2349291	total: 15.6s	remaining: 11.8s
570:	learn: 0.2348834	total: 15.6s	remaining: 11.7s
571:	learn: 0.2348343	total: 15.7s	remaining: 11.7s
572:	learn: 0.2347978	total: 15.7s	remaining: 11.7s
573:	learn: 0.2347685	total: 15.7s	remaining: 11.7s
574:	learn: 0.2347232	total: 15.7s	remaining: 11.6s
575:	learn: 0.2346879	total: 15.8s	remaining: 11.6s
576:	learn: 0.2346295	total: 15.8s	remaining: 11.6s
577:	learn: 0.2346048	total: 15.8s	remaining: 11.5s
578:	learn: 0.2345715	total: 15.8s	remaining: 11.5s
579:	learn: 0.2345315	total: 15.9s	remaining: 11.5s
580:	learn: 0.2344819	total: 15.9s	remaining: 11.5s
581:	learn: 0.2344520	total: 15.9s	remaining: 11.4s
582:	learn: 0.2344420	total: 15.9s	remaining: 11.4s
583:	learn: 0.2343958	total: 16s	remaining: 11.4s
584:	learn: 0.2343480	total: 16s	remaining: 11.3s
585:	learn: 0.2342782	total: 16s	remaining: 11.3s
586:	learn: 0.2342

727:	learn: 0.2289622	total: 19.9s	remaining: 7.42s
728:	learn: 0.2289308	total: 19.9s	remaining: 7.4s
729:	learn: 0.2288765	total: 19.9s	remaining: 7.37s
730:	learn: 0.2288278	total: 19.9s	remaining: 7.34s
731:	learn: 0.2287982	total: 20s	remaining: 7.31s
732:	learn: 0.2287502	total: 20s	remaining: 7.29s
733:	learn: 0.2286798	total: 20s	remaining: 7.26s
734:	learn: 0.2286323	total: 20.1s	remaining: 7.23s
735:	learn: 0.2285909	total: 20.1s	remaining: 7.21s
736:	learn: 0.2285598	total: 20.1s	remaining: 7.18s
737:	learn: 0.2284917	total: 20.1s	remaining: 7.15s
738:	learn: 0.2284358	total: 20.2s	remaining: 7.12s
739:	learn: 0.2283872	total: 20.2s	remaining: 7.1s
740:	learn: 0.2283342	total: 20.2s	remaining: 7.07s
741:	learn: 0.2283100	total: 20.3s	remaining: 7.04s
742:	learn: 0.2282636	total: 20.3s	remaining: 7.02s
743:	learn: 0.2282095	total: 20.3s	remaining: 6.99s
744:	learn: 0.2281469	total: 20.3s	remaining: 6.96s
745:	learn: 0.2281380	total: 20.4s	remaining: 6.93s
746:	learn: 0.228102

892:	learn: 0.2230104	total: 24.5s	remaining: 2.93s
893:	learn: 0.2229553	total: 24.5s	remaining: 2.9s
894:	learn: 0.2229506	total: 24.5s	remaining: 2.88s
895:	learn: 0.2229008	total: 24.6s	remaining: 2.85s
896:	learn: 0.2228498	total: 24.6s	remaining: 2.82s
897:	learn: 0.2228166	total: 24.6s	remaining: 2.79s
898:	learn: 0.2227963	total: 24.6s	remaining: 2.77s
899:	learn: 0.2227828	total: 24.7s	remaining: 2.74s
900:	learn: 0.2227592	total: 24.7s	remaining: 2.71s
901:	learn: 0.2227420	total: 24.7s	remaining: 2.69s
902:	learn: 0.2226894	total: 24.7s	remaining: 2.66s
903:	learn: 0.2226414	total: 24.8s	remaining: 2.63s
904:	learn: 0.2226102	total: 24.8s	remaining: 2.6s
905:	learn: 0.2225808	total: 24.8s	remaining: 2.58s
906:	learn: 0.2225551	total: 24.8s	remaining: 2.55s
907:	learn: 0.2225061	total: 24.9s	remaining: 2.52s
908:	learn: 0.2224858	total: 24.9s	remaining: 2.49s
909:	learn: 0.2224439	total: 24.9s	remaining: 2.46s
910:	learn: 0.2224160	total: 24.9s	remaining: 2.44s
911:	learn: 0.

Важность признаков

In [14]:
from eli5.sklearn import PermutationImportance
from eli5 import show_weights


permutation_estimator = PermutationImportance(
    models['cb']['model'], scoring="roc_auc", random_state=42
)
importance = permutation_estimator.fit(train.fillna(-9999), target)

In [15]:
show_weights(importance, 
             feature_names=train.columns.tolist(), 
             top=train.shape[1])

Weight,Feature
0.0193  ± 0.0009,ratio_credit_to_annuity
0.0162  ± 0.0028,external_scoring_rating_nanmedian
0.0137  ± 0.0012,external_scoring_rating_max
0.0136  ± 0.0006,external_scoring_rating_min
0.0108  ± 0.0009,prev_app_amt_application_sum_approved_apps
0.0107  ± 0.0018,external_scoring_rating_mean
0.0106  ± 0.0011,age
0.0080  ± 0.0006,region_population
0.0060  ± 0.0009,name_contract_type
0.0056  ± 0.0005,expected_total_loss_1


In [16]:
feature_importance = pd.DataFrame(importance.feature_importances_,
                                  index=train.columns, 
                                  columns=['score'])

In [17]:
important_features = feature_importance.loc[feature_importance['score'] > 0.001].index.tolist()
len(important_features)

78

In [18]:
important_features

['name_contract_type',
 'gender',
 'total_salary_x',
 'amount_credit_x',
 'amount_annuity',
 'family_status',
 'region_population',
 'age',
 'days_on_last_job',
 'own_car_age',
 'family_size',
 'external_scoring_rating_1',
 'external_scoring_rating_2',
 'external_scoring_rating_3',
 'amt_req_credit_bureau_qrt',
 'amt_req_credit_bureau_year',
 'education_level_family_status',
 'bki_request_count',
 'external_scoring_prod',
 'external_scoring_weighted',
 'external_scoring_rating_min',
 'external_scoring_rating_max',
 'external_scoring_rating_mean',
 'external_scoring_rating_nanmedian',
 'external_scoring_rating_var',
 'ratio_credit_to_annuity',
 'ratio_annuity_to_salary',
 'ratio_credit_to_salary',
 'total_salary_net',
 'ratio_annuity_to_age',
 'ratio_credit_to_age',
 'ratio_salary_to_age',
 'ratio_salary_to_experience',
 'ratio_credit_to_experience',
 'ratio_annuity_to_experience',
 'ratio_age_to_experience',
 'ratio_salary_to_region_population',
 'ratio_car_to_experience',
 'ratio_car_

In [19]:
%%time
models = dict()
LB_preds = pd.DataFrame()

for key, value in model_search.items():
    model_name = key
    classifier = value['classifier']
    params = value['params']
    params_search = value['params_search']
    categorical = categorical
    
    print(f'{model_name}\n{"*"*70}\n{"*"*70}')

    optimizator = Bayesian_optimization(
                    classifier, params, params_search,
                    x_train[important_features], 
                    y_train, 
                    x_valid[important_features], 
                    y_valid,
                    categorical=categorical,
                    metric=roc_auc_score, evals=True, 
                    method='maximize', random_state=42,
                    init_points=3, n_iter=3, acq='ei',
                    early_stopping_rounds=50, verbose=200,
                    int_params=['max_depth', 
                                'min_child_weights', 
                                'min_data_in_leaf']
                  )
    optimal_params = optimizator.optimize_it()
    
    model = optimizator.model
    model.fit(x_train[important_features], y_train)
#     print(f'\n{model_name}\ntest metric: \
#         {roc_auc_score(y_test, model.predict(x_test))}\n')
    
    models[model_name] = {
        'model': model,
        'y_valid_pred': model.predict(x_valid[important_features]),
#         'y_test_pred': model.predict(x_test),
        'optimal_params': optimal_params,
    }
    
    LB_preds[model_name] = model.predict_proba(test[important_features])[:, 1]

cb
**********************************************************************
**********************************************************************
|   iter    |  target   | colsam... | learni... | max_depth | min_da... | reg_la... | subsample |
-------------------------------------------------------------------------------------------------
0:	test: 0.6172955	test1: 0.6062255	best: 0.6062255 (0)	total: 26.5ms	remaining: 4m 25s
200:	test: 0.7516472	test1: 0.7206858	best: 0.7208866 (192)	total: 6.29s	remaining: 5m 6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7234145188
bestIteration = 333

Shrink model to first 334 iterations.
| [0m 1       [0m | [0m 0.5035  [0m | [0m 0.5498  [0m | [0m 0.09556 [0m | [0m 5.928   [0m | [0m 519.7   [0m | [0m 24.04   [0m | [0m 0.4624  [0m |
0:	test: 0.6512512	test1: 0.6352008	best: 0.6352008 (0)	total: 29ms	remaining: 4m 49s
200:	test: 0.7516308	test1: 0.7210504	best: 0.7211823 (197)	total: 6.69s	remaining: 5m 26s
Stop

78:	learn: 0.2561569	total: 1.91s	remaining: 22.2s
79:	learn: 0.2560913	total: 1.93s	remaining: 22.2s
80:	learn: 0.2559691	total: 1.95s	remaining: 22.2s
81:	learn: 0.2559201	total: 1.97s	remaining: 22.1s
82:	learn: 0.2558453	total: 2s	remaining: 22s
83:	learn: 0.2558030	total: 2.02s	remaining: 22s
84:	learn: 0.2557851	total: 2.04s	remaining: 22s
85:	learn: 0.2557067	total: 2.06s	remaining: 21.9s
86:	learn: 0.2556354	total: 2.09s	remaining: 21.9s
87:	learn: 0.2555599	total: 2.11s	remaining: 21.9s
88:	learn: 0.2554667	total: 2.13s	remaining: 21.8s
89:	learn: 0.2554002	total: 2.15s	remaining: 21.8s
90:	learn: 0.2553423	total: 2.17s	remaining: 21.7s
91:	learn: 0.2552806	total: 2.2s	remaining: 21.7s
92:	learn: 0.2552158	total: 2.24s	remaining: 21.9s
93:	learn: 0.2551635	total: 2.27s	remaining: 21.9s
94:	learn: 0.2550674	total: 2.31s	remaining: 22s
95:	learn: 0.2550137	total: 2.33s	remaining: 21.9s
96:	learn: 0.2549474	total: 2.35s	remaining: 21.9s
97:	learn: 0.2548814	total: 2.37s	remaining

244:	learn: 0.2484609	total: 5.97s	remaining: 18.4s
245:	learn: 0.2484302	total: 6s	remaining: 18.4s
246:	learn: 0.2483640	total: 6.03s	remaining: 18.4s
247:	learn: 0.2483382	total: 6.04s	remaining: 18.3s
248:	learn: 0.2483030	total: 6.07s	remaining: 18.3s
249:	learn: 0.2482587	total: 6.09s	remaining: 18.3s
250:	learn: 0.2482183	total: 6.11s	remaining: 18.2s
251:	learn: 0.2481522	total: 6.13s	remaining: 18.2s
252:	learn: 0.2481464	total: 6.15s	remaining: 18.2s
253:	learn: 0.2480786	total: 6.17s	remaining: 18.1s
254:	learn: 0.2480373	total: 6.19s	remaining: 18.1s
255:	learn: 0.2480264	total: 6.21s	remaining: 18.1s
256:	learn: 0.2480040	total: 6.25s	remaining: 18.1s
257:	learn: 0.2479424	total: 6.27s	remaining: 18s
258:	learn: 0.2479303	total: 6.29s	remaining: 18s
259:	learn: 0.2478994	total: 6.31s	remaining: 18s
260:	learn: 0.2478743	total: 6.33s	remaining: 17.9s
261:	learn: 0.2478254	total: 6.36s	remaining: 17.9s
262:	learn: 0.2477981	total: 6.39s	remaining: 17.9s
263:	learn: 0.2477961

407:	learn: 0.2422678	total: 9.86s	remaining: 14.3s
408:	learn: 0.2422119	total: 9.89s	remaining: 14.3s
409:	learn: 0.2421800	total: 9.91s	remaining: 14.3s
410:	learn: 0.2421264	total: 9.93s	remaining: 14.2s
411:	learn: 0.2420839	total: 9.95s	remaining: 14.2s
412:	learn: 0.2420478	total: 9.97s	remaining: 14.2s
413:	learn: 0.2420296	total: 9.99s	remaining: 14.1s
414:	learn: 0.2419561	total: 10s	remaining: 14.1s
415:	learn: 0.2419212	total: 10s	remaining: 14.1s
416:	learn: 0.2418599	total: 10.1s	remaining: 14.1s
417:	learn: 0.2418506	total: 10.1s	remaining: 14s
418:	learn: 0.2418419	total: 10.1s	remaining: 14s
419:	learn: 0.2417958	total: 10.1s	remaining: 14s
420:	learn: 0.2417719	total: 10.2s	remaining: 14s
421:	learn: 0.2417716	total: 10.2s	remaining: 13.9s
422:	learn: 0.2417296	total: 10.2s	remaining: 13.9s
423:	learn: 0.2416894	total: 10.2s	remaining: 13.9s
424:	learn: 0.2416696	total: 10.3s	remaining: 13.9s
425:	learn: 0.2416405	total: 10.3s	remaining: 13.9s
426:	learn: 0.2416106	to

566:	learn: 0.2361870	total: 13.9s	remaining: 10.6s
567:	learn: 0.2361682	total: 13.9s	remaining: 10.6s
568:	learn: 0.2361597	total: 13.9s	remaining: 10.6s
569:	learn: 0.2361399	total: 14s	remaining: 10.5s
570:	learn: 0.2361242	total: 14s	remaining: 10.5s
571:	learn: 0.2361192	total: 14s	remaining: 10.5s
572:	learn: 0.2361183	total: 14s	remaining: 10.5s
573:	learn: 0.2360827	total: 14.1s	remaining: 10.4s
574:	learn: 0.2360217	total: 14.1s	remaining: 10.4s
575:	learn: 0.2359592	total: 14.1s	remaining: 10.4s
576:	learn: 0.2359258	total: 14.1s	remaining: 10.4s
577:	learn: 0.2358814	total: 14.1s	remaining: 10.3s
578:	learn: 0.2358643	total: 14.2s	remaining: 10.3s
579:	learn: 0.2358608	total: 14.2s	remaining: 10.3s
580:	learn: 0.2358200	total: 14.2s	remaining: 10.3s
581:	learn: 0.2357660	total: 14.2s	remaining: 10.2s
582:	learn: 0.2357529	total: 14.3s	remaining: 10.2s
583:	learn: 0.2357250	total: 14.3s	remaining: 10.2s
584:	learn: 0.2357119	total: 14.3s	remaining: 10.2s
585:	learn: 0.235674

725:	learn: 0.2309851	total: 17.7s	remaining: 6.68s
726:	learn: 0.2309423	total: 17.7s	remaining: 6.66s
727:	learn: 0.2309029	total: 17.8s	remaining: 6.64s
728:	learn: 0.2309019	total: 17.8s	remaining: 6.61s
729:	learn: 0.2308892	total: 17.8s	remaining: 6.58s
730:	learn: 0.2308586	total: 17.8s	remaining: 6.56s
731:	learn: 0.2308340	total: 17.8s	remaining: 6.53s
732:	learn: 0.2308176	total: 17.9s	remaining: 6.51s
733:	learn: 0.2307898	total: 17.9s	remaining: 6.49s
734:	learn: 0.2307529	total: 17.9s	remaining: 6.47s
735:	learn: 0.2307385	total: 18s	remaining: 6.44s
736:	learn: 0.2307086	total: 18s	remaining: 6.42s
737:	learn: 0.2306676	total: 18s	remaining: 6.39s
738:	learn: 0.2306166	total: 18s	remaining: 6.37s
739:	learn: 0.2305940	total: 18s	remaining: 6.34s
740:	learn: 0.2305293	total: 18.1s	remaining: 6.32s
741:	learn: 0.2304891	total: 18.1s	remaining: 6.29s
742:	learn: 0.2304168	total: 18.1s	remaining: 6.27s
743:	learn: 0.2303717	total: 18.1s	remaining: 6.24s
744:	learn: 0.2303611	

890:	learn: 0.2253912	total: 21.8s	remaining: 2.67s
891:	learn: 0.2253405	total: 21.9s	remaining: 2.65s
892:	learn: 0.2253258	total: 21.9s	remaining: 2.62s
893:	learn: 0.2252914	total: 21.9s	remaining: 2.6s
894:	learn: 0.2252739	total: 21.9s	remaining: 2.57s
895:	learn: 0.2252235	total: 21.9s	remaining: 2.55s
896:	learn: 0.2252019	total: 22s	remaining: 2.52s
897:	learn: 0.2251421	total: 22s	remaining: 2.5s
898:	learn: 0.2250954	total: 22s	remaining: 2.47s
899:	learn: 0.2250784	total: 22s	remaining: 2.45s
900:	learn: 0.2250596	total: 22.1s	remaining: 2.42s
901:	learn: 0.2250277	total: 22.1s	remaining: 2.4s
902:	learn: 0.2250008	total: 22.1s	remaining: 2.38s
903:	learn: 0.2249905	total: 22.1s	remaining: 2.35s
904:	learn: 0.2249535	total: 22.2s	remaining: 2.33s
905:	learn: 0.2249347	total: 22.2s	remaining: 2.3s
906:	learn: 0.2248916	total: 22.2s	remaining: 2.28s
907:	learn: 0.2248400	total: 22.3s	remaining: 2.25s
908:	learn: 0.2248067	total: 22.3s	remaining: 2.23s
909:	learn: 0.2247614	to

In [20]:
for key, val in models.items():
    print(f'\n{key}\n{val["optimal_params"]}')


cb
{'colsample_bylevel': 0.5498160475389451, 'learning_rate': 0.09556428757689246, 'max_depth': 5, 'min_data_in_leaf': 519, 'reg_lambda': 24.041677639819287, 'subsample': 0.46239780813448106}


Подготвка прогноза

In [21]:
y_pred = gmean(LB_preds, axis=1)
# y_pred = LB_preds.mean(axis=1)
y_pred = y_pred * 4
y_pred = y_pred.clip(0, 1)
LB_preds.head(3)

Unnamed: 0,cb
0,0.079429
1,0.276897
2,0.185115


In [22]:
submit = pd.read_csv('./data/sample_submit.csv')
submit['TARGET'] = y_pred
submit.to_csv('./baseline_preds.csv', index=False, encoding='utf-8', sep=',')
submit.head()

Unnamed: 0,APPLICATION_NUMBER,TARGET
0,123724268,0.317717
1,123456549,1.0
2,123428178,0.740461
3,123619984,0.292606
4,123671104,0.061746


In [23]:
submit[submit['TARGET'] > 0.5].shape

(25204, 2)

In [24]:
x_train['target'] = y_train
x_valid['target'] = y_valid
x_train[['target'] + important_features].to_pickle('prepared_train_adv075.pkl')
x_valid[['target'] + important_features].to_pickle('prepared_valid_adv075.pkl')
test[important_features].to_pickle('prepared_test_adv075.pkl')