### Импорты

In [2]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


import matplotlib.pyplot as plt
import seaborn as sns


### Загрузка данных

In [3]:
data = pd.read_csv('../data/train_with_all_features_made_without_cat_encoding.csv')
data.head()

Unnamed: 0,home_ownership,income,years_in_current_job,tax_liens,number_of_open_accounts,years_of_credit_history,maximum_open_credit,number_of_credit_problem,bankruptcies,purpose,...,number_of_bankruptcies_per_history,current_credit_balance_per_income,monthly_income_per_monthly_debt,current_loan_amount_per_income,months_to_pay_per_month_of_credit_history,PCA_5_0,PCA_5_1,PCA_5_2,PCA_5_3,PCA_5_4
0,Own Home,482087.0,unknown,0.0,11.0,26.3,685960.0,1.0,1.0,debt consolidation,...,0.038023,0.098293,5.07631,24.628703,4.753724,-0.168816,1.080475,-2.203982,1.837586,-1.157517
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,0.0,debt consolidation,...,0.0,0.385156,4.651241,0.258383,0.078549,0.3635,-0.013454,0.624591,-0.758497,0.175988
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,0.0,debt consolidation,...,0.0,0.410413,4.587039,15.801155,2.070872,0.992833,-1.463747,0.08664,1.734586,-1.469592
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,1.0,debt consolidation,...,0.044444,0.119064,5.917181,0.15079,0.039656,0.678198,1.350833,-3.014448,0.179879,-1.588368
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,0.0,debt consolidation,...,0.0,0.120203,9.009564,0.16211,0.107393,-0.075578,1.993978,-1.333405,-0.840971,-0.322678


In [4]:
cat_features = ['home_ownership',
 'years_in_current_job',
 'tax_liens',
 'number_of_credit_problem',
 'bankruptcies',
 'purpose',
 'term',
 'num_features_clustered_by_3',
 'num_features_clustered_by_4']

num_features = ['income',
 'number_of_open_accounts',
 'years_of_credit_history',
 'maximum_open_credit',
 'current_loan_amount',
 'current_credit_balance',
 'monthly_debt',
 'credit_score',
 'income_per_credit_history',
 'mean_credit_score_per_history',
 'number_of_credit_problem_per_history',
 'number_of_bankruptcies_per_history',
 'current_credit_balance_per_income',
 'monthly_income_per_monthly_debt',
 'current_loan_amount_per_income',
 'months_to_pay_per_month_of_credit_history',
 'income_per_credit_history',
 'mean_credit_score_per_history',
 'number_of_credit_problem_per_history',
 'number_of_bankruptcies_per_history',
 'current_credit_balance_per_income',
 'monthly_income_per_monthly_debt',
 'current_loan_amount_per_income',
 'months_to_pay_per_month_of_credit_history',
 'PCA_5_0',
 'PCA_5_1',
 'PCA_5_2',
 'PCA_5_3',
 'PCA_5_4']

target = 'credit_default'

In [5]:
x = data.drop(columns=[target])
y = data[target]

x.shape, y.shape

((7500, 30), (7500,))

In [6]:
data['credit_default'].value_counts()

0    5387
1    2113
Name: credit_default, dtype: int64

Делим на тренировочную и валидационную выборку:

In [7]:
# x[cat_features] = x[cat_features].astype(str)
# x[cat_features] = x[cat_features].apply(lambda x: str(x))

In [8]:
x[['tax_liens', 'number_of_credit_problem', 'bankruptcies']] = x[['tax_liens', 'number_of_credit_problem', 'bankruptcies']].astype(int)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)
x_train.shape, x_test.shape

((6000, 30), (1500, 30))

In [10]:
categorical_features_indices = np.where(x.dtypes != float)[0]

In [11]:
categorical_features_indices

array([ 0,  2,  3,  7,  8,  9, 10, 15, 16])

## Catboost и подбор параметров

In [12]:
model = CatBoostClassifier(random_state=1, class_weights=[2.1, 5.4])

In [13]:
model.fit(x_train, y_train, cat_features=categorical_features_indices)

Learning rate set to 0.022141
0:	learn: 0.6875487	total: 87.4ms	remaining: 1m 27s
1:	learn: 0.6827554	total: 113ms	remaining: 56.2s
2:	learn: 0.6781767	total: 138ms	remaining: 45.9s
3:	learn: 0.6731805	total: 161ms	remaining: 40.2s
4:	learn: 0.6693854	total: 201ms	remaining: 40s
5:	learn: 0.6653117	total: 265ms	remaining: 44s
6:	learn: 0.6614944	total: 316ms	remaining: 44.9s
7:	learn: 0.6583547	total: 353ms	remaining: 43.8s
8:	learn: 0.6560173	total: 371ms	remaining: 40.9s
9:	learn: 0.6529299	total: 405ms	remaining: 40.1s
10:	learn: 0.6499999	total: 464ms	remaining: 41.8s
11:	learn: 0.6472666	total: 493ms	remaining: 40.6s
12:	learn: 0.6449310	total: 536ms	remaining: 40.7s
13:	learn: 0.6427237	total: 567ms	remaining: 39.9s
14:	learn: 0.6406285	total: 595ms	remaining: 39.1s
15:	learn: 0.6386811	total: 643ms	remaining: 39.5s
16:	learn: 0.6374727	total: 710ms	remaining: 41.1s
17:	learn: 0.6358686	total: 764ms	remaining: 41.7s
18:	learn: 0.6344756	total: 821ms	remaining: 42.4s
19:	learn: 0.

<catboost.core.CatBoostClassifier at 0x7f200bb15450>

In [14]:
y_pred = model.predict(x_test)

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.68      0.73      1077
           1       0.41      0.57      0.47       423

    accuracy                           0.65      1500
   macro avg       0.60      0.62      0.60      1500
weighted avg       0.69      0.65      0.66      1500



### GridSearch

In [50]:
model = CatBoostClassifier(class_weights=[2.1, 5.3], 
                           cat_features=cat_features, 
                           random_seed=1)

In [51]:
params = {
    'learning_rate': [0.01, 0.015, 0.02],
    'l2_leaf_reg': [7, 9],
    'n_estimators': [20, 50, 100, 150],
    'depth': [5, 7, 10],
    
}


In [52]:
grid_search = GridSearchCV(estimator=model,
                           param_grid=params,
                           scoring='f1',
                           cv=3)

In [53]:
grid_search.fit(x_train, y_train)

0:	learn: 0.6913745	total: 16.3ms	remaining: 310ms
1:	learn: 0.6894578	total: 25.8ms	remaining: 232ms
2:	learn: 0.6875412	total: 36.7ms	remaining: 208ms
3:	learn: 0.6857536	total: 48.9ms	remaining: 195ms
4:	learn: 0.6837616	total: 59.7ms	remaining: 179ms
5:	learn: 0.6819748	total: 67.6ms	remaining: 158ms
6:	learn: 0.6803408	total: 76.1ms	remaining: 141ms
7:	learn: 0.6790397	total: 83.2ms	remaining: 125ms
8:	learn: 0.6774814	total: 95ms	remaining: 116ms
9:	learn: 0.6760587	total: 103ms	remaining: 103ms
10:	learn: 0.6746131	total: 111ms	remaining: 90.7ms
11:	learn: 0.6732748	total: 119ms	remaining: 79.5ms
12:	learn: 0.6717117	total: 127ms	remaining: 68.3ms
13:	learn: 0.6704216	total: 135ms	remaining: 58ms
14:	learn: 0.6689295	total: 145ms	remaining: 48.2ms
15:	learn: 0.6679338	total: 150ms	remaining: 37.6ms
16:	learn: 0.6665443	total: 161ms	remaining: 28.4ms
17:	learn: 0.6652966	total: 172ms	remaining: 19.1ms
18:	learn: 0.6641754	total: 181ms	remaining: 9.54ms
19:	learn: 0.6631712	total:

In [57]:
grid_search.best_params_

{'depth': 10, 'l2_leaf_reg': 9, 'learning_rate': 0.02, 'n_estimators': 150}

In [56]:
y_pred = grid_search.best_estimator_.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.64      0.72      1077
           1       0.41      0.65      0.51       423

    accuracy                           0.64      1500
   macro avg       0.62      0.64      0.61      1500
weighted avg       0.71      0.64      0.66      1500



In [58]:
model = CatBoostClassifier(class_weights=[2.1, 5.3], 
                           cat_features=cat_features, 
                           random_seed=1)

params = {
    'learning_rate': [0.02, 0.05, 0.07],
    'l2_leaf_reg': [9],
    'n_estimators': [150, 200, 250],
    'depth': [7, 10, 15],
    
}

In [59]:
grid_search = GridSearchCV(estimator=model,
                           param_grid=params,
                           scoring='f1',
                           cv=3)

In [60]:
grid_search.fit(x_train, y_train)

0:	learn: 0.6898593	total: 36.7ms	remaining: 5.47s
1:	learn: 0.6863028	total: 70.1ms	remaining: 5.19s
2:	learn: 0.6833234	total: 89.4ms	remaining: 4.38s
3:	learn: 0.6797457	total: 103ms	remaining: 3.75s
4:	learn: 0.6766586	total: 116ms	remaining: 3.35s
5:	learn: 0.6733999	total: 129ms	remaining: 3.09s
6:	learn: 0.6704246	total: 141ms	remaining: 2.89s
7:	learn: 0.6674148	total: 147ms	remaining: 2.61s
8:	learn: 0.6649139	total: 160ms	remaining: 2.51s
9:	learn: 0.6621938	total: 173ms	remaining: 2.43s
10:	learn: 0.6602877	total: 188ms	remaining: 2.37s
11:	learn: 0.6581357	total: 192ms	remaining: 2.21s
12:	learn: 0.6556587	total: 207ms	remaining: 2.18s
13:	learn: 0.6537693	total: 217ms	remaining: 2.11s
14:	learn: 0.6520929	total: 231ms	remaining: 2.08s
15:	learn: 0.6499741	total: 239ms	remaining: 2s
16:	learn: 0.6481419	total: 260ms	remaining: 2.03s
17:	learn: 0.6466353	total: 283ms	remaining: 2.08s
18:	learn: 0.6451185	total: 290ms	remaining: 2s
19:	learn: 0.6434222	total: 310ms	remaining:

In [61]:
grid_search.best_params_

{'depth': 7, 'l2_leaf_reg': 9, 'learning_rate': 0.02, 'n_estimators': 250}

In [87]:
y_pred = grid_search.best_estimator_.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.61      0.70      1077
           1       0.40      0.65      0.49       423

    accuracy                           0.62      1500
   macro avg       0.61      0.63      0.60      1500
weighted avg       0.70      0.62      0.64      1500



In [75]:
params = {'depth': 10, 'l2_leaf_reg': 9, 'learning_rate': 0.02, 'n_estimators': 150}

In [76]:
model = CatBoostClassifier(class_weights=[2.1, 5.3], 
                           random_seed=1,
                           **params)

In [77]:
model.fit(x_train, y_train, cat_features=cat_features)

0:	learn: 0.6894530	total: 139ms	remaining: 20.7s
1:	learn: 0.6856615	total: 237ms	remaining: 17.5s
2:	learn: 0.6813451	total: 254ms	remaining: 12.5s
3:	learn: 0.6777346	total: 372ms	remaining: 13.6s
4:	learn: 0.6741866	total: 530ms	remaining: 15.4s
5:	learn: 0.6710609	total: 742ms	remaining: 17.8s
6:	learn: 0.6679905	total: 995ms	remaining: 20.3s
7:	learn: 0.6650563	total: 1.1s	remaining: 19.6s
8:	learn: 0.6626543	total: 1.14s	remaining: 17.9s
9:	learn: 0.6605333	total: 1.37s	remaining: 19.2s
10:	learn: 0.6575806	total: 1.39s	remaining: 17.5s
11:	learn: 0.6554023	total: 1.54s	remaining: 17.7s
12:	learn: 0.6537551	total: 1.56s	remaining: 16.4s
13:	learn: 0.6514778	total: 1.74s	remaining: 16.9s
14:	learn: 0.6490870	total: 1.8s	remaining: 16.2s
15:	learn: 0.6468552	total: 1.93s	remaining: 16.2s
16:	learn: 0.6449257	total: 2.09s	remaining: 16.4s
17:	learn: 0.6436265	total: 2.11s	remaining: 15.5s
18:	learn: 0.6415002	total: 2.21s	remaining: 15.2s
19:	learn: 0.6394900	total: 2.24s	remaining

<catboost.core.CatBoostClassifier at 0x7f2008f01c30>

In [78]:
y_pred = model.predict(x_test)

In [79]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.64      0.72      1077
           1       0.41      0.65      0.51       423

    accuracy                           0.64      1500
   macro avg       0.62      0.64      0.61      1500
weighted avg       0.71      0.64      0.66      1500



### Сохранение результатов

In [86]:
with open('catboost_grid_search_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [81]:
with open('../data/x_train.pkl', 'wb') as f:
    pickle.dump(x_train, f)

In [82]:
with open('../data/y_train.pkl', 'wb') as f:
    pickle.dump(y_train, f)

In [83]:
with open('../data/x_test.pkl', 'wb') as f:
    pickle.dump(x_test, f)

In [85]:
with open('../data/y_test.pkl', 'wb') as f:
    pickle.dump(y_test, f)