### Импорты

In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


import matplotlib.pyplot as plt
import seaborn as sns


### Загрузка данных

In [15]:
# Read the prepared training table (per the file name: all engineered features,
# categorical columns not encoded) and preview its first rows.
train_csv_path = '../data/train_with_all_features_made_without_cat_encoding.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,home_ownership,income,years_in_current_job,tax_liens,number_of_open_accounts,years_of_credit_history,maximum_open_credit,number_of_credit_problem,bankruptcies,purpose,...,number_of_bankruptcies_per_history,current_credit_balance_per_income,monthly_income_per_monthly_debt,current_loan_amount_per_income,months_to_pay_per_month_of_credit_history,PCA_5_0,PCA_5_1,PCA_5_2,PCA_5_3,PCA_5_4
0,Own Home,482087.0,unknown,0.0,11.0,26.3,685960.0,1.0,1.0,debt consolidation,...,0.038023,0.098293,5.07631,24.628703,4.753724,-0.168816,1.080475,-2.203982,1.837586,-1.157517
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,0.0,debt consolidation,...,0.0,0.385156,4.651241,0.258383,0.078549,0.3635,-0.013454,0.624591,-0.758497,0.175988
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,0.0,debt consolidation,...,0.0,0.410413,4.587039,15.801155,2.070872,0.992833,-1.463747,0.08664,1.734586,-1.469592
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,1.0,debt consolidation,...,0.044444,0.119064,5.917181,0.15079,0.039656,0.678198,1.350833,-3.014448,0.179879,-1.588368
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,0.0,debt consolidation,...,0.0,0.120203,9.009564,0.16211,0.107393,-0.075578,1.993978,-1.333405,-0.840971,-0.322678


In [16]:
# Names of the columns intended to be used as categorical features.
# 'tax_liens', 'number_of_credit_problem' and 'bankruptcies' are count-like
# columns that are cast to int further down in the notebook.
cat_features = [
    'home_ownership',
    'years_in_current_job',
    'tax_liens',
    'number_of_credit_problem',
    'bankruptcies',
    'purpose',
    'term',
    'num_features_clustered_by_3',
    'num_features_clustered_by_4',
]

# Names of the numeric feature columns. Every name appears exactly once:
# the original list repeated the eight ratio features, which would produce
# duplicated columns if the list were used to select from the DataFrame.
num_features = [
    # raw numeric attributes
    'income',
    'number_of_open_accounts',
    'years_of_credit_history',
    'maximum_open_credit',
    'current_loan_amount',
    'current_credit_balance',
    'monthly_debt',
    'credit_score',
    # engineered ratio features
    'income_per_credit_history',
    'mean_credit_score_per_history',
    'number_of_credit_problem_per_history',
    'number_of_bankruptcies_per_history',
    'current_credit_balance_per_income',
    'monthly_income_per_monthly_debt',
    'current_loan_amount_per_income',
    'months_to_pay_per_month_of_credit_history',
    # PCA components
    'PCA_5_0',
    'PCA_5_1',
    'PCA_5_2',
    'PCA_5_3',
    'PCA_5_4',
]

# Name of the label column.
target = 'credit_default'

In [17]:
# Separate the label column from the remaining feature columns.
y = data[target]
x = data.drop(target, axis='columns')

# Show both shapes as a quick sanity check.
x.shape, y.shape

((7500, 30), (7500,))

In [47]:
# Class balance of the label column.
target_counts = data['credit_default'].value_counts()
target_counts

0    5387
1    2113
Name: credit_default, dtype: int64

Делим данные на тренировочную и валидационную выборки:

In [23]:
# Hold out 20% of the rows; stratify on y so both parts keep the class ratio,
# and fix the seed so the split is reproducible.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)
x_train, x_test, y_train, y_test = split_parts

x_train.shape, x_test.shape

((6000, 30), (1500, 30))

In [20]:
# Count-like columns that are passed to CatBoost as categorical features.
# CatBoost accepts only integer or string values in categorical columns,
# so their float values are cast to int.
int_coded_columns = ['tax_liens', 'number_of_credit_problem', 'bankruptcies']
int_dtypes = dict.fromkeys(int_coded_columns, int)

# This cell sits after the train/test split in the notebook, and the split
# frames are separate objects from x. Casting x alone would leave x_train and
# x_test with float values in these columns, and fitting would then fail on a
# fresh "Restart & Run All". Cast all three frames; astype returns new frames,
# so re-running the cell is harmless.
x = x.astype(int_dtypes)
x_train = x_train.astype(int_dtypes)
x_test = x_test.astype(int_dtypes)

In [26]:
# Positional indices of every column of x whose dtype is not float;
# these positions are later passed to CatBoost as the categorical features.
non_float_mask = (x.dtypes != float).to_numpy()
categorical_features_indices = np.flatnonzero(non_float_mask)

### CatBoost и подбор параметров

In [48]:
# Per-class weights for the two labels; the second value is the larger one.
# NOTE(review): presumably listed in label order (0, then 1), so the rarer
# class 1 gets the larger weight - confirm against the CatBoost docs.
class_weights = [2.1, 5.3]
model = CatBoostClassifier(
    class_weights=class_weights,
    random_state=1,
)

In [49]:
# Train on the training part, telling CatBoost which column positions
# hold categorical features.
model.fit(
    x_train,
    y_train,
    cat_features=categorical_features_indices,
)

Learning rate set to 0.022141
0:	learn: 0.6874867	total: 25.1ms	remaining: 25.1s
1:	learn: 0.6835190	total: 51.2ms	remaining: 25.6s
2:	learn: 0.6787889	total: 75ms	remaining: 24.9s
3:	learn: 0.6756354	total: 86.8ms	remaining: 21.6s
4:	learn: 0.6721857	total: 110ms	remaining: 21.8s
5:	learn: 0.6679141	total: 132ms	remaining: 21.9s
6:	learn: 0.6640535	total: 159ms	remaining: 22.5s
7:	learn: 0.6611011	total: 181ms	remaining: 22.5s
8:	learn: 0.6581856	total: 206ms	remaining: 22.6s
9:	learn: 0.6549775	total: 234ms	remaining: 23.2s
10:	learn: 0.6523816	total: 270ms	remaining: 24.3s
11:	learn: 0.6500743	total: 298ms	remaining: 24.6s
12:	learn: 0.6474422	total: 322ms	remaining: 24.5s
13:	learn: 0.6450811	total: 349ms	remaining: 24.6s
14:	learn: 0.6431846	total: 389ms	remaining: 25.5s
15:	learn: 0.6410022	total: 461ms	remaining: 28.4s
16:	learn: 0.6388822	total: 515ms	remaining: 29.8s
17:	learn: 0.6375887	total: 544ms	remaining: 29.7s
18:	learn: 0.6358206	total: 604ms	remaining: 31.2s
19:	learn

<catboost.core.CatBoostClassifier at 0x7f2378caf220>

In [50]:
# Predict class labels (CatBoost's default prediction type) for the held-out rows.
y_pred = model.predict(x_test, prediction_type='Class')

In [51]:
# Per-class precision / recall / F1 on the held-out rows.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.80      0.68      0.74      1077
           1       0.41      0.56      0.47       423

    accuracy                           0.65      1500
   macro avg       0.60      0.62      0.60      1500
weighted avg       0.69      0.65      0.66      1500

