# 2. Machine Learning

1. Построить модель (при помощи Python) прогнозирование дефолта клиента на аппликационных и поведенческих данных из сайта с помощью логистической регрессии, спрогнозировать вероятность не выполнения обязательств перед компанией. Финальную модель логистической регрессии необходимо отобразить в Excel (параметры, группы и баллы).

In [614]:
import gc

import numpy as np
import pandas as pd
from scipy.stats import uniform, randint

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, validation_curve, StratifiedKFold, RandomizedSearchCV

from xgboost.sklearn import XGBClassifier

from sklearn.metrics import auc, roc_curve, accuracy_score, f1_score


# Reading the dataset

In [615]:
%%time 
data = pd.read_csv('data.csv')

CPU times: user 1.11 s, sys: 65.9 ms, total: 1.18 s
Wall time: 1.18 s


In [616]:
print('data shape:', data.shape)

data shape: (35311, 285)


In [617]:
data.head()

Unnamed: 0,target_flag,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_uah,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_not_uah,crdeal_deallife__loans_cnt__dldonor_bnk,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_7_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_30_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_45_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_60_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_90_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_120_days,...,credres__credit_req_cnt__org_bnk__last_3_5_days,credres__credit_req_cnt__org_bnk__last_7_days,credres__credit_req_cnt__org_bnk__last_10_days,credres__credit_req_cnt__org_bnk__last_13_days,credres__credit_req_cnt__org_bnk__last_20_days,credres__credit_req_cnt__org_bnk__last_30_days,credres__credit_req_cnt__org_bnk__last_60_days,credres__credit_req_cnt__org_bnk__more_than_90_days,credres__credit_req_cnt__org_bnk__last_360_days,credres__credit_req_cnt__org_bnk__more_than_360_days
0,good,40.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0,5.0,4.0
1,good,72.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,14.0,6.0,11.0
2,good,77.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,5.0,9.0
3,good,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0
4,good,8.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


# Handling missing data

In [618]:
missing = data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)

In [619]:
missing

crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_uah                     94
crdeal_deallife__loans_cnt__state_discounted__last_active_lteq_90_days       94
crdeal_deallife__loans_cnt__state_discounted__last_active_lteq_120_days      94
crdeal_deallife__loans_cnt__state_discounted__last_active_lteq_180_days      94
crdeal_deallife__loans_cnt__state_discounted__last_active_lteq_360_days      94
                                                                           ... 
crdeal_deallife__initial_amount_mean__state_clsd                           2215
crdeal_deallife__historical_max_overdue_debt__clsd_state                   2215
crdeal_deallife__initial_amount_sum__state_clsd                            2215
crdeal_deallife__current_max_dpd__donor_mfo                                3648
crdeal_deallife__current_max_dpd__donor_bnk                                4816
Length: 284, dtype: int64

In [620]:
data = data.fillna(data.mean())

In [621]:
missing = data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)

In [622]:
missing

crdeal_deallife__first_loan_open_date                 94
crdeal_deallife__last_loan_open_date                  94
crdeal_deallife__first_loan_open_date__donor_mfo    1687
crdeal_deallife__last_loan_open_date__donor_mfo     1687
crdeal_deallife__first_loan_open_date__donor_bnk    1943
crdeal_deallife__last_loan_open_date__donor_bnk     1943
dtype: int64

In [623]:
data = data.drop(missing.index, axis=1)

# Splitting the data into train and test

In [624]:
test_data = data[data['target_flag'] == 'not determined']
test_data = test_data.drop('target_flag', axis=1)

test_data.head()

Unnamed: 0,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_uah,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_not_uah,crdeal_deallife__loans_cnt__dldonor_bnk,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_7_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_30_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_45_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_60_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_90_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_120_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_180_days,...,credres__credit_req_cnt__org_bnk__last_3_5_days,credres__credit_req_cnt__org_bnk__last_7_days,credres__credit_req_cnt__org_bnk__last_10_days,credres__credit_req_cnt__org_bnk__last_13_days,credres__credit_req_cnt__org_bnk__last_20_days,credres__credit_req_cnt__org_bnk__last_30_days,credres__credit_req_cnt__org_bnk__last_60_days,credres__credit_req_cnt__org_bnk__more_than_90_days,credres__credit_req_cnt__org_bnk__last_360_days,credres__credit_req_cnt__org_bnk__more_than_360_days
7,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9,25.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,4.0,11.0,9.0,7.0
14,21.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,2.0,2.0,2.0,6.0,7.0,1.0
17,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
21,9.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0


In [625]:
train_data = data[data['target_flag'] != 'not determined']
train_data.head()

Unnamed: 0,target_flag,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_uah,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_not_uah,crdeal_deallife__loans_cnt__dldonor_bnk,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_7_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_30_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_45_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_60_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_90_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_120_days,...,credres__credit_req_cnt__org_bnk__last_3_5_days,credres__credit_req_cnt__org_bnk__last_7_days,credres__credit_req_cnt__org_bnk__last_10_days,credres__credit_req_cnt__org_bnk__last_13_days,credres__credit_req_cnt__org_bnk__last_20_days,credres__credit_req_cnt__org_bnk__last_30_days,credres__credit_req_cnt__org_bnk__last_60_days,credres__credit_req_cnt__org_bnk__more_than_90_days,credres__credit_req_cnt__org_bnk__last_360_days,credres__credit_req_cnt__org_bnk__more_than_360_days
0,good,40.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0,5.0,4.0
1,good,72.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,14.0,6.0,11.0
2,good,77.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,5.0,9.0
3,good,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0
4,good,8.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


# Encoding target into 0's and 1's (0 - bad, 1 - good)

In [626]:
le = LabelEncoder()
le.fit(train_data['target_flag'])

LabelEncoder()

In [627]:
le.classes_

array(['bad', 'good'], dtype=object)

In [628]:
train_data['target_flag'] = le.transform(train_data['target_flag'])

# Selecting highly corellated features

In [629]:
cor = train_data.corr()

In [630]:
cor

Unnamed: 0,target_flag,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_uah,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_not_uah,crdeal_deallife__loans_cnt__dldonor_bnk,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_7_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_30_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_45_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_60_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_90_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_120_days,...,credres__credit_req_cnt__org_bnk__last_3_5_days,credres__credit_req_cnt__org_bnk__last_7_days,credres__credit_req_cnt__org_bnk__last_10_days,credres__credit_req_cnt__org_bnk__last_13_days,credres__credit_req_cnt__org_bnk__last_20_days,credres__credit_req_cnt__org_bnk__last_30_days,credres__credit_req_cnt__org_bnk__last_60_days,credres__credit_req_cnt__org_bnk__more_than_90_days,credres__credit_req_cnt__org_bnk__last_360_days,credres__credit_req_cnt__org_bnk__more_than_360_days
target_flag,1.000000,0.091743,0.013211,0.038861,-0.007049,-0.011811,-0.017213,-0.022600,-0.017219,-0.013722,...,-0.020655,-0.027994,-0.029913,-0.033918,-0.040779,-0.044781,-0.047482,0.061724,0.012885,0.063783
crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_uah,0.091743,1.000000,0.022652,0.273456,-0.005998,0.014167,0.020049,0.023663,0.043557,0.051764,...,0.002438,0.008426,0.015355,0.018750,0.026693,0.042059,0.082420,0.379365,0.295388,0.322104
crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_not_uah,0.013211,0.022652,1.000000,0.068526,0.001052,-0.000032,0.007995,0.007843,0.003092,0.002735,...,0.007526,0.007545,0.015456,0.016200,0.021692,0.019370,0.021928,0.040042,0.030510,0.039556
crdeal_deallife__loans_cnt__dldonor_bnk,0.038861,0.273456,0.068526,1.000000,0.029250,0.205050,0.277884,0.323573,0.403166,0.460923,...,0.044780,0.057475,0.071271,0.081803,0.108255,0.135666,0.194585,0.475663,0.378076,0.444309
crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_7_days,-0.007049,-0.005998,0.001052,0.029250,1.000000,0.323123,0.236427,0.189661,0.138114,0.117182,...,0.058297,0.081329,0.073869,0.072200,0.062868,0.054911,0.043022,0.010901,0.019944,0.011899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
credres__credit_req_cnt__org_bnk__last_30_days,-0.044781,0.042059,0.019370,0.135666,0.054911,0.212988,0.206939,0.196184,0.184562,0.180395,...,0.434945,0.571008,0.661063,0.732974,0.873283,1.000000,0.813793,0.162939,0.468698,0.107198
credres__credit_req_cnt__org_bnk__last_60_days,-0.047482,0.082420,0.021928,0.194585,0.043022,0.173926,0.244423,0.284395,0.264335,0.258730,...,0.343214,0.449135,0.523404,0.581142,0.696603,0.813793,1.000000,0.238205,0.616566,0.156267
credres__credit_req_cnt__org_bnk__more_than_90_days,0.061724,0.379365,0.040042,0.475663,0.010901,0.052234,0.074846,0.086743,0.117520,0.164420,...,0.055509,0.076927,0.093639,0.106141,0.126173,0.162939,0.238205,1.000000,0.768297,0.856923
credres__credit_req_cnt__org_bnk__last_360_days,0.012885,0.295388,0.030510,0.378076,0.019944,0.105179,0.149380,0.180100,0.232132,0.276797,...,0.179013,0.237775,0.281889,0.315962,0.385610,0.468698,0.616566,0.768297,1.000000,0.423432


In [631]:
cor_target = abs(cor["target_flag"])

In [632]:
relevant_features = cor_target[cor_target>0.5]
relevant_features

target_flag    1.0
Name: target_flag, dtype: float64

No features highly corellated with the target found

# Splitting train data into train and validation datasets

In [633]:
y = train_data['target_flag']
X = train_data.drop('target_flag', axis=1)

In [634]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [635]:
del X, y, le, train_data, data, relevant_features, cor, cor_target
gc.collect()

572

In [636]:
X_train.head()

Unnamed: 0,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_uah,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_not_uah,crdeal_deallife__loans_cnt__dldonor_bnk,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_7_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_30_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_45_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_60_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_90_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_120_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_180_days,...,credres__credit_req_cnt__org_bnk__last_3_5_days,credres__credit_req_cnt__org_bnk__last_7_days,credres__credit_req_cnt__org_bnk__last_10_days,credres__credit_req_cnt__org_bnk__last_13_days,credres__credit_req_cnt__org_bnk__last_20_days,credres__credit_req_cnt__org_bnk__last_30_days,credres__credit_req_cnt__org_bnk__last_60_days,credres__credit_req_cnt__org_bnk__more_than_90_days,credres__credit_req_cnt__org_bnk__last_360_days,credres__credit_req_cnt__org_bnk__more_than_360_days
30227,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0,1.0,30.0
12425,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,5.0,2.0
4788,11.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,6.0
14439,11.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2.0,4.0
25594,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,4.0,0.0


# Fitting a XGBoost Classifier

In [637]:
xgbc = XGBClassifier(objective='binary:logistic')

In [638]:
params = {
    
    "colsample_bytree": uniform(0.8, 0.3),
    "lambda" : [0, 0.5, 1],
    "alpha" : [0, 0.5, 1],
    "gamma": [0, 0.5, 1],
    "learning_rate": uniform(0.03, 0.3), 
    "max_depth": [1, 2, 3], 
    "n_estimators": [100, 150, 200, 250, 300], 
    "subsample": uniform(0.7, 0.4)
}

clf = RandomizedSearchCV(estimator=xgbc, param_distributions=params, random_state=42, n_iter=50, cv=3, verbose=1, n_jobs=-1, return_train_score=True)

In [639]:
%%time
clf.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.1min finished
CPU times: user 47.2 s, sys: 364 ms, total: 47.5 s
Wall time: 4min 14s


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                        'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fec540de340>,
                                        'gamma': 

In [640]:
clf.best_params_

{'alpha': 1,
 'colsample_bytree': 0.928132336587877,
 'gamma': 0,
 'lambda': 0,
 'learning_rate': 0.1214343774474087,
 'max_depth': 2,
 'n_estimators': 200,
 'subsample': 0.8669644012595116}

In [641]:
clf.best_score_

0.687842244708949

In [642]:
pred = clf.predict(X_test)

In [643]:
del X_train, y_train
gc.collect()

2524

# Measuring performance

In [644]:
fpr, tpr, thresholds = roc_curve(y_test, pred)
print(auc(fpr, tpr))

0.5531098922096819


In [645]:
accuracy_score(y_test, pred)

0.6869790169588962

In [646]:
f1_score(y_test, pred)

0.8013136288998357

In [647]:
del X_test, y_test, pred
gc.collect()

80

# Making predictions on test data

In [648]:
target_flag = clf.predict(test_data)

In [649]:
target_flag

array([1, 1, 1, ..., 1, 1, 1])

In [650]:
data = pd.concat([pd.Series(target_flag, name='target_flag', index=test_data.index), test_data], axis=1)

In [651]:
data.head()

Unnamed: 0,target_flag,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_uah,crdeal_deallife__loans_cnt__dlrolesub_zaemshik__curr_not_uah,crdeal_deallife__loans_cnt__dldonor_bnk,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_7_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_30_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_45_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_60_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_90_days,crdeal_deallife__loans_cnt__dldonor_bnk__opened_lteq_120_days,...,credres__credit_req_cnt__org_bnk__last_3_5_days,credres__credit_req_cnt__org_bnk__last_7_days,credres__credit_req_cnt__org_bnk__last_10_days,credres__credit_req_cnt__org_bnk__last_13_days,credres__credit_req_cnt__org_bnk__last_20_days,credres__credit_req_cnt__org_bnk__last_30_days,credres__credit_req_cnt__org_bnk__last_60_days,credres__credit_req_cnt__org_bnk__more_than_90_days,credres__credit_req_cnt__org_bnk__last_360_days,credres__credit_req_cnt__org_bnk__more_than_360_days
7,1,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9,1,25.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,4.0,11.0,9.0,7.0
14,1,21.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,2.0,2.0,2.0,6.0,7.0,1.0
17,1,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
21,1,9.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0


In [652]:
data.to_csv('xgboost_prediction.xlsx', index=False)