In [1]:
import numpy as np
import pandas as pd
import pickle   # сохранение модели

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error as mse, r2_score as r2
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix,  accuracy_score, classification_report

In [2]:
# Load the raw training table; comma is read_csv's default delimiter.
df = pd.read_csv('train3.csv')
# Preview the first rows.
df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [3]:
# Summary statistics of the numeric columns before any cleaning.
# NOTE(review): in the recorded output 'Credit Score' peaks at 7510 and
# 'Current Loan Amount' at 100000000 -- these look like outliers or sentinel
# values; confirm against the data dictionary before modelling.
df.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,1366392.0,0.030133,11.130933,18.317467,945153.7,0.17,34.6926,0.117152,11873180.0,289833.2,18314.454133,1151.087498,0.281733
std,845339.2,0.271604,4.908924,7.041946,16026220.0,0.498598,21.688806,0.347192,31926120.0,317871.4,11926.764673,1604.451418,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1640137.0,0.0,14.0,21.8,793501.5,0.0,50.0,0.0,519882.0,360406.2,23818.0,743.0,1.0
max,10149340.0,7.0,43.0,57.7,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


In [4]:
# Columns used as model inputs.
# An earlier, wider variant of this list additionally contained
# 'Number of Open Accounts', 'Maximum Open Credit', 'Bankruptcies',
# 'Current Loan Amount' and 'Current Credit Balance'.
feature_names = [
    'Annual Income',
    'Tax Liens',
    'Months since last delinquent',
    'Number of Credit Problems',
    'Monthly Debt',
    'Credit Score',
]

# Binary label column to predict.
target_name = 'Credit Default'

In [5]:
# Count the missing values in every column of the raw frame.
missing_num = df.isna().sum()
print(missing_num)

Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64


In [6]:
# Replace missing values in every model feature with that column's median.
# NOTE(review): the medians are computed over all rows of df, including the
# rows that the later train_test_split assigns to the test set.
feature_medians = df[feature_names].median()
df[feature_names] = df[feature_names].fillna(feature_medians)

# Re-count missing values to confirm the selected features are now complete.
missing_num = df.isna().sum()
print(missing_num)

Home Ownership                    0
Annual Income                     0
Years in current job            371
Tax Liens                         0
Number of Open Accounts           0
Years of Credit History           0
Maximum Open Credit               0
Number of Credit Problems         0
Months since last delinquent      0
Bankruptcies                     14
Purpose                           0
Term                              0
Current Loan Amount               0
Current Credit Balance            0
Monthly Debt                      0
Credit Score                      0
Credit Default                    0
dtype: int64


In [7]:
# Summary statistics again, now that the selected features have been
# median-imputed by the preceding cell.
df.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7486.0,7500.0,7500.0,7500.0,7500.0,7500.0
mean,1325286.0,0.030133,11.130933,18.317467,945153.7,0.17,33.227467,0.117152,11873180.0,289833.2,18314.454133,1063.877333,0.281733
std,756755.1,0.271604,4.908924,7.041946,16026220.0,0.498598,14.70395,0.347192,31926120.0,317871.4,11926.764673,1438.335832,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,931133.0,0.0,8.0,13.5,279229.5,0.0,32.0,0.0,180169.0,114256.5,10067.5,718.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1499974.0,0.0,14.0,21.8,793501.5,0.0,32.0,0.0,519882.0,360406.2,23818.0,740.0,1.0
max,10149340.0,7.0,43.0,57.7,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


In [8]:
# Pick every numeric model feature for standardization. The previous dtype
# whitelist (float64 / float16 / int32) silently skipped int64 and float32
# columns, which would have reached the models unscaled.
feature_names_for_stand = df[feature_names].select_dtypes(include='number').columns.tolist()

# Fit the scaler and compute the standardized values in one pass.
# NOTE(review): the scaler is fitted on all rows of df, i.e. before the
# train/test split performed later in the notebook.
scaler = StandardScaler()
stand_features = scaler.fit_transform(df[feature_names_for_stand])

In [9]:
# Write the standardized values back positionally. Wrapping the array in a new
# DataFrame first makes the assignment align on index labels, which yields NaN
# as soon as df does not carry the default 0..n-1 index (e.g. after filtering
# rows); assigning the 2-D array directly has no such dependency.
df[feature_names_for_stand] = stand_features

# Persist the prepared frame (all columns, ';'-separated, without the index).
df.to_csv('prepareddf3.csv', index=False, encoding='utf-8', sep=';')

In [10]:
# Separate model inputs from the label.
X = df[feature_names]
y = df[target_name]

# Stratified, seeded hold-out split: 33% of the rows go to the test set and
# both parts keep the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [11]:
# Show the class shares of the train and test labels (train first).
display(*(labels.value_counts(normalize=True) for labels in (y_train, y_test)))

Credit Default
0    0.718209
1    0.281791
Name: proportion, dtype: float64

Credit Default
0    0.718384
1    0.281616
Name: proportion, dtype: float64

In [12]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print train/test classification reports and the test confusion matrix.

    Parameters
    ----------
    y_train_true, y_train_pred : array-like
        True and predicted labels for the training rows, in the same order.
    y_test_true, y_test_pred : array-like
        True and predicted labels for the test rows, in the same order.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')

    def _axis_name(values, default):
        # Keep the label crosstab would have used: the Series name if there is
        # one, otherwise crosstab's own default.
        name = getattr(values, 'name', None)
        return default if name is None else name

    # Tabulate by position. Passing pandas Series straight to crosstab aligns
    # them on their index, so a prediction Series with a fresh 0..n-1 index
    # paired with a shuffled y_test would be matched on the wrong rows.
    confusion = pd.crosstab(
        index=pd.Series(y_test_true).to_numpy(),
        columns=pd.Series(y_test_pred).to_numpy(),
        rownames=[_axis_name(y_test_true, 'row_0')],
        colnames=[_axis_name(y_test_pred, 'col_0')],
    )
    print(confusion)

In [13]:
# Baseline: logistic regression with the library's default settings.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [14]:
# Hard class predictions of the logistic regression for both partitions.
pred_train, pred_test = (logreg.predict(features) for features in (X_train, X_test))

In [15]:
# Report the logistic-regression quality on train and test.
get_classification_report(
    y_train_true=y_train,
    y_train_pred=pred_train,
    y_test_true=y_test,
    y_test_pred=pred_test,
)

TRAIN

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      3609
           1       0.98      0.19      0.31      1416

    accuracy                           0.77      5025
   macro avg       0.87      0.59      0.59      5025
weighted avg       0.82      0.77      0.71      5025

TEST

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      1778
           1       0.97      0.21      0.34       697

    accuracy                           0.77      2475
   macro avg       0.87      0.60      0.60      2475
weighted avg       0.82      0.77      0.72      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1774    4
1                553  144


In [16]:
# Support-vector classifier with the library's default settings.
vectmach = svm.SVC()
vectmach.fit(X=X_train, y=y_train)

In [17]:
# Class predictions of the support-vector classifier for both partitions.
pred_train, pred_test = (vectmach.predict(features) for features in (X_train, X_test))

In [18]:
# Report the support-vector classifier quality on train and test.
get_classification_report(
    y_train_true=y_train,
    y_train_pred=pred_train,
    y_test_true=y_test,
    y_test_pred=pred_test,
)

TRAIN

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      3609
           1       0.99      0.20      0.33      1416

    accuracy                           0.77      5025
   macro avg       0.87      0.60      0.60      5025
weighted avg       0.82      0.77      0.71      5025

TEST

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      1778
           1       0.99      0.21      0.34       697

    accuracy                           0.78      2475
   macro avg       0.87      0.60      0.60      2475
weighted avg       0.83      0.78      0.72      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1776    2
1                554  143


In [19]:
# Gradient boosting over 100 depth-1 trees with an unshrunk learning rate of 1;
# random_state is pinned so repeated runs build the same ensemble.
GB = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1,
    max_depth=1,
    random_state=0,
)

In [20]:
# Train the gradient-boosting ensemble on the training partition.
GB.fit(X=X_train, y=y_train)

In [21]:
# Class predictions of the gradient-boosting model for both partitions.
pred_train, pred_test = (GB.predict(features) for features in (X_train, X_test))

In [22]:
# Report the gradient-boosting quality on train and test.
get_classification_report(
    y_train_true=y_train,
    y_train_pred=pred_train,
    y_test_true=y_test,
    y_test_pred=pred_test,
)

TRAIN

              precision    recall  f1-score   support

           0       0.77      0.99      0.86      3609
           1       0.90      0.23      0.37      1416

    accuracy                           0.78      5025
   macro avg       0.83      0.61      0.62      5025
weighted avg       0.80      0.78      0.72      5025

TEST

              precision    recall  f1-score   support

           0       0.77      0.99      0.86      1778
           1       0.87      0.23      0.36       697

    accuracy                           0.77      2475
   macro avg       0.82      0.61      0.61      2475
weighted avg       0.79      0.77      0.72      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1754   24
1                538  159


In [23]:
# k-nearest-neighbours classifier voting over the 2 closest training points.
# NOTE(review): with two classes an even k allows 1-1 voting ties; consider an
# odd k or tuning it with cross-validation.
nbrs = KNeighborsClassifier(n_neighbors=2)
nbrs.fit(X=X_train, y=y_train)

In [24]:
# Class predictions of the nearest-neighbours model for both partitions.
pred_train, pred_test = (nbrs.predict(features) for features in (X_train, X_test))

In [26]:
# Report the nearest-neighbours quality on train and test.
get_classification_report(
    y_train_true=y_train,
    y_train_pred=pred_train,
    y_test_true=y_test,
    y_test_pred=pred_test,
)

TRAIN

              precision    recall  f1-score   support

           0       0.81      1.00      0.89      3609
           1       1.00      0.39      0.56      1416

    accuracy                           0.83      5025
   macro avg       0.90      0.70      0.73      5025
weighted avg       0.86      0.83      0.80      5025

TEST

              precision    recall  f1-score   support

           0       0.76      0.94      0.84      1778
           1       0.61      0.26      0.37       697

    accuracy                           0.75      2475
   macro avg       0.69      0.60      0.60      2475
weighted avg       0.72      0.75      0.71      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1663  115
1                515  182
