In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold,  GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score



In [3]:
# Load the credit dataset from a CSV file in the working directory; the bare
# `df` on the last line makes the notebook render a preview of the frame.
df = pd.read_csv(filepath_or_buffer='credit_data_processed.csv')
df

Unnamed: 0,person_age,person_emp_length,loan_int_rate,loan_percent_income,cb_person_default_on_file,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_status,has_credit_history
0,21,5.0,11.14,0.10,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1
1,25,1.0,12.87,0.57,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1
2,23,4.0,15.23,0.53,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,1
3,24,8.0,14.27,0.55,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,1
4,21,2.0,7.14,0.25,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30916,52,0.0,11.26,0.08,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,1
30917,57,1.0,13.16,0.11,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1
30918,54,4.0,7.49,0.15,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1
30919,65,3.0,10.99,0.46,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1,1


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(30921, 15)

In [5]:
# Target vector: the `loan_status` column.
y = df['loan_status']

# Feature matrix: every other column except 'Unnamed: 0'. In the preview that
# column simply counts rows (0, 1, 2, ...), so it appears to be a row index that
# was saved into the CSV rather than a real predictor; keeping it would feed
# the row position to every model. `errors='ignore'` keeps this cell working
# if the file is later re-exported without that column.
X = df.drop(columns=['loan_status', 'Unnamed: 0'], errors='ignore')

print(f"Features: {X.shape[1]}, observations: {X.shape[0]}")
print(f"Class balance: {y.value_counts(normalize=True).round(2)}")

Features: 14, observations: 30921
Class balance: loan_status
0    0.78
1    0.22
Name: proportion, dtype: float64


In [6]:
# Hold out 20% of the rows as a test set. Stratifying on the target keeps the
# class proportions of both parts aligned with the full data, and the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

train_balance = y_train.value_counts(normalize=True).round(2)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train class balance: {train_balance}")

Train: (24736, 14), Test: (6185, 14)
Train class balance: loan_status
0    0.78
1    0.22
Name: proportion, dtype: float64


Пронормируем данные и подготовим разбиение обучающей выборки на 5 фолдов для кросс-валидации.

In [7]:
# Standardize the features. The scaler learns its statistics from the training
# split only and the same fitted transform is then applied to the test split.
# Both names are rebound to the scaled arrays returned by `transform`.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 5-fold stratified splitter with its defaults spelled out (no shuffling).
# NOTE(review): the grid search in the KNN section builds its own shuffled
# splitter rather than using `cv` — confirm whether `cv` is still needed.
cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

## KNN

In [8]:
# Exhaustive search over the neighbour count, distance metric and vote
# weighting of a k-NN classifier. Candidates are ranked by balanced accuracy
# on 5 shuffled, stratified folds of the training data; n_jobs=-1 runs the
# fits in parallel on all available cores.
knn = GridSearchCV(
    KNeighborsClassifier(),
    {
        "n_neighbors": np.arange(1, 21),  # k = 1..20
        "metric": ["euclidean", "manhattan"],
        "weights": ["uniform", "distance"],
    },
    scoring="balanced_accuracy",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)

knn.fit(X_train, y_train)

In [9]:
# Report the hyperparameter combination selected by the grid search.
best = knn.best_params_
for label, key in (
    ("Metric:", "metric"),
    ("Num neighbors:", "n_neighbors"),
    ("Weigths:", "weights"),
):
    print(label, best[key])

Metric: manhattan
Num neighbors: 8
Weigths: distance


In [10]:
# Predict the held-out test set with the refitted best estimator from the
# grid search, then report plain and balanced accuracy as percentages.
y_pred = knn.predict(X_test)

acc_pct = np.round(accuracy_score(y_true=y_test, y_pred=y_pred) * 100, 2)
bal_acc_pct = np.round(balanced_accuracy_score(y_true=y_test, y_pred=y_pred) * 100, 2)

print(f'Accuracy score {acc_pct}%')
print(f'Balanced accuracy score {bal_acc_pct}%')

Accuracy score 87.76%
Balanced accuracy score 76.79%


In [11]:
# Precision and recall of the tuned k-NN on the test set, both computed for
# the positive class (label 1), which is scikit-learn's default for binary targets.
precision, recall = (
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
)

print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}')

Precision: 0.83
Recall: 0.57


## Testing different models

In [None]:
# Candidate classifiers to compare, as (display name, estimator) pairs, all
# with library-default hyperparameters.
# The ensemble models are given a fixed seed (the same value as the train/test
# split) so that the comparison table is identical on every
# "Restart & Run All"; without it the random forest and gradient boosting
# results change from run to run.
models = [
    ("Logistic Regression", LogisticRegression()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ("SVM", SVC()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(random_state=42)),
    ("CatBoost", CatBoostClassifier(verbose=0, random_seed=42)),
]

In [None]:
# Column label -> metric function; each one is called as fn(y_true, y_pred).
# The insertion order here fixes the column order of the results table.
metric_fns = {
    "Accuracy": accuracy_score,
    "Bal_accuracy": balanced_accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

# Fit every candidate on the training split and score it on the test split,
# collecting one record (dict) per model.
results = []
for name, model in models:
    model.fit(X_train, y_train)
    model_pred = model.predict(X_test)

    row = {"Model": name}
    row.update({label: fn(y_test, model_pred) for label, fn in metric_fns.items()})
    results.append(row)

In [None]:
# Turn the per-model records into a table and rank it best-first by F1.
comparison_df = pd.DataFrame(results)
comparison_df = comparison_df.sort_values("F1-Score", ascending=False)
print(comparison_df)

                 Model  Accuracy  Bal_accuracy  Precision    Recall  F1-Score
6             CatBoost  0.903800      0.798347   0.945067  0.606911  0.739149
5              XGBoost  0.901213      0.797446   0.925602  0.609071  0.734694
3    Gradient Boosting  0.897332      0.789063   0.922646  0.592513  0.721613
4        Random Forest  0.895392      0.790625   0.900648  0.600432  0.720518
2                  SVM  0.885691      0.768515   0.895592  0.555796  0.685917
1  K-Nearest Neighbors  0.868715      0.758591   0.795897  0.558675  0.656514
0  Logistic Regression  0.842522      0.708715   0.736064  0.465803  0.570547
