In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import catboost as cat

In [2]:
df = pd.read_csv('../Data/german_credit_data_with_risk.csv', index_col=0)

In [3]:
interval = (18, 25, 35, 60, 120)
categories = ['Young', 'Adult', 'Middle', 'Senior']
df["Age category"] = pd.cut(df['Age'], interval, labels=categories)

In [4]:
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age category
0,67,male,2,own,,little,1169,6,radio/TV,good,Senior
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,Young
2,49,male,1,own,little,,2096,12,education,good,Middle
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,Middle
4,53,male,2,free,little,little,4870,24,car,bad,Middle
...,...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,good,Adult
996,40,male,3,own,little,little,3857,30,car,good,Middle
997,38,male,2,own,little,,804,12,radio/TV,good,Middle
998,23,male,2,free,little,little,1845,45,radio/TV,bad,Young


In [5]:
df.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose', 'Risk', 'Age category'],
      dtype='object')

In [6]:
# заполняем пропуски 
df['Saving accounts'] = df['Saving accounts'].fillna('no_inf')
df['Checking account'] = df['Checking account'].fillna('no_inf')

In [7]:
from sklearn.model_selection import train_test_split
X = df.drop(columns='Risk')
y = df['Risk']
y = y.map({'good': 1,
           'bad': 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

In [8]:
from catboost import CatBoostClassifier

cat_features = [1,2,3,4,5,8,9]

model = CatBoostClassifier(iterations=20)

model.fit(X_train, y_train, cat_features)
predictions = model.predict(X_test)

Learning rate set to 0.329261
0:	learn: 0.6388743	total: 77ms	remaining: 1.46s
1:	learn: 0.5971971	total: 78.4ms	remaining: 706ms
2:	learn: 0.5727266	total: 81.6ms	remaining: 463ms
3:	learn: 0.5515225	total: 83.6ms	remaining: 334ms
4:	learn: 0.5374459	total: 84.9ms	remaining: 255ms
5:	learn: 0.5267174	total: 85.8ms	remaining: 200ms
6:	learn: 0.5225096	total: 87.3ms	remaining: 162ms
7:	learn: 0.5165622	total: 87.7ms	remaining: 132ms
8:	learn: 0.5104962	total: 88.7ms	remaining: 108ms
9:	learn: 0.5076181	total: 89.2ms	remaining: 89.2ms
10:	learn: 0.4997931	total: 90.5ms	remaining: 74.1ms
11:	learn: 0.4954585	total: 91.2ms	remaining: 60.8ms
12:	learn: 0.4953886	total: 91.5ms	remaining: 49.3ms
13:	learn: 0.4890037	total: 92.3ms	remaining: 39.5ms
14:	learn: 0.4824146	total: 93.2ms	remaining: 31.1ms
15:	learn: 0.4802915	total: 93.9ms	remaining: 23.5ms
16:	learn: 0.4731960	total: 94.7ms	remaining: 16.7ms
17:	learn: 0.4715933	total: 95.2ms	remaining: 10.6ms
18:	learn: 0.4683002	total: 95.9ms	re

In [9]:
y_pred = predictions
y_pred

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0])

In [10]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score

# Расчет различных метрик производительности
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
rau = roc_auc_score(y_test, y_pred)

# Вывод результатов
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Roc-auc:", rau)

Accuracy: 0.74
Recall: 0.8820224719101124
Precision: 0.7810945273631841
F1 Score: 0.8284960422163589
Roc-auc: 0.6354556803995006
