In [1]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool

# Read the training table, discard the four columns that are excluded from the
# analysis, and store the `credit` target as a 64-bit integer.
train = (
    pd.read_csv('train.csv')
    .drop(columns=['index', 'FLAG_MOBIL', 'occyp_type', 'child_num'])
    .astype({'credit': 'int64'})
)

In [3]:
def days_to_year(x):
    """Turn a negative day offset into a positive number of years.

    A negative ``x`` is negated and divided by 365. Every other value
    (zero, positive, or NaN, which fails the ``< 0`` comparison) yields 0.
    """
    return -x / 365 if x < 0 else 0

def minus_to_plus(x):
    """Return the magnitude of a negative value, and 0 for anything else.

    Zero, positive values, and NaN (which fails the ``< 0`` comparison)
    all map to 0.
    """
    if not x < 0:
        return 0
    return -x

In [4]:
# Rewrite the negative offset columns as positive magnitudes: the two DAYS_*
# columns become years, begin_month becomes a positive month count.
# NOTE: this cell is not idempotent. Both helpers return 0 for non-negative
# input, so running it a second time on the converted frame zeroes the columns.
for col_name, convert in (
    ('DAYS_BIRTH', days_to_year),
    ('DAYS_EMPLOYED', days_to_year),
    ('begin_month', minus_to_plus),
):
    train[col_name] = train[col_name].apply(convert)

In [5]:
# Preview the first rows of the prepared frame to check the converted columns.
train.head()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,credit
0,F,N,N,202500.0,Commercial associate,Higher education,Married,Municipal apartment,38.079452,12.90137,0,0,0,2.0,6.0,1
1,F,N,Y,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,31.178082,4.219178,0,0,1,3.0,5.0,1
2,M,Y,Y,450000.0,Working,Higher education,Married,House / apartment,52.293151,12.147945,0,1,0,2.0,22.0,2
3,F,N,Y,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,41.336986,5.731507,0,1,0,2.0,37.0,0
4,F,Y,Y,157500.0,State servant,Higher education,Married,House / apartment,41.19726,5.767123,0,0,0,2.0,26.0,2


In [6]:
# Separate the feature matrix from the `credit` target and hold out 20% of the
# rows for evaluation; the fixed random_state keeps the split repeatable.
X = train.drop('credit', axis=1)
y = train['credit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Categorical features are listed by column name rather than by position
# (previously [0, 1, 2, 4, 5, 6, 7, 10, 11, 12]). Names keep pointing at the
# same features even if columns are added, dropped, or reordered upstream.
cat_features = ['gender', 'car', 'reality', 'income_type', 'edu_type',
                'family_type', 'house_type', 'work_phone', 'phone', 'email']
catboost_pool = Pool(X_train, y_train, cat_features=cat_features)

# NOTE(review): learning_rate=1 is unusually high for gradient boosting —
# confirm it is intentional.
model = CatBoostClassifier(iterations=100,
                           depth=2,
                           learning_rate=1,
                           loss_function='MultiClass',
                           verbose=False)

model.fit(catboost_pool)

# Hard class labels and per-class probabilities for the held-out rows.
preds_class = model.predict(X_test)
preds_proba = model.predict_proba(X_test)
print("class = ", preds_class)
print("proba = ", preds_proba)

class =  [[2]
 [2]
 [2]
 ...
 [2]
 [2]
 [2]]
proba =  [[0.07144239 0.20579499 0.72276262]
 [0.05148669 0.40023899 0.54827432]
 [0.09743162 0.15058972 0.75197865]
 ...
 [0.08786064 0.18099675 0.73114262]
 [0.22198293 0.29910971 0.47890737]
 [0.07954042 0.13727454 0.78318505]]


In [7]:
from sklearn.metrics import accuracy_score, classification_report

# `model` was already trained on `catboost_pool` in the previous cell. Fitting it
# again with the same pool and parameters only repeats that training, so the
# redundant fit() call is dropped and the existing model is evaluated as-is.
preds_class = model.predict(X_test)

# Overall accuracy on the held-out rows.
accuracy = accuracy_score(y_test, preds_class)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1; zero_division=1 reports 1 (instead of
# warning) for a metric whose denominator is zero.
print("Classification Report:")
print(classification_report(y_test, preds_class, zero_division=1))

# `preds_proba` is the probability matrix computed in the previous cell.
print("Predicted Probabilities:")
print(preds_proba)

Accuracy: 0.6891534391534392
Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.00      0.01       650
           1       0.71      0.23      0.35      1262
           2       0.69      0.99      0.81      3380

    accuracy                           0.69      5292
   macro avg       0.56      0.41      0.39      5292
weighted avg       0.64      0.69      0.60      5292

Predicted Probabilities:
[[0.07144239 0.20579499 0.72276262]
 [0.05148669 0.40023899 0.54827432]
 [0.09743162 0.15058972 0.75197865]
 ...
 [0.08786064 0.18099675 0.73114262]
 [0.22198293 0.29910971 0.47890737]
 [0.07954042 0.13727454 0.78318505]]


In [8]:
# Rank the features by the importance scores CatBoost reports for the fitted
# model on the training pool, highest first.
feature_names = X_train.columns
feature_importance = model.get_feature_importance(catboost_pool, type='FeatureImportance')

feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values(by='importance', ascending=False)
)
print(feature_importance_df)

          feature  importance
14    begin_month   59.191822
8      DAYS_BIRTH   16.520341
9   DAYS_EMPLOYED    9.274848
3    income_total    6.718284
4     income_type    1.599935
13    family_size    1.442218
7      house_type    1.157816
10     work_phone    0.790449
12          email    0.772950
0          gender    0.656191
5        edu_type    0.606959
2         reality    0.529415
11          phone    0.369738
6     family_type    0.217144
1             car    0.151888


In [15]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report, log_loss

# Prepare the data: split the features from the `credit` target and hold out
# 20% of the rows (shuffled, fixed seed) for evaluation.
X = train.drop('credit', axis=1)
y = train['credit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Build the CatBoost Pool. Categorical features are listed by column name
# rather than by position (previously [0, 1, 2, 4, 5, 6, 7, 10, 11, 12]) so the
# selection cannot silently shift if the column layout changes upstream.
cat_features = ['gender', 'car', 'reality', 'income_type', 'edu_type',
                'family_type', 'house_type', 'work_phone', 'phone', 'email']
catboost_pool = Pool(X_train, y_train, cat_features=cat_features)

# Create and train the model.
# NOTE(review): learning_rate=1 is unusually high for gradient boosting —
# confirm it is intentional.
model = CatBoostClassifier(iterations=100,
                           depth=2,
                           learning_rate=1,
                           loss_function='MultiClass',
                           verbose=False)
model.fit(catboost_pool)

# Predict hard labels and per-class probabilities for the held-out rows.
preds_class = model.predict(X_test)
preds_proba = model.predict_proba(X_test)

# Evaluate. The label set is passed explicitly to log_loss so the probability
# columns are matched against the classes the model was trained on, instead of
# being inferred from whichever classes happen to appear in y_test.
accuracy = accuracy_score(y_test, preds_class)
logloss = log_loss(y_test, preds_proba, labels=model.classes_)

print("Accuracy:", accuracy)
print("Log Loss:", logloss)

print("Classification Report:")
print(classification_report(y_test, preds_class, zero_division=1))

print("Predicted Classes:")
print(preds_class)

print("Predicted Probabilities:")
print(preds_proba)

Accuracy: 0.6891534391534392
Log Loss: 0.807988372407299
Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.00      0.01       650
           1       0.71      0.23      0.35      1262
           2       0.69      0.99      0.81      3380

    accuracy                           0.69      5292
   macro avg       0.56      0.41      0.39      5292
weighted avg       0.64      0.69      0.60      5292

Predicted Classes:
[[2]
 [2]
 [2]
 ...
 [2]
 [2]
 [2]]
Predicted Probabilities:
[[0.07144239 0.20579499 0.72276262]
 [0.05148669 0.40023899 0.54827432]
 [0.09743162 0.15058972 0.75197865]
 ...
 [0.08786064 0.18099675 0.73114262]
 [0.22198293 0.29910971 0.47890737]
 [0.07954042 0.13727454 0.78318505]]
