In [1]:
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

data = pd.read_csv('../dataset/master_3.csv')

In [2]:
# Catboost - Categorical Boosting
# 한계점 : sparse 한 matrix는 처리하지 못 한다, 데이터 대부분이 수치형 변수인 경우, Light GBM 보다 학습 속도가 느리다

In [3]:
data.columns[data.dtypes == "object"]

Index(['intime', 'outtime', 'ed_los'], dtype='object')

In [4]:
# categorical 변수 전처리 해줘야하지만 시계열 데이터여서 삭제 & y label과 같은 의미인 outcome_icu_transfer_12h도 같이 삭제
X = data[data.columns.difference(['outcome_critical','intime','outtime','ed_los','outcome_icu_transfer_12h'])]
y = data['outcome_critical']

In [5]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 80% training and 20% test

In [6]:
# fit 할 때 cat_features 넣어도 결과 동일해 빼고 진행하였음

cat = CatBoostClassifier(learning_rate=0.05, iterations=5000)
cat.fit(X_train, y_train, early_stopping_rounds=100, verbose=500)
result = cat.predict_proba(X_test)

0:	learn: 0.5513734	total: 96.6ms	remaining: 8m 3s
500:	learn: 0.1029323	total: 13.2s	remaining: 1m 58s
1000:	learn: 0.0974042	total: 25.1s	remaining: 1m 40s
1500:	learn: 0.0933469	total: 37s	remaining: 1m 26s
2000:	learn: 0.0898608	total: 48.8s	remaining: 1m 13s
2500:	learn: 0.0868271	total: 1m	remaining: 1m
3000:	learn: 0.0838801	total: 1m 12s	remaining: 48.5s
3500:	learn: 0.0811970	total: 1m 24s	remaining: 36.4s
4000:	learn: 0.0785155	total: 1m 37s	remaining: 24.3s
4500:	learn: 0.0759375	total: 1m 49s	remaining: 12.1s
4999:	learn: 0.0734889	total: 2m 3s	remaining: 0us


In [7]:
print(roc_auc_score(y_test, cat.predict_proba(X_test)[:, 1]))

0.9616555668290753
