# CatBoost Classifier Baseline (Model Comparison) v3

- 모델 종류 체급 비교 목적

평가 (Test set):
- PR-AUC
- Recall (Churn)
- Accuracy
- Confusion Matrix
- Classification Report
- Feature Importance

In [9]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None)


In [10]:
df = pd.read_parquet('../data/processed/kkbox_train_feature_v3.parquet')


In [11]:
RANDOM_STATE = 719

ID_COL = "msno"
TARGET_COL = "is_churn"

CATEGORICAL_COLS = [
    "city", "gender", "registered_via", "last_payment_method",
    "has_ever_paid", "has_ever_cancelled", "is_auto_renew_last", "is_free_user",
]

NUMERICAL_COLS = [
    "reg_days",
    "num_days_active_w7", "total_secs_w7", "avg_secs_per_day_w7", "std_secs_w7",
    "num_songs_w7", "avg_songs_per_day_w7", "num_unq_w7", "num_25_w7", "num_100_w7",
    "short_play_w7", "skip_ratio_w7", "completion_ratio_w7", "short_play_ratio_w7", "variety_ratio_w7",
    "num_days_active_w14", "total_secs_w14", "avg_secs_per_day_w14", "std_secs_w14",
    "num_songs_w14", "avg_songs_per_day_w14", "num_unq_w14", "num_25_w14", "num_100_w14",
    "short_play_w14", "skip_ratio_w14", "completion_ratio_w14", "short_play_ratio_w14", "variety_ratio_w14",
    "num_days_active_w21", "total_secs_w21", "avg_secs_per_day_w21", "std_secs_w21",
    "num_songs_w21", "avg_songs_per_day_w21", "num_unq_w21", "num_25_w21", "num_100_w21",
    "short_play_w21", "skip_ratio_w21", "completion_ratio_w21", "short_play_ratio_w21", "variety_ratio_w21",
    "num_days_active_w30", "total_secs_w30", "avg_secs_per_day_w30", "std_secs_w30",
    "num_songs_w30", "avg_songs_per_day_w30", "num_unq_w30", "num_25_w30", "num_100_w30",
    "short_play_w30", "skip_ratio_w30", "completion_ratio_w30", "short_play_ratio_w30", "variety_ratio_w30",
    "secs_trend_w7_w30", "secs_trend_w14_w30", "days_trend_w7_w14", "days_trend_w7_w30",
    "songs_trend_w7_w30", "songs_trend_w14_w30", "skip_trend_w7_w30", "completion_trend_w7_w30",
    "days_since_last_payment", "days_since_last_cancel", "last_plan_days",
    "total_payment_count", "total_amount_paid", "avg_amount_per_payment",
    "unique_plan_count", "subscription_months_est", "payment_count_last_30d", "payment_count_last_90d",
]

FEATURE_COLS = CATEGORICAL_COLS + NUMERICAL_COLS

X = df[FEATURE_COLS].copy()
y = df[TARGET_COL].astype(int).copy()


## 1. Train / Test Split


In [12]:
from sklearn.model_selection import train_test_split

X = df[FEATURE_COLS]
y = df[TARGET_COL].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

print(X_train.shape, X_test.shape)


(688772, 83) (172194, 83)


## 2. Column Groups


In [13]:
cat_cols = [c for c in CATEGORICAL_COLS if c in X_train.columns]
num_cols = [c for c in NUMERICAL_COLS if c in X_train.columns]

print(f"num_cols: {len(num_cols)}")
print(f"cat_cols: {len(cat_cols)}")


num_cols: 75
cat_cols: 8


## 3. Preprocessing (CatBoost Native Categorical)


In [14]:
cat_feature_indices = [X_train.columns.get_loc(col) for col in cat_cols]

X_train_cb = X_train.copy()
X_test_cb = X_test.copy()

for col in cat_cols:
    X_train_cb[col] = X_train_cb[col].astype(str).astype("category")
    X_test_cb[col] = X_test_cb[col].astype(str).astype("category")


## 4. CatBoost Classifier Model


In [15]:
from catboost import CatBoostClassifier

cb_model = CatBoostClassifier(
    iterations=4000,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    eval_metric="PRAUC",
    auto_class_weights="Balanced",
    random_seed=RANDOM_STATE,
    verbose=False,
    early_stopping_rounds=200,
)


## 5. Train Model


In [16]:
cb_model.fit(
    X_train_cb,
    y_train,
    eval_set=(X_test_cb, y_test),
    cat_features=cat_feature_indices,
    verbose=False,
)


<catboost.core.CatBoostClassifier at 0x373e05760>

## 6. Test Evaluation


In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    average_precision_score,
    recall_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
)

y_proba = cb_model.predict_proba(X_test_cb)[:, 1]
y_pred  = cb_model.predict(X_test_cb)

print(f"PR-AUC: {average_precision_score(y_test, y_proba):.4f}")
print(f"Recall (Churn): {recall_score(y_test, y_pred):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


PR-AUC: 0.9339
Recall (Churn): 0.9347
Accuracy: 0.9580


## 7. Confusion Matrix


In [18]:
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Pred 0", "Pred 1"])


Unnamed: 0,Pred 0,Pred 1
Actual 0,149743,6161
Actual 1,1063,15227


## 8. Classification Report


In [19]:
print(classification_report(y_test, y_pred, digits=4))


              precision    recall  f1-score   support

           0     0.9930    0.9605    0.9764    155904
           1     0.7119    0.9347    0.8083     16290

    accuracy                         0.9580    172194
   macro avg     0.8524    0.9476    0.8924    172194
weighted avg     0.9664    0.9580    0.9605    172194



## 9. Feature Importance (Importance Type)


In [20]:
from catboost import Pool

feature_names = X_train_cb.columns
pool = Pool(X_train_cb, label=y_train, cat_features=cat_feature_indices)
importances = cb_model.get_feature_importance(pool)

imp_cb_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances,
}).sort_values("importance", ascending=False)

imp_cb_df


Unnamed: 0,feature,importance
6,is_auto_renew_last,17.156077
73,days_since_last_payment,16.043813
80,subscription_months_est,9.947155
82,payment_count_last_90d,9.735191
74,days_since_last_cancel,9.377481
77,total_amount_paid,9.015122
5,has_ever_cancelled,8.890531
81,payment_count_last_30d,3.843197
8,reg_days,2.680458
2,registered_via,2.461273


In [21]:
imp_cb_df.to_csv(
    "../data/model_df/catboost_feature_importance.csv",
    index=False
)
