# LightGBM Classifier Baseline (Model Comparison) v3

- 모델 종류 체급 비교 목적

평가 (Test set):
- PR-AUC
- Recall (Churn)
- Accuracy
- Confusion Matrix
- Classification Report
- Feature Importance


In [12]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None)


In [13]:
df = pd.read_parquet('../data/processed/kkbox_train_feature_v3.parquet')


In [14]:
RANDOM_STATE = 719

ID_COL = "msno"
TARGET_COL = "is_churn"

CATEGORICAL_COLS = [
    "city", "gender", "registered_via", "last_payment_method",
    "has_ever_paid", "has_ever_cancelled", "is_auto_renew_last", "is_free_user",
]

NUMERICAL_COLS = [
    "reg_days",
    "num_days_active_w7", "total_secs_w7", "avg_secs_per_day_w7", "std_secs_w7",
    "num_songs_w7", "avg_songs_per_day_w7", "num_unq_w7", "num_25_w7", "num_100_w7",
    "short_play_w7", "skip_ratio_w7", "completion_ratio_w7", "short_play_ratio_w7", "variety_ratio_w7",
    "num_days_active_w14", "total_secs_w14", "avg_secs_per_day_w14", "std_secs_w14",
    "num_songs_w14", "avg_songs_per_day_w14", "num_unq_w14", "num_25_w14", "num_100_w14",
    "short_play_w14", "skip_ratio_w14", "completion_ratio_w14", "short_play_ratio_w14", "variety_ratio_w14",
    "num_days_active_w21", "total_secs_w21", "avg_secs_per_day_w21", "std_secs_w21",
    "num_songs_w21", "avg_songs_per_day_w21", "num_unq_w21", "num_25_w21", "num_100_w21",
    "short_play_w21", "skip_ratio_w21", "completion_ratio_w21", "short_play_ratio_w21", "variety_ratio_w21",
    "num_days_active_w30", "total_secs_w30", "avg_secs_per_day_w30", "std_secs_w30",
    "num_songs_w30", "avg_songs_per_day_w30", "num_unq_w30", "num_25_w30", "num_100_w30",
    "short_play_w30", "skip_ratio_w30", "completion_ratio_w30", "short_play_ratio_w30", "variety_ratio_w30",
    "secs_trend_w7_w30", "secs_trend_w14_w30", "days_trend_w7_w14", "days_trend_w7_w30",
    "songs_trend_w7_w30", "songs_trend_w14_w30", "skip_trend_w7_w30", "completion_trend_w7_w30",
    "days_since_last_payment", "days_since_last_cancel", "last_plan_days",
    "total_payment_count", "total_amount_paid", "avg_amount_per_payment",
    "unique_plan_count", "subscription_months_est", "payment_count_last_30d", "payment_count_last_90d",
]

FEATURE_COLS = CATEGORICAL_COLS + NUMERICAL_COLS

X = df[FEATURE_COLS].copy()
y = df[TARGET_COL].astype(int).copy()


## 1. Train / Test Split


In [15]:
from sklearn.model_selection import train_test_split

X = df[FEATURE_COLS]
y = df[TARGET_COL].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

print(X_train.shape, X_test.shape)


(688772, 83) (172194, 83)


## 2. Column Groups


In [16]:
cat_cols = [c for c in CATEGORICAL_COLS if c in X_train.columns]
num_cols = [c for c in NUMERICAL_COLS if c in X_train.columns]

print(f"num_cols: {len(num_cols)}")
print(f"cat_cols: {len(cat_cols)}")


num_cols: 75
cat_cols: 8


## 3. Preprocessing (LightGBM Native Categorical)


In [17]:
cat_feature_indices = [X_train.columns.get_loc(col) for col in cat_cols]

X_train_lgb = X_train.copy()
X_test_lgb = X_test.copy()

for col in cat_cols:
    X_train_lgb[col] = X_train_lgb[col].astype("category")
    X_test_lgb[col] = X_test_lgb[col].astype("category")


## 4. LightGBM Classifier Model


In [18]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(
    n_estimators=3000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary",
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)


## 5. Train Model


In [19]:
lgb_model.fit(
    X_train_lgb,
    y_train,
    eval_set=[(X_test_lgb, y_test)],
    categorical_feature=cat_cols,
    callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)],
)


[LightGBM] [Info] Number of positive: 65158, number of negative: 623614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16260
[LightGBM] [Info] Number of data points in the train set: 688772, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.03
,n_estimators,3000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


## 6. Test Evaluation


In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    average_precision_score,
    recall_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
)

y_proba = lgb_model.predict_proba(X_test_lgb)[:, 1]
y_pred  = lgb_model.predict(X_test_lgb)

print(f"PR-AUC: {average_precision_score(y_test, y_proba):.4f}")
print(f"Recall (Churn): {recall_score(y_test, y_pred):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


PR-AUC: 0.9339
Recall (Churn): 0.9019
Accuracy: 0.9710


## 7. Confusion Matrix


In [21]:
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Pred 0", "Pred 1"])


Unnamed: 0,Pred 0,Pred 1
Actual 0,152500,3404
Actual 1,1598,14692


## 8. Classification Report


In [22]:
print(classification_report(y_test, y_pred, digits=4))


              precision    recall  f1-score   support

           0     0.9896    0.9782    0.9839    155904
           1     0.8119    0.9019    0.8545     16290

    accuracy                         0.9710    172194
   macro avg     0.9008    0.9400    0.9192    172194
weighted avg     0.9728    0.9710    0.9716    172194



## 9. Feature Importance (Gain)


In [23]:
feature_names = X_train_lgb.columns
importances = lgb_model.feature_importances_

imp_lgb_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances,
}).sort_values("importance", ascending=False)

imp_lgb_df


Unnamed: 0,feature,importance
8,reg_days,8650
73,days_since_last_payment,7392
72,completion_trend_w7_w30,4533
0,city,4299
64,variety_ratio_w30,4277
71,skip_trend_w7_w30,4230
74,days_since_last_cancel,3861
22,variety_ratio_w7,3656
66,secs_trend_w14_w30,3613
50,variety_ratio_w21,3481


In [24]:
imp_lgb_df.to_csv(
    "../data/model_df/lgb_feature_importance.csv",
    index=False
)
