# imports + seed

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report,
)

RANDOM_STATE = 719
np.random.seed(RANDOM_STATE)

pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_parquet('kkbox_train_feature_v1.parquet')

In [4]:
df.head(5)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,bd_clean,registration_month,is_churn,num_days_active_w7,total_secs_w7,avg_secs_per_day_w7,std_secs_w7,num_songs_w7,avg_songs_per_day_w7,num_unq_w7,num_25_w7,num_100_w7,short_play_w7,skip_ratio_w7,completion_ratio_w7,short_play_ratio_w7,variety_ratio_w7,num_days_active_w14,total_secs_w14,avg_secs_per_day_w14,std_secs_w14,num_songs_w14,avg_songs_per_day_w14,num_unq_w14,num_25_w14,num_100_w14,short_play_w14,skip_ratio_w14,completion_ratio_w14,short_play_ratio_w14,variety_ratio_w14,num_days_active_w21,total_secs_w21,avg_secs_per_day_w21,std_secs_w21,num_songs_w21,avg_songs_per_day_w21,num_unq_w21,num_25_w21,num_100_w21,short_play_w21,skip_ratio_w21,completion_ratio_w21,short_play_ratio_w21,variety_ratio_w21,num_days_active_w30,total_secs_w30,avg_secs_per_day_w30,std_secs_w30,num_songs_w30,avg_songs_per_day_w30,num_unq_w30,num_25_w30,num_100_w30,short_play_w30,skip_ratio_w30,completion_ratio_w30,short_play_ratio_w30,variety_ratio_w30,secs_trend_w7_w30,secs_trend_w14_w30,days_trend_w7_w14,days_trend_w7_w30,songs_trend_w7_w30,songs_trend_w14_w30,skip_trend_w7_w30,completion_trend_w7_w30,recency_secs_ratio,recency_songs_ratio,days_since_last_payment,has_ever_paid,days_since_last_cancel,has_ever_cancelled,is_auto_renew_last,last_plan_days,last_payment_method,is_free_user,total_payment_count,total_amount_paid,avg_amount_per_payment,unique_plan_count,subscription_months_est,payment_count_last_30d,payment_count_last_90d
0,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,unknown,7,2011-09-14,,2011-09,0,7,75448.625,10778.375,9128.514648,338,48.285713,159,39,271,54,0.115385,0.801775,0.159763,0.470414,14,177639.296875,12688.521484,10458.754883,842,60.142857,480,127,641,157,0.150831,0.761283,0.186461,0.570071,20,238367.421875,11918.371094,9021.441406,1156,57.799999,663,170,863,220,0.147059,0.74654,0.190311,0.573529,30,358554.0,11951.799805,7876.637695,1776,59.200001,1040,277,1296,355,0.155968,0.72973,0.199887,0.585586,0.210425,0.495432,0.5,0.233333,0.190315,0.474099,-0.040584,0.072045,0.210425,0.190315,5,1,999,0,1,30,41,0,1,129,129.0,1,1.0,1,1
1,yLkV2gbZ4GLFwqTOXLVHz0VGrMYcgBGgKZ3kj9RiYu8=,4,30,male,9,2011-09-16,30.0,2011-09,0,6,123668.695312,20611.449219,9505.349609,557,92.833336,67,14,518,22,0.025135,0.929982,0.039497,0.120287,6,123668.695312,20611.449219,9505.349609,557,92.833336,67,14,518,22,0.025135,0.929982,0.039497,0.120287,6,123668.695312,20611.449219,9505.349609,557,92.833336,67,14,518,22,0.025135,0.929982,0.039497,0.120287,6,123668.695312,20611.449219,9505.349609,557,92.833336,67,14,518,22,0.025135,0.929982,0.039497,0.120287,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1,1,999,0,1,30,39,0,2,298,149.0,1,2.0,1,2
2,I0yFvqMoNkM8ZNHb617e1RBzIS/YRKemHO7Wj13EtA0=,13,63,male,9,2011-09-18,63.0,2011-09,0,3,16989.527344,5663.175781,1434.409424,70,23.333334,65,2,65,3,0.028571,0.928571,0.042857,0.928571,10,50269.140625,5026.914062,3101.173584,249,24.9,182,43,195,47,0.172691,0.783133,0.188755,0.730924,15,63667.992188,4244.532715,2992.634277,352,23.466667,273,77,206,107,0.21875,0.585227,0.303977,0.775568,18,80453.320312,4469.628906,2823.026123,416,23.111111,337,77,269,108,0.185096,0.646635,0.259615,0.810096,0.211172,0.624824,0.3,0.166667,0.168269,0.598558,-0.156525,0.281937,0.211172,0.168269,5,1,999,0,1,30,40,0,1,149,149.0,1,1.0,1,1
3,OoDwiKZM+ZGr9P3fRivavgOtglTEaNfWJO4KaJcTTts=,1,0,unknown,7,2011-09-18,,2011-09,1,1,6168.049805,6168.049805,0.0,23,23.0,23,0,22,0,0.0,0.956522,0.0,1.0,2,8142.378906,4071.189453,2965.408447,35,17.5,34,2,30,4,0.057143,0.857143,0.114286,0.971429,2,8142.378906,4071.189453,2965.408447,35,17.5,34,2,30,4,0.057143,0.857143,0.114286,0.971429,3,8613.391602,2871.130615,2952.498535,38,12.666667,37,3,31,5,0.078947,0.815789,0.131579,0.973684,0.7161,0.945316,0.5,0.333333,0.605263,0.921053,-0.078947,0.140732,0.7161,0.605263,6,1,999,0,1,30,41,0,1,149,149.0,1,1.0,1,1
4,4De1jAxNRABoyRBDZ82U0yEmzYkqeOugRGVNIf92Xb8=,4,28,female,9,2011-09-20,28.0,2011-09,0,2,5703.128906,2851.564453,2644.321289,29,14.5,24,5,24,5,0.172414,0.827586,0.172414,0.827586,5,15160.677734,3032.135498,1988.283691,90,18.0,42,6,82,6,0.066667,0.911111,0.066667,0.466667,8,19365.294922,2420.661865,1723.983154,118,14.75,55,9,105,10,0.076271,0.889831,0.084746,0.466102,10,22494.763672,2249.476318,1725.134766,134,13.4,68,12,117,13,0.089552,0.873134,0.097015,0.507463,0.253531,0.673965,0.4,0.2,0.216418,0.671642,0.082862,-0.045548,0.253531,0.216418,29,1,999,0,1,30,36,0,1,180,180.0,1,1.0,1,1


In [5]:
# 1) bd vs bd_clean 중복 제거
if "bd" in df.columns and "bd_clean" in df.columns:
    df = df.drop(columns=["bd"])

# 2) time / month 숫자화
df["reg_year"]  = df["registration_init_time"].dt.year.astype("Int64")

# 원본 제거 (Period/Datetime 에러 방지)
df = df.drop(columns=["registration_init_time", "registration_month", "recency_secs_ratio", "recency_songs_ratio"])

# train/valid/test split (stratify 유지)

In [6]:
assert "msno" in df.columns and "is_churn" in df.columns

trainval_df, test_df = train_test_split(
    df,
    test_size=0.15,
    random_state=RANDOM_STATE,
    stratify=df["is_churn"],
)

valid_size = 0.15 / 0.85
train_df, valid_df = train_test_split(
    trainval_df,
    test_size=valid_size,
    random_state=RANDOM_STATE,
    stratify=trainval_df["is_churn"],
)

feature_cols = [c for c in df.columns if c not in ["msno", "is_churn"]]

X_train, y_train = train_df[feature_cols], train_df["is_churn"].astype(int)
X_valid, y_valid = valid_df[feature_cols], valid_df["is_churn"].astype(int)
X_test,  y_test  = test_df[feature_cols],  test_df["is_churn"].astype(int)

print("churn rate:", y_train.mean(), y_valid.mean(), y_test.mean())
print(X_train.shape, X_valid.shape, X_test.shape)


churn rate: 0.09460141104009451 0.09459909404158116 0.09459909404158116
(602676, 84) (129145, 84) (129145, 84)


#### 이후 LightGBM 학습 시작

In [7]:
# =========================================
# LightGBM 학습 + 평가 + 중요도 (Top20) 파트
# =========================================

# 0) 필요한 추가 라이브러리 import
# - lightgbm이 설치되어 있지 않으면:  !pip install lightgbm  후 재실행
try:
    import lightgbm as lgb
except ImportError as e:
    raise ImportError(
        "lightgbm이 설치되어 있지 않습니다.\n"
        "노트북에서 아래를 한 번 실행 후 다시 시도하세요:\n"
        "!pip install lightgbm"
    ) from e

from sklearn.inspection import permutation_importance
from scipy import sparse

# 1) 컬럼 타입(숫자/범주) 자동 구분
# - object/category/bool 계열은 범주형으로 보고 One-Hot Encoding
# - 나머지는 숫자형으로 보고 결측치만 채움
cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
num_cols = [c for c in X_train.columns if c not in cat_cols]

print(f"[컬럼 개수] total={X_train.shape[1]}, num={len(num_cols)}, cat={len(cat_cols)}")

# 2) (중요) pandas의 nullable 정수(Int64 등)는 scikit-learn에서 간혹 불안정할 수 있어
#    숫자 컬럼을 안전하게 numeric으로 변환해줍니다(문자 섞였으면 NaN으로 -> imputer가 처리).
def _safe_numeric_cast(df, cols):
    df = df.copy()
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

X_train2 = _safe_numeric_cast(X_train, num_cols)
X_valid2 = _safe_numeric_cast(X_valid, num_cols)
X_test2  = _safe_numeric_cast(X_test,  num_cols)

# 3) 전처리기(preprocessor) 구성
# - 숫자: 결측치 -> 중앙값(median)으로 채움
# - 범주: 결측치 -> 가장 흔한 값(most_frequent)으로 채움 + 원-핫 인코딩
num_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# sklearn 버전에 따라 OneHotEncoder 옵션명이 다를 수 있어 try/except로 안전 처리
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

cat_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_tf, num_cols),
        ("cat", cat_tf, cat_cols),
    ],
    remainder="drop",
)

# 4) 전처리 적용(학습 데이터로 fit, 나머지는 transform만!)
X_tr = preprocess.fit_transform(X_train2)
X_va = preprocess.transform(X_valid2)
X_te = preprocess.transform(X_test2)

print("[변환 후 shape]", X_tr.shape, X_va.shape, X_te.shape)
print("[희소행렬 여부]", sparse.issparse(X_tr))

# 5) 클래스 불균형 보정용 scale_pos_weight 계산(선택이지만 churn에 보통 도움 됨)
pos = int(y_train.sum())
neg = int((1 - y_train).sum())
scale_pos_weight = (neg / pos) if pos > 0 else 1.0
print(f"[클래스 비율] pos={pos}, neg={neg}, scale_pos_weight={scale_pos_weight:.3f}")

# 6) LightGBM 하이퍼파라미터(기본 실전형 세팅)
lgb_params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=0.05,
    n_estimators=5000,          # 크게 주고 early stopping으로 자동으로 최적 지점에서 멈춤
    num_leaves=63,
    max_depth=-1,               # 제한 없음(필요하면 6~12 등으로 제한 가능)
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=0.0,
    min_child_samples=30,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,  # 불균형 보정
)

lgbm = lgb.LGBMClassifier(**lgb_params)

# 7) 학습 (Valid를 보면서 early stopping)
# - AUC, AUC-PR(=aucpr) 같이 확인
# - LightGBM 버전에 따라 early stopping 인자 사용법이 달라서 콜백 방식으로 안전하게 처리
callbacks = [
    lgb.early_stopping(stopping_rounds=200, verbose=True),
    lgb.log_evaluation(period=200),
]

lgbm.fit(
    X_tr, y_train,
    eval_set=[(X_va, y_valid)],
    eval_metric=["auc", "aucpr"],
    callbacks=callbacks,
)

best_iter = getattr(lgbm, "best_iteration_", None)
print(f"[학습 완료] best_iteration_ = {best_iter}")


[컬럼 개수] total=84, num=83, cat=1
[변환 후 shape] (602676, 86) (129145, 86) (129145, 86)
[희소행렬 여부] False
[클래스 비율] pos=57014, neg=545662, scale_pos_weight=9.571
[LightGBM] [Info] Number of positive: 57014, number of negative: 545662
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16104
[LightGBM] [Info] Number of data points in the train set: 602676, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.094601 -> initscore=-2.258703
[LightGBM] [Info] Start training from score -2.258703
Training until validation scores don't improve for 200 rounds
[200]	valid_0's auc: 0.989476	valid_0's binary_logloss: 0.122417
Early stopping, best iteration is:
[190]	valid_0's auc: 0.989493	valid_0's binary_logloss: 0.12281
[학습 완료] best_iteration_ = 190


In [8]:
# =========================================
# 평가: Score / Confusion Matrix (valid/test)
# =========================================

def evaluate_binary(name, y_true, p_pred, thr=0.5):
    """
    name  : 구간 이름(VALID/TEST)
    y_true: 정답(0/1)
    p_pred: 예측 확률(1일 확률)
    thr   : 확률을 0/1로 바꾸는 기준(threshold)
    """
    auc = roc_auc_score(y_true, p_pred)
    pr  = average_precision_score(y_true, p_pred)

    y_hat = (p_pred >= thr).astype(int)
    cm = confusion_matrix(y_true, y_hat)

    # confusion matrix를 보기 좋게 DataFrame으로
    cm_df = pd.DataFrame(
        cm,
        index=["True:0(Stay)", "True:1(Churn)"],
        columns=["Pred:0", "Pred:1"]
    )

    print(f"\n===== {name} =====")
    print(f"ROC-AUC : {auc:.6f}")
    print(f"PR-AUC  : {pr:.6f}")
    print(f"Threshold: {thr}")
    print("\n[Confusion Matrix]")
    display(cm_df)

    print("\n[Classification Report]")
    print(classification_report(y_true, y_hat, digits=4))

    return {"split": name, "roc_auc": auc, "pr_auc": pr, "threshold": thr, "cm": cm_df}

# 확률 예측
p_valid = lgbm.predict_proba(X_va)[:, 1]
p_test  = lgbm.predict_proba(X_te)[:, 1]

# 평가 실행
res_valid = evaluate_binary("VALID", y_valid, p_valid, thr=0.5)
res_test  = evaluate_binary("TEST",  y_test,  p_test,  thr=0.5)

# 점수만 한 번에 표로 정리
score_df = pd.DataFrame([
    {"split": "VALID", "ROC-AUC": res_valid["roc_auc"], "PR-AUC": res_valid["pr_auc"]},
    {"split": "TEST",  "ROC-AUC": res_test["roc_auc"],  "PR-AUC": res_test["pr_auc"]},
])
score_df





===== VALID =====
ROC-AUC : 0.989493
PR-AUC  : 0.934446
Threshold: 0.5

[Confusion Matrix]


Unnamed: 0,Pred:0,Pred:1
True:0(Stay),112677,4251
True:1(Churn),848,11369



[Classification Report]
              precision    recall  f1-score   support

           0     0.9925    0.9636    0.9779    116928
           1     0.7278    0.9306    0.8168     12217

    accuracy                         0.9605    129145
   macro avg     0.8602    0.9471    0.8974    129145
weighted avg     0.9675    0.9605    0.9626    129145


===== TEST =====
ROC-AUC : 0.989502
PR-AUC  : 0.934667
Threshold: 0.5

[Confusion Matrix]


Unnamed: 0,Pred:0,Pred:1
True:0(Stay),112552,4376
True:1(Churn),818,11399



[Classification Report]
              precision    recall  f1-score   support

           0     0.9928    0.9626    0.9774    116928
           1     0.7226    0.9330    0.8144     12217

    accuracy                         0.9598    129145
   macro avg     0.8577    0.9478    0.8959    129145
weighted avg     0.9672    0.9598    0.9620    129145



Unnamed: 0,split,ROC-AUC,PR-AUC
0,VALID,0.989493,0.934446
1,TEST,0.989502,0.934667


In [9]:
# =========================================
# Feature Importances (Top 20)
# - LightGBM 내부 중요도: gain 기준(추천) + split 기준(참고)
# =========================================

# 전처리 후 feature name 뽑기
# (ColumnTransformer가 get_feature_names_out을 지원하는 sklearn 버전이면 가장 깔끔)
try:
    feature_names = preprocess.get_feature_names_out()
except Exception:
    # 일부 구버전 대응: 가능하면 업그레이드 권장
    feature_names = np.array([f"f{i}" for i in range(X_tr.shape[1])])

# LightGBM booster에서 gain/split 중요도 추출
booster = lgbm.booster_
imp_gain = booster.feature_importance(importance_type="gain")
imp_split = booster.feature_importance(importance_type="split")

fi_df = pd.DataFrame({
    "feature": feature_names,
    "importance_gain": imp_gain,
    "importance_split": imp_split,
}).sort_values("importance_gain", ascending=False)

print("[Top 20 - gain]")
display(fi_df.head(20))

print("[Top 20 - split]")
display(fi_df.sort_values("importance_split", ascending=False).head(20))


[Top 20 - gain]


Unnamed: 0,feature,importance_gain,importance_split
69,num__days_since_last_cancel,1696875.0,632
70,num__has_ever_cancelled,1696818.0,68
81,num__payment_count_last_90d,1621097.0,409
71,num__is_auto_renew_last,1574038.0,215
67,num__days_since_last_payment,1215992.0,1255
79,num__subscription_months_est,799567.7,195
76,num__total_amount_paid,646428.0,608
80,num__payment_count_last_30d,359963.2,329
73,num__last_payment_method,321549.2,605
77,num__avg_amount_per_payment,229958.1,459


[Top 20 - split]


Unnamed: 0,feature,importance_gain,importance_split
67,num__days_since_last_payment,1215992.0,1255
69,num__days_since_last_cancel,1696875.0,632
76,num__total_amount_paid,646428.0,608
73,num__last_payment_method,321549.2,605
77,num__avg_amount_per_payment,229958.1,459
82,num__reg_year,124049.4,446
81,num__payment_count_last_90d,1621097.0,409
80,num__payment_count_last_30d,359963.2,329
71,num__is_auto_renew_last,1574038.0,215
2,num__bd_clean,14189.26,209


In [10]:
# =========================================
# Permutation Importance (Top 20)
# - 여기서는 "원본 컬럼 기준"으로 보기 위해
#   preprocess + model 을 Pipeline처럼 묶어서 permutation importance를 수행합니다.
#   (즉, One-hot으로 늘어난 수천개 feature가 아니라 원래의 feature_cols 단위 Top20)
# =========================================

# (주의) permutation importance는 시간이 꽤 걸릴 수 있음
# - n_repeats 줄이기(예: 3)
# - valid 샘플 수가 너무 크면 일부 샘플만 뽑기(예: 50,000개) 같은 최적화 가능

# 1) (선택) valid에서 너무 크면 샘플링
MAX_PI_SAMPLES = 50000

if len(X_valid2) > MAX_PI_SAMPLES:
    pi_idx = np.random.RandomState(RANDOM_STATE).choice(len(X_valid2), MAX_PI_SAMPLES, replace=False)
    X_pi = X_valid2.iloc[pi_idx].copy()
    y_pi = y_valid.iloc[pi_idx].copy()
    print(f"[Permutation] VALID 샘플링: {len(X_valid2)} -> {len(X_pi)}")
else:
    X_pi = X_valid2.copy()
    y_pi = y_valid.copy()

# 2) "전처리+모델"을 하나로 묶은 형태로 예측이 가능하도록 구성
#    - 이미 preprocess, lgbm 둘 다 학습(fit)되어 있으므로 그대로 사용
from sklearn.pipeline import Pipeline as SkPipeline

pipe = SkPipeline(steps=[
    ("preprocess", preprocess),
    ("model", lgbm),
])

# 3) permutation importance 계산 (AUC 기준)
pi = permutation_importance(
    estimator=pipe,
    X=X_pi,
    y=y_pi,
    scoring="roc_auc",
    n_repeats=3,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

pi_df = pd.DataFrame({
    "feature": X_pi.columns,
    "perm_importance_mean": pi.importances_mean,
    "perm_importance_std": pi.importances_std,
}).sort_values("perm_importance_mean", ascending=False)

print("[Permutation Importance Top 20 (VALID, ROC-AUC 기준)]")
display(pi_df.head(20))


[Permutation] VALID 샘플링: 129145 -> 50000




[Permutation Importance Top 20 (VALID, ROC-AUC 기준)]


Unnamed: 0,feature,perm_importance_mean,perm_importance_std
82,payment_count_last_90d,0.056562,0.000722
70,days_since_last_cancel,0.035283,0.000552
68,days_since_last_payment,0.031097,0.00127
72,is_auto_renew_last,0.025757,0.000326
81,payment_count_last_30d,0.005892,7.5e-05
83,reg_year,0.00364,0.000356
80,subscription_months_est,0.003115,2.8e-05
74,last_payment_method,0.002786,3.1e-05
77,total_amount_paid,0.002351,0.000179
78,avg_amount_per_payment,0.001263,7.8e-05


In [11]:
# =========================================
# 최종 요약(요청하신 항목 한 번에 정리)
# - Model Hyperparameters
# - Score (valid/test)
# - Confusion matrix (valid/test)
# - Feature Importances Top 20
# - Permutation Importance Top 20
# =========================================

print("\n==================== [FINAL SUMMARY] ====================")

print("\n1) Model Hyperparameters")
# 실제 모델 파라미터(학습에 반영된 값) 확인
print(pd.Series(lgbm.get_params()).sort_index().to_string())
print(f"\nBest iteration: {getattr(lgbm, 'best_iteration_', None)}")

print("\n2) Score (VALID / TEST)")
display(score_df)

print("\n3) Confusion Matrix (VALID / TEST)")
print("\n[VALID]")
display(res_valid["cm"])
print("\n[TEST]")
display(res_test["cm"])

print("\n4) Feature Importances (Top 20, gain)")
display(fi_df.head(20))

print("\n5) Permutation Importance (Top 20, VALID / ROC-AUC)")
display(pi_df.head(20))

print("\n=========================================================")



1) Model Hyperparameters
boosting_type            gbdt
class_weight             None
colsample_bytree          0.8
importance_type         split
learning_rate            0.05
max_depth                  -1
min_child_samples          30
min_child_weight        0.001
min_split_gain            0.0
n_estimators             5000
n_jobs                     -1
num_leaves                 63
objective              binary
random_state              719
reg_alpha                 0.0
reg_lambda                0.0
scale_pos_weight     9.570667
subsample                 0.8
subsample_for_bin      200000
subsample_freq              0

Best iteration: 190

2) Score (VALID / TEST)


Unnamed: 0,split,ROC-AUC,PR-AUC
0,VALID,0.989493,0.934446
1,TEST,0.989502,0.934667



3) Confusion Matrix (VALID / TEST)

[VALID]


Unnamed: 0,Pred:0,Pred:1
True:0(Stay),112677,4251
True:1(Churn),848,11369



[TEST]


Unnamed: 0,Pred:0,Pred:1
True:0(Stay),112552,4376
True:1(Churn),818,11399



4) Feature Importances (Top 20, gain)


Unnamed: 0,feature,importance_gain,importance_split
69,num__days_since_last_cancel,1696875.0,632
70,num__has_ever_cancelled,1696818.0,68
81,num__payment_count_last_90d,1621097.0,409
71,num__is_auto_renew_last,1574038.0,215
67,num__days_since_last_payment,1215992.0,1255
79,num__subscription_months_est,799567.7,195
76,num__total_amount_paid,646428.0,608
80,num__payment_count_last_30d,359963.2,329
73,num__last_payment_method,321549.2,605
77,num__avg_amount_per_payment,229958.1,459



5) Permutation Importance (Top 20, VALID / ROC-AUC)


Unnamed: 0,feature,perm_importance_mean,perm_importance_std
82,payment_count_last_90d,0.056562,0.000722
70,days_since_last_cancel,0.035283,0.000552
68,days_since_last_payment,0.031097,0.00127
72,is_auto_renew_last,0.025757,0.000326
81,payment_count_last_30d,0.005892,7.5e-05
83,reg_year,0.00364,0.000356
80,subscription_months_est,0.003115,2.8e-05
74,last_payment_method,0.002786,3.1e-05
77,total_amount_paid,0.002351,0.000179
78,avg_amount_per_payment,0.001263,7.8e-05





### 실행 후 “총 정리”를 어떻게 읽으면 좋은가
- Score (valid/test): 일반적으로 VALID와 TEST 점수가 비슷하면 과적합이 덜합니다.
- Confusion Matrix: Churn(1)을 얼마나 잘 잡는지(특히 FN: True=1인데 Pred=0) 확인하세요.
- Feature Importances (gain): LightGBM이 내부적으로 “성능 개선에 기여한 정도”가 큰 피처입니다(원-핫이 섞여 보일 수 있음).
- Permutation Importance: “원본 컬럼 단위”로 AUC에 기여한 변수를 보여줘서 해석이 더 직관적입니다.