# Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report,
)

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# 데이터 로드/확인

In [2]:
df = pd.read_parquet("../data/processed/kkbox_train_feature_v1.parquet")

In [None]:
assert "msno" in df.columns and "is_churn" in df.columns
print(df.shape)
print(df["is_churn"].value_counts(normalize=True).head())

(860966, 90)
is_churn
0    0.905399
1    0.094601
Name: proportion, dtype: Float64


# Train / Valid / Test split

In [4]:
# 1) test 15%
trainval_df, test_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df["is_churn"],
)

# 2) valid 15% (전체의 15%가 되도록 0.15/0.85)
valid_ratio_in_trainval = 0.15 / 0.85
train_df, valid_df = train_test_split(
    trainval_df,
    test_size=valid_ratio_in_trainval,
    random_state=42,
    stratify=trainval_df["is_churn"],
)

feature_cols = [c for c in df.columns if c not in ["msno", "is_churn"]]

X_train, y_train = train_df[feature_cols], train_df["is_churn"].astype(int)
X_valid, y_valid = valid_df[feature_cols], valid_df["is_churn"].astype(int)
X_test,  y_test  = test_df[feature_cols],  test_df["is_churn"].astype(int)

print("churn rate:", y_train.mean(), y_valid.mean(), y_test.mean())
print(X_train.shape, X_valid.shape, X_test.shape)

churn rate: 0.09460141104009451 0.09459909404158116 0.09459909404158116
(602676, 88) (129145, 88) (129145, 88)


# datetime/period 컬럼을 “피처로 쓰는” 변환 함수
- 원본 datetime/period는 모델이 싫어하니까 파생 숫자 피처로 바꿔서 사용.

In [5]:
from pandas.api.types import is_datetime64_any_dtype

def add_time_features(
    X: pd.DataFrame,
    dt_col: str = "registration_init_time",
    period_col: str = "registration_month",
) -> pd.DataFrame:
    X = X.copy()

    # datetime -> year/month/day
    if dt_col in X.columns:
        dt = pd.to_datetime(X[dt_col], errors="coerce")
        X[f"{dt_col}_year"] = dt.dt.year.astype("Int16")
        X[f"{dt_col}_month"] = dt.dt.month.astype("Int8")
        X[f"{dt_col}_day"] = dt.dt.day.astype("Int8")
        X = X.drop(columns=[dt_col])

    # period[M] -> year/month
    if period_col in X.columns:
        p = X[period_col]
        if not isinstance(p.dtype, pd.PeriodDtype):
            p = p.astype("period[M]")
        X[f"{period_col}_year"] = p.dt.year.astype("Int16")
        X[f"{period_col}_month"] = p.dt.month.astype("Int8")
        X = X.drop(columns=[period_col])

    return X

# 시간 피처 변환 적용 + drop 확인

In [6]:
X_train2 = add_time_features(X_train)
X_valid2 = add_time_features(X_valid)
X_test2  = add_time_features(X_test)

print("before:", X_train.shape, "after:", X_train2.shape)

# 원본 컬럼이 사라졌는지 확인
print("registration_init_time" in X_train2.columns, "registration_month" in X_train2.columns)

before: (602676, 88) after: (602676, 91)
False False


# 컬럼 분리 + preprocess 정의
- LR은 scaler 포함
- LGBM은 scaler 불필요

In [7]:
from pandas.api.types import is_object_dtype, is_string_dtype

def split_cols(X: pd.DataFrame):
    cat_cols = [
        c for c in X.columns
        if (is_object_dtype(X[c])
            or is_string_dtype(X[c])
            or isinstance(X[c].dtype, pd.CategoricalDtype))
    ]
    num_cols = [c for c in X.columns if c not in cat_cols]
    return num_cols, cat_cols

num_cols, cat_cols = split_cols(X_train2)
print("num:", len(num_cols), "cat:", len(cat_cols), "cat example:", cat_cols[:10])

def make_preprocess(num_cols, cat_cols, *, for_lr: bool):
    num_steps = [("imputer", SimpleImputer(strategy="median"))]
    if for_lr:
        num_steps.append(("scaler", StandardScaler(with_mean=False)))  # sparse 안전

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(num_steps), num_cols),
            ("cat", Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("ohe", OneHotEncoder(handle_unknown="ignore")),
            ]), cat_cols),
        ],
        remainder="drop",
    )

num: 90 cat: 1 cat example: ['gender']


# 공통 평가 함수

In [8]:
def eval_binary(y_true, y_proba, threshold=0.5, prefix=""):
    roc = roc_auc_score(y_true, y_proba)
    pr  = average_precision_score(y_true, y_proba)

    y_pred = (y_proba >= threshold).astype(int)
    cm = confusion_matrix(y_true, y_pred)

    print(f"{prefix}ROC-AUC: {roc:.6f} | PR-AUC(AP): {pr:.6f} | thr={threshold}")
    print(f"{prefix}Confusion matrix:\n{cm}")
    print(f"{prefix}Classification report:\n{classification_report(y_true, y_pred, digits=4)}")

    return roc, pr, cm

# Logistic Regression 학습/평가 (빠른 루트)

In [None]:
preprocess_lr = make_preprocess(num_cols, cat_cols, for_lr=True)

X_train_t = preprocess_lr.fit_transform(X_train2)
X_valid_t = preprocess_lr.transform(X_valid2)
X_test_t  = preprocess_lr.transform(X_test2)

lr = LogisticRegression(
    max_iter=3000,
    class_weight="balanced",
    solver="saga",
    # n_jobs=-1,
)

lr.fit(X_train_t, y_train)

p_valid = lr.predict_proba(X_valid_t)[:, 1]
p_test  = lr.predict_proba(X_test_t)[:, 1]

eval_binary(y_valid, p_valid, prefix="[LR valid] ")
eval_binary(y_test,  p_test,  prefix="[LR test ] ")



[LR valid] ROC-AUC: 0.966573 | PR-AUC(AP): 0.839794 | thr=0.5
[LR valid] Confusion matrix:
[[111510   5418]
 [  1498  10719]]
[LR valid] Classification report:
              precision    recall  f1-score   support

           0     0.9867    0.9537    0.9699    116928
           1     0.6642    0.8774    0.7561     12217

    accuracy                         0.9464    129145
   macro avg     0.8255    0.9155    0.8630    129145
weighted avg     0.9562    0.9464    0.9497    129145

[LR test ] ROC-AUC: 0.966887 | PR-AUC(AP): 0.842084 | thr=0.5
[LR test ] Confusion matrix:
[[111540   5388]
 [  1454  10763]]
[LR test ] Classification report:
              precision    recall  f1-score   support

           0     0.9871    0.9539    0.9702    116928
           1     0.6664    0.8810    0.7588     12217

    accuracy                         0.9470    129145
   macro avg     0.8268    0.9175    0.8645    129145
weighted avg     0.9568    0.9470    0.9502    129145





(0.9668869425747473,
 0.8420841002521133,
 array([[111540,   5388],
        [  1454,  10763]]))

# LightGBM 학습/평가 (early stopping 포함)

In [26]:
preprocess_lgbm = make_preprocess(num_cols, cat_cols, for_lr=False)

X_train_t = preprocess_lgbm.fit_transform(X_train2)
X_valid_t = preprocess_lgbm.transform(X_valid2)
X_test_t  = preprocess_lgbm.transform(X_test2)

pos = float(y_train.sum())
neg = float((y_train == 0).sum())
scale_pos_weight = neg / pos if pos > 0 else 1.0
print("scale_pos_weight:", scale_pos_weight)

lgbm = lgb.LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.02,
    num_leaves=63,
    min_child_samples=50,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
)

lgbm.fit(
    X_train_t, y_train,
    eval_set=[(X_valid_t, y_valid)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(200, verbose=True)],
)

p_valid_lgbm = lgbm.predict_proba(X_valid_t)[:, 1]
p_test_lgbm  = lgbm.predict_proba(X_test_t)[:, 1]

eval_binary(y_valid, p_valid_lgbm, prefix="[LGBM valid] ")
eval_binary(y_test,  p_test_lgbm,  prefix="[LGBM test ] ")

scale_pos_weight: 9.570666853755219
[LightGBM] [Info] Number of positive: 57014, number of negative: 545662
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16815
[LightGBM] [Info] Number of data points in the train set: 602676, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.094601 -> initscore=-2.258703
[LightGBM] [Info] Start training from score -2.258703
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[132]	valid_0's auc: 0.988414	valid_0's binary_logloss: 0.12475




[LGBM valid] ROC-AUC: 0.988414 | PR-AUC(AP): 0.923838 | thr=0.5
[LGBM valid] Confusion matrix:
[[112672   4256]
 [   888  11329]]
[LGBM valid] Classification report:
              precision    recall  f1-score   support

           0     0.9922    0.9636    0.9777    116928
           1     0.7269    0.9273    0.8150     12217

    accuracy                         0.9602    129145
   macro avg     0.8595    0.9455    0.8963    129145
weighted avg     0.9671    0.9602    0.9623    129145

[LGBM test ] ROC-AUC: 0.988967 | PR-AUC(AP): 0.928185 | thr=0.5
[LGBM test ] Confusion matrix:
[[112674   4254]
 [   888  11329]]
[LGBM test ] Classification report:
              precision    recall  f1-score   support

           0     0.9922    0.9636    0.9777    116928
           1     0.7270    0.9273    0.8150     12217

    accuracy                         0.9602    129145
   macro avg     0.8596    0.9455    0.8964    129145
weighted avg     0.9671    0.9602    0.9623    129145



(0.988966981410978,
 0.9281846271009779,
 array([[112674,   4254],
        [   888,  11329]]))

In [11]:
import joblib
from pathlib import Path

Path("artifacts").mkdir(exist_ok=True)

joblib.dump({"model": lr, "preprocess": preprocess_lr}, "artifacts/lr.joblib")
joblib.dump({"model": lgbm, "preprocess": preprocess_lgbm}, "artifacts/lgbm.joblib")

print("saved to artifacts/")

saved to artifacts/


# XGBoost, CatBoost, RandomForest 모델을 위한 공통 전처리

In [13]:
preprocess_common = make_preprocess(num_cols, cat_cols, for_lr=False)

X_train_t = preprocess_common.fit_transform(X_train2)
X_valid_t = preprocess_common.transform(X_valid2)
X_test_t  = preprocess_common.transform(X_test2)

pos = float(y_train.sum())
neg = float((y_train == 0).sum())
scale_pos_weight = neg / pos if pos > 0 else 1.0
scale_pos_weight

9.570666853755219

# XGBoost 학습/평가

In [14]:
import xgboost as xgb

In [16]:
xgb_clf = xgb.XGBClassifier(
    n_estimators=5000,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    objective="binary:logistic",
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds=200,
)

xgb_clf.fit(
    X_train_t, y_train,
    eval_set=[(X_valid_t, y_valid)],
    verbose=False,
)

p_valid_xgb = xgb_clf.predict_proba(X_valid_t)[:, 1]
p_test_xgb  = xgb_clf.predict_proba(X_test_t)[:, 1]

eval_binary(y_valid, p_valid_xgb, prefix="[XGB valid] ")
eval_binary(y_test,  p_test_xgb,  prefix="[XGB test ] ")

[XGB valid] ROC-AUC: 0.988489 | PR-AUC(AP): 0.931125 | thr=0.5
[XGB valid] Confusion matrix:
[[112798   4130]
 [   889  11328]]
[XGB valid] Classification report:
              precision    recall  f1-score   support

           0     0.9922    0.9647    0.9782    116928
           1     0.7328    0.9272    0.8186     12217

    accuracy                         0.9611    129145
   macro avg     0.8625    0.9460    0.8984    129145
weighted avg     0.9676    0.9611    0.9631    129145

[XGB test ] ROC-AUC: 0.989018 | PR-AUC(AP): 0.934766 | thr=0.5
[XGB test ] Confusion matrix:
[[112755   4173]
 [   888  11329]]
[XGB test ] Classification report:
              precision    recall  f1-score   support

           0     0.9922    0.9643    0.9781    116928
           1     0.7308    0.9273    0.8174     12217

    accuracy                         0.9608    129145
   macro avg     0.8615    0.9458    0.8977    129145
weighted avg     0.9675    0.9608    0.9629    129145



(0.9890180685800413,
 0.9347664053598701,
 array([[112755,   4173],
        [   888,  11329]]))

# CatBoost 학습/평가

In [18]:
from catboost import CatBoostClassifier

## 1) Period/Datetime 처리 (CatBoost용: 문자열/카테고리로)

In [21]:
def prep_catboost_df(
    X: pd.DataFrame,
    *,
    dt_col: str = "registration_init_time",
    period_col: str = "registration_month",
) -> pd.DataFrame:
    X = X.copy()

    # datetime -> 문자열 카테고리(또는 파생 피처). 여기선 파생으로 추천.
    if dt_col in X.columns:
        dt = pd.to_datetime(X[dt_col], errors="coerce")
        X[f"{dt_col}_year"] = dt.dt.year.astype("Int16")
        X[f"{dt_col}_month"] = dt.dt.month.astype("Int8")
        X[f"{dt_col}_day"] = dt.dt.day.astype("Int8")
        X = X.drop(columns=[dt_col])

    # period[M] -> 문자열 카테고리로 변환 ("2015-01" 같은 형태)
    if period_col in X.columns:
        p = X[period_col]
        if isinstance(p.dtype, pd.PeriodDtype):
            X[period_col] = p.astype(str)        # "2015-01"
        else:
            # 혹시 object로 들어왔어도 방어적으로 처리
            X[period_col] = p.astype("string")

        # catboost가 확실히 categorical로 보게끔 category로 바꿔도 됨(선택)
        X[period_col] = X[period_col].astype("category")

    return X

## 2) CatBoost 입력 만들기

In [23]:
X_train_cb = prep_catboost_df(X_train)
X_valid_cb = prep_catboost_df(X_valid)
X_test_cb  = prep_catboost_df(X_test)

# categorical 컬럼 찾기: object / category / string
cat_cols_cb = [
    c for c in X_train_cb.columns
    if (X_train_cb[c].dtype == "object"
        or pd.api.types.is_string_dtype(X_train_cb[c])
        or isinstance(X_train_cb[c].dtype, pd.CategoricalDtype))
]

cat_col_idx = [X_train_cb.columns.get_loc(c) for c in cat_cols_cb]

print("cat cols:", len(cat_cols_cb))
print("example:", cat_cols_cb[:10])

cat cols: 2
example: ['gender', 'registration_month']


In [24]:
cat_clf = CatBoostClassifier(
    iterations=5000,
    learning_rate=0.03,
    depth=8,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=200,
)

cat_clf.fit(
    X_train_cb,
    y_train,
    eval_set=(X_valid_cb, y_valid),
    cat_features=cat_col_idx,
    use_best_model=True,
    early_stopping_rounds=200,
)

p_valid_cat = cat_clf.predict_proba(X_valid_cb)[:, 1]
p_test_cat  = cat_clf.predict_proba(X_test_cb)[:, 1]

eval_binary(y_valid, p_valid_cat, prefix="[CAT valid] ")
eval_binary(y_test,  p_test_cat,  prefix="[CAT test ] ")

0:	test: 0.9614759	best: 0.9614759 (0)	total: 191ms	remaining: 15m 53s
200:	test: 0.9876067	best: 0.9876067 (200)	total: 23.6s	remaining: 9m 23s
400:	test: 0.9881368	best: 0.9881368 (400)	total: 46.1s	remaining: 8m 48s
600:	test: 0.9884856	best: 0.9884856 (600)	total: 1m 8s	remaining: 8m 24s
800:	test: 0.9886492	best: 0.9886499 (794)	total: 1m 31s	remaining: 8m
1000:	test: 0.9887249	best: 0.9887262 (995)	total: 1m 54s	remaining: 7m 37s
1200:	test: 0.9887681	best: 0.9887682 (1181)	total: 2m 17s	remaining: 7m 15s
1400:	test: 0.9887968	best: 0.9887968 (1400)	total: 2m 40s	remaining: 6m 52s
1600:	test: 0.9888190	best: 0.9888208 (1587)	total: 3m 3s	remaining: 6m 29s
1800:	test: 0.9888303	best: 0.9888354 (1696)	total: 3m 26s	remaining: 6m 7s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9888354481
bestIteration = 1696

Shrink model to first 1697 iterations.
[CAT valid] ROC-AUC: 0.988835 | PR-AUC(AP): 0.934152 | thr=0.5
[CAT valid] Confusion matrix:
[[115603   1325]
 [ 

(0.9891901644053332,
 0.9375645807302999,
 array([[115679,   1249],
        [  1821,  10396]]))

# RandomForest 학습/평가 (sparse 그대로)

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=600,              # 너무 크게 잡으면 오래 걸림
    max_depth=None,
    min_samples_leaf=50,           # 과적합/시간 둘 다 완화
    min_samples_split=100,
    max_features="sqrt",
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced_subsample",
)

rf.fit(X_train_t, y_train)

p_valid_rf = rf.predict_proba(X_valid_t)[:, 1]
p_test_rf  = rf.predict_proba(X_test_t)[:, 1]

eval_binary(y_valid, p_valid_rf, prefix="[RF valid] ")
eval_binary(y_test,  p_test_rf,  prefix="[RF test ] ")

[RF valid] ROC-AUC: 0.984179 | PR-AUC(AP): 0.905187 | thr=0.5
[RF valid] Confusion matrix:
[[112999   3929]
 [  1146  11071]]
[RF valid] Classification report:
              precision    recall  f1-score   support

           0     0.9900    0.9664    0.9780    116928
           1     0.7381    0.9062    0.8135     12217

    accuracy                         0.9607    129145
   macro avg     0.8640    0.9363    0.8958    129145
weighted avg     0.9661    0.9607    0.9625    129145

[RF test ] ROC-AUC: 0.984798 | PR-AUC(AP): 0.909327 | thr=0.5
[RF test ] Confusion matrix:
[[113056   3872]
 [  1166  11051]]
[RF test ] Classification report:
              precision    recall  f1-score   support

           0     0.9898    0.9669    0.9782    116928
           1     0.7405    0.9046    0.8144     12217

    accuracy                         0.9610    129145
   macro avg     0.8652    0.9357    0.8963    129145
weighted avg     0.9662    0.9610    0.9627    129145



(0.9847984648439577,
 0.9093271047018638,
 array([[113056,   3872],
        [  1166,  11051]]))

# Ensemble (CatBoost + LightGBM) 

In [27]:
# === Ensemble (CatBoost + LightGBM) ===
# 전제: p_valid_cat, p_test_cat, p_valid_lgbm, p_test_lgbm 이미 있음
#       y_valid, y_test, eval_binary() 이미 있음

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_fscore_support

def sweep_ensemble_weights(
    y_true,
    p_cat,
    p_lgbm,
    weights=np.linspace(0.0, 1.0, 21),  # cat weight
):
    rows = []
    for w in weights:
        p = w * p_cat + (1 - w) * p_lgbm
        roc = roc_auc_score(y_true, p)
        ap = average_precision_score(y_true, p)

        # 기본 thr=0.5에서 precision/recall/f1도 같이 기록
        y_pred = (p >= 0.5).astype(int)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary", zero_division=0
        )

        rows.append({
            "w_cat": float(w),
            "w_lgbm": float(1 - w),
            "roc_auc": float(roc),
            "ap": float(ap),
            "precision@0.5": float(prec),
            "recall@0.5": float(rec),
            "f1@0.5": float(f1),
        })

    return pd.DataFrame(rows).sort_values(["ap", "roc_auc"], ascending=False)

# 1) valid에서 가중치 스윕
ens_valid_df = sweep_ensemble_weights(
    y_valid, p_valid_cat, p_valid_lgbm,
    weights=np.linspace(0.0, 1.0, 41)  # 0.025 간격
)
display(ens_valid_df.head(10))

best_w = float(ens_valid_df.iloc[0]["w_cat"])
print("best_w_cat(valid):", best_w)

# 2) best weight로 valid/test 평가 (thr=0.5)
p_valid_ens = best_w * p_valid_cat + (1 - best_w) * p_valid_lgbm
p_test_ens  = best_w * p_test_cat  + (1 - best_w) * p_test_lgbm

eval_binary(y_valid, p_valid_ens, prefix=f"[ENS valid w_cat={best_w:.3f}] ")
eval_binary(y_test,  p_test_ens,  prefix=f"[ENS test  w_cat={best_w:.3f}] ")

Unnamed: 0,w_cat,w_lgbm,roc_auc,ap,precision@0.5,recall@0.5,f1@0.5
33,0.825,0.175,0.989052,0.934471,0.869919,0.865433,0.867671
34,0.85,0.15,0.989049,0.934469,0.871776,0.863142,0.867437
32,0.8,0.2,0.989053,0.934468,0.867271,0.869117,0.868193
31,0.775,0.225,0.989053,0.93446,0.865258,0.871491,0.868363
35,0.875,0.125,0.989041,0.934459,0.874303,0.860277,0.867233
30,0.75,0.25,0.989051,0.934448,0.862476,0.873701,0.868052
36,0.9,0.1,0.989028,0.93444,0.877385,0.858067,0.867618
29,0.725,0.275,0.989049,0.934434,0.859469,0.877057,0.868174
28,0.7,0.3,0.989046,0.934415,0.856881,0.879676,0.868129
37,0.925,0.075,0.989007,0.934408,0.879865,0.854874,0.86719


best_w_cat(valid): 0.8250000000000001
[ENS valid w_cat=0.825] ROC-AUC: 0.989052 | PR-AUC(AP): 0.934471 | thr=0.5
[ENS valid w_cat=0.825] Confusion matrix:
[[115347   1581]
 [  1644  10573]]
[ENS valid w_cat=0.825] Classification report:
              precision    recall  f1-score   support

           0     0.9859    0.9865    0.9862    116928
           1     0.8699    0.8654    0.8677     12217

    accuracy                         0.9750    129145
   macro avg     0.9279    0.9260    0.9269    129145
weighted avg     0.9750    0.9750    0.9750    129145

[ENS test  w_cat=0.825] ROC-AUC: 0.989473 | PR-AUC(AP): 0.937834 | thr=0.5
[ENS test  w_cat=0.825] Confusion matrix:
[[115438   1490]
 [  1616  10601]]
[ENS test  w_cat=0.825] Classification report:
              precision    recall  f1-score   support

           0     0.9862    0.9873    0.9867    116928
           1     0.8768    0.8677    0.8722     12217

    accuracy                         0.9759    129145
   macro avg     0.

(0.9894727239788168,
 0.9378343019430334,
 array([[115438,   1490],
        [  1616,  10601]]))