In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
import joblib
import os
import numpy as np
import pandas as pd


In [2]:
df = pd.read_parquet('data/kkbox_train_feature_v3.parquet')

In [3]:
RANDOM_STATE = 719

ID_COL = "msno"
TARGET_COL = "is_churn"

CATEGORICAL_COLS = [
    "city", "gender", "registered_via", "last_payment_method",
    "has_ever_paid", "has_ever_cancelled", "is_auto_renew_last", "is_free_user",
]

NUMERICAL_COLS = [
    "reg_days",
    "num_days_active_w7", "total_secs_w7", "avg_secs_per_day_w7", "std_secs_w7",
    "num_songs_w7", "avg_songs_per_day_w7", "num_unq_w7", "num_25_w7", "num_100_w7",
    "short_play_w7", "skip_ratio_w7", "completion_ratio_w7", "short_play_ratio_w7", "variety_ratio_w7",
    "num_days_active_w14", "total_secs_w14", "avg_secs_per_day_w14", "std_secs_w14",
    "num_songs_w14", "avg_songs_per_day_w14", "num_unq_w14", "num_25_w14", "num_100_w14",
    "short_play_w14", "skip_ratio_w14", "completion_ratio_w14", "short_play_ratio_w14", "variety_ratio_w14",
    "num_days_active_w21", "total_secs_w21", "avg_secs_per_day_w21", "std_secs_w21",
    "num_songs_w21", "avg_songs_per_day_w21", "num_unq_w21", "num_25_w21", "num_100_w21",
    "short_play_w21", "skip_ratio_w21", "completion_ratio_w21", "short_play_ratio_w21", "variety_ratio_w21",
    "num_days_active_w30", "total_secs_w30", "avg_secs_per_day_w30", "std_secs_w30",
    "num_songs_w30", "avg_songs_per_day_w30", "num_unq_w30", "num_25_w30", "num_100_w30",
    "short_play_w30", "skip_ratio_w30", "completion_ratio_w30", "short_play_ratio_w30", "variety_ratio_w30",
    "secs_trend_w7_w30", "secs_trend_w14_w30", "days_trend_w7_w14", "days_trend_w7_w30",
    "songs_trend_w7_w30", "songs_trend_w14_w30", "skip_trend_w7_w30", "completion_trend_w7_w30",
    "days_since_last_payment", "days_since_last_cancel", "last_plan_days",
    "total_payment_count", "total_amount_paid", "avg_amount_per_payment",
    "unique_plan_count", "subscription_months_est", "payment_count_last_30d", "payment_count_last_90d",
]

FEATURE_COLS = CATEGORICAL_COLS + NUMERICAL_COLS

X = df[FEATURE_COLS].copy()
y = df[TARGET_COL].astype(int).copy()


In [4]:
from sklearn.model_selection import train_test_split

# 1. train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

# 2. 컬럼 검증
missing_num = set(NUMERICAL_COLS) - set(X_train.columns)
missing_cat = set(CATEGORICAL_COLS) - set(X_train.columns)

assert not missing_num and not missing_cat, \
    f"Missing columns\nNUM: {missing_num}\nCAT: {missing_cat}"

num_cols = NUMERICAL_COLS
cat_cols = CATEGORICAL_COLS

X_train_cb = X_train.copy()
X_test_cb  = X_test.copy()

for col in cat_cols:
    X_train_cb[col] = X_train_cb[col].astype(str).fillna("UNKNOWN")
    X_test_cb[col]  = X_test_cb[col].astype(str).fillna("UNKNOWN")

X_train_cb[num_cols] = X_train_cb[num_cols].fillna(0)
X_test_cb[num_cols]  = X_test_cb[num_cols].fillna(0)

cat_feature_indices = [
    X_train_cb.columns.get_loc(col) for col in cat_cols
]


In [5]:
model = CatBoostClassifier(
    iterations=4000,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    eval_metric="PRAUC",
    auto_class_weights="Balanced",
    random_seed=RANDOM_STATE,
    verbose=False,
    early_stopping_rounds=200,
)

eval_set = X_test_cb, y_test
model.fit(
    X_train_cb, y_train,
    eval_set=eval_set,
    cat_features=cat_feature_indices,
    use_best_model=True
)


<catboost.core.CatBoostClassifier at 0x17cdb4cdd60>

In [6]:
model.get_best_iteration()

1586

In [7]:
model.get_best_score()


{'learn': {'Logloss': 0.11873195210497985, 'PRAUC': 0.9912244287920877},
 'validation': {'Logloss': 0.12940180418180153, 'PRAUC': 0.9895603822090873}}

In [9]:
y_pred_proba = model.predict_proba(X_test_cb)[:, 1]

from sklearn.metrics import average_precision_score, roc_auc_score

pr_auc = average_precision_score(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("PR-AUC:", pr_auc)
print("ROC-AUC:", roc_auc)


PR-AUC: 0.9336633781679303
ROC-AUC: 0.9895935049057594


In [10]:
import numpy as np
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = np.argmax(f1)

best_threshold = thresholds[best_idx]
best_threshold


np.float64(0.8769032299854463)

In [None]:
import joblib

joblib.dump(
    {
        "model": model,
        "feature_cols": FEATURE_COLS,
        "cat_cols": cat_cols,
        "num_cols": num_cols,
        "cat_feature_indices": cat_feature_indices,
        "threshold": best_threshold,
    },
    "churn_catboost_model.joblib"
)

def preprocess_for_inference(df, cat_cols, num_cols):
    df = df.copy()

    for col in cat_cols:
        df[col] = df[col].astype(str).fillna("UNKNOWN")

    df[num_cols] = df[num_cols].fillna(0)

    return df



In [None]:
bundle = joblib.load("churn_catboost_model.joblib")

model = bundle["model"]
threshold = bundle["threshold"]

X_input = preprocess_for_inference(
    input_df,
    bundle["cat_cols"],
    bundle["num_cols"],
)

proba = model.predict_proba(X_input)[:, 1]
is_churn = (proba >= threshold).astype(int)

In [14]:
df_head = df.head(5)
df_head.to_csv("kkbox_head_5.csv", index=False)
