# Logistic Regression (Hyperparameter Tuning) · v3 dataset

- Baseline 대비 PR-AUC / Recall / F1-score 향상을 목표로 Grid Search + cross-validation 적용
- 평가 기준: **PR-AUC** (average precision)을 주평가로 삼고, Recall, F1도 동시에 모니터링


In [2]:

import numpy as np
import pandas as pd

# Notebook display settings: show every column, and up to 200 rows, when a
# DataFrame is rendered.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 200),
):
    pd.set_option(option_name, option_value)


## 1. 데이터 로드


In [3]:

# Load the processed v3 feature table and print its (rows, columns) shape.
parquet_path = '../data/processed/kkbox_train_feature_v3.parquet'
df = pd.read_parquet(parquet_path)
print(df.shape)


(860966, 85)


## 2. 상수 및 피처 목록


In [4]:

# Random seed constant referenced by later cells.
RANDOM_STATE = 719

# Name of the ID column (not part of FEATURE_COLS) and of the label column.
ID_COL = "msno"
TARGET_COL = "is_churn"

# Columns that the preprocessing step routes through one-hot encoding.
CATEGORICAL_COLS = [
    "city",
    "gender",
    "registered_via",
    "last_payment_method",
    "has_ever_paid",
    "has_ever_cancelled",
    "is_auto_renew_last",
    "is_free_user",
]

# Every activity feature exists once per window, named "<stem>_w<window>".
# NOTE(review): the window suffix is presumably a look-back length in days.
_WINDOWS = (7, 14, 21, 30)
_WINDOW_STEMS = (
    "num_days_active",
    "total_secs",
    "avg_secs_per_day",
    "std_secs",
    "num_songs",
    "avg_songs_per_day",
    "num_unq",
    "num_25",
    "num_100",
    "short_play",
    "skip_ratio",
    "completion_ratio",
    "short_play_ratio",
    "variety_ratio",
)

# Cross-window trend features.
_TREND_COLS = [
    "secs_trend_w7_w30",
    "secs_trend_w14_w30",
    "days_trend_w7_w14",
    "days_trend_w7_w30",
    "songs_trend_w7_w30",
    "songs_trend_w14_w30",
    "skip_trend_w7_w30",
    "completion_trend_w7_w30",
]

# Payment / subscription history features.
_PAYMENT_COLS = [
    "days_since_last_payment",
    "days_since_last_cancel",
    "last_plan_days",
    "total_payment_count",
    "total_amount_paid",
    "avg_amount_per_payment",
    "unique_plan_count",
    "subscription_months_est",
    "payment_count_last_30d",
    "payment_count_last_90d",
]

# Order: reg_days, then all stems for w7, w14, w21, w30 (window-major),
# then the trend columns, then the payment columns.
NUMERICAL_COLS = (
    ["reg_days"]
    + [f"{stem}_w{window}" for window in _WINDOWS for stem in _WINDOW_STEMS]
    + _TREND_COLS
    + _PAYMENT_COLS
)

# Full model input: categorical columns first, numerical columns after.
FEATURE_COLS = CATEGORICAL_COLS + NUMERICAL_COLS


## 3. Train / Test Split
- Baseline과 동일한 stratified 80/20 split


In [5]:

from sklearn.model_selection import train_test_split

# Label as integers, and the feature matrix restricted to FEATURE_COLS.
y = df[TARGET_COL].astype(int)
X = df[FEATURE_COLS]

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
split_kwargs = dict(test_size=0.2, stratify=y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Cell output: shapes of the two feature matrices.
(X_train.shape, X_test.shape)


((688772, 83), (172194, 83))

## 4. Column Groups & Preprocessing (OHE + Scaling)


In [6]:

# Keep only the declared columns that are actually present in X_train,
# preserving the declaration order of each list.
available = set(X_train.columns)
cat_cols = [name for name in CATEGORICAL_COLS if name in available]
num_cols = [name for name in NUMERICAL_COLS if name in available]
print("num_cols:", len(num_cols), "| cat_cols:", len(cat_cols))


num_cols: 75 | cat_cols: 8


In [7]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Numeric branch: scale by standard deviation only (with_mean=False, so the
# columns are not centered).
numeric_tf = Pipeline([("scaler", StandardScaler(with_mean=False))])

# Categorical branch: sparse one-hot encoding; handle_unknown="ignore" keeps
# transform from raising on categories that were not seen during fit.
categorical_tf = Pipeline([
    ("onehot", OneHotEncoder(sparse_output=True, handle_unknown="ignore")),
])

# Route each column group to its branch; any column not listed is dropped.
column_routes = [
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols),
]
preprocess = ColumnTransformer(column_routes, remainder="drop")


## 5. Hyperparameter Grid & GridSearchCV
- Solver: `saga` (희소 행렬 + L1/L2/elasticnet 지원)
- 다중 스코어: PR-AUC(=average_precision), Recall, F1, Accuracy (refit은 PR-AUC)
- 3-fold Stratified CV


In [8]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Base estimator. `saga` is the solver that supports l1, l2 and elasticnet
# penalties. It is a stochastic solver, so the seed is fixed here; without
# random_state the CV scores and the refit model differ from run to run.
# n_jobs is deliberately not set on the estimator: for a binary target it
# does not parallelize anything, and GridSearchCV below already fans the
# fits out across all cores.
log_reg = LogisticRegression(
    solver="saga",
    max_iter=5000,
    random_state=RANDOM_STATE,
)

# Preprocessing lives inside the pipeline so that it is re-fit on the training
# part of every CV fold.
clf = Pipeline(steps=[
    ("prep", preprocess),
    ("model", log_reg),
])

# Two sub-grids because l1_ratio only applies to the elasticnet penalty.
# 2 * 4 * 2 = 16 candidates + 3 * 4 * 2 = 24 candidates -> 40 in total.
param_grid = [
    {
        "model__penalty": ["l1", "l2"],
        "model__C": [0.05, 0.1, 0.3, 1.0],
        "model__class_weight": [None, "balanced"],
    },
    {
        "model__penalty": ["elasticnet"],
        "model__l1_ratio": [0.2, 0.5, 0.8],
        "model__C": [0.05, 0.1, 0.3, 1.0],
        "model__class_weight": [None, "balanced"],
    }
]

# All four metrics are recorded per candidate; the dict keys become the
# suffixes of the cv_results_ columns (e.g. mean_test_prauc).
scoring = {
    "prauc": "average_precision",
    "recall": "recall",
    "f1": "f1",
    "accuracy": "accuracy",
}

# refit="prauc": best_params_ / best_score_ / best_estimator_ are chosen by
# average precision, and the winner is re-fit on the full training set.
grid = GridSearchCV(
    clf,
    param_grid=param_grid,
    scoring=scoring,
    refit="prauc",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

grid.fit(X_train, y_train)


Fitting 3 folds for each of 40 candidates, totalling 120 fits




[CV] END model__C=0.05, model__class_weight=balanced, model__penalty=l2; total time=233.6min




[CV] END model__C=0.05, model__class_weight=balanced, model__penalty=l1; total time=273.6min




[CV] END model__C=0.05, model__class_weight=None, model__penalty=l2; total time=276.1min




KeyboardInterrupt: 

In [None]:

# Winning hyperparameters and their mean CV score on the refit metric.
best_params = grid.best_params_
best_cv_prauc = grid.best_score_
print("Best params:", best_params)
print("Best CV PR-AUC:", best_cv_prauc)


In [None]:

# Columns kept in the summary table: the candidate's params, its mean CV score
# for each metric, and its rank by PR-AUC.
report_cols = [
    "params",
    "mean_test_prauc",
    "mean_test_recall",
    "mean_test_f1",
    "mean_test_accuracy",
    "rank_test_prauc",
]

# Best candidates first (rank 1 = highest mean PR-AUC).
all_results = pd.DataFrame(grid.cv_results_)
cv_results = all_results.sort_values("rank_test_prauc")[report_cols]
cv_results.head(10)


## 6. Test Evaluation (Best Estimator)


In [None]:

from sklearn.metrics import (
    average_precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
)

# Pipeline re-fit on the full training set with the best hyperparameters.
best_clf = grid.best_estimator_

# Positive-class probabilities feed PR-AUC; hard predictions feed the
# label-based metrics.
y_proba = best_clf.predict_proba(X_test)[:, 1]
y_pred = best_clf.predict(X_test)

# Each metric is computed and reported in turn on the held-out test set.
test_pr_auc = average_precision_score(y_test, y_proba)
print(f"PR-AUC: {test_pr_auc:.4f}")

test_recall = recall_score(y_test, y_pred)
print(f"Recall (Churn): {test_recall:.4f}")

test_f1 = f1_score(y_test, y_pred)
print(f"F1-score: {test_f1:.4f}")

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")


In [None]:

# Confusion matrix on the test set, labelled for display:
# rows are the true class, columns are the predicted class.
cm = confusion_matrix(y_test, y_pred)
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Pred 0", "Pred 1"]
pd.DataFrame(cm, index=row_labels, columns=col_labels)


In [None]:

# Per-class precision / recall / F1 on the test set, to four decimals.
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)


## 7. Feature Importance (coefficients)


In [None]:

# Pull the fitted steps out of the best pipeline.
fitted_prep = best_clf.named_steps["prep"]
fitted_model = best_clf.named_steps["model"]

# Output column names of the preprocessor and the matching coefficient row.
feature_names = fitted_prep.get_feature_names_out()
coef = fitted_model.coef_[0]

# One row per transformed feature, ordered by coefficient magnitude
# (largest first).
imp_df = (
    pd.DataFrame({"feature": feature_names, "coef": coef})
      .assign(abs_coef=lambda frame: frame["coef"].abs())
      .sort_values("abs_coef", ascending=False)
)
imp_df.head(20)


In [None]:

from pathlib import Path

# Defined once so the file written and the message printed cannot drift apart.
output_path = "../data/model_df/lr_tuned_feature_importance.csv"

# to_csv does not create missing directories; make sure the target folder
# exists so the export cannot fail after a long grid search.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

imp_df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")


## 8. 요약
- PR-AUC/Recall/F1을 동시에 고려한 튜닝 결과와 원본 baseline 대비 향상폭을 기록하세요.
- 추가 튜닝 아이디어: class_weight 커스텀, threshold 조정, PolynomialFeatures 등
