# Random Forest Classifier Baseline (Model Comparison) v3

- 모델 종류 체급 비교 목적

평가 (Test set):
- PR-AUC
- Recall (Churn)
- Accuracy
- Confusion Matrix
- Classification Report
- Feature Importance


In [3]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None)


In [4]:
df = pd.read_parquet('data/kkbox_train_feature_v3.parquet')


In [5]:
RANDOM_STATE = 719

ID_COL = "msno"
TARGET_COL = "is_churn"

CATEGORICAL_COLS = [
    "city",
    "gender",
    "registered_via",
    "last_payment_method",
    "has_ever_paid",
    "has_ever_cancelled",
    # "is_auto_renew_last",
    "is_free_user",
]

NUMERICAL_COLS = [
    # "reg_days",
    "num_days_active_w7",
    "total_secs_w7",
    "avg_secs_per_day_w7",
    "std_secs_w7",
    "num_songs_w7",
    "avg_songs_per_day_w7",
    "num_unq_w7",
    "num_25_w7",
    "num_100_w7",
    "short_play_w7",
    "skip_ratio_w7",
    "completion_ratio_w7",
    "short_play_ratio_w7",
    "variety_ratio_w7",
    "num_days_active_w14",
    "total_secs_w14",
    "avg_secs_per_day_w14",
    "std_secs_w14",
    "num_songs_w14",
    "avg_songs_per_day_w14",
    "num_unq_w14",
    "num_25_w14",
    "num_100_w14",
    "short_play_w14",
    "skip_ratio_w14",
    "completion_ratio_w14",
    "short_play_ratio_w14",
    "variety_ratio_w14",
    "num_days_active_w21",
    "total_secs_w21",
    "avg_secs_per_day_w21",
    "std_secs_w21",
    "num_songs_w21",
    "avg_songs_per_day_w21",
    "num_unq_w21",
    "num_25_w21",
    "num_100_w21",
    "short_play_w21",
    "skip_ratio_w21",
    "completion_ratio_w21",
    "short_play_ratio_w21",
    "variety_ratio_w21",
    "num_days_active_w30",
    "total_secs_w30",
    "avg_secs_per_day_w30",
    "std_secs_w30",
    "num_songs_w30",
    "avg_songs_per_day_w30",
    "num_unq_w30",
    "num_25_w30",
    "num_100_w30",
    "short_play_w30",
    "skip_ratio_w30",
    "completion_ratio_w30",
    "short_play_ratio_w30",
    "variety_ratio_w30",
    "secs_trend_w7_w30",
    "secs_trend_w14_w30",
    "days_trend_w7_w14",
    "days_trend_w7_w30",
    "songs_trend_w7_w30",
    "songs_trend_w14_w30",
    "skip_trend_w7_w30",
    "completion_trend_w7_w30",
    # "days_since_last_payment",
    # "days_since_last_cancel",
    "last_plan_days",
    "total_payment_count",
    "total_amount_paid",
    "avg_amount_per_payment",
    "unique_plan_count",
    "subscription_months_est",
    "payment_count_last_30d",
    "payment_count_last_90d",
]

FEATURE_COLS = CATEGORICAL_COLS + NUMERICAL_COLS

X = df[FEATURE_COLS].copy()
y = df[TARGET_COL].astype(int).copy()


## 1. Train / Test Split


In [6]:
from sklearn.model_selection import train_test_split

X = df[FEATURE_COLS]
y = df[TARGET_COL].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

print(X_train.shape, X_test.shape)


(688772, 79) (172194, 79)


## 2. Column Groups


In [7]:
cat_cols = [c for c in CATEGORICAL_COLS if c in X_train.columns]
num_cols = [c for c in NUMERICAL_COLS if c in X_train.columns]

print(f"num_cols: {len(num_cols)}")
print(f"cat_cols: {len(cat_cols)}")


num_cols: 72
cat_cols: 7


## 3. Preprocessing (OHE only)


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

X_train_t = preprocess.fit_transform(X_train)
X_test_t = preprocess.transform(X_test)


## 4. Random Forest Classifier Model


In [9]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=800,
    max_depth=None,
    max_features="sqrt",
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight="balanced",
    n_jobs=-1,
    random_state=RANDOM_STATE,
)


## 5. Train Model


In [None]:
rf_model.fit(X_train_t, y_train)


## 6. Test Evaluation


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    average_precision_score,
    recall_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
)

y_proba = rf_model.predict_proba(X_test_t)[:, 1]
y_pred  = rf_model.predict(X_test_t)

print(f"PR-AUC: {average_precision_score(y_test, y_proba):.4f}")
print(f"Recall (Churn): {recall_score(y_test, y_pred):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


NameError: name 'rf_model' is not defined

## 7. Confusion Matrix


In [None]:
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Pred 0", "Pred 1"])


Unnamed: 0,Pred 0,Pred 1
Actual 0,154317,1587
Actual 1,3072,13218


## 8. Classification Report


In [None]:
print(classification_report(y_test, y_pred, digits=4))


              precision    recall  f1-score   support

           0     0.9805    0.9898    0.9851    155904
           1     0.8928    0.8114    0.8502     16290

    accuracy                         0.9729    172194
   macro avg     0.9366    0.9006    0.9176    172194
weighted avg     0.9722    0.9729    0.9724    172194



## 9. Feature Importance (Impurity Decrease)


In [None]:
feature_names = preprocess.get_feature_names_out()
importances = rf_model.feature_importances_

imp_rf_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances,
}).sort_values("importance", ascending=False)

imp_rf_df


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


NameError: name 'y_test' is not defined

In [None]:
# imp_rf_df.to_csv(
#     "../data/model_df/rf_feature_importance.csv",
#     index=False
# )
