In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_parquet('data/features_v2.parquet')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 860966 entries, 0 to 860965
Data columns (total 86 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   msno                     860966 non-null  string  
 1   city                     860966 non-null  int8    
 2   gender                   860966 non-null  category
 3   registered_via           860966 non-null  int8    
 4   bd_clean                 386394 non-null  float32 
 5   is_churn                 860966 non-null  Int8    
 6   num_days_active_w7       860966 non-null  UInt8   
 7   total_secs_w7            860966 non-null  float32 
 8   avg_secs_per_day_w7      860966 non-null  float32 
 9   std_secs_w7              860966 non-null  float32 
 10  num_songs_w7             860966 non-null  UInt16  
 11  avg_songs_per_day_w7     860966 non-null  float32 
 12  num_unq_w7               860966 non-null  UInt16  
 13  num_25_w7                860966 non-null  UI

In [3]:
def check_missing(df, name):
    print(f'[{name}]')
    missing = df.isnull().sum()
    missing = missing[missing > 0].sort_values(ascending=False)
    print(missing)
    print('-' * 40)

check_missing(df, 'df')

[df]
bd_clean                   474572
days_since_last_payment     35599
days_since_last_cancel      35599
last_plan_days              35599
last_payment_method         35599
dtype: int64
----------------------------------------


In [None]:
RANDOM_STATE = 719
np.random.seed(RANDOM_STATE)

D_COL = "msno"
TARGET_COL = "is_churn"

CATEGORICAL_COLS = [
    "city", "gender", "registered_via", "last_payment_method",
    "has_ever_paid", "has_ever_cancelled", "is_auto_renew_last", "is_free_user",
]

NUMERICAL_COLS = [
    "bd_clean", "reg_days",
    "num_days_active_w7", "total_secs_w7", "avg_secs_per_day_w7", "std_secs_w7",
    "num_songs_w7", "avg_songs_per_day_w7", "num_unq_w7", "num_25_w7", "num_100_w7",
    "short_play_w7", "skip_ratio_w7", "completion_ratio_w7", "short_play_ratio_w7", "variety_ratio_w7",
    "num_days_active_w14", "total_secs_w14", "avg_secs_per_day_w14", "std_secs_w14",
    "num_songs_w14", "avg_songs_per_day_w14", "num_unq_w14", "num_25_w14", "num_100_w14",
    "short_play_w14", "skip_ratio_w14", "completion_ratio_w14", "short_play_ratio_w14", "variety_ratio_w14",
    "num_days_active_w21", "total_secs_w21", "avg_secs_per_day_w21", "std_secs_w21",
    "num_songs_w21", "avg_songs_per_day_w21", "num_unq_w21", "num_25_w21", "num_100_w21",
    "short_play_w21", "skip_ratio_w21", "completion_ratio_w21", "short_play_ratio_w21", "variety_ratio_w21",
    "num_days_active_w30", "total_secs_w30", "avg_secs_per_day_w30", "std_secs_w30",
    "num_songs_w30", "avg_songs_per_day_w30", "num_unq_w30", "num_25_w30", "num_100_w30",
    "short_play_w30", "skip_ratio_w30", "completion_ratio_w30", "short_play_ratio_w30", "variety_ratio_w30",
    "secs_trend_w7_w30", "secs_trend_w14_w30", "days_trend_w7_w14", "days_trend_w7_w30",
    "songs_trend_w7_w30", "songs_trend_w14_w30", "skip_trend_w7_w30", "completion_trend_w7_w30",
    "days_since_last_payment", "days_since_last_cancel", "last_plan_days",
    "total_payment_count", "total_amount_paid", "avg_amount_per_payment",
    "unique_plan_count", "subscription_months_est", "payment_count_last_30d", "payment_count_last_90d",
]

FEATURE_COLS = CATEGORICAL_COLS + NUMERICAL_COLS

X = df[FEATURE_COLS].copy()
y = df[TARGET_COL].astype(int).copy()
print(X)

        city   gender  registered_via  last_payment_method  has_ever_paid  \
0          1  unknown               7                 41.0              1   
1          4     male               9                 39.0              1   
2         13     male               9                 40.0              1   
3          1  unknown               7                 41.0              1   
4          4   female               9                 36.0              1   
...      ...      ...             ...                  ...            ...   
860961     1  unknown               7                 41.0              1   
860962     1  unknown               7                 41.0              1   
860963     1  unknown               7                 41.0              1   
860964     6   female               7                 41.0              1   
860965     1  unknown               7                 41.0              1   

        has_ever_cancelled  is_auto_renew_last  is_free_user  bd_clean  \
0

In [7]:
print(y.value_counts())

is_churn
0    779518
1     81448
Name: count, dtype: int64


In [10]:
%pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [13]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# categorical dtype 변환
for col in CATEGORICAL_COLS:
    X[col] = X[col].astype("category")

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

lgb_train = lgb.Dataset(
    X_train, y_train,
    categorical_feature=CATEGORICAL_COLS,
    free_raw_data=False
)

lgb_valid = lgb.Dataset(
    X_valid, y_valid,
    categorical_feature=CATEGORICAL_COLS,
    free_raw_data=False
)

params = {
    "objective": "binary",
    "metric": "AUCPR",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "max_depth": -1,
    "min_data_in_leaf": 100,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 1.0,
    "lambda_l2": 1.0,
    "seed": RANDOM_STATE,
    "verbosity": -1
}

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=["train", "valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)

# 평가
valid_pred = model.predict(X_valid)
auc = roc_auc_score(y_valid, valid_pred)
print("Valid AUC:", auc)



ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

In [7]:
import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score

proba = model.predict(X_valid)

precision, recall, thresholds = precision_recall_curve(y_valid, proba)

f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
best_idx = np.argmax(f1_scores)

best_threshold = thresholds[best_idx]
print("Best threshold:", best_threshold)
print("Precision:", precision[best_idx])
print("Recall:", recall[best_idx])
print("F1:", f1_scores[best_idx])


Best threshold: 0.4575809392725924
Precision: 0.8759155803848542
Recall: 0.8662369551872314
F1: 0.8710493822160648


In [9]:
from sklearn.metrics import confusion_matrix

y_pred = (proba >= best_threshold).astype(int)

cm = confusion_matrix(y_valid, y_pred)
cm = pd.DataFrame(cm)
cm


Unnamed: 0,0,1
0,153905,1999
1,2179,14111


In [10]:
import pandas as pd

fi = pd.DataFrame({
    "feature": model.feature_name(),
    "importance": model.feature_importance(importance_type="gain")
}).sort_values("importance", ascending=False)

fi.head(20)


Unnamed: 0,feature,importance
75,days_since_last_cancel,1030906.0
76,last_plan_days,473484.7
74,days_since_last_payment,260884.2
3,last_payment_method,145156.9
83,payment_count_last_90d,117852.8
5,has_ever_cancelled,106729.0
78,total_amount_paid,106215.1
6,is_auto_renew_last,100182.3
77,total_payment_count,86767.94
81,subscription_months_est,61378.75


In [11]:
# 예시: 기준 날짜 기준 분할
train_df = df[df["base_date"] < "2017-03-01"]
valid_df = df[df["base_date"] >= "2017-03-01"]


KeyError: 'base_date'

In [12]:
df["churn_proba"] = model.predict(df[FEATURE_COLS])
df["churn_pred"] = (df["churn_proba"] >= best_threshold).astype(int)

# 이탈 위험 TOP 1%
high_risk = df.sort_values("churn_proba", ascending=False).head(int(len(df)*0.01))


ValueError: train and valid dataset categorical_feature do not match.