<a href="https://colab.research.google.com/github/minji0620/sk_broadband/blob/main/ML_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the processed IPTV dataset; the CSV is decoded with the cp949 codec.
df = pd.read_csv("/content/iptv_data_processed(28000).csv", encoding='cp949')
# Alternative input file (the "50000" variant), kept for switching datasets by hand:
#df = pd.read_csv("/content/iptv_data_processed(50000).csv", encoding='cp949')

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pickle

In [None]:
# 1. Hold out a test set: 20% of the rows, stratified on the target column
#    "iptv_yn" so the class ratio is kept in both parts. Every other column
#    of df is used as a feature.
X_total = df.drop(columns=["iptv_yn"])
y_total = df["iptv_yn"]
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_total, y_total, test_size=0.2, stratify=y_total, random_state=42
)

# 2. K-fold setup: 5 shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

모델 성능 평가 함수 코드

In [None]:
def evaluate_xgb_model(params, X_trainval, y_trainval, X_test, y_test, cv=5, verbose=True):
    """Cross-validate an XGBClassifier configuration, then score it on the test set.

    For every stratified fold a fresh ``XGBClassifier(**params)`` is trained and
    its per-class precision / recall / F1 / support and its ROC AUC on the
    validation part are recorded. Afterwards a final model is trained on the
    complete train+validation data and evaluated on the test set. The trained
    models are local to this function and are not returned.

    The target is expected to be binary with labels 0 and 1: per-class metrics
    are read from the ``'0'`` and ``'1'`` entries of the classification report
    and column 1 of ``predict_proba`` is used for ROC AUC.

    Parameters:
    - params: dict of keyword arguments passed to ``XGBClassifier``
    - X_trainval, y_trainval: train+validation features / target (indexed with ``.iloc``)
    - X_test, y_test: hold-out features / target
    - cv: number of stratified folds (shuffled, seed 42)
    - verbose: when True, print the fold averages and the test-set report

    Returns:
    - dict with two entries:
        "cv":   fold-averaged metrics ("class_0", "class_1", "macro_avg",
                "weighted_avg", "roc_auc")
        "test": {"classification_report": <dict report>, "roc_auc": <float>}
    """
    class_labels = ("0", "1")
    metric_keys = ("precision", "recall", "f1-score", "support")
    # per_fold[label][metric] -> list with one value per fold
    per_fold = {label: {key: [] for key in metric_keys} for label in class_labels}
    roc_auc_list = []

    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    for train_idx, val_idx in skf.split(X_trainval, y_trainval):
        X_train, X_val = X_trainval.iloc[train_idx], X_trainval.iloc[val_idx]
        y_train, y_val = y_trainval.iloc[train_idx], y_trainval.iloc[val_idx]

        model = XGBClassifier(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        y_proba = model.predict_proba(X_val)[:, 1]
        report = classification_report(y_val, y_pred, output_dict=True)

        for label in class_labels:
            for key in metric_keys:
                per_fold[label][key].append(report[label][key])
        roc_auc_list.append(roc_auc_score(y_val, y_proba))

    # Fold averages
    total_support_0 = np.mean(per_fold["0"]["support"])
    total_support_1 = np.mean(per_fold["1"]["support"])
    total_support = total_support_0 + total_support_1

    avg_p0, avg_r0, avg_f0 = (
        np.mean(per_fold["0"][key]) for key in ("precision", "recall", "f1-score")
    )
    avg_p1, avg_r1, avg_f1 = (
        np.mean(per_fold["1"][key]) for key in ("precision", "recall", "f1-score")
    )
    cv_roc_auc = np.mean(roc_auc_list)

    macro_precision = (avg_p0 + avg_p1) / 2
    macro_recall = (avg_r0 + avg_r1) / 2
    macro_f1 = (avg_f0 + avg_f1) / 2

    # Weighted averages combine the fold-averaged class metrics using the
    # mean class supports as weights.
    weighted_precision = (avg_p0 * total_support_0 + avg_p1 * total_support_1) / total_support
    weighted_recall = (avg_r0 * total_support_0 + avg_r1 * total_support_1) / total_support
    weighted_f1 = (avg_f0 * total_support_0 + avg_f1 * total_support_1) / total_support

    # Cross-validation summary (the banner reports the actual number of folds)
    if verbose:
        print(f"📄 HPT 모델 성능 ({cv}-Fold 평균 기반)")
        print(f"Class 0 - Precision: {avg_p0:.4f}, Recall: {avg_r0:.4f}, F1: {avg_f0:.4f}")
        print(f"Class 1 - Precision: {avg_p1:.4f}, Recall: {avg_r1:.4f}, F1: {avg_f1:.4f}")
        print(f"Macro Avg     - Precision: {macro_precision:.4f}, Recall: {macro_recall:.4f}, F1: {macro_f1:.4f}")
        print(f"Weighted Avg  - Precision: {weighted_precision:.4f}, Recall: {weighted_recall:.4f}, F1: {weighted_f1:.4f}")
        print(f"ROC AUC 평균: {cv_roc_auc:.4f}")

    # Final model: train on all train+validation data, evaluate on the test set
    final_model = XGBClassifier(**params)
    final_model.fit(X_trainval, y_trainval)
    y_test_pred = final_model.predict(X_test)
    y_test_proba = final_model.predict_proba(X_test)[:, 1]

    # Computed once and reused for both the printout and the return value
    test_report = classification_report(y_test, y_test_pred, output_dict=True)
    test_roc_auc = roc_auc_score(y_test, y_test_proba)

    if verbose:
        print("\n📊 HPT 모델 성능 (Test Set)")
        print(classification_report(y_test, y_test_pred))
        print(f"ROC AUC Score: {test_roc_auc:.4f}")

    return {
        "cv": {
            "class_0": {"precision": avg_p0, "recall": avg_r0, "f1": avg_f0},
            "class_1": {"precision": avg_p1, "recall": avg_r1, "f1": avg_f1},
            "macro_avg": {"precision": macro_precision, "recall": macro_recall, "f1": macro_f1},
            "weighted_avg": {"precision": weighted_precision, "recall": weighted_recall, "f1": weighted_f1},
            "roc_auc": cv_roc_auc
        },
        "test": {
            "classification_report": test_report,
            "roc_auc": test_roc_auc
        }
    }

모델 피클 파일 저장 함수

In [None]:
def save_model_and_results(model, scores, filename, selected_features = "All Selected"):
    """
    Bundle a model, its scores and its feature list into a single pickle file.

    Parameters:
    - model: model object to store (e.g. an XGBClassifier)
    - scores: dict of performance scores (cv, test, ... - any layout)
    - filename: destination path including the file name (.pkl)
    - selected_features: list of selected feature names
      (defaults to the string "All Selected")

    Returns:
    - the ``filename`` that was written
    """
    bundle = dict(
        model=model,
        scores=scores,
        selected_features=selected_features,
    )

    with open(filename, "wb") as out_file:
        pickle.dump(bundle, out_file)

    return filename

기본 모델 성능 확인

In [None]:
# Baseline model (no hyperparameter tuning).
# 'use_label_encoder' is left out: the installed XGBoost reports it as unused
# ('Parameters: { "use_label_encoder" } are not used.').
base_params = {'eval_metric': 'logloss', 'random_state': 42}

# Build the estimator from the same dict that is evaluated below, and fit it on
# the train+validation data so that the object pickled afterwards is a trained
# model (evaluate_xgb_model trains its own internal models and does not return them).
base_model = XGBClassifier(**base_params)
base_model.fit(X_trainval, y_trainval)

base_results = evaluate_xgb_model(base_params, X_trainval, y_trainval, X_test, y_test)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



📄 HPT 모델 성능 (5-Fold 평균 기반)
Class 0 - Precision: 0.5941, Recall: 0.3564, F1: 0.4454
Class 1 - Precision: 0.9148, Recall: 0.9660, F1: 0.9397
Macro Avg     - Precision: 0.7545, Recall: 0.6612, F1: 0.6925
Weighted Avg  - Precision: 0.8754, Recall: 0.8912, F1: 0.8790
ROC AUC 평균: 0.8920


Parameters: { "use_label_encoder" } are not used.




📊 HPT 모델 성능 (Test Set)
              precision    recall  f1-score   support

           0       0.59      0.36      0.45      1028
           1       0.92      0.96      0.94      7353

    accuracy                           0.89      8381
   macro avg       0.75      0.66      0.69      8381
weighted avg       0.88      0.89      0.88      8381

ROC AUC Score: 0.8941


In [None]:
# Persist the baseline estimator object together with its CV/test scores.
save_model_and_results(base_model, base_results, "/content/xgb_base_model_50.pkl")

'/content/xgb_base_model_50.pkl'

In [None]:
!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1


In [None]:
# First-round tuning
def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of one sampled configuration.

    Samples eight XGBoost hyperparameters from ``trial``, adds the fixed
    settings, and scores the resulting classifier on the module-level
    ``X_trainval`` / ``y_trainval`` with the module-level ``skf`` splitter.
    """
    # Sampled hyperparameters (the suggest_* calls run in the order written).
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 15),
        gamma=trial.suggest_float('gamma', 0, 5),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        scale_pos_weight=trial.suggest_float('scale_pos_weight', 0.3, 1.0),
    )
    # Settings that are identical for every trial.
    fixed = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)

    candidate = XGBClassifier(**sampled, **fixed)
    fold_auc = cross_val_score(
        candidate, X_trainval, y_trainval, cv=skf, scoring='roc_auc', n_jobs=-1
    )
    return np.mean(fold_auc)

# Create the study and run it: maximize the objective over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print the best result.
print("✅ Best trial:")
print("AUC:", study.best_value)
print("Params:", study.best_params)

[I 2025-04-10 08:20:08,799] A new study created in memory with name: no-name-690cb959-1cc3-4865-8368-de73618f6003
[I 2025-04-10 08:20:11,998] Trial 0 finished with value: 0.9003340660641801 and parameters: {'n_estimators': 361, 'max_depth': 3, 'learning_rate': 0.2245396976117129, 'min_child_weight': 12, 'gamma': 2.949878668171184, 'subsample': 0.6842126027873047, 'colsample_bytree': 0.7931639640450332, 'scale_pos_weight': 0.47684024459226276}. Best is trial 0 with value: 0.9003340660641801.
[I 2025-04-10 08:20:14,232] Trial 1 finished with value: 0.9002938304055681 and parameters: {'n_estimators': 369, 'max_depth': 4, 'learning_rate': 0.1747334276231886, 'min_child_weight': 1, 'gamma': 3.403093062004362, 'subsample': 0.9008727877455318, 'colsample_bytree': 0.955758753259873, 'scale_pos_weight': 0.5338882825896373}. Best is trial 0 with value: 0.9003340660641801.
[I 2025-04-10 08:20:15,769] Trial 2 finished with value: 0.880272533058787 and parameters: {'n_estimators': 348, 'max_depth':

✅ Best trial:
AUC: 0.9018574284282022
Params: {'n_estimators': 285, 'max_depth': 4, 'learning_rate': 0.08159743695042095, 'min_child_weight': 4, 'gamma': 1.2449394970977086, 'subsample': 0.8915439147949505, 'colsample_bytree': 0.9141412503037608, 'scale_pos_weight': 0.5843248205910467}


In [None]:
# Rank the trials by objective value (AUC), best first.
# Trials that carry no value (e.g. failed trials report None) are skipped,
# because None cannot be ordered against floats while sorting.
scored_trials = [t for t in study.trials if t.value is not None]
sorted_trials = sorted(scored_trials, key=lambda x: x.value, reverse=True)

# Print the top 5.
for i, trial in enumerate(sorted_trials[:5], 1):
    print(f"🔹 {i}등 Trial")
    print(f"  AUC: {trial.value}")
    print(f"  Params: {trial.params}\n")

🔹 1등 Trial
  AUC: 0.9018574284282022
  Params: {'n_estimators': 285, 'max_depth': 4, 'learning_rate': 0.08159743695042095, 'min_child_weight': 4, 'gamma': 1.2449394970977086, 'subsample': 0.8915439147949505, 'colsample_bytree': 0.9141412503037608, 'scale_pos_weight': 0.5843248205910467}

🔹 2등 Trial
  AUC: 0.9017685335258229
  Params: {'n_estimators': 273, 'max_depth': 4, 'learning_rate': 0.07675471539146558, 'min_child_weight': 4, 'gamma': 1.2281977200793799, 'subsample': 0.8713934219411413, 'colsample_bytree': 0.9399599378392961, 'scale_pos_weight': 0.9751019184344335}

🔹 3등 Trial
  AUC: 0.9017352366332195
  Params: {'n_estimators': 195, 'max_depth': 4, 'learning_rate': 0.09563940358400486, 'min_child_weight': 14, 'gamma': 1.242268669141998, 'subsample': 0.8881746398775934, 'colsample_bytree': 0.9204006284324615, 'scale_pos_weight': 0.6308914507048188}

🔹 4등 Trial
  AUC: 0.901733509970344
  Params: {'n_estimators': 421, 'max_depth': 3, 'learning_rate': 0.1056657294942845, 'min_child_w

In [None]:
# Hard-coded hyperparameter sets for five XGBoost candidates.
# params_1..params_3 equal the parameters printed for the 1st-3rd ranked trials
# of the first-round study; params_4 matches the visible part of the 4th one.
# NOTE(review): params_5 is presumably the 5th-ranked trial - its printout is
# not visible here, confirm against the study output.
params_1={
    'n_estimators': 285,
    'max_depth': 4,
    'learning_rate': 0.08159743695042095,
    'min_child_weight': 4,
    'gamma': 1.2449394970977086,
    'subsample': 0.8915439147949505,
    'colsample_bytree': 0.9141412503037608,
    'scale_pos_weight': 0.5843248205910467
}
params_2 = {
    'n_estimators': 273,
    'max_depth': 4,
    'learning_rate': 0.07675471539146558,
    'min_child_weight': 4,
    'gamma': 1.2281977200793799,
    'subsample': 0.8713934219411413,
    'colsample_bytree': 0.9399599378392961,
    'scale_pos_weight': 0.9751019184344335
}
params_3={
    'n_estimators': 195,
    'max_depth': 4,
    'learning_rate': 0.09563940358400486,
    'min_child_weight': 14,
    'gamma': 1.242268669141998,
    'subsample': 0.8881746398775934,
    'colsample_bytree': 0.9204006284324615,
    'scale_pos_weight': 0.6308914507048188
}
params_4={
    'n_estimators': 421,
    'max_depth': 3,
    'learning_rate': 0.1056657294942845,
    'min_child_weight': 11,
    'gamma': 1.537445767391795,
    'subsample': 0.8931164133098327,
    'colsample_bytree': 0.9290710442530649,
    'scale_pos_weight': 0.6659544711836847
}
params_5={
    'n_estimators': 337,
    'max_depth': 4,
    'learning_rate': 0.09053755944554247,
    'min_child_weight': 6,
    'gamma': 1.692728726249683,
    'subsample': 0.8818022525877556,
    'colsample_bytree': 0.8830792897746043,
    'scale_pos_weight': 0.9506109879562626
}

In [None]:
# Candidate taken from the Optuna study run with n_trials=100.
# The dict has to be unpacked into keyword arguments; handing it over as a
# single positional argument does not set the hyperparameters.
xgb_5 = XGBClassifier(**params_5)
# Fit before saving so the pickle holds a trained model: evaluate_xgb_model
# trains its own internal models and does not return them.
xgb_5.fit(X_trainval, y_trainval)

results_5 = evaluate_xgb_model(params_5, X_trainval, y_trainval, X_test, y_test)

save_model_and_results(xgb_5, results_5, "/content/xgb_5(50).pkl")



📄 HPT 모델 성능 (5-Fold 평균 기반)
Class 0 - Precision: 0.6304, Recall: 0.3287, F1: 0.4320
Class 1 - Precision: 0.9120, Recall: 0.9730, F1: 0.9415
Macro Avg     - Precision: 0.7712, Recall: 0.6509, F1: 0.6867
Weighted Avg  - Precision: 0.8775, Recall: 0.8940, F1: 0.8790
ROC AUC 평균: 0.9019

📊 HPT 모델 성능 (Test Set)
              precision    recall  f1-score   support

           0       0.61      0.33      0.43      1028
           1       0.91      0.97      0.94      7353

    accuracy                           0.89      8381
   macro avg       0.76      0.65      0.69      8381
weighted avg       0.88      0.89      0.88      8381

ROC AUC Score: 0.9026


'/content/xgb_5(50).pkl'

In [None]:
# Objective function (second-round tuning)
def objective_2nd(trial):
    """Optuna objective for the narrowed search: mean cross-validated ROC AUC.

    Same procedure as the first round, but every hyperparameter is sampled
    from a tighter interval. Uses the module-level ``X_trainval``,
    ``y_trainval`` and ``skf``.
    """
    # Sampled hyperparameters (the suggest_* calls run in the order written).
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 287, 387),  # ±50
        max_depth=trial.suggest_int('max_depth', 3, 5),
        learning_rate=trial.suggest_float('learning_rate', 0.060, 0.120, log=True),
        min_child_weight=trial.suggest_int('min_child_weight', 4, 10),
        gamma=trial.suggest_float('gamma', 1.2, 2.2),
        subsample=trial.suggest_float('subsample', 0.84, 0.92),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.83, 0.93),
        scale_pos_weight=trial.suggest_float('scale_pos_weight', 0.85, 1.05),
    )
    # Settings that are identical for every trial.
    fixed = dict(random_state=42, use_label_encoder=False, eval_metric='logloss')

    candidate = XGBClassifier(**sampled, **fixed)
    fold_auc = cross_val_score(
        candidate, X_trainval, y_trainval, cv=skf, scoring='roc_auc', n_jobs=-1
    )
    return np.mean(fold_auc)

# Run the second-round study (this rebinds `study`, replacing the earlier study object).
study = optuna.create_study(direction='maximize')
study.optimize(objective_2nd, n_trials=100)

# Print the result.
print("🎯 2차 Best trial:")
print("AUC:", study.best_value)
print("Params:", study.best_params)

[I 2025-04-10 09:09:23,433] A new study created in memory with name: no-name-485c2a36-4c02-42fc-bfcf-2902da3c7fca
[I 2025-04-10 09:09:24,589] Trial 0 finished with value: 0.9014868679220029 and parameters: {'n_estimators': 355, 'max_depth': 5, 'learning_rate': 0.06171474173141282, 'min_child_weight': 10, 'gamma': 1.8220276021698438, 'subsample': 0.8537217441569678, 'colsample_bytree': 0.8671866991628727, 'scale_pos_weight': 0.9804981788989071}. Best is trial 0 with value: 0.9014868679220029.
[I 2025-04-10 09:09:25,641] Trial 1 finished with value: 0.9010934295300677 and parameters: {'n_estimators': 323, 'max_depth': 4, 'learning_rate': 0.08670714144524305, 'min_child_weight': 6, 'gamma': 1.2869947080299151, 'subsample': 0.8583832408163823, 'colsample_bytree': 0.8905265247750229, 'scale_pos_weight': 0.9323128293557413}. Best is trial 0 with value: 0.9014868679220029.
[I 2025-04-10 09:09:26,731] Trial 2 finished with value: 0.9016055876770057 and parameters: {'n_estimators': 372, 'max_de

🎯 2차 Best trial:
AUC: 0.9019473166354693
Params: {'n_estimators': 349, 'max_depth': 4, 'learning_rate': 0.08623486667008952, 'min_child_weight': 5, 'gamma': 1.5646162778661707, 'subsample': 0.8718620060407601, 'colsample_bytree': 0.871407684824421, 'scale_pos_weight': 0.9370579241954289}


In [None]:
# Display the best parameter set of the study currently bound to `study`.
study.best_params

{'n_estimators': 349,
 'max_depth': 4,
 'learning_rate': 0.08623486667008952,
 'min_child_weight': 5,
 'gamma': 1.5646162778661707,
 'subsample': 0.8718620060407601,
 'colsample_bytree': 0.871407684824421,
 'scale_pos_weight': 0.9370579241954289}

In [None]:
# Second-round candidate built from the study's best sampled parameters.
# NOTE(review): study.best_params holds only the sampled values; the fixed
# settings used inside objective_2nd (random_state=42, eval_metric='logloss')
# are not part of it - confirm whether they should be added here.
xgb_52 = XGBClassifier(**study.best_params)
# Fit before saving so the pickle holds a trained model: evaluate_xgb_model
# trains its own internal models and does not return them.
xgb_52.fit(X_trainval, y_trainval)

# Performance evaluation
results_52 = evaluate_xgb_model(
    study.best_params,
    X_trainval,
    y_trainval,
    X_test,
    y_test
)

save_model_and_results(xgb_52, results_52, "/content/xgb_52(50).pkl")

📄 HPT 모델 성능 (5-Fold 평균 기반)
Class 0 - Precision: 0.6288, Recall: 0.3431, F1: 0.4437
Class 1 - Precision: 0.9136, Recall: 0.9717, F1: 0.9418
Macro Avg     - Precision: 0.7712, Recall: 0.6574, F1: 0.6927
Weighted Avg  - Precision: 0.8787, Recall: 0.8945, F1: 0.8806
ROC AUC 평균: 0.9014

📊 HPT 모델 성능 (Test Set)
              precision    recall  f1-score   support

           0       0.61      0.34      0.44      1028
           1       0.91      0.97      0.94      7353

    accuracy                           0.89      8381
   macro avg       0.76      0.65      0.69      8381
weighted avg       0.88      0.89      0.88      8381

ROC AUC Score: 0.9032


'/content/xgb_52(50).pkl'

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Read each saved bundle and pull out its "scores" entry.
data = _load_pickle('/content/xgb_base.pkl')
scores = data['scores']

data1 = _load_pickle('/content/xgb_1-1.pkl')
scores1 = data1['scores']

data2 = _load_pickle('/content/xgb_2-1.pkl')
scores2 = data2['scores']

data3 = _load_pickle('/content/xgb_3-1.pkl')
scores3 = data3['scores']

data4 = _load_pickle('/content/xgb_4-1.pkl')
scores4 = data4['scores']

data5 = _load_pickle('/content/xgb_5-1.pkl')
scores5 = data5['scores']

data6 = _load_pickle('/content/xgb_sv.pkl')
scores_sv = data6['scores']

# with open('/content/xgb_sv135(50).pkl', 'rb') as f:
#     data7 = pickle.load(f)
#     scores135 = data7['scores']

# with open('/content/xgb_sv345(50).pkl', 'rb') as f:
#     data8 = pickle.load(f)
#     scores345 = data8['scores']

# Print the score dicts in the order they were loaded.
for loaded_scores in (scores, scores1, scores2, scores3, scores4, scores5, scores_sv):
    print(loaded_scores)

{'cv': {'class_0': {'precision': np.float64(0.581960848704254), 'recall': np.float64(0.3532743242484176), 'f1': np.float64(0.43948547725213016)}, 'class_1': {'precision': np.float64(0.8447719779802265), 'recall': np.float64(0.9327832892013384), 'f1': np.float64(0.8865907798880842)}, 'macro_avg': {'precision': np.float64(0.7133664133422403), 'recall': np.float64(0.6430288067248779), 'f1': np.float64(0.6630381285701072)}, 'weighted_avg': {'precision': np.float64(0.7897061514538855), 'recall': np.float64(0.8113609574208308), 'f1': np.float64(0.7929104890199398)}, 'roc_auc': np.float64(0.7933351859083697)}, 'test': {'classification_report': {'0': {'precision': 0.6114754098360655, 'recall': 0.36284046692607, 'f1-score': 0.4554334554334554, 'support': 1028.0}, '1': {'precision': 0.847603536528618, 'recall': 0.9389175257731959, 'f1-score': 0.8909268769870384, 'support': 3880.0}, 'accuracy': 0.8182559087204564, 'macro avg': {'precision': 0.7295394731823417, 'recall': 0.650878996349633, 'f1-sco

In [None]:
# Read the three saved bundles and keep the stored "model" object of each.
with open('/content/xgb_sv123(50).pkl', 'rb') as bundle_file:
    data6 = pickle.load(bundle_file)
model123 = data6['model']

with open('/content/xgb_sv135(50).pkl', 'rb') as bundle_file:
    data7 = pickle.load(bundle_file)
model135 = data7['model']

with open('/content/xgb_sv345(50).pkl', 'rb') as bundle_file:
    data8 = pickle.load(bundle_file)
model345 = data8['model']

In [None]:
# NOTE(review): `model_sv` is not assigned anywhere in the visible code -
# confirm where it is defined before relying on this cell.
model_sv

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble over already-trained models (e.g. model4, model5, model3).
# NOTE(review): model3 / model4 / model5 are not assigned in the visible code -
# confirm where they are defined.
soft_voting_model = VotingClassifier(
    estimators=[('model4', model4), ('model5', model5), ('model3', model3)],
    voting='soft'
)

In [None]:
def evaluate_model_general(model, X_trainval, y_trainval, X_test, y_test, cv=5):
    """Cross-validate an arbitrary classifier, then score it on the test set.

    The given ``model`` is fitted in place on every stratified fold, and its
    per-class precision / recall / F1 / support and ROC AUC on the validation
    part are recorded. Finally the same object is fitted on the complete
    train+validation data and evaluated on the test set, so on return
    ``model`` is left fitted on ``X_trainval`` / ``y_trainval``.

    The target is expected to be binary with labels 0 and 1: per-class metrics
    are read from the ``'0'`` and ``'1'`` entries of the classification report
    and column 1 of ``predict_proba`` is used for ROC AUC.

    Parameters:
    - model: estimator providing ``fit``, ``predict`` and ``predict_proba``
    - X_trainval, y_trainval: train+validation features / target (indexed with ``.iloc``)
    - X_test, y_test: hold-out features / target
    - cv: number of stratified folds (shuffled, seed 42)

    Returns:
    - dict with two entries:
        "cv":   fold-averaged metrics ("class_0", "class_1", "macro_avg",
                "weighted_avg", "roc_auc")
        "test": {"classification_report": <dict report>, "roc_auc": <float>}
    """
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Per-fold results
    precision_0, recall_0, f1_0, support_0 = [], [], [], []
    precision_1, recall_1, f1_1, support_1 = [], [], [], []
    roc_auc_list = []

    for train_idx, val_idx in skf.split(X_trainval, y_trainval):
        X_train, X_val = X_trainval.iloc[train_idx], X_trainval.iloc[val_idx]
        y_train, y_val = y_trainval.iloc[train_idx], y_trainval.iloc[val_idx]

        # Train on this fold
        model.fit(X_train, y_train)

        # Hard predictions and positive-class probabilities
        y_pred = model.predict(X_val)
        y_proba = model.predict_proba(X_val)[:, 1]

        report = classification_report(y_val, y_pred, output_dict=True)
        auc = roc_auc_score(y_val, y_proba)

        # Per-class metrics. Support is recorded here as well: it depends only
        # on the true labels of the fold, so there is no need to split and
        # predict again after the loop.
        precision_0.append(report['0']['precision'])
        recall_0.append(report['0']['recall'])
        f1_0.append(report['0']['f1-score'])
        support_0.append(report['0']['support'])

        precision_1.append(report['1']['precision'])
        recall_1.append(report['1']['recall'])
        f1_1.append(report['1']['f1-score'])
        support_1.append(report['1']['support'])

        roc_auc_list.append(auc)

    # Fold averages
    avg_p0, avg_r0, avg_f0 = np.mean(precision_0), np.mean(recall_0), np.mean(f1_0)
    avg_p1, avg_r1, avg_f1 = np.mean(precision_1), np.mean(recall_1), np.mean(f1_1)
    cv_roc_auc = np.mean(roc_auc_list)

    total_support_0 = np.mean(support_0)
    total_support_1 = np.mean(support_1)
    total_support = total_support_0 + total_support_1

    macro_precision = (avg_p0 + avg_p1) / 2
    macro_recall = (avg_r0 + avg_r1) / 2
    macro_f1 = (avg_f0 + avg_f1) / 2

    # Weighted averages combine the fold-averaged class metrics using the
    # mean class supports as weights.
    weighted_precision = (avg_p0 * total_support_0 + avg_p1 * total_support_1) / total_support
    weighted_recall = (avg_r0 * total_support_0 + avg_r1 * total_support_1) / total_support
    weighted_f1 = (avg_f0 * total_support_0 + avg_f1 * total_support_1) / total_support

    # The banner reports the actual number of folds
    print(f"📄 모델 성능 ({cv}-Fold 평균 기반)")
    print(f"Class 0 - Precision: {avg_p0:.4f}, Recall: {avg_r0:.4f}, F1: {avg_f0:.4f}")
    print(f"Class 1 - Precision: {avg_p1:.4f}, Recall: {avg_r1:.4f}, F1: {avg_f1:.4f}")
    print(f"Macro Avg     - Precision: {macro_precision:.4f}, Recall: {macro_recall:.4f}, F1: {macro_f1:.4f}")
    print(f"Weighted Avg  - Precision: {weighted_precision:.4f}, Recall: {weighted_recall:.4f}, F1: {weighted_f1:.4f}")
    print(f"ROC AUC 평균: {cv_roc_auc:.4f}")

    # Test-set evaluation: refit on all train+validation data
    model.fit(X_trainval, y_trainval)
    y_test_pred = model.predict(X_test)
    y_test_proba = model.predict_proba(X_test)[:, 1]

    # Computed once and reused for both the printout and the return value
    test_report = classification_report(y_test, y_test_pred, output_dict=True)
    test_roc_auc = roc_auc_score(y_test, y_test_proba)

    print("\n📊 모델 성능 (Test Set)")
    print(classification_report(y_test, y_test_pred))
    print(f"ROC AUC Score: {test_roc_auc:.4f}")

    return {
        "cv": {
            "class_0": {"precision": avg_p0, "recall": avg_r0, "f1": avg_f0},
            "class_1": {"precision": avg_p1, "recall": avg_r1, "f1": avg_f1},
            "macro_avg": {"precision": macro_precision, "recall": macro_recall, "f1": macro_f1},
            "weighted_avg": {"precision": weighted_precision, "recall": weighted_recall, "f1": weighted_f1},
            "roc_auc": cv_roc_auc
        },
        "test": {
            "classification_report": test_report,
            "roc_auc": test_roc_auc
        }
    }

In [None]:
# Evaluate the voting model and store it together with its scores.
# NOTE(review): `model_sv` is not assigned in the visible code (the ensemble
# constructed earlier is named `soft_voting_model`) - confirm which object is intended.
result_voting=evaluate_model_general(model_sv, X_trainval, y_trainval, X_test, y_test)
save_model_and_results(model_sv, result_voting, "/content/xgb_sv.pkl")

📄 모델 성능 (5-Fold 평균 기반)
Class 0 - Precision: 0.5536, Recall: 0.4897, F1: 0.5195
Class 1 - Precision: 0.8687, Recall: 0.8952, F1: 0.8818
Macro Avg     - Precision: 0.7112, Recall: 0.6924, F1: 0.7007
Weighted Avg  - Precision: 0.8027, Recall: 0.8102, F1: 0.8059
ROC AUC 평균: 0.8115

📊 모델 성능 (Test Set)
              precision    recall  f1-score   support

           0       0.57      0.50      0.53      1028
           1       0.87      0.90      0.89      3880

    accuracy                           0.82      4908
   macro avg       0.72      0.70      0.71      4908
weighted avg       0.81      0.82      0.81      4908

ROC AUC Score: 0.8214


'/content/xgb_sv.pkl'

'/content/xgb_sv123(50).pkl'