In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression

# (옵션) XGBoost
from xgboost import XGBClassifier

# =========================
# 1) Load
# =========================
df = pd.read_csv("diabetes_dataset.csv")

FEATURES = [
    "smoking_status",
    "alcohol_consumption_per_week",
    "physical_activity_minutes_per_week",
    "diet_score",
    "sleep_hours_per_day",
    "screen_time_hours_per_day",
    "bmi"
]

# diabetes_stage → 0~4로 세분화
mapping = {
    'No Diabetes': 0,
    'Pre-Diabetes': 1,
    'Type 1': 2,
    'Type 2': 3,
    'Gestational': 4
}
df['diabetes_stage_class'] = df['diabetes_stage'].map(mapping)

TARGET = "diabetes_stage_class"

# =========================
# 2) Helper: preprocess + eval
# =========================
cat_cols = ["smoking_status"]
num_cols = [
    "alcohol_consumption_per_week",
    "physical_activity_minutes_per_week",
    "diet_score",
    "sleep_hours_per_day",
    "screen_time_hours_per_day",
    "bmi"
]
dfr = df[FEATURES] # 타깃 컬럼(diabetes_stage) 버리기

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop",
)

def evaluate(y_true, y_pred, title=""):
    print("\n" + "=" * 80)
    print(title)
    print("Balanced Acc:", balanced_accuracy_score(y_true, y_pred))
    print("Macro F1     :", f1_score(y_true, y_pred, average="macro"))
    print("\n[Classification Report]\n", classification_report(y_true, y_pred))
    print("[Confusion Matrix]\n", confusion_matrix(y_true, y_pred))

# =========================
# 3) Train/Test split (stratify 필수)
# =========================
def run_experiment(use_risk_score: bool):
    use_features = FEATURES.copy()
    if not use_risk_score:
        use_features.remove("diabetes_risk_score")

    X = dfr[use_features].copy()
    y = df[TARGET].copy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # 전처리 컬럼도 맞춰서 재정의
    _cat_cols = ["smoking_status"]
    _num_cols = [c for c in use_features if c != "smoking_status"]

    _preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), _cat_cols),
            ("num", StandardScaler(), _num_cols),
        ],
        remainder="drop",
    )

    # -------------------------
    # A) 기준모델: 다항 로지스틱 회귀 (해석/베이스라인)
    # -------------------------
    logit = LogisticRegression(
        multi_class="multinomial",
        solver="saga",
        max_iter=5000,
        n_jobs=-1,
        class_weight="balanced",  # 불균형 대응
    )

    logit_clf = Pipeline([("prep", _preprocess), ("model", logit)])
    logit_clf.fit(X_train, y_train)
    pred_logit = logit_clf.predict(X_test)

    evaluate(
        y_test, pred_logit,
        title=f"[LogisticRegression] risk_score={'IN' if use_risk_score else 'OUT'}"
    )

    # -------------------------
    # B) 주력모델: XGBoost (성능용)
    #    - 불균형은 sample_weight로 대응
    # -------------------------
    # XGBoost는 스케일링이 필수는 아니지만(트리 기반),
    # 파이프라인 단순화를 위해 동일 전처리 사용(원핫은 필요)
    xgb = XGBClassifier(
        objective="multi:softprob",
        eval_metric="mlogloss",
        n_estimators=600,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        tree_method="hist",
        n_jobs=-1,
        random_state=42,
    )

    xgb_clf = Pipeline([("prep", _preprocess), ("model", xgb)])

    # 클래스 불균형 보정 가중치
    sw = compute_sample_weight(class_weight="balanced", y=y_train)

    xgb_clf.fit(X_train, y_train, model__sample_weight=sw)
    pred_xgb = xgb_clf.predict(X_test)

    evaluate(
        y_test, pred_xgb,
        title=f"[XGBoost] risk_score={'IN' if use_risk_score else 'OUT'}"
    )

    # -------------------------
    # (옵션) “당뇨 위험도” 확률로 뽑기 예시
    #   - 예: No Diabetes가 아닌 확률을 위험도로 정의
    # -------------------------
    proba = xgb_clf.predict_proba(X_test)
    classes = xgb_clf.named_steps["model"].classes_
    if 0 in classes:
        idx_no = np.where(classes == 0)[0][0]
        risk = 1.0 - proba[:, idx_no]
        print("\n[Sample Risk Prob] (1 - P(0))")
        print(pd.Series(risk).head())

# =========================
# 4) Run both: risk_score 포함 vs 제외
# =========================
run_experiment(use_risk_score=True)
run_experiment(use_risk_score=False)





[LogisticRegression] risk_score=IN
Balanced Acc: 0.18290496543291232
Macro F1     : 0.09937182424436915

[Classification Report]
               precision    recall  f1-score   support

           0       0.07      0.19      0.10      1596
           1       0.31      0.11      0.16      6369
           2       0.00      0.12      0.00        24
           3       0.62      0.14      0.23     11955
           4       0.00      0.36      0.00        56

    accuracy                           0.13     20000
   macro avg       0.20      0.18      0.10     20000
weighted avg       0.48      0.13      0.19     20000

[Confusion Matrix]
 [[ 297  154  216  180  749]
 [1241  688  950  806 2684]
 [   3    3    3    4   11]
 [2610 1359 1784 1653 4549]
 [  10    8   11    7   20]]

[XGBoost] risk_score=IN
Balanced Acc: 0.22948534008014804
Macro F1     : 0.1933477338258374

[Classification Report]
               precision    recall  f1-score   support

           0       0.12      0.42      0.18  

ValueError: list.remove(x): x not in list