In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection      import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.preprocessing       import StandardScaler
from sklearn.impute              import SimpleImputer
from xgboost                     import XGBClassifier
from sklearn.metrics             import confusion_matrix, classification_report
from sklearn.pipeline            import Pipeline

In [5]:
# Binary "Segment == 'E'" vs. "everything else" classifier.
#
# The model is evaluated twice:
#   * stratified K-fold cross-validation on the training split, with the
#     out-of-fold predictions aggregated into one confusion matrix/report;
#   * a single stratified 20% hold-out split.
# Imputation and scaling live inside a Pipeline so that every CV fold re-fits
# them on its own training part only (no statistics leak from the scored fold).

SEED = 42                      # shared by the split, the CV folds and XGBoost
N_SPLITS = 10                  # number of stratified CV folds
CLASS_NAMES = ['not-E', 'E']   # display names for label 0 and label 1


def print_binary_report(y_true, y_pred, cm_header, report_header):
    """Print a labelled confusion matrix followed by a classification report.

    Parameters
    ----------
    y_true, y_pred : array-like of {0, 1}
        True and predicted labels; 1 means segment E, 0 means any other segment.
    cm_header, report_header : str
        Lines printed immediately before the matrix and the report.

    Returns
    -------
    numpy.ndarray
        The 2x2 confusion matrix (rows = actual, columns = predicted).
    """
    # labels=[0, 1] pins the matrix to 2x2 even if one class is never
    # observed/predicted, so the fixed index/column names below always fit.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(cm_header)
    print(pd.DataFrame(
        matrix,
        index=['실제_not-E', '실제_E'],
        columns=['예측_not-E', '예측_E']
    ))
    print(report_header)
    print(classification_report(
        y_true, y_pred, labels=[0, 1], target_names=CLASS_NAMES
    ))
    return matrix


# 1) Load the data and drop the ID column (an identifier, not a feature).
df = pd.read_csv("train_final_with_pca.csv").drop(columns=['ID'])

# 2) Separate the features (X) from the binary target (y_e).
X = df.drop(columns=['Segment'])
y = df['Segment']
y_e = (y == 'E').astype(int)  # 1 if the segment is E, otherwise 0

# 3) Train/validation split (20% validation, stratified on the target).
X_train, X_val, y_train_e, y_val_e = train_test_split(
    X, y_e, test_size=0.2, random_state=SEED, stratify=y_e
)

# 4) Preprocessing + model as one estimator: mean imputation -> scaling -> XGBoost.
pipe_e = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=SEED
    )),
])

# 5) Stratified K-fold CV on the training split. cross_val_predict clones the
#    pipeline for each fold, so pipe_e itself is still unfitted afterwards.
#    NOTE: this trains the model N_SPLITS times and can take a while.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
y_cv_pred_e = cross_val_predict(pipe_e, X_train, y_train_e, cv=cv)
cm_cv = print_binary_report(
    y_train_e, y_cv_pred_e,
    f"=== {N_SPLITS}-Fold CV Aggregated Confusion Matrix ===",
    f"\n=== {N_SPLITS}-Fold CV Classification Report ===",
)

# 6) Fit on the full training split, then expose the fitted steps and the
#    transformed matrices under the names this cell has always defined.
pipe_e.fit(X_train, y_train_e)
imputer = pipe_e.named_steps['imputer']
scaler = pipe_e.named_steps['scaler']
model_e = pipe_e.named_steps['model']
X_train_p = pipe_e[:-1].transform(X_train)  # pipe_e[:-1] = imputer + scaler only
X_val_p = pipe_e[:-1].transform(X_val)

# 7) Predict on the hold-out split.
y_pred_e = model_e.predict(X_val_p)

# 8) Hold-out results.
cm = print_binary_report(
    y_val_e, y_pred_e,
    "\n=== Hold-out Confusion Matrix ===",
    "\n=== Hold-out Classification Report ===",
)

=== 10-Fold CV Aggregated Confusion Matrix ===
          예측_not-E    예측_E
실제_not-E     50493   13233
실제_E          9037  247237

=== 10-Fold CV Classification Report ===
              precision    recall  f1-score   support

       not-E       0.85      0.79      0.82     63726
           E       0.95      0.96      0.96    256274

    accuracy                           0.93    320000
   macro avg       0.90      0.88      0.89    320000
weighted avg       0.93      0.93      0.93    320000


=== Hold-out Confusion Matrix ===
          예측_not-E   예측_E
실제_not-E     12689   3243
실제_E          2182  61886

=== Hold-out Classification Report ===
              precision    recall  f1-score   support

       not-E       0.85      0.80      0.82     15932
           E       0.95      0.97      0.96     64068

    accuracy                           0.93     80000
   macro avg       0.90      0.88      0.89     80000
weighted avg       0.93      0.93      0.93     80000

