In [1]:
from matplotlib import rc

import matplotlib.pyplot as plt
import pandas as pd

merged_data = pd.read_csv("./mapped_data/mapped_merged_data.csv")


# 한글 폰트 설정
rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False

  merged_data = pd.read_csv("./mapped_data/mapped_merged_data.csv")


In [2]:
missing_ratio = merged_data.isnull().sum() / len(merged_data) * 100
print(missing_ratio)


TRAVEL_ID            98.849432
VISIT_AREA_ID        98.849432
ACTIVITY_TYPE_CD     98.849432
ACTIVITY_TYPE_SEQ    98.849432
CONSUME_HIS_SEQ      98.849432
CONSUME_HIS_SNO      98.849432
PAYMENT_NUM          98.849432
BRNO                 99.115745
STORE_NM             98.859937
ROAD_NM_ADDR         98.977249
LOTNO_ADDR           99.293021
ROAD_NM_CD           99.071983
LOTNO_CD             99.071983
PAYMENT_DT           99.036616
PAYMENT_MTHD_SE      98.854983
PAYMENT_AMT_WON      98.849804
PAYMENT_ETC          99.294269
SGG_CD                0.000000
POI_ID                1.150568
POI_NM                1.150634
BRNO_POI              1.150568
ROAD_NM_ADDR_POI     15.394293
LOTNO_ADDR_POI        1.150568
ASORT_LCLASDC        87.585680
ASORT_MLSFCDC        87.585680
ASORT_SDASDC         87.585680
X_COORD               1.150568
Y_COORD               1.150568
ROAD_NM_CD_POI        1.150568
LOTNO_CD_POI          1.150568
region                8.583069
dtype: float64


In [3]:
# 1️⃣ 한글 지역명을 영어로 변환하는 매핑 딕셔너리
region_mapping = {
    "서울특별시": "Seoul",
    "부산광역시": "Busan",
    "대구광역시": "Daegu",
    "인천광역시": "Incheon",
    "광주광역시": "Gwangju",
    "대전광역시": "Daejeon",
    "울산광역시": "Ulsan",
    "세종특별자치시": "Sejong",
    "경기도": "Gyeonggi",
    "강원도": "Gangwon",
    "충청북도": "Chungbuk",
    "충청남도": "Chungnam",
    "전라북도": "Jeonbuk",
    "전라남도": "Jeonnam",
    "경상북도": "Gyeongbuk",
    "경상남도": "Gyeongnam",
    "제주특별자치도": "Jeju"
}

# 2️⃣ 지역명을 영어로 변환
merged_data["region"] = merged_data["region"].map(region_mapping)


In [5]:
# 🚨 범주형 변수 목록
categorical_features = [
    "TRAVEL_ID", "STORE_NM", "ROAD_NM_ADDR", "LOTNO_ADDR", "PAYMENT_DT",
    "PAYMENT_ETC", "POI_ID", "POI_NM", "BRNO_POI", "ROAD_NM_ADDR_POI",
    "LOTNO_ADDR_POI", "ROAD_NM_CD_POI", "LOTNO_CD_POI"
]


# 🚨 범주형 변수: 최빈값(Mode) 또는 "Missing" 카테고리 추가
for col in categorical_features:
    mode_value = merged_data[col].mode()[0] if not merged_data[col].mode().empty else "Missing"
    merged_data[col] = merged_data[col].fillna(mode_value)


# 🚨 수치형(float64) 변수 중 결측치가 있는 변수 찾기
numeric_cols_with_nan = merged_data.select_dtypes(include=['float64']).columns

# 🚨 해당 변수들의 결측치를 **중앙값(Median)**으로 대체
for col in numeric_cols_with_nan:
    median_value = merged_data[col].median()  # 중앙값 계산
    merged_data[col].fillna(median_value)

# 🚨 'region'의 NaN 값 제거
merged_data = merged_data.dropna(subset=["region"])


merged_data.isnull().sum()

TRAVEL_ID            0
VISIT_AREA_ID        0
ACTIVITY_TYPE_CD     0
ACTIVITY_TYPE_SEQ    0
CONSUME_HIS_SEQ      0
CONSUME_HIS_SNO      0
PAYMENT_NUM          0
BRNO                 0
STORE_NM             0
ROAD_NM_ADDR         0
LOTNO_ADDR           0
ROAD_NM_CD           0
LOTNO_CD             0
PAYMENT_DT           0
PAYMENT_MTHD_SE      0
PAYMENT_AMT_WON      0
PAYMENT_ETC          0
SGG_CD               0
POI_ID               0
POI_NM               0
BRNO_POI             0
ROAD_NM_ADDR_POI     0
LOTNO_ADDR_POI       0
ASORT_LCLASDC        0
ASORT_MLSFCDC        0
ASORT_SDASDC         0
X_COORD              0
Y_COORD              0
ROAD_NM_CD_POI       0
LOTNO_CD_POI         0
region               0
dtype: int64

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# ✅ Bootstrapping을 이용한 AUC 신뢰 구간 계산 함수
def bootstrap_auc_ci(y_true, y_pred_proba, n_bootstraps=1000, alpha=0.95):
    """
    Bootstrapping을 이용하여 ROC AUC의 95% 신뢰 구간을 계산하는 함수
    """
    bootstrapped_scores = []

    for _ in range(n_bootstraps):
        # 랜덤 샘플링
        indices = np.random.choice(len(y_true), len(y_true), replace=True)
        if len(np.unique(y_true[indices])) < 2:
            continue  # AUC를 계산할 수 없는 경우 무시
        score = roc_auc_score(y_true[indices], y_pred_proba[indices])
        bootstrapped_scores.append(score)

    # 신뢰 구간 계산
    lower_bound = np.percentile(bootstrapped_scores, ((1.0 - alpha) / 2) * 100)
    upper_bound = np.percentile(bootstrapped_scores, (alpha + ((1.0 - alpha) / 2)) * 100)
    
    return np.mean(bootstrapped_scores), lower_bound, upper_bound


# ✅ 모든 지역을 One-vs-Rest 방식으로 이진 분류
unique_regions = merged_data["region"].unique()
final_results = []

for region in unique_regions:
    print(f"\n🔹 Processing Binary Classification for {region} vs Others")

    # ✅ 특성(X)과 타겟(y) 설정 (각 지역 vs 나머지)
    X = merged_data.drop(columns=["region"])  # 입력 변수
    y = merged_data["region"].apply(lambda x: 1 if x == region else 0)  # 현재 지역을 1, 나머지를 0으로 변환

   # ✅ 범주형 변수 처리 (Label Encoding)
    categorical_cols = X.select_dtypes(include=['object']).columns

    for col in categorical_cols:
        X[col] = X[col].astype(str)  # 모든 값을 문자열(str)로 변환
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

    # ✅ 데이터 분할 (Train / Test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # ✅ 모델 정의
    models = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "LightGBM": LGBMClassifier(random_state=42),
        "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
    }

    # ✅ 모델 학습 및 평가
    results = []
    conf_matrices = {}

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # 확률값
        
        # ROC AUC 및 95% CI 계산
        mean_auc, lower_ci, upper_ci = bootstrap_auc_ci(y_test.to_numpy(), y_pred_proba)
 
        # Precision-Recall Curve 계산
        precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
        pr_auc = auc(recall, precision)

        # Confusion Matrix 저장
        conf_matrix = confusion_matrix(y_test, y_pred)
        conf_matrices[model_name] = conf_matrix

        # 결과 저장
        results.append([region, model_name, mean_auc, lower_ci, upper_ci, pr_auc])

    # ✅ 지역별 결과 저장
    results_df = pd.DataFrame(results, columns=["Region", "Model", "ROC AUC", "ROC AUC Lower CI", "ROC AUC Upper CI", "PR AUC"])
    final_results.append(results_df)

# ✅ 모든 지역 결과 합치기
final_results_df = pd.concat(final_results, ignore_index=True)

# ✅ 결과를 Excel 파일로 저장
excel_filename = "./binary_classification_results.xlsx"
final_results_df.to_excel(excel_filename, index=False)


# ✅ 결과 출력
import ace_tools_open as tools
tools.display_dataframe_to_user(name="Binary Classification Results (Each Region vs Others)", dataframe=final_results_df)




🔹 Processing Binary Classification for Seoul vs Others


TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['float', 'str']