In [22]:
# Legacy version of the pipeline (kept for reference). After this comparison run the decision was to train a single CatBoost model instead.

import numpy as np
import pandas as pd
import gc
from functools import reduce
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings

warnings.filterwarnings('ignore')  # 불필요한 경고 무시

# Data preprocessing (correlation filtering + targeted SMOTE)
def get_preprocessed_data():
    """Load, merge and encode the parquet tables and build train/val/test sets.

    Steps:
      1. Read the eight train/test parquet tables under ``./data``.
      2. Left-join the tables of each split on ``ID``.
      3. Fill missing values (0 for numeric, 'Missing' for categorical),
         label-encode the categorical columns on train and test together,
         and drop every numeric column whose absolute correlation with an
         earlier numeric column exceeds 0.90 on the training rows.
      4. Make a stratified 80/20 train/validation split and oversample the
         classes 'A' and 'B' of the training part with SMOTE.

    Returns:
        dict with the keys ``X_train`` / ``y_train`` (SMOTE-resampled
        training part), ``X_val`` / ``y_val``, ``X_test``, ``le_y`` (the
        fitted target ``LabelEncoder``), ``test_ids`` and ``cat_cols``.
    """
    print("=" * 60)
    print("Part 1. 데이터 전처리 시작")
    print("=" * 60)

    # Load data: folder name, file-name suffix and key prefix of every table.
    data_splits = ["train", "test"]
    data_categories = {
        "회원정보": {"folder": "1.회원정보", "suffix": "회원정보", "var_prefix": "customer"},
        "신용정보": {"folder": "2.신용정보", "suffix": "신용정보", "var_prefix": "credit"},
        "승인매출정보": {"folder": "3.승인매출정보", "suffix": "승인매출정보", "var_prefix": "sales"},
        "청구정보": {"folder": "4.청구입금정보", "suffix": "청구정보", "var_prefix": "billing"},
        "잔액정보": {"folder": "5.잔액정보", "suffix": "잔액정보", "var_prefix": "balance"},
        "채널정보": {"folder": "6.채널정보", "suffix": "채널정보", "var_prefix": "channel"},
        "마케팅정보": {"folder": "7.마케팅정보", "suffix": "마케팅정보", "var_prefix": "marketing"},
        "성과정보": {"folder": "8.성과정보", "suffix": "성과정보", "var_prefix": "performance"}
    }

    print("데이터를 불러오는 중...")
    raw_data = {}
    for split in data_splits:
        for info in data_categories.values():
            file_path = f"./data/{info['folder']}/{split}_{info['suffix']}.parquet"
            key = f"{info['var_prefix']}_{split}"
            raw_data[key] = pd.read_parquet(file_path, engine="pyarrow")
    gc.collect()

    # Merge the tables of one split
    def merge_split_data(split_name, data_dict):
        """Left-join all tables of ``split_name`` on 'ID', in a fixed table order."""
        categories = ["customer", "credit", "sales", "billing", "balance", "channel", "marketing", "performance"]
        dfs = []
        for prefix in categories:
            df = data_dict[f"{prefix}_{split_name}"]
            # The '기준년월' column is removed (when present) before joining on 'ID'.
            if '기준년월' in df.columns:
                df = df.drop(columns=['기준년월'])
            dfs.append(df)
        return reduce(lambda left, right: pd.merge(left, right, on='ID', how='left'), dfs)

    merged_train_df = merge_split_data("train", raw_data)
    merged_test_df = merge_split_data("test", raw_data)
    del raw_data
    gc.collect()

    # Basic preprocessing
    target_col = 'Segment'
    id_col = 'ID'
    y = merged_train_df[target_col]
    train_len = len(merged_train_df)
    test_ids = merged_test_df[id_col]

    # Train and test rows are stacked so that encoding sees both. The target
    # is removed from both parts; for the test part this is purely defensive
    # so that a target column there can never turn into a feature.
    df_all = pd.concat(
        [
            merged_train_df.drop(columns=[target_col]),
            merged_test_df.drop(columns=[target_col], errors='ignore'),
        ],
        axis=0,
    ).reset_index(drop=True)
    features_df = df_all.drop(columns=[id_col, 'customer_id'], errors='ignore')
    # Replace every non-alphanumeric character of a column name with '_'.
    features_df.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in features_df.columns]

    # Split columns by type: object dtype or fewer than 30 distinct values
    # counts as categorical, everything else as numeric.
    cat_features = []
    num_features = []
    for col in features_df.columns:
        if features_df[col].dtype == 'object' or features_df[col].nunique() < 30:
            cat_features.append(col)
        else:
            num_features.append(col)

    X_all = features_df.copy()
    X_all[num_features] = X_all[num_features].fillna(0)
    X_all[cat_features] = X_all[cat_features].fillna('Missing')

    # The encoder is re-fitted per column; only the encoded values are kept.
    le = LabelEncoder()
    for col in cat_features:
        X_all[col] = le.fit_transform(X_all[col].astype(str))

    # Drop highly correlated numeric columns (measured on training rows only)
    X_temp_train = X_all.iloc[:train_len, :]
    THRESHOLD = 0.90
    corr_matrix = X_temp_train[num_features].corr().abs()
    # Keep the strict upper triangle so each pair is inspected once and the
    # later column of a correlated pair is the one that gets dropped.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > THRESHOLD)]
    if to_drop:
        print(f"상관관계 {THRESHOLD} 이상 변수 {len(to_drop)}개 제거")
        X_all = X_all.drop(columns=to_drop, errors='ignore')

    # Split back into train / test
    X_train_final = X_all.iloc[:train_len, :]
    X_test_final = X_all.iloc[train_len:, :]

    le_y = LabelEncoder()
    y_encoded = le_y.fit_transform(y)

    # Hold-out split, then SMOTE on the training part only
    print("학습/검증 분할 및 SMOTE 적용 중...")
    X_tr, X_val, y_tr, y_val = train_test_split(X_train_final, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

    try:
        idx_A = list(le_y.classes_).index('A')
        idx_B = list(le_y.classes_).index('B')
    except ValueError:
        idx_A, idx_B = 0, 1

    # SMOTE rejects a target below the current class size, so the target is
    # never set lower than what the class already has (then nothing is added).
    minority_target = 15000
    class_counts = np.bincount(y_tr, minlength=max(idx_A, idx_B) + 1)
    sampling_strategy = {
        idx: max(int(class_counts[idx]), minority_target) for idx in (idx_A, idx_B)
    }

    # NOTE(review): plain SMOTE interpolates the label-encoded categorical
    # codes as if they were numeric; SMOTENC may be the better fit - confirm.
    smote = SMOTE(random_state=42, k_neighbors=2, sampling_strategy=sampling_strategy)
    X_tr_resampled, y_tr_resampled = smote.fit_resample(X_tr, y_tr)

    if not isinstance(X_tr_resampled, pd.DataFrame):
        X_tr_resampled = pd.DataFrame(X_tr_resampled, columns=X_tr.columns)

    # Bring categorical columns back to integer codes. The validation and test
    # frames are slices of other frames, so they are converted with a
    # copy-returning astype instead of being written to in place.
    final_cat_cols = [col for col in cat_features if col in X_tr_resampled.columns]
    for col in final_cat_cols:
        X_tr_resampled[col] = X_tr_resampled[col].round().astype(int)
    cat_dtypes = {col: int for col in final_cat_cols}
    X_val = X_val.astype(cat_dtypes)
    X_test_final = X_test_final.astype(cat_dtypes)

    return {
        "X_train": X_tr_resampled,
        "y_train": y_tr_resampled,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test_final,
        "le_y": le_y,
        "test_ids": test_ids,
        "cat_cols": final_cat_cols
    }

# Model comparison (XGBoost, LightGBM, CatBoost)
def run_model_comparison(data):
    """Fit XGBoost, LightGBM and CatBoost, compare them and write a submission.

    Every model is trained on the resampled training set with the validation
    set as ``eval_set``; macro F1 and accuracy on the validation set are
    printed per model. The model with the highest macro F1 (the earliest one
    on a tie) predicts the test set and the result is saved as a CSV file.

    Args:
        data: dict with the keys ``X_train``, ``y_train``, ``X_val``,
            ``y_val``, ``X_test``, ``cat_cols``, ``le_y`` and ``test_ids``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Part 2. 모델 비교 및 평가 시작")
    print(rule)

    train_X, train_y = data["X_train"], data["y_train"]
    valid_X, valid_y = data["X_val"], data["y_val"]
    test_X = data["X_test"]
    categorical = data["cat_cols"]
    target_encoder = data["le_y"]

    # Candidates in the order they are trained and reported.
    candidates = [
        (
            "XGBoost",
            XGBClassifier(
                n_estimators=1000, learning_rate=0.05, max_depth=6,
                early_stopping_rounds=50, n_jobs=-1, random_state=42,
                enable_categorical=True, device="cuda", tree_method="hist"
            ),
        ),
        (
            "LightGBM",
            LGBMClassifier(
                n_estimators=1000, learning_rate=0.05, max_depth=6,
                num_leaves=31, n_jobs=-1, random_state=42, verbose=-1
            ),
        ),
        (
            "CatBoost",
            CatBoostClassifier(
                iterations=1000, learning_rate=0.05, depth=6,
                early_stopping_rounds=50, verbose=100, random_state=42,
                allow_writing_files=False, cat_features=categorical,
                task_type="GPU", devices='0'
            ),
        ),
    ]

    # One row per trained model: (name, accuracy, macro F1, fitted estimator)
    scoreboard = []

    for label, estimator in candidates:
        print(f"\n{label} 모델 학습 중...")
        is_catboost = label == "CatBoost"

        # Each library takes its evaluation set / early stopping differently.
        if is_catboost:
            estimator.fit(train_X, train_y, eval_set=(valid_X, valid_y))
        elif label == "LightGBM":
            from lightgbm import early_stopping, log_evaluation
            estimator.fit(
                train_X, train_y,
                eval_set=[(valid_X, valid_y)],
                eval_metric='multi_logloss',
                callbacks=[early_stopping(50), log_evaluation(0)],
            )
        else:
            estimator.fit(train_X, train_y, eval_set=[(valid_X, valid_y)], verbose=0)

        predicted = estimator.predict(valid_X)
        if is_catboost:
            # CatBoost predictions are flattened to 1-D before scoring.
            predicted = predicted.flatten()

        macro_f1 = f1_score(valid_y, predicted, average='macro')
        accuracy = accuracy_score(valid_y, predicted)
        scoreboard.append((label, accuracy, macro_f1, estimator))

        print(f"{label} 결과 - F1: {macro_f1:.4f}, Accuracy: {accuracy:.4f}")
        print("Confusion Matrix:\n", confusion_matrix(valid_y, predicted))

    # max() keeps the first of equally scored rows, i.e. the earliest model.
    best_model_name, _, best_f1, best_model = max(scoreboard, key=lambda row: row[2])

    print("\n" + rule)
    print("모델별 성능 요약:\n")
    summary_rows = [list(row[:3]) for row in scoreboard]
    print(pd.DataFrame(summary_rows, columns=['Model', 'Accuracy', 'Macro F1']))
    print(f"\n최고 성능 모델: {best_model_name} (F1: {best_f1:.4f})")

    # Predict the test set with the winner and save the submission file
    test_pred = best_model.predict(test_X)
    if best_model_name == "CatBoost":
        test_pred = test_pred.flatten()

    decoded_segments = target_encoder.inverse_transform(test_pred.astype(int))
    submission = pd.DataFrame({
        'ID': data["test_ids"],
        'Segment': decoded_segments
    })

    filename = f"submission_Final_Best_{best_model_name}_f1_{best_f1:.4f}.csv"
    submission.to_csv(filename, index=False)
    print(f"제출 파일 저장 완료: {filename}")


In [23]:
import numpy as np
import pandas as pd
import gc
from functools import reduce
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings

# 경고 무시
warnings.filterwarnings('ignore')

def get_preprocessed_data():
    """Load, merge and encode the parquet tables and build train/val/test sets.

    Steps:
      1. Read the eight train/test parquet tables under ``./data``.
      2. Left-join the tables of each split on ``ID``.
      3. Fill missing values (0 for numeric, 'Missing' for categorical),
         label-encode the categorical columns on train and test together,
         and drop every numeric column whose absolute correlation with an
         earlier numeric column exceeds 0.94 on the training rows.
      4. Make a stratified 80/20 train/validation split and oversample the
         classes 'A' and 'B' of the training part with SMOTE.

    Returns:
        dict with the keys ``X_train`` / ``y_train`` (SMOTE-resampled
        training part), ``X_val`` / ``y_val``, ``X_test``, ``le_y`` (the
        fitted target ``LabelEncoder``), ``test_ids`` and ``cat_cols``.
    """
    print("데이터 전처리 시작")

    # 1. Load data: folder name, file-name suffix and key prefix of every table.
    data_splits = ["train", "test"]
    data_categories = {
        "회원정보": {"folder": "1.회원정보", "suffix": "회원정보", "var_prefix": "customer"},
        "신용정보": {"folder": "2.신용정보", "suffix": "신용정보", "var_prefix": "credit"},
        "승인매출정보": {"folder": "3.승인매출정보", "suffix": "승인매출정보", "var_prefix": "sales"},
        "청구정보": {"folder": "4.청구입금정보", "suffix": "청구정보", "var_prefix": "billing"},
        "잔액정보": {"folder": "5.잔액정보", "suffix": "잔액정보", "var_prefix": "balance"},
        "채널정보": {"folder": "6.채널정보", "suffix": "채널정보", "var_prefix": "channel"},
        "마케팅정보": {"folder": "7.마케팅정보", "suffix": "마케팅정보", "var_prefix": "marketing"},
        "성과정보": {"folder": "8.성과정보", "suffix": "성과정보", "var_prefix": "performance"}
    }

    print("데이터 불러오는 중...")
    raw_data = {}
    for split in data_splits:
        for info in data_categories.values():
            file_path = f"./data/{info['folder']}/{split}_{info['suffix']}.parquet"
            key = f"{info['var_prefix']}_{split}"
            raw_data[key] = pd.read_parquet(file_path, engine="pyarrow")
    gc.collect()

    # 2. Merge the tables of one split
    def merge_split_data(split_name, data_dict):
        """Left-join all tables of ``split_name`` on 'ID', in a fixed table order."""
        info_categories = ["customer", "credit", "sales", "billing", "balance", "channel", "marketing", "performance"]
        dfs_list = []
        for prefix in info_categories:
            df = data_dict[f"{prefix}_{split_name}"]
            # The '기준년월' column is removed (when present) before joining on 'ID'.
            if '기준년월' in df.columns:
                df = df.drop(columns=['기준년월'])
            dfs_list.append(df)
        return reduce(lambda left, right: pd.merge(left, right, on='ID', how='left'), dfs_list)

    merged_train_df = merge_split_data("train", raw_data)
    merged_test_df = merge_split_data("test", raw_data)
    del raw_data
    gc.collect()

    # 3. Preprocessing
    target_col = 'Segment'
    id_col = 'ID'
    y = merged_train_df[target_col]
    train_len = len(merged_train_df)
    test_ids = merged_test_df[id_col]

    # Train and test rows are stacked so that encoding sees both. The target
    # is removed from both parts; for the test part this is purely defensive
    # so that a target column there can never turn into a feature.
    df_all = pd.concat(
        [
            merged_train_df.drop(columns=[target_col]),
            merged_test_df.drop(columns=[target_col], errors='ignore'),
        ],
        axis=0,
    ).reset_index(drop=True)
    features_df = df_all.drop(columns=[id_col, 'customer_id'], errors='ignore')
    # Replace every non-alphanumeric character of a column name with '_'.
    features_df.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in features_df.columns]

    # Numeric vs. categorical: object dtype or fewer distinct values than the
    # limit below counts as categorical, everything else as numeric.
    categorical_nunique_limit = 30
    target_cat_features = []
    target_num_features = []
    for col in features_df.columns:
        if features_df[col].dtype == 'object' or features_df[col].nunique() < categorical_nunique_limit:
            target_cat_features.append(col)
        else:
            target_num_features.append(col)

    X_all = features_df.copy()

    # Missing numeric values become 0
    X_all[target_num_features] = X_all[target_num_features].fillna(0)

    # Missing categorical values become 'Missing', then label encoding.
    # The encoder is re-fitted per column; only the encoded values are kept.
    X_all[target_cat_features] = X_all[target_cat_features].fillna('Missing')
    le = LabelEncoder()
    for col in target_cat_features:
        X_all[col] = le.fit_transform(X_all[col].astype(str))

    # Remove multicollinearity (measured on training rows only)
    X_temp_train = X_all.iloc[:train_len, :]
    THRESHOLD = 0.94
    current_num = [f for f in target_num_features if f in X_temp_train.columns]

    to_drop = []
    if current_num:
        corr_matrix = X_temp_train[current_num].corr().abs()
        # Keep the strict upper triangle so each pair is inspected once and the
        # later column of a correlated pair is the one that gets dropped.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if any(upper[column] > THRESHOLD)]

        if to_drop:
            print(f"상관 높은 컬럼 {len(to_drop)}개 제거")
            X_all = X_all.drop(columns=to_drop, errors='ignore')
            dropped = set(to_drop)
            target_num_features = [f for f in target_num_features if f not in dropped]

    # Split back into train / test
    X_train_final = X_all.iloc[:train_len, :]
    X_test_final = X_all.iloc[train_len:, :]

    # Encode the target
    le_y = LabelEncoder()
    y_encoded = le_y.fit_transform(y)

    print(f"전처리 완료됨. 학습 데이터 shape: {X_train_final.shape}")

    # 4. Hold-out split, then SMOTE on the training part only
    print("학습/검증 세트로 나누는 중...")
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train_final, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
    )

    print(f"SMOTE 전 클래스 분포: {np.bincount(y_tr)}")

    try:
        idx_A = list(le_y.classes_).index('A')
        idx_B = list(le_y.classes_).index('B')
    except ValueError:
        print("클래스 'A', 'B' 없음. 기본 index로 진행")
        idx_A, idx_B = 0, 1

    # SMOTE rejects a target below the current class size, so the target is
    # never set lower than what the class already has (then nothing is added).
    minority_target = 15000
    class_counts = np.bincount(y_tr, minlength=max(idx_A, idx_B) + 1)
    smote_strategy = {
        idx: max(int(class_counts[idx]), minority_target) for idx in (idx_A, idx_B)
    }

    print("SMOTE 적용 중...")
    # NOTE(review): plain SMOTE interpolates the label-encoded categorical
    # codes as if they were numeric; SMOTENC may be the better fit - confirm.
    smote = SMOTE(random_state=42, k_neighbors=2, sampling_strategy=smote_strategy)
    X_tr_resampled, y_tr_resampled = smote.fit_resample(X_tr, y_tr)

    if not isinstance(X_tr_resampled, pd.DataFrame):
        X_tr_resampled = pd.DataFrame(X_tr_resampled, columns=X_tr.columns)

    # Bring categorical columns back to integer codes. The validation and test
    # frames are slices of other frames, so they are converted with a
    # copy-returning astype instead of being written to in place.
    final_cat_cols = [c for c in target_cat_features if c in X_tr_resampled.columns]
    for col in final_cat_cols:
        X_tr_resampled[col] = X_tr_resampled[col].round().astype(int)
    cat_dtypes = {col: int for col in final_cat_cols}
    X_val = X_val.astype(cat_dtypes)
    X_test_final = X_test_final.astype(cat_dtypes)

    print(f"SMOTE 후 클래스 분포: {np.bincount(y_tr_resampled)}")
    print(f"학습 샘플 수: {len(y_tr)} → {len(y_tr_resampled)}")

    return {
        "X_train": X_tr_resampled,
        "y_train": y_tr_resampled,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test_final,
        "le_y": le_y,
        "test_ids": test_ids,
        "cat_cols": final_cat_cols
    }


In [24]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
import pandas as pd

def run_model_Cat_boost_comparison(data):
    """Train the tuned CatBoost model, report validation scores and write a submission.

    The model is fitted on the resampled training set with the validation set
    as ``eval_set``. Macro F1, accuracy, the classification report and the
    confusion matrix on the validation set are printed, then the test set is
    predicted and saved as a CSV file whose name contains the macro F1.

    Args:
        data: dict with the keys ``X_train``, ``y_train``, ``X_val``,
            ``y_val``, ``X_test``, ``cat_cols``, ``le_y`` and ``test_ids``.
    """
    banner = "=" * 60
    divider = "-" * 40

    print("\n" + banner)
    print("CatBoost 최적 파라미터로 모델 학습 시작")
    print(banner)

    train_X, train_y = data["X_train"], data["y_train"]
    valid_X, valid_y = data["X_val"], data["y_val"]
    test_X = data["X_test"]
    categorical = data["cat_cols"]
    target_encoder = data["le_y"]

    # Final hyper-parameters, described in the notebook as tuned with Optuna
    tuned_params = dict(
        iterations=1500,
        depth=5,
        learning_rate=0.04268265981566772,
        l2_leaf_reg=3,
        random_strength=3.394598804716689e-07,
        bagging_temperature=0.7017937881211938,
        early_stopping_rounds=100,
        verbose=100,
        random_state=42,
        allow_writing_files=False,
        cat_features=categorical,
        task_type="GPU",
        devices='0',
    )
    best_model_name = "CatBoost"
    best_model = CatBoostClassifier(**tuned_params)

    print("\n" + divider)
    print(f"{best_model_name} 모델 학습 중...")
    print(divider)

    # Fit with the validation set as evaluation set
    best_model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

    # Validation scores (predictions are flattened to 1-D first)
    valid_pred = best_model.predict(valid_X).flatten()
    best_f1 = f1_score(valid_y, valid_pred, average='macro')
    accuracy = accuracy_score(valid_y, valid_pred)

    # Report
    print(f"\n{best_model_name} 결과 요약:")
    print(f"Macro F1: {best_f1:.4f}, Accuracy: {accuracy:.4f}")
    print(classification_report(valid_y, valid_pred, target_names=target_encoder.classes_))
    print("Confusion Matrix:")
    print(confusion_matrix(valid_y, valid_pred))

    print("\n" + banner)
    print(f"최종 모델: {best_model_name} / F1 점수: {best_f1:.4f}")

    # Predict the test set and build the submission file
    print("테스트 데이터 예측 및 제출 파일 생성")
    test_pred = best_model.predict(test_X).flatten()
    decoded_segments = target_encoder.inverse_transform(test_pred.astype(int))

    submission = pd.DataFrame({
        'ID': data["test_ids"],
        'Segment': decoded_segments
    })

    filename = f"submission_Final_CatBoost_Optuna_f1_{best_f1:.4f}.csv"
    submission.to_csv(filename, index=False)
    print(f"제출 파일 저장 완료: {filename}")


In [25]:
# 1. Run preprocessing (prepare the data); the returned dict is reused by the training and evaluation cells.
data_pack = get_preprocessed_data()

데이터 전처리 시작
데이터 불러오는 중...
상관 높은 컬럼 140개 제거
전처리 완료됨. 학습 데이터 shape: (400000, 715)
학습/검증 세트로 나누는 중...
SMOTE 전 클래스 분포: [   130     19  17012  46565 256274]
SMOTE 적용 중...
SMOTE 후 클래스 분포: [ 15000  15000  17012  46565 256274]
학습 샘플 수: 320000 → 349851


In [26]:
# 2. Train CatBoost on the prepared data; the function prints its own progress and results.
run_model_Cat_boost_comparison(data_pack)


CatBoost 최적 파라미터로 모델 학습 시작

----------------------------------------
CatBoost 모델 학습 중...
----------------------------------------
0:	learn: 1.4911611	test: 1.4910142	best: 1.4910142 (0)	total: 33.5ms	remaining: 50.2s
100:	learn: 0.3344999	test: 0.3485464	best: 0.3485464 (100)	total: 2.66s	remaining: 36.9s
200:	learn: 0.2906516	test: 0.3115905	best: 0.3115905 (200)	total: 5.65s	remaining: 36.5s
300:	learn: 0.2733539	test: 0.2964569	best: 0.2964569 (300)	total: 8.4s	remaining: 33.5s
400:	learn: 0.2624542	test: 0.2868684	best: 0.2868684 (400)	total: 10.8s	remaining: 29.5s
500:	learn: 0.2551233	test: 0.2804574	best: 0.2804574 (500)	total: 13s	remaining: 25.9s
600:	learn: 0.2491296	test: 0.2752422	best: 0.2752422 (600)	total: 15.2s	remaining: 22.7s
700:	learn: 0.2445749	test: 0.2714514	best: 0.2714514 (700)	total: 17.4s	remaining: 19.8s
800:	learn: 0.2407784	test: 0.2684652	best: 0.2684652 (800)	total: 19.7s	remaining: 17.2s
900:	learn: 0.2375025	test: 0.2659822	best: 0.2659822 (900)	total

In [27]:
def run_model_Cat_boost_comparison(data):
    """Train the tuned CatBoost model and return the fitted classifier.

    The model is fitted on the resampled training set with the validation set
    as ``eval_set``; no scoring or file output happens here.

    Args:
        data: dict with the keys ``X_train``, ``y_train``, ``X_val``,
            ``y_val``, ``X_test``, ``cat_cols`` and ``le_y``.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    banner = "=" * 60
    divider = "-" * 40

    print("\n" + banner)
    print("CatBoost 최적 파라미터로 학습 시작")
    print(banner)

    train_X, train_y = data["X_train"], data["y_train"]
    valid_X, valid_y = data["X_val"], data["y_val"]
    # Looked up as in the original cell, although not used for training.
    X_test = data["X_test"]
    categorical = data["cat_cols"]
    le_y = data["le_y"]

    # Hyper-parameters, described in the notebook as tuned with Optuna
    tuned_params = dict(
        iterations=1500,
        depth=5,
        learning_rate=0.04268265981566772,
        l2_leaf_reg=3,
        random_strength=3.394598804716689e-07,
        bagging_temperature=0.7017937881211938,
        early_stopping_rounds=100,
        verbose=100,
        random_state=42,
        allow_writing_files=False,
        cat_features=categorical,
        task_type="GPU",
        devices='0',
    )
    model_name = "CatBoost"
    classifier = CatBoostClassifier(**tuned_params)

    print("\n" + divider)
    print(f"{model_name} 모델 학습 중...")
    print(divider)

    # Fit with the validation set as evaluation set
    classifier.fit(train_X, train_y, eval_set=(valid_X, valid_y))

    print("모델 학습 완료. 객체 반환")
    return classifier


In [28]:
# Train CatBoost and keep the returned fitted classifier in `model`.
model = run_model_Cat_boost_comparison(data_pack)


CatBoost 최적 파라미터로 학습 시작

----------------------------------------
CatBoost 모델 학습 중...
----------------------------------------
0:	learn: 1.4911611	test: 1.4910138	best: 1.4910138 (0)	total: 247ms	remaining: 6m 9s
100:	learn: 0.3344999	test: 0.3485465	best: 0.3485465 (100)	total: 2.17s	remaining: 30.1s
200:	learn: 0.2906515	test: 0.3115907	best: 0.3115907 (200)	total: 4.21s	remaining: 27.2s
300:	learn: 0.2733539	test: 0.2964570	best: 0.2964570 (300)	total: 6.25s	remaining: 24.9s
400:	learn: 0.2624542	test: 0.2868683	best: 0.2868683 (400)	total: 8.2s	remaining: 22.5s
500:	learn: 0.2551233	test: 0.2804574	best: 0.2804574 (500)	total: 10.1s	remaining: 20.2s
600:	learn: 0.2491297	test: 0.2752422	best: 0.2752422 (600)	total: 12s	remaining: 18s
700:	learn: 0.2445748	test: 0.2714515	best: 0.2714515 (700)	total: 13.9s	remaining: 15.8s
800:	learn: 0.2407783	test: 0.2684652	best: 0.2684652 (800)	total: 15.7s	remaining: 13.7s
900:	learn: 0.2375026	test: 0.2659823	best: 0.2659823 (900)	total: 17.5

In [None]:
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

# Evaluate the trained model on the validation split
banner = "=" * 50
divider = "-" * 50

print("\n" + banner)
print("검증 데이터로 모델 성능 평가함")
print(banner)

# Predict (flattened to 1-D before scoring)
val_pred = model.predict(data_pack["X_val"]).flatten()
y_true = data_pack["y_val"]
class_names = data_pack["le_y"].classes_

# Scores
f1 = f1_score(y_true, val_pred, average='macro')
acc = accuracy_score(y_true, val_pred)

# Report
print(f"Macro F1: {f1:.4f}, Accuracy: {acc:.4f}")
print(divider)
print("분류 리포트:\n")
print(classification_report(y_true, val_pred, target_names=class_names))
print(divider)
print("Confusion Matrix:\n")
print(confusion_matrix(y_true, val_pred))
print(banner)



검증 데이터로 모델 성능 평가함
Macro F1: 0.5933, Accuracy: 0.8975
--------------------------------------------------
분류 리포트:

              precision    recall  f1-score   support

           A       0.26      0.66      0.37        32
           B       1.00      0.20      0.33         5
           C       0.74      0.59      0.65      4253
           D       0.70      0.62      0.66     11642
           E       0.94      0.97      0.95     64068

    accuracy                           0.90     80000
   macro avg       0.73      0.61      0.59     80000
weighted avg       0.89      0.90      0.89     80000

--------------------------------------------------
Confusion Matrix:

[[   21     0    11     0     0]
 [    1     1     3     0     0]
 [   51     0  2508  1262   432]
 [    6     0   717  7226  3693]
 [    3     0   171  1849 62045]]
