In [None]:
import numpy as np
import pandas as pd
import gc
from pathlib import Path

import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import platform

# === Korean font setup ===
system_name = platform.system()
if system_name == 'Windows':
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False

print("Font setup complete!")


# ===============================================================
# 1. parquet 파일 자동 탐색 + globals() 로 데이터 로드
# ===============================================================
base_dir = Path("data")

folder_map = {
    "customer":   "1.회원정보",
    "credit":     "2.신용정보",
    "sales":      "3.승인매출정보",
    "billing":    "4.청구입금정보",
    "balance":    "5.잔액정보",
    "channel":    "6.채널정보",
    "marketing":  "7.마케팅정보",
    "performance":"8.성과정보",
}

info_categories = list(folder_map.keys())


def find_parquet_file(folder: Path):
    """해당 폴더 내에서 train*.parquet 파일을 자동 탐색"""
    files = list(folder.glob("train*.parquet"))
    if len(files) == 0:
        raise FileNotFoundError(f"train*.parquet not found in {folder}")
    return files[0]


# --- parquet 파일 읽고 globals에 저장 ---
for prefix in info_categories:
    folder = base_dir / folder_map[prefix]
    file_path = find_parquet_file(folder)
    df = pd.read_parquet(file_path)
    globals()[f"{prefix}_train"] = df
    print(f"Loaded {prefix}_train: {df.shape}")

# ===============================================================
#  2. Concat monthly datasets (your structure supports multi-month)
# ===============================================================
train_dfs = {}
for prefix in info_categories:
    df_list = [globals()[f"{prefix}_train"]]
    train_dfs[f"{prefix}_train_df"] = pd.concat(df_list, axis=0)
    print(f"{prefix}_train_df created: {train_dfs[f'{prefix}_train_df'].shape}")

customer_train_df    = train_dfs["customer_train_df"]
credit_train_df      = train_dfs["credit_train_df"]
sales_train_df       = train_dfs["sales_train_df"]
billing_train_df     = train_dfs["billing_train_df"]
balance_train_df     = train_dfs["balance_train_df"]
channel_train_df     = train_dfs["channel_train_df"]
marketing_train_df   = train_dfs["marketing_train_df"]
performance_train_df = train_dfs["performance_train_df"]

gc.collect()

# ===============================================================
#  3. Merge all dataframes on ID
# ===============================================================
from functools import reduce

COMMON_ID = "ID"
CONFLICT_COL = "기준년월"

data_to_merge = [
    customer_train_df, credit_train_df, sales_train_df,
    billing_train_df, balance_train_df, channel_train_df,
    marketing_train_df, performance_train_df
]

processed_list = []
for df in data_to_merge:
    df2 = df.copy()
    if CONFLICT_COL in df2.columns and CONFLICT_COL != COMMON_ID:
        df2 = df2.drop(columns=[CONFLICT_COL])
    processed_list.append(df2)

merged_train_df = reduce(
    lambda left, right: pd.merge(left, right, on=COMMON_ID, how='left'),
    processed_list
)

print("Merge done:", merged_train_df.shape)

# ===============================================================
#  4. Feature separation (numerical vs categorical)
# ===============================================================
target_col = "Segment"
id_col = ["customer_id"]

features_df = merged_train_df.drop(columns=[target_col] + id_col, errors='ignore')

Discrimination_criteria = 30

initial_categorical = features_df.select_dtypes(include=['object', 'category']).columns.tolist()
initial_numerical = features_df.select_dtypes(include=np.number).columns.tolist()

refined_categorical = initial_categorical.copy()
refined_numeric = []

for col in initial_numerical:
    if features_df[col].nunique() < Discrimination_criteria:
        refined_categorical.append(col)
    else:
        refined_numeric.append(col)

print("Numeric:", len(refined_numeric))
print("Categorical:", len(refined_categorical))

# ===============================================================
#  5. ANOVA (numerical) + Chi2 (categorical)
# ===============================================================
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.feature_selection import f_classif, chi2

y = merged_train_df[target_col]
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# --- ANOVA numeric ---
X_num = features_df[refined_numeric].fillna(0)
f_scores, p_values = f_classif(X_num, y_encoded)

num_results_df = pd.DataFrame({
    "Feature": refined_numeric,
    "F_Score": f_scores,
    "P_Value": p_values
}).sort_values(by="F_Score", ascending=False)

# --- Chi2 categorical ---
X_cat = features_df[refined_categorical].astype(str).fillna("Missing")
encoder = OrdinalEncoder()
X_cat_encoded = encoder.fit_transform(X_cat)
chi_scores, p_vals_cat = chi2(X_cat_encoded, y_encoded)

cat_results_df = pd.DataFrame({
    "Feature": refined_categorical,
    "Chi2_Score": chi_scores,
    "P_Value": p_vals_cat
}).sort_values(by="Chi2_Score", ascending=False)

# --- Select top 50 each ---
TOP_N = 50
top_num_features = num_results_df["Feature"].head(TOP_N).tolist()
top_cat_features = cat_results_df["Feature"].head(TOP_N).tolist()
key_features = top_num_features + top_cat_features

# ===============================================================
#  6. Preprocessing: missing, outliers, log-transform, label-encoding
# ===============================================================
print("Final preprocessing...")

X = merged_train_df[key_features].copy()

# numeric
X[top_num_features] = X[top_num_features].fillna(0)
X[top_num_features] = X[top_num_features].clip(lower=0)
for col in top_num_features:
    p99 = X[col].quantile(0.99)
    X[col] = X[col].clip(upper=p99)
for col in top_num_features:
    X[col] = np.log1p(X[col])

# categorical
le2 = LabelEncoder()
for col in top_cat_features:
    X[col] = X[col].astype(str).fillna("Missing")
    X[col] = le2.fit_transform(X[col])

print("Preprocessing complete!")
print("X shape:", X.shape, "y:", y_encoded.shape)


Font setup complete!
Loaded customer_train: (400000, 78)
Loaded credit_train: (400000, 42)
Loaded sales_train: (400000, 406)
Loaded billing_train: (400000, 46)
Loaded balance_train: (400000, 82)
Loaded channel_train: (400000, 105)
Loaded marketing_train: (400000, 64)
Loaded performance_train: (400000, 49)
customer_train_df created: (400000, 78)
credit_train_df created: (400000, 42)
sales_train_df created: (400000, 406)
billing_train_df created: (400000, 46)
balance_train_df created: (400000, 82)
channel_train_df created: (400000, 105)
marketing_train_df created: (400000, 64)
performance_train_df created: (400000, 49)
Merge done: (400000, 857)
Numeric: 351
Categorical: 505
Final preprocessing...
Preprocessing complete!
X shape: (400000, 100) y: (400000,)


In [23]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import numpy as np


# ===============================================================
# 1. Train / Validation / Test Split (60 / 20 / 20)
# ===============================================================
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y_encoded, test_size=0.20, random_state=42, stratify=y_encoded
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full,
    test_size=0.25,          # 0.25 * 0.80 = 0.20
    random_state=42,
    stratify=y_train_full
)

print("Train:", X_train.shape)
print("Valid:", X_valid.shape)
print("Test :", X_test.shape)


# ===============================================================
# 2. Cross-validation 객체 생성
# ===============================================================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# ===============================================================
# 3. A_only test_set 생성
# ===============================================================
A_LABEL = int(np.where(le.classes_ == 'A')[0][0])
print("A_LABEL:", A_LABEL)
mask_A = (y_test == A_LABEL)
X_test_A = X_test.loc[mask_A] if hasattr(X_test, "loc") else X_test[mask_A]
y_test_A = y_test[mask_A]


# ===============================================================
# 3. 공통 평가 함수
# ===============================================================
def eval_model(model, X, y, name):
    pred = model.predict(X)
    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred, average='macro')
    print(f"[{name}] Accuracy={acc:.4f}, Macro F1={f1:.4f}")
    return acc, f1

def eval_A_only(model, X_A, y_A, name):
    # Predict on A-only set
    y_pred = model.predict(X_A)

    # In some libraries, predict may return shape (n,1)
    y_pred = np.asarray(y_pred).reshape(-1)

    # Recall for class A on A-only set == accuracy on A-only set
    recall_A = (y_pred == A_LABEL).mean()

    # Distribution of predicted labels for true A
    unique, counts = np.unique(y_pred, return_counts=True)
    dist = dict(zip(unique.tolist(), counts.tolist()))

    # Convert numeric labels to original segment names (optional)
    dist_named = {le.inverse_transform([k])[0]: v for k, v in dist.items()}

    print(f"\n[{name}] A-only Evaluation")
    print(f"  A Recall (on A-only test) = {recall_A:.4f}")
    print(f"  Predicted label distribution (true=A): {dist_named}")

    return recall_A, dist_named


Train: (240000, 100)
Valid: (80000, 100)
Test : (80000, 100)
A_LABEL: 0


In [None]:
# ===============================================================
# 4-1. XGBoost 모델 및 하이퍼파라미터 튜닝
# ===============================================================
xgb_params = {
    "max_depth": [4, 6, 8],
    "learning_rate": [0.01, 0.03, 0.1],
    "n_estimators": [300, 600, 1000],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0]
}

xgb_model_baseline = XGBClassifier(
    tree_method="hist",
    predictor="auto",
    objective="multi:softmax",
    num_class=len(np.unique(y_encoded)),
    random_state=42
)


xgb_search_baseline = RandomizedSearchCV(
    estimator=xgb_model_baseline,
    param_distributions=xgb_params,
    n_iter=20,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

xgb_search_baseline.fit(X_train, y_train)
best_xgb_baseline = xgb_search_baseline.best_estimator_
print("\nBest XGBoost params:", xgb_search_baseline.best_params_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best XGBoost params: {'subsample': 1.0, 'n_estimators': 600, 'max_depth': 4, 'learning_rate': 0.1, 'colsample_bytree': 1.0}


In [None]:
# ===============================================================
# 4-2. LightGBM 모델 및 하이퍼파라미터 튜닝
# ===============================================================
lgbm_params = {
    "num_leaves": [31, 63, 127],
    "max_depth": [-1, 5, 10],
    "learning_rate": [0.01, 0.03, 0.05],
    "n_estimators": [500, 1000, 1500]
}

lgbm_model_baseline = LGBMClassifier(
    objective="multiclass",
    num_class=5,
    boosting_type="gbdt",
    random_state=42
)


lgbm_search_baseline = RandomizedSearchCV(
    estimator=lgbm_model_baseline,
    param_distributions=lgbm_params,
    n_iter=20,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

lgbm_search_baseline.fit(X_train, y_train)
best_lgbm_baseline = lgbm_search_baseline.best_estimator_
print("\nBest LGBM params:", lgbm_search_baseline.best_params_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12555
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 100
[LightGBM] [Info] Start training from score -7.813683
[LightGBM] [Info] Start training from score -9.749337
[LightGBM] [Info] Start training from score -2.934402
[LightGBM] [Info] Start training from score -1.927465
[LightGBM] [Info] Start training from score -0.222071

Best LGBM params: {'num_leaves': 127, 'n_estimators': 500, 'max_depth': -1, 'learning_rate': 0.01}


In [None]:
# ===============================================================
# 4-3. CatBoost 모델 및 하이퍼파라미터 튜닝
# ===============================================================
cat_params = {
    "depth": [4, 6, 8],
    "learning_rate": [0.01, 0.03, 0.1],
    "iterations": [500, 1000, 1500],
    "l2_leaf_reg": [1, 3, 5, 7]
}

cat_model_baseline = CatBoostClassifier(
    loss_function="MultiClass",
    task_type="GPU",
    devices='0',
    random_seed=42,
    verbose=0,
    thread_count=1,
    gpu_ram_part=0.7
)

cat_search_baseline = RandomizedSearchCV(
    estimator=cat_model_baseline,
    param_distributions=cat_params,
    n_iter=15,
    scoring="f1_macro",
    cv=cv,
    n_jobs=1,
    verbose=1
)

cat_search_baseline.fit(X_train, y_train)
best_cat_baseline = cat_search_baseline.best_estimator_
print("\nBest CatBoost params:", cat_search_baseline.best_params_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits

Best CatBoost params: {'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iterations': 1000, 'depth': 8}


In [None]:
# ===============================================================
# 5. Validation 및 Test 성능 비교
# ===============================================================
print("\n====== VALIDATION PERFORMANCE ======")
eval_model(best_xgb, X_valid, y_valid, "XGBoost Valid")
eval_model(best_lgbm, X_valid, y_valid, "LightGBM Valid")
eval_model(best_cat, X_valid, y_valid, "CatBoost Valid")

print("\n====== FINAL TEST PERFORMANCE ======") 
eval_model(best_xgb, X_test, y_test, "XGBoost Test")
eval_model(best_lgbm, X_test, y_test, "LightGBM Test")
eval_model(best_cat, X_test, y_test, "CatBoost Test")


[XGBoost Valid] Accuracy=0.8967, Macro F1=0.4615
[LightGBM Valid] Accuracy=0.8930, Macro F1=0.4458
[CatBoost Valid] Accuracy=0.8967, Macro F1=0.4518

[XGBoost Test] Accuracy=0.8955, Macro F1=0.4565
[LightGBM Test] Accuracy=0.8922, Macro F1=0.4618
[CatBoost Test] Accuracy=0.8953, Macro F1=0.4570


(0.8952625, 0.45701615957579317)

In [None]:
print("\n====== A_only PERFORMANCE ======")
eval_A_only(best_xgb, X_test_A, y_test_A, "XGBoost")
eval_A_only(best_lgbm, X_test_A, y_test_A, "LightGBM")
eval_A_only(best_cat, X_test_A, y_test_A, "CatBoost")

In [None]:
# ===============================================================
# 4-4. best_param 기반 학습(재탐색 방지용)
# ===============================================================
BEST_PARAMS_XGB = {
    "max_depth": 4,
    "learning_rate": 0.1,
    "n_estimators": 600,
    "subsample": 1.0,
    "colsample_bytree": 1.0
}

BEST_PARAMS_LGBM = {
    "num_leaves": 127,
    "max_depth": -1,
    "learning_rate": 0.01,
    "n_estimators": 500
}

BEST_PARAMS_CAT = {
    "depth": 8,
    "learning_rate": 0.1,
    "iterations": 1000,
    "l2_leaf_reg": 5
}


# -----------------------------
# 2) Train baseline models (no class weights)
# -----------------------------
num_classes = len(le.classes_)  # safer than len(np.unique(y))

# XGBoost baseline (CPU)
xgb_baseline = XGBClassifier(
    **BEST_PARAMS_XGB,
    tree_method="hist",
    predictor="auto",
    objective="multi:softmax",
    num_class=num_classes,
    random_state=42
)
xgb_baseline.fit(X_train, y_train)

# LightGBM baseline (CPU)
lgbm_baseline = LGBMClassifier(
    **BEST_PARAMS_LGBM,
    objective="multiclass",
    num_class=num_classes,
    boosting_type="gbdt",
    random_state=42
)
lgbm_baseline.fit(X_train, y_train)

# CatBoost baseline (GPU)
cat_baseline = CatBoostClassifier(
    **BEST_PARAMS_CAT,
    loss_function="MultiClass",
    task_type="GPU",
    devices="0",
    random_seed=42,
    verbose=0,
    thread_count=1,
    gpu_ram_part=0.7
)
cat_baseline.fit(X_train, y_train)




Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12555
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 100
[LightGBM] [Info] Start training from score -7.813683
[LightGBM] [Info] Start training from score -9.749337
[LightGBM] [Info] Start training from score -2.934402
[LightGBM] [Info] Start training from score -1.927465
[LightGBM] [Info] Start training from score -0.222071

[XGBoost Baseline Valid] Accuracy=0.8967, Macro F1=0.4615
[LightGBM Baseline Valid] Accuracy=0.8930, Macro F1=0.4458
[CatBoost Baseline Valid] Accuracy=0.8967, Macro F1=0.4518

[XGBoost Baseline Test] Accuracy=0.8955, Macro F1=0.4565
[LightGBM Baseline Test] Accuracy=0.8922, Macro F1=0.4618
[CatBoost Baseline Test] Accuracy=0.8953, Macro F1=0.4570


(0.8952625, 0.45701615957579317)

In [29]:
# -----------------------------
# 3) Evaluate on Validation / Test
# -----------------------------
print("\n====== BASELINE VALIDATION PERFORMANCE ======")
eval_model(xgb_baseline, X_valid, y_valid, "XGBoost Baseline Valid")
eval_model(lgbm_baseline, X_valid, y_valid, "LightGBM Baseline Valid")
eval_model(cat_baseline, X_valid, y_valid, "CatBoost Baseline Valid")

print("\n====== BASELINE FINAL TEST PERFORMANCE ======")
eval_model(xgb_baseline, X_test, y_test, "XGBoost Baseline Test")
eval_model(lgbm_baseline, X_test, y_test, "LightGBM Baseline Test")
eval_model(cat_baseline, X_test, y_test, "CatBoost Baseline Test")

print("\n====== BASELINE A ONLY PERFORMANCE ======")
eval_A_only(xgb_baseline, X_test_A, y_test_A, "XGBoost Baseline Aonly")
eval_A_only(lgbm_baseline, X_test_A, y_test_A, "LightGBM Baseline Aonly")
eval_A_only(cat_baseline, X_test_A, y_test_A, "CatBoost Baseline Aonly")


[XGBoost Baseline Valid] Accuracy=0.8967, Macro F1=0.4615
[LightGBM Baseline Valid] Accuracy=0.8930, Macro F1=0.4458
[CatBoost Baseline Valid] Accuracy=0.8967, Macro F1=0.4518

[XGBoost Baseline Test] Accuracy=0.8955, Macro F1=0.4565
[LightGBM Baseline Test] Accuracy=0.8922, Macro F1=0.4618
[CatBoost Baseline Test] Accuracy=0.8953, Macro F1=0.4570


[XGBoost Baseline Aonly] A-only Evaluation
  A Recall (on A-only test) = 0.0312
  Predicted label distribution (true=A): {'A': 1, 'C': 31}

[LightGBM Baseline Aonly] A-only Evaluation
  A Recall (on A-only test) = 0.0000
  Predicted label distribution (true=A): {'C': 32}

[CatBoost Baseline Aonly] A-only Evaluation
  A Recall (on A-only test) = 0.0312
  Predicted label distribution (true=A): {'A': 1, 'C': 31}


(np.float64(0.03125), {'A': 1, 'C': 31})

y 가중치 부여(데이터 불균형)

In [16]:
from collections import Counter

# y_train은 이미 LabelEncoder로 0~4 인코딩된 상태라고 가정
counter = Counter(y_train)
print("Class counts:", counter)

total_samples = sum(counter.values())
num_classes = len(counter)

# 표준 class weight 공식
raw_class_weights = {
    cls: total_samples / (num_classes * cnt)
    for cls, cnt in counter.items()
}

print("Raw class weights:", raw_class_weights)

Class counts: Counter({np.int64(4): 192206, np.int64(3): 34924, np.int64(2): 12759, np.int64(0): 97, np.int64(1): 14})
Raw class weights: {np.int64(4): 0.2497320583124356, np.int64(3): 1.3744130111098385, np.int64(2): 3.7620503174229953, np.int64(0): 494.8453608247423, np.int64(1): 3428.5714285714284}


In [17]:
## 가중치 과도화 방지 코드
MAX_WEIGHT = 50

class_weights = {
    cls: min(weight, MAX_WEIGHT)
    for cls, weight in raw_class_weights.items()
}

print("Capped class weights:", class_weights)

Capped class weights: {np.int64(4): 0.2497320583124356, np.int64(3): 1.3744130111098385, np.int64(2): 3.7620503174229953, np.int64(0): 50, np.int64(1): 50}


In [19]:
# ===============================================================
# 4-1. XGBoost 모델 및 하이퍼파라미터 튜닝 with Weight
# ===============================================================
xgb_params = {
    "max_depth": [4, 6, 8],
    "learning_rate": [0.01, 0.03, 0.1],
    "n_estimators": [300, 600, 1000],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0]
}

xgb_model = XGBClassifier(
    tree_method="hist",
    predictor="auto",
    objective="multi:softmax",
    num_class=len(np.unique(y_encoded)),
    random_state=42
)


xgb_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_params,
    n_iter=20,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)
sample_weights = np.array([class_weights[y] for y in y_train])

xgb_search.fit(X_train, y_train, sample_weight=sample_weights)
best_xgb = xgb_search.best_estimator_
print("\nBest XGBoost params:", xgb_search.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best XGBoost params: {'subsample': 1.0, 'n_estimators': 600, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 1.0}


In [20]:
# ===============================================================
# 4-2. LightGBM 모델 및 하이퍼파라미터 튜닝 with Weight
# ===============================================================
lgbm_params = {
    "num_leaves": [31, 63, 127],
    "max_depth": [-1, 5, 10],
    "learning_rate": [0.01, 0.03, 0.05],
    "n_estimators": [500, 1000, 1500]
}

lgbm_model = LGBMClassifier(
    objective="multiclass",
    num_class=5,
    boosting_type="gbdt",
    class_weight=class_weights,
    random_state=42
)


lgbm_search = RandomizedSearchCV(
    estimator=lgbm_model,
    param_distributions=lgbm_params,
    n_iter=20,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
    verbose=1
)

lgbm_search.fit(X_train, y_train)
best_lgbm = lgbm_search.best_estimator_
print("\nBest LGBM params:", lgbm_search.best_params_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016497 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12555
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 100
[LightGBM] [Info] Start training from score -3.428652
[LightGBM] [Info] Start training from score -5.364306
[LightGBM] [Info] Start training from score -1.136430
[LightGBM] [Info] Start training from score -1.136430
[LightGBM] [Info] Start training from score -1.136430

Best LGBM params: {'num_leaves': 127, 'n_estimators': 1500, 'max_depth': -1, 'learning_rate': 0.03}


In [21]:
# ===============================================================
# 4-3. CatBoost 모델 및 하이퍼파라미터 튜닝 with Weight
# ===============================================================

labels = np.unique(y_train)

cat_class_weights = [
    class_weights.get(i, 1.0)   # 혹시 없는 클래스 대비
    for i in range(num_classes)
]

cat_params = {
    "depth": [4, 6, 8],
    "learning_rate": [0.01, 0.03, 0.1],
    "iterations": [500, 1000, 1500],
    "l2_leaf_reg": [1, 3, 5, 7]
}

cat_model = CatBoostClassifier(
    loss_function="MultiClass",
    task_type="GPU",
    devices='0',
    class_weights=cat_class_weights,
    random_seed=42,
    verbose=0,
    thread_count=1,
    gpu_ram_part=0.7
)

cat_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=cat_params,
    n_iter=15,
    scoring="f1_macro",
    cv=cv,
    n_jobs=1,
    verbose=1
)



cat_search.fit(X_train, y_train)
best_cat = cat_search.best_estimator_
print("\nBest CatBoost params:", cat_search.best_params_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits

Best CatBoost params: {'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 1500, 'depth': 4}


In [22]:
# ===============================================================
# 5. Validation 및 Test 성능 비교 with Weight
# ===============================================================
print("\n====== VALIDATION PERFORMANCE ======")
eval_model(best_xgb, X_valid, y_valid, "XGBoost Valid")
eval_model(best_lgbm, X_valid, y_valid, "LightGBM Valid")
eval_model(best_cat, X_valid, y_valid, "CatBoost Valid")

print("\n====== FINAL TEST PERFORMANCE ======") 
eval_model(best_xgb, X_test, y_test, "XGBoost Test")
eval_model(best_lgbm, X_test, y_test, "LightGBM Test")
eval_model(best_cat, X_test, y_test, "CatBoost Test")


[XGBoost Valid] Accuracy=0.8325, Macro F1=0.4543
[LightGBM Valid] Accuracy=0.8894, Macro F1=0.4673
[CatBoost Valid] Accuracy=0.8523, Macro F1=0.4642

[XGBoost Test] Accuracy=0.8304, Macro F1=0.4408
[LightGBM Test] Accuracy=0.8877, Macro F1=0.4504
[CatBoost Test] Accuracy=0.8516, Macro F1=0.4563


(0.85155, 0.45628653879653774)

In [24]:
print("\n====== A_only PERFORMANCE ======")
eval_A_only(best_xgb, X_test_A, y_test_A, "XGBoost")
eval_A_only(best_lgbm, X_test_A, y_test_A, "LightGBM")
eval_A_only(best_cat, X_test_A, y_test_A, "CatBoost")



[XGBoost] A-only Evaluation
  A Recall (on A-only test) = 0.3125
  Predicted label distribution (true=A): {'A': 10, 'C': 22}

[LightGBM] A-only Evaluation
  A Recall (on A-only test) = 0.0000
  Predicted label distribution (true=A): {'C': 32}

[CatBoost] A-only Evaluation
  A Recall (on A-only test) = 0.1250
  Predicted label distribution (true=A): {'A': 4, 'C': 28}


(np.float64(0.125), {'A': 4, 'C': 28})