In [1]:
import os, pandas as pd, numpy as np
from pathlib import Path

csv_path = Path('./creditcard.csv')

if not csv_path.exists():
    raise FileNotFoundError(
        "creditcard.csv가 디렉토리에 없습니다."
    )

df = pd.read_csv(csv_path)
print(df.shape, df.columns.tolist()[:10], '...')
df.head()

(284807, 31) ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9'] ...


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# 1. Random 데이터 분할 (train/valid/test = 60/20/20, random_state=42)

df_shuffled = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
n = len(df_shuffled)
n_train = int(n*0.6)
n_valid = int(n*0.2)
train = df_shuffled.iloc[:n_train]
valid = df_shuffled.iloc[n_train:n_train + n_valid]
test  = df_shuffled.iloc[n_train + n_valid:]
print(len(train), len(valid), len(test))

170884 56961 56962


In [None]:
# 2. 데이터 전처리

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

feature_cols = [c for c in df.columns if c.startswith('V')] + ['Amount'] #입력(feature)으로 쓸 열
target_col = 'Class' #정답(label)으로 쓸 열

#x = 입력 feature, y = 정답 // train/valid/test에서 X(입력), y(정답) 분리
X_train = train[feature_cols].copy()
y_train = train[target_col].values
X_valid = valid[feature_cols].copy()
y_valid = valid[target_col].values
X_test  = test[feature_cols].copy()
y_test  = test[target_col].values

# Amount 값을 평균 0 표준편차 1의 데이터로 표준화 (V1~V28 은 이미 표준화가 되어있는 상태이다.)
# train 기준으로 평균 0, 표준편차 1이 되도록 학습(fit_transform) 후 같은 기준으로 valid/test도 변환
scaler = StandardScaler()
X_train_amt = scaler.fit_transform(X_train[['Amount']])
X_valid_amt = scaler.transform(X_valid[['Amount']])
X_test_amt  = scaler.transform(X_test[['Amount']])

# [V1~V28 + 표준화된 amount 값] -> 모델에 넣을 최종 입력
v_cols = [c for c in feature_cols if c.startswith('V')] #V1 ~ V28 을 모은 리스트
X_train_final = np.hstack([train[v_cols].values, X_train_amt])
X_valid_final = np.hstack([valid[v_cols].values, X_valid_amt])
X_test_final  = np.hstack([test[v_cols].values,  X_test_amt])


In [4]:
# 3. 학습 및 평가 (기본 모델)

lr = LogisticRegression(max_iter=200, class_weight='balanced', n_jobs=None, solver='liblinear') #로지스틱 회귀모델 사용
lr.fit(X_train_final, y_train) #X_train_final: 아까 만든 V1~V28 + 표준화된 Amount / y_train: 0/1 라벨

def evaluate(X, y, mdl, name='set'):
    proba = mdl.predict_proba(X)[:,1] #class=1(사기) 확률 예측
    pr  = average_precision_score(y, proba) #PR-AUC 계산
    print(f'{name}: PR-AUC={pr:.4f}')
    return proba

p_valid_lr = evaluate(X_valid_final, y_valid, lr, 'valid')
p_test_lr  = evaluate(X_test_final,  y_test,  lr, 'test') #test 셋 각 샘플에 대해 모델이 예측한 사기일 확률


valid: PR-AUC=0.7433
test: PR-AUC=0.7312


In [None]:
# 3. 학습 및 평가 (비교 모델)

import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

# ---- 0) 튜닝용 데이터 = train + valid (test는 절대 포함 X)
X_tune = np.vstack([X_train_final, X_valid_final])
y_tune = np.concatenate([y_train, y_valid])

# ---- 1) 불균형 비율 기반 scale_pos_weight 기준값
n_pos = (y_tune == 1).sum()
n_neg = (y_tune == 0).sum()
base_spw = n_neg / n_pos

# ---- 2) Stratified K-fold 설정
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

def xgb_objective(trial):
    params = {
        # 탐색 파라미터
        "n_estimators": trial.suggest_int("n_estimators", 400, 1200),
        "max_depth": trial.suggest_int("max_depth", 3, 6),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1.0, 10.0),
        "gamma": trial.suggest_float("gamma", 0.0, 0.5),

        # 불균형 반영(탐색)
        "scale_pos_weight": trial.suggest_float(
            "scale_pos_weight", base_spw * 0.5, base_spw * 1.5
        ),

        # 고정 파라미터 (조건을 항상 동일하게!)
        "objective": "binary:logistic",
        "eval_metric": "aucpr",
        "tree_method": "hist",
        "random_state": 42,
        "n_jobs": -1,
    }

    pr_scores = []

    # ---- 3) K-fold CV로 평균 PR-AUC 계산
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_tune, y_tune), start=1):
        X_tr, X_va = X_tune[tr_idx], X_tune[va_idx]
        y_tr, y_va = y_tune[tr_idx], y_tune[va_idx]

        model = XGBClassifier(**params)
        model.fit(X_tr, y_tr, verbose=False)

        proba_va = model.predict_proba(X_va)[:, 1]
        pr_auc = average_precision_score(y_va, proba_va)
        pr_scores.append(pr_auc)

    return float(np.mean(pr_scores))  # Optuna는 이 값을 최대화

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=20)

print("Best CV PR-AUC:", study.best_value)
print("Best params:", study.best_trial.params)

best_params = study.best_trial.params

In [7]:
# Optuna에서 찾은 best_params에 고정 세팅 섞기
final_params = {
    **best_params,
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "tree_method": "hist",
    "random_state": 42,
    "n_jobs": -1,
}

xgb_final = XGBClassifier(**final_params)
xgb_final.fit(X_tune, y_tune, verbose=False)

p_valid_xgb = evaluate(X_valid_final, y_valid, xgb_final, 'valid')
p_test_xgb  = evaluate(X_test_final,  y_test,  xgb_final, 'test')

valid: PR-AUC=0.9998
test: PR-AUC=0.9043


In [8]:
# 4. Test set에 대한 사기 확률 예측
os.makedirs('Score', exist_ok=True)
pd.DataFrame({'proba': p_test_xgb}).to_csv('Score/score.csv', index=False)