In [1]:
import numpy as np, pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform, loguniform

In [2]:
data = pd.read_csv("./data/users_prep.csv")
data = data.drop("Unnamed: 0", axis=1) 
data = data.dropna()

In [3]:
data.head()

Unnamed: 0,id,current_age,retirement_age,birth_year,birth_month,gender,latitude,longitude,per_capita_income,yearly_income,...,W_avg_median_amount,W_avg_count_transaction,M_avg_min_amount,M_avg_max_amount,M_avg_mean_amount,M_avg_median_amount,M_avg_count_transaction,monthly_evg_unique_merchants,gap_sec,fraud_risk
0,825,53,66,1966,11,Female,34.15,-117.76,29278,59696,...,71.547335,22.13035,2.589153,559.207627,80.417801,68.428941,96.398305,32.644068,18420.0,0.0
1,1746,53,68,1966,12,Female,40.76,-73.74,37891,77254,...,64.614815,10.881323,2.200085,686.285932,79.696503,66.091822,47.398305,28.025424,37530.0,1.0
2,1718,81,67,1938,11,Female,34.02,-117.89,22681,33483,...,31.666401,50.365759,0.978983,412.923729,33.051507,32.344788,219.389831,36.983051,7380.0,1.0
3,708,63,63,1957,1,Female,40.71,-73.99,163145,249925,...,61.903687,16.889105,5.520763,1576.102458,125.770742,58.425551,73.567797,33.559322,20040.0,0.0
4,1164,43,70,1976,9,Male,37.76,-122.44,53797,109687,...,87.13106,17.951362,4.826525,817.697966,96.40837,87.651398,78.194915,36.466102,20160.0,0.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1219 entries, 0 to 1997
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            1219 non-null   int64  
 1   current_age                   1219 non-null   int64  
 2   retirement_age                1219 non-null   int64  
 3   birth_year                    1219 non-null   int64  
 4   birth_month                   1219 non-null   int64  
 5   gender                        1219 non-null   object 
 6   latitude                      1219 non-null   float64
 7   longitude                     1219 non-null   float64
 8   per_capita_income             1219 non-null   int64  
 9   yearly_income                 1219 non-null   int64  
 10  total_debt                    1219 non-null   int64  
 11  credit_score                  1219 non-null   int64  
 12  num_credit_cards              1219 non-null   int64  
 13  fraud_co

In [5]:
# 학습 데이터 (예시)
X = data.drop(columns=['id', 'fraud_count', 'fraud_risk', 'birth_month', 'longitude', 'latitude'])
y = data['fraud_risk']

In [6]:
X

Unnamed: 0,current_age,retirement_age,birth_year,gender,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,chip_ratio,...,W_avg_mean_amount,W_avg_median_amount,W_avg_count_transaction,M_avg_min_amount,M_avg_max_amount,M_avg_mean_amount,M_avg_median_amount,M_avg_count_transaction,monthly_evg_unique_merchants,gap_sec
0,53,66,1966,Female,29278,59696,127613,787,5,0.231560,...,81.057181,71.547335,22.130350,2.589153,559.207627,80.417801,68.428941,96.398305,32.644068,18420.0
1,53,68,1966,Female,37891,77254,191349,701,5,0.434114,...,79.726211,64.614815,10.881323,2.200085,686.285932,79.696503,66.091822,47.398305,28.025424,37530.0
2,81,67,1938,Female,22681,33483,196,698,5,0.190590,...,33.045918,31.666401,50.365759,0.978983,412.923729,33.051507,32.344788,219.389831,36.983051,7380.0
3,63,63,1957,Female,163145,249925,202328,722,4,0.429674,...,126.519642,61.903687,16.889105,5.520763,1576.102458,125.770742,58.425551,73.567797,33.559322,20040.0
4,43,70,1976,Male,53797,109687,183855,675,1,0.389509,...,96.090390,87.131060,17.951362,4.826525,817.697966,96.408370,87.651398,78.194915,36.466102,20160.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992,31,72,1988,Female,13194,26900,74083,758,2,0.908438,...,15.993225,15.823071,29.737828,11.146290,248.217581,22.636976,22.632258,128.064516,23.064516,2100.0
1993,85,66,1934,Female,19025,35270,1769,731,6,0.411307,...,45.547766,40.084776,10.254864,6.469746,325.680424,44.708037,39.516822,44.669492,22.067797,33480.0
1995,32,70,1987,Male,23550,48010,87837,703,3,0.306986,...,23.130076,2.292130,22.447471,0.759576,538.056102,23.209078,1.863475,97.779661,18.610169,18960.0
1996,62,65,1957,Female,24218,49378,104480,740,4,0.257192,...,39.990010,18.858191,20.219844,1.700254,470.129153,40.469902,17.507881,88.076271,32.135593,18240.0


In [7]:
cat_cols = X.select_dtypes(include=["object","category"]).columns.tolist()
num_cols = X.columns.difference(cat_cols).tolist()

preprocess = ColumnTransformer([("num", StandardScaler(), num_cols),
                                ("cat", OneHotEncoder(handle_unknown="ignore", sparse=False), cat_cols)])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, stratify=y)

### Logistic / RF / GB / SVM / KNN / XGBoost / LGBM 

In [9]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=5000, class_weight="balanced", 
                                             solver="lbfgs", random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=500, class_weight="balanced", 
                                           n_jobs=-1, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "SVM_RBF": SVC(kernel="rbf", probability=True, class_weight="balanced", C=1.0, 
                   gamma="scale", random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=15, weights="distance"),
    "XGBoost": XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.05, subsample=0.8, 
                             colsample_bytree=0.8, objective="binary:logistic", 
                             eval_metric="logloss", n_jobs=-1, random_state=42),
    "LGBM": LGBMClassifier(n_estimators=1000, num_leaves=63, learning_rate=0.05, subsample=0.8, 
                           colsample_bytree=0.8, class_weight="balanced", n_jobs=-1, random_state=42)
    }

In [None]:
for name, clf in models.items():
    pipe = Pipeline([("prep", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    
    print("\n", "="*20, f"{name}", "="*20)
    print(classification_report(y_test, pred, digits=4))


              precision    recall  f1-score   support

         0.0     0.7009    0.5616    0.6236       146
         1.0     0.4961    0.6429    0.5600        98

    accuracy                         0.5943       244
   macro avg     0.5985    0.6023    0.5918       244
weighted avg     0.6186    0.5943    0.5980       244


              precision    recall  f1-score   support

         0.0     0.6348    0.7740    0.6975       146
         1.0     0.5000    0.3367    0.4024        98

    accuracy                         0.5984       244
   macro avg     0.5674    0.5554    0.5500       244
weighted avg     0.5807    0.5984    0.5790       244


              precision    recall  f1-score   support

         0.0     0.6667    0.7671    0.7134       146
         1.0     0.5526    0.4286    0.4828        98

    accuracy                         0.6311       244
   macro avg     0.6096    0.5978    0.5981       244
weighted avg     0.6209    0.6311    0.6208       244


              p

### 하이퍼파라미터 튜닝 

In [11]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
models = {
    "LogisticRegression": (LogisticRegression(max_iter=5000, class_weight="balanced", 
                                              solver="lbfgs", random_state=42),
                          {"clf__C": loguniform(1e-2, 1e2)}, 25),
    
    "RandomForest": (RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=42),
                    {"clf__n_estimators": randint(500, 1500),
                     "clf__max_depth": randint(3, 30),
                     "clf__min_samples_split": randint(2, 20),
                     "clf__min_samples_leaf": randint(1, 10),
                     "clf__max_features": ["sqrt", "log2", None]}, 35),
                     
    "GradientBoosting": (GradientBoostingClassifier(random_state=42),
                        {"clf__n_estimators": randint(300, 1200),
                         "clf__learning_rate": loguniform(0.01, 0.3),
                         "clf__max_depth": randint(2, 6),
                         "clf__subsample": uniform(0.6, 0.4),
                         "clf__min_samples_leaf": randint(1, 20)}, 35),
    
    "SVM_RBF": (SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42),
               {"clf__C": loguniform(1e-2, 1e3),
                "clf__gamma": loguniform(1e-4, 1e-1)}, 30),
    
    "KNN": (KNeighborsClassifier(),
            {"clf__n_neighbors": randint(3, 61),
             "clf__weights": ["uniform", "distance"],
             "clf__p": [1, 2]}, 30),

    "XGBoost": (XGBClassifier(objective="binary:logistic", eval_metric="aucpr", n_jobs=-1, 
                              tree_method="hist", random_state=42),
               {"clf__n_estimators": randint(400, 1500),
                "clf__max_depth": randint(3, 8),
                "clf__learning_rate": loguniform(0.01, 0.2),
                "clf__subsample": uniform(0.6, 0.4),
                "clf__colsample_bytree": uniform(0.5, 0.5),
                "clf__min_child_weight": randint(1, 10),
                "clf__gamma": uniform(0.0, 5.0),
                "clf__reg_alpha": loguniform(1e-8, 1e-1),
                "clf__reg_lambda": loguniform(1e-3, 10.0)}, 40),
    
    "LGBM": (LGBMClassifier(objective="binary", class_weight="balanced", n_jobs=-1, random_state=42),
            {"clf__n_estimators": randint(600, 2000),
             "clf__num_leaves": randint(31, 255),
             "clf__learning_rate": loguniform(0.02, 0.2),
             "clf__min_child_samples": randint(10, 120),
             "clf__subsample": uniform(0.6, 0.4),
             "clf__colsample_bytree": uniform(0.5, 0.5),
             "clf__reg_lambda": loguniform(1e-3, 10.0),
             "clf__reg_alpha": loguniform(1e-8, 1e-1)}, 40)
    }

In [13]:
best_models = {}
for name, (est, param_dist, n_iter) in models.items():
    pipe = Pipeline([("prep", preprocess), ("clf", est)])
    search = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist,
                                n_iter=n_iter, scoring="f1", cv=cv, n_jobs=-1,
                                verbose=1, random_state=42, refit=True)
    search.fit(X_train, y_train)
    best_models[name] = search.best_estimator_
    print(f"\n===== {name} best params =====")
    print(f"{search.best_params_}")
    print(f"best CV F1: {search.best_score_:.4f}")

Fitting 5 folds for each of 25 candidates, totalling 125 fits

===== LogisticRegression best params =====
{'clf__C': 8.471801418819974}
best CV F1: 0.5563
Fitting 5 folds for each of 35 candidates, totalling 175 fits

===== RandomForest best params =====
{'clf__max_depth': 3, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 9, 'clf__min_samples_split': 14, 'clf__n_estimators': 1244}
best CV F1: 0.5767
Fitting 5 folds for each of 35 candidates, totalling 175 fits

===== GradientBoosting best params =====
{'clf__learning_rate': 0.0187522094557864, 'clf__max_depth': 4, 'clf__min_samples_leaf': 18, 'clf__n_estimators': 1029, 'clf__subsample': 0.7799016533479063}
best CV F1: 0.4949
Fitting 5 folds for each of 30 candidates, totalling 150 fits

===== SVM_RBF best params =====
{'clf__C': 0.33347927286375834, 'clf__gamma': 0.00019634341572933326}
best CV F1: 0.5735
Fitting 5 folds for each of 30 candidates, totalling 150 fits

===== KNN best params =====
{'clf__n_neighbors': 13, 'clf__p':

---

### Bagging + OOB 

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score,average_precision_score,f1_score,classification_report,confusion_matrix

In [15]:
# 현재 X의 컬럼 순서를 기준으로 인덱스 맵 만들기
col_order = list(X.columns)
idx_num = [col_order.index(c) for c in num_cols]
idx_cat = [col_order.index(c) for c in cat_cols]

# 트리/배깅용: 스케일링 없이 숫자는 통과, 범주 원-핫
prep_tree = ColumnTransformer([("cat", OneHotEncoder(handle_unknown="ignore", sparse=True), idx_cat),
                               ("num", "passthrough", idx_num)], remainder="drop")

# 선형/메타용: 숫자 스케일링 + 범주 원-핫
prep_lin = ColumnTransformer([("cat", OneHotEncoder(handle_unknown="ignore", sparse=True), idx_cat), 
                              ("num", StandardScaler(), idx_num)], remainder="drop")

In [16]:
base_est = Pipeline([("prep", prep_tree),
                     ("clf", LGBMClassifier(objective="binary", n_estimators=500, num_leaves=100, learning_rate=0.05, 
                                            subsample=0.8, colsample_bytree=0.8, min_child_samples=50, reg_lambda=1.0,
                                            class_weight="balanced", random_state=42, n_jobs=-1))])

bag = BaggingClassifier(estimator=base_est, n_estimators=30, max_samples=0.8, bootstrap=True, 
                        oob_score=True, n_jobs=-1, random_state=42)

bag.fit(X, y)

In [17]:
def tune_threshold(y_true, proba, metric="f1"):
    grid = np.linspace(0.05, 0.95, 181)
    best_t, best = 0.5, -1
    for t in grid:
        pred = (proba >= t).astype(int)
        val = f1_score(y_true, pred) if metric=="f1" else None
        if val is not None and val > best:
            best, best_t = val, t
    return best_t, best

In [18]:
oob_df = bag.oob_decision_function_
oob_proba = oob_df[:, 1] if oob_df.ndim == 2 and oob_df.shape[1] == 2 else oob_df.ravel()
y_pred = (oob_proba >= 0.5).astype(int)

# (1) 임계값 0.5
y_pred_05 = (oob_proba >= 0.5).astype(int)
print("=== OOB classification_report @ threshold = 0.50 ===")
print(classification_report(y, y_pred_05, digits=4))

# (2) OOB에서 F1 기준 최적 임계값
thr_best, f1_best = tune_threshold(y, oob_proba, metric="f1")
y_pred_best = (oob_proba >= thr_best).astype(int)
print(f"\n=== OOB classification_report @ best F1 threshold = {thr_best:.3f} ===")
print(classification_report(y, y_pred_best, digits=4))

=== OOB classification_report @ threshold = 0.50 ===
              precision    recall  f1-score   support

         0.0     0.6663    0.7366    0.6997       729
         1.0     0.5351    0.4510    0.4895       490

    accuracy                         0.6218      1219
   macro avg     0.6007    0.5938    0.5946      1219
weighted avg     0.6135    0.6218    0.6152      1219


=== OOB classification_report @ best F1 threshold = 0.195 ===
              precision    recall  f1-score   support

         0.0     0.7625    0.3128    0.4436       729
         1.0     0.4554    0.8551    0.5943       490

    accuracy                         0.5308      1219
   macro avg     0.6090    0.5839    0.5190      1219
weighted avg     0.6391    0.5308    0.5042      1219

