1. 匯入必要套件

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from xgboost import XGBClassifier
import kagglehub

2. 讀取資料

In [2]:
# Download the Kaggle credit-card fraud dataset; the returned path is used as
# the directory that contains creditcard.csv.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

# Single preprocessing pipeline: cast the label to int, drop the Time column,
# and replace Amount with its standardized values.
data = (
    pd.read_csv(f"{path}/creditcard.csv")
    .astype({"Class": int})
    .drop(columns=["Time"])
    .assign(
        Amount=lambda frame: StandardScaler()
        .fit_transform(frame["Amount"].to_numpy().reshape(-1, 1))
        .ravel()
    )
)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...


100%|██████████| 66.0M/66.0M [00:00<00:00, 180MB/s]

Extracting files...





3. 使用 Isolation Forest 預測異常樣本

In [3]:
# Feature matrix for the anomaly detector: every column except the label.
# "is_outlier" is also removed when it already exists, so re-running this cell
# does not feed the flag from a previous run back into the Isolation Forest.
X = data.drop(columns=["Class"]).drop(columns=["is_outlier"], errors="ignore")
y = data["Class"]

iso_forest = IsolationForest(n_estimators=100, contamination=0.002, random_state=42)
iso_pred = iso_forest.fit_predict(X)

# IsolationForest labels anomalies as -1 and normal samples as 1; store the
# result as a 0/1 flag column on `data`.
# NOTE(review): the forest is fitted on all rows, including those that are
# later held out as the test set — confirm this is intended.
data["is_outlier"] = (iso_pred == -1).astype(int)


4. 將 is_outlier 當作額外特徵餵給 XGBoost

In [4]:
# Every column except the label is a feature, including is_outlier when the
# preceding cell has added it.
X = data.drop(columns=["Class"])
y = data["Class"]

# Hold out 30% of the rows for testing; stratify on the label so both splits
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# use_label_encoder is intentionally not passed: XGBoost reports it as an
# unused parameter, so it only produced a warning without affecting the model.
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


Parameters: { "use_label_encoder" } are not used.



5. 結果評估

In [5]:
def evaluate(y_true, y_pred, model_name="Model"):
    """Print classification metrics for a set of predictions.

    Prints a header with the model name, then accuracy, precision, recall and
    F1 score, followed by the full classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label shown in the header line.

    Returns:
        None. All results are written to standard output.
    """
    divider = "=" * 40
    print(f"\n{model_name} Evaluation")
    print(divider)

    # Label/scorer pairs, printed in the listed order.
    scorers = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1 Score :", f1_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_true, y_pred))

    report = classification_report(y_true, y_pred)
    print("\nClassification Report:\n", report)

evaluate(y_true=y_test, y_pred=y_pred, model_name="IsolationForest + XGBoost")



IsolationForest + XGBoost Evaluation
Accuracy : 0.9994850368081645
Precision: 0.9333333333333333
Recall   : 0.7567567567567568
F1 Score : 0.835820895522388

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.93      0.76      0.84       148

    accuracy                           1.00     85443
   macro avg       0.97      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443

