In [146]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub
import xgboost as xgb # 監督式模型
from sklearn.ensemble import IsolationForest # 非監督式模型

# General settings shared by every cell below. Do not change TEST_SIZE.
# RANDOM_SEED is passed as `random_state` to train_test_split and to both models
# so that the split and the fitted models are reproducible.
RANDOM_SEED = 42
# Fraction of rows passed as `test_size` to train_test_split.
TEST_SIZE = 0.3

# Result reporting
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy, precision, recall, F1 and a classification report.

    Precision, recall and F1 are computed with scikit-learn's defaults for
    binary targets, i.e. they describe the positive class (label 1).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.
    model_name : str, default "Model"
        Name shown in the report header.

    Returns
    -------
    None
        The metrics are only printed; nothing is returned, so calling this as
        the last expression of a cell produces no extra ``Out[]`` display.
    """
    # Use zero_division=0 for every metric so that a degenerate prediction
    # (e.g. no sample predicted positive) is reported as 0 consistently,
    # instead of precision being silent while recall/F1/report emit warnings.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

In [3]:
# Load the dataset (downloaded through kagglehub)
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# Prepare data: drop the 'Time' column.
# NOTE: 'Amount' is intentionally NOT standardized here. Fitting a
# StandardScaler on the full table before the train/test split lets test-set
# statistics leak into preprocessing. Any scaling must be fitted on the
# training split only, after train_test_split. Tree-based models are
# insensitive to monotonic rescaling of a feature, so they do not need it.
data = data.drop(['Time'], axis=1)

# Split the rows by class: 1 = fraud, 0 = non-fraud
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]

# Compute the counts once and reuse them in both summary lines
n_fraud = len(fraud)
n_nonfraud = len(nonfraud)
n_total = n_fraud + n_nonfraud
print(f'Fraudulent:{n_fraud}, non-fraudulent:{n_nonfraud}')
print(f'the positive class (frauds) percentage: {n_fraud}/{n_total} ({n_fraud / n_total * 100:.3f}%)')

Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)


In [68]:
# Extract features and labels.
# Labels are kept 1-D (shape (n_samples,)), the layout scikit-learn estimators
# and metrics expect; a (n_samples, 1) column vector is accepted only through
# implicit conversion. Selecting columns by name is also clearer than a
# boolean mask passed to iloc.
X = np.asarray(data.drop(columns=['Class']))
Y = np.asarray(data['Class'])

# Split into training and test sets. Stratifying on the labels keeps the class
# proportions the same in both splits, so the rare fraud class is represented
# in each of them.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y)

# Supervised model: XGBoost
xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.3,
    subsample=0.5,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=RANDOM_SEED
)

xgb_model.fit(x_train, y_train)

# Predict on the held-out split and report the results
y_pred = xgb_model.predict(x_test)
evaluation(y_test, y_pred, model_name="XGBoost")


XGBoost Evaluation:
         Accuracy: 0.9995786664794073
  Precision Score: 0.9590163934426229
     Recall Score: 0.7905405405405406
         F1 Score: 0.8666666666666667

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.96      0.79      0.87       148

    accuracy                           1.00     85443
   macro avg       0.98      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [145]:
# Extract features and labels
x = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])

# Split into training and test sets (stratified on the labels)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)

# Fit the scaler on the training split only, then apply it to the test split
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Use a small subset of normal (non-fraud) training rows as the fitting basis
N_NORMAL_TRAIN = 1000
n_x_train = x_train[y_train == 0][:N_NORMAL_TRAIN]

# Derive the contamination level from the TRAINING labels only. Computing it
# from the whole dataset would use test-set labels to configure the model.
train_fraud_rate = float(np.mean(y_train == 1))

# Unsupervised model: IsolationForest
model = IsolationForest(
    n_estimators=5,
    contamination=train_fraud_rate,
    max_samples=0.825,
    max_features=1,
    random_state=RANDOM_SEED
)

model.fit(n_x_train)

# Predict and remap the output to the dataset's label convention:
# IsolationForest returns -1 for anomalies and 1 for normal points;
# map -1 -> 1 (fraud) and everything else -> 0 (normal).
y_pred = model.predict(x_test)
y_pred = np.where(y_pred == -1, 1, 0)

# Report the results
evaluation(y_test, y_pred, model_name="IsolationForest")


IsolationForest Evaluation:
         Accuracy: 0.9984785178423042
  Precision Score: 0.5584415584415584
     Recall Score: 0.581081081081081
         F1 Score: 0.5695364238410596

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.56      0.58      0.57       148

    accuracy                           1.00     85443
   macro avg       0.78      0.79      0.78     85443
weighted avg       1.00      1.00      1.00     85443

