In [6]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:

# Directory holding the dataset. The FRAUD_DATA_DIR environment variable, when
# set, overrides the default so the notebook is not tied to one machine's
# drive layout; without it the original location is used.
data_path = os.environ.get('FRAUD_DATA_DIR', r'D:\project\fraud_detection\data')
data = "creditcard.csv"
# Read the CSV located at <data_path>/<data> into a DataFrame.
df = pd.read_csv(os.path.join(data_path, data))

In [3]:
# Target: the 'Class' column of the loaded frame.
y = df['Class']

# Features: every column of the loaded frame except 'Time' and 'Class'.
X = df.drop(columns=['Time', 'Class'])

# Replace the raw 'Amount' values with their standardized version;
# no other column is rescaled here.
scaler = StandardScaler()
X['Amount'] = scaler.fit_transform(X[['Amount']])

In [4]:
# Feature columns for the model. 'score' and 'anomaly' are written back onto X
# at the end of this cell, so they are excluded here: otherwise re-running the
# cell would train the forest on its own previous outputs. On a first run
# neither column exists yet and this is simply every column of X.
features = X.columns.drop(['score', 'anomaly'], errors='ignore')
X_feat = X[features].copy()

# IsolationForest hyperparameters, kept as named values so they are easy to tune.
n_estimators = 100
contamination = 'auto'
max_samples = 'auto'
max_features = 1.0
bootstrap = False
random_state = 13  # fixed seed so the fitted forest is reproducible

model = IsolationForest(
    n_estimators=n_estimators,
    contamination=contamination,
    max_samples=max_samples,
    max_features=max_features,
    bootstrap=bootstrap,
    random_state=random_state
)

# Fit on the feature-only copy, then score the same rows.
model.fit(X_feat)
# Continuous anomaly score per row (sorted ascending later to rank anomalies).
X['score'] = model.decision_function(X_feat)
# predict() labels rows as 1 / -1; remap to 0 / 1 so the column can be compared
# directly against the 0/1 'Class' labels during evaluation.
X['anomaly'] = model.predict(X_feat)
X['anomaly'] = X['anomaly'].map({1: 0, -1: 1})

In [7]:
# Attach the ground-truth labels to the scored frame, aligned on the row index.
X_merged = X.join(y, how='left')

# True labels vs. the model's 0/1 anomaly flag.
y_true = X_merged['Class']
y_pred = X_merged['anomaly']

cm = confusion_matrix(y_true, y_pred)
clf_report = classification_report(y_true, y_pred, target_names=['Normal', 'Fraud'])

# Show the confusion matrix first, then the per-class report.
for summary in (cm, clf_report):
    print(summary)

[[273357  10958]
 [    79    413]]
              precision    recall  f1-score   support

      Normal       1.00      0.96      0.98    284315
       Fraud       0.04      0.84      0.07       492

    accuracy                           0.96    284807
   macro avg       0.52      0.90      0.52    284807
weighted avg       1.00      0.96      0.98    284807



In [11]:
# How many of the lowest-scoring rows to inspect. Defined once so the slice
# and the printed message cannot drift apart.
top_k = 1000

# Work on a copy so the label column is not added to X itself.
X_eval = X.copy()
X_eval['Class'] = y

# Ascending sort puts the lowest scores first; those rows are treated as the
# top anomalies below.
X_sorted = X_eval.sort_values(by='score', ascending=True)

# Count rows labelled 1 among the first top_k; .get(1, 0) yields 0 when none
# of them carry that label.
count_fraud = X_sorted.head(top_k)['Class'].value_counts().get(1, 0)
print(f'Number of fraud cases in top {top_k} anomalies: {count_fraud}')

Number of fraud cases in top 1000 anomalies: 190
