In [4]:
# Load in relevant modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

from src.data import prepare_train_valid_test

# Silence pandas' SettingWithCopyWarning for the whole notebook session.
# NOTE(review): this hides the warning globally, including for code
# imported from src/ -- confirm that is intended.
pd.set_option('mode.chained_assignment', None)

In [5]:
# Read the raw CSV into a DataFrame. The path is relative, so the notebook
# has to be run from the directory that contains data/.
creditcard_csv = 'data/creditcard.csv'
df = pd.read_csv(creditcard_csv)

In [40]:
# Split the data before any modelling. The middle return value (the
# validation slot) is discarded; valid_prop=0 presumably leaves it empty
# and test_prop=0.3 presumably sets the test share -- confirm against
# src.data.prepare_train_valid_test.
df_train, _, df_test = prepare_train_valid_test(
    df,
    valid_prop=0,
    test_prop=0.3,
)

In [113]:
# Create variables that need to be passed into isolation forest.
# `contamination` is the share of points the forest should flag as outliers;
# use the observed fraud rate in the training data.
contamination = len(df_train[df_train['Class'] == 1]) / len(df_train)

# Fix the seed: an isolation forest is built from random choices, so without
# it the proportions printed below change on every re-run of the notebook.
RANDOM_STATE = 42

# Model inputs: every column except the first and the last (the first is
# skipped because time is not used; the last is presumably the Class label
# -- confirm the column order returned by prepare_train_valid_test).
feature_cols = df_train.columns[1:-1]

# Create the isolation forest
if_model = IsolationForest(
    n_estimators=100,
    contamination=contamination,
    random_state=RANDOM_STATE,
)

# Fit the model; note that for this model we are not looking at time
if_model.fit(df_train[feature_cols])

# See if there is any overlap between what the model identifies as
# anomalous and what is labeled as fraud.
# Positional row numbers per class, so they line up with the prediction
# array regardless of what index df_train carries.
train_labels = df_train['Class'].to_numpy()
train_fraud_indices = np.flatnonzero(train_labels == 1).tolist()
train_no_fraud_indices = np.flatnonzero(train_labels == 0).tolist()

# The code below treats a prediction of -1 as "anomalous"
preds = if_model.predict(df_train[feature_cols])
train_fraud_preds = preds[train_fraud_indices]
train_no_fraud_preds = preds[train_no_fraud_indices]
prop_anomalous_of_frauds = np.count_nonzero(train_fraud_preds == -1) / len(train_fraud_preds)
prop_anomalous_of_not_frauds = np.count_nonzero(train_no_fraud_preds == -1) / len(train_no_fraud_preds)

print(f'proportion of frauds that are anomalous: {np.round(100*prop_anomalous_of_frauds, 2)}%')
print(f'proportion of not frauds that are anomalous: {np.round(100*prop_anomalous_of_not_frauds, 2)}%')

proportion of frauds that are anomalous: 37.24%
proportion of not frauds that are anomalous: 0.12%


In [114]:
# Verify that results apply to the test set.
# Renumber the rows once so the labels are positions that line up with the
# prediction array.
test_rows = df_test.reset_index()
test_fraud_indices = test_rows.index[test_rows['Class'] == 1].tolist()
test_no_fraud_indices = test_rows.index[test_rows['Class'] == 0].tolist()

# Score the test features with the already-fitted model (same column slice
# as used for fitting: everything except the first and last column).
test_features = df_test[df_test.columns[1:-1]]
preds = if_model.predict(test_features)

# A prediction of -1 is counted as "anomalous"
test_fraud_preds = preds[test_fraud_indices]
test_no_fraud_preds = preds[test_no_fraud_indices]
prop_anomalous_of_frauds = np.count_nonzero(test_fraud_preds == -1) / len(test_fraud_preds)
prop_anomalous_of_not_frauds = np.count_nonzero(test_no_fraud_preds == -1) / len(test_no_fraud_preds)

print(f'proportion of frauds that are anomalous: {np.round(100*prop_anomalous_of_frauds, 2)}%')
print(f'proportion of not frauds that are anomalous: {np.round(100*prop_anomalous_of_not_frauds, 2)}%')

proportion of frauds that are anomalous: 0.93%
proportion of not frauds that are anomalous: 0.13%


**Takeaway**: The isolation forest does not generalize well to the test set and/or there is data drift between the splits: it flags 37.24% of frauds as anomalous in the training data but only 0.93% in the test data. Because of this, I am not going to use isolation forests for fraud detection and will instead work with a model that is inherently more flexible.