# Anomaly detection

First install and import needed packages and libraries

In [1]:
# !pip install numpy scipy pandas matplotlib scikit-learn missingno imbalanced-learn pyod xgboost threadpoolctl

In [2]:
import numpy as np
import pandas as pd

We'll use the [Credit Card Fraud Detection dataset](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud). 

It contains data about more than 284 000 credit card transactions, of which barely 492 were fraud. It means the classification is extremely imbalanced. All features are numerical and there's no missing values. Most of the features are unnamed for the sake of anonymity. The only public features are "Time" and "Amount".

Because of the nature of the dataset, authors have proposed use of **Area Under Precision-Recall Curve (AUPRC)** metric.

### Preprocessing

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

df = pd.read_parquet("credit_card_fraud_data.parquet")

df = df.drop(columns="Time")
y = df.pop("Class")

In [4]:
y_pre_count = (y == 1).sum()
y_pre_perc = y_pre_count / len(y)

print(f"Fraud class before resampling: {100 * y_pre_perc:.2f}% of the dataset")

Fraud class before resampling: 0.17% of the dataset


Randomly select 50000 elements of the negative class

In [5]:
sampling_strategy = {0: 50000, 1: (y == 1).sum()}
random_under_sampler = RandomUnderSampler(
    sampling_strategy=sampling_strategy, random_state=0
)
df, y = random_under_sampler.fit_resample(df, y)

Split and scale the data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.25, random_state=0, stratify=y
)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
y_pos_count = (y == 1).sum()
y_pos_perc = y_pos_count / len(y)

print(f"Fraud class after resampling: {100 * y_pos_perc:.2f}% of the dataset")

Fraud class after resampling: 0.97% of the dataset


We'll use 3 unsupervised learning algorithms for outlier detection
- kNN
- Local Outlier Factor (LOF)
- Isolation Forest

In [8]:
from sklearn.metrics import average_precision_score


def assess_anomaly_detection_model(estimator, X_test, y_test) -> None:
    y_pred_score = estimator.predict_proba(X_test)

    # in case pred score is a distribution. That depends on the used estimator
    if len(y_pred_score.shape) > 1:
        y_pred_score = y_pred_score[:, 1]

    auprc = average_precision_score(y_test, y_pred_score)
    print(f"AUPRC: {100 * auprc:.2f}%")

In [9]:
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF

# General rule of thumb is to use the fraction of training dataset as contamination factor
# However, it might not always be a known number
contamination = (y == 1).sum() / len(y)

knn = KNN(contamination=contamination, n_jobs=-1)
knn.fit(X_train)
print("kNN metrics")
assess_anomaly_detection_model(knn, X_test, y_test)
print()

lof = LOF(contamination=contamination, n_jobs=-1)
lof.fit(X_train)
print("Local Outlier Factor metrics")
assess_anomaly_detection_model(lof, X_test, y_test)
print()

iforest = IForest(contamination=contamination, random_state=0, n_jobs=-1)
iforest.fit(X_train)
print("Isolation Forest metrics")
assess_anomaly_detection_model(iforest, X_test, y_test)
print()

kNN metrics
AUPRC: 16.87%
Local Outlier Factor metrics
AUPRC: 1.02%
Isolation Forest metrics
AUPRC: 55.85%


Isolation forest achieved a really high AUPRC score! AUPRC score generally is very low even for the best models. Achieving score of over 30% is a good score.

Let's perform stepwise hyperparameter tuning on an isolation forest model

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import warnings

iforest = IForest(contamination=contamination, random_state=0, n_jobs=-1, verbose=False)

param_grid = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_samples": [100, 200, 256, 300, 400, 500, "auto"],
}

scorer = make_scorer(average_precision_score)

cv = GridSearchCV(
    estimator=iforest,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    error_score="raise",
    n_jobs=-1,
)

# We get many warnings about passing y value. We need it for tuning
warnings.filterwarnings("ignore", category=UserWarning)
cv.fit(X_train, y_train)
warnings.filterwarnings("default", category=UserWarning)

assess_anomaly_detection_model(cv.best_estimator_, X_test, y_test)
print(cv.best_params_)

AUPRC: 55.64%
{'max_samples': 300, 'n_estimators': 400}


That's a great result. 

Let's see if we can improve it using XGBoost Outlier Detection (XGBOD)

In [12]:
knn_list = [
    KNN(contamination=contamination, n_jobs=-1, n_neighbors=nn)
    for nn in [1, 3, 5, 10, 20, 30, 40, 50]
]
iforest_list = [
    IForest(contamination=contamination, random_state=0, n_jobs=-1, n_estimators=ne)
    for ne in [50, 100, 200, 300]
]

estimator_list = knn_list + iforest_list

from pyod.models.xgbod import XGBOD

xgbod_model = XGBOD(estimator_list, n_jobs=-1, random_state=0)

warnings.filterwarnings("ignore", category=UserWarning)
xgbod_model.fit(X_train, y_train)
warnings.filterwarnings("default", category=UserWarning)

In [13]:
assess_anomaly_detection_model(xgbod_model, X_test, y_test)

AUPRC: 88.97%


We have a very high score however, the prediction time was very high