In [1]:
import os
import numpy as np
import pandas as pd
from pyod.models.lof import LOF
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.lunar import LUNAR
from sklearn.metrics import roc_auc_score, precision_score
import time




In [2]:

folder_path = './benchad_datasets/'


def dataset_name_from_file(filename):
    """Derive a display name from a dataset archive filename.

    The text after the first underscore of the stem is used, so
    ``'12_fault.npz'`` becomes ``'fault'``. A filename without an underscore
    (or with nothing after it) falls back to the whole stem instead of
    raising ``IndexError``.
    """
    stem = os.path.splitext(filename)[0]
    _, _, rest = stem.partition('_')
    return rest or stem


def load_npz_datasets(path):
    """Load every ``.npz`` archive found below ``path``.

    Parameters
    ----------
    path : str
        Directory that is walked recursively. A missing directory yields
        two empty lists (``os.walk`` ignores the error by default).

    Returns
    -------
    (list of dict, list of str)
        One ``{array_name: ndarray}`` dict per archive and the matching
        display names, in sorted traversal order.
    """
    loaded, names = [], []
    for root, dirnames, filenames in os.walk(path):
        # os.walk honours in-place edits of dirnames; sorting both lists
        # makes the row order of the results table reproducible.
        dirnames.sort()
        for filename in sorted(filenames):
            # Skip anything that is not a dataset archive (README, .DS_Store, ...).
            if not filename.lower().endswith('.npz'):
                continue
            # Read the arrays eagerly so the archive's file handle is
            # closed right away instead of staying open for the session.
            with np.load(os.path.join(root, filename)) as archive:
                loaded.append({key: archive[key] for key in archive.files})
            names.append(dataset_name_from_file(filename))
    return loaded, names


datasets, dataset_names = load_npz_datasets(folder_path)

In [3]:
# Detector registry: display name -> PyOD detector class. The insertion
# order here also fixes the order of the metric columns below.
algorithms = dict(
    ABOD=ABOD,
    LOF=LOF,
    iForest=IForest,
    AutoEncoder=AutoEncoder,
    LUNAR=LUNAR,
)

# Detectors evaluated with the unsupervised protocol; every other entry of
# `algorithms` goes through the semi-supervised protocol.
unsupervised_algorithms = ['ABOD', 'LOF', 'iForest']

# Empty results table: four dataset descriptors followed by
# "<detector> Precision/AUC/Time" for each registered detector.
results = pd.DataFrame(columns=[
    'Dataset', '# Samples', '# Features', 'Anomaly Ratio',
    *(f'{detector} {metric}'
      for detector in algorithms
      for metric in ('Precision', 'AUC', 'Time')),
])

In [4]:
def evaluate_unsupervised_algorithm(algorithm, X,  y):
    """Fit a detector on the full dataset and score that same data.

    Parameters
    ----------
    algorithm : callable
        Detector class accepting a ``contamination`` keyword and exposing
        ``fit``, ``decision_function`` and ``predict``.
    X : array-like
        Samples passed to the detector for both fitting and scoring.
    y : array-like
        Ground-truth labels (assumed 0 = normal, 1 = anomaly). Their mean is
        used as the contamination rate; they are never shown to the detector.

    Returns
    -------
    (precision, auc, duration)
        Precision of the predicted labels, ROC AUC of the decision scores,
        and the seconds spent in ``fit`` + ``decision_function``; each
        rounded to 4 decimals.
    """
    # np.mean may return a NumPy scalar that is not a builtin float (e.g.
    # np.float32 for float32 labels); hand the detector a plain float.
    contamination = float(np.mean(y))
    clf = algorithm(contamination=contamination)

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start_time = time.perf_counter()
    clf.fit(X)
    test_scores = clf.decision_function(X)
    elapsed = time.perf_counter() - start_time
    # Label prediction is deliberately kept outside the timed region.
    y_pred = clf.predict(X)

    precision = round(precision_score(y, y_pred), ndigits=4)
    auc = round(roc_auc_score(y, test_scores), ndigits=4)
    duration = round(elapsed, ndigits=4)
    return precision, auc, duration

In [5]:
def evaluate_semi_supervised_algorithm(algorithm, X, y, random_state=None):
    """Train a detector on normal samples only and test on a balanced hold-out.

    As many normal samples as there are anomalies are drawn at random and
    held out; the detector is fitted on the remaining normal samples and
    evaluated on the anomalies plus the held-out normals.

    Parameters
    ----------
    algorithm : callable
        Detector class accepting a ``contamination`` keyword and exposing
        ``fit``, ``decision_function`` and ``predict``. ``AutoEncoder`` is
        additionally given a fixed layer layout and 5 epochs.
    X : numpy.ndarray
        Samples; indexed with boolean masks derived from ``y``.
    y : numpy.ndarray
        Labels, 0 = normal and 1 = anomaly.
    random_state : int or None, optional
        Seed for the hold-out draw. ``None`` (default) keeps using NumPy's
        global random state, as before.

    Returns
    -------
    (precision, auc, duration)
        Precision and ROC AUC on the hold-out set, and the seconds spent in
        ``fit`` + ``decision_function``; each rounded to 4 decimals.

    Raises
    ------
    ValueError
        If there are not more normal samples than anomalies, which would
        leave nothing to train on.
    """
    X_normal = X[y == 0]
    X_anomaly = X[y == 1]
    n_anomaly = len(X_anomaly)
    if n_anomaly >= len(X_normal):
        raise ValueError(
            f"Need more normal samples ({len(X_normal)}) than anomalies "
            f"({n_anomaly}) to keep a non-empty training set.")

    # Pass a builtin float: np.mean may return a NumPy scalar type that is
    # not a float subclass.
    contamination = float(np.mean(y))
    if algorithm is AutoEncoder:
        clf = algorithm(
            hidden_neurons=[16, 16, 4, 16, 16], epochs=5, contamination=contamination)
    else:
        clf = algorithm(contamination=contamination)

    # A seeded RandomState makes the split reproducible; without a seed the
    # module-level generator is used so existing behaviour is unchanged.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    held_out = rng.choice(len(X_normal), size=n_anomaly, replace=False)
    X_held_out = X_normal[held_out]
    X_train = np.delete(X_normal, held_out, axis=0)
    X_test = np.concatenate((X_anomaly, X_held_out), axis=0)
    y_test = np.concatenate(
        (np.ones(n_anomaly), np.zeros(len(X_held_out))), axis=0)

    # Report the split before the timer starts so console I/O is not timed.
    print(X_train.shape, X_test.shape, y_test.shape)

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start_time = time.perf_counter()
    clf.fit(X_train)
    test_scores = clf.decision_function(X_test)
    elapsed = time.perf_counter() - start_time
    # Label prediction is deliberately kept outside the timed region.
    y_pred = clf.predict(X_test)

    precision = round(precision_score(y_test, y_pred), ndigits=4)
    auc = round(roc_auc_score(y_test, test_scores), ndigits=4)
    duration = round(elapsed, ndigits=4)
    return precision, auc, duration

In [6]:
# Benchmark driver: run every registered detector on every loaded dataset
# and write one row per dataset into `results`.
for i, (dataset, dataset_name) in enumerate(zip(datasets, dataset_names)):
    X, y = dataset['X'], dataset['y']
    anomaly_ratio = np.mean(y) * 100  # share of anomalies, in percent

    for name, algorithm in algorithms.items():
        print(f"Running {name} on {dataset_name}...")
        # Choose the evaluation protocol that applies to this detector.
        evaluate = (
            evaluate_unsupervised_algorithm
            if name in unsupervised_algorithms
            else evaluate_semi_supervised_algorithm
        )
        precision, auc, exec_time = evaluate(algorithm, X, y)
        for metric, value in (('Precision', precision),
                              ('AUC', auc),
                              ('Time', exec_time)):
            results.loc[i, f'{name} {metric}'] = value

    # Dataset descriptors are written after the metrics, once per row.
    for column, value in (('Dataset', dataset_name),
                          ('# Samples', X.shape[0]),
                          ('# Features', X.shape[1]),
                          ('Anomaly Ratio', anomaly_ratio)):
        results.loc[i, column] = value

Running ABOD on fault...
Running LOF on fault...
Running iForest on fault...
Running AutoEncoder on fault...
AutoEncoder
(595, 27) (1346, 27) (1346,)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 27)                756       
                                                                 
 dropout (Dropout)           (None, 27)                0         
                                                                 
 dense_1 (Dense)             (None, 27)                756       
                                                                 
 dropout_1 (Dropout)         (None, 27)                0         
                                                                 
 dense_2 (Dense)             (None, 16)                448       
                                                                 
 dropout_2 (Dropout)         (None,

  from .autonotebook import tqdm as notebook_tqdm


Running ABOD on glass...
Running LOF on glass...
Running iForest on glass...
Running AutoEncoder on glass...
AutoEncoder
(196, 7) (18, 7) (18,)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 7)                 56        
                                                                 
 dropout_7 (Dropout)         (None, 7)                 0         
                                                                 
 dense_9 (Dense)             (None, 7)                 56        
                                                                 
 dropout_8 (Dropout)         (None, 7)                 0         
                                                                 
 dense_10 (Dense)            (None, 16)                128       
                                                                 
 dropout_9 (Dropout)         (None, 16)   

In [7]:
# Write the benchmark table to results.csv in the working directory (row
# index omitted), then show the table as this cell's output.
results.to_csv('results.csv', index=False)
results

Unnamed: 0,Dataset,# Samples,# Features,Anomaly Ratio,ABOD Precision,ABOD AUC,ABOD Time,LOF Precision,LOF AUC,LOF Time,iForest Precision,iForest AUC,iForest Time,AutoEncoder Precision,AutoEncoder AUC,AutoEncoder Time,LUNAR Precision,LUNAR AUC,LUNAR Time
0,fault,1941,27,34.672849,0.5035,0.6986,1.5692,0.423,0.5957,0.02,0.4264,0.5629,0.1145,0.5749,0.5391,5.2817,0.7883,0.8092,4.2711
1,glass,214,7,4.205607,0.1818,0.845,0.047,0.125,0.8157,0.004,0.1111,0.7626,0.0945,0.5,0.7901,2.7349,0.5,0.9012,1.6424
2,Hepatitis,80,19,16.25,0.1429,0.4788,0.0204,0.3333,0.589,0.012,0.1538,0.7049,0.1091,0.875,0.8047,2.7512,0.6364,0.7515,1.2779
3,InternetAds,1966,1555,18.71821,0.2879,0.6299,5.437,0.4073,0.6451,0.1426,0.462,0.6895,0.1689,0.7879,0.7993,7.1861,0.8924,0.8628,4.9923
4,Ionosphere,351,32,35.897436,0.8092,0.927,0.0753,0.7886,0.8603,0.0199,0.6667,0.8383,0.1125,0.6455,0.8825,2.7592,0.7396,0.9828,1.8464
5,landsat,6435,36,20.714841,0.2198,0.5025,1.6365,0.2778,0.5466,0.116,0.2198,0.491,0.1618,0.5355,0.4587,3.1605,0.8536,0.7902,12.2003
6,mnist,7603,100,9.206892,0.2383,0.7005,6.915,0.2454,0.6449,0.2347,0.2857,0.7885,0.2018,0.8715,0.9087,4.6273,0.9422,0.9333,19.9083
7,musk,3062,166,3.167864,0.0094,0.0528,1.5429,0.0469,0.4124,0.0605,1.0,1.0,0.1309,0.951,1.0,3.2882,0.9798,1.0,9.3589
8,Cardiotocography,2114,21,22.043519,0.2869,0.5553,0.4172,0.3364,0.5965,0.0223,0.4292,0.7172,0.1181,0.747,0.8178,3.0573,0.8517,0.8081,4.6133
