# Run ADBench 
- Here we provide a demo for testing AD algorithms on the datasets proposed in ADBench.
- Feel free to evaluate any customized algorithm in ADBench.
- To reproduce the complete experimental results of ADBench, please run the code in the run.py file.

In [2]:
# import basic package
import os
import pandas as pd
import numpy as np

# import the necessary package
from data_generator import DataGenerator
from myutils import Utils

datagenerator = DataGenerator() # shared data generator; later cells set its `dataset` attribute and call `generator(...)` to obtain the train/test splits
utils = Utils() # shared helper object; `utils.metric(...)` is used further below to score the predictions

- All the datasets of ADBench are included in the "datasets" folder, with filenames following the "number_data_class.npz" pattern. Please see the table in the markdown for details.
    - In the data generator, specify a dataset by its filename without the ".npz" suffix, e.g., "10_cover" for "10_cover.npz".
    
    
- All the algorithms included in ADBench are listed in the table of the markdown.
    - You need to specify the model name at initialization, because some algorithms (e.g., the supervised algorithms) are integrated into a single class; please see the table in the markdown for details.
    - You can also test your own AD algorithms on our generated datasets, as long as the algorithm can output anomaly scores for evaluation.

In [3]:
# list the dataset files stored under "datasets/Classical" (path is relative to the current working directory)
os.listdir('datasets/Classical')

['26_optdigits.npz',
 '42_WBC.npz',
 '05_campaign.npz',
 '21_Lymphography.npz',
 '33_skin.npz',
 '34_smtp.npz',
 '28_pendigits.npz',
 '39_vertebral.npz',
 '11_donors.npz',
 '43_WDBC.npz',
 '36_speech.npz',
 '44_Wilt.npz',
 '01_ALOI.npz',
 '10_cover.npz',
 '08_celeba.npz',
 '46_WPBC.npz',
 '03_backdoor.npz',
 '04_breastw.npz',
 '37_Stamps.npz',
 '27_PageBlocks.npz',
 '31_satimage-2.npz',
 '38_thyroid.npz',
 '29_Pima.npz',
 '24_mnist.npz',
 '07_Cardiotocography.npz',
 '15_Hepatitis.npz',
 '09_census.npz',
 '22_magic.gamma.npz',
 '16_http.npz',
 '32_shuttle.npz',
 '12_fault.npz',
 '47_yeast.npz',
 '13_fraud.npz',
 '35_SpamBase.npz',
 '41_Waveform.npz',
 '17_InternetAds.npz',
 '23_mammography.npz',
 '02_annthyroid.npz',
 '40_vowels.npz',
 '45_wine.npz',
 '25_musk.npz',
 '06_cardio.npz',
 '18_Ionosphere.npz',
 '20_letter.npz',
 '19_landsat.npz',
 '14_glass.npz',
 '30_satellite.npz']

In [None]:
# Import every detector class explicitly. A bare `import pyod` does not by
# itself load the `pyod.models.<name>` submodules, so looking them up as
# attributes (e.g. `pyod.models.iforest.IForest`) only works if some other
# code happened to import them first; in a fresh kernel it can raise
# AttributeError.
import pyod
from pyod.models.cblof import CBLOF
from pyod.models.cof import COF
from pyod.models.copod import COPOD
from pyod.models.ecod import ECOD
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.loda import LODA
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.sod import SOD

# Model name -> PyOD detector class. The keys are the model names that the
# following cells benchmark; the insertion order is kept as in the original.
models = {
    'IForest': IForest,
    'OCSVM': OCSVM,
    'CBLOF': CBLOF,
    'COF': COF,
    'COPOD': COPOD,
    'ECOD': ECOD,
    'HBOS': HBOS,
    'KNN': KNN,
    'LODA': LODA,
    'LOF': LOF,
    'PCA': PCA,
    'SOD': SOD,
}

In [16]:
from baseline.PyOD import PYOD
from baseline.DevNet.run import DevNet
from baseline.Supervised import supervised

# datasets to benchmark, given as file names without the ".npz" suffix
dataset_list = [
    '02_annthyroid',
    '06_cardio',
    '23_mammography',
    '27_PageBlocks',
    '40_vowels',
    '44_Wilt',
    '47_yeast',
]

# every selected model name maps to the same PYOD wrapper class; the concrete
# detector is chosen later through the `model_name` argument
model_dict = dict.fromkeys(models, PYOD)


# empty result tables: one row per dataset, one column per model
df_AUCROC = pd.DataFrame(
    data=None,
    index=dataset_list,
    columns=model_dict.keys(),
)
df_AUCPR = pd.DataFrame(
    data=None,
    index=dataset_list,
    columns=model_dict.keys(),
)

In [18]:
# display the name -> detector mapping exposed by the PYOD wrapper
# (the positional arguments are presumably the seed and the model name — confirm against the PYOD signature)
PYOD(0, "OCSVM").model_dict

{'IForest': pyod.models.iforest.IForest,
 'OCSVM': pyod.models.ocsvm.OCSVM,
 'ABOD': pyod.models.abod.ABOD,
 'CBLOF': pyod.models.cblof.CBLOF,
 'COF': pyod.models.cof.COF,
 'AOM': <function pyod.models.combination.aom(scores, n_buckets=5, method='static', bootstrap_estimators=False, random_state=None)>,
 'COPOD': pyod.models.copod.COPOD,
 'ECOD': pyod.models.ecod.ECOD,
 'FeatureBagging': pyod.models.feature_bagging.FeatureBagging,
 'HBOS': pyod.models.hbos.HBOS,
 'KNN': pyod.models.knn.KNN,
 'LMDD': pyod.models.lmdd.LMDD,
 'LODA': pyod.models.loda.LODA,
 'LOF': pyod.models.lof.LOF,
 'LOCI': pyod.models.loci.LOCI,
 'LSCP': pyod.models.lscp.LSCP,
 'MAD': pyod.models.mad.MAD,
 'MCD': pyod.models.mcd.MCD,
 'PCA': pyod.models.pca.PCA,
 'ROD': pyod.models.rod.ROD,
 'SOD': pyod.models.sod.SOD,
 'SOS': pyod.models.sos.SOS,
 'VAE': pyod.models.vae.VAE,
 'DeepSVDD': pyod.models.deep_svdd.DeepSVDD,
 'AutoEncoder': pyod.models.auto_encoder_torch.AutoEncoder,
 'SOGAAL': pyod.models.so_gaal.SO_GAAL,

In [19]:
from tqdm.auto import tqdm

In [20]:
seed = 42

# Dry run: generate the splits of every benchmark dataset once.
# Arguments of datagenerator.generator:
#   la: ratio of labeled anomalies, from 0.0 to 1.0
#   realistic_synthetic_mode: types of synthetic anomalies, can be local, global, dependency or cluster
#   noise_type: inject data noises for testing model robustness, can be duplicated_anomalies, irrelevant_features or label_contamination
for dataset in dataset_list:
    print(dataset)

    # select the dataset by name, then build its splits
    # (la=0.1: only 10% labeled anomalies are available)
    datagenerator.dataset = dataset
    data = datagenerator.generator(
        la=0.1,
        realistic_synthetic_mode=None,
        noise_type=None,
    )
    

02_annthyroid
current noise type: None
{'Samples': 7200, 'Features': 6, 'Anomalies': 534, 'Anomalies Ratio(%)': 7.42}
06_cardio
current noise type: None
{'Samples': 1831, 'Features': 21, 'Anomalies': 176, 'Anomalies Ratio(%)': 9.61}
23_mammography
subsampling for dataset 23_mammography...
current noise type: None
{'Samples': 10000, 'Features': 6, 'Anomalies': 226, 'Anomalies Ratio(%)': 2.26}
27_PageBlocks
current noise type: None
{'Samples': 5393, 'Features': 10, 'Anomalies': 510, 'Anomalies Ratio(%)': 9.46}
40_vowels
current noise type: None
{'Samples': 1456, 'Features': 12, 'Anomalies': 50, 'Anomalies Ratio(%)': 3.43}
44_Wilt
current noise type: None
{'Samples': 4819, 'Features': 5, 'Anomalies': 257, 'Anomalies Ratio(%)': 5.33}
47_yeast
current noise type: None
{'Samples': 1484, 'Features': 8, 'Anomalies': 507, 'Anomalies Ratio(%)': 34.16}


In [21]:
# regenerate the splits for `dataset`, i.e. the last name left over from the preceding loop
datagenerator.dataset = dataset
data = datagenerator.generator(
    la=0.1,
    realistic_synthetic_mode=None,
    noise_type=None,
)
# sanity check: per split, is there at least one NaN entry?
{split_name: np.isnan(split_values).any() for split_name, split_values in data.items()}

current noise type: None
{'Samples': 1484, 'Features': 8, 'Anomalies': 507, 'Anomalies Ratio(%)': 34.16}


{'X_train': False, 'y_train': False, 'X_test': False, 'y_test': False}

In [22]:
import numpy as np

In [None]:
# seed for reproducible results
seed = 42

# Benchmark every model in `model_dict` on every dataset in `dataset_list`
# and store the scores in `df_AUCROC` / `df_AUCPR`.
#
# Arguments of datagenerator.generator:
#   la: ratio of labeled anomalies, from 0.0 to 1.0
#   realistic_synthetic_mode: types of synthetic anomalies, can be local, global, dependency or cluster
#   noise_type: inject data noises for testing model robustness, can be duplicated_anomalies, irrelevant_features or label_contamination
for dataset in dataset_list:
    # import the dataset
    datagenerator.dataset = dataset # specify the dataset name
    data = datagenerator.generator(la=0.1, realistic_synthetic_mode=None, noise_type=None) # only 10% labeled anomalies are available

    for name, model_cls in tqdm(model_dict.items()):
        # model initialization
        clf = model_cls(seed=seed, model_name=name)

        try:
            # training, for unsupervised models the y label will be discarded
            clf = clf.fit(X_train=data['X_train'], y_train=data['y_train'])

            # output predicted anomaly score on testing set
            score = clf.predict_score(data['X_test'])

            # evaluation
            result = utils.metric(y_true=data['y_test'], y_score=score)

            # save results
            df_AUCROC.loc[dataset, name] = result['aucroc']
            df_AUCPR.loc[dataset, name] = result['aucpr']
        except Exception as err:
            # Best-effort benchmark: a failing model must not abort the whole
            # run, but the failure is reported so that the empty (NaN) cell in
            # the result tables can be explained. Catching `Exception` rather
            # than using a bare `except` lets KeyboardInterrupt / SystemExit
            # still stop the loop.
            print(f'[skipped] {name} on {dataset}: {type(err).__name__}: {err}')
            continue


current noise type: None
{'Samples': 7200, 'Features': 6, 'Anomalies': 534, 'Anomalies Ratio(%)': 7.42}


  0%|          | 0/12 [00:00<?, ?it/s]

best param: None




best param: None




best param: None




best param: None




best param: None
best param: None
best param: None




best param: None




best param: None
best param: None




best param: None
best param: None


OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [30]:
# write the AUC-ROC table (rows: datasets, columns: models) to AUCROC.csv in the working directory
df_AUCROC.to_csv('AUCROC.csv')

In [31]:
# write the AUC-PR table (rows: datasets, columns: models) to AUCPR.csv in the working directory
df_AUCPR.to_csv('AUCPR.csv')

In [27]:
# display the AUC-ROC results table
# NOTE(review): the saved output of this cell lists datasets and models (e.g. 25_musk, XGBOD) that are not in the
# dataset_list / models defined in the visible cells — it appears to come from a different run; confirm before relying on it
df_AUCROC

Unnamed: 0,IForest,OCSVM,ABOD,CBLOF,COF,AOM,COPOD,ECOD,FeatureBagging,HBOS,...,MCD,PCA,SOD,SOS,VAE,DeepSVDD,AutoEncoder,SOGAAL,MOGAAL,XGBOD
2_annthyroid,0.826387,0.606069,,0.673525,0.704828,,0.795847,0.803716,0.806159,0.691522,...,0.920006,0.692759,0.790217,0.608144,,0.754313,0.76345,0.192325,0.403831,0.989428
6_cardio,0.944193,0.939676,0.498576,0.851296,0.54455,,0.928363,0.942827,0.646293,0.865343,...,0.884439,0.961087,0.672526,0.579021,,0.680346,0.934627,0.130329,0.143578,0.959227
23_mammography,0.849503,0.854704,,0.84844,0.792004,,0.898732,0.907484,0.785237,0.871755,...,0.481078,0.893982,0.809837,,,0.716212,0.426207,0.31396,0.123889,0.921029
25_musk,1.0,0.818675,0.085936,1.0,0.400387,,0.95525,0.964549,0.459512,1.0,...,0.999923,1.0,0.760209,0.468307,1.0,0.732158,1.0,0.005114,0.006432,1.0
30_satellite,0.699404,,0.538013,0.717879,0.556999,,0.645112,0.593575,0.524199,0.76813,...,0.80369,0.609416,0.637436,0.471234,0.780963,0.682707,0.786286,0.545097,0.451446,0.925978
31_satimage-2,0.996484,0.983527,0.762625,0.997924,0.451384,,0.983444,0.97608,0.427243,0.985936,...,0.994491,0.986185,0.754125,0.557254,0.990532,0.542663,0.980703,0.751412,0.001467,0.971055
32_shuttle,0.996861,0.987461,0.618752,0.997058,0.557606,,0.99655,0.993542,0.522474,0.994925,...,0.990698,0.993764,0.755756,0.486732,,0.434838,0.993733,0.970135,0.519471,0.99908
47_yeast,0.429556,0.448353,0.417114,0.477242,0.428639,,0.406373,0.477197,0.478383,0.410032,...,0.425752,0.444057,0.470439,0.457147,,0.469813,0.518663,0.604972,0.607344,0.565901


In [64]:
# display the AUC-PR results table
# NOTE(review): the saved output of this cell lists datasets and models (e.g. 25_musk, XGBOD) that are not in the
# dataset_list / models defined in the visible cells — it appears to come from a different run; confirm before relying on it
df_AUCPR

Unnamed: 0,IForest,OCSVM,ABOD,CBLOF,COF,AOM,COPOD,ECOD,FeatureBagging,HBOS,...,MCD,PCA,SOD,SOS,VAE,DeepSVDD,AutoEncoder,SOGAAL,MOGAAL,XGBOD
2_annthyroid,0.353499,0.130856,,0.236025,0.183333,,0.192934,0.298425,0.193313,0.229328,...,0.514953,0.206922,0.221867,0.133813,,0.21219,0.349782,0.042717,0.059597,0.781004
6_cardio,0.615718,0.572908,0.143099,0.486642,0.12551,,0.604146,0.592825,0.166842,0.517139,...,0.453585,0.652319,0.215601,0.143175,,0.312177,0.603999,0.053829,0.05449,0.784746
23_mammography,0.189354,0.116258,,0.111213,0.117497,,0.404433,0.414901,0.08996,0.177061,...,0.021849,0.2055,0.149106,,,0.097473,0.018759,0.016766,0.012191,0.396075
25_musk,1.0,0.090669,0.017537,1.0,0.027348,,0.43955,0.582992,0.030433,1.0,...,0.997662,1.0,0.079929,0.062824,1.0,0.261068,1.0,0.016501,0.016501,1.0
30_satellite,0.660275,,0.373054,0.621696,0.407289,,0.58856,0.544425,0.361873,0.698374,...,0.764358,0.618827,0.473194,0.296156,0.727678,0.52019,0.722089,0.525758,0.476643,0.86264
31_satimage-2,0.948434,0.832774,0.063628,0.96064,0.104588,,0.792396,0.707195,0.025745,0.79499,...,0.626793,0.843801,0.172567,0.036892,0.822389,0.01641,0.479722,0.041641,0.006347,0.879892
32_shuttle,0.968758,0.972579,0.163101,0.986061,0.137095,,0.969237,0.904617,0.080851,0.984733,...,0.87626,0.944208,0.264981,0.115418,,0.100485,0.889622,0.961884,0.070142,0.987813
47_yeast,0.333248,0.322699,0.310292,0.328484,0.305052,,0.326955,0.36528,0.336494,0.344416,...,0.31038,0.316245,0.321582,0.313357,,0.326368,0.35471,0.452439,0.448737,0.417138


In [59]:
# write a results frame to results/AUCPR.csv
# NOTE(review): `df_AUCRR_total` is not defined in any visible cell (possibly a misspelling of an AUCPR frame, or
# defined in a cell that was removed) — confirm the intended variable before re-running
df_AUCRR_total.to_csv("results/AUCPR.csv")