# IMPORT PYTHON PACKAGES

In [8]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

In [9]:
pip install pyod

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install combo

Note: you may need to restart the kernel to use updated packages.


# IMPORT PYOD PACKAGES AND THE METHODS

In [11]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

# IMPORT METRICS PACKAGE

In [12]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# DEFINE DATA FILE AND READ X AND Y.

In [13]:
mat_file_list = ['arrhythmia.mat',
                 'cardio.mat',
                 'glass.mat',
                 'ionosphere.mat',
                 'letter.mat',
                 'lympho.mat',
                 'mnist.mat',
                 'musk.mat',
                 'optdigits.mat',
                 'pendigits.mat',
                 'pima.mat',
                 'satellite.mat',
                 'satimage-2.mat',
                 'shuttle.mat',
                 'vertebral.mat',
                 'vowels.mat',
                 'wbc.mat']

# HOW TO LOAD MAT FILE

In [14]:
from scipy.io import loadmat

In [15]:
data=loadmat('cardio.mat')

In [16]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [17]:
len(data)

5

In [18]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [19]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

# INPUT(INDEPENDENT) FEATURE SHAPE IN MAT FILE FORMAT

In [20]:
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

# DEPENDENT/TARGET/OUTPUT FEATURE SHAPE

In [21]:
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [22]:
df_columns = ['Data', '#Samples', '# Dimensions', 'Outlier Perc',
              'ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 'MCD',
              'OCSVM', 'PCA']

# ROC PERFORMANCE EVOLUTION TABLE

In [23]:
roc_df = pd.DataFrame(columns=df_columns)

In [24]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# PRECISION_N_SCORES - PERFORMANCE EVOLUTION TABLE

In [25]:
prn_df = pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# TIME DATAFRAME

In [26]:
time_df = pd.DataFrame(columns=df_columns)

In [27]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# EXPLORING ALL MAT FILES

In [29]:
from time import time
random_state = np.random.RandomState(42)

for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join(mat_file))

    X = mat['X']
    y = mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, check_estimator=False,
            random_state=random_state),
        'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                          random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(
            contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,
                                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(
            contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)': MCD(
            contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)': PCA(
            contamination=outliers_fraction, random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(
            clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 2.2251s




Cluster-based Local Outlier Factor ROC:0.7789, precision @ rank n:0.4643, execution time: 2.2166s
Feature Bagging ROC:0.7796, precision @ rank n:0.4643, execution time: 0.0832s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 1.6163s
Isolation Forest ROC:0.8595, precision @ rank n:0.5714, execution time: 0.528s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.0485s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.0092s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 0.7787s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.039s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.0817s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5892, precision @ rank n:0.1918, execution time: 0.4863s
Cluster-based Local Outlier Factor ROC:0.8845, precision @ rank n:0.4932, execution time: 0.1352s




Feature Bagging ROC:0.6385, precision @ rank n:0.1781, execution time: 0.7526s
Histogram-base Outlier Detection (HBOS) ROC:0.8373, precision @ rank n:0.4521, execution time: 0.0038s
Isolation Forest ROC:0.9527, precision @ rank n:0.6027, execution time: 0.5467s
K Nearest Neighbors (KNN) ROC:0.734, precision @ rank n:0.3562, execution time: 0.1643s
Local Outlier Factor (LOF) ROC:0.588, precision @ rank n:0.1507, execution time: 0.0857s




Minimum Covariance Determinant (MCD) ROC:0.8534, precision @ rank n:0.411, execution time: 0.769s
One-class SVM (OCSVM) ROC:0.9478, precision @ rank n:0.5342, execution time: 0.2383s
Principal Component Analysis (PCA) ROC:0.9616, precision @ rank n:0.6849, execution time: 0.0186s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6951, precision @ rank n:0.25, execution time: 0.0863s
Cluster-based Local Outlier Factor ROC:0.811, precision @ rank n:0.25, execution time: 0.049s




Feature Bagging ROC:0.7073, precision @ rank n:0.25, execution time: 0.0639s
Histogram-base Outlier Detection (HBOS) ROC:0.7073, precision @ rank n:0.0, execution time: 0.0107s
Isolation Forest ROC:0.7134, precision @ rank n:0.25, execution time: 0.3859s
K Nearest Neighbors (KNN) ROC:0.8384, precision @ rank n:0.25, execution time: 0.0057s
Local Outlier Factor (LOF) ROC:0.7043, precision @ rank n:0.25, execution time: 0.0122s
Minimum Covariance Determinant (MCD) ROC:0.8293, precision @ rank n:0.0, execution time: 0.076s
One-class SVM (OCSVM) ROC:0.6585, precision @ rank n:0.25, execution time: 0.007s
Principal Component Analysis (PCA) ROC:0.686, precision @ rank n:0.25, execution time: 0.0098s

... Processing ionosphere.mat ...




Angle-based Outlier Detector (ABOD) ROC:0.9181, precision @ rank n:0.8431, execution time: 0.0961s
Cluster-based Local Outlier Factor ROC:0.9176, precision @ rank n:0.8039, execution time: 0.0659s
Feature Bagging ROC:0.9303, precision @ rank n:0.8039, execution time: 0.0672s
Histogram-base Outlier Detection (HBOS) ROC:0.6052, precision @ rank n:0.3922, execution time: 0.0129s
Isolation Forest ROC:0.8516, precision @ rank n:0.6078, execution time: 0.3781s
K Nearest Neighbors (KNN) ROC:0.932, precision @ rank n:0.8824, execution time: 0.0176s
Local Outlier Factor (LOF) ROC:0.9227, precision @ rank n:0.7843, execution time: 0.002s
Minimum Covariance Determinant (MCD) ROC:0.9669, precision @ rank n:0.8627, execution time: 0.0861s
One-class SVM (OCSVM) ROC:0.8257, precision @ rank n:0.6863, execution time: 0.0027s
Principal Component Analysis (PCA) ROC:0.7941, precision @ rank n:0.5686, execution time: 0.0122s

... Processing letter.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.8783, pr



Feature Bagging ROC:0.8947, precision @ rank n:0.4062, execution time: 0.6578s
Histogram-base Outlier Detection (HBOS) ROC:0.6063, precision @ rank n:0.0938, execution time: 0.0258s
Isolation Forest ROC:0.6178, precision @ rank n:0.0625, execution time: 0.7589s
K Nearest Neighbors (KNN) ROC:0.8573, precision @ rank n:0.3125, execution time: 0.1839s
Local Outlier Factor (LOF) ROC:0.8765, precision @ rank n:0.3438, execution time: 0.0777s
Minimum Covariance Determinant (MCD) ROC:0.8061, precision @ rank n:0.1875, execution time: 1.7972s
One-class SVM (OCSVM) ROC:0.5927, precision @ rank n:0.125, execution time: 0.2933s
Principal Component Analysis (PCA) ROC:0.5216, precision @ rank n:0.125, execution time: 0.012s

... Processing lympho.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9831, precision @ rank n:0.0, execution time: 0.0969s




Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.1253s
Feature Bagging ROC:1.0, precision @ rank n:1.0, execution time: 0.0662s
Histogram-base Outlier Detection (HBOS) ROC:1.0, precision @ rank n:1.0, execution time: 0.0062s
Isolation Forest ROC:1.0, precision @ rank n:1.0, execution time: 0.7396s
K Nearest Neighbors (KNN) ROC:1.0, precision @ rank n:1.0, execution time: 0.0148s
Local Outlier Factor (LOF) ROC:1.0, precision @ rank n:1.0, execution time: 0.0055s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @ rank n:1.0, execution time: 0.048s
One-class SVM (OCSVM) ROC:1.0, precision @ rank n:1.0, execution time: 0.0076s
Principal Component Analysis (PCA) ROC:1.0, precision @ rank n:1.0, execution time: 0.0071s

... Processing mnist.mat ...




Angle-based Outlier Detector (ABOD) ROC:0.7628, precision @ rank n:0.3367, execution time: 5.6675s




Cluster-based Local Outlier Factor ROC:0.8389, precision @ rank n:0.3912, execution time: 0.7242s
Feature Bagging ROC:0.7157, precision @ rank n:0.3741, execution time: 11.509s
Histogram-base Outlier Detection (HBOS) ROC:0.5766, precision @ rank n:0.1361, execution time: 0.0534s
Isolation Forest ROC:0.7804, precision @ rank n:0.2823, execution time: 2.0755s
K Nearest Neighbors (KNN) ROC:0.8498, precision @ rank n:0.432, execution time: 2.9942s
Local Outlier Factor (LOF) ROC:0.7195, precision @ rank n:0.3673, execution time: 1.0634s




Minimum Covariance Determinant (MCD) ROC:0.8713, precision @ rank n:0.2653, execution time: 2.6688s
One-class SVM (OCSVM) ROC:0.854, precision @ rank n:0.3946, execution time: 4.7443s
Principal Component Analysis (PCA) ROC:0.8534, precision @ rank n:0.3878, execution time: 0.1556s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.2161, precision @ rank n:0.1, execution time: 1.4008s




Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.2327s
Feature Bagging ROC:0.473, precision @ rank n:0.125, execution time: 1.8347s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @ rank n:0.975, execution time: 0.0788s
Isolation Forest ROC:1.0, precision @ rank n:1.0, execution time: 1.3468s
K Nearest Neighbors (KNN) ROC:0.8009, precision @ rank n:0.175, execution time: 0.8511s
Local Outlier Factor (LOF) ROC:0.4629, precision @ rank n:0.125, execution time: 0.1746s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @ rank n:1.0, execution time: 10.5883s
One-class SVM (OCSVM) ROC:1.0, precision @ rank n:1.0, execution time: 0.8694s
Principal Component Analysis (PCA) ROC:1.0, precision @ rank n:1.0, execution time: 0.1438s

... Processing optdigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.4894, precision @ rank n:0.0152, execution time: 2.0105s




Cluster-based Local Outlier Factor ROC:0.7901, precision @ rank n:0.0, execution time: 0.3501s
Feature Bagging ROC:0.5062, precision @ rank n:0.0303, execution time: 4.9696s
Histogram-base Outlier Detection (HBOS) ROC:0.8774, precision @ rank n:0.2121, execution time: 0.0258s
Isolation Forest ROC:0.6682, precision @ rank n:0.0, execution time: 1.0633s
K Nearest Neighbors (KNN) ROC:0.406, precision @ rank n:0.0, execution time: 0.9556s
Local Outlier Factor (LOF) ROC:0.5277, precision @ rank n:0.0303, execution time: 0.4755s




Minimum Covariance Determinant (MCD) ROC:0.3822, precision @ rank n:0.0, execution time: 1.2686s
One-class SVM (OCSVM) ROC:0.5171, precision @ rank n:0.0, execution time: 1.8098s
Principal Component Analysis (PCA) ROC:0.526, precision @ rank n:0.0, execution time: 0.0779s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.667, precision @ rank n:0.0526, execution time: 1.9064s




Cluster-based Local Outlier Factor ROC:0.8082, precision @ rank n:0.1579, execution time: 0.2415s
Feature Bagging ROC:0.4889, precision @ rank n:0.0526, execution time: 3.9381s
Histogram-base Outlier Detection (HBOS) ROC:0.9348, precision @ rank n:0.2632, execution time: 0.0243s
Isolation Forest ROC:0.9414, precision @ rank n:0.2807, execution time: 0.7459s
K Nearest Neighbors (KNN) ROC:0.7371, precision @ rank n:0.0702, execution time: 0.9763s
Local Outlier Factor (LOF) ROC:0.4965, precision @ rank n:0.0702, execution time: 0.929s
Minimum Covariance Determinant (MCD) ROC:0.8204, precision @ rank n:0.0877, execution time: 1.9944s
One-class SVM (OCSVM) ROC:0.9235, precision @ rank n:0.3158, execution time: 2.4774s
Principal Component Analysis (PCA) ROC:0.9309, precision @ rank n:0.3158, execution time: 0.0043s

... Processing pima.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7163, precision @ rank n:0.5253, execution time: 0.1448s




Cluster-based Local Outlier Factor ROC:0.67, precision @ rank n:0.4949, execution time: 0.0977s
Feature Bagging ROC:0.6448, precision @ rank n:0.4444, execution time: 0.1058s
Histogram-base Outlier Detection (HBOS) ROC:0.711, precision @ rank n:0.5354, execution time: 0.0092s
Isolation Forest ROC:0.6872, precision @ rank n:0.5253, execution time: 0.3674s
K Nearest Neighbors (KNN) ROC:0.7395, precision @ rank n:0.5859, execution time: 0.039s
Local Outlier Factor (LOF) ROC:0.6574, precision @ rank n:0.4646, execution time: 0.0134s
Minimum Covariance Determinant (MCD) ROC:0.7175, precision @ rank n:0.5152, execution time: 0.0543s
One-class SVM (OCSVM) ROC:0.6561, precision @ rank n:0.5051, execution time: 0.062s
Principal Component Analysis (PCA) ROC:0.6762, precision @ rank n:0.5354, execution time: 0.0066s

... Processing satellite.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5653, precision @ rank n:0.3962, execution time: 2.2178s




Cluster-based Local Outlier Factor ROC:0.7241, precision @ rank n:0.5412, execution time: 0.3592s
Feature Bagging ROC:0.572, precision @ rank n:0.4, execution time: 7.2116s
Histogram-base Outlier Detection (HBOS) ROC:0.7486, precision @ rank n:0.57, execution time: 0.017s
Isolation Forest ROC:0.6827, precision @ rank n:0.5688, execution time: 0.8983s
K Nearest Neighbors (KNN) ROC:0.6853, precision @ rank n:0.4988, execution time: 0.9093s
Local Outlier Factor (LOF) ROC:0.572, precision @ rank n:0.395, execution time: 0.7042s
Minimum Covariance Determinant (MCD) ROC:0.8055, precision @ rank n:0.6762, execution time: 2.2085s
One-class SVM (OCSVM) ROC:0.6478, precision @ rank n:0.5225, execution time: 2.4652s
Principal Component Analysis (PCA) ROC:0.5923, precision @ rank n:0.465, execution time: 0.0196s

... Processing satimage-2.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.8432, precision @ rank n:0.2333, execution time: 2.0605s




Cluster-based Local Outlier Factor ROC:0.9998, precision @ rank n:0.9333, execution time: 0.3023s
Feature Bagging ROC:0.5235, precision @ rank n:0.1667, execution time: 6.1532s
Histogram-base Outlier Detection (HBOS) ROC:0.9784, precision @ rank n:0.6, execution time: 0.0183s
Isolation Forest ROC:0.9947, precision @ rank n:0.8333, execution time: 0.782s
K Nearest Neighbors (KNN) ROC:0.9515, precision @ rank n:0.4333, execution time: 0.8058s
Local Outlier Factor (LOF) ROC:0.5257, precision @ rank n:0.1667, execution time: 0.6319s
Minimum Covariance Determinant (MCD) ROC:0.9963, precision @ rank n:0.6667, execution time: 1.9158s
One-class SVM (OCSVM) ROC:0.9997, precision @ rank n:0.9, execution time: 1.9457s
Principal Component Analysis (PCA) ROC:0.9816, precision @ rank n:0.7333, execution time: 0.0211s

... Processing shuttle.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6171, precision @ rank n:0.2003, execution time: 16.29s




Cluster-based Local Outlier Factor ROC:0.6273, precision @ rank n:0.2025, execution time: 0.5884s
Feature Bagging ROC:0.4725, precision @ rank n:0.0257, execution time: 79.8494s
Histogram-base Outlier Detection (HBOS) ROC:0.9871, precision @ rank n:0.9985, execution time: 0.026s
Isolation Forest ROC:0.9977, precision @ rank n:0.9552, execution time: 4.3168s
K Nearest Neighbors (KNN) ROC:0.6507, precision @ rank n:0.212, execution time: 10.4669s
Local Outlier Factor (LOF) ROC:0.5556, precision @ rank n:0.1548, execution time: 12.3339s








Minimum Covariance Determinant (MCD) ROC:0.9899, precision @ rank n:0.7395, execution time: 11.7763s
One-class SVM (OCSVM) ROC:0.9934, precision @ rank n:0.956, execution time: 156.041s
Principal Component Analysis (PCA) ROC:0.9915, precision @ rank n:0.9516, execution time: 0.0223s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5366, precision @ rank n:0.2143, execution time: 0.0468s
Cluster-based Local Outlier Factor ROC:0.439, precision @ rank n:0.0714, execution time: 0.0392s




Feature Bagging ROC:0.5279, precision @ rank n:0.1429, execution time: 0.0656s
Histogram-base Outlier Detection (HBOS) ROC:0.3506, precision @ rank n:0.0, execution time: 0.0029s
Isolation Forest ROC:0.3789, precision @ rank n:0.0, execution time: 0.3745s
K Nearest Neighbors (KNN) ROC:0.4573, precision @ rank n:0.0714, execution time: 0.0088s
Local Outlier Factor (LOF) ROC:0.4983, precision @ rank n:0.1429, execution time: 0.012s
Minimum Covariance Determinant (MCD) ROC:0.4085, precision @ rank n:0.0714, execution time: 0.0636s
One-class SVM (OCSVM) ROC:0.4686, precision @ rank n:0.0714, execution time: 0.0023s
Principal Component Analysis (PCA) ROC:0.4085, precision @ rank n:0.0, execution time: 0.0076s

... Processing vowels.mat ...




Angle-based Outlier Detector (ABOD) ROC:0.9616, precision @ rank n:0.6316, execution time: 0.3002s
Cluster-based Local Outlier Factor ROC:0.8963, precision @ rank n:0.3158, execution time: 0.0948s




Feature Bagging ROC:0.9365, precision @ rank n:0.3684, execution time: 0.3305s
Histogram-base Outlier Detection (HBOS) ROC:0.6876, precision @ rank n:0.1579, execution time: 0.0118s
Isolation Forest ROC:0.8174, precision @ rank n:0.2105, execution time: 0.3734s
K Nearest Neighbors (KNN) ROC:0.9734, precision @ rank n:0.4737, execution time: 0.0819s
Local Outlier Factor (LOF) ROC:0.9398, precision @ rank n:0.3684, execution time: 0.0245s
Minimum Covariance Determinant (MCD) ROC:0.7243, precision @ rank n:0.1053, execution time: 0.8138s
One-class SVM (OCSVM) ROC:0.8163, precision @ rank n:0.2632, execution time: 0.1184s
Principal Component Analysis (PCA) ROC:0.6297, precision @ rank n:0.1579, execution time: 0.0069s

... Processing wbc.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.921, precision @ rank n:0.375, execution time: 0.0989s
Cluster-based Local Outlier Factor ROC:0.9149, precision @ rank n:0.375, execution time: 0.0652s
Feature Bagging ROC:0.9271, precision @ rank n:0.375, 



Histogram-base Outlier Detection (HBOS) ROC:0.9479, precision @ rank n:0.5, execution time: 0.0226s
Isolation Forest ROC:0.9418, precision @ rank n:0.625, execution time: 0.3813s
K Nearest Neighbors (KNN) ROC:0.9444, precision @ rank n:0.5, execution time: 0.0206s
Local Outlier Factor (LOF) ROC:0.9227, precision @ rank n:0.375, execution time: 0.0098s
Minimum Covariance Determinant (MCD) ROC:0.9288, precision @ rank n:0.5, execution time: 0.0673s
One-class SVM (OCSVM) ROC:0.9358, precision @ rank n:0.375, execution time: 0.0287s
Principal Component Analysis (PCA) ROC:0.9262, precision @ rank n:0.375, execution time: 0.0056s


In [30]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7789,0.7796,0.8511,0.8595,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5892,0.8845,0.6385,0.8373,0.9527,0.734,0.588,0.8534,0.9478,0.9616
0,glass,214,9,4.2056,0.6951,0.811,0.7073,0.7073,0.7134,0.8384,0.7043,0.8293,0.6585,0.686
0,ionosphere,351,33,35.8974,0.9181,0.9176,0.9303,0.6052,0.8516,0.932,0.9227,0.9669,0.8257,0.7941
0,letter,1600,32,6.25,0.8783,0.7783,0.8947,0.6063,0.6178,0.8573,0.8765,0.8061,0.5927,0.5216
0,lympho,148,18,4.0541,0.9831,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,mnist,7603,100,9.2069,0.7628,0.8389,0.7157,0.5766,0.7804,0.8498,0.7195,0.8713,0.854,0.8534
0,musk,3062,166,3.1679,0.2161,1.0,0.473,0.9999,1.0,0.8009,0.4629,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4894,0.7901,0.5062,0.8774,0.6682,0.406,0.5277,0.3822,0.5171,0.526
0,pendigits,6870,16,2.2707,0.667,0.8082,0.4889,0.9348,0.9414,0.7371,0.4965,0.8204,0.9235,0.9309


In [31]:
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.4643,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1918,0.4932,0.1781,0.4521,0.6027,0.3562,0.1507,0.411,0.5342,0.6849
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8431,0.8039,0.8039,0.3922,0.6078,0.8824,0.7843,0.8627,0.6863,0.5686
0,letter,1600,32,6.25,0.4375,0.1875,0.4062,0.0938,0.0625,0.3125,0.3438,0.1875,0.125,0.125
0,lympho,148,18,4.0541,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,mnist,7603,100,9.2069,0.3367,0.3912,0.3741,0.1361,0.2823,0.432,0.3673,0.2653,0.3946,0.3878
0,musk,3062,166,3.1679,0.1,1.0,0.125,0.975,1.0,0.175,0.125,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.0152,0.0,0.0303,0.2121,0.0,0.0,0.0303,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0526,0.1579,0.0526,0.2632,0.2807,0.0702,0.0702,0.0877,0.3158,0.3158


In [32]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,2.2251,2.2166,0.0832,1.6163,0.528,0.0485,0.0092,0.7787,0.039,0.0817
0,cardio,1831,21,9.6122,0.4863,0.1352,0.7526,0.0038,0.5467,0.1643,0.0857,0.769,0.2383,0.0186
0,glass,214,9,4.2056,0.0863,0.049,0.0639,0.0107,0.3859,0.0057,0.0122,0.076,0.007,0.0098
0,ionosphere,351,33,35.8974,0.0961,0.0659,0.0672,0.0129,0.3781,0.0176,0.002,0.0861,0.0027,0.0122
0,letter,1600,32,6.25,0.4429,0.1498,0.6578,0.0258,0.7589,0.1839,0.0777,1.7972,0.2933,0.012
0,lympho,148,18,4.0541,0.0969,0.1253,0.0662,0.0062,0.7396,0.0148,0.0055,0.048,0.0076,0.0071
0,mnist,7603,100,9.2069,5.6675,0.7242,11.509,0.0534,2.0755,2.9942,1.0634,2.6688,4.7443,0.1556
0,musk,3062,166,3.1679,1.4008,0.2327,1.8347,0.0788,1.3468,0.8511,0.1746,10.5883,0.8694,0.1438
0,optdigits,5216,64,2.8758,2.0105,0.3501,4.9696,0.0258,1.0633,0.9556,0.4755,1.2686,1.8098,0.0779
0,pendigits,6870,16,2.2707,1.9064,0.2415,3.9381,0.0243,0.7459,0.9763,0.929,1.9944,2.4774,0.0043


In [34]:
# Define the number of inliers and outliers
n_samples = len(y)
clusters_separation = [0]

n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

In [35]:
# Show the statics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(shape=ground_truth.shape))
print(ground_truth)

Number of inliers: 357
Number of outliers: 21
Ground truth shape is (378,). Outlier are 1 and inliers are 0.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]


In [36]:
# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

Model 1 Angle-based Outlier Detector (ABOD)
Model 2 Cluster-based Local Outlier Factor
Model 3 Feature Bagging
Model 4 Histogram-base Outlier Detection (HBOS)
Model 5 Isolation Forest
Model 6 K Nearest Neighbors (KNN)
Model 7 Local Outlier Factor (LOF)
Model 8 Minimum Covariance Determinant (MCD)
Model 9 One-class SVM (OCSVM)
Model 10 Principal Component Analysis (PCA)
