In [1]:
# Installation in conda ---- conda install -c conda-forge pyod
# Installation in colab ---- !pip install pyod
# Installation in anaconda ---- pip install pyod

__basic packages + mat files loading packages__

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.io import loadmat   # to load mat files
from time import time

__anomaly detection packages__

In [3]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



__metrics packages__

In [4]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [5]:
# Creating a list of all the mat files

mat_files_data = ['arrhythmia.mat', 'cardio.mat', 'glass.mat', 'ionosphere.mat', 'letter.mat', 'lympho.mat', 'mnist.mat', 'musk.mat', 'optdigits.mat', 'pendigits.mat', 'pima.mat', 'satellite.mat', 'satimage-2.mat', 'shuttle.mat', 'vertebral.mat', 'vowels.mat', 'wbc.mat']

mat_files_data

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [6]:
# demo to open a mat file and verify X -- IDV and y -- DV features

data = loadmat('arrhythmia.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Tue May 17 11:53:12 2016',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 75. ,   0. , 190. , ...,   2.9,  23.3,  49.4],
        [ 56. ,   1. , 165. , ...,   2.1,  20.4,  38.8],
        [ 54. ,   0. , 172. , ...,   3.4,  12.3,  49. ],
        ...,
        [ 36. ,   0. , 166. , ...,   1. , -44.2, -33.2],
        [ 32. ,   1. , 155. , ...,   2.4,  25. ,  46.6],
        [ 78. ,   1. , 160. , ...,   1.6,  21.3,  32.8]]),
 'y': array([[1],
        [0],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
 

In [7]:
# Check the shape & type of the data

type(data['X'])

numpy.ndarray

In [8]:
data['X'].shape

(452, 274)

In [9]:
type(data['y']), data['y'].shape

(numpy.ndarray, (452, 1))

In [10]:
# Check the length of the data

len(data)

5

In [11]:
# Its a dictionary object. We can check its keys and corresponding values.

dict.keys(data)

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [12]:
dict.values(data)

dict_values([b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Tue May 17 11:53:12 2016', '1.0', [], array([[ 75. ,   0. , 190. , ...,   2.9,  23.3,  49.4],
       [ 56. ,   1. , 165. , ...,   2.1,  20.4,  38.8],
       [ 54. ,   0. , 172. , ...,   3.4,  12.3,  49. ],
       ...,
       [ 36. ,   0. , 166. , ...,   1. , -44.2, -33.2],
       [ 32. ,   1. , 155. , ...,   2.4,  25. ,  46.6],
       [ 78. ,   1. , 160. , ...,   1.6,  21.3,  32.8]]), array([[1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
      

In [13]:
df_columns = ['Data', '#Samples', '#Dimensions', 'Outlier_Percentage', 'PCA', 'MCD', 'OCSVM', 'LOF', 'CBLOF', 'KNN', 'HBOS', 'ABOD', 'IForest', 'FeatureBagging']

df_columns

['Data',
 '#Samples',
 '#Dimensions',
 'Outlier_Percentage',
 'PCA',
 'MCD',
 'OCSVM',
 'LOF',
 'CBLOF',
 'KNN',
 'HBOS',
 'ABOD',
 'IForest',
 'FeatureBagging']

In [14]:
roc_df = pd.DataFrame(columns=df_columns)

roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier_Percentage,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FeatureBagging


In [15]:
time_df = pd.DataFrame(columns=df_columns)

time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier_Percentage,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FeatureBagging


In [16]:
prn_df = pd.DataFrame(columns=df_columns)

prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier_Percentage,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FeatureBagging


In [23]:
for i in mat_files_data:
    print('\n\n\n --------- Processing mat file : ', i, '---------')
    mat = loadmat(os.path.join(i))
    
    # Split the data into X and y
    X = mat['X']
    y = mat['y'].ravel()
    
    outliers_fraction = np.count_nonzero(y)/len(y)
    outliers_percentage = round(outliers_fraction*100, ndigits=4)
    
    # Results to be saved as:
    roc_list = [i[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [i[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [i[:-4], X.shape[0], X.shape[1], outliers_percentage]
    
    # Split data into test and train dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    
    # Standardiszing data
    X_train_norm, X_test_norm = standardizer(X_train, X_test)
    
    # Classifier functions
    classifiers = {'      PCA     '  : PCA(contamination=outliers_fraction, random_state=0), 
                   '      MCD     '  : MCD(contamination=outliers_fraction,random_state=0), 
                   '     OCSVM    '  : OCSVM(contamination=outliers_fraction), 
                   '      LOF     '  : LOF(contamination=outliers_fraction), 
                   '     CBLOF    '  : CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=0), 
                   '      KNN     '  : KNN(contamination=outliers_fraction), 
                   '     HBOS     '  : HBOS(contamination=outliers_fraction), 
                   '     ABOD     '  : ABOD(contamination=outliers_fraction), 
                   '    IForest   '  : IForest(contamination=outliers_fraction, random_state=0), 
                   'FeatureBagging'  : FeatureBagging(contamination=outliers_fraction, random_state=0)}
    

    for clf_key, clf_value in classifiers.items():
        t0 = time()
        clf_value.fit(X_train_norm)
        y_pred = clf_value.decision_function(X_test_norm)
        t1 = time()
        t = round(t1 - t0, ndigits=4)
        time_list.append(t)

        roc = round(roc_auc_score(y_test, y_pred), ndigits=4)
        prn = round(precision_n_scores(y_test, y_pred), ndigits=4)

        print('{clf_key} ROC : {roc}, Precision   :   {prn}, Duration  :  {t}sec'.format(clf_key=clf_key, roc=roc, prn=prn, t=t))

        roc_list.append(roc)
        prn_list.append(prn)
    

    temp_df = pd.DataFrame(time_list).T
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).T
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).T
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)




 --------- Processing mat file :  arrhythmia.mat ---------
      PCA      ROC : 0.8164, Precision   :   0.5, Duration  :  0.147sec




      MCD      ROC : 0.8464, Precision   :   0.5, Duration  :  3.023sec
     OCSVM     ROC : 0.8102, Precision   :   0.5, Duration  :  0.063sec
      LOF      ROC : 0.8082, Precision   :   0.5, Duration  :  0.143sec
     CBLOF     ROC : 0.8203, Precision   :   0.5, Duration  :  0.423sec
      KNN      ROC : 0.8176, Precision   :   0.5, Duration  :  0.189sec
     HBOS      ROC : 0.8608, Precision   :   0.5385, Duration  :  0.25sec
     ABOD      ROC : 0.8035, Precision   :   0.3462, Duration  :  0.492sec
    IForest    ROC : 0.8094, Precision   :   0.5385, Duration  :  1.099sec
FeatureBagging ROC : 0.804, Precision   :   0.5, Duration  :  1.021sec



 --------- Processing mat file :  cardio.mat ---------
      PCA      ROC : 0.9454, Precision   :   0.5522, Duration  :  0.01sec




      MCD      ROC : 0.8375, Precision   :   0.403, Duration  :  1.717sec
     OCSVM     ROC : 0.928, Precision   :   0.4179, Duration  :  0.151sec
      LOF      ROC : 0.6078, Precision   :   0.194, Duration  :  0.264sec
     CBLOF     ROC : 0.7552, Precision   :   0.2687, Duration  :  0.379sec
      KNN      ROC : 0.7309, Precision   :   0.2836, Duration  :  0.437sec
     HBOS      ROC : 0.8435, Precision   :   0.4627, Duration  :  0.022sec
     ABOD      ROC : 0.5783, Precision   :   0.2388, Duration  :  1.484sec
    IForest    ROC : 0.9284, Precision   :   0.4925, Duration  :  0.9651sec
FeatureBagging ROC : 0.6352, Precision   :   0.1791, Duration  :  1.716sec



 --------- Processing mat file :  glass.mat ---------
      PCA      ROC : 0.4578, Precision   :   0.0, Duration  :  0.004sec
      MCD      ROC : 0.759, Precision   :   0.0, Duration  :  0.113sec
     OCSVM     ROC : 0.3534, Precision   :   0.0, Duration  :  0.005sec
      LOF      ROC : 0.9116, Precision   :   0.3333, Du



     CBLOF     ROC : 0.8394, Precision   :   0.0, Duration  :  0.175sec
      KNN      ROC : 0.8072, Precision   :   0.0, Duration  :  0.046sec
     HBOS      ROC : 0.6707, Precision   :   0.0, Duration  :  0.009sec
     ABOD      ROC : 0.6546, Precision   :   0.0, Duration  :  0.172sec
    IForest    ROC : 0.6506, Precision   :   0.0, Duration  :  0.917sec
FeatureBagging ROC : 0.8996, Precision   :   0.3333, Duration  :  0.135sec



 --------- Processing mat file :  ionosphere.mat ---------
      PCA      ROC : 0.7786, Precision   :   0.5965, Duration  :  0.009sec
      MCD      ROC : 0.9714, Precision   :   0.9123, Duration  :  0.262sec
     OCSVM     ROC : 0.8584, Precision   :   0.7193, Duration  :  0.011sec
      LOF      ROC : 0.9046, Precision   :   0.7544, Duration  :  0.034sec
     CBLOF     ROC : 0.9135, Precision   :   0.7895, Duration  :  0.196sec
      KNN      ROC : 0.9365, Precision   :   0.8596, Duration  :  0.053sec
     HBOS      ROC : 0.5792, Precision   :   0.4211, 




      LOF      ROC : 1.0, Precision   :   1.0, Duration  :  0.01sec
     CBLOF     ROC : 1.0, Precision   :   1.0, Duration  :  0.202sec
      KNN      ROC : 1.0, Precision   :   1.0, Duration  :  0.025sec
     HBOS      ROC : 1.0, Precision   :   1.0, Duration  :  0.019sec
     ABOD      ROC : 0.9831, Precision   :   0.0, Duration  :  0.147sec
    IForest    ROC : 1.0, Precision   :   1.0, Duration  :  0.882sec
FeatureBagging ROC : 1.0, Precision   :   1.0, Duration  :  0.107sec



 --------- Processing mat file :  mnist.mat ---------
      PCA      ROC : 0.8713, Precision   :   0.4301, Duration  :  0.437sec




      MCD      ROC : 0.8666, Precision   :   0.172, Duration  :  10.91sec
     OCSVM     ROC : 0.8728, Precision   :   0.4409, Duration  :  9.865sec
      LOF      ROC : 0.732, Precision   :   0.3477, Duration  :  16.275sec
     CBLOF     ROC : 0.8636, Precision   :   0.4552, Duration  :  4.9017sec
      KNN      ROC : 0.8592, Precision   :   0.4409, Duration  :  21.8045sec
     HBOS      ROC : 0.5828, Precision   :   0.1613, Duration  :  0.246sec
     ABOD      ROC : 0.7936, Precision   :   0.3548, Duration  :  27.384sec
    IForest    ROC : 0.8136, Precision   :   0.3011, Duration  :  7.098sec
FeatureBagging ROC : 0.7408, Precision   :   0.3548, Duration  :  162.5841sec



 --------- Processing mat file :  musk.mat ---------
      PCA      ROC : 0.9999, Precision   :   0.9444, Duration  :  0.39sec
      MCD      ROC : 0.9999, Precision   :   0.9722, Duration  :  58.847sec
     OCSVM     ROC : 1.0, Precision   :   1.0, Duration  :  2.6841sec
      LOF      ROC : 0.3941, Precision   : 



      MCD      ROC : 0.3972, Precision   :   0.0, Duration  :  5.7237sec
     OCSVM     ROC : 0.5132, Precision   :   0.0, Duration  :  3.3841sec
      LOF      ROC : 0.4871, Precision   :   0.0179, Duration  :  5.1772sec
     CBLOF     ROC : 0.7487, Precision   :   0.0, Duration  :  1.9997sec
      KNN      ROC : 0.364, Precision   :   0.0, Duration  :  6.802sec
     HBOS      ROC : 0.867, Precision   :   0.1964, Duration  :  0.195sec
     ABOD      ROC : 0.4455, Precision   :   0.0, Duration  :  10.878sec
    IForest    ROC : 0.6787, Precision   :   0.0179, Duration  :  3.643sec
FeatureBagging ROC : 0.4841, Precision   :   0.0179, Duration  :  46.6173sec



 --------- Processing mat file :  pendigits.mat ---------
      PCA      ROC : 0.9376, Precision   :   0.3273, Duration  :  0.036sec
      MCD      ROC : 0.8389, Precision   :   0.0909, Duration  :  10.6435sec
     OCSVM     ROC : 0.939, Precision   :   0.3818, Duration  :  2.8981sec
      LOF      ROC : 0.5243, Precision   :   0.





      MCD      ROC : 0.9908, Precision   :   0.7615, Duration  :  49.6704sec
     OCSVM     ROC : 0.9911, Precision   :   0.9519, Duration  :  165.8454sec
      LOF      ROC : 0.5249, Precision   :   0.1374, Duration  :  54.3345sec
     CBLOF     ROC : 0.6004, Precision   :   0.2775, Duration  :  3.203sec
      KNN      ROC : 0.6617, Precision   :   0.2233, Duration  :  43.7267sec
     HBOS      ROC : 0.9882, Precision   :   0.9348, Duration  :  0.082sec
     ABOD      ROC : 0.6326, Precision   :   0.2085, Duration  :  83.7957sec
    IForest    ROC : 0.9959, Precision   :   0.9575, Duration  :  13.1211sec
FeatureBagging ROC : 0.4416, Precision   :   0.0195, Duration  :  314.6322sec



 --------- Processing mat file :  vertebral.mat ---------
      PCA      ROC : 0.4272, Precision   :   0.0, Duration  :  0.005sec
      MCD      ROC : 0.4461, Precision   :   0.0, Duration  :  0.177sec
     OCSVM     ROC : 0.4881, Precision   :   0.0, Duration  :  0.006sec
      LOF      ROC : 0.4807, Pre

In [20]:
time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier_Percentage,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FeatureBagging
0,arrhythmia,452,274,14.6018,1.6911,6.1124,0.088,0.22,10.9445,0.3167,8.125,5.5651,1.845,1.634
0,cardio,1831,21,9.6122,0.108,3.5278,0.262,0.321,0.651,0.92,0.038,2.526,1.689,2.58
0,glass,214,9,4.2056,0.016,0.211,0.006,0.013,0.195,0.037,0.011,0.2261,1.1026,0.158
0,ionosphere,351,33,35.8974,0.046,0.559,0.015,0.029,0.315,0.087,0.0855,0.411,1.571,0.28
0,letter,1600,32,6.25,0.024,7.8618,0.195,0.29,0.558,0.619,0.057,2.129,2.031,2.44
0,lympho,148,18,4.0541,0.006,0.205,0.007,0.014,0.246,0.043,0.064,0.176,1.312,0.222
0,mnist,7603,100,9.2069,0.535,20.4132,18.0847,29.8339,6.0492,25.852,0.311,34.0969,7.795,140.527
0,musk,3062,166,3.1679,0.364,45.27,2.45,4.496,1.347,5.172,0.248,7.451,4.184,27.7671
0,optdigits,5216,64,2.8758,0.136,4.531,3.0,4.369,1.752,5.629,0.126,9.098,3.189,35.117
0,pendigits,6870,16,2.2707,0.027,7.411,2.876,1.765,0.995,2.122,0.032,6.956,2.222,11.73


In [21]:
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier_Percentage,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FeatureBagging
0,arrhythmia,452,274,14.6018,0.8164,0.8464,0.8102,0.8082,0.8203,0.8176,0.8608,0.8035,0.8094,0.804
0,cardio,1831,21,9.6122,0.9454,0.8375,0.928,0.6078,0.7552,0.7309,0.8435,0.5783,0.9284,0.6352
0,glass,214,9,4.2056,0.4578,0.759,0.3534,0.9116,0.8394,0.8072,0.6707,0.6546,0.6506,0.8996
0,ionosphere,351,33,35.8974,0.7786,0.9714,0.8584,0.9046,0.9135,0.9365,0.5792,0.9532,0.8323,0.8985
0,letter,1600,32,6.25,0.5073,0.7883,0.5859,0.8523,0.7566,0.8564,0.5857,0.852,0.6178,0.8677
0,lympho,148,18,4.0541,1.0,0.9492,1.0,1.0,1.0,1.0,1.0,0.9831,1.0,1.0
0,mnist,7603,100,9.2069,0.8713,0.8666,0.8728,0.732,0.8636,0.8592,0.5828,0.7936,0.8136,0.7408
0,musk,3062,166,3.1679,0.9999,0.9999,1.0,0.3941,1.0,0.7583,1.0,0.1127,0.9988,0.3838
0,optdigits,5216,64,2.8758,0.5126,0.3972,0.5132,0.4871,0.7487,0.364,0.867,0.4455,0.6787,0.4841
0,pendigits,6870,16,2.2707,0.9376,0.8389,0.939,0.5243,0.9365,0.7819,0.9335,0.7344,0.9553,0.5345


In [22]:
prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier_Percentage,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FeatureBagging
0,arrhythmia,452,274,14.6018,0.5,0.5,0.5,0.5,0.5,0.5,0.5385,0.3462,0.5385,0.5
0,cardio,1831,21,9.6122,0.5522,0.403,0.4179,0.194,0.2687,0.2836,0.4627,0.2388,0.4925,0.1791
0,glass,214,9,4.2056,0.0,0.0,0.0,0.3333,0.0,0.0,0.0,0.0,0.0,0.3333
0,ionosphere,351,33,35.8974,0.5965,0.9123,0.7193,0.7544,0.7895,0.8596,0.4211,0.8929,0.6316,0.7193
0,letter,1600,32,6.25,0.0732,0.122,0.122,0.3902,0.2683,0.2927,0.0244,0.3902,0.1463,0.3902
0,lympho,148,18,4.0541,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
0,mnist,7603,100,9.2069,0.4301,0.172,0.4409,0.3477,0.4552,0.4409,0.1613,0.3548,0.3011,0.3548
0,musk,3062,166,3.1679,0.9444,0.9722,1.0,0.0833,1.0,0.2222,0.9722,0.0,0.8857,0.1667
0,optdigits,5216,64,2.8758,0.0,0.0,0.0,0.0179,0.0,0.0,0.1964,0.0,0.0179,0.0179
0,pendigits,6870,16,2.2707,0.3273,0.0909,0.3818,0.0727,0.2364,0.0909,0.3455,0.0364,0.3273,0.0545
