# Project Day 1

In [2]:
import os

In [3]:
os.getcwd()

'/home/jovyan/demo'

In [5]:
os.chdir('/home/jovyan/demo/ProjectDay1Data')

In [6]:
os.getcwd()

'/home/jovyan/demo/ProjectDay1Data'

In [7]:
import sys
import numpy as np
import pandas as pd
from time import time 
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

In [10]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

from warnings import filterwarnings
filterwarnings('ignore')

In [11]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [12]:
MatFileList = ['arrhythmia.mat','cardio.mat',
                 'glass.mat','ionosphere.mat',
                 'letter.mat','lympho.mat',
                 'mnist.mat','musk.mat',
                 'optdigits.mat','pendigits.mat',
                 'pima.mat','satellite.mat',
                 'satimage-2.mat','shuttle.mat',
                 'vertebral.mat','vowels.mat','wbc.mat']

In [13]:
MatFileList

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [14]:
df_columns=['Data','#Sample','#Dimensions','Outlier Perc','PCA','MCD','OCSVM','LOF','CBLOF','KNN','HBOS','ABOD','IFOREST','FEATUREBAGGING']

In [19]:
#ROC Dataframe
roc_df=pd.DataFrame(columns=df_columns)
#Precision Dataframe
prn_df=pd.DataFrame(columns=df_columns)
#Execution time Dataframe
time_df=pd.DataFrame(columns=df_columns)
print(roc_df,prn_df,time_df)

Empty DataFrame
Columns: [Data, #Sample, #Dimensions, Outlier Perc, PCA, MCD, OCSVM, LOF, CBLOF, KNN, HBOS, ABOD, IFOREST, FEATUREBAGGING]
Index: [] Empty DataFrame
Columns: [Data, #Sample, #Dimensions, Outlier Perc, PCA, MCD, OCSVM, LOF, CBLOF, KNN, HBOS, ABOD, IFOREST, FEATUREBAGGING]
Index: [] Empty DataFrame
Columns: [Data, #Sample, #Dimensions, Outlier Perc, PCA, MCD, OCSVM, LOF, CBLOF, KNN, HBOS, ABOD, IFOREST, FEATUREBAGGING]
Index: []


In [20]:
#Load a dataframe
data_1 = loadmat("glass.mat") 
data_1

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 06:10:37 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 1.52101, 13.64   ,  4.49   , ...,  8.75   ,  0.     ,  0.     ],
        [ 1.51761, 13.89   ,  3.6    , ...,  7.83   ,  0.     ,  0.     ],
        [ 1.51618, 13.53   ,  3.55   , ...,  7.78   ,  0.     ,  0.     ],
        ...,
        [ 1.52065, 14.36   ,  0.     , ...,  8.44   ,  1.64   ,  0.     ],
        [ 1.51651, 14.38   ,  0.     , ...,  8.48   ,  1.57   ,  0.     ],
        [ 1.51711, 14.23   ,  0.     , ...,  8.62   ,  1.67   ,  0.     ]]),
 'y': array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
 

In [22]:
# Creating random state
random_state = np.random.RandomState(42)

# Processing mat files one by one : 
for mat_file in MatFileList:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join(mat_file))

    X = mat['X']
    y = mat['y'].ravel()  #ravel() function converts 2D to 1D
    
    # Counting Outlier :
    
    # Counts the number of non-zero values in the array y and divide by length of y : It gives outlier in fraction
    outliers_fraction = np.count_nonzero(y) / len(y)
    
    # Calculating Outlier percentage
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # Construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # Spliting Data into : 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)
    
    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Applying all the algorithms and storing thier result in a dictionary format:
    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                   
                   'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False,
                                                               random_state=random_state),
                   
                   'Feature Bagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state),
                   
                   'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                   
                   'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),
                   
                   'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
                   
                   'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
                   
                   'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),
                   
                   'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
                   
                   'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state),
   }


    # Calculating Time taken for each algorithm
    for clf_name, clf in classifiers.items():
        # Initialize the start time 
        t0 = time() 
        
        # Fit( Train )the data
        clf.fit(X_train_norm) 
        
        # Predicting Value on Xtest
        test_scores = clf.decision_function(X_test_norm)  
        
        # Final Time
        t1 = time()   
        
        # Total time duration : t1 - t0
        duration = round(t1 - t0, ndigits=4) 
        
        # Append duration in time list
        time_list.append(duration)

        #Calculating roc and precision value of the algorithm
        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        # Print the roc , precision and executing time 
        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        # Append roc and precision value to their respective list
        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)
    
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 1.6843s
Cluster-based Local Outlier Factor ROC:0.7789, precision @ rank n:0.4643, execution time: 3.2133s
Feature Bagging ROC:0.7796, precision @ rank n:0.4643, execution time: 0.8551s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 1.8113s
Isolation Forest ROC:0.8637, precision @ rank n:0.6071, execution time: 0.424s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.0957s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.0799s
Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 16.0892s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.0585s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.5487s

... Processing cardio.mat ...
Angle-based Outlier Det

In [24]:
print(roc_df)

         Data #Sample #Dimensions Outlier Perc     PCA     MCD   OCSVM  \
0  arrhythmia     452         274      14.6018  0.7687  0.7789  0.7796   
0      cardio    1831          21       9.6122  0.5892  0.8845  0.6385   
0       glass     214           9       4.2056  0.6951   0.811  0.7073   
0  ionosphere     351          33      35.8974  0.9181  0.9176  0.9303   
0      letter    1600          32         6.25  0.8783  0.7783  0.8947   
0      lympho     148          18       4.0541  0.9831       1       1   
0       mnist    7603         100       9.2069  0.7628  0.8389  0.7157   
0        musk    3062         166       3.1679  0.2161       1   0.473   
0   optdigits    5216          64       2.8758  0.4894  0.7901  0.5062   
0   pendigits    6870          16       2.2707   0.667  0.8082  0.4889   
0        pima     768           8      34.8958  0.7163    0.67  0.6448   
0   satellite    6435          36      31.6395  0.5653  0.7241   0.572   
0  satimage-2    5803          36     

In [25]:
print(prn_df)

         Data #Sample #Dimensions Outlier Perc     PCA     MCD   OCSVM  \
0  arrhythmia     452         274      14.6018  0.3571  0.4643  0.4643   
0      cardio    1831          21       9.6122  0.1918  0.4932  0.1781   
0       glass     214           9       4.2056    0.25    0.25    0.25   
0  ionosphere     351          33      35.8974  0.8431  0.8039  0.8039   
0      letter    1600          32         6.25  0.4375  0.1875  0.4062   
0      lympho     148          18       4.0541       0       1       1   
0       mnist    7603         100       9.2069  0.3367  0.3912  0.3741   
0        musk    3062         166       3.1679     0.1       1   0.125   
0   optdigits    5216          64       2.8758  0.0152       0  0.0303   
0   pendigits    6870          16       2.2707  0.0526  0.1579  0.0526   
0        pima     768           8      34.8958  0.5253  0.4949  0.4444   
0   satellite    6435          36      31.6395  0.3962  0.5412     0.4   
0  satimage-2    5803          36     

In [26]:
print(time_df)

         Data #Sample #Dimensions Outlier Perc      PCA     MCD    OCSVM  \
0  arrhythmia     452         274      14.6018   1.6843  3.2133   0.8551   
0      cardio    1831          21       9.6122   0.3911  1.3857   0.9436   
0       glass     214           9       4.2056   0.1294  0.0492   0.0507   
0  ionosphere     351          33      35.8974   0.1176  0.0547   0.0861   
0      letter    1600          32         6.25   0.3762  1.6109   0.8623   
0      lympho     148          18       4.0541   0.1166  0.0438   0.0435   
0       mnist    7603         100       9.2069   8.2621  3.3244  55.3837   
0        musk    3062         166       3.1679   2.3965  2.2521  14.2176   
0   optdigits    5216          64       2.8758   2.7905  2.4536  14.9388   
0   pendigits    6870          16       2.2707   1.4481  2.5285   3.3875   
0        pima     768           8      34.8958   0.1501  0.1158   0.1015   
0   satellite    6435          36      31.6395   1.8478   2.322   6.7463   
0  satimage-