In [1]:
!pip install pyod

Collecting pyod
  Downloading pyod-1.0.9.tar.gz (149 kB)
                                              0.0/150.0 kB ? eta -:--:--
     -------------------------------------- 150.0/150.0 kB 9.3 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py): started
  Building wheel for pyod (setup.py): finished with status 'done'
  Created wheel for pyod: filename=pyod-1.0.9-py3-none-any.whl size=184113 sha256=d4384f2bd4de5606f587df940fbe9a45f9015d0d9815f3332f03897f2cbba487
  Stored in directory: c:\users\sumeruinfra\appdata\local\pip\cache\wheels\83\55\6b\552e083cf5509c0afe808b76cf434f1be284d01a112623bd37
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-1.0.9


In [6]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.ocsvm import OCSVM
from pyod.utils.data import generate_data, get_outliers_inliers
from pyod.utils.data import evaluate_print

%matplotlib inline
import matplotlib.font_manager
state=42

In [8]:
path = "Dataset\hh114_ann_features.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,lastSensorEventHours,lastSensorEventSeconds,lastSensorDayOfWeek,windowDuration,timeSinceLastSensorEvent,prevDominantSensor1,prevDominantSensor2,lastSensorID,lastSensorLocation,lastMotionLocation,...,sensorElTime-Chair,sensorElTime-DiningRoom,sensorElTime-Hall,sensorElTime-Ignore,sensorElTime-Kitchen,sensorElTime-LivingRoom,sensorElTime-Office,sensorElTime-OutsideDoor,sensorElTime-WorkArea,activity
0,0.0,3074.0,2.0,2050.0,0.0,0.0,0.0,1.0,1.0,1.0,...,86400.0,86400.0,86400.0,12.186482,86400.0,86400.0,86400.0,86400.0,86400.0,Sleep
1,0.0,3075.0,2.0,1621.0,1.0,0.0,0.0,5.0,5.0,1.0,...,86400.0,86400.0,86400.0,0.0,86400.0,86400.0,86400.0,86400.0,86400.0,Sleep
2,0.0,3075.0,2.0,1021.0,0.0,0.0,0.0,1.0,1.0,1.0,...,86400.0,86400.0,86400.0,0.032108,86400.0,86400.0,86400.0,86400.0,86400.0,Sleep
3,0.0,3080.0,2.0,856.0,5.0,0.0,0.0,1.0,1.0,1.0,...,86400.0,86400.0,86400.0,4.501647,86400.0,86400.0,86400.0,86400.0,86400.0,Sleep
4,0.0,3081.0,2.0,43.0,1.0,0.0,0.0,1.0,1.0,1.0,...,86400.0,86400.0,86400.0,5.630662,86400.0,86400.0,86400.0,86400.0,86400.0,Sleep


In [9]:
x_feature = df.drop('activity',axis=1)
y_target = df['activity']


In [10]:
# by default the outlier fraction is 0.1 in generate data function 
outlier_fraction = 0.1

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
#Train test split with stratified sampling for evaluation
X_train, X_test, y_train, y_test = train_test_split(x_feature, 
                                                    y_target, 
                                                    test_size = .33, 
                                                    shuffle = True, 
                                                    stratify = y_target, 
                                                    random_state = 5)

In [14]:
models = {
     'Angle-based Outlier Detector (ABOD)'   : ABOD(contamination=outlier_fraction),

     'Isolation Forest': IForest(contamination=outlier_fraction,random_state=state),

     'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outlier_fraction,check_estimator=False, random_state=state),

     'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outlier_fraction),

     'K Nearest Neighbors (KNN)' :  KNN(contamination=outlier_fraction),
     
     "Support Vector Machine":OCSVM(kernel='rbf', degree=3, gamma=0.1,nu=0.05,max_iter=-1)
}

In [17]:
# store outliers and inliers in different numpy arrays
x_outliers, x_inliers = get_outliers_inliers(X_train,y_train)

n_inliers = len(x_inliers)
n_outliers = len(x_outliers)

#separate the two features and use it to plot the data 
F1 = X_train[:,[0]].reshape(-1,1)
F2 = X_train[:,[1]].reshape(-1,1)

# create a meshgrid 
xx , yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))


InvalidIndexError: (array([], dtype=int64),)

In [16]:
#set the figure size
plt.figure(figsize=(20, 20))

for i, (model_name,model) in enumerate(models.items()) :
    # fit the dataset to the model
    model.fit(X_train)

    # predict raw anomaly score
    scores_pred = model.decision_function(X_train)*-1

    # prediction of a datapoint category outlier or inlier
    y_pred = model.predict(X_train)

    # no of errors in prediction
    n_errors = (y_pred != y_train).sum()
    print('No of Errors : ',model_name, n_errors)

    # rest of the code is to create the visualization

    # threshold value to consider a datapoint inlier or outlier
    threshold = stats.scoreatpercentile(scores_pred,100 *outlier_fraction)

    # decision function calculates the raw anomaly score for every point
    Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)

    subplot = plt.subplot(2, 3, i + 1)

    # fill blue colormap from minimum anomaly score to threshold value
    subplot.contourf(xx, yy, Z, levels = np.linspace(Z.min(), threshold, 10),cmap=plt.cm.Blues_r)

    # draw red contour line where anomaly score is equal to threshold
    a = subplot.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red')

    # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],colors='orange')

    # scatter plot of inliers with white dots
    b = subplot.scatter(X_train[:-n_outliers, 0], X_train[:-n_outliers, 1], c='white',s=20, edgecolor='k') 
    # scatter plot of outliers with black dots
    c = subplot.scatter(X_train[-n_outliers:, 0], X_train[-n_outliers:, 1], c='black',s=20, edgecolor='k')
    subplot.axis('tight')

    subplot.legend(
        [a.collections[0], b, c],
        ['learned decision function', 'true inliers', 'true outliers'],
        prop=matplotlib.font_manager.FontProperties(size=10),
        loc='lower right')

    subplot.set_title(model_name)
    subplot.set_xlim((-10, 10))
    subplot.set_ylim((-10, 10))
plt.show() 

No of Errors :  Angle-based Outlier Detector (ABOD) 128984


NameError: name 'xx' is not defined

<Figure size 2000x2000 with 0 Axes>

Model Evaluation

In [None]:
from pyod.utils.data import evaluate_print
for i, (model_name,model) in enumerate(models.items()) :
    # fit the dataset to the model
    model.fit(X_train)

    # predict raw anomaly score
    scores_pred = model.decision_function(X_train)*-1

    # prediction of a datapoint category outlier or inlier
    y_pred = model.predict(X_train)
    y_train_scores = model.decision_scores_

    # no of errors in prediction
    n_errors = (y_pred != y_train).sum()

    print('No of Errors using {} : {} '.format(model_name, n_errors))
    
    evaluate_print(model_name, y_train, y_train_scores)
    print("\n")
    