# Histogram Based Outlier Detection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the fridge power log from fridge1.csv (path is relative to the
# notebook's working directory) and preview the first ten rows.
df = pd.read_csv('fridge1.csv')
df.head(10)

Unnamed: 0,physical_quantity,power
0,2012-12-14 22:21:32+00:00,85
1,2012-12-14 22:21:38+00:00,85
2,2012-12-14 22:21:44+00:00,84
3,2012-12-14 22:21:50+00:00,85
4,2012-12-14 22:21:56+00:00,85
5,2012-12-14 22:22:02+00:00,85
6,2012-12-14 22:22:08+00:00,85
7,2012-12-14 22:22:14+00:00,85
8,2012-12-14 22:22:20+00:00,85
9,2012-12-14 22:22:26+00:00,86


In [3]:
# Summarize the frame: row count, per-column non-null counts, dtypes, memory.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048574 entries, 0 to 1048573
Data columns (total 2 columns):
physical_quantity    1048574 non-null object
power                1048574 non-null int64
dtypes: int64(1), object(1)
memory usage: 16.0+ MB


In [4]:
# Inspect column dtypes before converting the timestamp column.
df.dtypes

physical_quantity    object
power                 int64
dtype: object

In [5]:
# Convert the physical_quantity column (timestamp strings) to datetimes,
# overwriting the column, then re-check the dtypes via info().
df['physical_quantity'] = pd.to_datetime(df['physical_quantity'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048574 entries, 0 to 1048573
Data columns (total 2 columns):
physical_quantity    1048574 non-null datetime64[ns, UTC]
power                1048574 non-null int64
dtypes: datetime64[ns, UTC](1), int64(1)
memory usage: 16.0 MB


In [6]:
# Promote the timestamp column to the index. This mutates df in place, so the
# column no longer exists afterwards; reload df before re-running this cell.
df.set_index('physical_quantity',inplace=True)

In [7]:
# NOTE(review): datetime is not referenced in any visible cell — confirm
# whether this import is still needed.
import datetime
# Strip the timezone from the index so the timestamps become tz-naive.
df.index= df.index.tz_localize(None)
df.head(10)

Unnamed: 0_level_0,power
physical_quantity,Unnamed: 1_level_1
2012-12-14 22:21:32,85
2012-12-14 22:21:38,85
2012-12-14 22:21:44,84
2012-12-14 22:21:50,85
2012-12-14 22:21:56,85
2012-12-14 22:22:02,85
2012-12-14 22:22:08,85
2012-12-14 22:22:14,85
2012-12-14 22:22:20,85
2012-12-14 22:22:26,86


In [8]:
# Re-check the frame after re-indexing: index type, date range, remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1048574 entries, 2012-12-14 22:21:32 to 2013-03-25 00:06:20
Data columns (total 1 columns):
power    1048574 non-null int64
dtypes: int64(1)
memory usage: 16.0 MB


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the power readings.
df.describe()

Unnamed: 0,power
count,1048574.0
mean,35.65282
std,51.62273
min,0.0
25%,0.0
50%,0.0
75%,85.0
max,2984.0


In [10]:
# (rows, columns) of the frame.
print(df.shape)

(1048574, 1)


In [11]:
# True if at least one value anywhere in the frame is missing.
df.isnull().values.any()

False

In [12]:
# One-time setup, left commented out: install the pyod dependency.
#!pip install pyod


In [13]:
from pyod.models import hbos

In [14]:
# Expected share of outliers in the data; passed to HBOS as `contamination`.
outliers_fraction = 0.01
clf = hbos.HBOS(contamination = outliers_fraction)
# Fit on the single power feature. The double brackets select a DataFrame
# (2-D) rather than a Series.
clf.fit(df[['power']])

HBOS(alpha=0.1, contamination=0.01, n_bins=10, tol=0.5)

Histogram Based Outlier Score

$HBOS(v) = \sum\limits_{i=0}^{d}\log\left(\frac{1}{hist_i(v)}\right)$

$hist_i(v)$ = Density Estimation

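The decision function score returned by pyod is this HBOS value: the summed log-densities with their order inverted, so a larger score means a more anomalous point.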

In [15]:
# Label every reading with the fitted model (1 = outlier, 0 = inlier) and
# store its raw outlier score alongside the power value.
y_pred = clf.predict(df[['power']])
df['Score'] = clf.decision_function(df[['power']])

# Count the flagged rows once; everything else is an inlier.
n_outliers = np.count_nonzero(y_pred)
n_inliers = len(y_pred) - n_outliers

# df1 is another name for the same DataFrame object (no copy is made), so the
# 'outlier' column added here is visible through df as well.
df1 = df
df1['outlier'] = y_pred.tolist()

In [16]:
# Raw outlier scores as a standalone array; this is the same call that filled
# df['Score'] in the previous cell.
score = clf.decision_function(df[['power']])
score

array([3.27439165, 3.27439165, 3.27439165, ..., 3.27439165, 3.27439165,
       3.27439165])

In [17]:
# Cut-off separating inliers from outliers. pyod labels a point as an outlier
# when its score is strictly greater than the (1 - contamination) percentile
# of the scores, so the threshold must sit in the upper tail. Using
# 100*outliers_fraction would instead return the lowest 1% of the scores.
threshold = np.percentile(score, 100 * (1 - outliers_fraction))
round(threshold,3)

3.274

score > Threshold -------> Anomaly  (1)

score <= Threshold ------> Normal  (0)

In [18]:
# Number of predicted labels.
len(y_pred)

1048574

In [19]:
# Inlier count followed by outlier count.
print(n_inliers, n_outliers)

1048153 421


In [20]:
# Cross-check: number of non-zero labels in y_pred (same computation as n_outliers).
print(np.count_nonzero(y_pred))

421


In [21]:
# df also shows the Score and outlier columns, because df1 was bound to the
# same DataFrame object rather than a copy.
df.head()

Unnamed: 0_level_0,power,Score,outlier
physical_quantity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-12-14 22:21:32,85,3.274392,0
2012-12-14 22:21:38,85,3.274392,0
2012-12-14 22:21:44,84,3.274392,0
2012-12-14 22:21:50,85,3.274392,0
2012-12-14 22:21:56,85,3.274392,0


In [22]:
# Per-timestamp outlier labels.
df1['outlier']

physical_quantity
2012-12-14 22:21:32    0
2012-12-14 22:21:38    0
2012-12-14 22:21:44    0
2012-12-14 22:21:50    0
2012-12-14 22:21:56    0
                      ..
2013-03-25 00:05:52    0
2013-03-25 00:05:59    0
2013-03-25 00:06:06    0
2013-03-25 00:06:13    0
2013-03-25 00:06:20    0
Name: outlier, Length: 1048574, dtype: int64

In [23]:
# First rows of the labelled frame.
df1.head()

Unnamed: 0_level_0,power,Score,outlier
physical_quantity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-12-14 22:21:32,85,3.274392,0
2012-12-14 22:21:38,85,3.274392,0
2012-12-14 22:21:44,84,3.274392,0
2012-12-14 22:21:50,85,3.274392,0
2012-12-14 22:21:56,85,3.274392,0


In [24]:
# Last ten rows of the labelled frame.
df1.tail(10)

Unnamed: 0_level_0,power,Score,outlier
physical_quantity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-03-25 00:05:17,0,3.274392,0
2013-03-25 00:05:24,0,3.274392,0
2013-03-25 00:05:31,0,3.274392,0
2013-03-25 00:05:38,0,3.274392,0
2013-03-25 00:05:45,0,3.274392,0
2013-03-25 00:05:52,0,3.274392,0
2013-03-25 00:05:59,0,3.274392,0
2013-03-25 00:06:06,0,3.274392,0
2013-03-25 00:06:13,0,3.274392,0
2013-03-25 00:06:20,0,3.274392,0


In [25]:
# Split the power readings by predicted label, each as a column vector of
# shape (n, 1), then report how many rows fell on each side.
inliers = df1.loc[df1['outlier'] == 0, 'power'].values.reshape(-1, 1)
outliers = df1.loc[df1['outlier'] == 1, 'power'].values.reshape(-1, 1)

print("Anomalies : ",n_outliers)
print("Inliers : ",n_inliers)

Anomalies :  421
Inliers :  1048153


In [26]:
# Total number of rows in the labelled frame.
len(df1)

1048574

In [27]:
# Share of rows flagged as anomalies, as an unrounded percentage.
print("Anomalies Percentage : ", (n_outliers/len(df1))*100, "%")

Anomalies Percentage :  0.04014976530030308 %


In [28]:
# Share of rows kept as inliers, as an unrounded percentage.
print("Inliers Percentage : ", (n_inliers/len(df1))*100, "%")

Inliers Percentage :  99.9598502346997 %


In [29]:
# Power readings labelled as inliers, as an (n, 1) array.
inliers

array([[85],
       [85],
       [84],
       ...,
       [ 0],
       [ 0],
       [ 0]], dtype=int64)

In [30]:
# Power readings labelled as outliers, as an (n, 1) array.
outliers

array([[ 323],
       [ 468],
       [ 467],
       [ 458],
       [ 437],
       [1461],
       [ 457],
       [1654],
       [1730],
       [ 480],
       [1678],
       [ 399],
       [ 426],
       [1749],
       [ 907],
       [ 470],
       [2420],
       [ 473],
       [1663],
       [ 504],
       [1778],
       [1747],
       [1523],
       [ 365],
       [ 330],
       [1171],
       [ 451],
       [ 405],
       [1774],
       [ 343],
       [ 483],
       [ 532],
       [ 430],
       [ 729],
       [ 524],
       [1771],
       [1778],
       [1646],
       [ 807],
       [1781],
       [ 378],
       [ 840],
       [1838],
       [ 788],
       [ 694],
       [ 493],
       [1828],
       [1769],
       [ 438],
       [ 327],
       [ 425],
       [ 377],
       [1469],
       [ 328],
       [1701],
       [ 464],
       [ 390],
       [ 515],
       [ 468],
       [ 521],
       [ 396],
       [ 413],
       [1636],
       [ 415],
       [ 299],
       [ 344],
       [ 3

In [31]:
# Number of inlier rows in the split array.
len(inliers)

1048153

In [32]:
# Number of outlier rows in the split array.
len(outliers)

421

In [33]:
# Try the fitted model on a few hand-picked power values and tabulate, for
# each one, the input, its outlier score and the predicted label.
a = [1200, 85, 0, 95, 323]
a_df = pd.DataFrame(a)

y1 = clf.predict(a_df)
y3 = pd.DataFrame(clf.decision_function(a_df))

# Side-by-side table: input value, score, label.
y2 = pd.concat([a_df, y3, pd.DataFrame(y1)], axis=1)
y2.columns = ['Power', 'Score', 'Target']

print("Predicted Values:")
y2

Predicted Values:


Unnamed: 0,Power,Score,Target
0,1200,3.321927,1
1,85,3.274392,0
2,0,3.274392,0
3,95,3.274392,0
4,323,3.321916,1


In [34]:
# Final summary of the HBOS run: counts and percentages (rounded to two
# decimals) of anomalies and inliers over all rows of df1.
total_instances = len(df1)

print("Results of Histogram Based Outlier Detection Model:")

print("Anomalies : ",n_outliers)
print("Total instances : ",total_instances)
print("Anomalies Percentage : ", round((n_outliers/total_instances)*100,2), "%")
print("\n")
# Label corrected from the misspelled "Inliars".
print("Inliers : ",n_inliers)
print("Total instances : ",total_instances)
print("Inliers Percentage : ", round((n_inliers/total_instances)*100,2), "%")

Results of Histogram Based Outlier Detection Model:
Anomalies :  421
Total instances :  1048574
Anomalies Percentage :  0.04 %


Inliars :  1048153
Total instances :  1048574
Inliers Percentage :  99.96 %
