In [63]:
import pandas as pd
import numpy as np

# Import models
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
# reading the big mart sales training data
df = pd.read_csv("CPU_UTIL.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 33 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Label                36 non-null     object 
 1   CPUCreditUsage       36 non-null     float64
 2   NetworkPacketsOut    35 non-null     float64
 3   MetadataNoToken      36 non-null     float64
 4   CPUUtilization       36 non-null     float64
 5   NetworkPacketsOut.1  36 non-null     float64
 6   CPUCreditBalance     36 non-null     int64  
 7   NetworkPacketsIn     35 non-null     float64
 8   NetworkOut           36 non-null     float64
 9   NetworkPacketsIn.1   36 non-null     float64
 10  MetadataNoToken.1    36 non-null     float64
 11  CPUCreditBalance.1   36 non-null     int64  
 12  CPUUtilization.1     35 non-null     float64
 13  NetworkIn            35 non-null     float64
 14  NetworkIn.1          36 non-null     float64
 15  NetworkOut.1         35 non-null     float

In [69]:
# 날짜데이터를 Datetime 형식으로 바꿔준다
df.loc[:,'Label'] = pd.to_datetime(df.Label)

# 데이터의 정합성을 확인한다
df.isna().sum()

print("삭제 전 데이터 길이(일자수):",len(df))
df = df.dropna(axis=0).reset_index(drop=True)

print("삭제 후 데이터 길이(일자수):",len(df))
df.isna().sum()

삭제 전 데이터 길이(일자수): 36
삭제 후 데이터 길이(일자수): 35


Label                  0
CPUCreditUsage         0
NetworkPacketsOut      0
MetadataNoToken        0
CPUUtilization         0
NetworkPacketsOut.1    0
CPUCreditBalance       0
NetworkPacketsIn       0
NetworkOut             0
NetworkPacketsIn.1     0
MetadataNoToken.1      0
CPUCreditBalance.1     0
CPUUtilization.1       0
NetworkIn              0
NetworkIn.1            0
NetworkOut.1           0
CPUCreditUsage.1       0
MetadataNoToken.2      0
CPUCreditBalance.2     0
NetworkPacketsIn.2     0
NetworkOut.2           0
CPUCreditUsage.2       0
NetworkIn.2            0
CPUUtilization.2       0
NetworkPacketsOut.2    0
CPUCreditUsage.3       0
NetworkPacketsOut.3    0
CPUUtilization.3       0
NetworkOut.3           0
MetadataNoToken.3      0
NetworkPacketsIn.3     0
CPUCreditBalance.3     0
NetworkIn.3            0
dtype: int64

In [72]:
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams

plt.rcParams["figure.figsize"] = (10,5)
# Line Graph by matplotlib with wide-form DataFrame

plt.plot(df.Label, df.Close, marker='s', color='r')
plt.plot(df.Label, df.High, marker='o', color='g')
plt.plot(df.Label, df.Low, marker='*', color='b')
plt.plot(df.Label, df.Open, marker='+', color='y')

plt.title('KOSPI ', fontsize=20) 
plt.ylabel('Stock', fontsize=14)
plt.xlabel('Date', fontsize=14)
plt.legend(['Close', 'High', 'Low', 'Open'], fontsize=12, loc='best')

plt.show()

AttributeError: 'DataFrame' object has no attribute 'Close'

In [62]:
# Extract the readings from the BROKEN state of the pump
broken = df[df['machine_status']=='BROKEN']
# Extract the names of the numerical columns
df2 = df.drop(['machine_status'], axis=1)
names=df2.columns
# Plot time series for each sensor with BROKEN state marked with X in red color
for name in names:
    _ = plt.figure(figsize=(18,3))
    _ = plt.plot(broken[name], linestyle='none', marker='X', color='red', markersize=12)
    _ = plt.plot(df[name], color='blue')
    _ = plt.title(name)
    plt.show()

KeyError: 'machine_status'

In [50]:
X1 = df['CPUCreditUsage'].values.reshape(-1,1)
X2 = df['NetworkPacketsOut'].values.reshape(-1,1)

X = np.concatenate((X1,X2),axis=1)

In [57]:
random_state = np.random.RandomState(42)
outliers_fraction = 0.05
# Define seven outlier detection tools to be compared
classifiers = {
        'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=random_state),
        'Feature Bagging':FeatureBagging(LOF(n_neighbors=35),contamination=outliers_fraction,check_estimator=False,random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean',contamination=outliers_fraction)
}

In [58]:
xx , yy = np.meshgrid(np.linspace(0,1 , 200), np.linspace(0, 1, 200))

for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(X)
    # predict raw anomaly score
    scores_pred = clf.decision_function(X) * -1
        
    # prediction of a datapoint category outlier or inlier
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)
    plt.figure(figsize=(10, 10))
    
    # copy of dataframe
    dfx = df
    dfx['outlier'] = y_pred.tolist()
    
    # IX1 - inlier feature 1,  IX2 - inlier feature 2
    IX1 =  np.array(dfx['Item_MRP'][dfx['outlier'] == 0]).reshape(-1,1)
    IX2 =  np.array(dfx['Item_Outlet_Sales'][dfx['outlier'] == 0]).reshape(-1,1)
    
    # OX1 - outlier feature 1, OX2 - outlier feature 2
    OX1 =  dfx['CPUCreditUsage'][dfx['outlier'] == 1].values.reshape(-1,1)
    OX2 =  dfx['NetworkPacketsOut'][dfx['outlier'] == 1].values.reshape(-1,1)
         
    print('OUTLIERS : ',n_outliers,'INLIERS : ',n_inliers, clf_name)
        
    # threshold value to consider a datapoint inlier or outlier
    threshold = stats.scoreatpercentile(scores_pred,100 * outliers_fraction)
        
    # decision function calculates the raw anomaly score for every point
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)
          
    # fill blue map colormap from minimum anomaly score to threshold value
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),cmap=plt.cm.Blues_r)
        
    # draw red contour line where anomaly score is equal to thresold
    a = plt.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red')
        
    # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()],colors='orange')
        
    b = plt.scatter(IX1,IX2, c='white',s=20, edgecolor='k')
    
    c = plt.scatter(OX1,OX2, c='black',s=20, edgecolor='k')
       
    plt.axis('tight')  
    
    # loc=2 is used for the top left corner 
    plt.legend(
        [a.collections[0], b,c],
        ['learned decision function', 'inliers','outliers'],
        prop=matplotlib.font_manager.FontProperties(size=20),
        loc=2)
      
    plt.xlim((0, 1))
    plt.ylim((0, 1))
    plt.title(clf_name)
    plt.show()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').