In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings; warnings.filterwarnings('ignore')

# LOF (local outlier factor)
from sklearn.neighbors import LocalOutlierFactor
# Isolation Forest
from sklearn.ensemble import IsolationForest
# Robust Random Cut Forest
# pip install -q rrcf
import rrcf
# SVM (support vector machine)
from sklearn.svm import LinearSVC # linear
from sklearn.svm import SVC # non-linear
from sklearn.svm import OneClassSVM # one-class svm

In [None]:
# -- 3 sigma rule & boxplot --

# UCL, LCL
# 1 sigma
df['UCL_X1'] = df['X1'].mean() + 1*df['X1'].std()
df['LCL_X1'] = df['X1'].mean() - 1*df['X1'].std()

df['UCL_X2'] = df['X2'].mean() + 1*df['X2'].std()
df['LCL_X2'] = df['X2'].mean() - 1*df['X2'].std()

# distribution을 확인해 tail이 있는지 확인
sns.distplot(df['X1'], color='blue', label='X1')
sns.distplot(df['X2'], color='red', label='X2')

# X1 plotting
%matplotlib inline
plt.style.use(['seaborn-white'])

sns.scatterplot(x=list(df.index), y=df['X1'], c='b')
plt.axhline(y=df['UCL_X1'][0], color='r', linewidth=1)
plt.axhline(y=df['LCL_X1'][0], color='r', linewidth=1)
plt.gcf().set_size_inches(15,5)

# boxplot
plt.style.use(['default'])
labels = ['X1', 'X2', 'X3']

fig, ax = plt.subplots(figsize=(3,7))
box = ax.boxplot([df['X1'], df['X2'], df['X3']], widths=.7, sym='b*', showmeans=True)
ax.set_ylim(-10.0, 13.0)
ax.set_xlabel("X's")
ax.set_ylabel('Value')
plt.show()

In [None]:
# LOF (Local Outlier Factor)

# parameters
    # n_neighbors : number of neighbouring observations used to estimate the local density
    # algorithm : nearest-neighbour search algorithm
        # auto, ball_tree, kd_tree, brute
    # leaf_size : for tree algorithm
        # ball_tree, kd_tree
    # metric : cityblock, cosine, euclidean, l1, l2, manhattan, minkowski
        # default=minkowski
    # p : minkowski parameter
        # p=2 by default -> just euclidean_distance
    # contamination : expected proportion of outliers

# set up
# NOTE(review): the 3-sigma cell reads upper-case 'X1'/'X2' - confirm df's column names.
features = df[['x1', 'x2']]
lof = LocalOutlierFactor(n_neighbors=10,  # 10-distance
                         contamination=.1  # outlier proportion
                         )

# fit_predict returns 1 for inliers and -1 for outliers; recode to 0 = normal, 1 = outlier
y_pred = np.where(lof.fit_predict(features) == 1, 0, 1)

n_errors = (y_pred != df['Y']).sum()

# score: the fitted attribute carries a trailing underscore
X_scores = lof.negative_outlier_factor_  # needs scaling

# Min-max scale so the most anomalous point (lowest score) gets radius 1
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())

# -- plotting --
# Two views of the same scores; they differ only in which points are drawn in red:
#   1. adjusting percentile    : the top 1% of the scaled scores
#   2. based on model prediction : the points the model itself flagged
outlier_masks = (
    radius >= np.percentile(radius, 99),  # threshold computed once, not per point
    y_pred == 1,
)

for is_outlier in outlier_masks:
    plt.figure(figsize=(13,8))
    plt.title('LOF')
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], color='k', s=3.0, label='Data points')
    # circles with area proportional to the outlier score: red = abnormal, blue = normal
    plt.scatter(
        df.iloc[:, 0],
        df.iloc[:, 1],
        s=1000*radius,
        edgecolors=np.where(is_outlier, 'r', 'b').tolist(),
        facecolors='none',
        # label='Outlier scores'
    )

    plt.axis('tight')
    plt.xlim((-10,10))
    plt.ylim((-10,10))
    plt.xlabel('prediction errors: %d' % (n_errors))
    legend = plt.legend(loc='upper left')
    # legend.legendHandles[0]._sizes = [10]
    # legend.legendHandles[1]._sizes = [20]
    plt.show()

In [None]:
# Isolation Forest

# parameters
    # n_estimators
    # max_samples : default='auto'
    # contamination
    # max_features
    # bootstrap : not recommended
        # default=False
        # outliers may end up never being sampled

# set up
features = df[['x1', 'x2']]
IF = IsolationForest(n_estimators=150,
                     max_samples=500,
                     contamination=.1,
                     random_state=42  # fixed seed so a re-run reproduces the same forest
                     )
IF.fit(features)

# predict returns 1 for inliers and -1 for outliers; recode to 0 = normal, 1 = outlier
y_pred = np.where(IF.predict(features) == 1, 0, 1)

n_errors = (y_pred != df['Y']).sum()

# X_scores
X_scores = IF.score_samples(features)

# Min-max scale so the most anomalous point (lowest score) gets radius 1
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())

# -- plotting --
# Each entry is (mask of points drawn in red, marker sizes):
#   1. adjusting percentile    : top 5% of the scaled scores, area ~ radius
#   2. based on model prediction : model-flagged points, area ~ radius**2 with the
#      flagged points enlarged a further 10x
pct_outlier = radius >= np.percentile(radius, 95)  # threshold computed once, not per point
pred_outlier = y_pred == 1
views = (
    (pct_outlier, 1000*radius),
    (pred_outlier, np.where(pred_outlier, 10000, 1000)*radius**2),
)

for is_outlier, sizes in views:
    plt.figure(figsize=(13,8))
    plt.title('Isolation Forest')
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], color='k', s=3.0, label='Data points')
    # circles sized by the outlier score: red = abnormal, blue = normal
    plt.scatter(
        df.iloc[:, 0],
        df.iloc[:, 1],
        s=sizes,
        edgecolors=np.where(is_outlier, 'r', 'b').tolist(),
        facecolors='none',
        # label='Outlier scores'
    )

    plt.axis('tight')
    plt.xlim((-10,10))
    plt.ylim((-10,10))
    plt.xlabel('prediction errors: %d' % (n_errors))
    legend = plt.legend(loc='upper left')
    # legend.legendHandles[0]._sizes = [10]
    # legend.legendHandles[1]._sizes = [20]
    plt.show()

In [None]:
# Robust Random Cut Forest

# Set parameters
num_trees = 150
tree_size = 500

n_points = df_array.shape[0]
if n_points == 0:
    raise ValueError('df_array is empty; cannot build a forest')
# A tree cannot hold more points than exist. Without this clamp n_points // tree_size
# is 0, no trees are sampled per round and the while loop below never terminates.
tree_size = min(tree_size, n_points)
trees_per_round = n_points // tree_size

# Construct forest
forest = []
while len(forest) < num_trees:
    # Select random subsets of points uniformly from point set
    ixs = np.random.choice(n_points,
                           size=(trees_per_round, tree_size),
                           replace=False  # no duplicates within one round
                           )
    # Add sampled trees to forest (built on the first two columns of df_array)
    forest.extend(rrcf.RCTree(df_array[ix, :2], index_labels=ix) for ix in ixs)

# Compute average CoDisp
avg_codisp = pd.Series(0.0, index=np.arange(n_points))
index = np.zeros(n_points)  # how many trees each point appeared in
for tree in forest:
    codisp = pd.Series({leaf : tree.codisp(leaf) for leaf in tree.leaves})
    avg_codisp[codisp.index] += codisp
    np.add.at(index, codisp.index.values, 1)
avg_codisp /= index  # a point that was never sampled gives 0/0 = NaN

# plotting
# sweep the percentile threshold in a for loop and inspect each result
# assumes df and df_array hold the same rows in the same order - TODO confirm
radius = avg_codisp.to_numpy()

for p in range(80, 99):
    # computed once per figure; the nan-aware version keeps a never-sampled point
    # from turning the threshold into NaN and blanking the whole plot
    is_outlier = radius >= np.nanpercentile(radius, p)

    plt.figure(figsize=(13,8))
    plt.title('RRCF {} percentile'.format(p))
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], color='k', s=3.0, label='Data points')
    # circles with area proportional to the average CoDisp: red = abnormal, blue = normal
    plt.scatter(
        df.iloc[:, 0],
        df.iloc[:, 1],
        s=10*radius,
        edgecolors=np.where(is_outlier, 'r', 'b').tolist(),
        facecolors='none',
        # label='Outlier scores'
    )

    plt.axis('tight')
    plt.xlim((-10,10))
    plt.ylim((-10,10))
    # plt.xlabel('prediction errors: %d' % (n_errors))
    legend = plt.legend(loc='upper left')
    # legend.legendHandles[0]._sizes = [10]
    # legend.legendHandles[1]._sizes = [20]
    plt.show()

In [None]:
# One-Class SVM

# parameters
    # nu : controls the allowed fraction of errors
        # larger nu -> more points flagged as abnormal
    # kernel
        # linear
        # poly
        # rbf : Gaussian
        # sigmoid
        # precomputed
        # default  : rbf
    # degree : only for poly
    # gamma : kernel coefficient for rbf, poly, sigmoid

# set up
# One feature frame for fit, predict and scoring, so all three see the same columns.
# NOTE(review): this cell previously scored on 'X1'/'X2' while fitting on 'x1'/'x2';
# lower-case is kept to match the fit call - confirm df's actual column names.
features = df[['x1', 'x2']]
OCSVM = OneClassSVM(nu=.01,
                    kernel='poly',
                    gamma=1
                    )
OCSVM.fit(features)

# predict returns 1 for inliers and -1 for outliers; recode to 0 = normal, 1 = outlier
y_pred = np.where(OCSVM.predict(features) == 1, 0, 1)

n_errors = (y_pred != df['Y']).sum()

# anomaly score
X_scores = OCSVM.score_samples(features)

# Min-max scale so the most anomalous point (lowest score) gets radius 1
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
# top 1% of the scaled scores are drawn as abnormal; threshold computed once, not per point
is_outlier = radius >= np.percentile(radius, 99)

# plotting
plt.figure(figsize=(13,8))
plt.title('One-Class SVM')
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], color='k', s=3.0, label='Data points')
# circles with area proportional to the outlier score: red = abnormal, blue = normal
plt.scatter(
    df.iloc[:, 0],
    df.iloc[:, 1],
    s=1000*radius,
    edgecolors=np.where(is_outlier, 'r', 'b').tolist(),
    facecolors='none',
    # label='Outlier scores'
)

plt.axis('tight')
plt.xlim((-10,10))
plt.ylim((-10,10))
# plt.xlabel('prediction errors: %d' % (n_errors))
legend = plt.legend(loc='upper left')
# legend.legendHandles[0]._sizes = [10]
# legend.legendHandles[1]._sizes = [20]
plt.show()