### Anomaly Detection
- To identify the data points that differ from the other data points
- There are 3 categories
    - Supervised Anomaly Detection: labels are available for both normal and anomalous data
    - Semi-Supervised Anomaly Detection: labels are available only for normal data
    - Unsupervised Anomaly Detection: no labels are available
- Algorithms
    - Mahalanobis Distance: 
        - For one-dimensional data, the z-score can be used for outlier/anomaly detection. A data point is labeled as an anomaly if |z| > T (threshold)
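        - For multidimensional data, the z-score is not directly applicable. The Mahalanobis distance is used instead; it is defined as $D_M(x) = \sqrt{(x - \mu)^T \Sigma^{-1} (x - \mu)}$, where $\mu$ is the mean and $\Sigma$ the covariance matrix of the data (see https://en.wikipedia.org/wiki/Mahalanobis_distance)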
        
    - Density-Based Spatial Clustering of Applications with Noise (DBSCAN)
    - Local Outlier Factor (LOF)
    - Isolation Forest
    - One-Class Support Vector Machine

In [3]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# make dataset
from sklearn.datasets import make_blobs

# statistical modules for data generation and critical values
from scipy.stats import multivariate_normal, beta, uniform, t

from functools import partial

from sklearn import svm
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
from sklearn.covariance import EllipticEnvelope # use for Mohalanobis distance
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import SGDOneClassSVM

from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

from typing import Any


In [None]:
class MahalanobisOutlier(EllipticEnvelope):
    """Elliptic-envelope outlier detector with a significance-level threshold.

    The covariance fit is inherited from ``EllipticEnvelope``. ``predict`` is
    overridden so that a point is flagged as an outlier when its squared
    Mahalanobis distance exceeds a critical value derived from ``alpha``.

    Parameters
    ----------
    store_precision, assume_centered, support_fraction, contamination, random_state
        Forwarded unchanged to ``EllipticEnvelope``.
    alpha : float, default=0.05
        Significance level used to compute the critical distance in
        ``predict``.
    """

    def __init__(
        self,
        *,
        store_precision=True,
        assume_centered=False,
        support_fraction=None,
        contamination=0.1,
        alpha: float = 0.05,
        random_state=None,
    ) -> None:
        # The keyword must be spelled ``support_fraction``: that is the name
        # the parent constructor accepts and the attribute it stores.
        super().__init__(
            store_precision=store_precision,
            assume_centered=assume_centered,
            support_fraction=support_fraction,
            contamination=contamination,
            random_state=random_state,
        )
        self.alpha = alpha

    def predict(self, X):
        """Label each row of ``X`` as inlier (+1) or outlier (-1).

        Parameters
        ----------
        X : array-like
            Observations to score; passed as-is to ``self.mahalanobis``.

        Returns
        -------
        numpy.ndarray of int
            ``+1`` where the squared Mahalanobis distance is at most the
            critical value, ``-1`` otherwise.

        Notes
        -----
        The estimator must have been fitted (``location_`` and ``support_``
        are read). If ``n <= p + 1`` the critical value is NaN, so every
        point is labeled ``-1``.
        """
        # ``mahalanobis`` returns *squared* distances, so the thresholds
        # below are expressed on the squared scale as well.
        dist = self.mahalanobis(X)
        p = self.location_.shape[0]  # number of features
        n = self.support_.shape[0]  # number of training samples
        alpha = self.alpha

        scale = ((n - 1) ** 2) / n
        if p > 1:
            # Upper (1 - alpha) quantile of Beta(p/2, (n - p - 1)/2).
            critical = beta.ppf(1 - alpha, a=p / 2, b=(n - p - 1) / 2)
        else:
            # One dimension: Grubbs-style critical value from the Student t
            # distribution, t^2 / (n - 2 + t^2). This is equivalent to the
            # Beta(1/2, (n - 2)/2) quantile used by the branch above.
            t_sq = t.ppf(1 - alpha / 2.0, df=n - 2) ** 2
            critical = t_sq / (n - 2 + t_sq)

        threshold = scale * critical
        # Map the boolean mask to -1 (outlier) / +1 (inlier).
        return 2 * (dist <= threshold).astype("int") - 1
    
    

        