# HW3 Approaches in anomaly detection

## Reading the input data

In [1]:
from sklearn.neighbors import KernelDensity
from sklearn.cross_validation import train_test_split
import numpy as np
import scipy.io
mat = scipy.io.loadmat('oc_514.mat')
train = mat['x']
xtrain = train[0,0][0]

# Commented split of train and test as using k-fold cross validation approach for outlier detection.
X_train, X_test = train_test_split(
   xtrain, test_size=0.33, random_state=42)


## Anomaly Detection using KDE

In [2]:

#Using Gaussian Kernel Density Estimation
def GaussianKDE(X_train, X_test):
    #pdf = KernelDensity(bandwidth=0.25, kernel='linear')
    pdf = KernelDensity(bandwidth=0.1, kernel='gaussian')
    pdf.fit(X_train)
    resTrain = []
    resTest = []
    
    for data in X_train[:,:]:
        resTrain.append(np.exp(pdf.score_samples(data))[0])
    
    for data in X_test[:,:]:
        resTest.append(np.exp(pdf.score_samples(data))[0])
    
    nresTrain = np.array(resTrain)
    nresTest = np.array(resTest)
    
    # Assumption: "Normal data instances occur in high probability
    # regions of a stochastic model, while anomalies occur in the low
    # probability regions of the stochastic model."
    return nresTrain[nresTrain <= 0.05].size, nresTest[nresTest <= 0.05].size

In [3]:

# Cross Validation using K-Fold so that every data sample contributes in Training

from sklearn.cross_validation import KFold

X_train = xtrain
#print X_train.shape
n_folds = 5
kf = KFold(420, n_folds)
error_train = error_test = 0
for train, test in kf:
    #print train,test
    n_error_train, n_error_test = GaussianKDE(X_train[train,:], X_train[test,:])
    error_test += n_error_test
    error_train += n_error_train
    
print float(error_train)/n_folds , float(error_test)/n_folds


0.0 81.2


In [4]:
# Split the train and test set in the ratio of 3:1

n_error_train, n_error_test = GaussianKDE(X_train[:315,:], X_train[315:,:])


print n_error_train, n_error_test

0 105


### Performance using KDE for Anomaly Detection

In [5]:
print "Using KDE we got", (error_train + error_test)/n_folds , "anomalies out of 420 samples. This performs poorly as the number of anomalies is higher than the expected number of 183 as defined in the webpage of the dataset. Kernel Density Estimation (KDE) can be performed in any number of dimensions, though in practice the curse of dimensionality causes its performance to degrade in high dimensions which is the case with our dataset as it has 278 features." 

Using KDE we got 81 anomalies out of 420 samples. This performs poorly as the number of anomalies is higher than the expected number of 183 as defined in the webpage of the dataset. Kernel Density Estimation (KDE) can be performed in any number of dimensions, though in practice the curse of dimensionality causes its performance to degrade in high dimensions which is the case with our dataset as it has 278 features.
