# HW3 Approaches in anomaly detection

## Reading the input data

In [20]:
from sklearn.neighbors import KernelDensity
from sklearn.cross_validation import train_test_split
import numpy as np
import scipy.io
# Load the MATLAB data file from the notebook's working directory.
mat = scipy.io.loadmat('oc_514.mat')
# 'x' is the variable inside the .mat file that carries the data set.
train = mat['x']
# Unwrap the nested container down to the plain sample matrix
# (presumably a MATLAB struct whose first field is the data -- TODO confirm).
# A later cell prints its shape as (420, 278): 420 samples, 278 features.
xtrain = train[0,0][0]

# NOTE: this hold-out split is still executed (it is not commented out), but the
# cross-validation cells below reassign X_train to the full xtrain, and X_test is
# not used by any of the visible cells.
X_train, X_test = train_test_split(
   xtrain, test_size=0.33, random_state=42)


## Anomaly Detection using KDE

In [21]:

#Using Gaussian Kernel Density Estimation
def GaussianKDE(X_train, X_test, bandwidth=0.1, threshold=0.05):
    """Count low-density samples under a Gaussian kernel density estimate.

    A Gaussian KDE is fitted on ``X_train``; every sample whose estimated
    density is at or below ``threshold`` is counted as an anomaly.

    Parameters
    ----------
    X_train : 2-D array, one sample per row
        Data used to fit the density model (and scored as well).
    X_test : 2-D array, one sample per row
        Additional data scored with the fitted model. May have zero rows.
    bandwidth : float, optional
        Kernel bandwidth passed to ``KernelDensity`` (default 0.1, the value
        the notebook originally hard-coded; a linear kernel with bandwidth
        0.25 was also tried by the author).
    threshold : float, optional
        Density cut-off; samples with density <= threshold are anomalies
        (default 0.05).

    Returns
    -------
    (int, int)
        Number of anomalies found in ``X_train`` and in ``X_test``.
    """
    pdf = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    pdf.fit(X_train)

    def _count_low_density(samples):
        # Count the rows of `samples` whose estimated density is <= threshold.
        samples = np.asarray(samples)
        if samples.shape[0] == 0:
            # Nothing to score; an empty set contains no anomalies.
            return 0
        # score_samples returns log-densities and expects a 2-D array, so the
        # whole set is scored in one call instead of one 1-D row at a time.
        densities = np.exp(pdf.score_samples(samples))
        return int(np.count_nonzero(densities <= threshold))

    # Assumption: "Normal data instances occur in high probability
    # regions of a stochastic model, while anomalies occur in the low
    # probability regions of the stochastic model."
    return _count_low_density(X_train), _count_low_density(X_test)

In [22]:

# Cross-validation using K-Fold so that every data sample contributes to training

from sklearn.cross_validation import KFold

# Evaluate on the complete data set (this rebinds X_train to all of xtrain).
X_train = xtrain
#print X_train.shape
n_folds = 5
# Take the sample count from the data rather than hard-coding 420, so the
# folds always cover exactly the rows that were loaded.
kf = KFold(X_train.shape[0], n_folds)
error_train = error_test = 0
for train, test in kf:
    # `train` / `test` select the rows used for fitting and for held-out scoring.
    n_error_train, n_error_test = GaussianKDE(X_train[train,:], X_train[test,:])
    error_test += n_error_test
    error_train += n_error_train

# Mean anomaly count per fold: fitted part first, held-out part second.
# Formatting through "%s" prints the same text as the Python 2 print statement
# while also being valid Python 3.
print("%s %s" % (float(error_train)/n_folds, float(error_test)/n_folds))


0.0 81.2


In [23]:
# Split the train and test set in the ratio of 3:1

# Fit on the first 315 rows and score the remaining rows as the held-out part.
kde_fit_rows = X_train[:315, :]
kde_holdout_rows = X_train[315:, :]
n_error_train, n_error_test = GaussianKDE(kde_fit_rows, kde_holdout_rows)

# Anomaly counts: fitted part, then held-out part.
print("%s %s" % (n_error_train, n_error_test))

0 105


### Performance of KDE for Anomaly Detection

In [24]:
# Average anomaly count per fold (fitted part + held-out part) from the K-fold
# cell; floor division on the integer totals gives the same whole number the
# Python 2 `/` produced.
# NOTE(review): in the recorded run this count (81) is below the 183 quoted in
# the message, yet the message says "higher" -- confirm the intended wording or
# metric before relying on this conclusion.
kde_anomalies_per_fold = (error_train + error_test) // n_folds
print(" ".join([
    "Using KDE we got",
    str(kde_anomalies_per_fold),
    "anomalies out of 420 samples. This performs poorly as the number of anomalies is higher than the expected number of 183 as defined in the webpage of the dataset. Kernel Density Estimation (KDE) can be performed in any number of dimensions, though in practice the curse of dimensionality causes its performance to degrade in high dimensions which is the case with our dataset as it has 278 features.",
]))

Using KDE we got 81 anomalies out of 420 samples. This performs poorly as the number of anomalies is higher than the expected number of 183 as defined in the webpage of the dataset. Kernel Density Estimation (KDE) can be performed in any number of dimensions, though in practice the curse of dimensionality causes its performance to degrade in high dimensions which is the case with our dataset as it has 278 features.


## Anomaly Detection using One Class SVM

In [25]:
#xtrain = train[0,0][0]
def OutlierUsingOneClassSVM(X_train, X_test, nu=0.435, kernel="linear", gamma=0.1):
    """Count the samples a One-Class SVM labels as outliers.

    The model is fitted on ``X_train`` and then used to label both
    ``X_train`` and ``X_test``; a predicted label of -1 marks an outlier.

    Parameters
    ----------
    X_train : 2-D array, one sample per row
        Data used to fit the model (and labelled as well).
    X_test : 2-D array, one sample per row
        Additional data labelled with the fitted model.
    nu : float, optional
        ``nu`` parameter of ``OneClassSVM`` (default 0.435, the value the
        notebook was tuned to).
    kernel : str, optional
        Kernel type (default "linear").
    gamma : float, optional
        Kernel coefficient (default 0.1). It only matters for non-linear
        kernels such as "rbf"; it is kept so that switching kernels works
        without touching this function.

    Returns
    -------
    (int, int)
        Number of outliers among ``X_train`` and among ``X_test``.
    """
    # Imported here because the notebook's import cell does not provide
    # sklearn.svm. The plotting and I/O imports the function used to pull in
    # were never used and made it fail when matplotlib was unavailable.
    from sklearn import svm

    clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
    clf.fit(X_train)

    def _count_outliers(samples):
        # Number of rows in `samples` that the fitted model labels -1.
        return int((clf.predict(samples) == -1).sum())

    return _count_outliers(X_train), _count_outliers(X_test)


In [26]:
# Cross-validation using K-Fold so that every data sample contributes to training

from sklearn.cross_validation import KFold

# Evaluate on the complete data set (this rebinds X_train to all of xtrain).
X_train = xtrain
print(X_train.shape)
n_folds = 5
# Take the sample count from the data rather than hard-coding 420, so the
# folds always cover exactly the rows that were loaded.
kf = KFold(X_train.shape[0], n_folds)
error_train = error_test = 0
for train, test in kf:
    # `train` / `test` select the rows used for fitting and for held-out scoring.
    n_error_train, n_error_test = OutlierUsingOneClassSVM(X_train[train,:], X_train[test,:])
    error_train += n_error_train
    error_test += n_error_test

# Mean outlier count per fold: fitted part first, held-out part second.
# Formatting through "%s" prints the same text as the Python 2 print statement
# while also being valid Python 3.
print("%s %s" % (float(error_train)/n_folds, float(error_test)/n_folds))


(420L, 278L)
146.8 37.0


In [27]:
# Split the train and test set in the ratio of 4:1
# Fit on the first 336 rows and label the remaining rows as the held-out part.
svm_fit_rows = X_train[:336, :]
svm_holdout_rows = X_train[336:, :]
n_error_train, n_error_test = OutlierUsingOneClassSVM(svm_fit_rows, svm_holdout_rows)

# Outlier counts: fitted part, then held-out part.
print("%s %s" % (n_error_train, n_error_test))


147 37


### Performance using One Class SVM for Anomaly Detection

In [19]:
# Total outliers (fitted part + held-out part) taken from whichever cell last
# assigned n_error_train / n_error_test.
svm_anomaly_total = n_error_train + n_error_test
print(" ".join([
    "Using One Class SVM we got",
    str(svm_anomaly_total),
    "anomalies out of 420 samples. This performs well after tuning the kernel to be linear and with appropriate parameters as the number of anomalies matches the expected number of 183 as defined in the webpage of the dataset.",
]))


Using One Class SVM we got 184 anomalies out of 420 samples. This performs well after tuning the kernel to be linear and with appropriate parameters as the number of anomalies matches the expected number of 183 as defined in the webpage of the dataset.
