# Experiment 1
Miguel Sandim, 27/03/2017

## Algorithms
- One-class SVM
- LOF
- DBSCAN
- Multivariate Guassian distribution

## Combiner Methodology
- Stacking with Logistic Regression

## Evaluation Methodology
- ROC-AUC with 10-fold cross validation

## Datasets

All the duplicate instances in each dataset were removed (may have problems with LOF).

- ALOI
- KDDCup99


In [1]:
setwd("~/master-thesis")
set.seed(20)
library(ggplot2)
library(dplyr)
library(caret)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Loading required package: lattice


### Divided the dataset in 10 folds, each fold with 0.4% outliers (in KDDCup99) and 4% outliers (in ALOI)

In [2]:
# Load data
data_kddcup <- read.csv("datasets_processed/KDDCup99.csv")
data_aloi <- read.csv("datasets_processed/ALOI.csv")
dim(data_kddcup)
dim(data_aloi)

In [3]:
createStratCV <- function(data, k_value = 10)
{
  folds <- createFolds(data$outlier, k = k_value, list = TRUE, returnTrain = FALSE)
  folds_index <- lapply(folds, function(f) data[f, "id"])
}

In [4]:
folds_kddcup <- createStratCV(data_kddcup, k_value = 5)

In [5]:
folds_aloi <- createStratCV(data_aloi, k_value = 5)

In [6]:
table(data_kddcup[folds_kddcup[[1]], "outlier"])


  no  yes 
7818   30 

In [7]:
prop.table(table(data_kddcup[folds_kddcup[[1]], "outlier"])) * 100


       no       yes 
99.617737  0.382263 

In [8]:
str(folds_kddcup)

List of 5
 $ Fold1: int [1:9622] 1 4 5 17 18 27 30 33 34 38 ...
 $ Fold2: int [1:9623] 14 15 23 32 47 50 74 75 76 82 ...
 $ Fold3: int [1:9622] 10 11 13 25 26 41 45 48 49 59 ...
 $ Fold4: int [1:9623] 9 12 19 36 37 42 44 51 54 56 ...
 $ Fold5: int [1:9623] 6 7 8 16 20 22 24 28 29 31 ...


### Cross-validation Evaluation

In [9]:
# Unit test: train and test should be exclusive:
evaluateDataset <- function(data_folds_id)
{
  results <- lapply(names(data_folds_id),
    function(fold) # fold is the name of the fold
    {
      train <- stack(data_folds_id[names(data_folds_id) != fold])$values
      test <- data_folds_id[[fold]]
      list(train = train, test = test)
    })
}

results <- evaluateDataset(folds_kddcup)
all(sapply(results, function(x) length(intersect(x$train, x$test)) == 0 && !is.null(x$train) && !is.null(x$test)))
rm(results)
rm(evaluateDataset)

In [10]:
library(dbscan)
library(e1071)
library(mclust)

Package 'mclust' version 5.2.3
Type 'citation("mclust")' for citing this R package in publications.


#### Details:
![title](http://i.imgur.com/Pn3g3cr.png)

In [61]:
getFeatures <- function(data, data_folds_id)
{
  results <- lapply(names(data_folds_id) %>% first, # Only doing for the first fold for now
    function(fold_name)
    {
      # Train data: all the folds except the current fold
      train <- stack(data_folds_id[names(data_folds_id) != fold_name])$values
      
      # Train with inliers: half of the inliers in the train data
      train_inliers <- data %>%
        filter(id %in% train & outlier == "no") %>% .$id %>%
        sample(., size = length(.) / 2)
      
      # Train ensemble: the rest of the data
      train_ensemble <- setdiff(train, train_inliers)
      
      test <- data_folds_id[[fold_name]]
      
      print(paste0("total_train: ", length(train), " inliers: ", length(train_inliers), " ensemble: ", length(train_ensemble), " test: ", length(test)))
      
      # Remove class column:
      train_inliers_data <- data %>% filter(id %in% train_inliers)
      train_ensemble_data <- data %>% filter(id %in% train_ensemble)
      test_data <- data %>% filter(id %in% test)
      
      print(dim(train_inliers_data))
      
      #stopifnot(all(train_inliers_data$outlier == "no"))
      
      ## Convension 1 - Outlier, 0 - Non-outlier
      # Unsupervised Algorithms:
      alg1_results <- (dbscan(as.matrix(select(train_ensemble_data, -outlier)), eps = 0.3, minPts = 10)$cluster == 0) %>% as.numeric
      alg2_results <- lof(select(train_ensemble_data, -outlier), k = 3)
      message("*   Finished unsupervised algorithms")
      
      # Supervised or Semi-Supervised Algorithms:  
      alg3_model <- svm(select(train_inliers_data, -outlier), type='one-classification', scale = FALSE)
      alg3_results <- predict(alg3_model, select(train_ensemble_data, -outlier)) %>% `!`(.) %>% as.numeric # True => Not outlier
      #message("*   Finished One-class SVM")
      #alg4_model <- 
      message("*** Finished fold")
      
      # Add prediction to the original dataset:
      train_ensemble_data <- train_ensemble_data %>% mutate(dbscan = alg1_results, lof = alg2_results, svm = alg3_results)
      
      return(list(train_ensemble_data, train_ensemble_data, test_data))
    })
}

In [62]:
x <- getFeatures(data_kddcup, folds_kddcup)

[1] "total_train: 38491 inliers: 19165 ensemble: 19326 test: 9622"
[1] 19165    81


*   Finished unsupervised algorithms
*** Finished fold


In [63]:
features <- x[[1]][[1]]
features$dbscan <- ifelse(features$dbscan == 1, "yes", "no") %>% factor(levels = c("no", "yes"))
features$svm <- ifelse(features$svm == 1, "yes", "no") %>% factor(levels = c("no", "yes"))
summary(features)

 protocol_type...udp protocol_type...tcp protocol_type...icmp service...private
 Min.   :0.00000     Min.   :0.0000      Min.   :0.000000     Min.   :0.00000  
 1st Qu.:0.00000     1st Qu.:1.0000      1st Qu.:0.000000     1st Qu.:0.00000  
 Median :0.00000     Median :1.0000      Median :0.000000     Median :0.00000  
 Mean   :0.07932     Mean   :0.9145      Mean   :0.006157     Mean   :0.02442  
 3rd Qu.:0.00000     3rd Qu.:1.0000      3rd Qu.:0.000000     3rd Qu.:0.00000  
 Max.   :1.00000     Max.   :1.0000      Max.   :1.000000     Max.   :1.00000  
 service...domain_u service...http   service...smtp    service...ftp_data
 Min.   :0.00000    Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   
 1st Qu.:0.00000    1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:0.00000   
 Median :0.00000    Median :1.0000   Median :0.00000   Median :0.00000   
 Mean   :0.05547    Mean   :0.8116   Mean   :0.06375   Mean   :0.02173   
 3rd Qu.:0.00000    3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000

DBSCAN said everything is an outlier, as well as the One-class SVM

In [67]:
cor(features_cor %>% select(outlier, dbscan, lof, svm)

ERROR: Error in parse(text = x, srcfile = src): <text>:2:0: unexpected end of input
1: cor(features_cor %>% select(outlier, dbscan, lof, svm)
   ^


In [73]:
library(corrplot)
features_cor <- features
features_cor$outlier <- as.numeric(features_cor$outlier)
features_cor$dbscan <- as.numeric(features_cor$dbscan)
features_cor$svm <- as.numeric(features_cor$svm)
cor(features_cor %>% select(outlier, dbscan, lof, svm))

"the standard deviation is zero"

Unnamed: 0,outlier,dbscan,lof,svm
outlier,1.0,,0.039650434,-0.005838539
dbscan,,1.0,,
lof,0.039650434,,1.0,0.002677976
svm,-0.005838539,,0.002677976,1.0


In [65]:
model_ensemble <- glm(outlier ~., family=binomial(link='logit'), data = x[[1]][[2]])

"glm.fit: fitted probabilities numerically 0 or 1 occurred"

### Next tasks:
- Apply one model from each category of Anomaly Detection algorithms
- Tuning of the algorithms (insure that we have meaningful features)