# Experiment 2
Miguel Sandim, 05/04/2017

In [1]:
# NOTE(review): hard-coded working directory; every relative path used below
# (e.g. "datasets/literature/...") is resolved against it.
setwd("~/master-thesis")
# Seed the RNG (presumably so the fold assignment made later in the notebook
# is reproducible -- confirm no earlier step consumes random numbers).
set.seed(20)
library(ggplot2)  # not used in the visible part of the notebook
library(dplyr)    # %>%, rename, filter, select
library(caret)    # createFolds
library(foreign)  # read.arff


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Loading required package: lattice


In [2]:
# Load the literature benchmark datasets, one ARFF file per dataset.
dataset_aloi <- read.arff("datasets/literature/ALOI/ALOI_withoutdupl.arff")
dataset_glass <- read.arff("datasets/literature/Glass/Glass_withoutdupl_norm.arff")
dataset_iono <- read.arff("datasets/literature/Ionosphere/Ionosphere_withoutdupl_norm.arff")
dataset_kdd <- read.arff("datasets/literature/KDDCup99/KDDCup99_withoutdupl_1ofn.arff")

# The Lymphography file spells its label column `Outlier`; rename it on load so
# it matches the lowercase `outlier` column the later cells read from every
# dataset.
dataset_lym <- read.arff("datasets/literature/Lymphography/Lymphography_withoutdupl_1ofn.arff") %>%
  rename(outlier = Outlier)

dataset_pen <- read.arff("datasets/literature/PenDigits/PenDigits_withoutdupl_norm_v01.arff")
dataset_shuttle <- read.arff("datasets/literature/Shuttle/Shuttle_withoutdupl_v01.arff")
dataset_waveform <- read.arff("datasets/literature/Waveform/Waveform_withoutdupl_v01.arff")
dataset_wbc <- read.arff("datasets/literature/WBC/WBC_withoutdupl_v01.arff")
dataset_wdbc <- read.arff("datasets/literature/WDBC/WDBC_withoutdupl_v01.arff")
dataset_wpbc <- read.arff("datasets/literature/WPBC/WPBC_withoutdupl_norm.arff")

In [3]:
# Collect the loaded data frames into one list, keyed by variable name, so the
# later cells can iterate over all datasets at once.
datasets <- mget(c(
  "dataset_aloi",
  "dataset_glass",
  "dataset_iono",
  "dataset_kdd",
  "dataset_lym",
  "dataset_pen",
  "dataset_shuttle",
  "dataset_waveform",
  "dataset_wbc",
  "dataset_wdbc",
  "dataset_wpbc"
))

### Percentage of outliers per dataset

In [4]:
# Relative frequency (in percent) of each `outlier` label, per dataset.
lapply(datasets, function(data) {
  label_counts <- table(data[["outlier"]])
  prop.table(label_counts) * 100
})

$dataset_aloi

       no       yes 
96.955626  3.044374 

$dataset_glass

       no       yes 
95.794393  4.205607 

$dataset_iono

      no      yes 
64.10256 35.89744 

$dataset_kdd

        no        yes 
99.5843119  0.4156881 

$dataset_lym

       no       yes 
95.945946  4.054054 

$dataset_pen

        no        yes 
99.7973247  0.2026753 

$dataset_shuttle

       no       yes 
98.716683  1.283317 

$dataset_waveform

       no       yes 
97.095556  2.904444 

$dataset_wbc

       no       yes 
95.515695  4.484305 

$dataset_wdbc

       no       yes 
97.275204  2.724796 

$dataset_wpbc

      no      yes 
76.26263 23.73737 


### Number of outliers per dataset

In [5]:
# Absolute frequency of each `outlier` label, per dataset.
lapply(datasets, function(data) {
  table(data[["outlier"]])
})

$dataset_aloi

   no   yes 
48026  1508 

$dataset_glass

 no yes 
205   9 

$dataset_iono

 no yes 
225 126 

$dataset_kdd

   no   yes 
47913   200 

$dataset_lym

 no yes 
142   6 

$dataset_pen

  no  yes 
9848   20 

$dataset_shuttle

  no  yes 
1000   13 

$dataset_waveform

  no  yes 
3343  100 

$dataset_wbc

 no yes 
213  10 

$dataset_wdbc

 no yes 
357  10 

$dataset_wpbc

 no yes 
151  47 


In [6]:
#print(object.size(datasets) , units = "auto")

In [7]:
#dataset_ann <- read.arff("datasets/semantic/Annthyroid/Annthyroid_withoutdupl_07.arff")
#dataset_arr <- read.arff("datasets/semantic/Arrhythmia/Arrhythmia_withoutdupl_10_v01.arff")
#dataset_cardio <- read.arff("datasets/semantic/Cardiotocography/Cardiotocography_withoutdupl_20_v01.arff")
#dataset_heart <- read.arff("datasets/semantic/HeartDisease/HeartDisease_withoutdupl_20_v01.arff")
#dataset_hepatitis <- read.arff("datasets/semantic/Hepatitis/Hepatitis_withoutdupl_16.arff")
#dataset_ads <- read.arff("datasets/semantic/InternetAds/InternetAds_withoutdupl_norm_19.arff")
#dataset_blocks <- read.arff("datasets/semantic/PageBlocks/PageBlocks_withoutdupl_09.arff")
#dataset_parkinson <- read.arff("datasets/semantic/Parkinson/Parkinson_withoutdupl_20_v01.arff")
#dataset_pima <- read.arff("datasets/semantic/Pima/Pima_withoutdupl_20_v01.arff")
#dataset_spam <- read.arff("datasets/semantic/SpamBase/SpamBase_withoutdupl_20_v01.arff")
#dataset_stamps <- read.arff("datasets/semantic/Stamps/Stamps_withoutdupl_09.arff")
#dataset_wilt <- read.arff("datasets/semantic/Wilt/Wilt_withoutdupl_05.arff")

In [8]:
# Split a dataset into cross-validation folds and report each fold by row id.
#
# Args:
#   data:    data frame with an `outlier` column (handed to createFolds as the
#            variable the folds are built on) and an `id` column.
#   k_value: number of folds to create (default 10).
#
# Returns:
#   The list produced by createFolds (one element per fold), with the row
#   positions of each fold replaced by the matching values of `data$id`.
createStratCV <- function(data, k_value = 10)
{
  # Fail early with a clear message instead of a downstream subsetting error.
  stopifnot(all(c("id", "outlier") %in% names(data)))

  # `[[` avoids the partial name matching that `$` performs on data frames.
  folds <- createFolds(data[["outlier"]], k = k_value, list = TRUE, returnTrain = FALSE)

  # Return the mapped list as the last expression so the result is visible;
  # ending on an assignment would return it invisibly.
  lapply(folds, function(f) data[["id"]][f])
}

In [9]:
# Build 5 folds for every dataset; the result keeps the names of `datasets`.
datasets_folds <- lapply(datasets, function(data) {
  createStratCV(data, k_value = 5)
})

### Number of outliers in the first fold of each dataset

In [10]:
# Label counts among the rows whose `id` falls in the first fold of each
# dataset (only fold 1 is inspected here, not every fold).
lapply(setNames(nm = names(datasets)), function(dname) {
  first_fold_ids <- datasets_folds[[dname]][[1]]
  datasets[[dname]] %>%
    filter(id %in% first_fold_ids) %>%
    select(outlier) %>%
    table()
})

$dataset_aloi
.
  no  yes 
9605  301 

$dataset_glass
.
 no yes 
 41   2 

$dataset_iono
.
 no yes 
 45  25 

$dataset_kdd
.
  no  yes 
9583   40 

$dataset_lym
.
 no yes 
 28   1 

$dataset_pen
.
  no  yes 
1970    4 

$dataset_shuttle
.
 no yes 
200   2 

$dataset_waveform
.
 no yes 
669  20 

$dataset_wbc
.
 no yes 
 43   2 

$dataset_wdbc
.
 no yes 
 72   2 

$dataset_wpbc
.
 no yes 
 30   9 
