In [41]:
library('tidyverse')
library('e1071')

In [42]:
dataset <- read.csv("mushroomdata/agaricus-lepiota.csv", header=FALSE, as.is=TRUE,
                               strip.white=TRUE, colClasses="character")

In [43]:
colnames(dataset) <- c("class", "cap_shape", "cap_surface", 
                        "cap_color", "bruises", "odor", 
                        "gill_attachement", "gill_spacing", "gill_size", 
                        "gill_color", "stalk_shape", "stalk_root", 
                        "stalk_surface_above_ring", "stalk_surface_below_ring", "stalk_color_above_ring", 
                        "stalk_color_below_ring", "veil_type", "veil_color", 
                        "ring_number", "ring_type", "spore_print_color", 
                        "population", "habitat")
dataset <- dataset %>% map_df(function(.x) as.factor(.x))
levels(dataset$class) <- c("edible", "poisonous")
levels(dataset$cap_shape) <- c("bell", "conical", "flat", "knobbed", "sunken", "convex")
levels(dataset$cap_color) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                "green", "purple", "white", "yellow")
levels(dataset$cap_surface) <- c("fibrous", "grooves", "scaly", "smooth")
levels(dataset$bruises) <- c("no", "yes")
levels(dataset$odor) <- c("almond", "creosote", "foul", "anise", "musty", "none", "pungent", "spicy", "fishy")
levels(dataset$gill_attachement) <- c("attached", "free")
levels(dataset$gill_spacing) <- c("close", "crowded")
levels(dataset$gill_size) <- c("broad", "narrow")
levels(dataset$gill_color) <- c("buff", "red", "gray", "chocolate", "black", "brown", "orange", 
                                 "pink", "green", "purple", "white", "yellow")
levels(dataset$stalk_shape) <- c("enlarging", "tapering")
levels(dataset$stalk_root) <- c("missing", "bulbous", "club", "equal", "rooted")
levels(dataset$stalk_surface_above_ring) <- c("fibrous", "silky", "smooth", "scaly")
levels(dataset$stalk_surface_below_ring) <- c("fibrous", "silky", "smooth", "scaly")
levels(dataset$stalk_color_above_ring) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                "green", "purple", "white", "yellow")
levels(dataset$stalk_color_below_ring) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                "green", "purple", "white", "yellow")
levels(dataset$veil_type) <- "partial"
levels(dataset$veil_color) <- c("brown", "orange", "white", "yellow")
levels(dataset$ring_number) <- c("none", "one", "two")
levels(dataset$ring_type) <- c("evanescent", "flaring", "large", "none", "pendant")
levels(dataset$spore_print_color) <- c("buff", "chocolate", "black", "brown", "orange", 
                                        "green", "purple", "white", "yellow")
levels(dataset$population) <- c("abundant", "clustered", "numerous", "scattered", "several", "solitary")
levels(dataset$habitat) <- c("wood", "grasses", "leaves", "meadows", "paths", "urban", "waste")

dataset <- dataset %>% select(- veil_type)

In [44]:
set.seed(1810)
datasetsample <- caret::createDataPartition(y = dataset$class, times = 1, p = 0.1, list = FALSE)
train_dataset <- dataset[datasetsample, ]
test_dataset <- dataset[-datasetsample, ]

Data ready, ensemble model:

In [65]:
#function returning a list of trained models:
ensemble <- function(algorithm, formula, dataset, mcount=10, pattr=0.8, ...) {
  results <- list()
  for (i in 1:mcount) {
    attr_count <- max(1, floor(pattr * (ncol(dataset)-1))) # Of all columns, attr_count will be sampled.
    
    bootstrapped <- sample_n(dataset, nrow(dataset), replace=T)
        
    attr_randomized <- sample(bootstrapped[,-1], attr_count)
              
    attr_randomized$class <- bootstrapped$class
      
    new_model <- algorithm(formula, data=attr_randomized, ...)
    results[[i]] <- new_model
  }
  return(results)
}

model_ens <- ensemble(algorithm=naiveBayes, formula=class~., dataset=train_dataset, mcount=10, pattr=0.7, usekernel=T, laplace=1)

In [67]:
# a function predicting the class probabilities for a given ensemble model and data
predict_fun <- function (ensemble_model, dataset, type, single_predict=predict) {
    modelClasses <- levels(test_dataset$class)
    print(modelClasses)
    predictions <- matrix(0, nrow=nrow(dataset), ncol=length(modelClasses))
    colnames(predictions) <- modelClasses
    
    modelCount <- length(ensemble_model)
    
    for (i in 1:length(ensemble_model)) {
        newPrediction <- single_predict(ensemble_model[[i]], dataset, type="class")
        
        for (j in 1:length(newPrediction)) {
            predictions[j, newPrediction[j]] = predictions[j, newPrediction[j]] + 1/modelCount
        }
    }

    return(predictions)
}

pred <- predict_fun(model_ens, test_dataset[1:10,])
cbind(pred, as.data.frame(test_dataset[1:10,]$class))

[1] "edible"    "poisonous"


edible,poisonous,"test_dataset[1:10, ]$class"
<dbl>,<dbl>,<fct>
0.9,0.1,poisonous
1.0,0.0,edible
1.0,0.0,edible
0.9,0.1,poisonous
1.0,0.0,edible
1.0,0.0,edible
1.0,0.0,edible
1.0,0.0,edible
1.0,0.0,edible
1.0,0.0,edible


In [72]:
#function returning a list of trained models:

new_model <- naiveBayes(class~., data=train_dataset, usekernel=T, laplace=1)

pred <- predict(new_model, test_dataset[1:10,], type="raw")
cbind(pred, as.data.frame(test_dataset[1:10,]$class))

edible,poisonous,"test_dataset[1:10, ]$class"
<dbl>,<dbl>,<fct>
0.9469639,0.05303613,poisonous
1.0,1.511736e-08,edible
1.0,1.251585e-10,edible
0.9596226,0.04037743,poisonous
0.9999999,1.251882e-07,edible
1.0,1.352209e-09,edible
1.0,2.297447e-09,edible
0.9999999,5.631447e-08,edible
0.9999998,1.90189e-07,edible
1.0,3.890968e-08,edible
