In [2]:
library('tidyverse')
library('naivebayes')

In [3]:
dataset <- read.csv("mushroomdata/agaricus-lepiota.csv", header=FALSE, as.is=TRUE,
                               strip.white=TRUE, colClasses="character")

In [4]:
colnames(dataset) <- c("class", "cap_shape", "cap_surface", 
                        "cap_color", "bruises", "odor", 
                        "gill_attachement", "gill_spacing", "gill_size", 
                        "gill_color", "stalk_shape", "stalk_root", 
                        "stalk_surface_above_ring", "stalk_surface_below_ring", "stalk_color_above_ring", 
                        "stalk_color_below_ring", "veil_type", "veil_color", 
                        "ring_number", "ring_type", "spore_print_color", 
                        "population", "habitat")
dataset <- dataset %>% map_df(function(.x) as.factor(.x))
levels(dataset$class) <- c("edible", "poisonous")
levels(dataset$cap_shape) <- c("bell", "conical", "flat", "knobbed", "sunken", "convex")
levels(dataset$cap_color) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                "green", "purple", "white", "yellow")
levels(dataset$cap_surface) <- c("fibrous", "grooves", "scaly", "smooth")
levels(dataset$bruises) <- c("no", "yes")
levels(dataset$odor) <- c("almond", "creosote", "foul", "anise", "musty", "none", "pungent", "spicy", "fishy")
levels(dataset$gill_attachement) <- c("attached", "free")
levels(dataset$gill_spacing) <- c("close", "crowded")
levels(dataset$gill_size) <- c("broad", "narrow")
levels(dataset$gill_color) <- c("buff", "red", "gray", "chocolate", "black", "brown", "orange", 
                                 "pink", "green", "purple", "white", "yellow")
levels(dataset$stalk_shape) <- c("enlarging", "tapering")
levels(dataset$stalk_root) <- c("missing", "bulbous", "club", "equal", "rooted")
levels(dataset$stalk_surface_above_ring) <- c("fibrous", "silky", "smooth", "scaly")
levels(dataset$stalk_surface_below_ring) <- c("fibrous", "silky", "smooth", "scaly")
levels(dataset$stalk_color_above_ring) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                "green", "purple", "white", "yellow")
levels(dataset$stalk_color_below_ring) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                "green", "purple", "white", "yellow")
levels(dataset$veil_type) <- "partial"
levels(dataset$veil_color) <- c("brown", "orange", "white", "yellow")
levels(dataset$ring_number) <- c("none", "one", "two")
levels(dataset$ring_type) <- c("evanescent", "flaring", "large", "none", "pendant")
levels(dataset$spore_print_color) <- c("buff", "chocolate", "black", "brown", "orange", 
                                        "green", "purple", "white", "yellow")
levels(dataset$population) <- c("abundant", "clustered", "numerous", "scattered", "several", "solitary")
levels(dataset$habitat) <- c("wood", "grasses", "leaves", "meadows", "paths", "urban", "waste")

dataset <- dataset %>% select(- veil_type)

In [5]:
set.seed(1810)
datasetsample <- caret::createDataPartition(y = dataset$class, times = 1, p = 0.1, list = FALSE)
train_dataset <- dataset[datasetsample, ]
test_dataset <- dataset[-datasetsample, ]

Data ready, ensemble model:

In [29]:
#function returning a list of trained models:
ensemble <- function(model, dataset, mcount=10, pattr=0.8, ...) {
  results <- list()
  for (i in 1:mcount) {
    attr_count <- max(1, floor(pattr * (ncol(dataset)-1))) # Of all columns, attr_count will be sampled.

    bootstrapped <- sample_n(dataset, nrow(dataset), replace=T)
      
    attr_randomized <- sample(bootstrapped[,-1], attr_count)
      
    attr_randomized$class <- bootstrapped$class
      
    new_model <- model(class ~ ., data=attr_randomized, ...)
    results[[i]] <- new_model
  }
  return(results)
}

model_ens <- ensemble(model=naive_bayes, dataset=train_dataset, mcount=10, pattr=0.5, usekernel=T, laplace=1)

In [30]:
# a function predicting the class probabilities for a given ensemble model and data
predict_fun <- function (ensemble_model, dataset) {
    predictions <- list()
    for (i in 1:length(ensemble_model)) {
        predictions[[i]] <- predict(ensemble_model[[i]], dataset, type="prob")
    }
    print(predictions)
    result <- mean(predictions) #Doesn't work -- need to average multiple identical, numerical dataframes
}

predict_fun(model_ens, test_dataset[1:3,])

"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."
"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."
"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."
"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."
"predict.naive_bayes(): more features in the newdata are provided as there are probability tables in the object. Calculation is performed based on features to be found in the tables."
"predict.naive_bayes(): more features in the newdata are provided as there are p

[[1]]
        edible    poisonous
[1,] 0.9866362 1.336377e-02
[2,] 0.9999491 5.094853e-05
[3,] 0.9999675 3.246254e-05

[[2]]
        edible    poisonous
[1,] 0.5749479 0.4250521253
[2,] 0.9938552 0.0061448451
[3,] 0.9998579 0.0001420794

[[3]]
        edible    poisonous
[1,] 0.9862269 0.0137731110
[2,] 0.9914749 0.0085251208
[3,] 0.9996046 0.0003954224

[[4]]
        edible  poisonous
[1,] 0.9165732 0.08342684
[2,] 0.8644683 0.13553168
[3,] 0.9899799 0.01002008

[[5]]
        edible   poisonous
[1,] 0.2687606 0.731239420
[2,] 0.9940919 0.005908115
[3,] 0.9983726 0.001627392

[[6]]
        edible    poisonous
[1,] 0.9960595 0.0039405066
[2,] 0.9990063 0.0009936661
[3,] 0.9995695 0.0004304597

[[7]]
        edible    poisonous
[1,] 0.0653799 9.346201e-01
[2,] 0.9999633 3.669390e-05
[3,] 0.9999928 7.224220e-06

[[8]]
        edible    poisonous
[1,] 0.6943062 3.056938e-01
[2,] 0.9994053 5.947006e-04
[3,] 0.9999078 9.219506e-05

[[9]]
         edible    poisonous
[1,] 0.00318249 9.968175e

"argument nie jest wartością liczbową ani logiczną: zwracanie wartości NA"
