# BaggingWrapper.R
#' @title Fuse learner with the bagging technique.
#'
#' @description
#' Fuses a learner with the bagging method
#' (i.e., similar to what a `randomForest` does).
#' Creates a learner object, which can be
#' used like any other learner object.
#' Models can easily be accessed via [getLearnerModel].
#'
#' Bagging is implemented as follows:
#' For each iteration a random data subset is sampled (with or without replacement)
#' and potentially the number of features is also restricted to
#' a random subset. Note that this is usually handled in a slightly different way
#' in the random forest, where features are sampled at each tree split.
#'
#' Prediction works as follows:
#' For classification we do majority voting to create a discrete label and
#' probabilities are predicted by considering the proportions of all predicted labels.
#' For regression the mean value and the standard deviation across predictions are computed.
#'
#' Note that the passed base learner must always have `predict.type = 'response'`,
#' while the BaggingWrapper can estimate probabilities and standard errors, so it can
#' be set, e.g., to `predict.type = 'prob'`. For this reason, when you call
#' [setPredictType], the type is only set for the BaggingWrapper, not passed
#' down to the inner learner.
#'
#' @template arg_learner
#' @param bw.iters (`integer(1)`)\cr
#' Iterations = number of fitted models in bagging.
#' Default is 10.
#' @param bw.replace (`logical(1)`)\cr
#' Sample bags with replacement (bootstrapping)?
#' Default is TRUE.
#' @param bw.size (`numeric(1)`)\cr
#' Percentage size of sampled bags.
#' Default is 1 for bootstrapping and 0.632 for subsampling.
#' @param bw.feats (`numeric(1)`)\cr
#' Percentage size of randomly selected features in bags.
#' Default is 1.
#' At least one feature will always be selected.
#' @template ret_learner
#' @family wrapper
#' @export
makeBaggingWrapper = function(learner, bw.iters = 10L, bw.replace = TRUE, bw.size, bw.feats = 1) {
  # Only classification and regression learners can be bagged here.
  learner = checkLearner(learner, type = c("classif", "regr"))
  # Collect only explicitly supplied hyperparameters so defaults stay "unset"
  # and can later be resolved by trainLearner.BaggingWrapper.
  pv = list()
  if (!missing(bw.iters)) {
    bw.iters = asInt(bw.iters, lower = 1L)
    pv$bw.iters = bw.iters
  }
  if (!missing(bw.replace)) {
    assertFlag(bw.replace)
    pv$bw.replace = bw.replace
  }
  if (!missing(bw.size)) {
    assertNumber(bw.size, lower = 0, upper = 1)
    pv$bw.size = bw.size
  }
  if (!missing(bw.feats)) {
    assertNumber(bw.feats, lower = 0, upper = 1)
    pv$bw.feats = bw.feats
  }
  # The ensemble aggregates plain responses; probabilities / standard errors
  # are produced by the wrapper itself, never by the inner learner.
  if (learner$predict.type != "response") {
    stop("Predict type of the basic learner must be 'response'.")
  }
  id = stri_paste(learner$id, "bagged", sep = ".")
  packs = learner$package
  ps = makeParamSet(
    makeIntegerLearnerParam(id = "bw.iters", lower = 1L, default = 10L),
    makeLogicalLearnerParam(id = "bw.replace", default = TRUE),
    makeNumericLearnerParam(id = "bw.size", lower = 0, upper = 1),
    # FIX: default was declared as 2/3 here, contradicting both the function
    # signature (bw.feats = 1) and the documented default of 1.
    makeNumericLearnerParam(id = "bw.feats", lower = 0, upper = 1, default = 1)
  )
  makeHomogeneousEnsemble(id, learner$type, learner, packs, par.set = ps, par.vals = pv,
    learner.subclass = "BaggingWrapper", model.subclass = "BaggingModel")
}
#' @export
print.BaggingModel = function(x, ...) {
  # Reuse the standard WrappedModel printer, then splice in one extra line
  # naming the class of the bagged base learner right after the header.
  out = capture.output(print.WrappedModel(x))
  info = sprintf("Bagged Learner: %s", class(x$learner$next.learner)[1L])
  out = append(out, info, 1L)
  lapply(out, catf)
}
#' @export
trainLearner.BaggingWrapper = function(.learner, .task, .subset = NULL, .weights = NULL,
  bw.iters = 10, bw.replace = TRUE, bw.size, bw.feats = 1, ...) {

  # Unset bag size: full-size bags for bootstrapping, classical 0.632
  # fraction when subsampling without replacement.
  if (missing(bw.size)) {
    bw.size = if (bw.replace) 1 else 0.632
  }
  .task = subsetTask(.task, subset = .subset)
  n.obs = getTaskSize(.task)
  # Observations per bag.
  bag.size = round(n.obs * bw.size)
  # Features per bag; guarantee at least one feature is kept.
  n.feats = max(round(bw.feats * getTaskNFeats(.task)), 1)
  # Names here must match the formals of doBaggingTrainIteration.
  more.args = list(n = n.obs, m = bag.size, k = n.feats, bw.replace = bw.replace,
    task = .task, learner = .learner, weights = .weights)
  parallelLibrary("mlr", master = FALSE, level = "mlr.ensemble", show.info = FALSE)
  exportMlrOptions(level = "mlr.ensemble")
  models = parallelMap(doBaggingTrainIteration, i = seq_len(bw.iters),
    more.args = more.args, level = "mlr.ensemble")
  makeHomChainModel(.learner, models)
}
# Fit one ensemble member: draw a bag of m observations (with/without
# replacement) and a random subset of k features, then train the inner learner.
# Parameter names are part of the interface: they are matched by name from
# `more.args` in trainLearner.BaggingWrapper.
doBaggingTrainIteration = function(i, n, m, k, bw.replace, task, learner, weights) {
  setSlaveOptions()
  # Sampling order (observations first, then features) is kept stable so the
  # RNG stream matches previous behavior.
  obs.bag = sample(seq_len(n), m, replace = bw.replace)
  feat.bag = sample(getTaskFeatureNames(task), k, replace = FALSE)
  task = subsetTask(task, features = feat.bag)
  train(learner$next.learner, task, subset = obs.bag, weights = weights[obs.bag])
}
#' @export
predictLearner.BaggingWrapper = function(.learner, .model, .newdata, .subset = NULL, ...) {
  base.models = getLearnerModel(.model, more.unwrap = FALSE)
  is.classif = .learner$type == "classif"
  # Class labels are collected as character so they stack into a matrix;
  # regression responses pass through unchanged.
  to.col = if (is.classif) as.character else identity
  # One column per ensemble member; each model only sees the features it was
  # trained on.
  preds = asMatrixCols(lapply(base.models, function(mod) {
    nd = .newdata[, mod$features, drop = FALSE]
    to.col(predict(mod, newdata = nd, subset = .subset, ...)$data$response)
  }))
  if (.learner$predict.type == "response") {
    # Majority vote for classification, plain mean for regression.
    if (is.classif) {
      return(as.factor(apply(preds, 1L, computeMode)))
    }
    return(rowMeans(preds))
  }
  if (is.classif) {
    # Probabilities = per-row label proportions; force the full, consistently
    # ordered level set so every row yields one column per class.
    levs = .model$task.desc$class.levels
    props = apply(preds, 1L, function(row) {
      as.numeric(prop.table(table(factor(row, levels = levs))))
    })
    setColNames(t(props), levs)
  } else {
    # "se": mean prediction plus the standard deviation across members.
    cbind(rowMeans(preds), apply(preds, 1L, sd))
  }
}
# We must override setPredictType here: the predict type of the encapsulated
# (inner) learner must always stay "response", while the wrapper itself may be
# switched to "prob" / "se", which it estimates from the ensemble predictions.
# Hence we call only the default Learner method on the wrapper and deliberately
# do not recurse into next.learner.
#' @export
setPredictType.BaggingWrapper = function(learner, predict.type) {
  setPredictType.Learner(learner, predict.type)
}
#' @export
getLearnerProperties.BaggingWrapper = function(learner) {
  # The wrapper inherits the inner learner's properties and additionally gains
  # the aggregated estimate it can produce: "prob" for classification,
  # "se" for regression.
  inner.props = getLearnerProperties(learner$next.learner)
  switch(learner$type,
    "classif" = union(inner.props, "prob"),
    "regr" = union(inner.props, "se")
  )
}