Ensembles parallel #1116

Merged: 26 commits, Mar 10, 2017
Commits

4b1c274  parallel ensemble training (zmjones, Aug 11, 2016)
40192d9  add description to variables (jakob-r, Aug 11, 2016)
48c45ff  remove unnecessary variables (jakob-r, Aug 11, 2016)
b2cf9df  cleanup passed variables for MulticlassWrapper (jakob-r, Aug 11, 2016)
583e877  cleanup passed variables for MultilabelBinaryRelevanceWrapper (jakob-r, Aug 11, 2016)
48db77c  add missing bracket (jakob-r, Aug 11, 2016)
10f3066  this solved the problem; do not really now why it appeared now, but i… (PhilippPro, Aug 12, 2016)
a98a789  wrongly named variables (jakob-r, Sep 6, 2016)
ce11575  Merge branch 'master' into ensembles_parallel (jakob-r, Sep 6, 2016)
c5c9a9a  remove unused variables (jakob-r, Sep 6, 2016)
dc644e5  models have to be named for multilabelBinaryRelevanceWrapper (jakob-r, Sep 6, 2016)
7c7f7c1  Merge branch 'master' into ensembles_parallel (jakob-r, Sep 28, 2016)
05c8835  parallelize predictions (jakob-r, Sep 28, 2016)
fa78fad  revert parallel predictions (jakob-r, Oct 25, 2016)
3a65989  Merge branch 'master' into ensembles_parallel (jakob-r, Oct 25, 2016)
4a1742d  add parallelization doc (jakob-r, Feb 15, 2017)
845fb70  fix indentations (jakob-r, Feb 15, 2017)
600af90  Merge branch 'master' into ensembles_parallel (jakob-r, Feb 15, 2017)
1a23e8f  skip on viper api down (jakob-r, Feb 15, 2017)
9e87da2  Merge branch 'master' into ensembles_parallel (jakob-r, Feb 23, 2017)
7eba442  fix brackets (jakob-r, Feb 23, 2017)
205ea51  typo (jakob-r, Feb 23, 2017)
4b48c14  more typos (jakob-r, Feb 23, 2017)
012c6a2  add rd file [ci-skip] (jakob-r, Feb 23, 2017)
435d528  Update CostSensRegrWrapper.R (larskotthoff, Mar 8, 2017)
78e03bd  Update parallelization.R (larskotthoff, Mar 8, 2017)
34 changes: 17 additions & 17 deletions R/BaggingWrapper.R
@@ -90,24 +90,24 @@ trainLearner.BaggingWrapper = function(.learner, .task, .subset, .weights = NULL
   bw.size = if (bw.replace) 1 else 0.632
   .task = subsetTask(.task, subset = .subset)
   n = getTaskSize(.task)
+  # number of observations to sample
   m = round(n * bw.size)
-  allinds = seq_len(n)
-  if (bw.feats < 1) {
-    feats = getTaskFeatureNames(.task)
-    k = max(round(bw.feats * length(feats)), 1)
-  }
-  models = lapply(seq_len(bw.iters), function(i) {
-    bag = sample(allinds, m, replace = bw.replace)
-    w = .weights[bag]
-    if (bw.feats < 1) {
-      feats2 = sample(feats, k, replace = FALSE)
-      .task2 = subsetTask(.task, features = feats2)
-      train(.learner$next.learner, .task2, subset = bag, weights = w)
-    } else {
-      train(.learner$next.learner, .task, subset = bag, weights = w)
-    }
-  })
-  m = makeHomChainModel(.learner, models)
+  # number of features to sample
+  k = max(round(bw.feats * getTaskNFeats(.task)), 1)
+
+  args = list(n = n, m = m, k = k, bw.replace = bw.replace,
+    task = .task, learner = .learner, weights = .weights)
+  parallelLibrary("mlr", master = FALSE, level = "mlr.ensemble", show.info = FALSE)
+  exportMlrOptions(level = "mlr.ensemble")
+  models = parallelMap(doBaggingTrainIteration, i = seq_len(bw.iters), more.args = args, level = "mlr.ensemble")
+  makeHomChainModel(.learner, models)
 }
+
+doBaggingTrainIteration = function(i, n, m, k, bw.replace, task, learner, weights) {
+  setSlaveOptions()
+  bag = sample(seq_len(n), m, replace = bw.replace)
+  task = subsetTask(task, features = sample(getTaskFeatureNames(task), k, replace = FALSE))
+  train(learner$next.learner, task, subset = bag, weights = weights[bag])
+}
 
 #' @export
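
Every wrapper touched by this PR follows the same refactoring pattern seen above: the body of the old lapply loop becomes a top-level function so parallelMap can run it on the slaves, loop-invariant objects are passed once per job via more.args, and setSlaveOptions() (paired with exportMlrOptions() on the master) re-applies mlr's options on each worker. A minimal standalone sketch of that pattern, using a hypothetical doOneIteration worker and plain parallelMap rather than mlr internals:

library(parallelMap)

# Hypothetical per-iteration worker: defined at top level so parallelMap
# can serialize it and call it on the slave processes.
doOneIteration = function(i, data, mult) {
  mean(data) * mult + i  # stand-in for "train one ensemble member"
}

parallelStartSocket(2L, show.info = FALSE)
# 'i' varies per job; everything in 'more.args' is fixed across jobs
res = parallelMap(doOneIteration, i = 1:4,
  more.args = list(data = rnorm(100), mult = 2))
parallelStop()
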
23 changes: 11 additions & 12 deletions R/CostSensRegrWrapper.R
@@ -26,21 +26,20 @@ makeCostSensRegrWrapper = function(learner) {
 trainLearner.CostSensRegrWrapper = function(.learner, .task, .subset, ...) {
   # note that no hyperpars can be in ..., they would refer to the wrapper
   .task = subsetTask(.task, subset = .subset)
-  costs = getTaskCosts(.task)
-  td = getTaskDescription(.task)
-  classes = td$class.levels
-  models = vector("list", length = length(classes))
-  for (i in seq_along(classes)) {
-    cl = classes[i]
-    y = costs[, cl]
-    data = cbind(getTaskData(.task), ..y.. = y)
-    task = makeRegrTask(id = cl, data = data, target = "..y..",
-      check.data = FALSE, fixup.data = "quiet")
-    models[[i]] = train(.learner$next.learner, task)
-  }
+  d = getTaskData(.task)
+  parallelLibrary("mlr", master = FALSE, level = "mlr.ensemble", show.info = FALSE)
+  exportMlrOptions(level = "mlr.ensemble")
+  models = parallelMap(doCostSensRegrTrainIteration, cl = getTaskDescription(.task)$class.levels, more.args = list("d" = d, "costs" = getTaskCosts(.task), "learner" = .learner), level = "mlr.ensemble")
   makeHomChainModel(.learner, models)
 }
+
+doCostSensRegrTrainIteration = function(learner, cl, costs, d) {
+  setSlaveOptions()
+  data = cbind(d, ..y.. = costs[, cl])
+  task = makeRegrTask(id = cl, data = data, target = "..y..", check.data = FALSE, fixup.data = "quiet")
+  train(learner$next.learner, task)
+}
 
 #' @export
 predictLearner.CostSensRegrWrapper = function(.learner, .model, .newdata, ...) {
   p = predictHomogeneousEnsemble(.learner, .model, .newdata, ...)
1 change: 0 additions & 1 deletion R/HomogeneousEnsemble.R
@@ -58,7 +58,6 @@ predictHomogeneousEnsemble = function(.learner, .model, .newdata, ...) {
   do.call(cbind, preds)
 }
 
-
 # call this at end of trainLearner.CostSensRegrWrapper
 # FIXME: potentially remove this when ChainModel is removed
 makeHomChainModel = function(learner, models) {
27 changes: 16 additions & 11 deletions R/MulticlassWrapper.R
@@ -50,25 +50,30 @@ makeMulticlassWrapper = function(learner, mcw.method = "onevsrest") {
 #' @export
 trainLearner.MulticlassWrapper = function(.learner, .task, .subset, .weights = NULL, mcw.method, ...) {
   .task = subsetTask(.task, .subset)
-  tn = getTaskTargetNames(.task)
-  d = getTaskData(.task)
   y = getTaskTargets(.task)
   cm = buildCMatrix(mcw.method, .task)
   x = multi.to.binary(y, cm)
-  # now fit models
-  models = lapply(seq_along(x$row.inds), function(i) {
-    data2 = d[x$row.inds[[i]], , drop = FALSE]
-    data2[, tn] = x$targets[[i]]
-    ct = changeData(.task, data2)
-    ct$task.desc$positive = "1"
-    ct$task.desc$negative = "-1"
-    train(.learner$next.learner, ct, weights = .weights)
-  })
+  args = list(x = x, learner = .learner, task = .task, weights = .weights)
+  parallelLibrary("mlr", master = FALSE, level = "mlr.ensemble", show.info = FALSE)
+  exportMlrOptions(level = "mlr.ensemble")
+  models = parallelMap(i = seq_along(x$row.inds), doMulticlassTrainIteration,
+    more.args = args, level = "mlr.ensemble")
   m = makeHomChainModel(.learner, models)
   m$cm = cm
   return(m)
 }
 
+doMulticlassTrainIteration = function(x, i, learner, task, weights) {
+  setSlaveOptions()
+  d = getTaskData(task)
+  tn = getTaskTargetNames(task)
+  data2 = d[x$row.inds[[i]], , drop = FALSE]
+  data2[, tn] = x$targets[[i]]
+  ct = changeData(task, data2)
+  ct$task.desc$positive = "1"
+  ct$task.desc$negative = "-1"
+  train(learner$next.learner, ct, weights = weights)
+}
+
 #' @export
 predictLearner.MulticlassWrapper = function(.learner, .model, .newdata, ...) {
24 changes: 16 additions & 8 deletions R/MultilabelBinaryRelevanceWrapper.R
@@ -39,16 +39,24 @@ makeMultilabelBinaryRelevanceWrapper = function(learner) {
 trainLearner.MultilabelBinaryRelevanceWrapper = function(.learner, .task, .subset, .weights = NULL, ...) {
   targets = getTaskTargetNames(.task)
   .task = subsetTask(.task, subset = .subset)
-  data = getTaskData(.task)
-  models = namedList(targets)
-  for (tn in targets) {
-    data2 = dropNamed(data, setdiff(targets, tn))
-    ctask = makeClassifTask(id = tn, data = data2, target = tn)
-    models[[tn]] = train(.learner$next.learner, ctask, weights = .weights)
-  }
+  parallelLibrary("mlr", master = FALSE, level = "mlr.ensemble", show.info = FALSE)
+  exportMlrOptions(level = "mlr.ensemble")
+  models = parallelMap(
+    doMultilabelBinaryRelevanceTrainIteration, tn = targets,
+    more.args = list(weights = .weights, learner = .learner$next.learner, task = .task),
+    level = "mlr.ensemble")
+  names(models) = targets
   makeHomChainModel(.learner, models)
 }
+
+doMultilabelBinaryRelevanceTrainIteration = function(tn, learner, task, weights) {
+  setSlaveOptions()
+  data = getTaskData(task)
+  task = makeClassifTask(id = tn, data = dropNamed(data, setdiff(getTaskTargetNames(task), tn)), target = tn)
+  train(learner, task, weights = weights)
+}
+
 
 #' @export
 predictLearner.MultilabelBinaryRelevanceWrapper = function(.learner, .model, .newdata, .subset = NULL, ...) {
   models = getLearnerModel(.model, more.unwrap = FALSE)
@@ -57,4 +57,4 @@ predictLearner.MultilabelBinaryRelevanceWrapper = function(.learner, .model, .newdata, .subset = NULL, ...) {
   else
     function(m) getPredictionProbabilities(predict(m, newdata = .newdata, subset = .subset, ...), cl = "TRUE")
   asMatrixCols(lapply(models, f))
-}
\ No newline at end of file
+}
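
Aside, on the names(models) = targets line above (cf. commit dc644e5, "models have to be named"): the old for loop filled a namedList(targets), whereas parallelMap returns an unnamed list unless its use.names argument is set, so the per-target names must be reattached before makeHomChainModel. A tiny sketch of that behavior, assuming parallelMap's default of use.names = FALSE:

library(parallelMap)
parallelStartSocket(2L, show.info = FALSE)
res = parallelMap(function(tn) nchar(tn), tn = c(a = "x1", b = "x22"))
names(res)  # NULL: the names of 'tn' are not carried over by default
parallelStop()
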
15 changes: 11 additions & 4 deletions R/OverBaggingWrapper.R
@@ -83,13 +83,20 @@ trainLearner.OverBaggingWrapper = function(.learner, .task, .subset, .weights =
     z = getMinMaxClass(y)
     obw.cl = z$min.name
   }
-  models = lapply(seq_len(obw.iters), function(i) {
-    bag = sampleBinaryClass(y, rate = obw.rate, cl = obw.cl, resample.other.class = (obw.maxcl == "boot"))
-    train(.learner$next.learner, .task, subset = bag, weights = .weights)
-  })
+  args = list("y" = y, "obw.rate" = obw.rate, "obw.maxcl" = obw.maxcl, "obw.cl" = obw.cl, "learner" = .learner, "task" = .task, "weights" = .weights)
+  parallelLibrary("mlr", master = FALSE, level = "mlr.ensemble", show.info = FALSE)
+  exportMlrOptions(level = "mlr.ensemble")
+  models = parallelMap(doOverBaggingTrainIteration, i = seq_len(obw.iters), more.args = args)
   makeHomChainModel(.learner, models)
 }
 
+doOverBaggingTrainIteration = function(i, y, obw.rate, obw.cl, obw.maxcl, learner, task, weights) {
+  setSlaveOptions()
+  bag = sampleBinaryClass(y, rate = obw.rate, cl = obw.cl, resample.other.class = (obw.maxcl == "boot"))
+  train(learner$next.learner, task, subset = bag, weights = weights)
+}
+
 #' @export
 getLearnerProperties.OverBaggingWrapper = function(learner) {
   union(getLearnerProperties(learner$next.learner), "prob")
26 changes: 26 additions & 0 deletions R/parallelization.R
@@ -0,0 +1,26 @@
+#' @title Supported parallelization methods
+#'
+#' @description
+#' mlr supports different methods to activate parallel computing capabilities through the integration of the \code{\link[parallelMap]{parallelMap}} package, which supports all major parallelization backends for R.
+#' You can start parallelization with \code{\link[parallelMap]{parallelStart}*}, where \code{*} should be replaced with the chosen backend.
+#' \code{\link[parallelMap]{parallelStop}} is used to stop all parallelization backends.
+#'
+#' Parallelization is divided into different levels and will automatically be carried out for the first level that occurs, e.g. if you call \code{resample()} after \code{\link[parallelMap]{parallelStart}}, each resampling iteration is a parallel job and possible underlying calls like parameter tuning won't be parallelized further.
+#'
+#' The supported levels of parallelization are:
+#' \describe{
+#'   \item{\code{"mlr.resample"}}{Each resampling iteration (a train/test step) is a parallel job.}
+#'   \item{\code{"mlr.benchmark"}}{Each experiment "run this learner on this data set" is a parallel job.}
+#'   \item{\code{"mlr.tuneParams"}}{Each evaluation in hyperparameter space "resample with these parameter settings" is a parallel job.
+#'     How many of these can be run independently in parallel depends on the tuning algorithm.
+#'     For grid search or random search there is no limit, but for other tuners it depends on how many points to evaluate are produced in each iteration of the optimization.
+#'     If a tuner works in a purely sequential fashion, we cannot work magic and the hyperparameter evaluation will also run sequentially. But note that you can still parallelize the underlying resampling.}
+#'   \item{\code{"mlr.selectFeatures"}}{Each evaluation in feature space "resample with this feature subset" is a parallel job. The same comments as for \code{"mlr.tuneParams"} apply here.}
+#'   \item{\code{"mlr.ensemble"}}{For all ensemble methods, the training and prediction of each individual learner is a parallel job.
+#'     Supported ensemble methods are \code{\link{makeBaggingWrapper}}, \code{\link{makeCostSensRegrWrapper}}, \code{\link{makeMulticlassWrapper}}, \code{\link{makeMultilabelBinaryRelevanceWrapper}} and \code{\link{makeOverBaggingWrapper}}.}
+#' }
+#'
+#'
+#' @name parallelization
+#' @rdname parallelization
+NULL
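
A usage sketch for the new "mlr.ensemble" level documented above (assumptions: a local socket backend with 2 CPUs, and the iris.task example task that ships with mlr):

library(mlr)
library(parallelMap)
# Parallelize only the ensemble level; resampling, tuning etc. stay sequential.
parallelStartSocket(2L, level = "mlr.ensemble", show.info = FALSE)
lrn = makeBaggingWrapper(makeLearner("classif.rpart"), bw.iters = 10L)
mod = train(lrn, iris.task)  # the 10 bagging iterations run as parallel jobs
parallelStop()
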
2 changes: 1 addition & 1 deletion R/zzz.R
@@ -14,7 +14,7 @@
 
 .onAttach = function(libname, pkgname) {
   configureMlr()
-  parallelRegisterLevels(package = "mlr", levels = c("benchmark", "resample", "selectFeatures", "tuneParams"))
+  parallelRegisterLevels(package = "mlr", levels = c("benchmark", "resample", "selectFeatures", "tuneParams", "ensemble"))
 }
 
 mlr = new.env(parent = emptyenv())
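
A quick way to confirm the new level is registered after this change (a sketch; the exact print format of parallelGetRegisteredLevels() may differ between parallelMap versions):

library(mlr)  # .onAttach registers the mlr levels with parallelMap
parallelMap::parallelGetRegisteredLevels()
# Expected to include: mlr.benchmark, mlr.resample, mlr.selectFeatures,
# mlr.tuneParams and, with this PR, mlr.ensemble
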
25 changes: 25 additions & 0 deletions man/parallelization.Rd

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions tests/testthat/test_base_BaggingWrapper.R
@@ -20,11 +20,11 @@ test_that("BaggingWrapper", {
   lrn2 = makeBaggingWrapper(lrn1, bw.size = 0.1, bw.replace = FALSE)
   m = train(lrn2, multiclass.task)
   bms = getLearnerModel(m)
-  expect_equal(unique(sapply(bms, function(m) length(bms[[1]]$subset))), 15L)
+  expect_equal(unique(sapply(extractSubList(bms, "subset", simplify = FALSE), length)), 15L)
   lrn2 = makeBaggingWrapper(lrn1, bw.iters = 3L, bw.feats = 0.5)
   m = train(lrn2, multiclass.task)
   bms = getLearnerModel(m)
-  expect_equal(unique(sapply(bms, function(m) length(bms[[1]]$features))), 2L)
+  expect_equal(unique(sapply(extractSubList(bms, "features", simplify = FALSE), length)), 2L)
   lrn1 = makeLearner("classif.rpart")
   lrn2 = makeBaggingWrapper(lrn1, bw.iters = 3L)
   lrn2 = setPredictType(lrn2, "prob")
1 change: 1 addition & 0 deletions tests/testthat/test_base_plotViperCharts.R
@@ -2,6 +2,7 @@ context("plotViperCharts")
 
 test_that("plotViperCharts", {
   skip_on_cran()
+  skip_if_not(RCurl::url.exists("http://viper.ijs.si/api/"), message = "viper API not reachable")
 
   lrn1 = makeLearner("classif.rpart", predict.type = "prob")
   lrn2 = makeLearner("classif.lda", predict.type = "prob")
52 changes: 52 additions & 0 deletions tests/testthat/test_parallel_all.R
@@ -103,3 +103,55 @@ test_that("parallel partial dependence", {
   }
   doit("socket")
 })
+
+test_that("parallel ensembles", {
+  doit = function(mode, level) {
+    on.exit(parallelStop())
+    parallelStart(mode = mode, cpus = 2L, show.info = FALSE)
+
+    ## bagging wrapper
+    lrn = makeBaggingWrapper(makeLearner("regr.rpart"), bw.iters = 2L)
+    fit = train(lrn, regr.task)
+    models = getLearnerModel(fit, more.unwrap = TRUE)
+    expect_equal(length(models), 2L)
+    expect_equal(class(models[[1]]), "rpart")
+    p = predict(fit, regr.task)
+
+    ## multiclass wrapper
+    lrn = makeMulticlassWrapper(makeLearner("classif.rpart"))
+    fit = train(lrn, multiclass.task)
+    models = getLearnerModel(fit)
+    expect_equal(length(models), length(getTaskClassLevels(multiclass.task)))
+    levs = do.call("rbind", extractSubList(models, "factor.levels"))
+    expect_equal(unique(levs[, 1]), "-1")
+    expect_equal(unique(levs[, 2]), "1")
+    p = predict(fit, multiclass.task)
+
+    ## overbagging wrapper
+    lrn = makeOverBaggingWrapper(makeLearner("classif.rpart"), 2L)
+    fit = train(lrn, binaryclass.task)
+    models = getLearnerModel(fit)
+    expect_equal(length(models), 2L)
+    p = predict(fit, binaryclass.task) ## calls predictHomogeneousEnsemble
+
+    ## costsensregrwrapper
+    lrn = makeCostSensRegrWrapper(makeLearner("regr.rpart"))
+    fit = train(lrn, costsens.task)
+    models = getLearnerModel(fit)
+    expect_equal(length(models), ncol(getTaskCosts(costsens.task)))
+    p = predict(fit, costsens.task)
+
+    ## MultilabelBinaryRelevanceWrapper
+    lrn = makeMultilabelBinaryRelevanceWrapper("classif.rpart")
+    lrn = setPredictType(lrn, "prob")
+    fit = train(lrn, multilabel.task)
+    p = predict(fit, multilabel.task)
+  }
+
+  ## CostSensWeightedPairsWrapper
+  if (Sys.info()["sysname"] != "Windows") {
+    doit("multicore", "mlr.ensemble")
+    doit("mpi", "mlr.ensemble")
+  }
+  doit("socket", "mlr.ensemble")
+})