require weight.fun to be vectorized (#1206)
Accept a data.frame for the x argument to weight.fun in
generatePartialDependenceData and generateFunctionalANOVAData, which
allows the use of faster vectorized weighting functions. For
non-vectorized operations, users can write a loop or use apply. Also now
checking that weights are non-negative.
zmjones authored and larskotthoff committed Aug 29, 2016
1 parent ca168d0 commit 94e6859
Showing 2 changed files with 29 additions and 22 deletions.
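
For context, a minimal sketch of the new weight.fun contract, modeled on the updated test further down (the toy task and step-function weights are illustrative, not part of the commit): weight.fun now receives the grid-expanded data as a data.frame x, one row per prediction, together with the training data, and must return one non-negative weight per row of x. A scalar (non-vectorized) weight function can still be reused by wrapping it with apply.

library(mlr)

# Toy regression task: y = 2 * x, with x restricted to [0, 0.5]
set.seed(1)
x = runif(50L)
y = 2 * x
dat = data.frame(x = x, y = y)[x <= 0.5, ]
test.task = makeRegrTask(data = dat, target = "y")
fit = train("regr.rpart", test.task)

# Vectorized weight.fun: one non-negative weight per row of x.
# Grid points whose weights sum to zero yield NA partial dependence values.
pd = generatePartialDependenceData(fit, test.task,
  weight.fun = function(x, data) as.numeric(x[, "x"] <= 0.5),
  fmin = list("x" = 0), fmax = list("x" = 1), gridsize = 10L)

# A non-vectorized (scalar) weight can still be used via apply:
scalar.w = function(row, data) ifelse(row["x"] > 0.5, 0, 1)
pd2 = generatePartialDependenceData(fit, test.task,
  weight.fun = function(x, data) apply(x, 1, scalar.w, data = data),
  fmin = list("x" = 0), fmax = list("x" = 1), gridsize = 10L)
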
45 changes: 26 additions & 19 deletions R/generatePartialDependence.R
@@ -61,9 +61,10 @@
 #' The default is the mean, unless \code{obj} is classification with \code{predict.type = "response"}
 #' in which case the default is the proportion of observations predicted to be in each class.
 #' @param weight.fun [\code{function}]\cr
-#' A function which takes a numeric vector \code{x} and a \code{data.frame} \code{data} and returns a
-#' numeric weight. The length of \code{x} must match the column dimension of \code{data}.
-#' By default \code{weight.fun} returns a weight of 1 for every input.
+#' A function which takes a \code{data.frame} \code{x} and a \code{data.frame} \code{data}
+#' (both of which are coerced to numeric matrices) and returns a non-negative
+#' numeric weight. The number of columns in \code{x} must match the column dimension of \code{data}.
+#' By default \code{weight.fun} returns a weight of 1.
 #' @param bounds [\code{numeric(2)}]\cr
 #' The value (lower, upper) the estimated standard error is multiplied by to estimate the bound on a
 #' confidence region for a partial dependence. Ignored if \code{predict.type != "se"} for the learner.
@@ -136,7 +137,8 @@
 #' @export
 generatePartialDependenceData = function(obj, input, features,
 interaction = FALSE, derivative = FALSE, individual = FALSE, center = NULL,
-fun = mean, weight.fun = function(x, data) 1, bounds = c(qnorm(.025), qnorm(.975)),
+fun = mean, weight.fun = function(x, data) rep(1, nrow(x)),
+bounds = c(qnorm(.025), qnorm(.975)),
 resample = "none", fmin, fmax, gridsize = 10L, ...) {
 
 assertClass(obj, "WrappedModel")
@@ -188,10 +190,10 @@ generatePartialDependenceData = function(obj, input, features,
 stop("function argument must return a numeric vector of length 1 or 3.")
 
 assertFunction(weight.fun)
-x = c(0, 0)
+x = matrix(0, 2, 2)
 test = as.matrix(replicate(2, rnorm(5)))
 weights = weight.fun(x, test)
-if (!is.numeric(weights) && length(weights) == 1L)
+if (!is.numeric(weights) && length(weights) == 2L)
 stop("Invalid weight.fun.")
 
 assertNumeric(bounds, len = 2L)
@@ -334,9 +336,10 @@ generatePartialDependenceData = function(obj, input, features,
 #' a measure of location, and an upper bound. Note if three numbers are returned they must be
 #' in this order. The default is the mean.
 #' @param weight.fun [\code{function}]\cr
-#' A function which takes a numeric vector \code{x} and a \code{data.frame} \code{data} and returns a
-#' numeric weight. The length of \code{x} must match the column dimension of \code{data}.
-#' By default \code{weight.fun} returns a weight of 1 for every input.
+#' A function which takes a \code{data.frame} \code{x} and a \code{data.frame} \code{data}
+#' (both of which are coerced to numeric matrices) and returns a non-negative
+#' numeric weight. The number of columns in \code{x} must match the column dimension of \code{data}.
+#' By default \code{weight.fun} returns a weight of 1.
 #' @param bounds [\code{numeric(2)}]\cr
 #' The value (lower, upper) the estimated standard error is multiplied by to estimate the bound on a
 #' confidence region for a partial dependence. Ignored if \code{predict.type != "se"} for the learner.
@@ -388,7 +391,8 @@ generatePartialDependenceData = function(obj, input, features,
 #' plotPartialDependence(fa)
 #' @export
 generateFunctionalANOVAData = function(obj, input, features, depth = 1L, fun = mean,
-weight.fun = function(x, data) 1, bounds = c(qnorm(.025), qnorm(.975)),
+weight.fun = function(x, data) rep(1, nrow(x)),
+bounds = c(qnorm(.025), qnorm(.975)),
 resample = "none", fmin, fmax, gridsize = 10L, ...) {
 
 assertClass(obj, "WrappedModel")
@@ -426,10 +430,10 @@ generateFunctionalANOVAData = function(obj, input, features, depth = 1L, fun = m
 assertIntegerish(depth, lower = 1L, upper = length(features), len = 1L)
 
 assertFunction(weight.fun)
-x = c(0, 0)
+x = matrix(0, 2, 2)
 test = as.matrix(replicate(2, rnorm(5)))
 weights = weight.fun(x, test)
-if (!is.numeric(weights) && length(weights) == 1L)
+if (!is.numeric(weights) && length(weights) == 2L)
 stop("Invalid weight.fun.")
 
 assertFunction(fun)
@@ -514,8 +518,9 @@ doPartialDerivativeIteration = function(x, obj, data, features, fun, td, individ
 f = function(x, obj, data, features, fun, td, ...) {
 newdata = data
 newdata[features] = x
-weights = apply(newdata[, obj$features, drop = FALSE], 1, function(x)
-do.call("weight.fun", c(list("x" = x, "data" = data[, obj$features, drop = FALSE]), list(...))))
+weights = do.call("weight.fun", list("x" = newdata[, obj$features, drop = FALSE],
+"data" = data[, obj$features, drop = FALSE], ...))
+assertNumeric(weights, lower = 0, finite = TRUE, all.missing = FALSE, len = nrow(newdata))
 if (sum(weights) == 0) weights = rep(NA, length(weights))
 pred = do.call("predict", c(list("object" = obj, "newdata" = newdata), list(...)))
 if (obj$learner$predict.type == "response")
@@ -533,8 +538,9 @@ doPartialDerivativeIteration = function(x, obj, data, features, fun, td, individ
 f = function(x, obj, data, features, fun, td, ...) {
 newdata = data
 newdata[features] = x
-weights = apply(newdata[, obj$features, drop = FALSE], 1, function(x)
-do.call("weight.fun", c(list("x" = x, "data" = data[, obj$features, drop = FALSE]), list(...))))
+weights = do.call("weight.fun", list("x" = newdata[, obj$features, drop = FALSE],
+"data" = data[, obj$features, drop = FALSE], ...))
+assertNumeric(weights, lower = 0, finite = TRUE, all.missing = FALSE, len = nrow(newdata))
 if (sum(weights) == 0) weights = rep(NA, length(weights))
 pred = do.call("predict", c(list("object" = obj, "newdata" = newdata), list(...)))
 if (obj$learner$predict.type == "response")
@@ -557,13 +563,14 @@ doPartialDependenceIteration = function(obj, data, rng, features, fun, td, i, bo
 
 newdata = data
 newdata[features] = rng[i, ]
-weights = apply(newdata[, obj$features, drop = FALSE], 1, function(x)
-do.call("weight.fun", c(list("x" = x, "data" = data[, obj$features, drop = FALSE]), list(...))))
+weights = do.call("weight.fun", list("x" = newdata[, obj$features, drop = FALSE],
+"data" = data[, obj$features, drop = FALSE], ...))
+assertNumeric(weights, lower = 0, finite = TRUE, all.missing = FALSE, len = nrow(newdata))
 if (sum(weights) == 0) weights = rep(NA, length(weights))
 pred = do.call("predict", c(list("object" = obj, "newdata" = newdata), list(...)))
 if (obj$learner$predict.type == "response") {
 if (td$type == "classif") {
-if (identical(functionBody(weight.fun), functionBody(function(x, data) 1)))
+if (identical(functionBody(weight.fun), functionBody(function(x, data) rep(1, nrow(x)))))
 fun(getPredictionResponse(pred))
 else
 stop('Classification with predict.type = "response" is incompatible with weights.')
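Both iteration helpers change in the same way: instead of calling weight.fun once per row through apply, they now make a single call on the whole data.frame and validate the result with assertNumeric. A standalone sketch of the difference (plain R outside mlr; the Gaussian-style weights and names are illustrative):

library(checkmate)

set.seed(1)
newdata = data.frame(a = rnorm(1e4), b = rnorm(1e4))

# Scalar weight: handles one row at a time (old contract).
scalar.weight = function(x, data) exp(-sum(x^2))
# Vectorized weight: handles the whole data.frame at once (new contract).
vector.weight = function(x, data) exp(-rowSums(as.matrix(x)^2))

w.loop = apply(newdata, 1, scalar.weight, data = newdata)  # old style, row-wise
w.vec = vector.weight(newdata, newdata)                    # new style, one call
all.equal(as.numeric(w.loop), as.numeric(w.vec))           # TRUE, but the vectorized call is much faster

# The updated helpers also validate the returned weights, roughly like this:
assertNumeric(w.vec, lower = 0, finite = TRUE, all.missing = FALSE, len = nrow(newdata))
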
6 changes: 3 additions & 3 deletions tests/testthat/test_base_generatePartialDependence.R
@@ -224,14 +224,14 @@ test_that("generatePartialDependenceData", {
 
 # with the joint distribution as the weight function generatePartialDependenceData
 # should return NA for regions with zero probability
-x = runif(10)
+x = runif(50L)
 y = 2 * x
-idx = x > .5
+idx = which(x > .5)
 x[idx] = NA
 test.task = makeRegrTask(data = data.frame(x = x[-idx], y = y[-idx]), target = "y")
 fit = train("regr.rpart", test.task)
 pd = generatePartialDependenceData(fit, test.task,
-weight.fun = function(x, data) ifelse(x > .5, 0, 1),
+weight.fun = function(x, data) apply(x, 1, function(z) ifelse(z > .5, 0, 1)),
 fmin = list("x" = 0), fmax = list("x" = 1), gridsize = gridsize)
 expect_that(all(is.na(pd$data[pd$data$x > .5, "y"])), is_true())
 
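As the commit message notes, the same vectorized contract applies to generateFunctionalANOVAData. A minimal sketch with an illustrative task and exponential down-weighting of large feature values (not taken from the commit):

library(mlr)

set.seed(1)
dat = data.frame(x1 = runif(100L), x2 = runif(100L))
dat$y = 2 * dat$x1 + dat$x2
task = makeRegrTask(data = dat, target = "y")
fit = train("regr.rpart", task)

# Depth-1 functional ANOVA components, weighted by a vectorized weight.fun
# that returns one strictly positive weight per row of x.
fa = generateFunctionalANOVAData(fit, task, features = c("x1", "x2"), depth = 1L,
  weight.fun = function(x, data) exp(-rowSums(as.matrix(x))))
plotPartialDependence(fa)
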
