Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature importance for h2o learners #2434

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion R/RLearner_classif_h2odeeplearning.R
Expand Up @@ -219,7 +219,7 @@ makeRLearner.classif.h2o.deeplearning = function() {
makeLogicalLearnerParam("reproducible", default = FALSE, tunable = FALSE),
makeLogicalLearnerParam("export_weights_and_biases", default = FALSE, tunable = FALSE)
),
properties = c("twoclass", "multiclass", "numerics", "factors", "prob", "weights", "missings"),
properties = c("twoclass", "multiclass", "numerics", "factors", "prob", "weights", "missings", "featimp"),
name = "h2o.deeplearning",
short.name = "h2o.dl",
note = 'The default value of `missing_values_handling` is `"MeanImputation"`, so missing values are automatically mean-imputed.',
Expand Down Expand Up @@ -266,3 +266,9 @@ predictLearner.classif.h2o.deeplearning = function(.learner, .model, .newdata, .
return(as.matrix(p.df))
}
}

#' @export
getFeatureImportanceLearner.classif.h2o.deeplearning = function(.learner, .model, ...) {
  # Unwrap down to the raw h2o model and delegate to the shared extractor.
  extractH2OVarImp(getLearnerModel(.model, more.unwrap = TRUE), ...)
}
9 changes: 8 additions & 1 deletion R/RLearner_classif_h2ogbm.R
Expand Up @@ -14,7 +14,7 @@ makeRLearner.classif.h2o.gbm = function() {
makeIntegerLearnerParam("max_after_balance_size", lower = 0L, default = 1L),
makeIntegerLearnerParam("seed", tunable = FALSE)
),
properties = c("twoclass", "multiclass", "numerics", "factors", "prob", "missings"),
properties = c("twoclass", "multiclass", "numerics", "factors", "prob", "missings", "featimp"),
name = "h2o.gbm",
short.name = "h2o.gbm",
note = "'distribution' is set automatically to 'gaussian'.",
Expand Down Expand Up @@ -57,3 +57,10 @@ predictLearner.classif.h2o.gbm = function(.learner, .model, .newdata, ...) {
return(as.matrix(p.df))
}
}

#' @export
getFeatureImportanceLearner.classif.h2o.gbm = function(.learner, .model, ...) {
  # Unwrap down to the raw h2o model and delegate to the shared extractor.
  extractH2OVarImp(getLearnerModel(.model, more.unwrap = TRUE), ...)
}

16 changes: 15 additions & 1 deletion R/RLearner_classif_h2oglm.R
Expand Up @@ -21,7 +21,7 @@ makeRLearner.classif.h2o.glm = function() {
makeUntypedLearnerParam("beta_constraints"),
makeLogicalLearnerParam("intercept", default = TRUE)
),
properties = c("twoclass", "numerics", "factors", "prob", "weights", "missings"),
properties = c("twoclass", "numerics", "factors", "prob", "weights", "missings", "featimp"),
name = "h2o.glm",
short.name = "h2o.glm",
note = '`family` is always set to `"binomial"` to get a binary classifier. The default value of `missing_values_handling` is `"MeanImputation"`, so missing values are automatically mean-imputed.',
Expand Down Expand Up @@ -69,3 +69,17 @@ predictLearner.classif.h2o.glm = function(.learner, .model, .newdata, ...) {
return(ret)
}
}

#' @export
getFeatureImportanceLearner.classif.h2o.glm = function(.learner, .model, ...) {
  # GLM has its own extractor: h2o.varimp() on a GLM reports
  # "names"/"coefficients" columns instead of "variable"/"relative_importance".
  extractH2OGlmVarImp(getLearnerModel(.model, more.unwrap = TRUE), ...)
}


# Turn the variable-importance table of an h2o GLM into a named numeric vector.
# NA rows from the h2o table are dropped; the "coefficients" column supplies the
# values and the "names" column supplies the vector names.
extractH2OGlmVarImp = function(.learner.model, ...) {
  vi = na.omit(as.data.frame(h2o::h2o.varimp(.learner.model)))
  setNames(vi$coefficients, vi$names)
}
15 changes: 14 additions & 1 deletion R/RLearner_classif_h2orandomForest.R
Expand Up @@ -17,7 +17,7 @@ makeRLearner.classif.h2o.randomForest = function() {
makeIntegerLearnerParam("max_after_balance_size", lower = 0L, default = 5L),
makeIntegerLearnerParam("seed", tunable = FALSE)
),
properties = c("twoclass", "multiclass", "numerics", "factors", "missings", "prob"),
properties = c("twoclass", "multiclass", "numerics", "factors", "missings", "prob", "featimp"),
name = "h2o.randomForest",
short.name = "h2o.rf",
callees = "h2o.randomForest"
Expand Down Expand Up @@ -58,3 +58,16 @@ predictLearner.classif.h2o.randomForest = function(.learner, .model, .newdata, .
return(as.matrix(p.df))
}
}

#' @export
getFeatureImportanceLearner.classif.h2o.randomForest = function(.learner, .model, ...) {
  # Unwrap down to the raw h2o model and delegate to the shared extractor.
  extractH2OVarImp(getLearnerModel(.model, more.unwrap = TRUE), ...)
}

# Turn the variable-importance table of an h2o model into a named numeric
# vector: values from the "relative_importance" column, names from the
# "variable" column. NA rows returned by h2o are dropped first.
extractH2OVarImp = function(.learner.model, ...) {
  vi = na.omit(as.data.frame(h2o::h2o.varimp(.learner.model)))
  setNames(vi$relative_importance, vi$variable)
}
2 changes: 2 additions & 0 deletions R/getFeatureImportance.R
Expand Up @@ -20,6 +20,8 @@
#' {Estimation of relative influence for each feature. See
#' \link[gbm:relative.influence]{relative.influence}
#' for details and further parameters.}
#' \item{h2o} \cr
#' {Relative feature importances as returned by \link[h2o:h2o.varimp]{varimp}.}
#' \item{randomForest} \cr
#' {For `type = 2` (the default) the 'MeanDecreaseGini' is measured,
#' which is based on the Gini impurity index used for the calculation of the nodes.
Expand Down
16 changes: 16 additions & 0 deletions tests/testthat/test_classif_h2odeeplearning.R
Expand Up @@ -41,3 +41,19 @@ test_that("class names are integers and probabilities predicted (#1787)", {
r = resample(gb.lrn, classif.task, rin)
expect_false(is.null(r$pred))
})

test_that("feature importances are returned", {
  # Two-class subset of iris so the binary h2o learner applies.
  binary.iris = iris[iris$Species %in% c("versicolor", "virginica"), ]
  binary.iris$Species = droplevels(binary.iris$Species)
  binary.task = makeClassifTask(data = binary.iris, target = "Species")

  mod = train(makeLearner("classif.h2o.deeplearning"), binary.task)
  imp = getFeatureImportance(mod)$res
  h2o.imp = h2o::h2o.varimp(getLearnerModel(mod))[, c("variable", "relative_importance")]
  # Reshape h2o's long importance table into the same one-row data.frame layout.
  expected = data.frame(as.list(xtabs(relative_importance ~ variable, data = h2o.imp)))[names(imp)]

  expect_equal(imp, expected)
})
16 changes: 16 additions & 0 deletions tests/testthat/test_classif_h2ogbm.R
Expand Up @@ -39,3 +39,19 @@ test_that("class names are integers and probabilities predicted (#1787)", {
r = resample(gb.lrn, classif.task, rin)
expect_false(is.null(r$pred))
})

test_that("feature importances are returned", {
  # Two-class subset of iris so the binary h2o learner applies.
  binary.iris = iris[iris$Species %in% c("versicolor", "virginica"), ]
  binary.iris$Species = droplevels(binary.iris$Species)
  binary.task = makeClassifTask(data = binary.iris, target = "Species")

  mod = train(makeLearner("classif.h2o.gbm"), binary.task)
  imp = getFeatureImportance(mod)$res
  h2o.imp = h2o::h2o.varimp(getLearnerModel(mod))[, c("variable", "relative_importance")]
  # Reshape h2o's long importance table into the same one-row data.frame layout.
  expected = data.frame(as.list(xtabs(relative_importance ~ variable, data = h2o.imp)))[names(imp)]

  expect_equal(imp, expected)
})
16 changes: 16 additions & 0 deletions tests/testthat/test_classif_h2oglm.R
Expand Up @@ -38,3 +38,19 @@ test_that("class names are integers and probabilities predicted (#1787)", {
r = resample(gb.lrn, classif.task, rin)
expect_false(is.null(r$pred))
})

test_that("feature importances are returned", {
  # Two-class subset of iris so the binary h2o learner applies.
  binary.iris = iris[iris$Species %in% c("versicolor", "virginica"), ]
  binary.iris$Species = droplevels(binary.iris$Species)
  binary.task = makeClassifTask(data = binary.iris, target = "Species")

  mod = train(makeLearner("classif.h2o.glm"), binary.task)
  imp = getFeatureImportance(mod)$res
  # GLM varimp uses "names"/"coefficients" columns; NA rows are dropped.
  h2o.imp = na.omit(h2o::h2o.varimp(getLearnerModel(mod))[, c("names", "coefficients")])
  # Reshape h2o's long importance table into the same one-row data.frame layout.
  expected = data.frame(as.list(xtabs(coefficients ~ names, data = h2o.imp)))[names(imp)]

  expect_equal(imp, expected)
})
16 changes: 16 additions & 0 deletions tests/testthat/test_classif_h2orandomForest.R
Expand Up @@ -40,3 +40,19 @@ test_that("class names are integers and probabilities predicted (#1787)", {
r = resample(gb.lrn, classif.task, rin)
expect_false(is.null(r$pred))
})

test_that("feature importances are returned", {
  # Two-class subset of iris so the binary h2o learner applies.
  binary.iris = iris[iris$Species %in% c("versicolor", "virginica"), ]
  binary.iris$Species = droplevels(binary.iris$Species)
  binary.task = makeClassifTask(data = binary.iris, target = "Species")

  mod = train(makeLearner("classif.h2o.randomForest"), binary.task)
  imp = getFeatureImportance(mod)$res
  h2o.imp = h2o::h2o.varimp(getLearnerModel(mod))[, c("variable", "relative_importance")]
  # Reshape h2o's long importance table into the same one-row data.frame layout.
  expected = data.frame(as.list(xtabs(relative_importance ~ variable, data = h2o.imp)))[names(imp)]

  expect_equal(imp, expected)
})