Skip to content

Commit

Permalink
[R-package] enable use of trees with linear models at leaves (fixes #…
Browse files Browse the repository at this point in the history
…3319) (#3699)

* [R-package] enable use of trees with linear models at leaves (fixes #3319)

* remove problematic pragmas

* fix tests

* try to fix build scripts

* try fixing pragma check

* more pragma checks

* ok fix pragma stuff for real

* empty commit

* regenerate documentation

* try skipping test

* uncomment CI

* add note on missing value types for R

* add tests on saving and re-loading booster
  • Loading branch information
jameslamb committed Jan 18, 2021
1 parent 706f2af commit ed651e8
Show file tree
Hide file tree
Showing 11 changed files with 473 additions and 20 deletions.
6 changes: 6 additions & 0 deletions R-package/configure
Expand Up @@ -1699,6 +1699,12 @@ CXX=`"${R_HOME}/bin/R" CMD config CXX11`
# LightGBM-specific flags
LGB_CPPFLAGS=""

#########
# Eigen #
#########

# Append the EIGEN_MPL2_ONLY preprocessor define to the LightGBM-specific flags.
# NOTE(review): Eigen documents this macro as restricting the build to its
# MPL2-licensed code -- confirm that is the intent here.
LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"

###############
# MM_PREFETCH #
###############
Expand Down
6 changes: 6 additions & 0 deletions R-package/configure.ac
Expand Up @@ -26,6 +26,12 @@ CXX=`"${R_HOME}/bin/R" CMD config CXX11`
# LightGBM-specific flags
LGB_CPPFLAGS=""

#########
# Eigen #
#########

# Append the EIGEN_MPL2_ONLY preprocessor define to the LightGBM-specific flags.
# NOTE(review): Eigen documents this macro as restricting the build to its
# MPL2-licensed code -- confirm that is the intent here.
LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"

###############
# MM_PREFETCH #
###############
Expand Down
6 changes: 6 additions & 0 deletions R-package/configure.win
Expand Up @@ -12,6 +12,12 @@ CC=`"${R_EXE}" CMD config CC`
# LightGBM-specific flags
LGB_CPPFLAGS=""

#########
# Eigen #
#########

# Append the EIGEN_MPL2_ONLY preprocessor define to the LightGBM-specific flags.
# NOTE(review): Eigen documents this macro as restricting the build to its
# MPL2-licensed code -- confirm that is the intent here.
LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"

###############
# MM_PREFETCH #
###############
Expand Down
280 changes: 280 additions & 0 deletions R-package/tests/testthat/test_basic.R
Expand Up @@ -345,6 +345,45 @@ test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric
expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)])
})

# Cross-validates the same synthetic regression problem twice, once with the
# default trees and once with `linear_tree = TRUE`, and checks that the
# linear-leaf run reaches a strictly lower best score. The metric is "mse", so
# lower is better.
test_that("lgb.cv() fit on linearly-related data improves when using linear learners", {
  set.seed(708L)

  # Returns a fresh Dataset on every call: a single standard-normal feature
  # with 1000 rows, and a label equal to 2 * feature plus uniform noise drawn
  # from [0, 0.1].
  .new_dataset <- function() {
    X <- matrix(rnorm(1000L), ncol = 1L)
    return(lgb.Dataset(
      data = X
      , label = 2L * X + runif(nrow(X), 0L, 0.1)
    ))
  }

  params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
  )

  # Baseline: 5-fold CV without linear leaves.
  dtrain <- .new_dataset()
  cv_bst <- lgb.cv(
    data = dtrain
    , nrounds = 10L
    , params = params
    , nfold = 5L
  )
  expect_is(cv_bst, "lgb.CVBooster")

  # A new Dataset is created for the second run instead of reusing the first.
  # NOTE(review): presumably because linear_tree cannot be switched on for a
  # Dataset that has already been constructed -- confirm.
  dtrain <- .new_dataset()
  cv_bst_linear <- lgb.cv(
    data = dtrain
    , nrounds = 10L
    , params = modifyList(params, list(linear_tree = TRUE))
    , nfold = 5L
  )
  expect_is(cv_bst_linear, "lgb.CVBooster")

  expect_true(cv_bst_linear$best_score < cv_bst$best_score)
})

context("lgb.train()")

test_that("lgb.train() works as expected with multiple eval metrics", {
Expand Down Expand Up @@ -1631,6 +1670,247 @@ test_that("early stopping works with lgb.cv()", {
)
})

# Groups the tests that train with `linear_tree = TRUE`.
context("linear learner")

# Trains two boosters on the same synthetic regression problem, one with the
# default trees and one with `linear_tree = TRUE`, and checks that the
# linear-leaf booster ends with a strictly lower training l2 after 10 rounds.
test_that("lgb.train() fit on linearly-related data improves when using linear learners", {
  set.seed(708L)

  # Returns a fresh Dataset on every call: a single standard-normal feature
  # with 100 rows, and a label equal to 2 * feature plus uniform noise drawn
  # from [0, 0.1].
  .new_dataset <- function() {
    X <- matrix(rnorm(100L), ncol = 1L)
    return(lgb.Dataset(
      data = X
      , label = 2L * X + runif(nrow(X), 0L, 0.1)
    ))
  }

  params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
  )

  # Baseline booster without linear leaves. The training data is also passed
  # as the validation set "train" so its metric is recorded in record_evals.
  dtrain <- .new_dataset()
  bst <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = params
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst))

  # A new Dataset is created for the second run instead of reusing the first.
  # NOTE(review): presumably because linear_tree cannot be switched on for a
  # Dataset that has already been constructed -- confirm.
  dtrain <- .new_dataset()
  bst_linear <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = modifyList(params, list(linear_tree = TRUE))
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst_linear))

  # The metric was requested as "mse" but is read back under the key "l2".
  # Element 10 is the value after the last of the 10 rounds.
  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  expect_true(bst_lin_last_mse < bst_last_mse)
})


# Intended to check that asking for `linear_tree = TRUE` on a Dataset that was
# already constructed without it raises an error. The test is skipped
# unconditionally by its first statement, so nothing below the skip runs.
test_that("lgb.train() w/ linear learner fails already-constructed dataset with linear=false", {
  testthat::skip("Skipping this test because it causes issues for valgrind")
  set.seed(708L)
  base_params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
  )
  linear_params <- modifyList(base_params, list(linear_tree = TRUE))

  # Feature and label are independent standard-normal draws.
  dtrain <- lgb.Dataset(
    data = matrix(rnorm(100L), ncol = 1L)
    , label = rnorm(100L)
  )
  # Construct the Dataset up front, before linear_tree is ever requested.
  dtrain$construct()

  expect_error(
    {
      bst_linear <- lgb.train(
        data = dtrain
        , nrounds = 10L
        , params = linear_params
        , valids = list("train" = dtrain)
      )
    }
    , regexp = "Cannot change linear_tree after constructed Dataset handle"
  )
})

# Same comparison as the plain linear-learner test, but 10 of the 100 feature
# values are NA: the linear-leaf booster must still train and must end with a
# lower training l2 than the booster without linear leaves.
test_that("lgb.train() works with linear learners even if Dataset has missing values", {
  set.seed(708L)

  # Builds a new Dataset with one feature column containing 10 NAs at random
  # positions. The label is derived from the feature, so it is NA in the same
  # rows.
  make_dataset <- function() {
    draws <- rnorm(100L)
    na_positions <- sample(seq_along(draws), size = 10L)
    draws[na_positions] <- NA_real_
    feature <- matrix(
      data = sample(draws, size = 100L)
      , ncol = 1L
    )
    target <- 2L * feature + runif(nrow(feature), 0L, 0.1)
    lgb.Dataset(
      data = feature
      , label = target
    )
  }

  # Trains for 10 rounds on a freshly generated Dataset, which is also used as
  # the validation set "train".
  fit_booster <- function(train_params) {
    dataset <- make_dataset()
    lgb.train(
      data = dataset
      , nrounds = 10L
      , params = train_params
      , valids = list("train" = dataset)
    )
  }

  # Training l2 recorded after the 10th (last) round.
  final_l2 <- function(booster) {
    booster$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  }

  base_params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
  )

  booster_plain <- fit_booster(base_params)
  expect_true(lgb.is.Booster(booster_plain))

  linear_params <- modifyList(base_params, list(linear_tree = TRUE))
  booster_linear <- fit_booster(linear_params)
  expect_true(lgb.is.Booster(booster_linear))

  plain_l2 <- final_l2(booster_plain)
  linear_l2 <- final_l2(booster_linear)
  expect_true(linear_l2 < plain_l2)
})

# Combines linear leaves with bagging (bagging_freq = 1, subsample = 0.8) on
# data where 10 of the 100 feature values are NA, and checks that the
# linear-leaf booster still ends with a lower training l2.
test_that("lgb.train() works with linear learners, bagging, and a Dataset that has missing values", {
  set.seed(708L)

  # Produces a new single-feature Dataset with 10 randomly placed NAs.
  build_data <- function() {
    raw <- rnorm(100L)
    raw[sample(seq_along(raw), size = 10L)] <- NA_real_
    shuffled <- sample(raw, size = 100L)
    features <- matrix(data = shuffled, ncol = 1L)
    noise <- runif(nrow(features), 0L, 0.1)
    lgb.Dataset(
      data = features
      , label = 2L * features + noise
    )
  }

  bagging_params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
    , bagging_freq = 1L
    , subsample = 0.8
  )

  # Booster without linear leaves.
  train_data <- build_data()
  booster_const <- lgb.train(
    data = train_data
    , nrounds = 10L
    , params = bagging_params
    , valids = list("train" = train_data)
  )
  expect_true(lgb.is.Booster(booster_const))

  # Booster with linear leaves, trained on a newly generated Dataset.
  bagging_linear_params <- modifyList(bagging_params, list(linear_tree = TRUE))
  train_data <- build_data()
  booster_lin <- lgb.train(
    data = train_data
    , nrounds = 10L
    , params = bagging_linear_params
    , valids = list("train" = train_data)
  )
  expect_true(lgb.is.Booster(booster_lin))

  # Compare the training l2 recorded at the 10th (last) round.
  const_evals <- booster_const$record_evals[["train"]][["l2"]][["eval"]]
  lin_evals <- booster_lin$record_evals[["train"]][["l2"]][["eval"]]
  expect_true(lin_evals[[10L]] < const_evals[[10L]])
})

# Smoke test: training with linear leaves must return a Booster even when the
# only feature is NA in every row except one.
test_that("lgb.train() works with linear learners and data where a feature has only 1 non-NA value", {
  set.seed(708L)

  # The single feature is NA everywhere except row 18. The label is computed
  # from the feature, so it is also NA in every other row.
  sparse_column <- rep(NA_real_, 100L)
  sparse_column[18L] <- rnorm(1L)
  feature <- matrix(
    data = sparse_column
    , ncol = 1L
  )
  target <- 2L * feature + runif(nrow(feature), 0L, 0.1)
  dtrain <- lgb.Dataset(
    data = feature
    , label = target
  )

  linear_params <- modifyList(
    list(
      objective = "regression"
      , verbose = -1L
      , metric = "mse"
      , seed = 0L
      , num_leaves = 2L
    )
    , list(linear_tree = TRUE)
  )

  bst_linear <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = linear_params
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst_linear))
})

# Trains with and without linear leaves on a two-column Dataset whose second
# column holds integer codes 1-4, declared as categorical through `params`.
# Checks that the linear-leaf booster ends with a lower training l2.
test_that("lgb.train() works with linear learners when Dataset has categorical features", {
  set.seed(708L)

  # Returns a fresh Dataset on every call. Column 1 is standard normal and
  # drives the label (2 * column 1 plus uniform noise from [0, 0.1]).
  # Column 2 is a random draw from {1, 2, 3, 4} that does not enter the label.
  .new_dataset <- function() {
    X <- matrix(numeric(200L), nrow = 100L, ncol = 2L)
    X[, 1L] <- rnorm(100L)
    X[, 2L] <- sample(seq_len(4L), size = 100L, replace = TRUE)
    return(lgb.Dataset(
      data = X
      , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1)
    ))
  }

  params <- list(
    objective = "regression"
    , verbose = -1L
    , metric = "mse"
    , seed = 0L
    , num_leaves = 2L
    # The parameter name was previously misspelled, so no column was actually
    # declared categorical.
    # NOTE(review): assumes indices passed through `params` are 0-based, as in
    # LightGBM's core parameter docs, so 1L refers to the second column (the
    # integer-coded one) -- confirm.
    , categorical_feature = 1L
  )

  # Booster without linear leaves.
  dtrain <- .new_dataset()
  bst <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = params
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst))

  # Booster with linear leaves, trained on a newly generated Dataset.
  dtrain <- .new_dataset()
  bst_linear <- lgb.train(
    data = dtrain
    , nrounds = 10L
    , params = modifyList(params, list(linear_tree = TRUE))
    , valids = list("train" = dtrain)
  )
  expect_true(lgb.is.Booster(bst_linear))

  # Compare the training l2 recorded at the 10th (last) round.
  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
  expect_true(bst_lin_last_mse < bst_last_mse)
})

context("interaction constraints")

test_that("lgb.train() throws an informative error if interaction_constraints is not a list", {
Expand Down

0 comments on commit ed651e8

Please sign in to comment.