[R-package] enable use of trees with linear models at leaves (fixes #…

…3319) (#3699) * [R-package] enable use of trees with linear models at leaves (fixes #3319) * remove problematic pragmas * fix tests * try to fix build scripts * try fixing pragma check * more pragma checks * ok fix pragma stuff for real * empty commit * regenerate documentation * try skipping test * uncomment CI * add note on missing value types for R * add tests on saving and re-loading booster
microsoft · Jan 18, 2021 · ed651e8 · ed651e8
1 parent 706f2af
commit ed651e8
Show file tree

Hide file tree

Showing 11 changed files with 473 additions and 20 deletions.
diff --git a/R-package/configure b/R-package/configure
@@ -1699,6 +1699,12 @@ CXX=`"${R_HOME}/bin/R" CMD config CXX11`
 # LightGBM-specific flags
 LGB_CPPFLAGS=""
 
+#########
+# Eigen #
+#########
+
+LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"
+
 ###############
 # MM_PREFETCH #
 ###############

diff --git a/R-package/configure.ac b/R-package/configure.ac
@@ -26,6 +26,12 @@ CXX=`"${R_HOME}/bin/R" CMD config CXX11`
 # LightGBM-specific flags
 LGB_CPPFLAGS=""
 
+#########
+# Eigen #
+#########
+
+LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"
+
 ###############
 # MM_PREFETCH #
 ###############

diff --git a/R-package/configure.win b/R-package/configure.win
@@ -12,6 +12,12 @@ CC=`"${R_EXE}" CMD config CC`
 # LightGBM-specific flags
 LGB_CPPFLAGS=""
 
+#########
+# Eigen #
+#########
+
+LGB_CPPFLAGS="${LGB_CPPFLAGS} -DEIGEN_MPL2_ONLY"
+
 ###############
 # MM_PREFETCH #
 ###############

diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
@@ -345,6 +345,45 @@ test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric
   expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)])
 })
 
+test_that("lgb.cv() fit on linearly-relatead data improves when using linear learners", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    X <- matrix(rnorm(1000L), ncol = 1L)
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- .new_dataset()
+  cv_bst <- lgb.cv(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , nfold = 5L
+  )
+  expect_is(cv_bst, "lgb.CVBooster")
+
+  dtrain <- .new_dataset()
+  cv_bst_linear <- lgb.cv(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , nfold = 5L
+  )
+  expect_is(cv_bst_linear, "lgb.CVBooster")
+
+  expect_true(cv_bst_linear$best_score < cv_bst$best_score)
+})
+
 context("lgb.train()")
 
 test_that("lgb.train() works as expected with multiple eval metrics", {
@@ -1631,6 +1670,247 @@ test_that("early stopping works with lgb.cv()", {
   )
 })
 
+context("linear learner")
+
+test_that("lgb.train() fit on linearly-relatead data improves when using linear learners", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    X <- matrix(rnorm(100L), ncol = 1L)
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- .new_dataset()
+  bst <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst))
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+
+  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  expect_true(bst_lin_last_mse <  bst_last_mse)
+})
+
+
+test_that("lgb.train() w/ linear learner fails already-constructed dataset with linear=false", {
+  testthat::skip("Skipping this test because it causes issues for valgrind")
+  set.seed(708L)
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- lgb.Dataset(
+    data = matrix(rnorm(100L), ncol = 1L)
+    , label = rnorm(100L)
+  )
+  dtrain$construct()
+  expect_error({
+    bst_linear <- lgb.train(
+      data = dtrain
+      , nrounds = 10L
+      , params = modifyList(params, list(linear_tree = TRUE))
+      , valids = list("train" = dtrain)
+    )
+  }, regexp = "Cannot change linear_tree after constructed Dataset handle")
+})
+
+test_that("lgb.train() works with linear learners even if Dataset has missing values", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    values <- rnorm(100L)
+    values[sample(seq_len(length(values)), size = 10L)] <- NA_real_
+    X <- matrix(
+      data = sample(values, size = 100L)
+      , ncol = 1L
+    )
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- .new_dataset()
+  bst <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst))
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+
+  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  expect_true(bst_lin_last_mse <  bst_last_mse)
+})
+
+test_that("lgb.train() works with linear learners, bagging, and a Dataset that has missing values", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    values <- rnorm(100L)
+    values[sample(seq_len(length(values)), size = 10L)] <- NA_real_
+    X <- matrix(
+      data = sample(values, size = 100L)
+      , ncol = 1L
+    )
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+    , bagging_freq = 1L
+    , subsample = 0.8
+  )
+
+  dtrain <- .new_dataset()
+  bst <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst))
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+
+  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  expect_true(bst_lin_last_mse <  bst_last_mse)
+})
+
+test_that("lgb.train() works with linear learners and data where a feature has only 1 non-NA value", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    values <- rep(NA_real_, 100L)
+    values[18L] <- rnorm(1L)
+    X <- matrix(
+      data = values
+      , ncol = 1L
+    )
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+  )
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+})
+
+test_that("lgb.train() works with linear learners when Dataset has categorical features", {
+  set.seed(708L)
+  .new_dataset <- function() {
+    X <- matrix(numeric(200L), nrow = 100L, ncol = 2L)
+    X[, 1L] <- rnorm(100L)
+    X[, 2L] <- sample(seq_len(4L), size = 100L, replace = TRUE)
+    return(lgb.Dataset(
+      data = X
+      , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1)
+    ))
+  }
+
+  params <- list(
+    objective = "regression"
+    , verbose = -1L
+    , metric = "mse"
+    , seed = 0L
+    , num_leaves = 2L
+    , categorical_featurs = 1L
+  )
+
+  dtrain <- .new_dataset()
+  bst <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = params
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst))
+
+  dtrain <- .new_dataset()
+  bst_linear <- lgb.train(
+    data = dtrain
+    , nrounds = 10L
+    , params = modifyList(params, list(linear_tree = TRUE))
+    , valids = list("train" = dtrain)
+  )
+  expect_true(lgb.is.Booster(bst_linear))
+
+  bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]]
+  expect_true(bst_lin_last_mse <  bst_last_mse)
+})
+
 context("interaction constraints")
 
 test_that("lgb.train() throws an informative error if interaction_constraints is not a list", {