[python-package][R-package] allow using feature names when retrieving number of bins (#5116)

* allow using feature names when retrieving number of bins

* unname vector

* use default feature names when not defined

* lint

* apply suggestions

* remove extra comma

* add test with categorical feature

* make feature names sync more transparent
jmoralez committed May 17, 2022
1 parent 53218c1 commit 5b664b6
Showing 4 changed files with 69 additions and 9 deletions.
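As a quick illustration of the change summarized above, the sketch below queries bin counts by feature name through the Python package. It is a minimal usage sketch, not part of the diff: the array, the feature names 'f_cont' and 'f_bin', and the min_data_in_bin value are illustrative, and it assumes a lightgbm build that includes this commit.

import numpy as np
import lightgbm as lgb

# Illustrative data: 100 rows, one continuous and one binary feature.
X = np.column_stack([
    np.linspace(0.0, 1.0, 100),
    np.random.choice([0.0, 1.0], 100),
])

ds = lgb.Dataset(
    X,
    feature_name=['f_cont', 'f_bin'],
    params={'min_data_in_bin': 2},
).construct()

# Before this commit, feature_num_bin() accepted only an integer index;
# it now also accepts the feature's name and resolves it to the same index.
assert ds.feature_num_bin('f_cont') == ds.feature_num_bin(0)
assert ds.feature_num_bin('f_bin') == ds.feature_num_bin(1)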
14 changes: 14 additions & 0 deletions R-package/R/lgb.Dataset.R
@@ -289,6 +289,13 @@ Dataset <- R6::R6Class(
self$set_colnames(colnames = private$colnames)
}

# Ensure that private$colnames matches the feature names on the C++ side. This line is necessary
# in cases like constructing from a file or from a matrix with no column names.
private$colnames <- .Call(
LGBM_DatasetGetFeatureNames_R
, private$handle
)

# Load init score if requested
if (!is.null(private$predictor) && is.null(private$used_indices)) {

@@ -381,6 +388,13 @@ Dataset <- R6::R6Class(
if (lgb.is.null.handle(x = private$handle)) {
stop("Cannot get number of bins in feature before constructing Dataset.")
}
if (is.character(feature)) {
feature_name <- feature
feature <- which(private$colnames == feature_name)
if (length(feature) == 0L) {
stop(sprintf("feature '%s' not found", feature_name))
}
}
num_bin <- integer(1L)
.Call(
LGBM_DatasetGetFeatureNumBin_R
37 changes: 32 additions & 5 deletions R-package/tests/testthat/test_dataset.R
@@ -533,20 +533,47 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
, three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0)
, two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_)
, all_zero = rep(0.0, 100L)
, categorical = sample.int(2L, 100L, replace = TRUE)
)
n_features <- ncol(raw_df)
raw_mat <- data.matrix(raw_df)
min_data_in_bin <- 2L
ds <- lgb.Dataset(raw_mat, params = list(min_data_in_bin = min_data_in_bin))
ds <- lgb.Dataset(
raw_mat
, params = list(min_data_in_bin = min_data_in_bin)
, categorical_feature = n_features
)
ds$construct()
expected_num_bins <- c(
100L %/% min_data_in_bin + 1L # extra bin for zero
, 3L # 0, 1, 2
, 3L # 0, 1, 2
, 4L # 0, 1, 2 + NA
, 0L # unused
, 3L # 1, 2 + NA
)
actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin)
actual_num_bins <- sapply(1L:n_features, ds$get_feature_num_bin)
expect_identical(actual_num_bins, expected_num_bins)
# test using defined feature names
bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin)
expect_identical(unname(bins_by_name), expected_num_bins)
# test using default feature names
no_names_mat <- raw_mat
colnames(no_names_mat) <- NULL
ds_no_names <- lgb.Dataset(
no_names_mat
, params = list(min_data_in_bin = min_data_in_bin)
, categorical_feature = n_features
)
ds_no_names$construct()
default_names <- lapply(
X = seq(1L, ncol(raw_mat))
, FUN = function(i) {
sprintf("Column_%d", i - 1L)
}
)
bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin)
expect_identical(bins_by_default_name, expected_num_bins)
})

test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
@@ -555,9 +582,9 @@ test_that("lgb.Dataset can be constructed with categorical features and without
ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct()
sparse_mat <- as(raw_mat, "dgCMatrix")
ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct()
# check that the column names are NULL
expect_null(ds$.__enclos_env__$private$colnames)
expect_null(ds2$.__enclos_env__$private$colnames)
# check that the column names are the default ones
expect_equal(ds$.__enclos_env__$private$colnames, "Column_0")
expect_equal(ds2$.__enclos_env__$private$colnames, "Column_0")
# check for error when index is greater than the number of columns
expect_error({
lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
9 changes: 6 additions & 3 deletions python-package/lightgbm/basic.py
@@ -1817,6 +1817,7 @@ def construct(self):
feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params)
if self.free_raw_data:
self.data = None
self.feature_name = self.get_feature_name()
return self

def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None):
@@ -2382,20 +2383,22 @@ def num_feature(self):
else:
raise LightGBMError("Cannot get num_feature before construct dataset")

def feature_num_bin(self, feature: int) -> int:
def feature_num_bin(self, feature: Union[int, str]) -> int:
"""Get the number of bins for a feature.
Parameters
----------
feature : int
Index of the feature.
feature : int or str
Index or name of the feature.
Returns
-------
number_of_bins : int
The number of constructed bins for the feature in the Dataset.
"""
if self.handle is not None:
if isinstance(feature, str):
feature = self.feature_name.index(feature)
ret = ctypes.c_int(0)
_safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle,
ctypes.c_int(feature),
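A small sketch of how the two Python-side changes above interact: construct() now syncs self.feature_name with the names stored on the C++ side, and feature_num_bin() resolves a string through that list. Again illustrative rather than part of the commit, with made-up data and assuming no explicit feature names are supplied.

import numpy as np
import lightgbm as lgb

X = np.random.rand(100, 3)

# No feature_name is passed, so construct() fills self.feature_name with the
# C++-side defaults: 'Column_0', 'Column_1', ...
ds = lgb.Dataset(X, params={'min_data_in_bin': 2}).construct()
print(ds.feature_name)  # ['Column_0', 'Column_1', 'Column_2']

# A string is resolved via self.feature_name.index(), so these two calls
# return the same value; an unknown name raises ValueError.
assert ds.feature_num_bin('Column_1') == ds.feature_num_bin(1)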
18 changes: 17 additions & 1 deletion tests/python_package_test/test_basic.py
@@ -663,17 +663,33 @@ def test_feature_num_bin(min_data_in_bin):
np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
np.random.choice([0, 1], 100),
]).T
ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct()
n_continuous = X.shape[1] - 1
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
ds_kwargs = dict(
params={'min_data_in_bin': min_data_in_bin},
categorical_feature=[n_continuous], # last feature
)
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
expected_num_bins = [
100 // min_data_in_bin + 1, # extra bin for zero
3, # 0, 1, 2
3, # 0, 1, 2
4, # 0, 1, 2 + nan
0, # unused
3, # 0, 1 + nan
]
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
assert actual_num_bins == expected_num_bins
# test using defined feature names
bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
assert bins_by_name == expected_num_bins
# test using default feature names
ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
default_names = [f'Column_{i}' for i in range(X.shape[1])]
bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
assert bins_by_default_name == expected_num_bins
# check for feature indices outside of range
num_features = X.shape[1]
with pytest.raises(
