[python-package][R-package] allow using feature names when retrieving number of bins (#5116)

* allow using feature names when retrieving number of bins

* unname vector

* use default feature names when not defined

* lint

* apply suggestions

* remove extra comma

* add test with categorical feature

* make feature names sync more transparent
jmoralez committed May 17, 2022
1 parent 53218c1 commit 5b664b6
Showing 4 changed files with 69 additions and 9 deletions.
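As a quick illustration of the change summarized above, the sketch below queries bin counts by feature name through the Python package. It is a minimal usage sketch, not part of the diff: the array, the feature names 'f_cont' and 'f_bin', and the min_data_in_bin value are illustrative, and it assumes a lightgbm build that includes this commit.

import numpy as np
import lightgbm as lgb

# Illustrative data: 100 rows, one continuous and one binary feature.
X = np.column_stack([
    np.linspace(0.0, 1.0, 100),
    np.random.choice([0.0, 1.0], 100),
])

ds = lgb.Dataset(
    X,
    feature_name=['f_cont', 'f_bin'],
    params={'min_data_in_bin': 2},
).construct()

# Before this commit, feature_num_bin() accepted only an integer index;
# it now also accepts the feature's name and resolves it to the same index.
assert ds.feature_num_bin('f_cont') == ds.feature_num_bin(0)
assert ds.feature_num_bin('f_bin') == ds.feature_num_bin(1)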
14 changes: 14 additions & 0 deletions R-package/R/lgb.Dataset.R
@@ -289,6 +289,13 @@ Dataset <- R6::R6Class(
self$set_colnames(colnames = private$colnames)
}

# Ensure that private$colnames matches the feature names on the C++ side. This line is necessary
# in cases like constructing from a file or from a matrix with no column names.
private$colnames <- .Call(
LGBM_DatasetGetFeatureNames_R
, private$handle
)

# Load init score if requested
if (!is.null(private$predictor) && is.null(private$used_indices)) {

@@ -381,6 +388,13 @@ Dataset <- R6::R6Class(
if (lgb.is.null.handle(x = private$handle)) {
stop("Cannot get number of bins in feature before constructing Dataset.")
}
if (is.character(feature)) {
feature_name <- feature
feature <- which(private$colnames == feature_name)
if (length(feature) == 0L) {
stop(sprintf("feature '%s' not found", feature_name))
}
}
num_bin <- integer(1L)
.Call(
LGBM_DatasetGetFeatureNumBin_R
37 changes: 32 additions & 5 deletions R-package/tests/testthat/test_dataset.R
@@ -533,20 +533,47 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
, three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0)
, two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_)
, all_zero = rep(0.0, 100L)
, categorical = sample.int(2L, 100L, replace = TRUE)
)
n_features <- ncol(raw_df)
raw_mat <- data.matrix(raw_df)
min_data_in_bin <- 2L
ds <- lgb.Dataset(raw_mat, params = list(min_data_in_bin = min_data_in_bin))
ds <- lgb.Dataset(
raw_mat
, params = list(min_data_in_bin = min_data_in_bin)
, categorical_feature = n_features
)
ds$construct()
expected_num_bins <- c(
100L %/% min_data_in_bin + 1L # extra bin for zero
, 3L # 0, 1, 2
, 3L # 0, 1, 2
, 4L # 0, 1, 2 + NA
, 0L # unused
, 3L # 1, 2 + NA
)
actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin)
actual_num_bins <- sapply(1L:n_features, ds$get_feature_num_bin)
expect_identical(actual_num_bins, expected_num_bins)
# test using defined feature names
bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin)
expect_identical(unname(bins_by_name), expected_num_bins)
# test using default feature names
no_names_mat <- raw_mat
colnames(no_names_mat) <- NULL
ds_no_names <- lgb.Dataset(
no_names_mat
, params = list(min_data_in_bin = min_data_in_bin)
, categorical_feature = n_features
)
ds_no_names$construct()
default_names <- lapply(
X = seq(1L, ncol(raw_mat))
, FUN = function(i) {
sprintf("Column_%d", i - 1L)
}
)
bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin)
expect_identical(bins_by_default_name, expected_num_bins)
})

test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
@@ -555,9 +582,9 @@ test_that("lgb.Dataset can be constructed with categorical features and without
ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct()
sparse_mat <- as(raw_mat, "dgCMatrix")
ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct()
# check that the column names are NULL
expect_null(ds$.__enclos_env__$private$colnames)
expect_null(ds2$.__enclos_env__$private$colnames)
# check that the column names are the default ones
expect_equal(ds$.__enclos_env__$private$colnames, "Column_0")
expect_equal(ds2$.__enclos_env__$private$colnames, "Column_0")
# check for error when index is greater than the number of columns
expect_error({
lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
9 changes: 6 additions & 3 deletions python-package/lightgbm/basic.py
@@ -1817,6 +1817,7 @@ def construct(self):
feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params)
if self.free_raw_data:
self.data = None
self.feature_name = self.get_feature_name()
return self

def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None):
@@ -2382,20 +2383,22 @@ def num_feature(self):
else:
raise LightGBMError("Cannot get num_feature before construct dataset")

def feature_num_bin(self, feature: int) -> int:
def feature_num_bin(self, feature: Union[int, str]) -> int:
"""Get the number of bins for a feature.
Parameters
----------
feature : int
Index of the feature.
feature : int or str
Index or name of the feature.
Returns
-------
number_of_bins : int
The number of constructed bins for the feature in the Dataset.
"""
if self.handle is not None:
if isinstance(feature, str):
feature = self.feature_name.index(feature)
ret = ctypes.c_int(0)
_safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle,
ctypes.c_int(feature),
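A small sketch of how the two Python-side changes above interact: construct() now syncs self.feature_name with the names stored on the C++ side, and feature_num_bin() resolves a string through that list. Again illustrative rather than part of the commit, with made-up data and assuming no explicit feature names are supplied.

import numpy as np
import lightgbm as lgb

X = np.random.rand(100, 3)

# No feature_name is passed, so construct() fills self.feature_name with the
# C++-side defaults: 'Column_0', 'Column_1', ...
ds = lgb.Dataset(X, params={'min_data_in_bin': 2}).construct()
print(ds.feature_name)  # ['Column_0', 'Column_1', 'Column_2']

# A string is resolved via self.feature_name.index(), so these two calls
# return the same value; an unknown name raises ValueError.
assert ds.feature_num_bin('Column_1') == ds.feature_num_bin(1)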
18 changes: 17 additions & 1 deletion tests/python_package_test/test_basic.py
@@ -663,17 +663,33 @@ def test_feature_num_bin(min_data_in_bin):
np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
np.random.choice([0, 1], 100),
]).T
ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct()
n_continuous = X.shape[1] - 1
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
ds_kwargs = dict(
params={'min_data_in_bin': min_data_in_bin},
categorical_feature=[n_continuous], # last feature
)
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
expected_num_bins = [
100 // min_data_in_bin + 1, # extra bin for zero
3, # 0, 1, 2
3, # 0, 1, 2
4, # 0, 1, 2 + nan
0, # unused
3, # 0, 1 + nan
]
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
assert actual_num_bins == expected_num_bins
# test using defined feature names
bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
assert bins_by_name == expected_num_bins
# test using default feature names
ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
default_names = [f'Column_{i}' for i in range(X.shape[1])]
bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
assert bins_by_default_name == expected_num_bins
# check for feature indices outside of range
num_features = X.shape[1]
with pytest.raises(
