Skip to content

Commit

Permalink
check sorted indices in Subset (#2510)
Browse files Browse the repository at this point in the history
* Update sparse_bin.hpp

* check sorted in c_api

* fix python package

* fix tests

* fix test

* std::is_sorted

* Update basic.py
  • Loading branch information
guolinke committed Oct 21, 2019
1 parent 198d0f3 commit 465d126
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 7 deletions.
2 changes: 1 addition & 1 deletion python-package/lightgbm/basic.py
Expand Up @@ -1095,7 +1095,7 @@ def subset(self, used_indices, params=None):
free_raw_data=self.free_raw_data)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
- ret.used_indices = used_indices
+ ret.used_indices = sorted(used_indices)
return ret

def save_binary(self, filename):
Expand Down
4 changes: 2 additions & 2 deletions python-package/lightgbm/engine.py
Expand Up @@ -339,8 +339,8 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi

ret = _CVBooster()
for train_idx, test_idx in folds:
- train_set = full_data.subset(train_idx)
- valid_set = full_data.subset(test_idx)
+ train_set = full_data.subset(sorted(train_idx))
+ valid_set = full_data.subset(sorted(test_idx))
# run preprocessing on the data set if needed
if fpreproc is not None:
train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
Expand Down
3 changes: 3 additions & 0 deletions src/c_api.cpp
Expand Up @@ -921,6 +921,9 @@ int LGBM_DatasetGetSubset(
const int32_t lower = 0;
const int32_t upper = full_dataset->num_data() - 1;
Common::CheckElementsIntervalClosed(used_row_indices, lower, upper, num_used_row_indices, "Used indices of subset");
+ if (!std::is_sorted(used_row_indices, used_row_indices + num_used_row_indices)) {
+ Log::Fatal("used_row_indices should be sorted in Subset");
+ }
auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices));
ret->CopyFeatureMapperFrom(full_dataset);
ret->CopySubset(full_dataset, used_row_indices, num_used_row_indices, true);
Expand Down
8 changes: 4 additions & 4 deletions tests/python_package_test/test_engine.py
Expand Up @@ -820,9 +820,9 @@ def test_init_with_subset(self):
data = np.random.random((500, 2))
y = [1] * 250 + [0] * 250
lgb_train = lgb.Dataset(data, y, free_raw_data=False)
- subset_index_1 = np.random.choice(np.arange(500), 300, replace=False)
+ subset_index_1 = sorted(np.random.choice(np.arange(500), 300, replace=False))
subset_data_1 = lgb_train.subset(subset_index_1)
- subset_index_2 = np.random.choice(np.arange(500), 200, replace=False)
+ subset_index_2 = sorted(np.random.choice(np.arange(500), 200, replace=False))
subset_data_2 = lgb_train.subset(subset_index_2)
params = {
'objective': 'binary',
Expand Down Expand Up @@ -1601,8 +1601,8 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration,
iter_min = min([iter_min_l1, iter_min_l2])
iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])

- iter_cv_l1 = 3
- iter_cv_l2 = 17
+ iter_cv_l1 = 4
+ iter_cv_l2 = 12
self.assertEqual(len(set([iter_cv_l1, iter_cv_l2])), 2)
iter_cv_min = min([iter_cv_l1, iter_cv_l2])

Expand Down

0 comments on commit 465d126

Please sign in to comment.