From 465d1262eb1d8eb3cfa7cc505140c035e52c8118 Mon Sep 17 00:00:00 2001 From: Guolin Ke Date: Mon, 21 Oct 2019 08:07:26 +0800 Subject: [PATCH] check sorted indices in Subset (#2510) * Update sparse_bin.hpp * check sorted in c_api * fix python package * fix tests * fix test * std::is_sorted * Update basic.py --- python-package/lightgbm/basic.py | 2 +- python-package/lightgbm/engine.py | 4 ++-- src/c_api.cpp | 3 +++ tests/python_package_test/test_engine.py | 8 ++++---- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 26270543fa1..59f2b9d67e5 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1095,7 +1095,7 @@ def subset(self, used_indices, params=None): free_raw_data=self.free_raw_data) ret._predictor = self._predictor ret.pandas_categorical = self.pandas_categorical - ret.used_indices = used_indices + ret.used_indices = sorted(used_indices) return ret def save_binary(self, filename): diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index a3f0189e571..c5329541b88 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -339,8 +339,8 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi ret = _CVBooster() for train_idx, test_idx in folds: - train_set = full_data.subset(train_idx) - valid_set = full_data.subset(test_idx) + train_set = full_data.subset(sorted(train_idx)) + valid_set = full_data.subset(sorted(test_idx)) # run preprocessing on the data set if needed if fpreproc is not None: train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy()) diff --git a/src/c_api.cpp b/src/c_api.cpp index fbf272d3dab..5b28be8346e 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -921,6 +921,9 @@ int LGBM_DatasetGetSubset( const int32_t lower = 0; const int32_t upper = full_dataset->num_data() - 1; 
Common::CheckElementsIntervalClosed(used_row_indices, lower, upper, num_used_row_indices, "Used indices of subset"); + if (!std::is_sorted(used_row_indices, used_row_indices + num_used_row_indices)) { + Log::Fatal("used_row_indices should be sorted in Subset"); + } auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices)); ret->CopyFeatureMapperFrom(full_dataset); ret->CopySubset(full_dataset, used_row_indices, num_used_row_indices, true); diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 51c99494e68..b226f4fd590 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -820,9 +820,9 @@ def test_init_with_subset(self): data = np.random.random((500, 2)) y = [1] * 250 + [0] * 250 lgb_train = lgb.Dataset(data, y, free_raw_data=False) - subset_index_1 = np.random.choice(np.arange(500), 300, replace=False) + subset_index_1 = sorted(np.random.choice(np.arange(500), 300, replace=False)) subset_data_1 = lgb_train.subset(subset_index_1) - subset_index_2 = np.random.choice(np.arange(500), 200, replace=False) + subset_index_2 = sorted(np.random.choice(np.arange(500), 200, replace=False)) subset_data_2 = lgb_train.subset(subset_index_2) params = { 'objective': 'binary', @@ -1601,8 +1601,8 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, iter_min = min([iter_min_l1, iter_min_l2]) iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2]) - iter_cv_l1 = 3 - iter_cv_l2 = 17 + iter_cv_l1 = 4 + iter_cv_l2 = 12 self.assertEqual(len(set([iter_cv_l1, iter_cv_l2])), 2) iter_cv_min = min([iter_cv_l1, iter_cv_l2])