From ad8e8ccc9c9a5f8acb192b28715827ebc28dcc07 Mon Sep 17 00:00:00 2001
From: Guolin Ke
Date: Thu, 12 Sep 2019 20:52:52 +0800
Subject: [PATCH] update feature_fraction_bynode (#2381)

* update

* fix a bug

* Update config.h

* Update Parameters.rst
---
 docs/Parameters.rst                       | 18 +++--
 include/LightGBM/config.h                 | 17 ++--
 src/io/config_auto.cpp                    | 14 ++--
 .../data_parallel_tree_learner.cpp        |  6 +-
 src/treelearner/serial_tree_learner.cpp   | 77 +++++++++++++------
 src/treelearner/serial_tree_learner.h     |  6 +-
 .../voting_parallel_tree_learner.cpp      |  6 +-
 tests/python_package_test/test_engine.py  |  4 +-
 8 files changed, 92 insertions(+), 56 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index aaa10eef347..476013b9884 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -254,22 +254,24 @@ Learning Control Parameters
 
    - random seed for bagging
 
-- ``feature_fraction_bynode`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``sub_feature_bynode``, ``colsample_bytree_bynode``
-
-   - set this to ``true`` to randomly select part of features for each node
-
-   - set this to ``false`` to randomly select part of features for each tree (use the same sub features for each tree)
-
-   - **Note**: set this to ``true`` cannot speed up the training, but set this to ``false`` can speed up the training linearly
+- ``feature_fraction`` :raw-html:`🔗︎`, default = ``1.0``, type = double, aliases: ``sub_feature``, ``colsample_bytree``, constraints: ``0.0 < feature_fraction <= 1.0``
+
+   - LightGBM will randomly select part of features on each iteration (tree) if ``feature_fraction`` is smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
+
+   - can be used to speed up training
+
+   - can be used to deal with over-fitting
 
-- ``feature_fraction`` :raw-html:`🔗︎`, default = ``1.0``, type = double, aliases: ``sub_feature``, ``colsample_bytree``, constraints: ``0.0 < feature_fraction <= 1.0``
-
-   - LightGBM will randomly select part of features on each iteration if ``feature_fraction`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
-
-   - can be used to speed up training
+- ``feature_fraction_bynode`` :raw-html:`🔗︎`, default = ``1.0``, type = double, aliases: ``sub_feature_bynode``, ``colsample_bynode``, constraints: ``0.0 < feature_fraction_bynode <= 1.0``
+
+   - LightGBM will randomly select part of features on each tree node if ``feature_fraction_bynode`` is smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features at each tree node
 
    - can be used to deal with over-fitting
 
+   - **Note**: unlike ``feature_fraction``, this cannot speed up training
+
+   - **Note**: if both ``feature_fraction`` and ``feature_fraction_bynode`` are smaller than ``1.0``, the final fraction for each node is ``feature_fraction * feature_fraction_bynode``
+
 - ``feature_fraction_seed`` :raw-html:`🔗︎`, default = ``2``, type = int
 
    - random seed for ``feature_fraction``
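To make the combined behaviour concrete, here is a minimal usage sketch from the Python side (not part of the patch; the synthetic data and the specific fractions are only illustrations of the ``feature_fraction * feature_fraction_bynode`` product rule documented above):

```python
import numpy as np
import lightgbm as lgb

# Illustrative synthetic binary task.
rng = np.random.default_rng(42)
X = rng.normal(size=(500, 10))
y = (X[:, 0] + rng.normal(scale=0.1, size=500) > 0).astype(int)

params = {
    "objective": "binary",
    "feature_fraction": 0.8,         # sampled once per tree
    "feature_fraction_bynode": 0.5,  # re-sampled at every node, from the tree subset
    "verbose": -1,
}
# With both set, each node effectively considers about
# 0.8 * 0.5 = 40% of all features.
booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=10)
```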
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 3e1a6c4f0bd..b679aaed9c3 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -268,20 +268,23 @@ struct Config {
   // desc = random seed for bagging
   int bagging_seed = 3;
 
-  // alias = sub_feature_bynode, colsample_bytree_bynode
-  // desc = set this to ``true`` to randomly select part of features for each node
-  // desc = set this to ``false`` to randomly select part of features for each tree (use the same sub features for each tree)
-  // desc = **Note**: set this to ``true`` cannot speed up the training, but set this to ``false`` can speed up the training linearly
-  bool feature_fraction_bynode = false;
-
   // alias = sub_feature, colsample_bytree
   // check = >0.0
   // check = <=1.0
-  // desc = LightGBM will randomly select part of features on each iteration if ``feature_fraction`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
+  // desc = LightGBM will randomly select part of features on each iteration (tree) if ``feature_fraction`` is smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
   // desc = can be used to speed up training
   // desc = can be used to deal with over-fitting
   double feature_fraction = 1.0;
 
+  // alias = sub_feature_bynode, colsample_bynode
+  // check = >0.0
+  // check = <=1.0
+  // desc = LightGBM will randomly select part of features on each tree node if ``feature_fraction_bynode`` is smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features at each tree node
+  // desc = can be used to deal with over-fitting
+  // desc = **Note**: unlike ``feature_fraction``, this cannot speed up training
+  // desc = **Note**: if both ``feature_fraction`` and ``feature_fraction_bynode`` are smaller than ``1.0``, the final fraction for each node is ``feature_fraction * feature_fraction_bynode``
+  double feature_fraction_bynode = 1.0;
+
   // desc = random seed for ``feature_fraction``
   int feature_fraction_seed = 2;

diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index b2957cb6335..e6cc7a1cb96 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -66,10 +66,10 @@ std::unordered_map<std::string, std::string> Config::alias_table({
   {"neg_bagging", "neg_bagging_fraction"},
   {"subsample_freq", "bagging_freq"},
   {"bagging_fraction_seed", "bagging_seed"},
-  {"sub_feature_bynode", "feature_fraction_bynode"},
-  {"colsample_bytree_bynode", "feature_fraction_bynode"},
   {"sub_feature", "feature_fraction"},
   {"colsample_bytree", "feature_fraction"},
+  {"sub_feature_bynode", "feature_fraction_bynode"},
+  {"colsample_bynode", "feature_fraction_bynode"},
   {"early_stopping_rounds", "early_stopping_round"},
   {"early_stopping", "early_stopping_round"},
   {"max_tree_output", "max_delta_step"},
@@ -188,8 +188,8 @@ std::unordered_set<std::string> Config::parameter_set({
   "neg_bagging_fraction",
   "bagging_freq",
   "bagging_seed",
-  "feature_fraction_bynode",
   "feature_fraction",
+  "feature_fraction_bynode",
   "feature_fraction_seed",
   "early_stopping_round",
   "first_metric_only",
@@ -327,12 +327,14 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) {
   GetDouble(params, "feature_fraction", &feature_fraction);
   CHECK(feature_fraction >0.0);
   CHECK(feature_fraction <=1.0);
 
+  GetDouble(params, "feature_fraction_bynode", &feature_fraction_bynode);
+  CHECK(feature_fraction_bynode >0.0);
+  CHECK(feature_fraction_bynode <=1.0);
+
   GetInt(params, "feature_fraction_seed", &feature_fraction_seed);
 
   GetInt(params, "early_stopping_round", &early_stopping_round);
@@ -591,8 +593,8 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[neg_bagging_fraction: " << neg_bagging_fraction << "]\n";
   str_buf << "[bagging_freq: " << bagging_freq << "]\n";
   str_buf << "[bagging_seed: " << bagging_seed << "]\n";
-  str_buf << "[feature_fraction_bynode: " << feature_fraction_bynode << "]\n";
   str_buf << "[feature_fraction: " << feature_fraction << "]\n";
+  str_buf << "[feature_fraction_bynode: " << feature_fraction_bynode << "]\n";
   str_buf << "[feature_fraction_seed: " << feature_fraction_seed << "]\n";
   str_buf << "[early_stopping_round: " << early_stopping_round << "]\n";
   str_buf << "[first_metric_only: " << first_metric_only << "]\n";

diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp
index 1a705e53ac7..f74e432ecb3 100644
--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -169,9 +169,9 @@ void DataParallelTreeLearner<TREE_LEARNER_T>::FindBestSplitsFromHistograms(cons
   std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
   std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
-  if (this->config_->feature_fraction_bynode) {
-    smaller_node_used_features = this->GetUsedFeatures();
-    larger_node_used_features = this->GetUsedFeatures();
+  if (this->config_->feature_fraction_bynode < 1.0f) {
+    smaller_node_used_features = this->GetUsedFeatures(false);
+    larger_node_used_features = this->GetUsedFeatures(false);
   }
   OMP_INIT_EX();
 #pragma omp parallel for schedule(static)
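The reordered alias table above also registers ``colsample_bynode`` for the new parameter, matching the name XGBoost uses. A quick sketch of the alias from the Python side, assuming a LightGBM build that includes this patch (the regression data is a throwaway illustration):

```python
import numpy as np
import lightgbm as lgb

X = np.random.rand(200, 8)
y = np.random.rand(200)

# "colsample_bynode" resolves to "feature_fraction_bynode" through the
# alias_table entry added above.
booster = lgb.train(
    {"objective": "regression", "colsample_bynode": 0.5, "verbose": -1},
    lgb.Dataset(X, y),
    num_boost_round=5,
)
```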
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index e03f1cf44f2..ee09d77fc0a 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -104,8 +104,8 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
     }
   }
   Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
-  feature_used.clear();
-  feature_used.resize(train_data->num_features());
+  is_feature_used_in_split_.clear();
+  is_feature_used_in_split_.resize(train_data->num_features());
 
   if (!config_->cegb_penalty_feature_coupled.empty()) {
     CHECK(config_->cegb_penalty_feature_coupled.size() == static_cast<size_t>(train_data_->num_total_features()));
@@ -268,24 +268,51 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vect
   return FitByExistingTree(old_tree, gradients, hessians);
 }
 
-std::vector<int8_t> SerialTreeLearner::GetUsedFeatures() {
+std::vector<int8_t> SerialTreeLearner::GetUsedFeatures(bool is_tree_level) {
   std::vector<int8_t> ret(num_features_, 1);
-  if (config_->feature_fraction >= 1.0f) {
+  if (config_->feature_fraction >= 1.0f && is_tree_level) {
+    return ret;
+  }
+  if (config_->feature_fraction_bynode >= 1.0f && !is_tree_level) {
     return ret;
   }
-  int used_feature_cnt = static_cast<int>(valid_feature_indices_.size()*config_->feature_fraction);
-  // at least use one feature
-  used_feature_cnt = std::max(used_feature_cnt, 1);
-  // initialize used features
   std::memset(ret.data(), 0, sizeof(int8_t) * num_features_);
-  auto sampled_indices = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
-  int omp_loop_size = static_cast<int>(sampled_indices.size());
-  #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
-  for (int i = 0; i < omp_loop_size; ++i) {
-    int used_feature = valid_feature_indices_[sampled_indices[i]];
-    int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
-    CHECK(inner_feature_index >= 0);
-    ret[inner_feature_index] = 1;
+  if (is_tree_level) {
+    // tree-level sampling: remember the sampled subset for later node-level calls
+    int used_feature_cnt = static_cast<int>(valid_feature_indices_.size() * config_->feature_fraction);
+    used_feature_cnt = std::max(used_feature_cnt, 1);
+    used_feature_indices_ = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
+    int omp_loop_size = static_cast<int>(used_feature_indices_.size());
+    #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
+    for (int i = 0; i < omp_loop_size; ++i) {
+      int used_feature = valid_feature_indices_[used_feature_indices_[i]];
+      int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
+      CHECK(inner_feature_index >= 0);
+      ret[inner_feature_index] = 1;
+    }
+  } else if (used_feature_indices_.size() <= 0) {
+    // node-level sampling without tree-level sampling: sample from all valid features
+    int used_feature_cnt = static_cast<int>(valid_feature_indices_.size() * config_->feature_fraction_bynode);
+    used_feature_cnt = std::max(used_feature_cnt, 1);
+    auto sampled_indices = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
+    int omp_loop_size = static_cast<int>(sampled_indices.size());
+    #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
+    for (int i = 0; i < omp_loop_size; ++i) {
+      int used_feature = valid_feature_indices_[sampled_indices[i]];
+      int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
+      CHECK(inner_feature_index >= 0);
+      ret[inner_feature_index] = 1;
+    }
+  } else {
+    // node-level sampling on top of the stored tree-level subset
+    int used_feature_cnt = static_cast<int>(used_feature_indices_.size() * config_->feature_fraction_bynode);
+    used_feature_cnt = std::max(used_feature_cnt, 1);
+    auto sampled_indices = random_.Sample(static_cast<int>(used_feature_indices_.size()), used_feature_cnt);
+    int omp_loop_size = static_cast<int>(sampled_indices.size());
+    #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
+    for (int i = 0; i < omp_loop_size; ++i) {
+      int used_feature = valid_feature_indices_[used_feature_indices_[sampled_indices[i]]];
+      int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
+      CHECK(inner_feature_index >= 0);
+      ret[inner_feature_index] = 1;
+    }
   }
   return ret;
 }
@@ -294,8 +321,8 @@ void SerialTreeLearner::BeforeTrain() {
   // reset histogram pool
   histogram_pool_.ResetMap();
 
-  if (config_->feature_fraction < 1 && !config_->feature_fraction_bynode) {
-    is_feature_used_ = GetUsedFeatures();
+  if (config_->feature_fraction < 1.0f) {
+    is_feature_used_ = GetUsedFeatures(true);
   } else {
 #pragma omp parallel for schedule(static, 512) if (num_features_ >= 1024)
     for (int i = 0; i < num_features_; ++i) {
@@ -523,9 +550,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
   std::vector<SplitInfo> larger_best(num_threads_);
   std::vector<int8_t> smaller_node_used_features(num_features_, 1);
   std::vector<int8_t> larger_node_used_features(num_features_, 1);
-  if (config_->feature_fraction_bynode) {
-    smaller_node_used_features = GetUsedFeatures();
-    larger_node_used_features = GetUsedFeatures();
+  if (config_->feature_fraction_bynode < 1.0f) {
+    smaller_node_used_features = GetUsedFeatures(false);
+    larger_node_used_features = GetUsedFeatures(false);
   }
   OMP_INIT_EX();
   // find splits
@@ -549,7 +576,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                      &smaller_split);
     smaller_split.feature = real_fidx;
     smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * smaller_leaf_splits_->num_data_in_leaf();
-    if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]) {
+    if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
       smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
     }
     if (!config_->cegb_penalty_feature_lazy.empty()) {
@@ -580,7 +607,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                      &larger_split);
     larger_split.feature = real_fidx;
     larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * larger_leaf_splits_->num_data_in_leaf();
-    if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]) {
+    if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
       larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
     }
     if (!config_->cegb_penalty_feature_lazy.empty()) {
@@ -775,8 +802,8 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int*
 void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
   const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
   const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
-  if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[inner_feature_index]) {
-    feature_used[inner_feature_index] = true;
+  if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[inner_feature_index]) {
+    is_feature_used_in_split_[inner_feature_index] = true;
     for (int i = 0; i < tree->num_leaves(); ++i) {
       if (i == best_leaf) continue;
       auto split = &splits_per_leaf_[i*train_data_->num_features() + inner_feature_index];
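The three branches of ``GetUsedFeatures`` are easier to follow outside of C++. Below is a hedged Python sketch of the same control flow (simplified: the ``valid_feature_indices_`` remapping and the OpenMP loops are omitted, and all names are illustrative rather than LightGBM API):

```python
import numpy as np

def get_used_features(rng, n_features, tree_subset, is_tree_level,
                      feature_fraction, feature_fraction_bynode):
    """Return a 0/1 mask of usable features, mirroring the branches above."""
    mask = np.zeros(n_features, dtype=np.int8)
    if is_tree_level:
        # Tree-level call: sample once per tree and remember the subset
        # (plays the role of used_feature_indices_).
        cnt = max(int(n_features * feature_fraction), 1)
        tree_subset[:] = rng.choice(n_features, size=cnt, replace=False)
        chosen = tree_subset
    elif len(tree_subset) == 0:
        # Node-level call without tree-level sampling: draw from all features.
        cnt = max(int(n_features * feature_fraction_bynode), 1)
        chosen = rng.choice(n_features, size=cnt, replace=False)
    else:
        # Node-level call on top of tree-level sampling: draw from the stored
        # subset, so the effective fraction is the product of the two.
        cnt = max(int(len(tree_subset) * feature_fraction_bynode), 1)
        chosen = rng.choice(tree_subset, size=cnt, replace=False)
    mask[np.asarray(chosen, dtype=int)] = 1
    return mask

rng = np.random.default_rng(2)
tree_subset = []
per_tree = get_used_features(rng, 10, tree_subset, True, 0.8, 0.5)   # BeforeTrain
per_node = get_used_features(rng, 10, tree_subset, False, 0.8, 0.5)  # each node
assert per_node.sum() <= per_tree.sum()
```

A tree-level call stores the sampled subset; subsequent node-level calls re-sample from that stored subset, which is why the effective per-node fraction is the product of the two parameters.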
diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h
index 279b1fd4a68..b5cb3e5e613 100644
--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -79,7 +79,7 @@ class SerialTreeLearner: public TreeLearner {
  protected:
-  virtual std::vector<int8_t> GetUsedFeatures();
+  virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
 
   /*!
   * \brief Some initial works before training
   */
@@ -135,6 +135,8 @@ class SerialTreeLearner: public TreeLearner {
   Random random_;
   /*! \brief used for sub feature training, is_feature_used_[i] = false means don't use feature i */
   std::vector<int8_t> is_feature_used_;
+  /*! \brief used feature indices in current tree */
+  std::vector<int> used_feature_indices_;
   /*! \brief pointer to histograms array of parent of current leaves */
   FeatureHistogram* parent_leaf_histogram_array_;
   /*! \brief pointer to histograms array of smaller leaf */
@@ -179,7 +181,7 @@ class SerialTreeLearner: public TreeLearner {
   std::vector<int> ordered_bin_indices_;
   bool is_constant_hessian_;
 
-  std::vector<bool> feature_used;
+  std::vector<bool> is_feature_used_in_split_;
   std::vector<uint32_t> feature_used_in_data;
 };

diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp
index 978f2b18e64..e4936e21b53 100644
--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -379,9 +379,9 @@ void VotingParallelTreeLearner<TREE_LEARNER_T>::FindBestSplitsFromHistograms(cons
   std::vector<SplitInfo> smaller_best_per_thread(this->num_threads_);
   std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
   std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
-  if (this->config_->feature_fraction_bynode) {
-    smaller_node_used_features = this->GetUsedFeatures();
-    larger_node_used_features = this->GetUsedFeatures();
+  if (this->config_->feature_fraction_bynode < 1.0f) {
+    smaller_node_used_features = this->GetUsedFeatures(false);
+    larger_node_used_features = this->GetUsedFeatures(false);
   }
   // find best split from local aggregated histograms

diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 63f1468132a..3150d62985f 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1591,8 +1591,8 @@ def test_node_level_subcol(self):
         params = {
             'objective': 'binary',
             'metric': 'binary_logloss',
-            'feature_fraction': 0.8,
-            'feature_fraction_bynode': True,
+            'feature_fraction_bynode': 0.8,
+            'feature_fraction': 1.0,
             'verbose': -1
         }
         lgb_train = lgb.Dataset(X_train, y_train)
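For reference, the updated test corresponds to a standalone run like the following (the dataset, split, and round count are stand-ins for the fixture the real test uses, not copied from it):

```python
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=7)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'feature_fraction_bynode': 0.8,
    'feature_fraction': 1.0,
    'verbose': -1
}
gbm = lgb.train(params, lgb.Dataset(X_train, y_train), num_boost_round=25)
print(log_loss(y_test, gbm.predict(X_test)))
```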