update feature_fraction_bynode #2381

Merged: 4 commits, Sep 12, 2019
18 changes: 10 additions & 8 deletions docs/Parameters.rst
@@ -254,22 +254,24 @@ Learning Control Parameters

   - random seed for bagging

-- ``feature_fraction_bynode`` :raw-html:`<a id="feature_fraction_bynode" title="Permalink to this parameter" href="#feature_fraction_bynode">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool, aliases: ``sub_feature_bynode``, ``colsample_bytree_bynode``
-
-  - set this to ``true`` to randomly select part of features for each node
-
-  - set this to ``false`` to randomly select part of features for each tree (use the same sub features for each tree)
-
-  - **Note**: set this to ``true`` cannot speed up the training, but set this to ``false`` can speed up the training linearly
-
 - ``feature_fraction`` :raw-html:`<a id="feature_fraction" title="Permalink to this parameter" href="#feature_fraction">&#x1F517;&#xFE0E;</a>`, default = ``1.0``, type = double, aliases: ``sub_feature``, ``colsample_bytree``, constraints: ``0.0 < feature_fraction <= 1.0``

-  - LightGBM will randomly select part of features on each iteration if ``feature_fraction`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
+  - LightGBM will randomly select part of features on each iteration (tree) if ``feature_fraction`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree

   - can be used to speed up training

   - can be used to deal with over-fitting

+- ``feature_fraction_bynode`` :raw-html:`<a id="feature_fraction_bynode" title="Permalink to this parameter" href="#feature_fraction_bynode">&#x1F517;&#xFE0E;</a>`, default = ``1.0``, type = double, aliases: ``sub_feature_bynode``, ``colsample_bynode``, constraints: ``0.0 < feature_fraction_bynode <= 1.0``
+
+  - LightGBM will randomly select part of features on each tree node if ``feature_fraction_bynode`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features at each tree node.
+
+  - can be used to deal with over-fitting
+
+  - **Note**: unlike ``feature_fraction``, this cannot speed up training
+
+  - **Note**: if both ``feature_fraction`` and ``feature_fraction_bynode`` are smaller than ``1.0``, the final fraction of each node is ``feature_fraction * feature_fraction_bynode``
+
 - ``feature_fraction_seed`` :raw-html:`<a id="feature_fraction_seed" title="Permalink to this parameter" href="#feature_fraction_seed">&#x1F517;&#xFE0E;</a>`, default = ``2``, type = int

   - random seed for ``feature_fraction``
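For readers of the updated docs above, a minimal usage sketch from the Python package (the dataset and values here are illustrative, not from the PR): with ``feature_fraction = 0.8`` and ``feature_fraction_bynode = 0.8``, each split effectively considers 0.8 * 0.8 = 64% of the features.

    import numpy as np
    import lightgbm as lgb

    # toy binary-classification data (illustrative only)
    X = np.random.rand(500, 10)
    y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

    params = {
        'objective': 'binary',
        'feature_fraction': 0.8,         # sampled once per tree
        'feature_fraction_bynode': 0.8,  # re-sampled at every node, from the tree-level subset
        'verbose': -1,
    }
    # effective per-node fraction: 0.8 * 0.8 = 0.64
    booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=10)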
17 changes: 10 additions & 7 deletions include/LightGBM/config.h
@@ -268,20 +268,23 @@ struct Config {
   // desc = random seed for bagging
   int bagging_seed = 3;

-  // alias = sub_feature_bynode, colsample_bytree_bynode
-  // desc = set this to ``true`` to randomly select part of features for each node
-  // desc = set this to ``false`` to randomly select part of features for each tree (use the same sub features for each tree)
-  // desc = **Note**: set this to ``true`` cannot speed up the training, but set this to ``false`` can speed up the training linearly
-  bool feature_fraction_bynode = false;
-
   // alias = sub_feature, colsample_bytree
   // check = >0.0
   // check = <=1.0
-  // desc = LightGBM will randomly select part of features on each iteration if ``feature_fraction`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
+  // desc = LightGBM will randomly select part of features on each iteration (tree) if ``feature_fraction`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
   // desc = can be used to speed up training
   // desc = can be used to deal with over-fitting
   double feature_fraction = 1.0;

+  // alias = sub_feature_bynode, colsample_bynode
+  // check = >0.0
+  // check = <=1.0
+  // desc = LightGBM will randomly select part of features on each tree node if ``feature_fraction_bynode`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features at each tree node.
+  // desc = can be used to deal with over-fitting
+  // desc = **Note**: unlike ``feature_fraction``, this cannot speed up training
+  // desc = **Note**: if both ``feature_fraction`` and ``feature_fraction_bynode`` are smaller than ``1.0``, the final fraction of each node is ``feature_fraction * feature_fraction_bynode``
+  double feature_fraction_bynode = 1.0;
+
   // desc = random seed for ``feature_fraction``
   int feature_fraction_seed = 2;
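Because ``feature_fraction_bynode`` is now a constrained double instead of a bool, the ``check`` lines above reject out-of-range values at parameter-parsing time. A hedged sketch of how that surfaces in Python (assuming, as with other failed checks, it is raised as ``lightgbm.basic.LightGBMError``):

    import numpy as np
    import lightgbm as lgb

    data = lgb.Dataset(np.random.rand(100, 5), np.random.randint(0, 2, 100))
    try:
        # 0.0 violates the ``> 0.0`` check on feature_fraction_bynode
        lgb.train({'feature_fraction_bynode': 0.0, 'verbose': -1}, data, num_boost_round=1)
    except lgb.basic.LightGBMError as err:
        print('rejected:', err)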
14 changes: 8 additions & 6 deletions src/io/config_auto.cpp
@@ -66,10 +66,10 @@ std::unordered_map<std::string, std::string> Config::alias_table({
   {"neg_bagging", "neg_bagging_fraction"},
   {"subsample_freq", "bagging_freq"},
   {"bagging_fraction_seed", "bagging_seed"},
-  {"sub_feature_bynode", "feature_fraction_bynode"},
-  {"colsample_bytree_bynode", "feature_fraction_bynode"},
   {"sub_feature", "feature_fraction"},
   {"colsample_bytree", "feature_fraction"},
+  {"sub_feature_bynode", "feature_fraction_bynode"},
+  {"colsample_bynode", "feature_fraction_bynode"},
   {"early_stopping_rounds", "early_stopping_round"},
   {"early_stopping", "early_stopping_round"},
   {"max_tree_output", "max_delta_step"},
@@ -188,8 +188,8 @@ std::unordered_set<std::string> Config::parameter_set({
   "neg_bagging_fraction",
   "bagging_freq",
   "bagging_seed",
-  "feature_fraction_bynode",
   "feature_fraction",
+  "feature_fraction_bynode",
   "feature_fraction_seed",
   "early_stopping_round",
   "first_metric_only",
@@ -327,12 +327,14 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str

   GetInt(params, "bagging_seed", &bagging_seed);

-  GetBool(params, "feature_fraction_bynode", &feature_fraction_bynode);
-
   GetDouble(params, "feature_fraction", &feature_fraction);
   CHECK(feature_fraction >0.0);
   CHECK(feature_fraction <=1.0);

+  GetDouble(params, "feature_fraction_bynode", &feature_fraction_bynode);
+  CHECK(feature_fraction_bynode >0.0);
+  CHECK(feature_fraction_bynode <=1.0);
+
   GetInt(params, "feature_fraction_seed", &feature_fraction_seed);

   GetInt(params, "early_stopping_round", &early_stopping_round);
@@ -591,8 +593,8 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[neg_bagging_fraction: " << neg_bagging_fraction << "]\n";
   str_buf << "[bagging_freq: " << bagging_freq << "]\n";
   str_buf << "[bagging_seed: " << bagging_seed << "]\n";
-  str_buf << "[feature_fraction_bynode: " << feature_fraction_bynode << "]\n";
   str_buf << "[feature_fraction: " << feature_fraction << "]\n";
+  str_buf << "[feature_fraction_bynode: " << feature_fraction_bynode << "]\n";
   str_buf << "[feature_fraction_seed: " << feature_fraction_seed << "]\n";
   str_buf << "[early_stopping_round: " << early_stopping_round << "]\n";
   str_buf << "[first_metric_only: " << first_metric_only << "]\n";
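Note the alias rename in ``alias_table``: ``colsample_bytree_bynode`` is replaced by ``colsample_bynode``, matching XGBoost's parameter name. Under the post-PR table, these two parameter dicts resolve to the same canonical setting:

    # equivalent ways to request 80% of features per node
    params_canonical = {'feature_fraction_bynode': 0.8}
    params_xgb_style = {'colsample_bynode': 0.8}  # resolved via Config::alias_table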
6 changes: 3 additions & 3 deletions src/treelearner/data_parallel_tree_learner.cpp
@@ -169,9 +169,9 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
   std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
   std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
-  if (this->config_->feature_fraction_bynode) {
-    smaller_node_used_features = this->GetUsedFeatures();
-    larger_node_used_features = this->GetUsedFeatures();
+  if (this->config_->feature_fraction_bynode < 1.0f) {
+    smaller_node_used_features = this->GetUsedFeatures(false);
+    larger_node_used_features = this->GetUsedFeatures(false);
   }
   OMP_INIT_EX();
   #pragma omp parallel for schedule(static)
77 changes: 52 additions & 25 deletions src/treelearner/serial_tree_learner.cpp
@@ -104,8 +104,8 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
     }
   }
   Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
-  feature_used.clear();
-  feature_used.resize(train_data->num_features());
+  is_feature_used_in_split_.clear();
+  is_feature_used_in_split_.resize(train_data->num_features());

   if (!config_->cegb_penalty_feature_coupled.empty()) {
     CHECK(config_->cegb_penalty_feature_coupled.size() == static_cast<size_t>(train_data_->num_total_features()));
@@ -268,24 +268,51 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vect
   return FitByExistingTree(old_tree, gradients, hessians);
 }

-std::vector<int8_t> SerialTreeLearner::GetUsedFeatures() {
+std::vector<int8_t> SerialTreeLearner::GetUsedFeatures(bool is_tree_level) {
   std::vector<int8_t> ret(num_features_, 1);
-  if (config_->feature_fraction >= 1.0f) {
+  if (config_->feature_fraction >= 1.0f && is_tree_level) {
     return ret;
   }
+  if (config_->feature_fraction_bynode >= 1.0f && !is_tree_level) {
+    return ret;
+  }
-  int used_feature_cnt = static_cast<int>(valid_feature_indices_.size()*config_->feature_fraction);
-  // at least use one feature
-  used_feature_cnt = std::max(used_feature_cnt, 1);
   // initialize used features
   std::memset(ret.data(), 0, sizeof(int8_t) * num_features_);
-  auto sampled_indices = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
-  int omp_loop_size = static_cast<int>(sampled_indices.size());
-  #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
-  for (int i = 0; i < omp_loop_size; ++i) {
-    int used_feature = valid_feature_indices_[sampled_indices[i]];
-    int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
-    CHECK(inner_feature_index >= 0);
-    ret[inner_feature_index] = 1;
+  if (is_tree_level) {
+    int used_feature_cnt = static_cast<int>(valid_feature_indices_.size() * config_->feature_fraction);
+    used_feature_cnt = std::max(used_feature_cnt, 1);
+    used_feature_indices_ = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
+    int omp_loop_size = static_cast<int>(used_feature_indices_.size());
+    #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
+    for (int i = 0; i < omp_loop_size; ++i) {
+      int used_feature = valid_feature_indices_[used_feature_indices_[i]];
+      int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
+      CHECK(inner_feature_index >= 0);
+      ret[inner_feature_index] = 1;
+    }
+  } else if (used_feature_indices_.size() <= 0) {
+    int used_feature_cnt = static_cast<int>(valid_feature_indices_.size() * config_->feature_fraction_bynode);
+    used_feature_cnt = std::max(used_feature_cnt, 1);
+    auto sampled_indices = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
+    int omp_loop_size = static_cast<int>(sampled_indices.size());
+    #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
+    for (int i = 0; i < omp_loop_size; ++i) {
+      int used_feature = valid_feature_indices_[sampled_indices[i]];
+      int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
+      CHECK(inner_feature_index >= 0);
+      ret[inner_feature_index] = 1;
+    }
+  } else {
+    int used_feature_cnt = static_cast<int>(used_feature_indices_.size() * config_->feature_fraction_bynode);
+    used_feature_cnt = std::max(used_feature_cnt, 1);
+    auto sampled_indices = random_.Sample(static_cast<int>(used_feature_indices_.size()), used_feature_cnt);
+    int omp_loop_size = static_cast<int>(sampled_indices.size());
+    #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
+    for (int i = 0; i < omp_loop_size; ++i) {
+      int used_feature = valid_feature_indices_[used_feature_indices_[sampled_indices[i]]];
+      int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
+      CHECK(inner_feature_index >= 0);
+      ret[inner_feature_index] = 1;
+    }
   }
   return ret;
 }
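The three branches of the new ``GetUsedFeatures(bool is_tree_level)`` boil down to: (1) a tree-level call samples from all valid features and caches the drawn indices in ``used_feature_indices_``; (2) a node-level call with no cached tree subset (``feature_fraction`` is 1.0) samples from all valid features; (3) a node-level call on top of a tree-level draw samples from the cached subset, which yields the documented effective fraction ``feature_fraction * feature_fraction_bynode``. A minimal Python sketch of the same cascade (names are illustrative, not LightGBM API):

    import random

    def get_used_features(valid_features, state, feature_fraction,
                          feature_fraction_bynode, is_tree_level):
        # Returns sampled feature ids; the C++ code returns a 0/1 mask instead.
        if is_tree_level:
            # Per-tree draw; cache it so per-node draws can reuse the subset.
            cnt = max(int(len(valid_features) * feature_fraction), 1)
            state['tree_subset'] = random.sample(valid_features, cnt)
            return state['tree_subset']
        if not state.get('tree_subset'):
            # Per-node draw with no tree-level sampling: draw from all features.
            cnt = max(int(len(valid_features) * feature_fraction_bynode), 1)
            return random.sample(valid_features, cnt)
        # Per-node draw on top of a per-tree draw: sample from the cached subset,
        # so the effective fraction is feature_fraction * feature_fraction_bynode.
        subset = state['tree_subset']
        cnt = max(int(len(subset) * feature_fraction_bynode), 1)
        return random.sample(subset, cnt)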
@@ -294,8 +321,8 @@ void SerialTreeLearner::BeforeTrain() {
   // reset histogram pool
   histogram_pool_.ResetMap();

-  if (config_->feature_fraction < 1 && !config_->feature_fraction_bynode) {
-    is_feature_used_ = GetUsedFeatures();
+  if (config_->feature_fraction < 1.0f) {
+    is_feature_used_ = GetUsedFeatures(true);
   } else {
     #pragma omp parallel for schedule(static, 512) if (num_features_ >= 1024)
     for (int i = 0; i < num_features_; ++i) {
@@ -523,9 +550,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
   std::vector<SplitInfo> larger_best(num_threads_);
   std::vector<int8_t> smaller_node_used_features(num_features_, 1);
   std::vector<int8_t> larger_node_used_features(num_features_, 1);
-  if (config_->feature_fraction_bynode) {
-    smaller_node_used_features = GetUsedFeatures();
-    larger_node_used_features = GetUsedFeatures();
+  if (config_->feature_fraction_bynode < 1.0f) {
+    smaller_node_used_features = GetUsedFeatures(false);
+    larger_node_used_features = GetUsedFeatures(false);
   }
   OMP_INIT_EX();
   // find splits
@@ -549,7 +576,7 @@
                                                  &smaller_split);
       smaller_split.feature = real_fidx;
       smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * smaller_leaf_splits_->num_data_in_leaf();
-      if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]) {
+      if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
         smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
       }
       if (!config_->cegb_penalty_feature_lazy.empty()) {
@@ -580,7 +607,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                                                  &larger_split);
       larger_split.feature = real_fidx;
       larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * larger_leaf_splits_->num_data_in_leaf();
-      if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]) {
+      if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[feature_index]) {
         larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
       }
       if (!config_->cegb_penalty_feature_lazy.empty()) {
@@ -775,8 +802,8 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int*
 void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
   const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
   const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
-  if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[inner_feature_index]) {
-    feature_used[inner_feature_index] = true;
+  if (!config_->cegb_penalty_feature_coupled.empty() && !is_feature_used_in_split_[inner_feature_index]) {
+    is_feature_used_in_split_[inner_feature_index] = true;
     for (int i = 0; i < tree->num_leaves(); ++i) {
       if (i == best_leaf) continue;
       auto split = &splits_per_leaf_[i*train_data_->num_features() + inner_feature_index];
6 changes: 4 additions & 2 deletions src/treelearner/serial_tree_learner.h
@@ -79,7 +79,7 @@ class SerialTreeLearner: public TreeLearner {

  protected:

-  virtual std::vector<int8_t> GetUsedFeatures();
+  virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
   /*!
   * \brief Some initial works before training
   */
@@ -135,6 +135,8 @@ class SerialTreeLearner: public TreeLearner {
   Random random_;
   /*! \brief used for sub feature training, is_feature_used_[i] = false means don't used feature i */
   std::vector<int8_t> is_feature_used_;
+  /*! \brief used feature indices in current tree */
+  std::vector<int> used_feature_indices_;
   /*! \brief pointer to histograms array of parent of current leaves */
   FeatureHistogram* parent_leaf_histogram_array_;
   /*! \brief pointer to histograms array of smaller leaf */
@@ -179,7 +181,7 @@ class SerialTreeLearner: public TreeLearner {
   std::vector<int> ordered_bin_indices_;
   bool is_constant_hessian_;

-  std::vector<bool> feature_used;
+  std::vector<bool> is_feature_used_in_split_;
   std::vector<uint32_t> feature_used_in_data;
 };
6 changes: 3 additions & 3 deletions src/treelearner/voting_parallel_tree_learner.cpp
@@ -379,9 +379,9 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
   std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
   std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
   std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
-  if (this->config_->feature_fraction_bynode) {
-    smaller_node_used_features = this->GetUsedFeatures();
-    larger_node_used_features = this->GetUsedFeatures();
+  if (this->config_->feature_fraction_bynode < 1.0f) {
+    smaller_node_used_features = this->GetUsedFeatures(false);
+    larger_node_used_features = this->GetUsedFeatures(false);
   }
   // find best split from local aggregated histograms
4 changes: 2 additions & 2 deletions tests/python_package_test/test_engine.py
@@ -1591,8 +1591,8 @@ def test_node_level_subcol(self):
         params = {
             'objective': 'binary',
             'metric': 'binary_logloss',
-            'feature_fraction': 0.8,
-            'feature_fraction_bynode': True,
+            'feature_fraction_bynode': 0.8,
+            'feature_fraction': 1.0,
             'verbose': -1
         }
         lgb_train = lgb.Dataset(X_train, y_train)