diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 0b287fdc65f..42fb3febc70 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -254,6 +254,14 @@ Learning Control Parameters
 
    -  random seed for bagging
 
+-  ``feature_fraction_bynode`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``sub_feature_bynode``, ``colsample_bytree_bynode``
+
+   -  set this to ``true`` to randomly select a subset of features at each tree node; only has an effect when ``feature_fraction`` is smaller than ``1.0``
+
+   -  set this to ``false`` to randomly select a subset of features once per tree (the same subset is then used at every node of that tree)
+
+   -  **Note**: setting this to ``true`` cannot speed up the training, since histograms are still constructed for all features; setting it to ``false`` can reduce the training time roughly in proportion to ``feature_fraction``
+
 -  ``feature_fraction`` :raw-html:`🔗︎`, default = ``1.0``, type = double, aliases: ``sub_feature``, ``colsample_bytree``, constraints: ``0.0 < feature_fraction <= 1.0``
 
    -  LightGBM will randomly select part of features on each iteration if ``feature_fraction`` smaller than ``1.0``. For example, if you set it to ``0.8``, LightGBM will select 80% of features before training each tree
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 5e92582337e..ad341c1d8be 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -268,6 +268,12 @@ struct Config {
   // desc = random seed for bagging
   int bagging_seed = 3;
 
+  // alias = sub_feature_bynode, colsample_bytree_bynode
+  // desc = set this to ``true`` to randomly select a subset of features at each tree node; only has an effect when ``feature_fraction`` is smaller than ``1.0``
+  // desc = set this to ``false`` to randomly select a subset of features once per tree (the same subset is then used at every node of that tree)
+  // desc = **Note**: setting this to ``true`` cannot speed up the training, since histograms are still constructed for all features; setting it to ``false`` can reduce the training time roughly in proportion to ``feature_fraction``
+  bool feature_fraction_bynode = false;
+
   // alias = sub_feature, colsample_bytree
   // check = >0.0
   // check = <=1.0
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index d598dc907e3..b2957cb6335 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -66,6 +66,8 @@ std::unordered_map<std::string, std::string> Config::alias_table({
   {"neg_bagging", "neg_bagging_fraction"},
   {"subsample_freq", "bagging_freq"},
   {"bagging_fraction_seed", "bagging_seed"},
+  {"sub_feature_bynode", "feature_fraction_bynode"},
+  {"colsample_bytree_bynode", "feature_fraction_bynode"},
   {"sub_feature", "feature_fraction"},
   {"colsample_bytree", "feature_fraction"},
   {"early_stopping_rounds", "early_stopping_round"},
@@ -186,6 +188,7 @@ std::unordered_set<std::string> Config::parameter_set({
   "neg_bagging_fraction",
   "bagging_freq",
   "bagging_seed",
+  "feature_fraction_bynode",
   "feature_fraction",
   "feature_fraction_seed",
   "early_stopping_round",
@@ -324,6 +327,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::st
 
   GetInt(params, "bagging_seed", &bagging_seed);
 
+  GetBool(params, "feature_fraction_bynode", &feature_fraction_bynode);
+
   GetDouble(params, "feature_fraction", &feature_fraction);
   CHECK(feature_fraction >0.0);
   CHECK(feature_fraction <=1.0);
@@ -586,6 +591,7 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[neg_bagging_fraction: " << neg_bagging_fraction << "]\n";
   str_buf << "[bagging_freq: " << bagging_freq << "]\n";
   str_buf << "[bagging_seed: " << bagging_seed << "]\n";
+  str_buf << "[feature_fraction_bynode: " << feature_fraction_bynode << "]\n";
   str_buf << "[feature_fraction: " << feature_fraction << "]\n";
   str_buf << "[feature_fraction_seed: " << feature_fraction_seed << "]\n";
   str_buf << "[early_stopping_round: " << early_stopping_round << "]\n";
diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp
index a1fa299bded..1a705e53ac7 100644
--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -167,7 +167,12 @@ template <typename TREELEARNER_T>
 void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
   std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo());
   std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
-
+  std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
+  std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
+  if (this->config_->feature_fraction_bynode) {
+    smaller_node_used_features = this->GetUsedFeatures();
+    larger_node_used_features = this->GetUsedFeatures();
+  }
   OMP_INIT_EX();
   #pragma omp parallel for schedule(static)
   for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
@@ -193,7 +198,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
         this->smaller_leaf_splits_->max_constraint(),
         &smaller_split);
     smaller_split.feature = real_feature_index;
-    if (smaller_split > smaller_bests_per_thread[tid]) {
+    if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) {
       smaller_bests_per_thread[tid] = smaller_split;
     }
 
@@ -213,7 +218,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
         this->larger_leaf_splits_->max_constraint(),
         &larger_split);
     larger_split.feature = real_feature_index;
-    if (larger_split > larger_bests_per_thread[tid]) {
+    if (larger_split > larger_bests_per_thread[tid] && larger_node_used_features[feature_index]) {
      larger_bests_per_thread[tid] = larger_split;
     }
     OMP_LOOP_EX_END();
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index f37be94abb1..e03f1cf44f2 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -268,26 +268,34 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vect
   return FitByExistingTree(old_tree, gradients, hessians);
 }
 
+std::vector<int8_t> SerialTreeLearner::GetUsedFeatures() {
+  std::vector<int8_t> ret(num_features_, 1);
+  if (config_->feature_fraction >= 1.0f) {
+    return ret;
+  }
+  int used_feature_cnt = static_cast<int>(valid_feature_indices_.size()*config_->feature_fraction);
+  // at least use one feature
+  used_feature_cnt = std::max(used_feature_cnt, 1);
+  // initialize used features
+  std::memset(ret.data(), 0, sizeof(int8_t) * num_features_);
+  auto sampled_indices = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
+  int omp_loop_size = static_cast<int>(sampled_indices.size());
+  #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
+  for (int i = 0; i < omp_loop_size; ++i) {
+    int used_feature = valid_feature_indices_[sampled_indices[i]];
+    int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
+    CHECK(inner_feature_index >= 0);
+    ret[inner_feature_index] = 1;
+  }
+  return ret;
+}
+
 void SerialTreeLearner::BeforeTrain() {
   // reset histogram pool
   histogram_pool_.ResetMap();
 
-  if (config_->feature_fraction < 1) {
-    int used_feature_cnt = static_cast<int>(valid_feature_indices_.size()*config_->feature_fraction);
-    // at least use one feature
-    used_feature_cnt = std::max(used_feature_cnt, 1);
-    // initialize used features
-    std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
-    // Get used feature at current tree
-    auto sampled_indices = random_.Sample(static_cast<int>(valid_feature_indices_.size()), used_feature_cnt);
-    int omp_loop_size = static_cast<int>(sampled_indices.size());
-    #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024)
-    for (int i = 0; i < omp_loop_size; ++i) {
-      int used_feature = valid_feature_indices_[sampled_indices[i]];
-      int inner_feature_index = train_data_->InnerFeatureIndex(used_feature);
-      CHECK(inner_feature_index >= 0);
-      is_feature_used_[inner_feature_index] = 1;
-    }
+  if (config_->feature_fraction < 1 && !config_->feature_fraction_bynode) {
+    is_feature_used_ = GetUsedFeatures();
   } else {
 #pragma omp parallel for schedule(static, 512) if (num_features_ >= 1024)
     for (int i = 0; i < num_features_; ++i) {
@@ -513,6 +521,12 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
 #endif
   std::vector<SplitInfo> smaller_best(num_threads_);
   std::vector<SplitInfo> larger_best(num_threads_);
+  std::vector<int8_t> smaller_node_used_features(num_features_, 1);
+  std::vector<int8_t> larger_node_used_features(num_features_, 1);
+  if (config_->feature_fraction_bynode) {
+    smaller_node_used_features = GetUsedFeatures();
+    larger_node_used_features = GetUsedFeatures();
+  }
   OMP_INIT_EX();
   // find splits
   #pragma omp parallel for schedule(static)
@@ -542,7 +556,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
       smaller_split.gain -= config_->cegb_tradeoff * CalculateOndemandCosts(real_fidx, smaller_leaf_splits_->LeafIndex());
     }
     splits_per_leaf_[smaller_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = smaller_split;
-    if (smaller_split > smaller_best[tid]) {
+    if (smaller_split > smaller_best[tid] && smaller_node_used_features[feature_index]) {
       smaller_best[tid] = smaller_split;
     }
     // only has root leaf
@@ -573,7 +587,7 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
      larger_split.gain -= config_->cegb_tradeoff*CalculateOndemandCosts(real_fidx, larger_leaf_splits_->LeafIndex());
     }
     splits_per_leaf_[larger_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = larger_split;
-    if (larger_split > larger_best[tid]) {
+    if (larger_split > larger_best[tid] && larger_node_used_features[feature_index]) {
       larger_best[tid] = larger_split;
     }
     OMP_LOOP_EX_END();
diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h
index 52feee1daf1..279b1fd4a68 100644
--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -78,6 +78,8 @@ class SerialTreeLearner: public TreeLearner {
                        data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
 
  protected:
+
+  virtual std::vector<int8_t> GetUsedFeatures();
   /*!
   * \brief Some initial works before training
   */
diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp
index f1c35e71f31..978f2b18e64 100644
--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -377,6 +377,12 @@ template <typename TREELEARNER_T>
 void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
   std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
   std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
+  std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
+  std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
+  if (this->config_->feature_fraction_bynode) {
+    smaller_node_used_features = this->GetUsedFeatures();
+    larger_node_used_features = this->GetUsedFeatures();
+  }
 
   // find best split from local aggregated histograms
   OMP_INIT_EX();
@@ -405,7 +411,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
         smaller_leaf_splits_global_->max_constraint(),
         &smaller_split);
       smaller_split.feature = real_feature_index;
-      if (smaller_split > smaller_bests_per_thread[tid]) {
+      if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) {
         smaller_bests_per_thread[tid] = smaller_split;
       }
     }
@@ -429,7 +435,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(cons
         larger_leaf_splits_global_->max_constraint(),
         &larger_split);
       larger_split.feature = real_feature_index;
-      if (larger_split > larger_best_per_thread[tid]) {
+      if (larger_split > larger_best_per_thread[tid] && larger_node_used_features[feature_index]) {
         larger_best_per_thread[tid] = larger_split;
       }
     }
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index d7448d053a7..63f1468132a 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1584,3 +1584,30 @@ def constant_metric(preds, train_data):
                                      decreasing_metric(preds, train_data)],
                          early_stopping_rounds=5, verbose_eval=False)
         self.assertEqual(gbm.best_iteration, 1)
+
+    def test_node_level_subcol(self):
+        X, y = load_breast_cancer(True)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+        params = {
+            'objective': 'binary',
+            'metric': 'binary_logloss',
+            'feature_fraction': 0.8,
+            'feature_fraction_bynode': True,
+            'verbose': -1
+        }
+        lgb_train = lgb.Dataset(X_train, y_train)
+        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=25,
+                        valid_sets=lgb_eval,
+                        verbose_eval=False,
+                        evals_result=evals_result)
+        ret = log_loss(y_test, gbm.predict(X_test))
+        self.assertLess(ret, 0.13)
+        self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
+        params['feature_fraction'] = 0.5
+        gbm2 = lgb.train(params, lgb_train,
+                         num_boost_round=25)
+        ret2 = log_loss(y_test, gbm2.predict(X_test))
+        self.assertNotEqual(ret, ret2)
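
Usage sketch (illustrative, not part of the patch): assuming the patched library is built and installed as the Python package, the new parameter is passed like any other training parameter. The parameter names mirror the added test; the dataset, the 0.8 fraction, and the round count are arbitrary choices for illustration.

    # illustrative only: per-node feature sampling on top of feature_fraction
    import lightgbm as lgb
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True)
    params = {
        'objective': 'binary',
        'feature_fraction': 0.8,          # fraction of features to sample
        'feature_fraction_bynode': True,  # resample at every node instead of once per tree
        'verbose': -1,
    }
    booster = lgb.train(params, lgb.Dataset(X, y), num_boost_round=25)

Note that per-node sampling only restricts which candidate splits are allowed to win during split finding; histograms are still built for every feature, which is why the parameter description says it cannot speed up the training.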
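For readers skimming the C++ changes, the sampling logic of SerialTreeLearner::GetUsedFeatures can be summarized as: draw max(1, floor(len(valid_features) * feature_fraction)) feature indices without replacement and return a 0/1 mask over all features. With feature_fraction_bynode enabled, the mask is drawn twice per FindBestSplitsFromHistograms call (once for the smaller leaf, once for the larger); otherwise it is drawn once per tree in BeforeTrain. The Python sketch below is my own rendering under that reading, with the function name invented here and the InnerFeatureIndex remapping simplified to direct indexing.

    import random

    def get_used_features(valid_feature_indices, num_features, feature_fraction, rng=random):
        # no sampling requested: every feature stays usable
        mask = [1] * num_features
        if feature_fraction >= 1.0:
            return mask
        # sample at least one feature, without replacement
        used_cnt = max(1, int(len(valid_feature_indices) * feature_fraction))
        mask = [0] * num_features
        for pos in rng.sample(range(len(valid_feature_indices)), used_cnt):
            mask[valid_feature_indices[pos]] = 1
        return mask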