don't save num_thread where possible (#2839)
* don't cache `num_thread`, so that changes made from outside take effect

* rename

* update documentation

* Update docs/Parameters.rst

* Update include/LightGBM/config.h

* Apply suggestions from code review

Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>

* Apply suggestions from code review

Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
guolinke and StrikerRUS committed Mar 2, 2020
1 parent 5a80b78 commit 0aa7bfe
Showing 18 changed files with 40 additions and 93 deletions.
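For context, this commit replaces a cached OpenMP thread count (read once, typically in a constructor, and stored in a member such as num_threads_) with a fresh query at each point of use. Below is a minimal standalone sketch of why the cached value goes stale; it is illustrative only, with hypothetical names (CachedWorker, QueryNumThreads), and assumes OpenMP is enabled:

#include <omp.h>
#include <cstdio>

// Pattern removed by this commit: the team size is read once and stored.
struct CachedWorker {
  int num_threads_ = 1;
  CachedWorker() {
    #pragma omp parallel
    #pragma omp master
    { num_threads_ = omp_get_num_threads(); }
  }
};

// Pattern introduced by this commit: the team size is re-queried on every
// use, so a later omp_set_num_threads() call from an external package is
// honored.
inline int QueryNumThreads() {
  int ret = 1;
  #pragma omp parallel
  #pragma omp master
  { ret = omp_get_num_threads(); }
  return ret;
}

int main() {
  omp_set_num_threads(4);
  CachedWorker w;            // caches 4
  omp_set_num_threads(2);    // e.g. changed mid-run by an external package
  std::printf("cached: %d, fresh: %d\n", w.num_threads_, QueryNumThreads());
  // Prints "cached: 4, fresh: 2": buffers sized with the stale value no
  // longer match the actual team size, which is the bug class this avoids.
}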
2 changes: 2 additions & 0 deletions docs/Parameters.rst
@@ -171,6 +171,8 @@ Core Parameters

- for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication

+ - **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors

- ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, aliases: ``device``

- device for the tree learning, you can use GPU to achieve the faster learning
1 change: 1 addition & 0 deletions include/LightGBM/config.h
@@ -192,6 +192,7 @@ struct Config {
// desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
// desc = be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal**
// desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
+ // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
int num_threads = 0;

// [doc-only]
5 changes: 1 addition & 4 deletions include/LightGBM/dataset.h
@@ -292,10 +292,7 @@ struct TrainingTempState {
return;
}
multi_val_bin.reset(bin);
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
num_bin_aligned =
(bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
size_t new_size = static_cast<size_t>(num_bin_aligned) * 2 * num_threads;
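An aside on the surviving context line above: num_bin_aligned uses the standard integer round-up-to-multiple idiom. A small self-contained sketch of what it computes (the value 32 below is an illustrative assumption, not LightGBM's actual kAlignedSize):

#include <cassert>

// Round n up to the nearest multiple of k (integer arithmetic, k > 0), the
// same idiom as (bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize.
constexpr int RoundUpToMultiple(int n, int k) {
  return (n + k - 1) / k * k;
}

int main() {
  assert(RoundUpToMultiple(100, 32) == 128);  // 100 bins -> 4 blocks of 32
  assert(RoundUpToMultiple(128, 32) == 128);  // exact multiples are unchanged
  return 0;
}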
7 changes: 1 addition & 6 deletions include/LightGBM/utils/array_args.h
@@ -21,12 +21,7 @@ template<typename VAL_T>
class ArrayArgs {
public:
inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads = omp_get_num_threads();
- }
+ int num_threads = OMP_NUM_THREADS();
std::vector<size_t> arg_maxs(num_threads, 0);
int n_blocks = Threading::For<size_t>(
0, array.size(), 1024,
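ArgMaxMT records one local arg-max per thread in arg_maxs and reduces them serially afterwards. A minimal sketch of that shape, with a plain omp for standing in for LightGBM's Threading::For helper (OpenMP assumed; not the library's actual code):

#include <omp.h>
#include <cstddef>
#include <vector>

// Per-thread arg-max followed by a serial reduction over the per-thread
// results, mirroring the structure of ArrayArgs::ArgMaxMT.
template <typename T>
size_t ArgMaxParallel(const std::vector<T>& array) {
  if (array.empty()) return 0;
  int num_threads = 1;
  #pragma omp parallel
  #pragma omp master
  { num_threads = omp_get_num_threads(); }
  std::vector<size_t> arg_maxs(num_threads, 0);
  #pragma omp parallel num_threads(num_threads)
  {
    size_t local_best = 0;
    #pragma omp for
    for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(array.size()); ++i) {
      if (array[i] > array[local_best]) local_best = static_cast<size_t>(i);
    }
    arg_maxs[omp_get_thread_num()] = local_best;  // idle threads keep index 0
  }
  size_t best = arg_maxs[0];
  for (int t = 1; t < num_threads; ++t) {
    if (array[arg_maxs[t]] > array[best]) best = arg_maxs[t];
  }
  return best;
}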
12 changes: 2 additions & 10 deletions include/LightGBM/utils/common.h
@@ -727,12 +727,7 @@ template<typename _RanIt, typename _Pr, typename _VTRanIt> inline
static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
size_t len = _Last - _First;
const size_t kMinInnerLen = 1024;
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads = omp_get_num_threads();
- }
+ int num_threads = OMP_NUM_THREADS();
if (len <= kMinInnerLen || num_threads <= 1) {
std::sort(_First, _Last, _Pred);
return;
@@ -1032,10 +1027,7 @@ class Timer {
public:
Timer() {
#ifdef TIMETAG
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
start_time_.resize(num_threads);
stats_.resize(num_threads);
#endif // TIMETAG
9 changes: 9 additions & 0 deletions include/LightGBM/utils/openmp_wrapper.h
@@ -15,6 +15,14 @@
#include <stdexcept>
#include <vector>

+ inline int OMP_NUM_THREADS() {
+   int ret = 1;
+ #pragma omp parallel
+ #pragma omp master
+   { ret = omp_get_num_threads(); }
+   return ret;
+ }

class ThreadExceptionHelper {
public:
ThreadExceptionHelper() {
@@ -70,6 +78,7 @@ class ThreadExceptionHelper {
inline void omp_set_num_threads(int) {}
inline int omp_get_num_threads() {return 1;}
inline int omp_get_thread_num() {return 0;}
+ inline int OMP_NUM_THREADS() { return 1; }
#ifdef __cplusplus
}; // extern "C"
#endif
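Why the helper enters a parallel region at all: outside of one, omp_get_num_threads() always returns 1, so OMP_NUM_THREADS() spawns a team and reads its size from the master thread, while the stub above keeps builds without OpenMP compiling. A hypothetical caller (not LightGBM code) showing how the call sites changed in this commit use the helper to size per-thread buffers, with false sharing ignored for brevity:

#include <vector>
#include <LightGBM/utils/openmp_wrapper.h>

// Hypothetical example: size one scratch slot per OpenMP thread, then index
// by omp_get_thread_num() inside the parallel region.
double SumParallel(const std::vector<double>& data) {
  const int num_threads = OMP_NUM_THREADS();  // re-queried, never cached
  std::vector<double> per_thread_sum(num_threads, 0.0);
  #pragma omp parallel for
  for (int i = 0; i < static_cast<int>(data.size()); ++i) {
    per_thread_sum[omp_get_thread_num()] += data[i];
  }
  double total = 0.0;
  for (double s : per_thread_sum) total += s;
  return total;
}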
10 changes: 2 additions & 8 deletions include/LightGBM/utils/threading.h
@@ -21,10 +21,7 @@ class Threading {
template <typename INDEX_T>
static inline void BlockInfo(INDEX_T cnt, INDEX_T min_cnt_per_block,
int* out_nblock, INDEX_T* block_size) {
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
block_size);
}
@@ -84,10 +81,7 @@ class ParallelPartitionRunner {
public:
ParallelPartitionRunner(INDEX_T num_data, INDEX_T min_block_size)
: min_block_size_(min_block_size) {
- num_threads_ = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads_ = omp_get_num_threads(); }
+ num_threads_ = OMP_NUM_THREADS();
left_.resize(num_data);
if (TWO_BUFFER) {
right_.resize(num_data);
6 changes: 1 addition & 5 deletions src/application/predictor.hpp
@@ -56,16 +56,13 @@ class Predictor {
}
}

- #pragma omp parallel
- #pragma omp master
- { num_threads_ = omp_get_num_threads(); }
boosting->InitPredict(num_iteration, predict_contrib);
boosting_ = boosting;
num_pred_one_row_ = boosting_->NumPredictOneRow(
num_iteration, predict_leaf_index, predict_contrib);
num_feature_ = boosting_->MaxFeatureIdx() + 1;
predict_buf_.resize(
- num_threads_,
+ OMP_NUM_THREADS(),
std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(
num_feature_, 0.0f));
const int kFeatureThreshold = 100000;
@@ -281,7 +278,6 @@ class Predictor {
PredictionEarlyStopInstance early_stop_;
int num_feature_;
int num_pred_one_row_;
- int num_threads_;
std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
};

7 changes: 1 addition & 6 deletions src/c_api.cpp
@@ -1529,12 +1529,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
if (config.num_threads > 0) {
omp_set_num_threads(config.num_threads);
}
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads = omp_get_num_threads();
- }
+ int num_threads = OMP_NUM_THREADS();
int ncol = static_cast<int>(ncol_ptr - 1);
std::vector<std::vector<CSC_RowIterator>> iterators(num_threads, std::vector<CSC_RowIterator>());
for (int i = 0; i < num_threads; ++i) {
15 changes: 3 additions & 12 deletions src/io/dataset.cpp
@@ -506,10 +506,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
}
const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
const int num_feature = feature_groups_[multi_group_id]->num_feature_;
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();

std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
std::vector<uint32_t> most_freq_bins;
@@ -539,10 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
global_timer);
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
double sum_dense_ratio = 0;

std::unique_ptr<MultiValBin> ret;
@@ -1185,10 +1179,7 @@ void Dataset::ConstructHistogramsMultiVal(
if (multi_val_bin == nullptr) {
return;
}
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();

global_timer.Start("Dataset::sparse_bin_histogram");
const int num_bin = multi_val_bin->num_bin();
5 changes: 1 addition & 4 deletions src/io/multi_val_sparse_bin.hpp
@@ -25,10 +25,7 @@ class MultiValSparseBin : public MultiValBin {
estimate_element_per_row_(estimate_element_per_row) {
row_ptr_.resize(num_data_ + 1, 0);
INDEX_T estimate_num_data = static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
if (num_threads > 1) {
t_data_.resize(num_threads - 1);
for (size_t i = 0; i < t_data_.size(); ++i) {
7 changes: 1 addition & 6 deletions src/io/sparse_bin.hpp
@@ -73,12 +73,7 @@ class SparseBin: public Bin {

explicit SparseBin(data_size_t num_data)
: num_data_(num_data) {
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads = omp_get_num_threads();
- }
+ int num_threads = OMP_NUM_THREADS();
push_buffers_.resize(num_threads);
}

13 changes: 3 additions & 10 deletions src/metric/map_metric.hpp
@@ -23,12 +23,6 @@ class MapMetric:public Metric {
// get eval position
eval_at_ = config.eval_at;
DCGCalculator::DefaultEvalAt(&eval_at_);
- // get number of threads
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads_ = omp_get_num_threads();
- }
}

~MapMetric() {
@@ -110,8 +104,9 @@
}
std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
// some buffers for multi-threading sum up
+ int num_threads = OMP_NUM_THREADS();
std::vector<std::vector<double>> result_buffer_;
- for (int i = 0; i < num_threads_; ++i) {
+ for (int i = 0; i < num_threads; ++i) {
result_buffer_.emplace_back(eval_at_.size(), 0.0f);
}
std::vector<double> tmp_map(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@
// Get final average MAP
std::vector<double> result(eval_at_.size(), 0.0f);
for (size_t j = 0; j < result.size(); ++j) {
- for (int i = 0; i < num_threads_; ++i) {
+ for (int i = 0; i < num_threads; ++i) {
result[j] += result_buffer_[i][j];
}
result[j] /= sum_query_weights_;
@@ -162,8 +157,6 @@
double sum_query_weights_;
/*! \brief Evaluate position of Nmap */
std::vector<data_size_t> eval_at_;
- /*! \brief Number of threads */
- int num_threads_;
std::vector<std::string> name_;
std::vector<data_size_t> npos_per_query_;
};
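In the rewritten Eval, the thread count is a local taken fresh per call rather than a member cached at construction, and each thread accumulates into its own row of result_buffer_ before a serial sum. A compact sketch of that two-level reduction, with hypothetical names and a stand-in metric value (one accumulator row per thread, one column per eval position):

#include <omp.h>
#include <vector>

// Each thread owns row `tid`; column j accumulates metric j. After the
// parallel loop, rows are summed serially, as in MapMetric::Eval above.
std::vector<double> ReducePerThread(int num_positions, int num_queries) {
  int num_threads = 1;
  #pragma omp parallel
  #pragma omp master
  { num_threads = omp_get_num_threads(); }
  std::vector<std::vector<double>> buffer(
      num_threads, std::vector<double>(num_positions, 0.0));
  #pragma omp parallel for num_threads(num_threads)
  for (int q = 0; q < num_queries; ++q) {
    const int tid = omp_get_thread_num();
    for (int j = 0; j < num_positions; ++j) {
      buffer[tid][j] += 1.0;  // stand-in for the per-query metric value
    }
  }
  std::vector<double> result(num_positions, 0.0);
  for (int j = 0; j < num_positions; ++j) {
    for (int t = 0; t < num_threads; ++t) result[j] += buffer[t][j];
  }
  return result;
}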
13 changes: 3 additions & 10 deletions src/metric/rank_metric.hpp
@@ -26,12 +26,6 @@ class NDCGMetric:public Metric {
DCGCalculator::DefaultLabelGain(&label_gain);
// initialize DCG calculator
DCGCalculator::Init(label_gain);
- // get number of threads
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads_ = omp_get_num_threads();
- }
}

~NDCGMetric() {
@@ -89,9 +83,10 @@
}

std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
+ int num_threads = OMP_NUM_THREADS();
// some buffers for multi-threading sum up
std::vector<std::vector<double>> result_buffer_;
- for (int i = 0; i < num_threads_; ++i) {
+ for (int i = 0; i < num_threads; ++i) {
result_buffer_.emplace_back(eval_at_.size(), 0.0f);
}
std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@
// Get final average NDCG
std::vector<double> result(eval_at_.size(), 0.0f);
for (size_t j = 0; j < result.size(); ++j) {
- for (int i = 0; i < num_threads_; ++i) {
+ for (int i = 0; i < num_threads; ++i) {
result[j] += result_buffer_[i][j];
}
result[j] /= sum_query_weights_;
@@ -166,8 +161,6 @@
std::vector<data_size_t> eval_at_;
/*! \brief Cache the inverse max dcg for all queries */
std::vector<std::vector<double>> inverse_max_dcgs_;
- /*! \brief Number of threads */
- int num_threads_;
};

} // namespace LightGBM
5 changes: 3 additions & 2 deletions src/treelearner/data_parallel_tree_learner.cpp
@@ -165,8 +165,9 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {

template <typename TREELEARNER_T>
void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
- std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo());
- std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
+ int num_threads = OMP_NUM_THREADS();
+ std::vector<SplitInfo> smaller_bests_per_thread(num_threads, SplitInfo());
+ std::vector<SplitInfo> larger_bests_per_thread(num_threads, SplitInfo());
std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
if (this->config_->feature_fraction_bynode < 1.0f) {
10 changes: 3 additions & 7 deletions src/treelearner/serial_tree_learner.cpp
@@ -21,11 +21,6 @@ namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const Config* config)
:config_(config) {
random_ = Random(config_->feature_fraction_seed);
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads_ = omp_get_num_threads();
- }
}

SerialTreeLearner::~SerialTreeLearner() {
@@ -400,8 +395,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
const std::vector<int8_t>& is_feature_used, bool use_subtract) {
Common::FunctionTimer fun_timer(
"SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
- std::vector<SplitInfo> smaller_best(num_threads_);
- std::vector<SplitInfo> larger_best(num_threads_);
+ int num_threads = OMP_NUM_THREADS();
+ std::vector<SplitInfo> smaller_best(num_threads);
+ std::vector<SplitInfo> larger_best(num_threads);
std::vector<int8_t> smaller_node_used_features(num_features_, 1);
std::vector<int8_t> larger_node_used_features(num_features_, 1);
if (config_->feature_fraction_bynode < 1.0f) {
1 change: 0 additions & 1 deletion src/treelearner/serial_tree_learner.h
@@ -189,7 +189,6 @@ class SerialTreeLearner: public TreeLearner {
HistogramPool histogram_pool_;
/*! \brief config of tree learner*/
const Config* config_;
- int num_threads_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;
std::unique_ptr<TrainingTempState> temp_state_;
5 changes: 3 additions & 2 deletions src/treelearner/voting_parallel_tree_learner.cpp
@@ -349,8 +349,9 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {

template <typename TREELEARNER_T>
void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
- std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
- std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
+ int num_threads = OMP_NUM_THREADS();
+ std::vector<SplitInfo> smaller_bests_per_thread(num_threads);
+ std::vector<SplitInfo> larger_best_per_thread(num_threads);
std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
if (this->config_->feature_fraction_bynode < 1.0f) {
