don't save num_thread where possible (#2839)
* don't cache `num_thread`, so that changes made from outside take effect

* rename

* update documentation

* Update docs/Parameters.rst

* Update include/LightGBM/config.h

* Apply suggestions from code review

Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>

* Apply suggestions from code review

Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
guolinke and StrikerRUS committed Mar 2, 2020
1 parent 5a80b78 commit 0aa7bfe
Showing 18 changed files with 40 additions and 93 deletions.
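For context, this commit replaces a cached OpenMP thread count (read once, typically in a constructor, and stored in a member such as num_threads_) with a fresh query at each point of use. Below is a minimal standalone sketch of why the cached value goes stale; it is illustrative only, with hypothetical names (CachedWorker, QueryNumThreads), and assumes OpenMP is enabled:

#include <omp.h>
#include <cstdio>

// Pattern removed by this commit: the team size is read once and stored.
struct CachedWorker {
  int num_threads_ = 1;
  CachedWorker() {
    #pragma omp parallel
    #pragma omp master
    { num_threads_ = omp_get_num_threads(); }
  }
};

// Pattern introduced by this commit: the team size is re-queried on every
// use, so a later omp_set_num_threads() call from an external package is
// honored.
inline int QueryNumThreads() {
  int ret = 1;
  #pragma omp parallel
  #pragma omp master
  { ret = omp_get_num_threads(); }
  return ret;
}

int main() {
  omp_set_num_threads(4);
  CachedWorker w;            // caches 4
  omp_set_num_threads(2);    // e.g. changed mid-run by an external package
  std::printf("cached: %d, fresh: %d\n", w.num_threads_, QueryNumThreads());
  // Prints "cached: 4, fresh: 2": buffers sized with the stale value no
  // longer match the actual team size, which is the bug class this avoids.
}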
2 changes: 2 additions & 0 deletions docs/Parameters.rst
@@ -171,6 +171,8 @@ Core Parameters

- for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication

+ - **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors

- ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, aliases: ``device``

- device for the tree learning, you can use GPU to achieve the faster learning
1 change: 1 addition & 0 deletions include/LightGBM/config.h
@@ -192,6 +192,7 @@ struct Config {
// desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
// desc = be aware a task manager or any similar CPU monitoring tool might report that cores not being fully utilized. **This is normal**
// desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
+ // desc = **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors
int num_threads = 0;

// [doc-only]
5 changes: 1 addition & 4 deletions include/LightGBM/dataset.h
@@ -292,10 +292,7 @@ struct TrainingTempState {
return;
}
multi_val_bin.reset(bin);
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
num_bin_aligned =
(bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
size_t new_size = static_cast<size_t>(num_bin_aligned) * 2 * num_threads;
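An aside on the surviving context line above: num_bin_aligned uses the standard integer round-up-to-multiple idiom. A small self-contained sketch of what it computes (the value 32 below is an illustrative assumption, not LightGBM's actual kAlignedSize):

#include <cassert>

// Round n up to the nearest multiple of k (integer arithmetic, k > 0), the
// same idiom as (bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize.
constexpr int RoundUpToMultiple(int n, int k) {
  return (n + k - 1) / k * k;
}

int main() {
  assert(RoundUpToMultiple(100, 32) == 128);  // 100 bins -> 4 blocks of 32
  assert(RoundUpToMultiple(128, 32) == 128);  // exact multiples are unchanged
  return 0;
}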
7 changes: 1 addition & 6 deletions include/LightGBM/utils/array_args.h
@@ -21,12 +21,7 @@ template<typename VAL_T>
class ArrayArgs {
public:
inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads = omp_get_num_threads();
- }
+ int num_threads = OMP_NUM_THREADS();
std::vector<size_t> arg_maxs(num_threads, 0);
int n_blocks = Threading::For<size_t>(
0, array.size(), 1024,
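ArgMaxMT records one local arg-max per thread in arg_maxs and reduces them serially afterwards. A minimal sketch of that shape, with a plain omp for standing in for LightGBM's Threading::For helper (OpenMP assumed; not the library's actual code):

#include <omp.h>
#include <cstddef>
#include <vector>

// Per-thread arg-max followed by a serial reduction over the per-thread
// results, mirroring the structure of ArrayArgs::ArgMaxMT.
template <typename T>
size_t ArgMaxParallel(const std::vector<T>& array) {
  if (array.empty()) return 0;
  int num_threads = 1;
  #pragma omp parallel
  #pragma omp master
  { num_threads = omp_get_num_threads(); }
  std::vector<size_t> arg_maxs(num_threads, 0);
  #pragma omp parallel num_threads(num_threads)
  {
    size_t local_best = 0;
    #pragma omp for
    for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(array.size()); ++i) {
      if (array[i] > array[local_best]) local_best = static_cast<size_t>(i);
    }
    arg_maxs[omp_get_thread_num()] = local_best;  // idle threads keep index 0
  }
  size_t best = arg_maxs[0];
  for (int t = 1; t < num_threads; ++t) {
    if (array[arg_maxs[t]] > array[best]) best = arg_maxs[t];
  }
  return best;
}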
12 changes: 2 additions & 10 deletions include/LightGBM/utils/common.h
@@ -727,12 +727,7 @@ template<typename _RanIt, typename _Pr, typename _VTRanIt> inline
static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) {
size_t len = _Last - _First;
const size_t kMinInnerLen = 1024;
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads = omp_get_num_threads();
- }
+ int num_threads = OMP_NUM_THREADS();
if (len <= kMinInnerLen || num_threads <= 1) {
std::sort(_First, _Last, _Pred);
return;
@@ -1032,10 +1027,7 @@ class Timer {
public:
Timer() {
#ifdef TIMETAG
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
start_time_.resize(num_threads);
stats_.resize(num_threads);
#endif // TIMETAG
9 changes: 9 additions & 0 deletions include/LightGBM/utils/openmp_wrapper.h
@@ -15,6 +15,14 @@
#include <stdexcept>
#include <vector>

+ inline int OMP_NUM_THREADS() {
+   int ret = 1;
+ #pragma omp parallel
+ #pragma omp master
+   { ret = omp_get_num_threads(); }
+   return ret;
+ }

class ThreadExceptionHelper {
public:
ThreadExceptionHelper() {
@@ -70,6 +78,7 @@ class ThreadExceptionHelper {
inline void omp_set_num_threads(int) {}
inline int omp_get_num_threads() {return 1;}
inline int omp_get_thread_num() {return 0;}
+ inline int OMP_NUM_THREADS() { return 1; }
#ifdef __cplusplus
}; // extern "C"
#endif
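Why the helper enters a parallel region at all: outside of one, omp_get_num_threads() always returns 1, so OMP_NUM_THREADS() spawns a team and reads its size from the master thread, while the stub above keeps builds without OpenMP compiling. A hypothetical caller (not LightGBM code) showing how the call sites changed in this commit use the helper to size per-thread buffers, with false sharing ignored for brevity:

#include <vector>
#include <LightGBM/utils/openmp_wrapper.h>

// Hypothetical example: size one scratch slot per OpenMP thread, then index
// by omp_get_thread_num() inside the parallel region.
double SumParallel(const std::vector<double>& data) {
  const int num_threads = OMP_NUM_THREADS();  // re-queried, never cached
  std::vector<double> per_thread_sum(num_threads, 0.0);
  #pragma omp parallel for
  for (int i = 0; i < static_cast<int>(data.size()); ++i) {
    per_thread_sum[omp_get_thread_num()] += data[i];
  }
  double total = 0.0;
  for (double s : per_thread_sum) total += s;
  return total;
}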
10 changes: 2 additions & 8 deletions include/LightGBM/utils/threading.h
@@ -21,10 +21,7 @@ class Threading {
template <typename INDEX_T>
static inline void BlockInfo(INDEX_T cnt, INDEX_T min_cnt_per_block,
int* out_nblock, INDEX_T* block_size) {
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
BlockInfo<INDEX_T>(num_threads, cnt, min_cnt_per_block, out_nblock,
block_size);
}
@@ -84,10 +81,7 @@ class ParallelPartitionRunner {
public:
ParallelPartitionRunner(INDEX_T num_data, INDEX_T min_block_size)
: min_block_size_(min_block_size) {
- num_threads_ = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads_ = omp_get_num_threads(); }
+ num_threads_ = OMP_NUM_THREADS();
left_.resize(num_data);
if (TWO_BUFFER) {
right_.resize(num_data);
6 changes: 1 addition & 5 deletions src/application/predictor.hpp
@@ -56,16 +56,13 @@ class Predictor {
}
}

- #pragma omp parallel
- #pragma omp master
- { num_threads_ = omp_get_num_threads(); }
boosting->InitPredict(num_iteration, predict_contrib);
boosting_ = boosting;
num_pred_one_row_ = boosting_->NumPredictOneRow(
num_iteration, predict_leaf_index, predict_contrib);
num_feature_ = boosting_->MaxFeatureIdx() + 1;
predict_buf_.resize(
- num_threads_,
+ OMP_NUM_THREADS(),
std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(
num_feature_, 0.0f));
const int kFeatureThreshold = 100000;
@@ -281,7 +278,6 @@ class Predictor {
PredictionEarlyStopInstance early_stop_;
int num_feature_;
int num_pred_one_row_;
- int num_threads_;
std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
};

7 changes: 1 addition & 6 deletions src/c_api.cpp
@@ -1529,12 +1529,7 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle,
if (config.num_threads > 0) {
omp_set_num_threads(config.num_threads);
}
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads = omp_get_num_threads();
- }
+ int num_threads = OMP_NUM_THREADS();
int ncol = static_cast<int>(ncol_ptr - 1);
std::vector<std::vector<CSC_RowIterator>> iterators(num_threads, std::vector<CSC_RowIterator>());
for (int i = 0; i < num_threads; ++i) {
15 changes: 3 additions & 12 deletions src/io/dataset.cpp
@@ -506,10 +506,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
}
const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
const int num_feature = feature_groups_[multi_group_id]->num_feature_;
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();

std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
std::vector<uint32_t> most_freq_bins;
@@ -539,10 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
global_timer);
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
double sum_dense_ratio = 0;

std::unique_ptr<MultiValBin> ret;
@@ -1185,10 +1179,7 @@ void Dataset::ConstructHistogramsMultiVal(
if (multi_val_bin == nullptr) {
return;
}
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();

global_timer.Start("Dataset::sparse_bin_histogram");
const int num_bin = multi_val_bin->num_bin();
5 changes: 1 addition & 4 deletions src/io/multi_val_sparse_bin.hpp
@@ -25,10 +25,7 @@ class MultiValSparseBin : public MultiValBin {
estimate_element_per_row_(estimate_element_per_row) {
row_ptr_.resize(num_data_ + 1, 0);
INDEX_T estimate_num_data = static_cast<INDEX_T>(estimate_element_per_row_ * 1.1 * num_data_);
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- { num_threads = omp_get_num_threads(); }
+ int num_threads = OMP_NUM_THREADS();
if (num_threads > 1) {
t_data_.resize(num_threads - 1);
for (size_t i = 0; i < t_data_.size(); ++i) {
7 changes: 1 addition & 6 deletions src/io/sparse_bin.hpp
@@ -73,12 +73,7 @@ class SparseBin: public Bin {

explicit SparseBin(data_size_t num_data)
: num_data_(num_data) {
- int num_threads = 1;
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads = omp_get_num_threads();
- }
+ int num_threads = OMP_NUM_THREADS();
push_buffers_.resize(num_threads);
}

13 changes: 3 additions & 10 deletions src/metric/map_metric.hpp
@@ -23,12 +23,6 @@ class MapMetric:public Metric {
// get eval position
eval_at_ = config.eval_at;
DCGCalculator::DefaultEvalAt(&eval_at_);
- // get number of threads
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads_ = omp_get_num_threads();
- }
}

~MapMetric() {
@@ -110,8 +104,9 @@
}
std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
// some buffers for multi-threading sum up
+ int num_threads = OMP_NUM_THREADS();
std::vector<std::vector<double>> result_buffer_;
- for (int i = 0; i < num_threads_; ++i) {
+ for (int i = 0; i < num_threads; ++i) {
result_buffer_.emplace_back(eval_at_.size(), 0.0f);
}
std::vector<double> tmp_map(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@
// Get final average MAP
std::vector<double> result(eval_at_.size(), 0.0f);
for (size_t j = 0; j < result.size(); ++j) {
- for (int i = 0; i < num_threads_; ++i) {
+ for (int i = 0; i < num_threads; ++i) {
result[j] += result_buffer_[i][j];
}
result[j] /= sum_query_weights_;
@@ -162,8 +157,6 @@
double sum_query_weights_;
/*! \brief Evaluate position of Nmap */
std::vector<data_size_t> eval_at_;
- /*! \brief Number of threads */
- int num_threads_;
std::vector<std::string> name_;
std::vector<data_size_t> npos_per_query_;
};
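In the rewritten Eval, the thread count is a local taken fresh per call rather than a member cached at construction, and each thread accumulates into its own row of result_buffer_ before a serial sum. A compact sketch of that two-level reduction, with hypothetical names and a stand-in metric value (one accumulator row per thread, one column per eval position):

#include <omp.h>
#include <vector>

// Each thread owns row `tid`; column j accumulates metric j. After the
// parallel loop, rows are summed serially, as in MapMetric::Eval above.
std::vector<double> ReducePerThread(int num_positions, int num_queries) {
  int num_threads = 1;
  #pragma omp parallel
  #pragma omp master
  { num_threads = omp_get_num_threads(); }
  std::vector<std::vector<double>> buffer(
      num_threads, std::vector<double>(num_positions, 0.0));
  #pragma omp parallel for num_threads(num_threads)
  for (int q = 0; q < num_queries; ++q) {
    const int tid = omp_get_thread_num();
    for (int j = 0; j < num_positions; ++j) {
      buffer[tid][j] += 1.0;  // stand-in for the per-query metric value
    }
  }
  std::vector<double> result(num_positions, 0.0);
  for (int j = 0; j < num_positions; ++j) {
    for (int t = 0; t < num_threads; ++t) result[j] += buffer[t][j];
  }
  return result;
}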
13 changes: 3 additions & 10 deletions src/metric/rank_metric.hpp
@@ -26,12 +26,6 @@ class NDCGMetric:public Metric {
DCGCalculator::DefaultLabelGain(&label_gain);
// initialize DCG calculator
DCGCalculator::Init(label_gain);
- // get number of threads
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads_ = omp_get_num_threads();
- }
}

~NDCGMetric() {
@@ -89,9 +83,10 @@
}

std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
+ int num_threads = OMP_NUM_THREADS();
// some buffers for multi-threading sum up
std::vector<std::vector<double>> result_buffer_;
- for (int i = 0; i < num_threads_; ++i) {
+ for (int i = 0; i < num_threads; ++i) {
result_buffer_.emplace_back(eval_at_.size(), 0.0f);
}
std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
@@ -139,7 +134,7 @@
// Get final average NDCG
std::vector<double> result(eval_at_.size(), 0.0f);
for (size_t j = 0; j < result.size(); ++j) {
- for (int i = 0; i < num_threads_; ++i) {
+ for (int i = 0; i < num_threads; ++i) {
result[j] += result_buffer_[i][j];
}
result[j] /= sum_query_weights_;
@@ -166,8 +161,6 @@
std::vector<data_size_t> eval_at_;
/*! \brief Cache the inverse max dcg for all queries */
std::vector<std::vector<double>> inverse_max_dcgs_;
- /*! \brief Number of threads */
- int num_threads_;
};

} // namespace LightGBM
5 changes: 3 additions & 2 deletions src/treelearner/data_parallel_tree_learner.cpp
@@ -165,8 +165,9 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {

template <typename TREELEARNER_T>
void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
- std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo());
- std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
+ int num_threads = OMP_NUM_THREADS();
+ std::vector<SplitInfo> smaller_bests_per_thread(num_threads, SplitInfo());
+ std::vector<SplitInfo> larger_bests_per_thread(num_threads, SplitInfo());
std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
if (this->config_->feature_fraction_bynode < 1.0f) {
10 changes: 3 additions & 7 deletions src/treelearner/serial_tree_learner.cpp
@@ -21,11 +21,6 @@ namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const Config* config)
:config_(config) {
random_ = Random(config_->feature_fraction_seed);
- #pragma omp parallel
- #pragma omp master
- {
-   num_threads_ = omp_get_num_threads();
- }
}

SerialTreeLearner::~SerialTreeLearner() {
@@ -400,8 +395,9 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
const std::vector<int8_t>& is_feature_used, bool use_subtract) {
Common::FunctionTimer fun_timer(
"SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
- std::vector<SplitInfo> smaller_best(num_threads_);
- std::vector<SplitInfo> larger_best(num_threads_);
+ int num_threads = OMP_NUM_THREADS();
+ std::vector<SplitInfo> smaller_best(num_threads);
+ std::vector<SplitInfo> larger_best(num_threads);
std::vector<int8_t> smaller_node_used_features(num_features_, 1);
std::vector<int8_t> larger_node_used_features(num_features_, 1);
if (config_->feature_fraction_bynode < 1.0f) {
1 change: 0 additions & 1 deletion src/treelearner/serial_tree_learner.h
@@ -189,7 +189,6 @@ class SerialTreeLearner: public TreeLearner {
HistogramPool histogram_pool_;
/*! \brief config of tree learner*/
const Config* config_;
- int num_threads_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;
std::unique_ptr<TrainingTempState> temp_state_;
5 changes: 3 additions & 2 deletions src/treelearner/voting_parallel_tree_learner.cpp
@@ -349,8 +349,9 @@ void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {

template <typename TREELEARNER_T>
void VotingParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
- std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_);
- std::vector<SplitInfo> larger_best_per_thread(this->num_threads_);
+ int num_threads = OMP_NUM_THREADS();
+ std::vector<SplitInfo> smaller_bests_per_thread(num_threads);
+ std::vector<SplitInfo> larger_best_per_thread(num_threads);
std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
if (this->config_->feature_fraction_bynode < 1.0f) {
